1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/conf.h> 29 #include <sys/time.h> 30 #include <sys/taskq.h> 31 #include <sys/cmn_err.h> 32 #include <sys/sdt.h> 33 #include <sys/atomic.h> 34 #include <netinet/in.h> 35 #include <inet/ip.h> 36 #include <inet/ip6.h> 37 #include <inet/tcp.h> 38 #include <inet/udp_impl.h> 39 #include <inet/ilb.h> 40 41 #include "ilb_stack.h" 42 #include "ilb_impl.h" 43 #include "ilb_conn.h" 44 #include "ilb_nat.h" 45 46 /* 47 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection 48 * 49 * start: starting index into the hash table to do gc 50 * end: ending index into the hash table to do gc 51 * ilbs: pointer to the ilb_stack_t of the IP stack 52 * tid_lock: mutex to protect the timer id. 
* tid: timer id of the timer
 */
typedef struct ilb_timer_s {
	uint32_t	start;		/* first hash bucket index to gc */
	uint32_t	end;		/* gc stops before this bucket index */
	ilb_stack_t	*ilbs;		/* owning ILB stack instance */
	kmutex_t	tid_lock;	/* protects tid */
	timeout_id_t	tid;		/* 0 tells the handler not to re-arm */
} ilb_timer_t;

/*
 * Hash macro for finding the index to the conn hash table
 *
 * saddr and daddr are dereferenced at offsets 0 through 3, so each must
 * point to at least four elements; the callers in this file pass uint8_t
 * pointers to the last 32-bit word of an in6_addr_t.  The result is masked
 * with (hash_size - 1), so hash_size has to be a power of 2 (which
 * ilb_conn_hash_init() enforces by rounding the size up).
 */
#define	ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)	\
	(((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 +		\
	(*((saddr) + 2) ^ *((daddr) + 2)) * 1369 +		\
	(*((saddr) + 1) ^ *((daddr) + 1)) * 37 +		\
	(*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) &	\
	((hash_size) - 1))

/* Kmem cache for the conn hash entry */
static struct kmem_cache *ilb_conn_cache = NULL;

/*
 * There are 60 timers running to do conn cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the conn hash table.
 */
static int ilb_conn_timer_size = 60;

/* Each of the above gc timers wake up every 15s to do the gc. */
static int ilb_conn_cache_timeout = 15;

/*
 * Sticky-entry counterpart of ILB_CONN_HASH: mixes four elements of saddr
 * with the four bytes of rule (rule, rule >> 8, rule >> 16, rule >> 24) and
 * masks the result with (hash_size - 1).  As with ILB_CONN_HASH, the mask
 * only works as intended when hash_size is a power of 2.
 */
#define	ILB_STICKY_HASH(saddr, rule, hash_size)			\
	(((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 +		\
	(*((saddr) + 2) ^ ((rule) >> 16)) * 961 +		\
	(*((saddr) + 1) ^ ((rule) >> 8)) * 31 +			\
	(*(saddr) ^ (rule))) & ((hash_size) - 1))

/* Kmem cache for the sticky entries (ilb_sticky_t) */
static struct kmem_cache *ilb_sticky_cache = NULL;

/*
 * There are 60 timers running to do sticky cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the sticky hash table.
 */
static int ilb_sticky_timer_size = 60;

/* Each of the above gc timers wake up every 15s to do the gc.
*/ 98 static int ilb_sticky_timeout = 15; 99 100 #define ILB_STICKY_REFRELE(s) \ 101 { \ 102 mutex_enter(&(s)->hash->sticky_lock); \ 103 (s)->refcnt--; \ 104 (s)->atime = ddi_get_lbolt64(); \ 105 mutex_exit(&s->hash->sticky_lock); \ 106 } 107 108 109 static void 110 ilb_conn_cache_init(void) 111 { 112 ilb_conn_cache = kmem_cache_create("ilb_conn_cache", 113 sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL, 114 ilb_kmem_flags); 115 } 116 117 void 118 ilb_conn_cache_fini(void) 119 { 120 if (ilb_conn_cache != NULL) { 121 kmem_cache_destroy(ilb_conn_cache); 122 ilb_conn_cache = NULL; 123 } 124 } 125 126 static void 127 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s) 128 { 129 ilb_conn_hash_t *hash; 130 ilb_conn_t **next, **prev; 131 ilb_conn_t **next_prev, **prev_next; 132 133 if (c2s) { 134 hash = connp->conn_c2s_hash; 135 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); 136 next = &connp->conn_c2s_next; 137 prev = &connp->conn_c2s_prev; 138 if (*next != NULL) 139 next_prev = &(*next)->conn_c2s_prev; 140 if (*prev != NULL) 141 prev_next = &(*prev)->conn_c2s_next; 142 } else { 143 hash = connp->conn_s2c_hash; 144 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); 145 next = &connp->conn_s2c_next; 146 prev = &connp->conn_s2c_prev; 147 if (*next != NULL) 148 next_prev = &(*next)->conn_s2c_prev; 149 if (*prev != NULL) 150 prev_next = &(*prev)->conn_s2c_next; 151 } 152 153 if (hash->ilb_connp == connp) { 154 hash->ilb_connp = *next; 155 if (*next != NULL) 156 *next_prev = NULL; 157 } else { 158 if (*prev != NULL) 159 *prev_next = *next; 160 if (*next != NULL) 161 *next_prev = *prev; 162 } 163 ASSERT(hash->ilb_conn_cnt > 0); 164 hash->ilb_conn_cnt--; 165 166 *next = NULL; 167 *prev = NULL; 168 } 169 170 static void 171 ilb_conn_remove(ilb_conn_t *connp) 172 { 173 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); 174 ilb_conn_remove_common(connp, B_TRUE); 175 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); 176 ilb_conn_remove_common(connp, 
B_FALSE); 177 178 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) { 179 in_port_t port; 180 181 port = ntohs(connp->conn_rule_cache.info.nat_sport); 182 vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena, 183 (void *)(uintptr_t)port, 1); 184 } 185 186 if (connp->conn_sticky != NULL) 187 ILB_STICKY_REFRELE(connp->conn_sticky); 188 ILB_SERVER_REFRELE(connp->conn_server); 189 kmem_cache_free(ilb_conn_cache, connp); 190 } 191 192 /* 193 * Routine to do periodic garbage collection of conn hash entries. When 194 * a conn hash timer fires, it dispatches a taskq to call this function 195 * to do the gc. Note that each taskq is responisble for a portion of 196 * the table. The portion is stored in timer->start, timer->end. 197 */ 198 static void 199 ilb_conn_cleanup(void *arg) 200 { 201 ilb_timer_t *timer = (ilb_timer_t *)arg; 202 uint32_t i; 203 ilb_stack_t *ilbs; 204 ilb_conn_hash_t *c2s_hash, *s2c_hash; 205 ilb_conn_t *connp, *nxt_connp; 206 int64_t now; 207 int64_t expiry; 208 boolean_t die_now; 209 210 ilbs = timer->ilbs; 211 c2s_hash = ilbs->ilbs_c2s_conn_hash; 212 ASSERT(c2s_hash != NULL); 213 214 now = ddi_get_lbolt64(); 215 for (i = timer->start; i < timer->end; i++) { 216 mutex_enter(&c2s_hash[i].ilb_conn_hash_lock); 217 if ((connp = c2s_hash[i].ilb_connp) == NULL) { 218 ASSERT(c2s_hash[i].ilb_conn_cnt == 0); 219 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock); 220 continue; 221 } 222 do { 223 ASSERT(c2s_hash[i].ilb_conn_cnt > 0); 224 ASSERT(connp->conn_c2s_hash == &c2s_hash[i]); 225 nxt_connp = connp->conn_c2s_next; 226 expiry = now - SEC_TO_TICK(connp->conn_expiry); 227 if (connp->conn_server->iser_die_time != 0 && 228 connp->conn_server->iser_die_time < now) 229 die_now = B_TRUE; 230 else 231 die_now = B_FALSE; 232 s2c_hash = connp->conn_s2c_hash; 233 mutex_enter(&s2c_hash->ilb_conn_hash_lock); 234 235 if (connp->conn_gc || die_now || 236 (connp->conn_c2s_atime < expiry && 237 connp->conn_s2c_atime < expiry)) { 238 /* Need to update the nat list 
cur_connp */ 239 if (connp == ilbs->ilbs_conn_list_connp) { 240 ilbs->ilbs_conn_list_connp = 241 connp->conn_c2s_next; 242 } 243 ilb_conn_remove(connp); 244 goto nxt_connp; 245 } 246 247 if (connp->conn_l4 != IPPROTO_TCP) 248 goto nxt_connp; 249 250 /* Update and check TCP related conn info */ 251 if (connp->conn_c2s_tcp_fin_sent && 252 SEQ_GT(connp->conn_s2c_tcp_ack, 253 connp->conn_c2s_tcp_fss)) { 254 connp->conn_c2s_tcp_fin_acked = B_TRUE; 255 } 256 if (connp->conn_s2c_tcp_fin_sent && 257 SEQ_GT(connp->conn_c2s_tcp_ack, 258 connp->conn_s2c_tcp_fss)) { 259 connp->conn_s2c_tcp_fin_acked = B_TRUE; 260 } 261 if (connp->conn_c2s_tcp_fin_acked && 262 connp->conn_s2c_tcp_fin_acked) { 263 ilb_conn_remove(connp); 264 } 265 nxt_connp: 266 mutex_exit(&s2c_hash->ilb_conn_hash_lock); 267 connp = nxt_connp; 268 } while (connp != NULL); 269 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock); 270 } 271 } 272 273 /* Conn hash timer routine. It dispatches a taskq and restart the timer */ 274 static void 275 ilb_conn_timer(void *arg) 276 { 277 ilb_timer_t *timer = (ilb_timer_t *)arg; 278 279 (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup, 280 arg, TQ_SLEEP); 281 mutex_enter(&timer->tid_lock); 282 if (timer->tid == 0) { 283 mutex_exit(&timer->tid_lock); 284 } else { 285 timer->tid = timeout(ilb_conn_timer, arg, 286 SEC_TO_TICK(ilb_conn_cache_timeout)); 287 mutex_exit(&timer->tid_lock); 288 } 289 } 290 291 void 292 ilb_conn_hash_init(ilb_stack_t *ilbs) 293 { 294 extern pri_t minclsyspri; 295 int i, part; 296 ilb_timer_t *tm; 297 char tq_name[TASKQ_NAMELEN]; 298 299 /* 300 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to 301 * the next power of 2. 
302 */ 303 if (ilbs->ilbs_conn_hash_size & (ilbs->ilbs_conn_hash_size - 1)) { 304 for (i = 0; i < 31; i++) { 305 if (ilbs->ilbs_conn_hash_size < (1 << i)) 306 break; 307 } 308 ilbs->ilbs_conn_hash_size = 1 << i; 309 } 310 311 /* 312 * Can sleep since this should be called when a rule is being added, 313 * hence we are not in interrupt context. 314 */ 315 ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) * 316 ilbs->ilbs_conn_hash_size, KM_SLEEP); 317 ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) * 318 ilbs->ilbs_conn_hash_size, KM_SLEEP); 319 320 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 321 mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock, 322 NULL, MUTEX_DEFAULT, NULL); 323 } 324 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 325 mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock, 326 NULL, MUTEX_DEFAULT, NULL); 327 } 328 329 if (ilb_conn_cache == NULL) 330 ilb_conn_cache_init(); 331 332 (void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p", 333 (void *)ilbs->ilbs_netstack); 334 ASSERT(ilbs->ilbs_conn_taskq == NULL); 335 ilbs->ilbs_conn_taskq = taskq_create(tq_name, 336 ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size, 337 ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 338 339 ASSERT(ilbs->ilbs_conn_timer_list == NULL); 340 ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * 341 ilb_conn_timer_size, KM_SLEEP); 342 343 /* 344 * The hash table is divided in equal partition for those timers 345 * to do garbage collection. 346 */ 347 part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1; 348 for (i = 0; i < ilb_conn_timer_size; i++) { 349 tm = ilbs->ilbs_conn_timer_list + i; 350 tm->start = i * part; 351 tm->end = i * part + part; 352 if (tm->end > ilbs->ilbs_conn_hash_size) 353 tm->end = ilbs->ilbs_conn_hash_size; 354 tm->ilbs = ilbs; 355 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); 356 /* Spread out the starting execution time of all the timers. 
*/ 357 tm->tid = timeout(ilb_conn_timer, tm, 358 SEC_TO_TICK(ilb_conn_cache_timeout + i)); 359 } 360 } 361 362 void 363 ilb_conn_hash_fini(ilb_stack_t *ilbs) 364 { 365 uint32_t i; 366 ilb_conn_t *connp; 367 368 if (ilbs->ilbs_c2s_conn_hash == NULL) { 369 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); 370 return; 371 } 372 373 /* Stop all the timers first. */ 374 for (i = 0; i < ilb_conn_timer_size; i++) { 375 timeout_id_t tid; 376 377 /* Setting tid to 0 tells the timer handler not to restart. */ 378 mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock); 379 tid = ilbs->ilbs_conn_timer_list[i].tid; 380 ilbs->ilbs_conn_timer_list[i].tid = 0; 381 mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock); 382 (void) untimeout(tid); 383 } 384 kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) * 385 ilb_conn_timer_size); 386 taskq_destroy(ilbs->ilbs_conn_taskq); 387 ilbs->ilbs_conn_taskq = NULL; 388 389 /* Then remove all the conns. */ 390 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 391 while ((connp = ilbs->ilbs_s2c_conn_hash->ilb_connp) != NULL) { 392 ilbs->ilbs_s2c_conn_hash->ilb_connp = 393 connp->conn_s2c_next; 394 ILB_SERVER_REFRELE(connp->conn_server); 395 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) { 396 ilb_nat_src_entry_t *ent; 397 in_port_t port; 398 399 /* 400 * src_ent will be freed in ilb_nat_src_fini(). 401 */ 402 port = ntohs( 403 connp->conn_rule_cache.info.nat_sport); 404 ent = connp->conn_rule_cache.info.src_ent; 405 vmem_free(ent->nse_port_arena, 406 (void *)(uintptr_t)port, 1); 407 } 408 kmem_cache_free(ilb_conn_cache, connp); 409 } 410 } 411 kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) * 412 ilbs->ilbs_conn_hash_size); 413 kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) * 414 ilbs->ilbs_conn_hash_size); 415 } 416 417 /* 418 * Internet checksum adjustment calculation routines. 
We pre-calculate 419 * checksum adjustment so that we don't need to compute the checksum on 420 * the whole packet when we change address/port in the packet. 421 */ 422 423 static void 424 hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port, 425 in_port_t new_port, uint32_t *adj_sum) 426 { 427 uint32_t sum; 428 429 sum = *oaddr + *(oaddr + 1) + old_port; 430 while ((sum >> 16) != 0) 431 sum = (sum & 0xffff) + (sum >> 16); 432 *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port; 433 } 434 435 static void 436 hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port, 437 in_port_t new_port, uint32_t *adj_sum) 438 { 439 uint32_t sum = 0; 440 441 sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) + 442 *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) + 443 old_port; 444 while ((sum >> 16) != 0) 445 sum = (sum & 0xffff) + (sum >> 16); 446 *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + 447 *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) + 448 *(naddr + 6) + *(naddr + 7) + new_port; 449 } 450 451 static void 452 fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1, 453 uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2, 454 in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum) 455 { 456 uint32_t sum; 457 458 sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) + 459 old_port2; 460 while ((sum >> 16) != 0) 461 sum = (sum & 0xffff) + (sum >> 16); 462 *adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 + 463 *naddr2 + *(naddr2 + 1) + new_port2; 464 } 465 466 static void 467 fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1, 468 uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2, 469 in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum) 470 { 471 uint32_t sum = 0; 472 473 sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) + 474 *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) + 475 old_port1; 476 sum += 
*oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) + 477 *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) + 478 old_port2; 479 while ((sum >> 16) != 0) 480 sum = (sum & 0xffff) + (sum >> 16); 481 sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) + 482 *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) + 483 *(naddr1 + 7) + new_port1; 484 *adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) + 485 *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) + 486 *(naddr2 + 7) + new_port2; 487 } 488 489 /* 490 * Add a conn hash entry to the tables. Note that a conn hash entry 491 * (ilb_conn_t) contains info on both directions. And there are two hash 492 * tables, one for client to server and the other for server to client. 493 * So the same entry is added to both tables and can be ccessed by two 494 * thread simultaneously. But each thread will only access data on one 495 * direction, so there is no conflict. 496 */ 497 int 498 ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server, 499 in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport, 500 ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s) 501 { 502 ilb_conn_t *connp; 503 ilb_conn_hash_t *hash; 504 int i; 505 506 connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP); 507 if (connp == NULL) { 508 if (s != NULL) { 509 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) { 510 ilb_nat_src_entry_t **entry; 511 512 entry = s->server->iser_nat_src->src_list; 513 vmem_free(entry[s->nat_src_idx]->nse_port_arena, 514 (void *)(uintptr_t)ntohs(info->nat_sport), 515 1); 516 } 517 ILB_STICKY_REFRELE(s); 518 } 519 return (ENOMEM); 520 } 521 522 connp->conn_l4 = rule->ir_proto; 523 524 connp->conn_server = server; 525 ILB_SERVER_REFHOLD(server); 526 connp->conn_sticky = s; 527 528 connp->conn_rule_cache.topo = rule->ir_topo; 529 connp->conn_rule_cache.info = *info; 530 531 connp->conn_gc = B_FALSE; 532 533 connp->conn_expiry = 
rule->ir_nat_expiry; 534 connp->conn_cr_time = ddi_get_lbolt64(); 535 536 /* Client to server info. */ 537 connp->conn_c2s_saddr = *src; 538 connp->conn_c2s_sport = sport; 539 connp->conn_c2s_daddr = *dst; 540 connp->conn_c2s_dport = dport; 541 542 connp->conn_c2s_atime = ddi_get_lbolt64(); 543 /* The packet ths triggers this creation should be counted */ 544 connp->conn_c2s_pkt_cnt = 1; 545 connp->conn_c2s_tcp_fin_sent = B_FALSE; 546 connp->conn_c2s_tcp_fin_acked = B_FALSE; 547 548 /* Server to client info, before NAT */ 549 switch (rule->ir_topo) { 550 case ILB_TOPO_IMPL_HALF_NAT: 551 connp->conn_s2c_saddr = info->nat_dst; 552 connp->conn_s2c_sport = info->nat_dport; 553 connp->conn_s2c_daddr = *src; 554 connp->conn_s2c_dport = sport; 555 556 /* Pre-calculate checksum changes for both directions */ 557 if (rule->ir_ipver == IPPROTO_IP) { 558 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3], 559 (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0, 560 &connp->conn_c2s_ip_sum); 561 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3], 562 (uint16_t *)&info->nat_dst.s6_addr32[3], dport, 563 info->nat_dport, &connp->conn_c2s_tp_sum); 564 *ip_sum = connp->conn_c2s_ip_sum; 565 *tp_sum = connp->conn_c2s_tp_sum; 566 567 hnat_cksum_v4( 568 (uint16_t *)&info->nat_dst.s6_addr32[3], 569 (uint16_t *)&dst->s6_addr32[3], 0, 0, 570 &connp->conn_s2c_ip_sum); 571 hnat_cksum_v4( 572 (uint16_t *)&info->nat_dst.s6_addr32[3], 573 (uint16_t *)&dst->s6_addr32[3], 574 info->nat_dport, dport, 575 &connp->conn_s2c_tp_sum); 576 } else { 577 connp->conn_c2s_ip_sum = 0; 578 hnat_cksum_v6((uint16_t *)dst, 579 (uint16_t *)&info->nat_dst, dport, 580 info->nat_dport, &connp->conn_c2s_tp_sum); 581 *ip_sum = 0; 582 *tp_sum = connp->conn_c2s_tp_sum; 583 584 connp->conn_s2c_ip_sum = 0; 585 hnat_cksum_v6((uint16_t *)&info->nat_dst, 586 (uint16_t *)dst, info->nat_dport, dport, 587 &connp->conn_s2c_tp_sum); 588 } 589 break; 590 case ILB_TOPO_IMPL_NAT: 591 connp->conn_s2c_saddr = info->nat_dst; 592 connp->conn_s2c_sport = 
info->nat_dport; 593 connp->conn_s2c_daddr = info->nat_src; 594 connp->conn_s2c_dport = info->nat_sport; 595 596 if (rule->ir_ipver == IPPROTO_IP) { 597 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3], 598 (uint16_t *)&dst->s6_addr32[3], 599 (uint16_t *)&info->nat_src.s6_addr32[3], 600 (uint16_t *)&info->nat_dst.s6_addr32[3], 601 0, 0, 0, 0, &connp->conn_c2s_ip_sum); 602 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3], 603 (uint16_t *)&dst->s6_addr32[3], 604 (uint16_t *)&info->nat_src.s6_addr32[3], 605 (uint16_t *)&info->nat_dst.s6_addr32[3], 606 sport, dport, info->nat_sport, 607 info->nat_dport, &connp->conn_c2s_tp_sum); 608 *ip_sum = connp->conn_c2s_ip_sum; 609 *tp_sum = connp->conn_c2s_tp_sum; 610 611 fnat_cksum_v4( 612 (uint16_t *)&info->nat_src.s6_addr32[3], 613 (uint16_t *)&info->nat_dst.s6_addr32[3], 614 (uint16_t *)&src->s6_addr32[3], 615 (uint16_t *)&dst->s6_addr32[3], 616 0, 0, 0, 0, &connp->conn_s2c_ip_sum); 617 fnat_cksum_v4( 618 (uint16_t *)&info->nat_src.s6_addr32[3], 619 (uint16_t *)&info->nat_dst.s6_addr32[3], 620 (uint16_t *)&src->s6_addr32[3], 621 (uint16_t *)&dst->s6_addr32[3], 622 info->nat_sport, info->nat_dport, 623 sport, dport, &connp->conn_s2c_tp_sum); 624 } else { 625 fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst, 626 (uint16_t *)&info->nat_src, 627 (uint16_t *)&info->nat_dst, 628 sport, dport, info->nat_sport, 629 info->nat_dport, &connp->conn_c2s_tp_sum); 630 connp->conn_c2s_ip_sum = 0; 631 *ip_sum = 0; 632 *tp_sum = connp->conn_c2s_tp_sum; 633 634 fnat_cksum_v6((uint16_t *)&info->nat_src, 635 (uint16_t *)&info->nat_dst, (uint16_t *)src, 636 (uint16_t *)dst, info->nat_sport, 637 info->nat_dport, sport, dport, 638 &connp->conn_s2c_tp_sum); 639 connp->conn_s2c_ip_sum = 0; 640 } 641 break; 642 } 643 644 connp->conn_s2c_atime = ddi_get_lbolt64(); 645 connp->conn_s2c_pkt_cnt = 1; 646 connp->conn_s2c_tcp_fin_sent = B_FALSE; 647 connp->conn_s2c_tcp_fin_acked = B_FALSE; 648 649 /* Add it to the s2c hash table. 
*/ 650 hash = ilbs->ilbs_s2c_conn_hash; 651 i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3], 652 ntohs(connp->conn_s2c_sport), 653 (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3], 654 ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size); 655 connp->conn_s2c_hash = &hash[i]; 656 DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i); 657 658 mutex_enter(&hash[i].ilb_conn_hash_lock); 659 hash[i].ilb_conn_cnt++; 660 connp->conn_s2c_next = hash[i].ilb_connp; 661 if (hash[i].ilb_connp != NULL) 662 hash[i].ilb_connp->conn_s2c_prev = connp; 663 connp->conn_s2c_prev = NULL; 664 hash[i].ilb_connp = connp; 665 mutex_exit(&hash[i].ilb_conn_hash_lock); 666 667 /* Add it to the c2s hash table. */ 668 hash = ilbs->ilbs_c2s_conn_hash; 669 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport), 670 (uint8_t *)&dst->s6_addr32[3], ntohs(dport), 671 ilbs->ilbs_conn_hash_size); 672 connp->conn_c2s_hash = &hash[i]; 673 DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i); 674 675 mutex_enter(&hash[i].ilb_conn_hash_lock); 676 hash[i].ilb_conn_cnt++; 677 connp->conn_c2s_next = hash[i].ilb_connp; 678 if (hash[i].ilb_connp != NULL) 679 hash[i].ilb_connp->conn_c2s_prev = connp; 680 connp->conn_c2s_prev = NULL; 681 hash[i].ilb_connp = connp; 682 mutex_exit(&hash[i].ilb_conn_hash_lock); 683 684 return (0); 685 } 686 687 /* 688 * If a connection is using TCP, we keep track of simple TCP state transition 689 * so that we know when to clean up an entry. 
690 */ 691 static boolean_t 692 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len, 693 boolean_t c2s) 694 { 695 uint32_t ack, seq; 696 int32_t seg_len; 697 698 if (tcpha->tha_flags & TH_RST) 699 return (B_FALSE); 700 701 seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) - 702 TCP_HDR_LENGTH((tcph_t *)tcpha); 703 704 if (tcpha->tha_flags & TH_ACK) 705 ack = ntohl(tcpha->tha_ack); 706 seq = ntohl(tcpha->tha_seq); 707 if (c2s) { 708 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); 709 if (tcpha->tha_flags & TH_FIN) { 710 connp->conn_c2s_tcp_fss = seq + seg_len; 711 connp->conn_c2s_tcp_fin_sent = B_TRUE; 712 } 713 connp->conn_c2s_tcp_ack = ack; 714 715 /* Port reuse by the client, restart the conn. */ 716 if (connp->conn_c2s_tcp_fin_sent && 717 SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) { 718 connp->conn_c2s_tcp_fin_sent = B_FALSE; 719 connp->conn_c2s_tcp_fin_acked = B_FALSE; 720 } 721 } else { 722 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); 723 if (tcpha->tha_flags & TH_FIN) { 724 connp->conn_s2c_tcp_fss = seq + seg_len; 725 connp->conn_s2c_tcp_fin_sent = B_TRUE; 726 } 727 connp->conn_s2c_tcp_ack = ack; 728 729 /* Port reuse by the client, restart the conn. */ 730 if (connp->conn_s2c_tcp_fin_sent && 731 SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) { 732 connp->conn_s2c_tcp_fin_sent = B_FALSE; 733 connp->conn_s2c_tcp_fin_acked = B_FALSE; 734 } 735 } 736 737 return (B_TRUE); 738 } 739 740 /* 741 * Helper routint to find conn hash entry given some packet information and 742 * the traffic direction (c2s, client to server?) 
743 */ 744 static boolean_t 745 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src, 746 in_port_t sport, in6_addr_t *dst, in_port_t dport, 747 ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum, 748 int32_t pkt_len, boolean_t c2s) 749 { 750 ilb_conn_hash_t *hash; 751 uint_t i; 752 ilb_conn_t *connp; 753 boolean_t tcp_alive; 754 boolean_t ret = B_FALSE; 755 756 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport), 757 (uint8_t *)&dst->s6_addr32[3], ntohs(dport), 758 ilbs->ilbs_conn_hash_size); 759 if (c2s) { 760 hash = ilbs->ilbs_c2s_conn_hash; 761 mutex_enter(&hash[i].ilb_conn_hash_lock); 762 for (connp = hash[i].ilb_connp; connp != NULL; 763 connp = connp->conn_c2s_next) { 764 if (connp->conn_l4 == l4 && 765 connp->conn_c2s_dport == dport && 766 connp->conn_c2s_sport == sport && 767 IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) && 768 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) { 769 connp->conn_c2s_atime = ddi_get_lbolt64(); 770 connp->conn_c2s_pkt_cnt++; 771 *rule_cache = connp->conn_rule_cache; 772 *ip_sum = connp->conn_c2s_ip_sum; 773 *tp_sum = connp->conn_c2s_tp_sum; 774 ret = B_TRUE; 775 break; 776 } 777 } 778 } else { 779 hash = ilbs->ilbs_s2c_conn_hash; 780 mutex_enter(&hash[i].ilb_conn_hash_lock); 781 for (connp = hash[i].ilb_connp; connp != NULL; 782 connp = connp->conn_s2c_next) { 783 if (connp->conn_l4 == l4 && 784 connp->conn_s2c_dport == dport && 785 connp->conn_s2c_sport == sport && 786 IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) && 787 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) { 788 connp->conn_s2c_atime = ddi_get_lbolt64(); 789 connp->conn_s2c_pkt_cnt++; 790 *rule_cache = connp->conn_rule_cache; 791 *ip_sum = connp->conn_s2c_ip_sum; 792 *tp_sum = connp->conn_s2c_tp_sum; 793 ret = B_TRUE; 794 break; 795 } 796 } 797 } 798 if (ret) { 799 ILB_S_KSTAT(connp->conn_server, pkt_processed); 800 ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed, 801 pkt_len); 802 803 switch (l4) { 804 
case (IPPROTO_TCP): 805 tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len, 806 c2s); 807 if (!tcp_alive) { 808 connp->conn_gc = B_TRUE; 809 } 810 break; 811 default: 812 break; 813 } 814 } 815 mutex_exit(&hash[i].ilb_conn_hash_lock); 816 817 return (ret); 818 } 819 820 /* 821 * To check if a give packet matches an existing conn hash entry. If it 822 * does, return the information about this entry so that the caller can 823 * do the proper NAT. 824 */ 825 boolean_t 826 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph, 827 in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport, 828 uint32_t pkt_len, in6_addr_t *lb_dst) 829 { 830 ilb_rule_info_t rule_cache; 831 uint32_t adj_ip_sum, adj_tp_sum; 832 boolean_t ret; 833 834 /* Check the incoming hash table. */ 835 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport, 836 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) { 837 switch (rule_cache.topo) { 838 case ILB_TOPO_IMPL_NAT: 839 *lb_dst = rule_cache.info.nat_dst; 840 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info, 841 adj_ip_sum, adj_tp_sum, B_TRUE); 842 ret = B_TRUE; 843 break; 844 case ILB_TOPO_IMPL_HALF_NAT: 845 *lb_dst = rule_cache.info.nat_dst; 846 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info, 847 adj_ip_sum, adj_tp_sum, B_TRUE); 848 ret = B_TRUE; 849 break; 850 default: 851 ret = B_FALSE; 852 break; 853 } 854 return (ret); 855 } 856 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport, 857 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) { 858 switch (rule_cache.topo) { 859 case ILB_TOPO_IMPL_NAT: 860 *lb_dst = rule_cache.info.src; 861 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info, 862 adj_ip_sum, adj_tp_sum, B_FALSE); 863 ret = B_TRUE; 864 break; 865 case ILB_TOPO_IMPL_HALF_NAT: 866 *lb_dst = *dst; 867 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info, 868 adj_ip_sum, adj_tp_sum, B_FALSE); 869 ret = B_TRUE; 870 break; 871 default: 872 ret = B_FALSE; 873 break; 874 } 875 return (ret); 876 
} 877 878 return (B_FALSE); 879 } 880 881 /* 882 * To check if an ICMP packet belongs to a connection in one of the conn 883 * hash entries. 884 */ 885 boolean_t 886 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph, 887 void *icmph, in6_addr_t *lb_dst) 888 { 889 ilb_conn_hash_t *hash; 890 ipha_t *in_iph4; 891 ip6_t *in_iph6; 892 icmph_t *icmph4; 893 icmp6_t *icmph6; 894 in6_addr_t *in_src_p, *in_dst_p; 895 in_port_t *sport, *dport; 896 int l4; 897 uint_t i; 898 ilb_conn_t *connp; 899 ilb_rule_info_t rule_cache; 900 uint32_t adj_ip_sum; 901 boolean_t full_nat; 902 903 if (l3 == IPPROTO_IP) { 904 in6_addr_t in_src, in_dst; 905 906 icmph4 = (icmph_t *)icmph; 907 in_iph4 = (ipha_t *)&icmph4[1]; 908 909 if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) + 910 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) { 911 return (B_FALSE); 912 } 913 914 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src); 915 in_src_p = &in_src; 916 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst); 917 in_dst_p = &in_dst; 918 919 l4 = in_iph4->ipha_protocol; 920 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP) 921 return (B_FALSE); 922 923 sport = (in_port_t *)((char *)in_iph4 + 924 IPH_HDR_LENGTH(in_iph4)); 925 dport = sport + 1; 926 927 DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t, 928 in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t, 929 ntohs(*sport), uint16_t, ntohs(*dport)); 930 } else { 931 ASSERT(l3 == IPPROTO_IPV6); 932 933 icmph6 = (icmp6_t *)icmph; 934 in_iph6 = (ip6_t *)&icmph6[1]; 935 in_src_p = &in_iph6->ip6_src; 936 in_dst_p = &in_iph6->ip6_dst; 937 938 if ((uint8_t *)in_iph6 + sizeof (ip6_t) + 939 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) { 940 return (B_FALSE); 941 } 942 943 l4 = in_iph6->ip6_nxt; 944 /* We don't go deep inside an IPv6 packet yet. 
*/ 945 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP) 946 return (B_FALSE); 947 948 sport = (in_port_t *)&in_iph6[1]; 949 dport = sport + 1; 950 951 DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *, 952 &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst, 953 uint16_t, ntohs(*sport), uint16_t, ntohs(*dport)); 954 } 955 956 i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport), 957 (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport), 958 ilbs->ilbs_conn_hash_size); 959 hash = ilbs->ilbs_c2s_conn_hash; 960 961 mutex_enter(&hash[i].ilb_conn_hash_lock); 962 for (connp = hash[i].ilb_connp; connp != NULL; 963 connp = connp->conn_c2s_next) { 964 if (connp->conn_l4 == l4 && 965 connp->conn_c2s_dport == *sport && 966 connp->conn_c2s_sport == *dport && 967 IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) && 968 IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) { 969 connp->conn_c2s_atime = ddi_get_lbolt64(); 970 connp->conn_c2s_pkt_cnt++; 971 rule_cache = connp->conn_rule_cache; 972 adj_ip_sum = connp->conn_c2s_ip_sum; 973 break; 974 } 975 } 976 mutex_exit(&hash[i].ilb_conn_hash_lock); 977 978 if (connp == NULL) { 979 DTRACE_PROBE(ilb__chk__icmp__conn__failed); 980 return (B_FALSE); 981 } 982 983 switch (rule_cache.topo) { 984 case ILB_TOPO_IMPL_NAT: 985 full_nat = B_TRUE; 986 break; 987 case ILB_TOPO_IMPL_HALF_NAT: 988 full_nat = B_FALSE; 989 break; 990 default: 991 return (B_FALSE); 992 } 993 994 *lb_dst = rule_cache.info.nat_dst; 995 if (l3 == IPPROTO_IP) { 996 ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport, 997 &rule_cache.info, adj_ip_sum, full_nat); 998 } else { 999 ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport, 1000 &rule_cache.info, full_nat); 1001 } 1002 return (B_TRUE); 1003 } 1004 1005 /* 1006 * This routine sends up the conn hash table to user land. Note that the 1007 * request is an ioctl, hence we cannot really differentiate requests 1008 * from different clients. 
There is no context shared between different 1009 * ioctls. Here we make the assumption that the user land ilbd will 1010 * only allow one client to show the conn hash table at any time. 1011 * Otherwise, the results will be "very" inconsistent. 1012 * 1013 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants 1014 * to read from the beginning of the able. After a certain entries 1015 * are reported, the kernel remembers the position of the last returned 1016 * entry. When the next ioctl comes in with the ILB_LIST_BEGIN flag, 1017 * it will return entries starting from where it was left off. When 1018 * the end of table is reached, a flag (ILB_LIST_END) is set to tell 1019 * the client that there is no more entry. 1020 * 1021 * It is assumed that the caller has checked the size of nat so that it 1022 * can hold num entries. 1023 */ 1024 /* ARGSUSED */ 1025 int 1026 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat, 1027 uint32_t *num, uint32_t *flags) 1028 { 1029 ilb_conn_hash_t *hash; 1030 ilb_conn_t *cur_connp; 1031 uint32_t i, j; 1032 int ret = 0; 1033 1034 mutex_enter(&ilbs->ilbs_conn_list_lock); 1035 while (ilbs->ilbs_conn_list_busy) { 1036 if (cv_wait_sig(&ilbs->ilbs_conn_list_cv, 1037 &ilbs->ilbs_conn_list_lock) == 0) { 1038 mutex_exit(&ilbs->ilbs_conn_list_lock); 1039 return (EINTR); 1040 } 1041 } 1042 if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) { 1043 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); 1044 mutex_exit(&ilbs->ilbs_conn_list_lock); 1045 *num = 0; 1046 *flags |= ILB_LIST_END; 1047 return (0); 1048 } 1049 ilbs->ilbs_conn_list_busy = B_TRUE; 1050 mutex_exit(&ilbs->ilbs_conn_list_lock); 1051 1052 if (*flags & ILB_LIST_BEGIN) { 1053 i = 0; 1054 mutex_enter(&hash[0].ilb_conn_hash_lock); 1055 cur_connp = hash[0].ilb_connp; 1056 } else if (*flags & ILB_LIST_CONT) { 1057 if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) { 1058 *num = 0; 1059 *flags |= ILB_LIST_END; 1060 goto done; 1061 } 1062 i = 
ilbs->ilbs_conn_list_cur; 1063 mutex_enter(&hash[i].ilb_conn_hash_lock); 1064 cur_connp = ilbs->ilbs_conn_list_connp; 1065 } else { 1066 ret = EINVAL; 1067 goto done; 1068 } 1069 1070 j = 0; 1071 while (j < *num) { 1072 if (cur_connp == NULL) { 1073 mutex_exit(&hash[i].ilb_conn_hash_lock); 1074 if (++i == ilbs->ilbs_conn_hash_size) { 1075 *flags |= ILB_LIST_END; 1076 break; 1077 } 1078 mutex_enter(&hash[i].ilb_conn_hash_lock); 1079 cur_connp = hash[i].ilb_connp; 1080 continue; 1081 } 1082 nat[j].proto = cur_connp->conn_l4; 1083 1084 nat[j].in_global = cur_connp->conn_c2s_daddr; 1085 nat[j].in_global_port = cur_connp->conn_c2s_dport; 1086 nat[j].out_global = cur_connp->conn_c2s_saddr; 1087 nat[j].out_global_port = cur_connp->conn_c2s_sport; 1088 1089 nat[j].in_local = cur_connp->conn_s2c_saddr; 1090 nat[j].in_local_port = cur_connp->conn_s2c_sport; 1091 nat[j].out_local = cur_connp->conn_s2c_daddr; 1092 nat[j].out_local_port = cur_connp->conn_s2c_dport; 1093 1094 nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time); 1095 nat[j].last_access_time = 1096 TICK_TO_MSEC(cur_connp->conn_c2s_atime); 1097 1098 /* 1099 * The conn_s2c_pkt_cnt may not be accurate since we are not 1100 * holding the s2c hash lock. 1101 */ 1102 nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt + 1103 cur_connp->conn_s2c_pkt_cnt; 1104 j++; 1105 1106 cur_connp = cur_connp->conn_c2s_next; 1107 } 1108 ilbs->ilbs_conn_list_connp = cur_connp; 1109 if (j == *num) 1110 mutex_exit(&hash[i].ilb_conn_hash_lock); 1111 1112 ilbs->ilbs_conn_list_cur = i; 1113 1114 *num = j; 1115 done: 1116 mutex_enter(&ilbs->ilbs_conn_list_lock); 1117 ilbs->ilbs_conn_list_busy = B_FALSE; 1118 cv_signal(&ilbs->ilbs_conn_list_cv); 1119 mutex_exit(&ilbs->ilbs_conn_list_lock); 1120 1121 return (ret); 1122 } 1123 1124 1125 /* 1126 * Stickiness (persistence) handling routines. 
1127 */ 1128 1129 1130 static void 1131 ilb_sticky_cache_init(void) 1132 { 1133 ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache", 1134 sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL, 1135 ilb_kmem_flags); 1136 } 1137 1138 void 1139 ilb_sticky_cache_fini(void) 1140 { 1141 if (ilb_sticky_cache != NULL) { 1142 kmem_cache_destroy(ilb_sticky_cache); 1143 ilb_sticky_cache = NULL; 1144 } 1145 } 1146 1147 void 1148 ilb_sticky_refrele(ilb_sticky_t *s) 1149 { 1150 ILB_STICKY_REFRELE(s); 1151 } 1152 1153 static ilb_sticky_t * 1154 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src) 1155 { 1156 ilb_sticky_t *s; 1157 1158 ASSERT(mutex_owned(&hash->sticky_lock)); 1159 1160 for (s = list_head(&hash->sticky_head); s != NULL; 1161 s = list_next(&hash->sticky_head, s)) { 1162 if (s->rule_instance == rule->ir_ks_instance) { 1163 if (IN6_ARE_ADDR_EQUAL(src, &s->src)) 1164 return (s); 1165 } 1166 } 1167 return (NULL); 1168 } 1169 1170 static ilb_sticky_t * 1171 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server, 1172 in6_addr_t *src) 1173 { 1174 ilb_sticky_t *s; 1175 1176 ASSERT(mutex_owned(&hash->sticky_lock)); 1177 1178 if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL) 1179 return (NULL); 1180 1181 /* 1182 * The rule instance is for handling the scenario when the same 1183 * client talks to different rules at the same time. Stickiness 1184 * is per rule so we can use the rule instance to differentiate 1185 * the client's request. 1186 */ 1187 s->rule_instance = rule->ir_ks_instance; 1188 /* 1189 * Copy the rule name for listing all sticky cache entry. ir_name 1190 * is guaranteed to be NULL terminated. 1191 */ 1192 (void) strcpy(s->rule_name, rule->ir_name); 1193 s->server = server; 1194 1195 /* 1196 * Grab a ref cnt on the server so that it won't go away while 1197 * it is still in the sticky table. 
1198 */ 1199 ILB_SERVER_REFHOLD(server); 1200 s->src = *src; 1201 s->expiry = rule->ir_sticky_expiry; 1202 s->refcnt = 1; 1203 s->hash = hash; 1204 1205 /* 1206 * There is no need to set atime here since the refcnt is not 1207 * zero. A sticky entry is removed only when the refcnt is 1208 * zero. But just set it here for debugging purpose. The 1209 * atime is set when a refrele is done on a sticky entry. 1210 */ 1211 s->atime = ddi_get_lbolt64(); 1212 1213 list_insert_head(&hash->sticky_head, s); 1214 hash->sticky_cnt++; 1215 return (s); 1216 } 1217 1218 /* 1219 * This routine checks if there is an existing sticky entry which matches 1220 * a given packet. If there is one, return it. If there is not, create 1221 * a sticky entry using the packet's info. 1222 */ 1223 ilb_server_t * 1224 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src, 1225 ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx) 1226 { 1227 int i; 1228 ilb_sticky_hash_t *hash; 1229 ilb_sticky_t *s; 1230 1231 ASSERT(server != NULL); 1232 1233 *res = NULL; 1234 1235 i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3], 1236 (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size); 1237 hash = &ilbs->ilbs_sticky_hash[i]; 1238 1239 /* First check if there is already an entry. */ 1240 mutex_enter(&hash->sticky_lock); 1241 s = ilb_sticky_lookup(hash, rule, src); 1242 1243 /* No sticky entry, add one. */ 1244 if (s == NULL) { 1245 add_new_entry: 1246 s = ilb_sticky_add(hash, rule, server, src); 1247 if (s == NULL) { 1248 mutex_exit(&hash->sticky_lock); 1249 return (NULL); 1250 } 1251 /* 1252 * Find a source for this server. All subseqent requests from 1253 * the same client matching this sticky entry will use this 1254 * source address in doing NAT. The current algorithm is 1255 * simple, rotate the source address. Note that the 1256 * source address array does not change after it's created, so 1257 * it is OK to just increment the cur index. 
1258 */ 1259 if (server->iser_nat_src != NULL) { 1260 /* It is a hint, does not need to be atomic. */ 1261 *src_ent_idx = (server->iser_nat_src->cur++ % 1262 server->iser_nat_src->num_src); 1263 s->nat_src_idx = *src_ent_idx; 1264 } 1265 mutex_exit(&hash->sticky_lock); 1266 *res = s; 1267 return (server); 1268 } 1269 1270 /* 1271 * We don't hold any lock accessing iser_enabled. Refer to the 1272 * comment in ilb_server_add() about iser_lock. 1273 */ 1274 if (!s->server->iser_enabled) { 1275 /* 1276 * s->server == server can only happen if there is a race in 1277 * toggling the iser_enabled flag (we don't hold a lock doing 1278 * that) so that the load balance algorithm still returns a 1279 * disabled server. In this case, just drop the packet... 1280 */ 1281 if (s->server == server) { 1282 mutex_exit(&hash->sticky_lock); 1283 return (NULL); 1284 } 1285 1286 /* 1287 * The old server is disabled and there is a new server, use 1288 * the new one to create a sticky entry. Since we will 1289 * add the entry at the beginning, subsequent lookup will 1290 * find this new entry instead of the old one. 
1291 */ 1292 goto add_new_entry; 1293 } 1294 1295 s->refcnt++; 1296 *res = s; 1297 mutex_exit(&hash->sticky_lock); 1298 if (server->iser_nat_src != NULL) 1299 *src_ent_idx = s->nat_src_idx; 1300 return (s->server); 1301 } 1302 1303 static void 1304 ilb_sticky_cleanup(void *arg) 1305 { 1306 ilb_timer_t *timer = (ilb_timer_t *)arg; 1307 uint32_t i; 1308 ilb_stack_t *ilbs; 1309 ilb_sticky_hash_t *hash; 1310 ilb_sticky_t *s, *nxt_s; 1311 int64_t now, expiry; 1312 1313 ilbs = timer->ilbs; 1314 hash = ilbs->ilbs_sticky_hash; 1315 ASSERT(hash != NULL); 1316 1317 now = ddi_get_lbolt64(); 1318 for (i = timer->start; i < timer->end; i++) { 1319 mutex_enter(&hash[i].sticky_lock); 1320 for (s = list_head(&hash[i].sticky_head); s != NULL; 1321 s = nxt_s) { 1322 nxt_s = list_next(&hash[i].sticky_head, s); 1323 if (s->refcnt != 0) 1324 continue; 1325 expiry = now - SEC_TO_TICK(s->expiry); 1326 if (s->atime < expiry) { 1327 ILB_SERVER_REFRELE(s->server); 1328 list_remove(&hash[i].sticky_head, s); 1329 kmem_cache_free(ilb_sticky_cache, s); 1330 hash[i].sticky_cnt--; 1331 } 1332 } 1333 mutex_exit(&hash[i].sticky_lock); 1334 } 1335 } 1336 1337 static void 1338 ilb_sticky_timer(void *arg) 1339 { 1340 ilb_timer_t *timer = (ilb_timer_t *)arg; 1341 1342 (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq, 1343 ilb_sticky_cleanup, arg, TQ_SLEEP); 1344 mutex_enter(&timer->tid_lock); 1345 if (timer->tid == 0) { 1346 mutex_exit(&timer->tid_lock); 1347 } else { 1348 timer->tid = timeout(ilb_sticky_timer, arg, 1349 SEC_TO_TICK(ilb_sticky_timeout)); 1350 mutex_exit(&timer->tid_lock); 1351 } 1352 } 1353 1354 void 1355 ilb_sticky_hash_init(ilb_stack_t *ilbs) 1356 { 1357 extern pri_t minclsyspri; 1358 int i, part; 1359 char tq_name[TASKQ_NAMELEN]; 1360 ilb_timer_t *tm; 1361 1362 if (ilbs->ilbs_sticky_hash_size & (ilbs->ilbs_sticky_hash_size - 1)) { 1363 for (i = 0; i < 31; i++) { 1364 if (ilbs->ilbs_sticky_hash_size < (1 << i)) 1365 break; 1366 } 1367 ilbs->ilbs_sticky_hash_size = 1 << i; 1368 } 
1369 1370 ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) * 1371 ilbs->ilbs_sticky_hash_size, KM_SLEEP); 1372 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { 1373 mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL, 1374 MUTEX_DEFAULT, NULL); 1375 list_create(&ilbs->ilbs_sticky_hash[i].sticky_head, 1376 sizeof (ilb_sticky_t), 1377 offsetof(ilb_sticky_t, list)); 1378 } 1379 1380 if (ilb_sticky_cache == NULL) 1381 ilb_sticky_cache_init(); 1382 1383 (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p", 1384 (void *)ilbs->ilbs_netstack); 1385 ASSERT(ilbs->ilbs_sticky_taskq == NULL); 1386 ilbs->ilbs_sticky_taskq = taskq_create(tq_name, 1387 ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size, 1388 ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 1389 1390 ASSERT(ilbs->ilbs_sticky_timer_list == NULL); 1391 ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * 1392 ilb_sticky_timer_size, KM_SLEEP); 1393 part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1; 1394 for (i = 0; i < ilb_sticky_timer_size; i++) { 1395 tm = ilbs->ilbs_sticky_timer_list + i; 1396 tm->start = i * part; 1397 tm->end = i * part + part; 1398 if (tm->end > ilbs->ilbs_sticky_hash_size) 1399 tm->end = ilbs->ilbs_sticky_hash_size; 1400 tm->ilbs = ilbs; 1401 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); 1402 /* Spread out the starting execution time of all the timers. */ 1403 tm->tid = timeout(ilb_sticky_timer, tm, 1404 SEC_TO_TICK(ilb_sticky_timeout + i)); 1405 } 1406 } 1407 1408 void 1409 ilb_sticky_hash_fini(ilb_stack_t *ilbs) 1410 { 1411 int i; 1412 ilb_sticky_t *s; 1413 1414 if (ilbs->ilbs_sticky_hash == NULL) 1415 return; 1416 1417 /* Stop all the timers first. */ 1418 for (i = 0; i < ilb_sticky_timer_size; i++) { 1419 timeout_id_t tid; 1420 1421 /* Setting tid to 0 tells the timer handler not to restart. 
*/ 1422 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock); 1423 tid = ilbs->ilbs_sticky_timer_list[i].tid; 1424 ilbs->ilbs_sticky_timer_list[i].tid = 0; 1425 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock); 1426 (void) untimeout(tid); 1427 } 1428 kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) * 1429 ilb_sticky_timer_size); 1430 taskq_destroy(ilbs->ilbs_sticky_taskq); 1431 ilbs->ilbs_sticky_taskq = NULL; 1432 1433 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { 1434 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head)) 1435 != NULL) { 1436 list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s); 1437 ILB_SERVER_REFRELE(s->server); 1438 kmem_free(s, sizeof (ilb_sticky_t)); 1439 } 1440 } 1441 kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size * 1442 sizeof (ilb_sticky_hash_t)); 1443 } 1444 1445 /* 1446 * This routine sends up the sticky hash table to user land. Refer to 1447 * the comments before ilb_list_nat(). Both routines assume similar 1448 * conditions. 1449 * 1450 * It is assumed that the caller has checked the size of st so that it 1451 * can hold num entries. 
1452 */ 1453 /* ARGSUSED */ 1454 int 1455 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st, 1456 uint32_t *num, uint32_t *flags) 1457 { 1458 ilb_sticky_hash_t *hash; 1459 ilb_sticky_t *curp; 1460 uint32_t i, j; 1461 int ret = 0; 1462 1463 mutex_enter(&ilbs->ilbs_sticky_list_lock); 1464 while (ilbs->ilbs_sticky_list_busy) { 1465 if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv, 1466 &ilbs->ilbs_sticky_list_lock) == 0) { 1467 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1468 return (EINTR); 1469 } 1470 } 1471 if ((hash = ilbs->ilbs_sticky_hash) == NULL) { 1472 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1473 *num = 0; 1474 *flags |= ILB_LIST_END; 1475 return (0); 1476 } 1477 ilbs->ilbs_sticky_list_busy = B_TRUE; 1478 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1479 1480 if (*flags & ILB_LIST_BEGIN) { 1481 i = 0; 1482 mutex_enter(&hash[0].sticky_lock); 1483 curp = list_head(&hash[0].sticky_head); 1484 } else if (*flags & ILB_LIST_CONT) { 1485 if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) { 1486 *num = 0; 1487 *flags |= ILB_LIST_END; 1488 goto done; 1489 } 1490 i = ilbs->ilbs_sticky_list_cur; 1491 mutex_enter(&hash[i].sticky_lock); 1492 curp = ilbs->ilbs_sticky_list_curp; 1493 } else { 1494 ret = EINVAL; 1495 goto done; 1496 } 1497 1498 j = 0; 1499 while (j < *num) { 1500 if (curp == NULL) { 1501 mutex_exit(&hash[i].sticky_lock); 1502 if (++i == ilbs->ilbs_sticky_hash_size) { 1503 *flags |= ILB_LIST_END; 1504 break; 1505 } 1506 mutex_enter(&hash[i].sticky_lock); 1507 curp = list_head(&hash[i].sticky_head); 1508 continue; 1509 } 1510 (void) strcpy(st[j].rule_name, curp->rule_name); 1511 st[j].req_addr = curp->src; 1512 st[j].srv_addr = curp->server->iser_addr_v6; 1513 st[j].expiry_time = TICK_TO_MSEC(curp->expiry); 1514 j++; 1515 curp = list_next(&hash[i].sticky_head, curp); 1516 } 1517 ilbs->ilbs_sticky_list_curp = curp; 1518 if (j == *num) 1519 mutex_exit(&hash[i].sticky_lock); 1520 1521 ilbs->ilbs_sticky_list_cur = i; 1522 1523 *num = j; 
1524 done: 1525 mutex_enter(&ilbs->ilbs_sticky_list_lock); 1526 ilbs->ilbs_sticky_list_busy = B_FALSE; 1527 cv_signal(&ilbs->ilbs_sticky_list_cv); 1528 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1529 1530 return (ret); 1531 }