1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/sysmacros.h> 28 #include <sys/types.h> 29 #include <sys/conf.h> 30 #include <sys/time.h> 31 #include <sys/taskq.h> 32 #include <sys/cmn_err.h> 33 #include <sys/sdt.h> 34 #include <sys/atomic.h> 35 #include <netinet/in.h> 36 #include <inet/ip.h> 37 #include <inet/ip6.h> 38 #include <inet/tcp.h> 39 #include <inet/udp_impl.h> 40 #include <inet/ilb.h> 41 42 #include "ilb_stack.h" 43 #include "ilb_impl.h" 44 #include "ilb_conn.h" 45 #include "ilb_nat.h" 46 47 /* 48 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection 49 * 50 * start: starting index into the hash table to do gc 51 * end: ending index into the hash table to do gc 52 * ilbs: pointer to the ilb_stack_t of the IP stack 53 * tid_lock: mutex to protect the timer id. 54 * tid: timer id of the timer 55 */ 56 typedef struct ilb_timer_s { 57 uint32_t start; 58 uint32_t end; 59 ilb_stack_t *ilbs; 60 kmutex_t tid_lock; 61 timeout_id_t tid; 62 } ilb_timer_t; 63 64 /* Hash macro for finding the index to the conn hash table */ 65 #define ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size) \ 66 (((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 + \ 67 (*((saddr) + 2) ^ *((daddr) + 2)) * 1369 + \ 68 (*((saddr) + 1) ^ *((daddr) + 1)) * 37 + \ 69 (*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) & \ 70 ((hash_size) - 1)) 71 72 /* Kmem cache for the conn hash entry */ 73 static struct kmem_cache *ilb_conn_cache = NULL; 74 75 /* 76 * There are 60 timers running to do conn cache garbage collection. Each 77 * gc thread is responsible for 1/60 of the conn hash table. 78 */ 79 static int ilb_conn_timer_size = 60; 80 81 /* Each of the above gc timers wake up every 15s to do the gc. */ 82 static int ilb_conn_cache_timeout = 15; 83 84 #define ILB_STICKY_HASH(saddr, rule, hash_size) \ 85 (((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 + \ 86 (*((saddr) + 2) ^ ((rule) >> 16)) * 961 + \ 87 (*((saddr) + 1) ^ ((rule) >> 8)) * 31 + \ 88 (*(saddr) ^ (rule))) & ((hash_size) - 1)) 89 90 static struct kmem_cache *ilb_sticky_cache = NULL; 91 92 /* 93 * There are 60 timers running to do sticky cache garbage collection. Each 94 * gc thread is responsible for 1/60 of the sticky hash table. 95 */ 96 static int ilb_sticky_timer_size = 60; 97 98 /* Each of the above gc timers wake up every 15s to do the gc. */ 99 static int ilb_sticky_timeout = 15; 100 101 #define ILB_STICKY_REFRELE(s) \ 102 { \ 103 mutex_enter(&(s)->hash->sticky_lock); \ 104 (s)->refcnt--; \ 105 (s)->atime = ddi_get_lbolt64(); \ 106 mutex_exit(&s->hash->sticky_lock); \ 107 } 108 109 110 static void 111 ilb_conn_cache_init(void) 112 { 113 ilb_conn_cache = kmem_cache_create("ilb_conn_cache", 114 sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL, 115 ilb_kmem_flags); 116 } 117 118 void 119 ilb_conn_cache_fini(void) 120 { 121 if (ilb_conn_cache != NULL) { 122 kmem_cache_destroy(ilb_conn_cache); 123 ilb_conn_cache = NULL; 124 } 125 } 126 127 static void 128 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s) 129 { 130 ilb_conn_hash_t *hash; 131 ilb_conn_t **next, **prev; 132 ilb_conn_t **next_prev, **prev_next; 133 134 if (c2s) { 135 hash = connp->conn_c2s_hash; 136 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); 137 next = &connp->conn_c2s_next; 138 prev = &connp->conn_c2s_prev; 139 if (*next != NULL) 140 next_prev = &(*next)->conn_c2s_prev; 141 if (*prev != NULL) 142 prev_next = &(*prev)->conn_c2s_next; 143 } else { 144 hash = connp->conn_s2c_hash; 145 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); 146 next = &connp->conn_s2c_next; 147 prev = &connp->conn_s2c_prev; 148 if (*next != NULL) 149 next_prev = &(*next)->conn_s2c_prev; 150 if (*prev != NULL) 151 prev_next = &(*prev)->conn_s2c_next; 152 } 153 154 if (hash->ilb_connp == connp) { 155 hash->ilb_connp = *next; 156 if (*next != NULL) 157 *next_prev = NULL; 158 } else { 159 if (*prev != NULL) 160 *prev_next = *next; 161 if (*next != NULL) 162 *next_prev = *prev; 163 } 164 ASSERT(hash->ilb_conn_cnt > 0); 165 hash->ilb_conn_cnt--; 166 167 *next = NULL; 168 *prev = NULL; 169 } 170 171 static void 172 ilb_conn_remove(ilb_conn_t *connp) 173 { 174 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); 175 ilb_conn_remove_common(connp, B_TRUE); 176 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); 177 ilb_conn_remove_common(connp, B_FALSE); 178 179 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) { 180 in_port_t port; 181 182 port = ntohs(connp->conn_rule_cache.info.nat_sport); 183 vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena, 184 (void *)(uintptr_t)port, 1); 185 } 186 187 if (connp->conn_sticky != NULL) 188 ILB_STICKY_REFRELE(connp->conn_sticky); 189 ILB_SERVER_REFRELE(connp->conn_server); 190 kmem_cache_free(ilb_conn_cache, connp); 191 } 192 193 /* 194 * Routine to do periodic garbage collection of conn hash entries. When 195 * a conn hash timer fires, it dispatches a taskq to call this function 196 * to do the gc. Note that each taskq is responisble for a portion of 197 * the table. The portion is stored in timer->start, timer->end. 198 */ 199 static void 200 ilb_conn_cleanup(void *arg) 201 { 202 ilb_timer_t *timer = (ilb_timer_t *)arg; 203 uint32_t i; 204 ilb_stack_t *ilbs; 205 ilb_conn_hash_t *c2s_hash, *s2c_hash; 206 ilb_conn_t *connp, *nxt_connp; 207 int64_t now; 208 int64_t expiry; 209 boolean_t die_now; 210 211 ilbs = timer->ilbs; 212 c2s_hash = ilbs->ilbs_c2s_conn_hash; 213 ASSERT(c2s_hash != NULL); 214 215 now = ddi_get_lbolt64(); 216 for (i = timer->start; i < timer->end; i++) { 217 mutex_enter(&c2s_hash[i].ilb_conn_hash_lock); 218 if ((connp = c2s_hash[i].ilb_connp) == NULL) { 219 ASSERT(c2s_hash[i].ilb_conn_cnt == 0); 220 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock); 221 continue; 222 } 223 do { 224 ASSERT(c2s_hash[i].ilb_conn_cnt > 0); 225 ASSERT(connp->conn_c2s_hash == &c2s_hash[i]); 226 nxt_connp = connp->conn_c2s_next; 227 expiry = now - SEC_TO_TICK(connp->conn_expiry); 228 if (connp->conn_server->iser_die_time != 0 && 229 connp->conn_server->iser_die_time < now) 230 die_now = B_TRUE; 231 else 232 die_now = B_FALSE; 233 s2c_hash = connp->conn_s2c_hash; 234 mutex_enter(&s2c_hash->ilb_conn_hash_lock); 235 236 if (connp->conn_gc || die_now || 237 (connp->conn_c2s_atime < expiry && 238 connp->conn_s2c_atime < expiry)) { 239 /* Need to update the nat list cur_connp */ 240 if (connp == ilbs->ilbs_conn_list_connp) { 241 ilbs->ilbs_conn_list_connp = 242 connp->conn_c2s_next; 243 } 244 ilb_conn_remove(connp); 245 goto nxt_connp; 246 } 247 248 if (connp->conn_l4 != IPPROTO_TCP) 249 goto nxt_connp; 250 251 /* Update and check TCP related conn info */ 252 if (connp->conn_c2s_tcp_fin_sent && 253 SEQ_GT(connp->conn_s2c_tcp_ack, 254 connp->conn_c2s_tcp_fss)) { 255 connp->conn_c2s_tcp_fin_acked = B_TRUE; 256 } 257 if (connp->conn_s2c_tcp_fin_sent && 258 SEQ_GT(connp->conn_c2s_tcp_ack, 259 connp->conn_s2c_tcp_fss)) { 260 connp->conn_s2c_tcp_fin_acked = B_TRUE; 261 } 262 if (connp->conn_c2s_tcp_fin_acked && 263 connp->conn_s2c_tcp_fin_acked) { 264 ilb_conn_remove(connp); 265 } 266 nxt_connp: 267 mutex_exit(&s2c_hash->ilb_conn_hash_lock); 268 connp = nxt_connp; 269 } while (connp != NULL); 270 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock); 271 } 272 } 273 274 /* Conn hash timer routine. It dispatches a taskq and restart the timer */ 275 static void 276 ilb_conn_timer(void *arg) 277 { 278 ilb_timer_t *timer = (ilb_timer_t *)arg; 279 280 (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup, 281 arg, TQ_SLEEP); 282 mutex_enter(&timer->tid_lock); 283 if (timer->tid == 0) { 284 mutex_exit(&timer->tid_lock); 285 } else { 286 timer->tid = timeout(ilb_conn_timer, arg, 287 SEC_TO_TICK(ilb_conn_cache_timeout)); 288 mutex_exit(&timer->tid_lock); 289 } 290 } 291 292 void 293 ilb_conn_hash_init(ilb_stack_t *ilbs) 294 { 295 extern pri_t minclsyspri; 296 int i, part; 297 ilb_timer_t *tm; 298 char tq_name[TASKQ_NAMELEN]; 299 300 /* 301 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to 302 * the next power of 2. 303 */ 304 if (!ISP2(ilbs->ilbs_conn_hash_size)) { 305 for (i = 0; i < 31; i++) { 306 if (ilbs->ilbs_conn_hash_size < (1 << i)) 307 break; 308 } 309 ilbs->ilbs_conn_hash_size = 1 << i; 310 } 311 312 /* 313 * Can sleep since this should be called when a rule is being added, 314 * hence we are not in interrupt context. 315 */ 316 ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) * 317 ilbs->ilbs_conn_hash_size, KM_SLEEP); 318 ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) * 319 ilbs->ilbs_conn_hash_size, KM_SLEEP); 320 321 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 322 mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock, 323 NULL, MUTEX_DEFAULT, NULL); 324 } 325 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 326 mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock, 327 NULL, MUTEX_DEFAULT, NULL); 328 } 329 330 if (ilb_conn_cache == NULL) 331 ilb_conn_cache_init(); 332 333 (void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p", 334 (void *)ilbs->ilbs_netstack); 335 ASSERT(ilbs->ilbs_conn_taskq == NULL); 336 ilbs->ilbs_conn_taskq = taskq_create(tq_name, 337 ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size, 338 ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 339 340 ASSERT(ilbs->ilbs_conn_timer_list == NULL); 341 ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * 342 ilb_conn_timer_size, KM_SLEEP); 343 344 /* 345 * The hash table is divided in equal partition for those timers 346 * to do garbage collection. 347 */ 348 part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1; 349 for (i = 0; i < ilb_conn_timer_size; i++) { 350 tm = ilbs->ilbs_conn_timer_list + i; 351 tm->start = i * part; 352 tm->end = i * part + part; 353 if (tm->end > ilbs->ilbs_conn_hash_size) 354 tm->end = ilbs->ilbs_conn_hash_size; 355 tm->ilbs = ilbs; 356 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); 357 /* Spread out the starting execution time of all the timers. */ 358 tm->tid = timeout(ilb_conn_timer, tm, 359 SEC_TO_TICK(ilb_conn_cache_timeout + i)); 360 } 361 } 362 363 void 364 ilb_conn_hash_fini(ilb_stack_t *ilbs) 365 { 366 uint32_t i; 367 ilb_conn_t *connp; 368 369 if (ilbs->ilbs_c2s_conn_hash == NULL) { 370 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); 371 return; 372 } 373 374 /* Stop all the timers first. */ 375 for (i = 0; i < ilb_conn_timer_size; i++) { 376 timeout_id_t tid; 377 378 /* Setting tid to 0 tells the timer handler not to restart. */ 379 mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock); 380 tid = ilbs->ilbs_conn_timer_list[i].tid; 381 ilbs->ilbs_conn_timer_list[i].tid = 0; 382 mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock); 383 (void) untimeout(tid); 384 } 385 kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) * 386 ilb_conn_timer_size); 387 taskq_destroy(ilbs->ilbs_conn_taskq); 388 ilbs->ilbs_conn_taskq = NULL; 389 390 /* Then remove all the conns. */ 391 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { 392 while ((connp = ilbs->ilbs_s2c_conn_hash->ilb_connp) != NULL) { 393 ilbs->ilbs_s2c_conn_hash->ilb_connp = 394 connp->conn_s2c_next; 395 ILB_SERVER_REFRELE(connp->conn_server); 396 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) { 397 ilb_nat_src_entry_t *ent; 398 in_port_t port; 399 400 /* 401 * src_ent will be freed in ilb_nat_src_fini(). 402 */ 403 port = ntohs( 404 connp->conn_rule_cache.info.nat_sport); 405 ent = connp->conn_rule_cache.info.src_ent; 406 vmem_free(ent->nse_port_arena, 407 (void *)(uintptr_t)port, 1); 408 } 409 kmem_cache_free(ilb_conn_cache, connp); 410 } 411 } 412 kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) * 413 ilbs->ilbs_conn_hash_size); 414 kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) * 415 ilbs->ilbs_conn_hash_size); 416 } 417 418 /* 419 * Internet checksum adjustment calculation routines. We pre-calculate 420 * checksum adjustment so that we don't need to compute the checksum on 421 * the whole packet when we change address/port in the packet. 422 */ 423 424 static void 425 hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port, 426 in_port_t new_port, uint32_t *adj_sum) 427 { 428 uint32_t sum; 429 430 sum = *oaddr + *(oaddr + 1) + old_port; 431 while ((sum >> 16) != 0) 432 sum = (sum & 0xffff) + (sum >> 16); 433 *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port; 434 } 435 436 static void 437 hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port, 438 in_port_t new_port, uint32_t *adj_sum) 439 { 440 uint32_t sum = 0; 441 442 sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) + 443 *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) + 444 old_port; 445 while ((sum >> 16) != 0) 446 sum = (sum & 0xffff) + (sum >> 16); 447 *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + 448 *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) + 449 *(naddr + 6) + *(naddr + 7) + new_port; 450 } 451 452 static void 453 fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1, 454 uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2, 455 in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum) 456 { 457 uint32_t sum; 458 459 sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) + 460 old_port2; 461 while ((sum >> 16) != 0) 462 sum = (sum & 0xffff) + (sum >> 16); 463 *adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 + 464 *naddr2 + *(naddr2 + 1) + new_port2; 465 } 466 467 static void 468 fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1, 469 uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2, 470 in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum) 471 { 472 uint32_t sum = 0; 473 474 sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) + 475 *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) + 476 old_port1; 477 sum += *oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) + 478 *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) + 479 old_port2; 480 while ((sum >> 16) != 0) 481 sum = (sum & 0xffff) + (sum >> 16); 482 sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) + 483 *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) + 484 *(naddr1 + 7) + new_port1; 485 *adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) + 486 *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) + 487 *(naddr2 + 7) + new_port2; 488 } 489 490 /* 491 * Add a conn hash entry to the tables. Note that a conn hash entry 492 * (ilb_conn_t) contains info on both directions. And there are two hash 493 * tables, one for client to server and the other for server to client. 494 * So the same entry is added to both tables and can be ccessed by two 495 * thread simultaneously. But each thread will only access data on one 496 * direction, so there is no conflict. 497 */ 498 int 499 ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server, 500 in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport, 501 ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s) 502 { 503 ilb_conn_t *connp; 504 ilb_conn_hash_t *hash; 505 int i; 506 507 connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP); 508 if (connp == NULL) { 509 if (s != NULL) { 510 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) { 511 ilb_nat_src_entry_t **entry; 512 513 entry = s->server->iser_nat_src->src_list; 514 vmem_free(entry[s->nat_src_idx]->nse_port_arena, 515 (void *)(uintptr_t)ntohs(info->nat_sport), 516 1); 517 } 518 ILB_STICKY_REFRELE(s); 519 } 520 return (ENOMEM); 521 } 522 523 connp->conn_l4 = rule->ir_proto; 524 525 connp->conn_server = server; 526 ILB_SERVER_REFHOLD(server); 527 connp->conn_sticky = s; 528 529 connp->conn_rule_cache.topo = rule->ir_topo; 530 connp->conn_rule_cache.info = *info; 531 532 connp->conn_gc = B_FALSE; 533 534 connp->conn_expiry = rule->ir_nat_expiry; 535 connp->conn_cr_time = ddi_get_lbolt64(); 536 537 /* Client to server info. */ 538 connp->conn_c2s_saddr = *src; 539 connp->conn_c2s_sport = sport; 540 connp->conn_c2s_daddr = *dst; 541 connp->conn_c2s_dport = dport; 542 543 connp->conn_c2s_atime = ddi_get_lbolt64(); 544 /* The packet ths triggers this creation should be counted */ 545 connp->conn_c2s_pkt_cnt = 1; 546 connp->conn_c2s_tcp_fin_sent = B_FALSE; 547 connp->conn_c2s_tcp_fin_acked = B_FALSE; 548 549 /* Server to client info, before NAT */ 550 switch (rule->ir_topo) { 551 case ILB_TOPO_IMPL_HALF_NAT: 552 connp->conn_s2c_saddr = info->nat_dst; 553 connp->conn_s2c_sport = info->nat_dport; 554 connp->conn_s2c_daddr = *src; 555 connp->conn_s2c_dport = sport; 556 557 /* Pre-calculate checksum changes for both directions */ 558 if (rule->ir_ipver == IPPROTO_IP) { 559 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3], 560 (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0, 561 &connp->conn_c2s_ip_sum); 562 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3], 563 (uint16_t *)&info->nat_dst.s6_addr32[3], dport, 564 info->nat_dport, &connp->conn_c2s_tp_sum); 565 *ip_sum = connp->conn_c2s_ip_sum; 566 *tp_sum = connp->conn_c2s_tp_sum; 567 568 hnat_cksum_v4( 569 (uint16_t *)&info->nat_dst.s6_addr32[3], 570 (uint16_t *)&dst->s6_addr32[3], 0, 0, 571 &connp->conn_s2c_ip_sum); 572 hnat_cksum_v4( 573 (uint16_t *)&info->nat_dst.s6_addr32[3], 574 (uint16_t *)&dst->s6_addr32[3], 575 info->nat_dport, dport, 576 &connp->conn_s2c_tp_sum); 577 } else { 578 connp->conn_c2s_ip_sum = 0; 579 hnat_cksum_v6((uint16_t *)dst, 580 (uint16_t *)&info->nat_dst, dport, 581 info->nat_dport, &connp->conn_c2s_tp_sum); 582 *ip_sum = 0; 583 *tp_sum = connp->conn_c2s_tp_sum; 584 585 connp->conn_s2c_ip_sum = 0; 586 hnat_cksum_v6((uint16_t *)&info->nat_dst, 587 (uint16_t *)dst, info->nat_dport, dport, 588 &connp->conn_s2c_tp_sum); 589 } 590 break; 591 case ILB_TOPO_IMPL_NAT: 592 connp->conn_s2c_saddr = info->nat_dst; 593 connp->conn_s2c_sport = info->nat_dport; 594 connp->conn_s2c_daddr = info->nat_src; 595 connp->conn_s2c_dport = info->nat_sport; 596 597 if (rule->ir_ipver == IPPROTO_IP) { 598 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3], 599 (uint16_t *)&dst->s6_addr32[3], 600 (uint16_t *)&info->nat_src.s6_addr32[3], 601 (uint16_t *)&info->nat_dst.s6_addr32[3], 602 0, 0, 0, 0, &connp->conn_c2s_ip_sum); 603 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3], 604 (uint16_t *)&dst->s6_addr32[3], 605 (uint16_t *)&info->nat_src.s6_addr32[3], 606 (uint16_t *)&info->nat_dst.s6_addr32[3], 607 sport, dport, info->nat_sport, 608 info->nat_dport, &connp->conn_c2s_tp_sum); 609 *ip_sum = connp->conn_c2s_ip_sum; 610 *tp_sum = connp->conn_c2s_tp_sum; 611 612 fnat_cksum_v4( 613 (uint16_t *)&info->nat_src.s6_addr32[3], 614 (uint16_t *)&info->nat_dst.s6_addr32[3], 615 (uint16_t *)&src->s6_addr32[3], 616 (uint16_t *)&dst->s6_addr32[3], 617 0, 0, 0, 0, &connp->conn_s2c_ip_sum); 618 fnat_cksum_v4( 619 (uint16_t *)&info->nat_src.s6_addr32[3], 620 (uint16_t *)&info->nat_dst.s6_addr32[3], 621 (uint16_t *)&src->s6_addr32[3], 622 (uint16_t *)&dst->s6_addr32[3], 623 info->nat_sport, info->nat_dport, 624 sport, dport, &connp->conn_s2c_tp_sum); 625 } else { 626 fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst, 627 (uint16_t *)&info->nat_src, 628 (uint16_t *)&info->nat_dst, 629 sport, dport, info->nat_sport, 630 info->nat_dport, &connp->conn_c2s_tp_sum); 631 connp->conn_c2s_ip_sum = 0; 632 *ip_sum = 0; 633 *tp_sum = connp->conn_c2s_tp_sum; 634 635 fnat_cksum_v6((uint16_t *)&info->nat_src, 636 (uint16_t *)&info->nat_dst, (uint16_t *)src, 637 (uint16_t *)dst, info->nat_sport, 638 info->nat_dport, sport, dport, 639 &connp->conn_s2c_tp_sum); 640 connp->conn_s2c_ip_sum = 0; 641 } 642 break; 643 } 644 645 connp->conn_s2c_atime = ddi_get_lbolt64(); 646 connp->conn_s2c_pkt_cnt = 1; 647 connp->conn_s2c_tcp_fin_sent = B_FALSE; 648 connp->conn_s2c_tcp_fin_acked = B_FALSE; 649 650 /* Add it to the s2c hash table. */ 651 hash = ilbs->ilbs_s2c_conn_hash; 652 i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3], 653 ntohs(connp->conn_s2c_sport), 654 (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3], 655 ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size); 656 connp->conn_s2c_hash = &hash[i]; 657 DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i); 658 659 mutex_enter(&hash[i].ilb_conn_hash_lock); 660 hash[i].ilb_conn_cnt++; 661 connp->conn_s2c_next = hash[i].ilb_connp; 662 if (hash[i].ilb_connp != NULL) 663 hash[i].ilb_connp->conn_s2c_prev = connp; 664 connp->conn_s2c_prev = NULL; 665 hash[i].ilb_connp = connp; 666 mutex_exit(&hash[i].ilb_conn_hash_lock); 667 668 /* Add it to the c2s hash table. */ 669 hash = ilbs->ilbs_c2s_conn_hash; 670 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport), 671 (uint8_t *)&dst->s6_addr32[3], ntohs(dport), 672 ilbs->ilbs_conn_hash_size); 673 connp->conn_c2s_hash = &hash[i]; 674 DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i); 675 676 mutex_enter(&hash[i].ilb_conn_hash_lock); 677 hash[i].ilb_conn_cnt++; 678 connp->conn_c2s_next = hash[i].ilb_connp; 679 if (hash[i].ilb_connp != NULL) 680 hash[i].ilb_connp->conn_c2s_prev = connp; 681 connp->conn_c2s_prev = NULL; 682 hash[i].ilb_connp = connp; 683 mutex_exit(&hash[i].ilb_conn_hash_lock); 684 685 return (0); 686 } 687 688 /* 689 * If a connection is using TCP, we keep track of simple TCP state transition 690 * so that we know when to clean up an entry. 691 */ 692 static boolean_t 693 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len, 694 boolean_t c2s) 695 { 696 uint32_t ack, seq; 697 int32_t seg_len; 698 699 if (tcpha->tha_flags & TH_RST) 700 return (B_FALSE); 701 702 seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) - 703 TCP_HDR_LENGTH((tcph_t *)tcpha); 704 705 if (tcpha->tha_flags & TH_ACK) 706 ack = ntohl(tcpha->tha_ack); 707 seq = ntohl(tcpha->tha_seq); 708 if (c2s) { 709 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); 710 if (tcpha->tha_flags & TH_FIN) { 711 connp->conn_c2s_tcp_fss = seq + seg_len; 712 connp->conn_c2s_tcp_fin_sent = B_TRUE; 713 } 714 connp->conn_c2s_tcp_ack = ack; 715 716 /* Port reuse by the client, restart the conn. */ 717 if (connp->conn_c2s_tcp_fin_sent && 718 SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) { 719 connp->conn_c2s_tcp_fin_sent = B_FALSE; 720 connp->conn_c2s_tcp_fin_acked = B_FALSE; 721 } 722 } else { 723 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); 724 if (tcpha->tha_flags & TH_FIN) { 725 connp->conn_s2c_tcp_fss = seq + seg_len; 726 connp->conn_s2c_tcp_fin_sent = B_TRUE; 727 } 728 connp->conn_s2c_tcp_ack = ack; 729 730 /* Port reuse by the client, restart the conn. */ 731 if (connp->conn_s2c_tcp_fin_sent && 732 SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) { 733 connp->conn_s2c_tcp_fin_sent = B_FALSE; 734 connp->conn_s2c_tcp_fin_acked = B_FALSE; 735 } 736 } 737 738 return (B_TRUE); 739 } 740 741 /* 742 * Helper routint to find conn hash entry given some packet information and 743 * the traffic direction (c2s, client to server?) 744 */ 745 static boolean_t 746 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src, 747 in_port_t sport, in6_addr_t *dst, in_port_t dport, 748 ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum, 749 int32_t pkt_len, boolean_t c2s) 750 { 751 ilb_conn_hash_t *hash; 752 uint_t i; 753 ilb_conn_t *connp; 754 boolean_t tcp_alive; 755 boolean_t ret = B_FALSE; 756 757 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport), 758 (uint8_t *)&dst->s6_addr32[3], ntohs(dport), 759 ilbs->ilbs_conn_hash_size); 760 if (c2s) { 761 hash = ilbs->ilbs_c2s_conn_hash; 762 mutex_enter(&hash[i].ilb_conn_hash_lock); 763 for (connp = hash[i].ilb_connp; connp != NULL; 764 connp = connp->conn_c2s_next) { 765 if (connp->conn_l4 == l4 && 766 connp->conn_c2s_dport == dport && 767 connp->conn_c2s_sport == sport && 768 IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) && 769 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) { 770 connp->conn_c2s_atime = ddi_get_lbolt64(); 771 connp->conn_c2s_pkt_cnt++; 772 *rule_cache = connp->conn_rule_cache; 773 *ip_sum = connp->conn_c2s_ip_sum; 774 *tp_sum = connp->conn_c2s_tp_sum; 775 ret = B_TRUE; 776 break; 777 } 778 } 779 } else { 780 hash = ilbs->ilbs_s2c_conn_hash; 781 mutex_enter(&hash[i].ilb_conn_hash_lock); 782 for (connp = hash[i].ilb_connp; connp != NULL; 783 connp = connp->conn_s2c_next) { 784 if (connp->conn_l4 == l4 && 785 connp->conn_s2c_dport == dport && 786 connp->conn_s2c_sport == sport && 787 IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) && 788 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) { 789 connp->conn_s2c_atime = ddi_get_lbolt64(); 790 connp->conn_s2c_pkt_cnt++; 791 *rule_cache = connp->conn_rule_cache; 792 *ip_sum = connp->conn_s2c_ip_sum; 793 *tp_sum = connp->conn_s2c_tp_sum; 794 ret = B_TRUE; 795 break; 796 } 797 } 798 } 799 if (ret) { 800 ILB_S_KSTAT(connp->conn_server, pkt_processed); 801 ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed, 802 pkt_len); 803 804 switch (l4) { 805 case (IPPROTO_TCP): 806 tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len, 807 c2s); 808 if (!tcp_alive) { 809 connp->conn_gc = B_TRUE; 810 } 811 break; 812 default: 813 break; 814 } 815 } 816 mutex_exit(&hash[i].ilb_conn_hash_lock); 817 818 return (ret); 819 } 820 821 /* 822 * To check if a give packet matches an existing conn hash entry. If it 823 * does, return the information about this entry so that the caller can 824 * do the proper NAT. 825 */ 826 boolean_t 827 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph, 828 in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport, 829 uint32_t pkt_len, in6_addr_t *lb_dst) 830 { 831 ilb_rule_info_t rule_cache; 832 uint32_t adj_ip_sum, adj_tp_sum; 833 boolean_t ret; 834 835 /* Check the incoming hash table. */ 836 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport, 837 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) { 838 switch (rule_cache.topo) { 839 case ILB_TOPO_IMPL_NAT: 840 *lb_dst = rule_cache.info.nat_dst; 841 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info, 842 adj_ip_sum, adj_tp_sum, B_TRUE); 843 ret = B_TRUE; 844 break; 845 case ILB_TOPO_IMPL_HALF_NAT: 846 *lb_dst = rule_cache.info.nat_dst; 847 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info, 848 adj_ip_sum, adj_tp_sum, B_TRUE); 849 ret = B_TRUE; 850 break; 851 default: 852 ret = B_FALSE; 853 break; 854 } 855 return (ret); 856 } 857 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport, 858 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) { 859 switch (rule_cache.topo) { 860 case ILB_TOPO_IMPL_NAT: 861 *lb_dst = rule_cache.info.src; 862 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info, 863 adj_ip_sum, adj_tp_sum, B_FALSE); 864 ret = B_TRUE; 865 break; 866 case ILB_TOPO_IMPL_HALF_NAT: 867 *lb_dst = *dst; 868 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info, 869 adj_ip_sum, adj_tp_sum, B_FALSE); 870 ret = B_TRUE; 871 break; 872 default: 873 ret = B_FALSE; 874 break; 875 } 876 return (ret); 877 } 878 879 return (B_FALSE); 880 } 881 882 /* 883 * To check if an ICMP packet belongs to a connection in one of the conn 884 * hash entries. 885 */ 886 boolean_t 887 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph, 888 void *icmph, in6_addr_t *lb_dst) 889 { 890 ilb_conn_hash_t *hash; 891 ipha_t *in_iph4; 892 ip6_t *in_iph6; 893 icmph_t *icmph4; 894 icmp6_t *icmph6; 895 in6_addr_t *in_src_p, *in_dst_p; 896 in_port_t *sport, *dport; 897 int l4; 898 uint_t i; 899 ilb_conn_t *connp; 900 ilb_rule_info_t rule_cache; 901 uint32_t adj_ip_sum; 902 boolean_t full_nat; 903 904 if (l3 == IPPROTO_IP) { 905 in6_addr_t in_src, in_dst; 906 907 icmph4 = (icmph_t *)icmph; 908 in_iph4 = (ipha_t *)&icmph4[1]; 909 910 if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) + 911 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) { 912 return (B_FALSE); 913 } 914 915 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src); 916 in_src_p = &in_src; 917 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst); 918 in_dst_p = &in_dst; 919 920 l4 = in_iph4->ipha_protocol; 921 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP) 922 return (B_FALSE); 923 924 sport = (in_port_t *)((char *)in_iph4 + 925 IPH_HDR_LENGTH(in_iph4)); 926 dport = sport + 1; 927 928 DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t, 929 in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t, 930 ntohs(*sport), uint16_t, ntohs(*dport)); 931 } else { 932 ASSERT(l3 == IPPROTO_IPV6); 933 934 icmph6 = (icmp6_t *)icmph; 935 in_iph6 = (ip6_t *)&icmph6[1]; 936 in_src_p = &in_iph6->ip6_src; 937 in_dst_p = &in_iph6->ip6_dst; 938 939 if ((uint8_t *)in_iph6 + sizeof (ip6_t) + 940 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) { 941 return (B_FALSE); 942 } 943 944 l4 = in_iph6->ip6_nxt; 945 /* We don't go deep inside an IPv6 packet yet. */ 946 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP) 947 return (B_FALSE); 948 949 sport = (in_port_t *)&in_iph6[1]; 950 dport = sport + 1; 951 952 DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *, 953 &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst, 954 uint16_t, ntohs(*sport), uint16_t, ntohs(*dport)); 955 } 956 957 i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport), 958 (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport), 959 ilbs->ilbs_conn_hash_size); 960 hash = ilbs->ilbs_c2s_conn_hash; 961 962 mutex_enter(&hash[i].ilb_conn_hash_lock); 963 for (connp = hash[i].ilb_connp; connp != NULL; 964 connp = connp->conn_c2s_next) { 965 if (connp->conn_l4 == l4 && 966 connp->conn_c2s_dport == *sport && 967 connp->conn_c2s_sport == *dport && 968 IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) && 969 IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) { 970 connp->conn_c2s_atime = ddi_get_lbolt64(); 971 connp->conn_c2s_pkt_cnt++; 972 rule_cache = connp->conn_rule_cache; 973 adj_ip_sum = connp->conn_c2s_ip_sum; 974 break; 975 } 976 } 977 mutex_exit(&hash[i].ilb_conn_hash_lock); 978 979 if (connp == NULL) { 980 DTRACE_PROBE(ilb__chk__icmp__conn__failed); 981 return (B_FALSE); 982 } 983 984 switch (rule_cache.topo) { 985 case ILB_TOPO_IMPL_NAT: 986 full_nat = B_TRUE; 987 break; 988 case ILB_TOPO_IMPL_HALF_NAT: 989 full_nat = B_FALSE; 990 break; 991 default: 992 return (B_FALSE); 993 } 994 995 *lb_dst = rule_cache.info.nat_dst; 996 if (l3 == IPPROTO_IP) { 997 ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport, 998 &rule_cache.info, adj_ip_sum, full_nat); 999 } else { 1000 ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport, 1001 &rule_cache.info, full_nat); 1002 } 1003 return (B_TRUE); 1004 } 1005 1006 /* 1007 * This routine sends up the conn hash table to user land. Note that the 1008 * request is an ioctl, hence we cannot really differentiate requests 1009 * from different clients. There is no context shared between different 1010 * ioctls. Here we make the assumption that the user land ilbd will 1011 * only allow one client to show the conn hash table at any time. 1012 * Otherwise, the results will be "very" inconsistent. 1013 * 1014 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants 1015 * to read from the beginning of the able. After a certain entries 1016 * are reported, the kernel remembers the position of the last returned 1017 * entry. When the next ioctl comes in with the ILB_LIST_BEGIN flag, 1018 * it will return entries starting from where it was left off. When 1019 * the end of table is reached, a flag (ILB_LIST_END) is set to tell 1020 * the client that there is no more entry. 1021 * 1022 * It is assumed that the caller has checked the size of nat so that it 1023 * can hold num entries. 1024 */ 1025 /* ARGSUSED */ 1026 int 1027 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat, 1028 uint32_t *num, uint32_t *flags) 1029 { 1030 ilb_conn_hash_t *hash; 1031 ilb_conn_t *cur_connp; 1032 uint32_t i, j; 1033 int ret = 0; 1034 1035 mutex_enter(&ilbs->ilbs_conn_list_lock); 1036 while (ilbs->ilbs_conn_list_busy) { 1037 if (cv_wait_sig(&ilbs->ilbs_conn_list_cv, 1038 &ilbs->ilbs_conn_list_lock) == 0) { 1039 mutex_exit(&ilbs->ilbs_conn_list_lock); 1040 return (EINTR); 1041 } 1042 } 1043 if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) { 1044 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); 1045 mutex_exit(&ilbs->ilbs_conn_list_lock); 1046 *num = 0; 1047 *flags |= ILB_LIST_END; 1048 return (0); 1049 } 1050 ilbs->ilbs_conn_list_busy = B_TRUE; 1051 mutex_exit(&ilbs->ilbs_conn_list_lock); 1052 1053 if (*flags & ILB_LIST_BEGIN) { 1054 i = 0; 1055 mutex_enter(&hash[0].ilb_conn_hash_lock); 1056 cur_connp = hash[0].ilb_connp; 1057 } else if (*flags & ILB_LIST_CONT) { 1058 if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) { 1059 *num = 0; 1060 *flags |= ILB_LIST_END; 1061 goto done; 1062 } 1063 i = ilbs->ilbs_conn_list_cur; 1064 mutex_enter(&hash[i].ilb_conn_hash_lock); 1065 cur_connp = ilbs->ilbs_conn_list_connp; 1066 } else { 1067 ret = EINVAL; 1068 goto done; 1069 } 1070 1071 j = 0; 1072 while (j < *num) { 1073 if (cur_connp == NULL) { 1074 mutex_exit(&hash[i].ilb_conn_hash_lock); 1075 if (++i == ilbs->ilbs_conn_hash_size) { 1076 *flags |= ILB_LIST_END; 1077 break; 1078 } 1079 mutex_enter(&hash[i].ilb_conn_hash_lock); 1080 cur_connp = hash[i].ilb_connp; 1081 continue; 1082 } 1083 nat[j].proto = cur_connp->conn_l4; 1084 1085 nat[j].in_global = cur_connp->conn_c2s_daddr; 1086 nat[j].in_global_port = cur_connp->conn_c2s_dport; 1087 nat[j].out_global = cur_connp->conn_c2s_saddr; 1088 nat[j].out_global_port = cur_connp->conn_c2s_sport; 1089 1090 nat[j].in_local = cur_connp->conn_s2c_saddr; 1091 nat[j].in_local_port = cur_connp->conn_s2c_sport; 1092 nat[j].out_local = cur_connp->conn_s2c_daddr; 1093 nat[j].out_local_port = cur_connp->conn_s2c_dport; 1094 1095 nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time); 1096 nat[j].last_access_time = 1097 TICK_TO_MSEC(cur_connp->conn_c2s_atime); 1098 1099 /* 1100 * The conn_s2c_pkt_cnt may not be accurate since we are not 1101 * holding the s2c hash lock. 1102 */ 1103 nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt + 1104 cur_connp->conn_s2c_pkt_cnt; 1105 j++; 1106 1107 cur_connp = cur_connp->conn_c2s_next; 1108 } 1109 ilbs->ilbs_conn_list_connp = cur_connp; 1110 if (j == *num) 1111 mutex_exit(&hash[i].ilb_conn_hash_lock); 1112 1113 ilbs->ilbs_conn_list_cur = i; 1114 1115 *num = j; 1116 done: 1117 mutex_enter(&ilbs->ilbs_conn_list_lock); 1118 ilbs->ilbs_conn_list_busy = B_FALSE; 1119 cv_signal(&ilbs->ilbs_conn_list_cv); 1120 mutex_exit(&ilbs->ilbs_conn_list_lock); 1121 1122 return (ret); 1123 } 1124 1125 1126 /* 1127 * Stickiness (persistence) handling routines. 1128 */ 1129 1130 1131 static void 1132 ilb_sticky_cache_init(void) 1133 { 1134 ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache", 1135 sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL, 1136 ilb_kmem_flags); 1137 } 1138 1139 void 1140 ilb_sticky_cache_fini(void) 1141 { 1142 if (ilb_sticky_cache != NULL) { 1143 kmem_cache_destroy(ilb_sticky_cache); 1144 ilb_sticky_cache = NULL; 1145 } 1146 } 1147 1148 void 1149 ilb_sticky_refrele(ilb_sticky_t *s) 1150 { 1151 ILB_STICKY_REFRELE(s); 1152 } 1153 1154 static ilb_sticky_t * 1155 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src) 1156 { 1157 ilb_sticky_t *s; 1158 1159 ASSERT(mutex_owned(&hash->sticky_lock)); 1160 1161 for (s = list_head(&hash->sticky_head); s != NULL; 1162 s = list_next(&hash->sticky_head, s)) { 1163 if (s->rule_instance == rule->ir_ks_instance) { 1164 if (IN6_ARE_ADDR_EQUAL(src, &s->src)) 1165 return (s); 1166 } 1167 } 1168 return (NULL); 1169 } 1170 1171 static ilb_sticky_t * 1172 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server, 1173 in6_addr_t *src) 1174 { 1175 ilb_sticky_t *s; 1176 1177 ASSERT(mutex_owned(&hash->sticky_lock)); 1178 1179 if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL) 1180 return (NULL); 1181 1182 /* 1183 * The rule instance is for handling the scenario when the same 1184 * client talks to different rules at the same time. Stickiness 1185 * is per rule so we can use the rule instance to differentiate 1186 * the client's request. 1187 */ 1188 s->rule_instance = rule->ir_ks_instance; 1189 /* 1190 * Copy the rule name for listing all sticky cache entry. ir_name 1191 * is guaranteed to be NULL terminated. 1192 */ 1193 (void) strcpy(s->rule_name, rule->ir_name); 1194 s->server = server; 1195 1196 /* 1197 * Grab a ref cnt on the server so that it won't go away while 1198 * it is still in the sticky table. 1199 */ 1200 ILB_SERVER_REFHOLD(server); 1201 s->src = *src; 1202 s->expiry = rule->ir_sticky_expiry; 1203 s->refcnt = 1; 1204 s->hash = hash; 1205 1206 /* 1207 * There is no need to set atime here since the refcnt is not 1208 * zero. A sticky entry is removed only when the refcnt is 1209 * zero. But just set it here for debugging purpose. The 1210 * atime is set when a refrele is done on a sticky entry. 1211 */ 1212 s->atime = ddi_get_lbolt64(); 1213 1214 list_insert_head(&hash->sticky_head, s); 1215 hash->sticky_cnt++; 1216 return (s); 1217 } 1218 1219 /* 1220 * This routine checks if there is an existing sticky entry which matches 1221 * a given packet. If there is one, return it. If there is not, create 1222 * a sticky entry using the packet's info. 1223 */ 1224 ilb_server_t * 1225 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src, 1226 ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx) 1227 { 1228 int i; 1229 ilb_sticky_hash_t *hash; 1230 ilb_sticky_t *s; 1231 1232 ASSERT(server != NULL); 1233 1234 *res = NULL; 1235 1236 i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3], 1237 (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size); 1238 hash = &ilbs->ilbs_sticky_hash[i]; 1239 1240 /* First check if there is already an entry. */ 1241 mutex_enter(&hash->sticky_lock); 1242 s = ilb_sticky_lookup(hash, rule, src); 1243 1244 /* No sticky entry, add one. */ 1245 if (s == NULL) { 1246 add_new_entry: 1247 s = ilb_sticky_add(hash, rule, server, src); 1248 if (s == NULL) { 1249 mutex_exit(&hash->sticky_lock); 1250 return (NULL); 1251 } 1252 /* 1253 * Find a source for this server. All subseqent requests from 1254 * the same client matching this sticky entry will use this 1255 * source address in doing NAT. The current algorithm is 1256 * simple, rotate the source address. Note that the 1257 * source address array does not change after it's created, so 1258 * it is OK to just increment the cur index. 1259 */ 1260 if (server->iser_nat_src != NULL) { 1261 /* It is a hint, does not need to be atomic. */ 1262 *src_ent_idx = (server->iser_nat_src->cur++ % 1263 server->iser_nat_src->num_src); 1264 s->nat_src_idx = *src_ent_idx; 1265 } 1266 mutex_exit(&hash->sticky_lock); 1267 *res = s; 1268 return (server); 1269 } 1270 1271 /* 1272 * We don't hold any lock accessing iser_enabled. Refer to the 1273 * comment in ilb_server_add() about iser_lock. 1274 */ 1275 if (!s->server->iser_enabled) { 1276 /* 1277 * s->server == server can only happen if there is a race in 1278 * toggling the iser_enabled flag (we don't hold a lock doing 1279 * that) so that the load balance algorithm still returns a 1280 * disabled server. In this case, just drop the packet... 1281 */ 1282 if (s->server == server) { 1283 mutex_exit(&hash->sticky_lock); 1284 return (NULL); 1285 } 1286 1287 /* 1288 * The old server is disabled and there is a new server, use 1289 * the new one to create a sticky entry. Since we will 1290 * add the entry at the beginning, subsequent lookup will 1291 * find this new entry instead of the old one. 1292 */ 1293 goto add_new_entry; 1294 } 1295 1296 s->refcnt++; 1297 *res = s; 1298 mutex_exit(&hash->sticky_lock); 1299 if (server->iser_nat_src != NULL) 1300 *src_ent_idx = s->nat_src_idx; 1301 return (s->server); 1302 } 1303 1304 static void 1305 ilb_sticky_cleanup(void *arg) 1306 { 1307 ilb_timer_t *timer = (ilb_timer_t *)arg; 1308 uint32_t i; 1309 ilb_stack_t *ilbs; 1310 ilb_sticky_hash_t *hash; 1311 ilb_sticky_t *s, *nxt_s; 1312 int64_t now, expiry; 1313 1314 ilbs = timer->ilbs; 1315 hash = ilbs->ilbs_sticky_hash; 1316 ASSERT(hash != NULL); 1317 1318 now = ddi_get_lbolt64(); 1319 for (i = timer->start; i < timer->end; i++) { 1320 mutex_enter(&hash[i].sticky_lock); 1321 for (s = list_head(&hash[i].sticky_head); s != NULL; 1322 s = nxt_s) { 1323 nxt_s = list_next(&hash[i].sticky_head, s); 1324 if (s->refcnt != 0) 1325 continue; 1326 expiry = now - SEC_TO_TICK(s->expiry); 1327 if (s->atime < expiry) { 1328 ILB_SERVER_REFRELE(s->server); 1329 list_remove(&hash[i].sticky_head, s); 1330 kmem_cache_free(ilb_sticky_cache, s); 1331 hash[i].sticky_cnt--; 1332 } 1333 } 1334 mutex_exit(&hash[i].sticky_lock); 1335 } 1336 } 1337 1338 static void 1339 ilb_sticky_timer(void *arg) 1340 { 1341 ilb_timer_t *timer = (ilb_timer_t *)arg; 1342 1343 (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq, 1344 ilb_sticky_cleanup, arg, TQ_SLEEP); 1345 mutex_enter(&timer->tid_lock); 1346 if (timer->tid == 0) { 1347 mutex_exit(&timer->tid_lock); 1348 } else { 1349 timer->tid = timeout(ilb_sticky_timer, arg, 1350 SEC_TO_TICK(ilb_sticky_timeout)); 1351 mutex_exit(&timer->tid_lock); 1352 } 1353 } 1354 1355 void 1356 ilb_sticky_hash_init(ilb_stack_t *ilbs) 1357 { 1358 extern pri_t minclsyspri; 1359 int i, part; 1360 char tq_name[TASKQ_NAMELEN]; 1361 ilb_timer_t *tm; 1362 1363 if (!ISP2(ilbs->ilbs_sticky_hash_size)) { 1364 for (i = 0; i < 31; i++) { 1365 if (ilbs->ilbs_sticky_hash_size < (1 << i)) 1366 break; 1367 } 1368 ilbs->ilbs_sticky_hash_size = 1 << i; 1369 } 1370 1371 ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) * 1372 ilbs->ilbs_sticky_hash_size, KM_SLEEP); 1373 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { 1374 mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL, 1375 MUTEX_DEFAULT, NULL); 1376 list_create(&ilbs->ilbs_sticky_hash[i].sticky_head, 1377 sizeof (ilb_sticky_t), 1378 offsetof(ilb_sticky_t, list)); 1379 } 1380 1381 if (ilb_sticky_cache == NULL) 1382 ilb_sticky_cache_init(); 1383 1384 (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p", 1385 (void *)ilbs->ilbs_netstack); 1386 ASSERT(ilbs->ilbs_sticky_taskq == NULL); 1387 ilbs->ilbs_sticky_taskq = taskq_create(tq_name, 1388 ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size, 1389 ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); 1390 1391 ASSERT(ilbs->ilbs_sticky_timer_list == NULL); 1392 ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * 1393 ilb_sticky_timer_size, KM_SLEEP); 1394 part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1; 1395 for (i = 0; i < ilb_sticky_timer_size; i++) { 1396 tm = ilbs->ilbs_sticky_timer_list + i; 1397 tm->start = i * part; 1398 tm->end = i * part + part; 1399 if (tm->end > ilbs->ilbs_sticky_hash_size) 1400 tm->end = ilbs->ilbs_sticky_hash_size; 1401 tm->ilbs = ilbs; 1402 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); 1403 /* Spread out the starting execution time of all the timers. */ 1404 tm->tid = timeout(ilb_sticky_timer, tm, 1405 SEC_TO_TICK(ilb_sticky_timeout + i)); 1406 } 1407 } 1408 1409 void 1410 ilb_sticky_hash_fini(ilb_stack_t *ilbs) 1411 { 1412 int i; 1413 ilb_sticky_t *s; 1414 1415 if (ilbs->ilbs_sticky_hash == NULL) 1416 return; 1417 1418 /* Stop all the timers first. */ 1419 for (i = 0; i < ilb_sticky_timer_size; i++) { 1420 timeout_id_t tid; 1421 1422 /* Setting tid to 0 tells the timer handler not to restart. */ 1423 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock); 1424 tid = ilbs->ilbs_sticky_timer_list[i].tid; 1425 ilbs->ilbs_sticky_timer_list[i].tid = 0; 1426 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock); 1427 (void) untimeout(tid); 1428 } 1429 kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) * 1430 ilb_sticky_timer_size); 1431 taskq_destroy(ilbs->ilbs_sticky_taskq); 1432 ilbs->ilbs_sticky_taskq = NULL; 1433 1434 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { 1435 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head)) 1436 != NULL) { 1437 list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s); 1438 ILB_SERVER_REFRELE(s->server); 1439 kmem_free(s, sizeof (ilb_sticky_t)); 1440 } 1441 } 1442 kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size * 1443 sizeof (ilb_sticky_hash_t)); 1444 } 1445 1446 /* 1447 * This routine sends up the sticky hash table to user land. Refer to 1448 * the comments before ilb_list_nat(). Both routines assume similar 1449 * conditions. 1450 * 1451 * It is assumed that the caller has checked the size of st so that it 1452 * can hold num entries. 1453 */ 1454 /* ARGSUSED */ 1455 int 1456 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st, 1457 uint32_t *num, uint32_t *flags) 1458 { 1459 ilb_sticky_hash_t *hash; 1460 ilb_sticky_t *curp; 1461 uint32_t i, j; 1462 int ret = 0; 1463 1464 mutex_enter(&ilbs->ilbs_sticky_list_lock); 1465 while (ilbs->ilbs_sticky_list_busy) { 1466 if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv, 1467 &ilbs->ilbs_sticky_list_lock) == 0) { 1468 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1469 return (EINTR); 1470 } 1471 } 1472 if ((hash = ilbs->ilbs_sticky_hash) == NULL) { 1473 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1474 *num = 0; 1475 *flags |= ILB_LIST_END; 1476 return (0); 1477 } 1478 ilbs->ilbs_sticky_list_busy = B_TRUE; 1479 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1480 1481 if (*flags & ILB_LIST_BEGIN) { 1482 i = 0; 1483 mutex_enter(&hash[0].sticky_lock); 1484 curp = list_head(&hash[0].sticky_head); 1485 } else if (*flags & ILB_LIST_CONT) { 1486 if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) { 1487 *num = 0; 1488 *flags |= ILB_LIST_END; 1489 goto done; 1490 } 1491 i = ilbs->ilbs_sticky_list_cur; 1492 mutex_enter(&hash[i].sticky_lock); 1493 curp = ilbs->ilbs_sticky_list_curp; 1494 } else { 1495 ret = EINVAL; 1496 goto done; 1497 } 1498 1499 j = 0; 1500 while (j < *num) { 1501 if (curp == NULL) { 1502 mutex_exit(&hash[i].sticky_lock); 1503 if (++i == ilbs->ilbs_sticky_hash_size) { 1504 *flags |= ILB_LIST_END; 1505 break; 1506 } 1507 mutex_enter(&hash[i].sticky_lock); 1508 curp = list_head(&hash[i].sticky_head); 1509 continue; 1510 } 1511 (void) strcpy(st[j].rule_name, curp->rule_name); 1512 st[j].req_addr = curp->src; 1513 st[j].srv_addr = curp->server->iser_addr_v6; 1514 st[j].expiry_time = TICK_TO_MSEC(curp->expiry); 1515 j++; 1516 curp = list_next(&hash[i].sticky_head, curp); 1517 } 1518 ilbs->ilbs_sticky_list_curp = curp; 1519 if (j == *num) 1520 mutex_exit(&hash[i].sticky_lock); 1521 1522 ilbs->ilbs_sticky_list_cur = i; 1523 1524 *num = j; 1525 done: 1526 mutex_enter(&ilbs->ilbs_sticky_list_lock); 1527 ilbs->ilbs_sticky_list_busy = B_FALSE; 1528 cv_signal(&ilbs->ilbs_sticky_list_cv); 1529 mutex_exit(&ilbs->ilbs_sticky_list_lock); 1530 1531 return (ret); 1532 }