1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/sysmacros.h>
  28 #include <sys/types.h>
  29 #include <sys/conf.h>
  30 #include <sys/time.h>
  31 #include <sys/taskq.h>
  32 #include <sys/cmn_err.h>
  33 #include <sys/sdt.h>
  34 #include <sys/atomic.h>
  35 #include <netinet/in.h>
  36 #include <inet/ip.h>
  37 #include <inet/ip6.h>
  38 #include <inet/tcp.h>
  39 #include <inet/udp_impl.h>
  40 #include <inet/ilb.h>
  41 
  42 #include "ilb_stack.h"
  43 #include "ilb_impl.h"
  44 #include "ilb_conn.h"
  45 #include "ilb_nat.h"
  46 
/*
 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection.  Each
 * timer owns one slice of a hash table.
 *
 * start: first hash table index (inclusive) this timer garbage collects
 * end: last hash table index (exclusive) this timer garbage collects
 * ilbs: pointer to the ilb_stack_t of the IP stack
 * tid_lock: mutex to protect the timer id.
 * tid: timer id of the timer.  The teardown code sets it to 0 (under
 *	tid_lock) to tell the timer handler not to re-arm itself.
 */
typedef struct ilb_timer_s {
        uint32_t        start;
        uint32_t        end;
        ilb_stack_t     *ilbs;
        kmutex_t        tid_lock;
        timeout_id_t    tid;
} ilb_timer_t;
  63 
/*
 * Hash macro for finding the index to the conn hash table.
 *
 * saddr and daddr are dereferenced at offsets 0 through 3; the callers in
 * this file pass uint8_t pointers to the last 32 bits of the addresses.
 * Values at the same offset are XORed together and weighted by powers of 37
 * (37^3 = 50653, 37^2 = 1369), then sport * 37 + dport is added.  The result
 * is masked with (hash_size - 1), so hash_size must be a power of 2.
 *
 * Note that saddr and daddr are evaluated several times; do not pass
 * expressions with side effects.
 */
#define ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)    \
        (((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 +           \
        (*((saddr) + 2) ^ *((daddr) + 2)) * 1369 +              \
        (*((saddr) + 1) ^ *((daddr) + 1)) * 37 +                \
        (*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) &   \
        ((hash_size) - 1))
  71 
/*
 * Kmem cache for the conn hash entry (ilb_conn_t).  It is created by
 * ilb_conn_cache_init() and destroyed by ilb_conn_cache_fini().
 */
static struct kmem_cache *ilb_conn_cache = NULL;

/*
 * There are 60 timers running to do conn cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the conn hash table.  This value is
 * also used to size the gc taskq and the per-stack timer array.
 */
static int ilb_conn_timer_size = 60;

/*
 * Each of the above gc timers wakes up every 15s to do the gc.  The value
 * is in seconds; it is converted with SEC_TO_TICK() when the timer is armed.
 */
static int ilb_conn_cache_timeout = 15;
  83 
/*
 * Hash macro for finding the index to the sticky hash table.
 *
 * The values at saddr[0] through saddr[3] are XORed with rule shifted right
 * by 0, 8, 16 and 24 bits respectively and weighted by powers of 31
 * (31^3 = 29791, 31^2 = 961).  The result is masked with (hash_size - 1),
 * so hash_size must be a power of 2.
 *
 * NOTE(review): the callers are outside this section; presumably saddr is a
 * byte pointer as for ILB_CONN_HASH and rule is an integer value identifying
 * the rule -- confirm against the callers.
 */
#define ILB_STICKY_HASH(saddr, rule, hash_size)                 \
        (((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 +             \
        (*((saddr) + 2) ^ ((rule) >> 16)) * 961 +         \
        (*((saddr) + 1) ^ ((rule) >> 8)) * 31 +                   \
        (*(saddr) ^ (rule))) & ((hash_size) - 1))
  89 
/*
 * Kmem cache pointer for sticky entries.  NOTE(review): presumably the
 * sticky counterpart of ilb_conn_cache; its users are outside this section.
 */
static struct kmem_cache *ilb_sticky_cache = NULL;

/*
 * There are 60 timers running to do sticky cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the sticky hash table.
 */
static int ilb_sticky_timer_size = 60;

/* Each of the above gc timers wakes up every 15s to do the gc. */
static int ilb_sticky_timeout = 15;
 100 
 101 #define ILB_STICKY_REFRELE(s)                   \
 102 {                                               \
 103         mutex_enter(&(s)->hash->sticky_lock); \
 104         (s)->refcnt--;                               \
 105         (s)->atime = ddi_get_lbolt64();              \
 106         mutex_exit(&s->hash->sticky_lock);    \
 107 }
 108 
 109 
 110 static void
 111 ilb_conn_cache_init(void)
 112 {
 113         ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
 114             sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
 115             ilb_kmem_flags);
 116 }
 117 
 118 void
 119 ilb_conn_cache_fini(void)
 120 {
 121         if (ilb_conn_cache != NULL) {
 122                 kmem_cache_destroy(ilb_conn_cache);
 123                 ilb_conn_cache = NULL;
 124         }
 125 }
 126 
 127 static void
 128 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
 129 {
 130         ilb_conn_hash_t *hash;
 131         ilb_conn_t **next, **prev;
 132         ilb_conn_t **next_prev, **prev_next;
 133 
 134         if (c2s) {
 135                 hash = connp->conn_c2s_hash;
 136                 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
 137                 next = &connp->conn_c2s_next;
 138                 prev = &connp->conn_c2s_prev;
 139                 if (*next != NULL)
 140                         next_prev = &(*next)->conn_c2s_prev;
 141                 if (*prev != NULL)
 142                         prev_next = &(*prev)->conn_c2s_next;
 143         } else {
 144                 hash = connp->conn_s2c_hash;
 145                 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
 146                 next = &connp->conn_s2c_next;
 147                 prev = &connp->conn_s2c_prev;
 148                 if (*next != NULL)
 149                         next_prev = &(*next)->conn_s2c_prev;
 150                 if (*prev != NULL)
 151                         prev_next = &(*prev)->conn_s2c_next;
 152         }
 153 
 154         if (hash->ilb_connp == connp) {
 155                 hash->ilb_connp = *next;
 156                 if (*next != NULL)
 157                         *next_prev = NULL;
 158         } else {
 159                 if (*prev != NULL)
 160                         *prev_next = *next;
 161                 if (*next != NULL)
 162                         *next_prev = *prev;
 163         }
 164         ASSERT(hash->ilb_conn_cnt > 0);
 165         hash->ilb_conn_cnt--;
 166 
 167         *next = NULL;
 168         *prev = NULL;
 169 }
 170 
 171 static void
 172 ilb_conn_remove(ilb_conn_t *connp)
 173 {
 174         ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
 175         ilb_conn_remove_common(connp, B_TRUE);
 176         ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
 177         ilb_conn_remove_common(connp, B_FALSE);
 178 
 179         if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
 180                 in_port_t port;
 181 
 182                 port = ntohs(connp->conn_rule_cache.info.nat_sport);
 183                 vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
 184                     (void *)(uintptr_t)port, 1);
 185         }
 186 
 187         if (connp->conn_sticky != NULL)
 188                 ILB_STICKY_REFRELE(connp->conn_sticky);
 189         ILB_SERVER_REFRELE(connp->conn_server);
 190         kmem_cache_free(ilb_conn_cache, connp);
 191 }
 192 
 193 /*
 194  * Routine to do periodic garbage collection of conn hash entries.  When
 195  * a conn hash timer fires, it dispatches a taskq to call this function
 196  * to do the gc.  Note that each taskq is responisble for a portion of
 197  * the table.  The portion is stored in timer->start, timer->end.
 198  */
 199 static void
 200 ilb_conn_cleanup(void *arg)
 201 {
 202         ilb_timer_t *timer = (ilb_timer_t *)arg;
 203         uint32_t i;
 204         ilb_stack_t *ilbs;
 205         ilb_conn_hash_t *c2s_hash, *s2c_hash;
 206         ilb_conn_t *connp, *nxt_connp;
 207         int64_t now;
 208         int64_t expiry;
 209         boolean_t die_now;
 210 
 211         ilbs = timer->ilbs;
 212         c2s_hash = ilbs->ilbs_c2s_conn_hash;
 213         ASSERT(c2s_hash != NULL);
 214 
 215         now = ddi_get_lbolt64();
 216         for (i = timer->start; i < timer->end; i++) {
 217                 mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
 218                 if ((connp = c2s_hash[i].ilb_connp) == NULL) {
 219                         ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
 220                         mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
 221                         continue;
 222                 }
 223                 do {
 224                         ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
 225                         ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
 226                         nxt_connp = connp->conn_c2s_next;
 227                         expiry = now - SEC_TO_TICK(connp->conn_expiry);
 228                         if (connp->conn_server->iser_die_time != 0 &&
 229                             connp->conn_server->iser_die_time < now)
 230                                 die_now = B_TRUE;
 231                         else
 232                                 die_now = B_FALSE;
 233                         s2c_hash = connp->conn_s2c_hash;
 234                         mutex_enter(&s2c_hash->ilb_conn_hash_lock);
 235 
 236                         if (connp->conn_gc || die_now ||
 237                             (connp->conn_c2s_atime < expiry &&
 238                             connp->conn_s2c_atime < expiry)) {
 239                                 /* Need to update the nat list cur_connp */
 240                                 if (connp == ilbs->ilbs_conn_list_connp) {
 241                                         ilbs->ilbs_conn_list_connp =
 242                                             connp->conn_c2s_next;
 243                                 }
 244                                 ilb_conn_remove(connp);
 245                                 goto nxt_connp;
 246                         }
 247 
 248                         if (connp->conn_l4 != IPPROTO_TCP)
 249                                 goto nxt_connp;
 250 
 251                         /* Update and check TCP related conn info */
 252                         if (connp->conn_c2s_tcp_fin_sent &&
 253                             SEQ_GT(connp->conn_s2c_tcp_ack,
 254                             connp->conn_c2s_tcp_fss)) {
 255                                 connp->conn_c2s_tcp_fin_acked = B_TRUE;
 256                         }
 257                         if (connp->conn_s2c_tcp_fin_sent &&
 258                             SEQ_GT(connp->conn_c2s_tcp_ack,
 259                             connp->conn_s2c_tcp_fss)) {
 260                                 connp->conn_s2c_tcp_fin_acked = B_TRUE;
 261                         }
 262                         if (connp->conn_c2s_tcp_fin_acked &&
 263                             connp->conn_s2c_tcp_fin_acked) {
 264                                 ilb_conn_remove(connp);
 265                         }
 266 nxt_connp:
 267                         mutex_exit(&s2c_hash->ilb_conn_hash_lock);
 268                         connp = nxt_connp;
 269                 } while (connp != NULL);
 270                 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
 271         }
 272 }
 273 
 274 /* Conn hash timer routine.  It dispatches a taskq and restart the timer */
 275 static void
 276 ilb_conn_timer(void *arg)
 277 {
 278         ilb_timer_t *timer = (ilb_timer_t *)arg;
 279 
 280         (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
 281             arg, TQ_SLEEP);
 282         mutex_enter(&timer->tid_lock);
 283         if (timer->tid == 0) {
 284                 mutex_exit(&timer->tid_lock);
 285         } else {
 286                 timer->tid = timeout(ilb_conn_timer, arg,
 287                     SEC_TO_TICK(ilb_conn_cache_timeout));
 288                 mutex_exit(&timer->tid_lock);
 289         }
 290 }
 291 
 292 void
 293 ilb_conn_hash_init(ilb_stack_t *ilbs)
 294 {
 295         extern pri_t minclsyspri;
 296         int i, part;
 297         ilb_timer_t *tm;
 298         char tq_name[TASKQ_NAMELEN];
 299 
 300         /*
 301          * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
 302          * the next power of 2.
 303          */
 304         if (!ISP2(ilbs->ilbs_conn_hash_size)) {
 305                 for (i = 0; i < 31; i++) {
 306                         if (ilbs->ilbs_conn_hash_size < (1 << i))
 307                                 break;
 308                 }
 309                 ilbs->ilbs_conn_hash_size = 1 << i;
 310         }
 311 
 312         /*
 313          * Can sleep since this should be called when a rule is being added,
 314          * hence we are not in interrupt context.
 315          */
 316         ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
 317             ilbs->ilbs_conn_hash_size, KM_SLEEP);
 318         ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
 319             ilbs->ilbs_conn_hash_size, KM_SLEEP);
 320 
 321         for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
 322                 mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
 323                     NULL, MUTEX_DEFAULT, NULL);
 324         }
 325         for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
 326                 mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
 327                     NULL, MUTEX_DEFAULT, NULL);
 328         }
 329 
 330         if (ilb_conn_cache == NULL)
 331                 ilb_conn_cache_init();
 332 
 333         (void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
 334             (void *)ilbs->ilbs_netstack);
 335         ASSERT(ilbs->ilbs_conn_taskq == NULL);
 336         ilbs->ilbs_conn_taskq = taskq_create(tq_name,
 337             ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
 338             ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
 339 
 340         ASSERT(ilbs->ilbs_conn_timer_list == NULL);
 341         ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
 342             ilb_conn_timer_size, KM_SLEEP);
 343 
 344         /*
 345          * The hash table is divided in equal partition for those timers
 346          * to do garbage collection.
 347          */
 348         part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
 349         for (i = 0; i < ilb_conn_timer_size; i++) {
 350                 tm = ilbs->ilbs_conn_timer_list + i;
 351                 tm->start = i * part;
 352                 tm->end = i * part + part;
 353                 if (tm->end > ilbs->ilbs_conn_hash_size)
 354                         tm->end = ilbs->ilbs_conn_hash_size;
 355                 tm->ilbs = ilbs;
 356                 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
 357                 /* Spread out the starting execution time of all the timers. */
 358                 tm->tid = timeout(ilb_conn_timer, tm,
 359                     SEC_TO_TICK(ilb_conn_cache_timeout + i));
 360         }
 361 }
 362 
 363 void
 364 ilb_conn_hash_fini(ilb_stack_t *ilbs)
 365 {
 366         uint32_t i;
 367         ilb_conn_t *connp;
 368 
 369         if (ilbs->ilbs_c2s_conn_hash == NULL) {
 370                 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
 371                 return;
 372         }
 373 
 374         /* Stop all the timers first. */
 375         for (i = 0; i < ilb_conn_timer_size; i++) {
 376                 timeout_id_t tid;
 377 
 378                 /* Setting tid to 0 tells the timer handler not to restart. */
 379                 mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
 380                 tid = ilbs->ilbs_conn_timer_list[i].tid;
 381                 ilbs->ilbs_conn_timer_list[i].tid = 0;
 382                 mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
 383                 (void) untimeout(tid);
 384         }
 385         kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
 386             ilb_conn_timer_size);
 387         taskq_destroy(ilbs->ilbs_conn_taskq);
 388         ilbs->ilbs_conn_taskq = NULL;
 389 
 390         /* Then remove all the conns. */
 391         for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
 392                 while ((connp = ilbs->ilbs_s2c_conn_hash->ilb_connp) != NULL) {
 393                         ilbs->ilbs_s2c_conn_hash->ilb_connp =
 394                             connp->conn_s2c_next;
 395                         ILB_SERVER_REFRELE(connp->conn_server);
 396                         if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
 397                                 ilb_nat_src_entry_t *ent;
 398                                 in_port_t port;
 399 
 400                                 /*
 401                                  * src_ent will be freed in ilb_nat_src_fini().
 402                                  */
 403                                 port = ntohs(
 404                                     connp->conn_rule_cache.info.nat_sport);
 405                                 ent = connp->conn_rule_cache.info.src_ent;
 406                                 vmem_free(ent->nse_port_arena,
 407                                     (void *)(uintptr_t)port, 1);
 408                         }
 409                         kmem_cache_free(ilb_conn_cache, connp);
 410                 }
 411         }
 412         kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
 413             ilbs->ilbs_conn_hash_size);
 414         kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
 415             ilbs->ilbs_conn_hash_size);
 416 }
 417 
 418 /*
 419  * Internet checksum adjustment calculation routines.  We pre-calculate
 420  * checksum adjustment so that we don't need to compute the checksum on
 421  * the whole packet when we change address/port in the packet.
 422  */
 423 
 424 static void
 425 hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
 426     in_port_t new_port, uint32_t *adj_sum)
 427 {
 428         uint32_t sum;
 429 
 430         sum = *oaddr + *(oaddr + 1) + old_port;
 431         while ((sum >> 16) != 0)
 432                 sum = (sum & 0xffff) + (sum >> 16);
 433         *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port;
 434 }
 435 
 436 static void
 437 hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
 438     in_port_t new_port, uint32_t *adj_sum)
 439 {
 440         uint32_t sum = 0;
 441 
 442         sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) +
 443             *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) +
 444             old_port;
 445         while ((sum >> 16) != 0)
 446                 sum = (sum & 0xffff) + (sum >> 16);
 447         *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) +
 448             *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) +
 449             *(naddr + 6) + *(naddr + 7) + new_port;
 450 }
 451 
 452 static void
 453 fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
 454     uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
 455     in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
 456 {
 457         uint32_t sum;
 458 
 459         sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) +
 460             old_port2;
 461         while ((sum >> 16) != 0)
 462                 sum = (sum & 0xffff) + (sum >> 16);
 463         *adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 +
 464             *naddr2 + *(naddr2 + 1) + new_port2;
 465 }
 466 
 467 static void
 468 fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
 469     uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
 470     in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
 471 {
 472         uint32_t sum = 0;
 473 
 474         sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) +
 475             *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) +
 476             old_port1;
 477         sum += *oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) +
 478             *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) +
 479             old_port2;
 480         while ((sum >> 16) != 0)
 481                 sum = (sum & 0xffff) + (sum >> 16);
 482         sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) +
 483             *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) +
 484             *(naddr1 + 7) + new_port1;
 485         *adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) +
 486             *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) +
 487             *(naddr2 + 7) + new_port2;
 488 }
 489 
/*
 * Add a conn hash entry to the tables.  Note that a conn hash entry
 * (ilb_conn_t) contains info on both directions.  And there are two hash
 * tables, one for client to server and the other for server to client.
 * So the same entry is added to both tables and can be accessed by two
 * threads simultaneously.  But each thread will only access data on one
 * direction, so there is no conflict.
 *
 * ilbs: the ILB stack instance owning the hash tables
 * rule: the rule the packet matched; supplies protocol, topology, IP
 *	version and NAT expiry
 * server: the back-end server chosen for this conn; a reference is taken
 * src, sport, dst, dport: addresses and ports of the client-to-server
 *	packet that triggered the creation
 * info: NAT info for the conn; copied into the entry
 * ip_sum, tp_sum: out; receive the client-to-server IP and transport
 *	checksum adjustments for the half-NAT and full-NAT topologies
 * s: sticky entry associated with the conn, or NULL
 *
 * Returns 0 on success.  Returns ENOMEM if the entry cannot be allocated;
 * in that case, if s is not NULL, the reference on s is released and, for
 * the full-NAT topology, the NAT source port is given back to its arena.
 */
int
ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
    in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
    ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
{
        ilb_conn_t *connp;
        ilb_conn_hash_t *hash;
        int i;

        /* KM_NOSLEEP: the allocation may fail instead of blocking. */
        connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
        if (connp == NULL) {
                if (s != NULL) {
                        if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
                                ilb_nat_src_entry_t **entry;

                                entry = s->server->iser_nat_src->src_list;
                                vmem_free(entry[s->nat_src_idx]->nse_port_arena,
                                    (void *)(uintptr_t)ntohs(info->nat_sport),
                                    1);
                        }
                        ILB_STICKY_REFRELE(s);
                }
                return (ENOMEM);
        }

        connp->conn_l4 = rule->ir_proto;

        connp->conn_server = server;
        ILB_SERVER_REFHOLD(server);
        connp->conn_sticky = s;

        connp->conn_rule_cache.topo = rule->ir_topo;
        connp->conn_rule_cache.info = *info;

        connp->conn_gc = B_FALSE;

        connp->conn_expiry = rule->ir_nat_expiry;
        connp->conn_cr_time = ddi_get_lbolt64();

        /* Client to server info. */
        connp->conn_c2s_saddr = *src;
        connp->conn_c2s_sport = sport;
        connp->conn_c2s_daddr = *dst;
        connp->conn_c2s_dport = dport;

        connp->conn_c2s_atime = ddi_get_lbolt64();
        /* The packet that triggers this creation should be counted */
        connp->conn_c2s_pkt_cnt = 1;
        connp->conn_c2s_tcp_fin_sent = B_FALSE;
        connp->conn_c2s_tcp_fin_acked = B_FALSE;

        /*
         * Server to client info, before NAT.
         *
         * NOTE(review): there is no default case.  For any topology other
         * than half-NAT and full-NAT the s2c address/port fields, the
         * checksum adjustments and *ip_sum / *tp_sum are not set here.
         */
        switch (rule->ir_topo) {
        case ILB_TOPO_IMPL_HALF_NAT:
                /* Only the destination is rewritten: dst -> nat_dst. */
                connp->conn_s2c_saddr = info->nat_dst;
                connp->conn_s2c_sport = info->nat_dport;
                connp->conn_s2c_daddr = *src;
                connp->conn_s2c_dport = sport;

                /* Pre-calculate checksum changes for both directions */
                if (rule->ir_ipver == IPPROTO_IP) {
                        /* IP header adjustment covers the address only. */
                        hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
                            &connp->conn_c2s_ip_sum);
                        hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
                            info->nat_dport, &connp->conn_c2s_tp_sum);
                        *ip_sum = connp->conn_c2s_ip_sum;
                        *tp_sum = connp->conn_c2s_tp_sum;

                        /* Reverse direction: nat_dst -> dst. */
                        hnat_cksum_v4(
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3], 0, 0,
                            &connp->conn_s2c_ip_sum);
                        hnat_cksum_v4(
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            info->nat_dport, dport,
                            &connp->conn_s2c_tp_sum);
                } else {
                        /* For IPv6 the IP adjustment is set to 0. */
                        connp->conn_c2s_ip_sum = 0;
                        hnat_cksum_v6((uint16_t *)dst,
                            (uint16_t *)&info->nat_dst, dport,
                            info->nat_dport, &connp->conn_c2s_tp_sum);
                        *ip_sum = 0;
                        *tp_sum = connp->conn_c2s_tp_sum;

                        connp->conn_s2c_ip_sum = 0;
                        hnat_cksum_v6((uint16_t *)&info->nat_dst,
                            (uint16_t *)dst, info->nat_dport, dport,
                            &connp->conn_s2c_tp_sum);
                }
                break;
        case ILB_TOPO_IMPL_NAT:
                /* Both source and destination are rewritten. */
                connp->conn_s2c_saddr = info->nat_dst;
                connp->conn_s2c_sport = info->nat_dport;
                connp->conn_s2c_daddr = info->nat_src;
                connp->conn_s2c_dport = info->nat_sport;

                if (rule->ir_ipver == IPPROTO_IP) {
                        fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            (uint16_t *)&info->nat_src.s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            0, 0, 0, 0, &connp->conn_c2s_ip_sum);
                        fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            (uint16_t *)&info->nat_src.s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            sport, dport, info->nat_sport,
                            info->nat_dport, &connp->conn_c2s_tp_sum);
                        *ip_sum = connp->conn_c2s_ip_sum;
                        *tp_sum = connp->conn_c2s_tp_sum;

                        /* Reverse direction: NAT addresses -> originals. */
                        fnat_cksum_v4(
                            (uint16_t *)&info->nat_src.s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            (uint16_t *)&src->s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            0, 0, 0, 0, &connp->conn_s2c_ip_sum);
                        fnat_cksum_v4(
                            (uint16_t *)&info->nat_src.s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            (uint16_t *)&src->s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            info->nat_sport, info->nat_dport,
                            sport, dport, &connp->conn_s2c_tp_sum);
                } else {
                        fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
                            (uint16_t *)&info->nat_src,
                            (uint16_t *)&info->nat_dst,
                            sport, dport, info->nat_sport,
                            info->nat_dport, &connp->conn_c2s_tp_sum);
                        connp->conn_c2s_ip_sum = 0;
                        *ip_sum = 0;
                        *tp_sum = connp->conn_c2s_tp_sum;

                        fnat_cksum_v6((uint16_t *)&info->nat_src,
                            (uint16_t *)&info->nat_dst, (uint16_t *)src,
                            (uint16_t *)dst, info->nat_sport,
                            info->nat_dport, sport, dport,
                            &connp->conn_s2c_tp_sum);
                        connp->conn_s2c_ip_sum = 0;
                }
                break;
        }

        connp->conn_s2c_atime = ddi_get_lbolt64();
        connp->conn_s2c_pkt_cnt = 1;
        connp->conn_s2c_tcp_fin_sent = B_FALSE;
        connp->conn_s2c_tcp_fin_acked = B_FALSE;

        /*
         * Add it to the s2c hash table.  The hash is computed over the last
         * 32 bits of the addresses and the ports in host byte order.
         */
        hash = ilbs->ilbs_s2c_conn_hash;
        i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
            ntohs(connp->conn_s2c_sport),
            (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
            ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
        connp->conn_s2c_hash = &hash[i];
        DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);

        /* Insert at the head of the bucket's chain. */
        mutex_enter(&hash[i].ilb_conn_hash_lock);
        hash[i].ilb_conn_cnt++;
        connp->conn_s2c_next = hash[i].ilb_connp;
        if (hash[i].ilb_connp != NULL)
                hash[i].ilb_connp->conn_s2c_prev = connp;
        connp->conn_s2c_prev = NULL;
        hash[i].ilb_connp = connp;
        mutex_exit(&hash[i].ilb_conn_hash_lock);

        /* Add it to the c2s hash table. */
        hash = ilbs->ilbs_c2s_conn_hash;
        i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
            (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
            ilbs->ilbs_conn_hash_size);
        connp->conn_c2s_hash = &hash[i];
        DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);

        /* Insert at the head of the bucket's chain. */
        mutex_enter(&hash[i].ilb_conn_hash_lock);
        hash[i].ilb_conn_cnt++;
        connp->conn_c2s_next = hash[i].ilb_connp;
        if (hash[i].ilb_connp != NULL)
                hash[i].ilb_connp->conn_c2s_prev = connp;
        connp->conn_c2s_prev = NULL;
        hash[i].ilb_connp = connp;
        mutex_exit(&hash[i].ilb_conn_hash_lock);

        return (0);
}
 687 
 688 /*
 689  * If a connection is using TCP, we keep track of simple TCP state transition
 690  * so that we know when to clean up an entry.
 691  */
 692 static boolean_t
 693 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
 694     boolean_t c2s)
 695 {
 696         uint32_t ack, seq;
 697         int32_t seg_len;
 698 
 699         if (tcpha->tha_flags & TH_RST)
 700                 return (B_FALSE);
 701 
 702         seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
 703             TCP_HDR_LENGTH((tcph_t *)tcpha);
 704 
 705         if (tcpha->tha_flags & TH_ACK)
 706                 ack = ntohl(tcpha->tha_ack);
 707         seq = ntohl(tcpha->tha_seq);
 708         if (c2s) {
 709                 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
 710                 if (tcpha->tha_flags & TH_FIN) {
 711                         connp->conn_c2s_tcp_fss = seq + seg_len;
 712                         connp->conn_c2s_tcp_fin_sent = B_TRUE;
 713                 }
 714                 connp->conn_c2s_tcp_ack = ack;
 715 
 716                 /* Port reuse by the client, restart the conn. */
 717                 if (connp->conn_c2s_tcp_fin_sent &&
 718                     SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
 719                         connp->conn_c2s_tcp_fin_sent = B_FALSE;
 720                         connp->conn_c2s_tcp_fin_acked = B_FALSE;
 721                 }
 722         } else {
 723                 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
 724                 if (tcpha->tha_flags & TH_FIN) {
 725                         connp->conn_s2c_tcp_fss = seq + seg_len;
 726                         connp->conn_s2c_tcp_fin_sent = B_TRUE;
 727                 }
 728                 connp->conn_s2c_tcp_ack = ack;
 729 
 730                 /* Port reuse by the client, restart the conn. */
 731                 if (connp->conn_s2c_tcp_fin_sent &&
 732                     SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
 733                         connp->conn_s2c_tcp_fin_sent = B_FALSE;
 734                         connp->conn_s2c_tcp_fin_acked = B_FALSE;
 735                 }
 736         }
 737 
 738         return (B_TRUE);
 739 }
 740 
 741 /*
 742  * Helper routint to find conn hash entry given some packet information and
 743  * the traffic direction (c2s, client to server?)
 744  */
 745 static boolean_t
 746 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
 747     in_port_t sport, in6_addr_t *dst, in_port_t dport,
 748     ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
 749     int32_t pkt_len, boolean_t c2s)
 750 {
 751         ilb_conn_hash_t *hash;
 752         uint_t i;
 753         ilb_conn_t *connp;
 754         boolean_t tcp_alive;
 755         boolean_t ret = B_FALSE;
 756 
 757         i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
 758             (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
 759             ilbs->ilbs_conn_hash_size);
 760         if (c2s) {
 761                 hash = ilbs->ilbs_c2s_conn_hash;
 762                 mutex_enter(&hash[i].ilb_conn_hash_lock);
 763                 for (connp = hash[i].ilb_connp; connp != NULL;
 764                     connp = connp->conn_c2s_next) {
 765                         if (connp->conn_l4 == l4 &&
 766                             connp->conn_c2s_dport == dport &&
 767                             connp->conn_c2s_sport == sport &&
 768                             IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
 769                             IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
 770                                 connp->conn_c2s_atime = ddi_get_lbolt64();
 771                                 connp->conn_c2s_pkt_cnt++;
 772                                 *rule_cache = connp->conn_rule_cache;
 773                                 *ip_sum = connp->conn_c2s_ip_sum;
 774                                 *tp_sum = connp->conn_c2s_tp_sum;
 775                                 ret = B_TRUE;
 776                                 break;
 777                         }
 778                 }
 779         } else {
 780                 hash = ilbs->ilbs_s2c_conn_hash;
 781                 mutex_enter(&hash[i].ilb_conn_hash_lock);
 782                 for (connp = hash[i].ilb_connp; connp != NULL;
 783                     connp = connp->conn_s2c_next) {
 784                         if (connp->conn_l4 == l4 &&
 785                             connp->conn_s2c_dport == dport &&
 786                             connp->conn_s2c_sport == sport &&
 787                             IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
 788                             IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
 789                                 connp->conn_s2c_atime = ddi_get_lbolt64();
 790                                 connp->conn_s2c_pkt_cnt++;
 791                                 *rule_cache = connp->conn_rule_cache;
 792                                 *ip_sum = connp->conn_s2c_ip_sum;
 793                                 *tp_sum = connp->conn_s2c_tp_sum;
 794                                 ret = B_TRUE;
 795                                 break;
 796                         }
 797                 }
 798         }
 799         if (ret) {
 800                 ILB_S_KSTAT(connp->conn_server, pkt_processed);
 801                 ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
 802                     pkt_len);
 803 
 804                 switch (l4) {
 805                 case (IPPROTO_TCP):
 806                         tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
 807                             c2s);
 808                         if (!tcp_alive) {
 809                                 connp->conn_gc = B_TRUE;
 810                         }
 811                         break;
 812                 default:
 813                         break;
 814                 }
 815         }
 816         mutex_exit(&hash[i].ilb_conn_hash_lock);
 817 
 818         return (ret);
 819 }
 820 
 821 /*
 822  * To check if a give packet matches an existing conn hash entry.  If it
 823  * does, return the information about this entry so that the caller can
 824  * do the proper NAT.
 825  */
 826 boolean_t
 827 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
 828     in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
 829     uint32_t pkt_len, in6_addr_t *lb_dst)
 830 {
 831         ilb_rule_info_t rule_cache;
 832         uint32_t adj_ip_sum, adj_tp_sum;
 833         boolean_t ret;
 834 
 835         /* Check the incoming hash table. */
 836         if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
 837             &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
 838                 switch (rule_cache.topo) {
 839                 case ILB_TOPO_IMPL_NAT:
 840                         *lb_dst = rule_cache.info.nat_dst;
 841                         ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
 842                             adj_ip_sum, adj_tp_sum, B_TRUE);
 843                         ret = B_TRUE;
 844                         break;
 845                 case ILB_TOPO_IMPL_HALF_NAT:
 846                         *lb_dst = rule_cache.info.nat_dst;
 847                         ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
 848                             adj_ip_sum, adj_tp_sum, B_TRUE);
 849                         ret = B_TRUE;
 850                         break;
 851                 default:
 852                         ret = B_FALSE;
 853                         break;
 854                 }
 855                 return (ret);
 856         }
 857         if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
 858             &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
 859                 switch (rule_cache.topo) {
 860                 case ILB_TOPO_IMPL_NAT:
 861                         *lb_dst = rule_cache.info.src;
 862                         ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
 863                             adj_ip_sum, adj_tp_sum, B_FALSE);
 864                         ret = B_TRUE;
 865                         break;
 866                 case ILB_TOPO_IMPL_HALF_NAT:
 867                         *lb_dst = *dst;
 868                         ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
 869                             adj_ip_sum, adj_tp_sum, B_FALSE);
 870                         ret = B_TRUE;
 871                         break;
 872                 default:
 873                         ret = B_FALSE;
 874                         break;
 875                 }
 876                 return (ret);
 877         }
 878 
 879         return (B_FALSE);
 880 }
 881 
 882 /*
 883  * To check if an ICMP packet belongs to a connection in one of the conn
 884  * hash entries.
 885  */
 886 boolean_t
 887 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
 888     void *icmph, in6_addr_t *lb_dst)
 889 {
 890         ilb_conn_hash_t *hash;
 891         ipha_t *in_iph4;
 892         ip6_t *in_iph6;
 893         icmph_t *icmph4;
 894         icmp6_t *icmph6;
 895         in6_addr_t *in_src_p, *in_dst_p;
 896         in_port_t *sport, *dport;
 897         int l4;
 898         uint_t i;
 899         ilb_conn_t *connp;
 900         ilb_rule_info_t rule_cache;
 901         uint32_t adj_ip_sum;
 902         boolean_t full_nat;
 903 
 904         if (l3 == IPPROTO_IP) {
 905                 in6_addr_t in_src, in_dst;
 906 
 907                 icmph4 = (icmph_t *)icmph;
 908                 in_iph4 = (ipha_t *)&icmph4[1];
 909 
 910                 if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
 911                     ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
 912                         return (B_FALSE);
 913                 }
 914 
 915                 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
 916                 in_src_p = &in_src;
 917                 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
 918                 in_dst_p = &in_dst;
 919 
 920                 l4 = in_iph4->ipha_protocol;
 921                 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
 922                         return (B_FALSE);
 923 
 924                 sport = (in_port_t *)((char *)in_iph4 +
 925                     IPH_HDR_LENGTH(in_iph4));
 926                 dport = sport + 1;
 927 
 928                 DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
 929                     in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
 930                     ntohs(*sport), uint16_t, ntohs(*dport));
 931         } else {
 932                 ASSERT(l3 == IPPROTO_IPV6);
 933 
 934                 icmph6 = (icmp6_t *)icmph;
 935                 in_iph6 = (ip6_t *)&icmph6[1];
 936                 in_src_p = &in_iph6->ip6_src;
 937                 in_dst_p = &in_iph6->ip6_dst;
 938 
 939                 if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
 940                     ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
 941                         return (B_FALSE);
 942                 }
 943 
 944                 l4 = in_iph6->ip6_nxt;
 945                 /* We don't go deep inside an IPv6 packet yet. */
 946                 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
 947                         return (B_FALSE);
 948 
 949                 sport = (in_port_t *)&in_iph6[1];
 950                 dport = sport + 1;
 951 
 952                 DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
 953                     &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
 954                     uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
 955         }
 956 
 957         i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
 958             (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
 959             ilbs->ilbs_conn_hash_size);
 960         hash = ilbs->ilbs_c2s_conn_hash;
 961 
 962         mutex_enter(&hash[i].ilb_conn_hash_lock);
 963         for (connp = hash[i].ilb_connp; connp != NULL;
 964             connp = connp->conn_c2s_next) {
 965                 if (connp->conn_l4 == l4 &&
 966                     connp->conn_c2s_dport == *sport &&
 967                     connp->conn_c2s_sport == *dport &&
 968                     IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
 969                     IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
 970                         connp->conn_c2s_atime = ddi_get_lbolt64();
 971                         connp->conn_c2s_pkt_cnt++;
 972                         rule_cache = connp->conn_rule_cache;
 973                         adj_ip_sum = connp->conn_c2s_ip_sum;
 974                         break;
 975                 }
 976         }
 977         mutex_exit(&hash[i].ilb_conn_hash_lock);
 978 
 979         if (connp == NULL) {
 980                 DTRACE_PROBE(ilb__chk__icmp__conn__failed);
 981                 return (B_FALSE);
 982         }
 983 
 984         switch (rule_cache.topo) {
 985         case ILB_TOPO_IMPL_NAT:
 986                 full_nat = B_TRUE;
 987                 break;
 988         case ILB_TOPO_IMPL_HALF_NAT:
 989                 full_nat = B_FALSE;
 990                 break;
 991         default:
 992                 return (B_FALSE);
 993         }
 994 
 995         *lb_dst = rule_cache.info.nat_dst;
 996         if (l3 == IPPROTO_IP) {
 997                 ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
 998                     &rule_cache.info, adj_ip_sum, full_nat);
 999         } else {
1000                 ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
1001                     &rule_cache.info, full_nat);
1002         }
1003         return (B_TRUE);
1004 }
1005 
1006 /*
1007  * This routine sends up the conn hash table to user land.  Note that the
1008  * request is an ioctl, hence we cannot really differentiate requests
1009  * from different clients.  There is no context shared between different
1010  * ioctls.  Here we make the assumption that the user land ilbd will
1011  * only allow one client to show the conn hash table at any time.
1012  * Otherwise, the results will be "very" inconsistent.
1013  *
1014  * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
1015  * to read from the beginning of the able.  After a certain entries
1016  * are reported, the kernel remembers the position of the last returned
1017  * entry.  When the next ioctl comes in with the ILB_LIST_BEGIN flag,
1018  * it will return entries starting from where it was left off.  When
1019  * the end of table is reached, a flag (ILB_LIST_END) is set to tell
1020  * the client that there is no more entry.
1021  *
1022  * It is assumed that the caller has checked the size of nat so that it
1023  * can hold num entries.
1024  */
1025 /* ARGSUSED */
1026 int
1027 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
1028     uint32_t *num, uint32_t *flags)
1029 {
1030         ilb_conn_hash_t *hash;
1031         ilb_conn_t *cur_connp;
1032         uint32_t i, j;
1033         int ret = 0;
1034 
1035         mutex_enter(&ilbs->ilbs_conn_list_lock);
1036         while (ilbs->ilbs_conn_list_busy) {
1037                 if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
1038                     &ilbs->ilbs_conn_list_lock) == 0) {
1039                         mutex_exit(&ilbs->ilbs_conn_list_lock);
1040                         return (EINTR);
1041                 }
1042         }
1043         if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
1044                 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
1045                 mutex_exit(&ilbs->ilbs_conn_list_lock);
1046                 *num = 0;
1047                 *flags |= ILB_LIST_END;
1048                 return (0);
1049         }
1050         ilbs->ilbs_conn_list_busy = B_TRUE;
1051         mutex_exit(&ilbs->ilbs_conn_list_lock);
1052 
1053         if (*flags & ILB_LIST_BEGIN) {
1054                 i = 0;
1055                 mutex_enter(&hash[0].ilb_conn_hash_lock);
1056                 cur_connp = hash[0].ilb_connp;
1057         } else if (*flags & ILB_LIST_CONT) {
1058                 if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
1059                         *num = 0;
1060                         *flags |= ILB_LIST_END;
1061                         goto done;
1062                 }
1063                 i = ilbs->ilbs_conn_list_cur;
1064                 mutex_enter(&hash[i].ilb_conn_hash_lock);
1065                 cur_connp = ilbs->ilbs_conn_list_connp;
1066         } else {
1067                 ret = EINVAL;
1068                 goto done;
1069         }
1070 
1071         j = 0;
1072         while (j < *num) {
1073                 if (cur_connp == NULL) {
1074                         mutex_exit(&hash[i].ilb_conn_hash_lock);
1075                         if (++i == ilbs->ilbs_conn_hash_size) {
1076                                 *flags |= ILB_LIST_END;
1077                                 break;
1078                         }
1079                         mutex_enter(&hash[i].ilb_conn_hash_lock);
1080                         cur_connp = hash[i].ilb_connp;
1081                         continue;
1082                 }
1083                 nat[j].proto = cur_connp->conn_l4;
1084 
1085                 nat[j].in_global = cur_connp->conn_c2s_daddr;
1086                 nat[j].in_global_port = cur_connp->conn_c2s_dport;
1087                 nat[j].out_global = cur_connp->conn_c2s_saddr;
1088                 nat[j].out_global_port = cur_connp->conn_c2s_sport;
1089 
1090                 nat[j].in_local = cur_connp->conn_s2c_saddr;
1091                 nat[j].in_local_port = cur_connp->conn_s2c_sport;
1092                 nat[j].out_local = cur_connp->conn_s2c_daddr;
1093                 nat[j].out_local_port = cur_connp->conn_s2c_dport;
1094 
1095                 nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
1096                 nat[j].last_access_time =
1097                     TICK_TO_MSEC(cur_connp->conn_c2s_atime);
1098 
1099                 /*
1100                  * The conn_s2c_pkt_cnt may not be accurate since we are not
1101                  * holding the s2c hash lock.
1102                  */
1103                 nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
1104                     cur_connp->conn_s2c_pkt_cnt;
1105                 j++;
1106 
1107                 cur_connp = cur_connp->conn_c2s_next;
1108         }
1109         ilbs->ilbs_conn_list_connp = cur_connp;
1110         if (j == *num)
1111                 mutex_exit(&hash[i].ilb_conn_hash_lock);
1112 
1113         ilbs->ilbs_conn_list_cur = i;
1114 
1115         *num = j;
1116 done:
1117         mutex_enter(&ilbs->ilbs_conn_list_lock);
1118         ilbs->ilbs_conn_list_busy = B_FALSE;
1119         cv_signal(&ilbs->ilbs_conn_list_cv);
1120         mutex_exit(&ilbs->ilbs_conn_list_lock);
1121 
1122         return (ret);
1123 }
1124 
1125 
1126 /*
1127  * Stickiness (persistence) handling routines.
1128  */
1129 
1130 
1131 static void
1132 ilb_sticky_cache_init(void)
1133 {
1134         ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
1135             sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
1136             ilb_kmem_flags);
1137 }
1138 
1139 void
1140 ilb_sticky_cache_fini(void)
1141 {
1142         if (ilb_sticky_cache != NULL) {
1143                 kmem_cache_destroy(ilb_sticky_cache);
1144                 ilb_sticky_cache = NULL;
1145         }
1146 }
1147 
1148 void
1149 ilb_sticky_refrele(ilb_sticky_t *s)
1150 {
1151         ILB_STICKY_REFRELE(s);
1152 }
1153 
1154 static ilb_sticky_t *
1155 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1156 {
1157         ilb_sticky_t *s;
1158 
1159         ASSERT(mutex_owned(&hash->sticky_lock));
1160 
1161         for (s = list_head(&hash->sticky_head); s != NULL;
1162             s = list_next(&hash->sticky_head, s)) {
1163                 if (s->rule_instance == rule->ir_ks_instance) {
1164                         if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1165                                 return (s);
1166                 }
1167         }
1168         return (NULL);
1169 }
1170 
1171 static ilb_sticky_t *
1172 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1173     in6_addr_t *src)
1174 {
1175         ilb_sticky_t *s;
1176 
1177         ASSERT(mutex_owned(&hash->sticky_lock));
1178 
1179         if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1180                 return (NULL);
1181 
1182         /*
1183          * The rule instance is for handling the scenario when the same
1184          * client talks to different rules at the same time.  Stickiness
1185          * is per rule so we can use the rule instance to differentiate
1186          * the client's request.
1187          */
1188         s->rule_instance = rule->ir_ks_instance;
1189         /*
1190          * Copy the rule name for listing all sticky cache entry.  ir_name
1191          * is guaranteed to be NULL terminated.
1192          */
1193         (void) strcpy(s->rule_name, rule->ir_name);
1194         s->server = server;
1195 
1196         /*
1197          * Grab a ref cnt on the server so that it won't go away while
1198          * it is still in the sticky table.
1199          */
1200         ILB_SERVER_REFHOLD(server);
1201         s->src = *src;
1202         s->expiry = rule->ir_sticky_expiry;
1203         s->refcnt = 1;
1204         s->hash = hash;
1205 
1206         /*
1207          * There is no need to set atime here since the refcnt is not
1208          * zero.  A sticky entry is removed only when the refcnt is
1209          * zero.  But just set it here for debugging purpose.  The
1210          * atime is set when a refrele is done on a sticky entry.
1211          */
1212         s->atime = ddi_get_lbolt64();
1213 
1214         list_insert_head(&hash->sticky_head, s);
1215         hash->sticky_cnt++;
1216         return (s);
1217 }
1218 
1219 /*
1220  * This routine checks if there is an existing sticky entry which matches
1221  * a given packet.  If there is one, return it.  If there is not, create
1222  * a sticky entry using the packet's info.
1223  */
1224 ilb_server_t *
1225 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
1226     ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
1227 {
1228         int i;
1229         ilb_sticky_hash_t *hash;
1230         ilb_sticky_t *s;
1231 
1232         ASSERT(server != NULL);
1233 
1234         *res = NULL;
1235 
1236         i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
1237             (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
1238         hash = &ilbs->ilbs_sticky_hash[i];
1239 
1240         /* First check if there is already an entry. */
1241         mutex_enter(&hash->sticky_lock);
1242         s = ilb_sticky_lookup(hash, rule, src);
1243 
1244         /* No sticky entry, add one. */
1245         if (s == NULL) {
1246 add_new_entry:
1247                 s = ilb_sticky_add(hash, rule, server, src);
1248                 if (s == NULL) {
1249                         mutex_exit(&hash->sticky_lock);
1250                         return (NULL);
1251                 }
1252                 /*
1253                  * Find a source for this server.  All subseqent requests from
1254                  * the same client matching this sticky entry will use this
1255                  * source address in doing NAT.  The current algorithm is
1256                  * simple, rotate the source address.  Note that the
1257                  * source address array does not change after it's created, so
1258                  * it is OK to just increment the cur index.
1259                  */
1260                 if (server->iser_nat_src != NULL) {
1261                         /* It is a hint, does not need to be atomic. */
1262                         *src_ent_idx = (server->iser_nat_src->cur++ %
1263                             server->iser_nat_src->num_src);
1264                         s->nat_src_idx = *src_ent_idx;
1265                 }
1266                 mutex_exit(&hash->sticky_lock);
1267                 *res = s;
1268                 return (server);
1269         }
1270 
1271         /*
1272          * We don't hold any lock accessing iser_enabled.  Refer to the
1273          * comment in ilb_server_add() about iser_lock.
1274          */
1275         if (!s->server->iser_enabled) {
1276                 /*
1277                  * s->server == server can only happen if there is a race in
1278                  * toggling the iser_enabled flag (we don't hold a lock doing
1279                  * that) so that the load balance algorithm still returns a
1280                  * disabled server.  In this case, just drop the packet...
1281                  */
1282                 if (s->server == server) {
1283                         mutex_exit(&hash->sticky_lock);
1284                         return (NULL);
1285                 }
1286 
1287                 /*
1288                  * The old server is disabled and there is a new server, use
1289                  * the new one to create a sticky entry.  Since we will
1290                  * add the entry at the beginning, subsequent lookup will
1291                  * find this new entry instead of the old one.
1292                  */
1293                 goto add_new_entry;
1294         }
1295 
1296         s->refcnt++;
1297         *res = s;
1298         mutex_exit(&hash->sticky_lock);
1299         if (server->iser_nat_src != NULL)
1300                 *src_ent_idx = s->nat_src_idx;
1301         return (s->server);
1302 }
1303 
1304 static void
1305 ilb_sticky_cleanup(void *arg)
1306 {
1307         ilb_timer_t *timer = (ilb_timer_t *)arg;
1308         uint32_t i;
1309         ilb_stack_t *ilbs;
1310         ilb_sticky_hash_t *hash;
1311         ilb_sticky_t *s, *nxt_s;
1312         int64_t now, expiry;
1313 
1314         ilbs = timer->ilbs;
1315         hash = ilbs->ilbs_sticky_hash;
1316         ASSERT(hash != NULL);
1317 
1318         now = ddi_get_lbolt64();
1319         for (i = timer->start; i < timer->end; i++) {
1320                 mutex_enter(&hash[i].sticky_lock);
1321                 for (s = list_head(&hash[i].sticky_head); s != NULL;
1322                     s = nxt_s) {
1323                         nxt_s = list_next(&hash[i].sticky_head, s);
1324                         if (s->refcnt != 0)
1325                                 continue;
1326                         expiry = now - SEC_TO_TICK(s->expiry);
1327                         if (s->atime < expiry) {
1328                                 ILB_SERVER_REFRELE(s->server);
1329                                 list_remove(&hash[i].sticky_head, s);
1330                                 kmem_cache_free(ilb_sticky_cache, s);
1331                                 hash[i].sticky_cnt--;
1332                         }
1333                 }
1334                 mutex_exit(&hash[i].sticky_lock);
1335         }
1336 }
1337 
1338 static void
1339 ilb_sticky_timer(void *arg)
1340 {
1341         ilb_timer_t *timer = (ilb_timer_t *)arg;
1342 
1343         (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1344             ilb_sticky_cleanup, arg, TQ_SLEEP);
1345         mutex_enter(&timer->tid_lock);
1346         if (timer->tid == 0) {
1347                 mutex_exit(&timer->tid_lock);
1348         } else {
1349                 timer->tid = timeout(ilb_sticky_timer, arg,
1350                     SEC_TO_TICK(ilb_sticky_timeout));
1351                 mutex_exit(&timer->tid_lock);
1352         }
1353 }
1354 
1355 void
1356 ilb_sticky_hash_init(ilb_stack_t *ilbs)
1357 {
1358         extern pri_t minclsyspri;
1359         int i, part;
1360         char tq_name[TASKQ_NAMELEN];
1361         ilb_timer_t *tm;
1362 
1363         if (!ISP2(ilbs->ilbs_sticky_hash_size)) {
1364                 for (i = 0; i < 31; i++) {
1365                         if (ilbs->ilbs_sticky_hash_size < (1 << i))
1366                                 break;
1367                 }
1368                 ilbs->ilbs_sticky_hash_size = 1 << i;
1369         }
1370 
1371         ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
1372             ilbs->ilbs_sticky_hash_size, KM_SLEEP);
1373         for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1374                 mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
1375                     MUTEX_DEFAULT, NULL);
1376                 list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
1377                     sizeof (ilb_sticky_t),
1378                     offsetof(ilb_sticky_t, list));
1379         }
1380 
1381         if (ilb_sticky_cache == NULL)
1382                 ilb_sticky_cache_init();
1383 
1384         (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
1385             (void *)ilbs->ilbs_netstack);
1386         ASSERT(ilbs->ilbs_sticky_taskq == NULL);
1387         ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
1388             ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
1389             ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
1390 
1391         ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
1392         ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
1393             ilb_sticky_timer_size, KM_SLEEP);
1394         part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
1395         for (i = 0; i < ilb_sticky_timer_size; i++) {
1396                 tm = ilbs->ilbs_sticky_timer_list + i;
1397                 tm->start = i * part;
1398                 tm->end = i * part + part;
1399                 if (tm->end > ilbs->ilbs_sticky_hash_size)
1400                         tm->end = ilbs->ilbs_sticky_hash_size;
1401                 tm->ilbs = ilbs;
1402                 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
1403                 /* Spread out the starting execution time of all the timers. */
1404                 tm->tid = timeout(ilb_sticky_timer, tm,
1405                     SEC_TO_TICK(ilb_sticky_timeout + i));
1406         }
1407 }
1408 
1409 void
1410 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1411 {
1412         int i;
1413         ilb_sticky_t *s;
1414 
1415         if (ilbs->ilbs_sticky_hash == NULL)
1416                 return;
1417 
1418         /* Stop all the timers first. */
1419         for (i = 0; i < ilb_sticky_timer_size; i++) {
1420                 timeout_id_t tid;
1421 
1422                 /* Setting tid to 0 tells the timer handler not to restart. */
1423                 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1424                 tid = ilbs->ilbs_sticky_timer_list[i].tid;
1425                 ilbs->ilbs_sticky_timer_list[i].tid = 0;
1426                 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1427                 (void) untimeout(tid);
1428         }
1429         kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1430             ilb_sticky_timer_size);
1431         taskq_destroy(ilbs->ilbs_sticky_taskq);
1432         ilbs->ilbs_sticky_taskq = NULL;
1433 
1434         for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1435                 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1436                     != NULL) {
1437                         list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1438                         ILB_SERVER_REFRELE(s->server);
1439                         kmem_free(s, sizeof (ilb_sticky_t));
1440                 }
1441         }
1442         kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1443             sizeof (ilb_sticky_hash_t));
1444 }
1445 
1446 /*
1447  * This routine sends up the sticky hash table to user land.  Refer to
1448  * the comments before ilb_list_nat().  Both routines assume similar
1449  * conditions.
1450  *
1451  * It is assumed that the caller has checked the size of st so that it
1452  * can hold num entries.
1453  */
1454 /* ARGSUSED */
1455 int
1456 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
1457     uint32_t *num, uint32_t *flags)
1458 {
1459         ilb_sticky_hash_t *hash;
1460         ilb_sticky_t *curp;
1461         uint32_t i, j;
1462         int ret = 0;
1463 
1464         mutex_enter(&ilbs->ilbs_sticky_list_lock);
1465         while (ilbs->ilbs_sticky_list_busy) {
1466                 if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
1467                     &ilbs->ilbs_sticky_list_lock) == 0) {
1468                         mutex_exit(&ilbs->ilbs_sticky_list_lock);
1469                         return (EINTR);
1470                 }
1471         }
1472         if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
1473                 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1474                 *num = 0;
1475                 *flags |= ILB_LIST_END;
1476                 return (0);
1477         }
1478         ilbs->ilbs_sticky_list_busy = B_TRUE;
1479         mutex_exit(&ilbs->ilbs_sticky_list_lock);
1480 
1481         if (*flags & ILB_LIST_BEGIN) {
1482                 i = 0;
1483                 mutex_enter(&hash[0].sticky_lock);
1484                 curp = list_head(&hash[0].sticky_head);
1485         } else if (*flags & ILB_LIST_CONT) {
1486                 if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
1487                         *num = 0;
1488                         *flags |= ILB_LIST_END;
1489                         goto done;
1490                 }
1491                 i = ilbs->ilbs_sticky_list_cur;
1492                 mutex_enter(&hash[i].sticky_lock);
1493                 curp = ilbs->ilbs_sticky_list_curp;
1494         } else {
1495                 ret = EINVAL;
1496                 goto done;
1497         }
1498 
1499         j = 0;
1500         while (j < *num) {
1501                 if (curp == NULL) {
1502                         mutex_exit(&hash[i].sticky_lock);
1503                         if (++i == ilbs->ilbs_sticky_hash_size) {
1504                                 *flags |= ILB_LIST_END;
1505                                 break;
1506                         }
1507                         mutex_enter(&hash[i].sticky_lock);
1508                         curp = list_head(&hash[i].sticky_head);
1509                         continue;
1510                 }
1511                 (void) strcpy(st[j].rule_name, curp->rule_name);
1512                 st[j].req_addr = curp->src;
1513                 st[j].srv_addr = curp->server->iser_addr_v6;
1514                 st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
1515                 j++;
1516                 curp = list_next(&hash[i].sticky_head, curp);
1517         }
1518         ilbs->ilbs_sticky_list_curp = curp;
1519         if (j == *num)
1520                 mutex_exit(&hash[i].sticky_lock);
1521 
1522         ilbs->ilbs_sticky_list_cur = i;
1523 
1524         *num = j;
1525 done:
1526         mutex_enter(&ilbs->ilbs_sticky_list_lock);
1527         ilbs->ilbs_sticky_list_busy = B_FALSE;
1528         cv_signal(&ilbs->ilbs_sticky_list_cv);
1529         mutex_exit(&ilbs->ilbs_sticky_list_lock);
1530 
1531         return (ret);
1532 }