/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>
/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT entry for each ip_stack_t, created at
 * init time. That entry holds both the IP ident value and the dce
 * generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
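 * (The DCE_IS_CONDEMNED() checks used throughout this file are exactly
 * that single generation comparison.)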
 *
 * An example of code which caches is as follows:
 *
 *      if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *              The DCE has changed
 *              mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *                  &mystruct->my_dce_generation);
 *              Not needed in practice, since we have the default DCE:
 *              if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *                      return failure;
 *      }
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 */

/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
        krwlock_t       dcb_lock;
        uint32_t        dcb_cnt;
        dce_t           *dcb_dce;
} dcb_t;

static void     dce_delete_locked(dcb_t *, dce_t *);
static void     dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;
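
/*
 * State for the global reclaim worker: a single thread, started from
 * dce_g_init(), periodically sweeps the DCE caches of all netstacks.
 * dce_reclaim_lock, dce_reclaim_cv and dce_reclaim_shutdown coordinate
 * its wakeup and its teardown from dce_g_destroy().
 */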
static kthread_t *dce_reclaim_thread;
static kmutex_t dce_reclaim_lock;
static kcondvar_t dce_reclaim_cv;
static int dce_reclaim_shutdown;

/* Global so it can be tuned in /etc/system. This must be a power of two. */
uint_t ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t ip_dce_reclaim_interval = 60;

/* The factor of the DCE threshold at which to start hard reclaims */
uint_t ip_dce_reclaim_threshold_hard = 2;

/* XOR-fold the four 16-bit quarters of a uint64_t into its low bits */
#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of the dce's in the dcb.
 * For now we give DCEs without DCE_PMTU a higher probability of being
 * deleted.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
        uint_t  fraction_pmtu = fraction*4;
        uint_t  hash;
        dce_t   *dce, *nextdce;
        hrtime_t seed = gethrtime();
        uint_t  retained = 0;
        uint_t  max = ipst->ips_ip_dce_reclaim_threshold;

        max *= ip_dce_reclaim_threshold_hard;

        rw_enter(&dcb->dcb_lock, RW_WRITER);
        for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
                nextdce = dce->dce_next;
                /* Clear DCEF_PMTU if the pmtu is too old */
                mutex_enter(&dce->dce_lock);
                if ((dce->dce_flags & DCEF_PMTU) &&
                    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
                    ipst->ips_ip_pathmtu_interval) {
                        dce->dce_flags &= ~DCEF_PMTU;
                        mutex_exit(&dce->dce_lock);
                        dce_increment_generation(dce);
                } else {
                        mutex_exit(&dce->dce_lock);
                }

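                /*
                 * Until "max" entries have been retained, an entry survives
                 * with probability (fraction - 1)/fraction, or with
                 * (fraction_pmtu - 1)/fraction_pmtu if it carries path-MTU
                 * state.  Past that hard limit everything left is deleted;
                 * a max of zero means there is no hard limit.
                 */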
                if (max == 0 || retained < max) {
                        hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));

                        if (dce->dce_flags & DCEF_PMTU) {
                                if (hash % fraction_pmtu != 0) {
                                        retained++;
                                        continue;
                                }
                        } else {
                                if (hash % fraction != 0) {
                                        retained++;
                                        continue;
                                }
                        }
                }

                IP_STAT(ipst, ip_dce_reclaim_deleted);
                dce_delete_locked(dcb, dce);
                dce_refrele(dce);
        }
        rw_exit(&dcb->dcb_lock);
}

/*
 * Reclaim memory from the DCE caches of one IP stack; called from
 * ip_dce_reclaim() below when that stack has flagged itself as needing
 * a reclaim.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
        int     i;

        IP_STAT(ipst, ip_dce_reclaim_calls);
        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
                    ipst->ips_ip_dce_reclaim_fraction);

                dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
                    ipst->ips_ip_dce_reclaim_fraction);
        }

        /*
         * Walk all CONNs that can have a reference on an ire, nce or dce.
         * Get them to update any stale references and drop any refholds
         * they have.
         */
        ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by dce_reclaim_worker() below, and no one else.  Typically this will
 * mean that the number of entries in the hash buckets has exceeded a tunable
 * threshold.
 */
static void
ip_dce_reclaim(void)
{
        netstack_handle_t nh;
        netstack_t *ns;
        ip_stack_t *ipst;

        ASSERT(curthread == dce_reclaim_thread);

        netstack_next_init(&nh);
        while ((ns = netstack_next(&nh)) != NULL) {
                /*
                 * netstack_next() can return a netstack_t with a NULL
                 * netstack_ip at boot time.
                 */
                if ((ipst = ns->netstack_ip) == NULL) {
                        netstack_rele(ns);
                        continue;
                }
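                /*
                 * Atomically claim and clear the per-stack reclaim flag;
                 * it is set by dce_lookup_and_add_v4()/_v6() when a hash
                 * bucket grows past ips_ip_dce_reclaim_threshold.
                 */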
                if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
                        ip_dce_reclaim_stack(ipst);
                netstack_rele(ns);
        }
        netstack_next_fini(&nh);
}

/* ARGSUSED */
static void
dce_reclaim_worker(void *arg)
{
        callb_cpr_t     cprinfo;

        CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
            "dce_reclaim_worker");

        mutex_enter(&dce_reclaim_lock);
        while (!dce_reclaim_shutdown) {
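                /*
                 * Sleep (CPR-safe) for ip_dce_reclaim_interval seconds,
                 * then sweep any stacks that have flagged themselves as
                 * needing a reclaim.
                 */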
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                (void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
                    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
                CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

                if (dce_reclaim_shutdown)
                        break;

                mutex_exit(&dce_reclaim_lock);
                ip_dce_reclaim();
                mutex_enter(&dce_reclaim_lock);
        }

        ASSERT(MUTEX_HELD(&dce_reclaim_lock));
        dce_reclaim_thread = NULL;
        dce_reclaim_shutdown = 0;
        cv_broadcast(&dce_reclaim_cv);
        CALLB_CPR_EXIT(&cprinfo);   /* drops the lock */

        thread_exit();
}

void
dce_g_init(void)
{
        dce_cache = kmem_cache_create("dce_cache",
            sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

        mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);

        dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
            NULL, 0, &p0, TS_RUN, minclsyspri);
}

void
dce_g_destroy(void)
{
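        /*
         * Wake the worker and wait for it to observe dce_reclaim_shutdown,
         * clear dce_reclaim_thread and broadcast back, before tearing down
         * the synchronization objects it uses.
         */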
        mutex_enter(&dce_reclaim_lock);
        dce_reclaim_shutdown = 1;
        cv_signal(&dce_reclaim_cv);
        while (dce_reclaim_thread != NULL)
                cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
        mutex_exit(&dce_reclaim_lock);

        cv_destroy(&dce_reclaim_cv);
        mutex_destroy(&dce_reclaim_lock);

        kmem_cache_destroy(dce_cache);
}

/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
        int     i;

        ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
        bzero(ipst->ips_dce_default, sizeof (dce_t));
        ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
        ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
        ipst->ips_dce_default->dce_last_change_time =
            TICK_TO_SEC(ddi_get_lbolt64());
        ipst->ips_dce_default->dce_refcnt = 1;    /* Should never go away */
        ipst->ips_dce_default->dce_ipst = ipst;

        /* Must be a power of two since we use the IRE_ADDR_HASH() macro */
        ipst->ips_dce_hashsize = ip_dce_hash_size;
        ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
            sizeof (dcb_t), KM_SLEEP);
        ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
            sizeof (dcb_t), KM_SLEEP);
        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
                    NULL);
                rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
                    NULL);
        }
}

void
dce_stack_destroy(ip_stack_t *ipst)
{
        int i;
        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
                rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
        }
        kmem_free(ipst->ips_dce_hash_v4,
            ipst->ips_dce_hashsize * sizeof (dcb_t));
        ipst->ips_dce_hash_v4 = NULL;
        kmem_free(ipst->ips_dce_hash_v6,
            ipst->ips_dce_hashsize * sizeof (dcb_t));
        ipst->ips_dce_hash_v6 = NULL;
        ipst->ips_dce_hashsize = 0;

        ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
        kmem_cache_free(dce_cache, ipst->ips_dce_default);
        ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
        dce_t           *dce;

        dce = ipst->ips_dce_default;
        dce_refhold(dce);
        return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
        if (ixa->ixa_flags & IXAF_IS_IPV4) {
                /*
                 * If we have a source route we need to look for the final
                 * destination in the source route option.
                 */
                ipaddr_t final_dst;
                ipha_t *ipha = (ipha_t *)mp->b_rptr;

                final_dst = ip_get_dst(ipha);
                return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
        } else {
                uint_t ifindex;
                /*
                 * If we have a routing header we need to look for the final
                 * destination in the routing extension header.
                 */
                in6_addr_t final_dst;
                ip6_t *ip6h = (ip6_t *)mp->b_rptr;

                final_dst = ip_get_dst_v6(ip6h, mp, NULL);
                ifindex = 0;
                if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
                        ifindex = ixa->ixa_nce->nce_common->ncec_ill->
                            ill_phyint->phyint_ifindex;
                }
                return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
                    generationp));
        }
}

/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
        uint_t          hash;
        dcb_t           *dcb;
        dce_t           *dce;

        /* Set *generationp before dropping the lock(s) that allow additions */
        if (generationp != NULL)
                *generationp = ipst->ips_dce_default->dce_generation;

        hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
        dcb = &ipst->ips_dce_hash_v4[hash];
        rw_enter(&dcb->dcb_lock, RW_READER);
        for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                if (dce->dce_v4addr == dst) {
                        mutex_enter(&dce->dce_lock);
                        if (!DCE_IS_CONDEMNED(dce)) {
                                dce_refhold(dce);
                                if (generationp != NULL)
                                        *generationp = dce->dce_generation;
                                mutex_exit(&dce->dce_lock);
                                rw_exit(&dcb->dcb_lock);
                                return (dce);
                        }
                        mutex_exit(&dce->dce_lock);
                }
        }
        rw_exit(&dcb->dcb_lock);
        /* Not found */
        dce = ipst->ips_dce_default;
        dce_refhold(dce);
        return (dce);
}

/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 * The ifindex should only be set for link-local addresses.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
        uint_t          hash;
        dcb_t           *dcb;
        dce_t           *dce;

        /* Set *generationp before dropping the lock(s) that allow additions */
        if (generationp != NULL)
                *generationp = ipst->ips_dce_default->dce_generation;

        hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
        dcb = &ipst->ips_dce_hash_v6[hash];
        rw_enter(&dcb->dcb_lock, RW_READER);
        for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
                    dce->dce_ifindex == ifindex) {
                        mutex_enter(&dce->dce_lock);
                        if (!DCE_IS_CONDEMNED(dce)) {
                                dce_refhold(dce);
                                if (generationp != NULL)
                                        *generationp = dce->dce_generation;
                                mutex_exit(&dce->dce_lock);
                                rw_exit(&dcb->dcb_lock);
                                return (dce);
                        }
                        mutex_exit(&dce->dce_lock);
                }
        }
        rw_exit(&dcb->dcb_lock);
        /* Not found */
        dce = ipst->ips_dce_default;
        dce_refhold(dce);
        return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one.  If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
        uint_t          hash;
        dcb_t           *dcb;
        dce_t           *dce;

        hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
        dcb = &ipst->ips_dce_hash_v4[hash];
        /*
         * Assuming that we get fairly even distribution across all of the
         * buckets, once one bucket is overly full, prune the whole cache.
         */
        if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
                atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
        rw_enter(&dcb->dcb_lock, RW_WRITER);
        for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                if (dce->dce_v4addr == dst) {
                        mutex_enter(&dce->dce_lock);
                        if (!DCE_IS_CONDEMNED(dce)) {
                                dce_refhold(dce);
                                mutex_exit(&dce->dce_lock);
                                rw_exit(&dcb->dcb_lock);
                                return (dce);
                        }
                        mutex_exit(&dce->dce_lock);
                }
        }
        dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
        if (dce == NULL) {
                rw_exit(&dcb->dcb_lock);
                return (NULL);
        }
        bzero(dce, sizeof (dce_t));
        dce->dce_ipst = ipst;        /* No netstack_hold */
        dce->dce_v4addr = dst;
        dce->dce_generation = DCE_GENERATION_INITIAL;
        dce->dce_ipversion = IPV4_VERSION;
        dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
        dce_refhold(dce);       /* For the hash list */

        /* Link into list */
        if (dcb->dcb_dce != NULL)
                dcb->dcb_dce->dce_ptpn = &dce->dce_next;
        dce->dce_next = dcb->dcb_dce;
        dce->dce_ptpn = &dcb->dcb_dce;
        dcb->dcb_dce = dce;
        dce->dce_bucket = dcb;
        atomic_add_32(&dcb->dcb_cnt, 1);
        dce_refhold(dce);       /* For the caller */
        rw_exit(&dcb->dcb_lock);

        /* Initialize dce_ident to be different from the last packet */
        dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

        dce_increment_generation(ipst->ips_dce_default);
        return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one.  If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * The ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
        uint_t          hash;
        dcb_t           *dcb;
        dce_t           *dce;

        /* We should not create entries for link-locals w/o an ifindex */
        ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

        hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
        dcb = &ipst->ips_dce_hash_v6[hash];
        /*
         * Assuming that we get fairly even distribution across all of the
         * buckets, once one bucket is overly full, prune the whole cache.
         */
        if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
                atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
        rw_enter(&dcb->dcb_lock, RW_WRITER);
        for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
                    dce->dce_ifindex == ifindex) {
                        mutex_enter(&dce->dce_lock);
                        if (!DCE_IS_CONDEMNED(dce)) {
                                dce_refhold(dce);
                                mutex_exit(&dce->dce_lock);
                                rw_exit(&dcb->dcb_lock);
                                return (dce);
                        }
                        mutex_exit(&dce->dce_lock);
                }
        }

        dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
        if (dce == NULL) {
                rw_exit(&dcb->dcb_lock);
                return (NULL);
        }
        bzero(dce, sizeof (dce_t));
        dce->dce_ipst = ipst;        /* No netstack_hold */
        dce->dce_v6addr = *dst;
        dce->dce_ifindex = ifindex;
        dce->dce_generation = DCE_GENERATION_INITIAL;
        dce->dce_ipversion = IPV6_VERSION;
        dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
        dce_refhold(dce);       /* For the hash list */

        /* Link into list */
        if (dcb->dcb_dce != NULL)
                dcb->dcb_dce->dce_ptpn = &dce->dce_next;
        dce->dce_next = dcb->dcb_dce;
        dce->dce_ptpn = &dcb->dcb_dce;
        dcb->dcb_dce = dce;
        dce->dce_bucket = dcb;
        atomic_add_32(&dcb->dcb_cnt, 1);
        dce_refhold(dce);       /* For the caller */
        rw_exit(&dcb->dcb_lock);

        /* Initialize dce_ident to be different from the last packet */
        dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
        dce_increment_generation(ipst->ips_dce_default);
        return (dce);
}

/*
 * Set/update uinfo. The dce_update_uinfo*() wrappers below create a
 * per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
        /*
         * Update the round trip time estimate and/or the max frag size
         * and/or the slow start threshold.
         *
         * We serialize multiple advises using dce_lock.
         */
        mutex_enter(&dce->dce_lock);
        /* Guard against setting the estimates to zero */
        if (uinfo->iulp_rtt != 0) {
                /*
                 * If there are no old cached values, initialize them
                 * conservatively to 1.5 times the new value; otherwise
                 * average the old and new values.
                 */
                if (dce->dce_uinfo.iulp_rtt != 0) {
                        dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
                            uinfo->iulp_rtt) >> 1;
                } else {
                        dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
                            (uinfo->iulp_rtt >> 1);
                }
                if (dce->dce_uinfo.iulp_rtt_sd != 0) {
                        dce->dce_uinfo.iulp_rtt_sd =
                            (dce->dce_uinfo.iulp_rtt_sd +
                            uinfo->iulp_rtt_sd) >> 1;
                } else {
                        dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
                            (uinfo->iulp_rtt_sd >> 1);
                }
        }
        if (uinfo->iulp_mtu != 0) {
                if (dce->dce_flags & DCEF_PMTU) {
                        dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
                } else {
                        dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
                        dce->dce_flags |= DCEF_PMTU;
                }
                dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
        }
        if (uinfo->iulp_ssthresh != 0) {
                if (dce->dce_uinfo.iulp_ssthresh != 0)
                        dce->dce_uinfo.iulp_ssthresh =
                            (uinfo->iulp_ssthresh +
                            dce->dce_uinfo.iulp_ssthresh) >> 1;
                else
                        dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
        }
        /* We have uinfo for sure */
        dce->dce_flags |= DCEF_UINFO;
        mutex_exit(&dce->dce_lock);
}

int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
        dce_t *dce;

        dce = dce_lookup_and_add_v4(dst, ipst);
        if (dce == NULL)
                return (ENOMEM);

        dce_setuinfo(dce, uinfo);
        dce_refrele(dce);
        return (0);
}

int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
        dce_t *dce;

        dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
        if (dce == NULL)
                return (ENOMEM);

        dce_setuinfo(dce, uinfo);
        dce_refrele(dce);
        return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
        ipaddr_t dst4;

        if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
                IN6_V4MAPPED_TO_IPADDR(dst, dst4);
                return (dce_update_uinfo_v4(dst4, uinfo, ipst));
        } else {
                return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
        }
}

static void
dce_make_condemned(dce_t *dce)
{
        ip_stack_t      *ipst = dce->dce_ipst;

        mutex_enter(&dce->dce_lock);
        ASSERT(!DCE_IS_CONDEMNED(dce));
        dce->dce_generation = DCE_GENERATION_CONDEMNED;
        mutex_exit(&dce->dce_lock);
        /* Count the number of condemned dces for the reclaim logic */
        atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
        uint_t generation;

        mutex_enter(&dce->dce_lock);
        if (!DCE_IS_CONDEMNED(dce)) {
                generation = dce->dce_generation + 1;
                if (generation == DCE_GENERATION_CONDEMNED)
                        generation = DCE_GENERATION_INITIAL;
                ASSERT(generation != DCE_GENERATION_VERIFY);
                dce->dce_generation = generation;
        }
        mutex_exit(&dce->dce_lock);
}

/*
 * Increment the generation number on all dces in the hash table, and on
 * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
        int             i;
        dcb_t           *dcb;
        dce_t           *dce;

        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                if (isv6)
                        dcb = &ipst->ips_dce_hash_v6[i];
                else
                        dcb = &ipst->ips_dce_hash_v4[i];
                rw_enter(&dcb->dcb_lock, RW_WRITER);
                for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                        if (DCE_IS_CONDEMNED(dce))
                                continue;
                        dce_increment_generation(dce);
                }
                rw_exit(&dcb->dcb_lock);
        }
        dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
        dce->dce_bucket = NULL;
        *dce->dce_ptpn = dce->dce_next;
        if (dce->dce_next != NULL)
                dce->dce_next->dce_ptpn = dce->dce_ptpn;
        dce->dce_ptpn = NULL;
        dce->dce_next = NULL;
        atomic_add_32(&dcb->dcb_cnt, -1);
        dce_make_condemned(dce);
}

static void
dce_inactive(dce_t *dce)
{
        ip_stack_t      *ipst = dce->dce_ipst;

        ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
        ASSERT(dce->dce_ptpn == NULL);
        ASSERT(dce->dce_bucket == NULL);

        /* Maintain the count of condemned dces for the reclaim logic */
        if (DCE_IS_CONDEMNED(dce))
                atomic_add_32(&ipst->ips_num_dce_condemned, -1);

        kmem_cache_free(dce_cache, dce);
}

void
dce_refrele(dce_t *dce)
{
        ASSERT(dce->dce_refcnt != 0);
        if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
                dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
        atomic_add_32(&dce->dce_refcnt, 1);
        ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet, hence these are the same as the functions above */
void
dce_refrele_notr(dce_t *dce)
{
        ASSERT(dce->dce_refcnt != 0);
        if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
                dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
        atomic_add_32(&dce->dce_refcnt, 1);
        ASSERT(dce->dce_refcnt != 0);
}

/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
        struct opthdr           *optp;
        mblk_t                  *mp2ctl;
        dest_cache_entry_t      dest_cache;
        mblk_t                  *mp_tail = NULL;
        dce_t                   *dce;
        dcb_t                   *dcb;
        int                     i;
        uint64_t                current_time;

        current_time = TICK_TO_SEC(ddi_get_lbolt64());

        /*
         * Make a copy of the original message; the original carries the
         * IPv4 reply below and the copy is reused for the IPv6 reply.
         */
        mp2ctl = copymsg(mpctl);

        /* First we do IPv4 entries */
        optp = (struct opthdr *)&mpctl->b_rptr[
            sizeof (struct T_optmgmt_ack)];
        optp->level = MIB2_IP;
        optp->name = EXPER_IP_DCE;

        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                dcb = &ipst->ips_dce_hash_v4[i];
                rw_enter(&dcb->dcb_lock, RW_READER);
                for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                        dest_cache.DestIpv4Address = dce->dce_v4addr;
                        dest_cache.DestFlags = dce->dce_flags;
                        if (dce->dce_flags & DCEF_PMTU)
                                dest_cache.DestPmtu = dce->dce_pmtu;
                        else
                                dest_cache.DestPmtu = 0;
                        dest_cache.DestIdent = dce->dce_ident;
                        dest_cache.DestIfindex = 0;
                        dest_cache.DestAge = current_time -
                            dce->dce_last_change_time;
                        if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
                            (char *)&dest_cache, (int)sizeof (dest_cache))) {
                                ip1dbg(("ip_snmp_get_mib2_ip_dce: "
                                    "failed to allocate %u bytes\n",
                                    (uint_t)sizeof (dest_cache)));
                        }
                }
                rw_exit(&dcb->dcb_lock);
        }
        optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
        ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
            (int)optp->level, (int)optp->name, (int)optp->len));
        qreply(q, mpctl);

        if (mp2ctl == NULL) {
                /* Copymsg failed above */
                return (NULL);
        }

        /* Now for IPv6 */
        mpctl = mp2ctl;
        mp_tail = NULL;
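        /* Take a second copy; it is handed back to the caller on return */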
        mp2ctl = copymsg(mpctl);
        optp = (struct opthdr *)&mpctl->b_rptr[
            sizeof (struct T_optmgmt_ack)];
        optp->level = MIB2_IP6;
        optp->name = EXPER_IP_DCE;

        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                dcb = &ipst->ips_dce_hash_v6[i];
                rw_enter(&dcb->dcb_lock, RW_READER);
                for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                        dest_cache.DestIpv6Address = dce->dce_v6addr;
                        dest_cache.DestFlags = dce->dce_flags;
                        if (dce->dce_flags & DCEF_PMTU)
                                dest_cache.DestPmtu = dce->dce_pmtu;
                        else
                                dest_cache.DestPmtu = 0;
                        dest_cache.DestIdent = dce->dce_ident;
                        if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
                                dest_cache.DestIfindex = dce->dce_ifindex;
                        else
                                dest_cache.DestIfindex = 0;
                        dest_cache.DestAge = current_time -
                            dce->dce_last_change_time;
                        if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
                            (char *)&dest_cache, (int)sizeof (dest_cache))) {
                                ip1dbg(("ip_snmp_get_mib2_ip_dce: "
                                    "failed to allocate %u bytes\n",
                                    (uint_t)sizeof (dest_cache)));
                        }
                }
                rw_exit(&dcb->dcb_lock);
        }
        optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
        ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
            (int)optp->level, (int)optp->name, (int)optp->len));
        qreply(q, mpctl);

        return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale entries that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
        uint_t  i;
        dcb_t   *dcb;
        dce_t   *dce, *nextdce;

        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                dcb = &ipst->ips_dce_hash_v6[i];
                rw_enter(&dcb->dcb_lock, RW_WRITER);

                for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
                        nextdce = dce->dce_next;
                        if (dce->dce_ifindex == ifindex) {
                                dce_delete_locked(dcb, dce);
                                dce_refrele(dce);
                        }
                }
                rw_exit(&dcb->dcb_lock);
        }
}