/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 */

/*
 * This file contains routines that manipulate Internet Routing Entries (IREs).
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>

#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <net/if_dl.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/arp.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/nd.h>
#include <inet/tunables.h>

#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
#include <sys/cpuvar.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

struct kmem_cache *rt_entry_cache;

typedef struct nce_clookup_s {
        ipaddr_t ncecl_addr;
        boolean_t ncecl_found;
} nce_clookup_t;

/*
 * Synchronization notes:
 *
 * The fields of the ire_t struct are protected in the following way:
 *
 * ire_next/ire_ptpn
 *
 *      - the bucket lock of the forwarding table in which the ire is stored.
 *
 * ire_ill, ire_u *except* ire_gateway_addr[v6], ire_mask,
 * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags,
 * ire_bucket
 *
 *      - Set in ire_create_v4/v6 and never changed after that. Thus,
 *        we don't need a lock whenever these fields are accessed.
 *
 *      - ire_bucket and ire_masklen (also set in ire_create) are set in
 *        ire_add before inserting in the bucket and never
 *        change after that. Thus we don't need a lock whenever these
 *        fields are accessed.
 *
 * ire_gateway_addr_v4[v6]
 *
 *      - ire_gateway_addr_v4[v6] is set during ire_create and later modified
 *        by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
 *        it are assumed to be atomic and hence the other parts of the code
 *        do not use any locks. ire_gateway_addr_v6 updates are not atomic
 *        and hence any access to it uses ire_lock to get/set the right value.
 *
 * ire_refcnt, ire_identical_ref
 *
 *      - Updated atomically using atomic_add_32
 *
 * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
 *
 *      - Assumes that 32 bit writes are atomic. No locks. ire_lock is
 *        used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
 *
 * ire_generation
 *      - Under ire_lock
 *
 * ire_nce_cache
 *      - Under ire_lock
 *
 * ire_dep_parent (To next IRE in recursive lookup chain)
 *      - Under ips_ire_dep_lock. Write held when modifying. Read held when
 *        walking. We also hold ire_lock when modifying to allow the data path
 *        to only acquire ire_lock.
 *
 * ire_dep_parent_generation (Generation number from ire_dep_parent)
 *      - Under ips_ire_dep_lock and/or ire_lock. (A read claim on the dep_lock
 *        and ire_lock held when modifying)
 *
 * ire_dep_children (From parent to first child)
 * ire_dep_sib_next (linked list of siblings)
 * ire_dep_sib_ptpn (linked list of siblings)
 *      - Under ips_ire_dep_lock. Write held when modifying. Read held when
 *        walking.
 *
 * As we always hold the bucket locks in all the places where ire_next and
 * ire_ptpn are accessed, it is natural to use those locks to protect them.
 *
 * We have a forwarding table for IPv4 and IPv6. The IPv6 forwarding table
 * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
 * structures. ip_forwarding_table_v6 is allocated dynamically in
 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
 * initializing the same bucket. Once a bucket is initialized, it is never
 * de-allocated. This assumption enables us to access
 * ip_forwarding_table_v6[i] without any locks.
 *
 * The forwarding table for IPv4 is a radix tree whose leaves
 * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
 * for IPv4 is dynamically allocated and freed.
 *
 * Each irb_t - ire bucket structure has a lock to protect
 * the bucket, and the ires residing in the bucket have a back pointer to
 * the bucket structure. It also has a reference count for the number
 * of threads walking the bucket - irb_refcnt, which is bumped up
 * using the irb_refhold function. The flags irb_marks can be
 * set to IRB_MARK_CONDEMNED indicating that there are some ires
 * in this bucket that are IRE_IS_CONDEMNED and that the
 * last thread to leave the bucket should delete the ires. Usually
 * this is done by the irb_refrele function which is used to decrement
 * the reference count on a bucket. See comments above the irb_t structure
 * definition in ip.h for further details.
 *
 * The ire_refhold/ire_refrele functions increment/decrement the reference
 * count, ire_refcnt, atomically on the ire.
 * ire_refcnt is modified only using those functions. Operations on the IRE
 * can be described as follows:
 *
 * CREATE an ire with reference count initialized to 1.
 *
 * ADDITION of an ire holds the bucket lock, checks for duplicates
 * and then adds the ire. ire_add returns the ire after
 * bumping it up once more, i.e., the reference count is 2. This avoids
 * an extra lookup in the functions calling ire_add that want to
 * work with the ire after adding.
 *
 * LOOKUP of an ire bumps up the reference count using the ire_refhold
 * function. It is valid to bump up the reference count of the IRE
 * after the lookup has returned an ire. The following lookup
 * functions return a HELD ire:
 *
 * ire_ftable_lookup[_v6], ire_lookup_multi_ill[_v6]
 *
 * DELETION of an ire holds the bucket lock, removes it from the list
 * and then decrements the reference count for having removed it from the list
 * by using the ire_refrele function. If some other thread has looked up
 * the ire, the reference count would have been bumped up and hence
 * this ire will not be freed once deleted. It will be freed once the
 * reference count drops to zero.
 *
 * Add and Delete acquire the bucket lock as RW_WRITER, while all the
 * lookups acquire the bucket lock as RW_READER.
 *
 * The general rule is to do the ire_refrele in the function
 * that is passing the ire as an argument.
 *
 * In trying to locate ires, the following points are to be noted.
 *
 * IRE_IS_CONDEMNED signifies that the ire has been logically deleted and is
 * to be ignored when walking the ires using ire_next.
 *
 * Zones note:
 *      Walking IREs within a given zone also walks certain ires in other
 *      zones.  This is done intentionally.  IRE walks with a specified
 *      zoneid are used only when doing informational reports, and
 *      zone users want to see things that they can access. See block
 *      comment in ire_walk_ill_match().
 */
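
/*
 * For illustration, a typical lookup/use/release sequence under this
 * scheme (a hypothetical caller; the arguments are only examples) is:
 *
 *      ire = ire_ftable_lookup_v4(dst, 0, 0, 0, NULL, zoneid, NULL,
 *          MATCH_IRE_DSTONLY, 0, ipst, NULL);
 *      if (ire != NULL) {
 *              ... use the HELD ire ...
 *              ire_refrele(ire);       (may be the last reference)
 *      }
 */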

/*
 * The size of the forwarding table.  We will make sure that it is a
 * power of 2 in ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;

struct  kmem_cache      *ire_cache;
struct  kmem_cache      *ncec_cache;
struct  kmem_cache      *nce_cache;

static ire_t    ire_null;

static ire_t    *ire_add_v4(ire_t *ire);
static void     ire_delete_v4(ire_t *ire);
static void     ire_dep_invalidate_children(ire_t *child);
static void     ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
    zoneid_t zoneid, ip_stack_t *);
static void     ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
    pfv_t func, void *arg, uchar_t vers, ill_t *ill);
#ifdef DEBUG
static void     ire_trace_cleanup(const ire_t *);
#endif
static void     ire_dep_incr_generation_locked(ire_t *);

/*
 * Following are the functions to increment/decrement the reference
 * count of the IREs and IRBs (ire bucket).
 *
 * 1) We bump up the reference count of an IRE to make sure that
 *    it does not get deleted and freed while we are using it.
 *    Typically all the lookup functions hold the bucket lock
 *    and look for the IRE. If the IRE is found, the reference count
 *    is bumped up before dropping the lock. Sometimes we *may* want
 *    to bump up the reference count after the *lookup*, i.e., without
 *    holding the bucket lock. So, the ire_refhold function does not assert
 *    on the bucket lock being held. Any thread trying to delete from
 *    the hash bucket can still do so but cannot free the IRE if
 *    ire_refcnt is not 0.
 *
 * 2) We bump up the reference count on the bucket where the IRE resides
 *    (IRB), when we want to prevent the IREs getting deleted from a given
 *    hash bucket. This makes life easier for ire_walk type functions which
 *    want to walk the IRE list, call a function, but need to drop
 *    the bucket lock to prevent recursive rw_enters. While the
 *    lock is dropped, the list could be changed by other threads or
 *    the same thread could end up deleting the ire or the ire pointed to by
 *    ire_next. ire_refholding the ire or ire_next is not sufficient as
 *    a delete will still remove the ire from the bucket while we have
 *    dropped the lock and hence the ire_next would be NULL. Thus, we
 *    need a mechanism to prevent deletions from a given bucket.
 *
 *    To prevent deletions, we bump up the reference count on the
 *    bucket. If the bucket is held, ire_delete just marks both
 *    the ire and irb as CONDEMNED. When the
 *    reference count on the bucket drops to zero, all the CONDEMNED ires
 *    are deleted. We don't have to bump up the reference count on the
 *    bucket if we are walking the bucket and never have to drop the bucket
 *    lock. Note that irb_refhold does not prevent addition of new ires
 *    to the list. It is okay because addition of new ires will not cause
 *    ire_next to point to freed memory. We do irb_refhold only when
 *    all three of the following conditions are true (a minimal usage
 *    sketch follows this comment):
 *
 *    1) The code needs to walk the IRE bucket from start to end.
 *    2) It may have to drop the bucket lock sometimes while doing (1).
 *    3) It does not want any ires to be deleted meanwhile.
 */
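
/*
 * A minimal sketch of that pattern (a hypothetical walker; see
 * ire_walk_ill_tables() below for a real one):
 *
 *      irb_refhold(irb);
 *      for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
 *              if (IRE_IS_CONDEMNED(ire))
 *                      continue;
 *              ... may drop and re-acquire irb->irb_lock here ...
 *      }
 *      irb_refrele(irb);       (deletes any CONDEMNED ires)
 */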

/*
 * Bump up the reference count on the hash bucket - IRB to
 * prevent ires from being deleted in this bucket.
 */
void
irb_refhold(irb_t *irb)
{
        rw_enter(&irb->irb_lock, RW_WRITER);
        irb->irb_refcnt++;
        ASSERT(irb->irb_refcnt != 0);
        rw_exit(&irb->irb_lock);
}

void
irb_refhold_locked(irb_t *irb)
{
        ASSERT(RW_WRITE_HELD(&irb->irb_lock));
        irb->irb_refcnt++;
        ASSERT(irb->irb_refcnt != 0);
}

/*
 * Note: when IRB_MARK_DYNAMIC is not set the irb_t
 * is statically allocated, so that when the irb_refcnt goes to 0,
 * we simply clean up the ire list and continue.
 */
void
irb_refrele(irb_t *irb)
{
        if (irb->irb_marks & IRB_MARK_DYNAMIC) {
                irb_refrele_ftable(irb);
        } else {
                rw_enter(&irb->irb_lock, RW_WRITER);
                ASSERT(irb->irb_refcnt != 0);
                if (--irb->irb_refcnt == 0 &&
                    (irb->irb_marks & IRB_MARK_CONDEMNED)) {
                        ire_t *ire_list;

                        ire_list = ire_unlink(irb);
                        rw_exit(&irb->irb_lock);
                        ASSERT(ire_list != NULL);
                        ire_cleanup(ire_list);
                } else {
                        rw_exit(&irb->irb_lock);
                }
        }
}

/*
 * Bump up the reference count on the IRE. We cannot assert that the
 * bucket lock is being held as it is legal to bump up the reference
 * count after the first lookup has returned the IRE without
 * holding the lock.
 */
void
ire_refhold(ire_t *ire)
{
        atomic_inc_32(&(ire)->ire_refcnt);
        ASSERT((ire)->ire_refcnt != 0);
#ifdef DEBUG
        ire_trace_ref(ire);
#endif
}

void
ire_refhold_notr(ire_t *ire)
{
        atomic_inc_32(&(ire)->ire_refcnt);
        ASSERT((ire)->ire_refcnt != 0);
}

void
ire_refhold_locked(ire_t *ire)
{
#ifdef DEBUG
        ire_trace_ref(ire);
#endif
        ire->ire_refcnt++;
}

/*
 * Release a ref on an IRE.
 *
 * Must not be called while holding any locks. Otherwise if this is
 * the last reference to be released there is a chance of recursive mutex
 * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
 * to restart an ioctl. The one exception is when the caller is sure that
 * this is not the last reference to be released, e.g., if the caller is
 * sure that the ire has not been deleted and won't be deleted.
 *
 * On architectures, e.g. sun4u, where atomic_add_32_nv is just
 * a cas, we need to maintain the right memory barrier semantics
 * as that of mutex_exit, i.e., all the loads and stores should complete
 * before the cas is executed. membar_exit() does that here.
 */
void
ire_refrele(ire_t *ire)
{
#ifdef DEBUG
        ire_untrace_ref(ire);
#endif
        ASSERT((ire)->ire_refcnt != 0);
        membar_exit();
        if (atomic_dec_32_nv(&(ire)->ire_refcnt) == 0)
                ire_inactive(ire);
}

void
ire_refrele_notr(ire_t *ire)
{
        ASSERT((ire)->ire_refcnt != 0);
        membar_exit();
        if (atomic_dec_32_nv(&(ire)->ire_refcnt) == 0)
                ire_inactive(ire);
}

/*
 * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
 * IOCTL[s].  The NO_REPLY form is used by TCP to tell IP that it is
 * having problems reaching a particular destination.
 * This will make IP consider alternate routes (e.g., when there are
 * multiple default routes), and it will also make IP discard any (potentially)
 * stale redirect.
 * Management processes may want to use the version that generates a reply.
 *
 * With the use of NUD-like behavior for IPv4/ARP in addition to IPv6
 * this function shouldn't be necessary for IP to recover from a bad redirect,
 * a bad default router (when there are multiple default routers), or
 * a stale ND/ARP entry. But we retain it in any case.
 * For instance, this is helpful when TCP suspects a failure before NUD does.
 */
int
ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
        uchar_t         *addr_ucp;
        uint_t          ipversion;
        sin_t           *sin;
        sin6_t          *sin6;
        ipaddr_t        v4addr;
        in6_addr_t      v6addr;
        ire_t           *ire;
        ipid_t          *ipid;
        zoneid_t        zoneid;
        ip_stack_t      *ipst;

        ASSERT(q->q_next == NULL);
        zoneid = IPCL_ZONEID(Q_TO_CONN(q));
        ipst = CONNQ_TO_IPST(q);

        /*
         * Check privilege using the ioctl credential; if it is NULL
         * then this is a kernel message and therefore privileged.
         */
        if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
                return (EPERM);

        ipid = (ipid_t *)mp->b_rptr;

        addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
            ipid->ipid_addr_length);
        if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
                return (EINVAL);
        switch (ipid->ipid_addr_length) {
        case sizeof (sin_t):
                /*
                 * got complete (sockaddr) address - increment addr_ucp to point
                 * at the ip_addr field.
                 */
                sin = (sin_t *)addr_ucp;
                addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
                ipversion = IPV4_VERSION;
                break;
        case sizeof (sin6_t):
                /*
                 * got complete (sockaddr) address - increment addr_ucp to point
                 * at the ip_addr field.
                 */
                sin6 = (sin6_t *)addr_ucp;
                addr_ucp = (uchar_t *)&sin6->sin6_addr;
                ipversion = IPV6_VERSION;
                break;
        default:
                return (EINVAL);
        }
        if (ipversion == IPV4_VERSION) {
                /* Extract the destination address. */
                bcopy(addr_ucp, &v4addr, IP_ADDR_LEN);

                ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
                    zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
        } else {
                /* Extract the destination address. */
                bcopy(addr_ucp, &v6addr, IPV6_ADDR_LEN);

                ire = ire_ftable_lookup_v6(&v6addr, NULL, NULL, 0, NULL,
                    zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
        }
        if (ire != NULL) {
                if (ipversion == IPV4_VERSION) {
                        ip_rts_change(RTM_LOSING, ire->ire_addr,
                            ire->ire_gateway_addr, ire->ire_mask,
                            (Q_TO_CONN(q))->conn_laddr_v4, 0, 0, 0,
                            (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
                            ire->ire_ipst);
                }
                (void) ire_no_good(ire);
                ire_refrele(ire);
        }
        return (0);
}

/*
 * Initialize the IPv4-specific part of the ire and call
 * ire_init_common to finish it.
 * Returns zero or errno.
 */
int
ire_init_v4(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *gateway,
    ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags,
    tsol_gc_t *gc, ip_stack_t *ipst)
{
        int error;

        /*
         * Reject IRE security attribute creation/initialization
         * if system is not running in Trusted mode.
         */
        if (gc != NULL && !is_system_labeled())
                return (EINVAL);

        BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);

        if (addr != NULL)
                bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
        if (gateway != NULL)
                bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);

        /* Make sure we don't have stray values in some fields */
        switch (type) {
        case IRE_LOOPBACK:
        case IRE_HOST:
        case IRE_BROADCAST:
        case IRE_LOCAL:
        case IRE_IF_CLONE:
                ire->ire_mask = IP_HOST_MASK;
                ire->ire_masklen = IPV4_ABITS;
                break;
        case IRE_PREFIX:
        case IRE_DEFAULT:
        case IRE_IF_RESOLVER:
        case IRE_IF_NORESOLVER:
                if (mask != NULL) {
                        bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
                        ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
                }
                break;
        case IRE_MULTICAST:
        case IRE_NOROUTE:
                ASSERT(mask == NULL);
                break;
        default:
                ASSERT(0);
                return (EINVAL);
        }

        error = ire_init_common(ire, type, ill, zoneid, flags, IPV4_VERSION,
            gc, ipst);
        if (error != 0)
                return (error);

        /* Determine which function pointers to use */
        ire->ire_postfragfn = ip_xmit;          /* Common case */

        switch (ire->ire_type) {
        case IRE_LOCAL:
                ire->ire_sendfn = ire_send_local_v4;
                ire->ire_recvfn = ire_recv_local_v4;
                ASSERT(ire->ire_ill != NULL);
                if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
                        ire->ire_recvfn = ire_recv_noaccept_v4;
                break;
        case IRE_LOOPBACK:
                ire->ire_sendfn = ire_send_local_v4;
                ire->ire_recvfn = ire_recv_loopback_v4;
                break;
        case IRE_BROADCAST:
                ire->ire_postfragfn = ip_postfrag_loopcheck;
                ire->ire_sendfn = ire_send_broadcast_v4;
                ire->ire_recvfn = ire_recv_broadcast_v4;
                break;
        case IRE_MULTICAST:
                ire->ire_postfragfn = ip_postfrag_loopcheck;
                ire->ire_sendfn = ire_send_multicast_v4;
                ire->ire_recvfn = ire_recv_multicast_v4;
                break;
        default:
                /*
                 * For IRE_IF_ALL and IRE_OFFLINK we forward received
                 * packets by default.
                 */
                ire->ire_sendfn = ire_send_wire_v4;
                ire->ire_recvfn = ire_recv_forward_v4;
                break;
        }
        if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
                ire->ire_sendfn = ire_send_noroute_v4;
                ire->ire_recvfn = ire_recv_noroute_v4;
        } else if (ire->ire_flags & RTF_MULTIRT) {
                ire->ire_postfragfn = ip_postfrag_multirt_v4;
                ire->ire_sendfn = ire_send_multirt_v4;
                /* Multirt receive of broadcast uses ire_recv_broadcast_v4 */
                if (ire->ire_type != IRE_BROADCAST)
                        ire->ire_recvfn = ire_recv_multirt_v4;
        }
        ire->ire_nce_capable = ire_determine_nce_capable(ire);
        return (0);
}

/*
 * Determine ire_nce_capable
 */
boolean_t
ire_determine_nce_capable(ire_t *ire)
{
        int max_masklen;

        if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
            (ire->ire_type & IRE_MULTICAST))
                return (B_TRUE);

        if (ire->ire_ipversion == IPV4_VERSION)
                max_masklen = IPV4_ABITS;
        else
                max_masklen = IPV6_ABITS;

        if ((ire->ire_type & IRE_ONLINK) && ire->ire_masklen == max_masklen)
                return (B_TRUE);
        return (B_FALSE);
}

/*
 * ire_create is called to allocate and initialize a new IRE.
 *
 * NOTE: This is sometimes called as writer, though that is not required
 * by this function.
 */
ire_t *
ire_create(uchar_t *addr, uchar_t *mask, uchar_t *gateway,
    ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc,
    ip_stack_t *ipst)
{
        ire_t   *ire;
        int     error;

        ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
        if (ire == NULL) {
                DTRACE_PROBE(kmem__cache__alloc);
                return (NULL);
        }
        *ire = ire_null;

        error = ire_init_v4(ire, addr, mask, gateway, type, ill, zoneid, flags,
            gc, ipst);
        if (error != 0) {
                DTRACE_PROBE2(ire__init, ire_t *, ire, int, error);
                kmem_cache_free(ire_cache, ire);
                return (NULL);
        }
        return (ire);
}
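
/*
 * For illustration, a typical create/add/release sequence (a hypothetical
 * caller; dst, mask, gw and zoneid are only examples) is:
 *
 *      ire = ire_create((uchar_t *)&dst, (uchar_t *)&mask, (uchar_t *)&gw,
 *          IRE_PREFIX, NULL, zoneid, RTF_GATEWAY, NULL, ipst);
 *      if (ire == NULL)
 *              return (ENOMEM);
 *      nire = ire_add(ire);            (returns a held ire; may differ)
 *      if (nire == NULL)
 *              return (ENOMEM);
 *      ...
 *      ire_refrele(nire);              (per the refcnt rules above)
 */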

/*
 * Common to IPv4 and IPv6
 * Returns zero or errno.
 */
int
ire_init_common(ire_t *ire, ushort_t type, ill_t *ill, zoneid_t zoneid,
    uint_t flags, uchar_t ipversion, tsol_gc_t *gc, ip_stack_t *ipst)
{
        int error;

#ifdef DEBUG
        if (ill != NULL) {
                if (ill->ill_isv6)
                        ASSERT(ipversion == IPV6_VERSION);
                else
                        ASSERT(ipversion == IPV4_VERSION);
        }
#endif /* DEBUG */

        /*
         * Create/initialize IRE security attribute only in Trusted mode;
         * if the passed-in gc is non-NULL, we expect that the caller
         * has held a reference to it and will release it when this routine
         * returns a failure; otherwise we own the reference.  We do this
         * prior to initializing the rest of the IRE fields.
         */
        if (is_system_labeled()) {
                if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
                    IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)) != 0) {
                        /* release references on behalf of caller */
                        if (gc != NULL)
                                GC_REFRELE(gc);
                } else {
                        error = tsol_ire_init_gwattr(ire, ipversion, gc);
                        if (error != 0)
                                return (error);
                }
        }

        ire->ire_type = type;
        ire->ire_flags = RTF_UP | flags;
        ire->ire_create_time = (uint32_t)gethrestime_sec();
        ire->ire_generation = IRE_GENERATION_INITIAL;

        /*
         * The ill_ire_cnt isn't increased until
         * the IRE is added to ensure that a walker will find
         * all IREs that hold a reference on an ill.
         *
         * Note that ill_ire_multicast doesn't hold a ref on the ill since
         * ire_add() is not called for the IRE_MULTICAST.
         */
        ire->ire_ill = ill;
        ire->ire_zoneid = zoneid;
        ire->ire_ipversion = ipversion;

        mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL);
        ire->ire_refcnt = 1;
        ire->ire_identical_ref = 1;  /* Number of ire_delete's needed */
        ire->ire_ipst = ipst;        /* No netstack_hold */
        ire->ire_trace_disable = B_FALSE;

        return (0);
}

/*
 * This creates an IRE_BROADCAST based on the arguments.
 * Its mirror is ire_lookup_bcast().
 *
 * Any suppression of unneeded ones is done in ire_add_v4.
 * We add one IRE_BROADCAST per address. ire_send_broadcast_v4()
 * takes care of generating a loopback copy of the packet.
 */
ire_t **
ire_create_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid, ire_t **irep)
{
        ip_stack_t      *ipst = ill->ill_ipst;

        ASSERT(IAM_WRITER_ILL(ill));

        *irep++ = ire_create(
            (uchar_t *)&addr,                   /* dest addr */
            (uchar_t *)&ip_g_all_ones,          /* mask */
            NULL,                               /* no gateway */
            IRE_BROADCAST,
            ill,
            zoneid,
            RTF_KERNEL,
            NULL,
            ipst);

        return (irep);
}

/*
 * This looks up an IRE_BROADCAST based on the arguments.
 * Mirrors ire_create_bcast().
 */
ire_t *
ire_lookup_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
{
        ire_t           *ire;
        int             match_args;

        match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_GW |
            MATCH_IRE_MASK | MATCH_IRE_ZONEONLY;

        if (IS_UNDER_IPMP(ill))
                match_args |= MATCH_IRE_TESTHIDDEN;

        ire = ire_ftable_lookup_v4(
            addr,                               /* dest addr */
            ip_g_all_ones,                      /* mask */
            0,                                  /* no gateway */
            IRE_BROADCAST,
            ill,
            zoneid,
            NULL,
            match_args,
            0,
            ill->ill_ipst,
            NULL);
        return (ire);
}

/* Arrange to call the specified function for every IRE in the world. */
void
ire_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
        ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst);
}
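
/*
 * A minimal sketch of a walk (a hypothetical callback; as rtfunc() below
 * shows, the walkers invoke "func" as (*func)(ire, arg)):
 *
 *      static void
 *      ire_count_cb(ire_t *ire, void *arg)
 *      {
 *              (*(uint_t *)arg)++;
 *      }
 *
 *      uint_t cnt = 0;
 *      ire_walk((pfv_t)ire_count_cb, &cnt, ipst);
 */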

void
ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
{
        ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst);
}

void
ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
{
        ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst);
}

/*
 * Walk a particular version. version == 0 means both v4 and v6.
 */
static void
ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid,
    ip_stack_t *ipst)
{
        if (vers != IPV6_VERSION) {
                /*
                 * ip_forwarding_table variable doesn't matter for IPv4 since
                 * ire_walk_ill_tables uses ips_ip_ftable for IPv4.
                 */
                ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE,
                    0, NULL,
                    NULL, zoneid, ipst);
        }
        if (vers != IPV4_VERSION) {
                ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE,
                    ipst->ips_ip6_ftable_hash_size,
                    ipst->ips_ip_forwarding_table_v6,
                    NULL, zoneid, ipst);
        }
}

/*
 * Arrange to call the specified function for every IRE that matches the ill.
 */
void
ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
    ill_t *ill)
{
        uchar_t vers = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);

        ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill);
}

/*
 * Walk a particular ill and version.
 */
static void
ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func,
    void *arg, uchar_t vers, ill_t *ill)
{
        ip_stack_t      *ipst = ill->ill_ipst;

        if (vers == IPV4_VERSION) {
                ire_walk_ill_tables(match_flags, ire_type, func, arg,
                    IP_MASK_TABLE_SIZE,
                    0, NULL,
                    ill, ALL_ZONES, ipst);
        }
        if (vers != IPV4_VERSION) {
                ire_walk_ill_tables(match_flags, ire_type, func, arg,
                    IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size,
                    ipst->ips_ip_forwarding_table_v6,
                    ill, ALL_ZONES, ipst);
        }
}

/*
 * Do the specific matching of IREs to shared-IP zones.
 *
 * We have the same logic as in ire_match_args but implemented slightly
 * differently.
 */
boolean_t
ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
    ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst)
{
        ill_t *dst_ill = ire->ire_ill;

        ASSERT(match_flags != 0 || zoneid != ALL_ZONES);

        if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
            ire->ire_zoneid != ALL_ZONES) {
                /*
                 * We're walking the IREs for a specific zone. The only relevant
                 * IREs are:
                 * - all IREs with a matching ire_zoneid
                 * - IRE_IF_ALL IREs for interfaces with a usable source addr
                 *   with a matching zone
                 * - IRE_OFFLINK with a gateway reachable from the zone
                 * Note that earlier we only did the IRE_OFFLINK check for
                 * IRE_DEFAULT (and only when we had multiple IRE_DEFAULTs).
                 */
                if (ire->ire_type & IRE_ONLINK) {
                        uint_t  ifindex;

                        /*
                         * Note there is no IRE_INTERFACE on vniN, thus we
                         * can't do an IRE lookup for a matching route.
                         */
                        ifindex = dst_ill->ill_usesrc_ifindex;
                        if (ifindex == 0)
                                return (B_FALSE);

                        /*
                         * If there is a usable source address in the
                         * zone, then it's ok to return an
                         * IRE_INTERFACE
                         */
                        if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
                            zoneid, ipst)) {
                                return (B_FALSE);
                        }
                }
                if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
                        ipif_t  *tipif;

                        mutex_enter(&dst_ill->ill_lock);
                        for (tipif = dst_ill->ill_ipif;
                            tipif != NULL; tipif = tipif->ipif_next) {
                                if (!IPIF_IS_CONDEMNED(tipif) &&
                                    (tipif->ipif_flags & IPIF_UP) &&
                                    (tipif->ipif_zoneid == zoneid ||
                                    tipif->ipif_zoneid == ALL_ZONES))
                                        break;
                        }
                        mutex_exit(&dst_ill->ill_lock);
                        if (tipif == NULL) {
                                return (B_FALSE);
                        }
                }
        }
        /*
         * Except for ALL_ZONES, we only match the offlink routes
         * where ire_gateway_addr has an IRE_INTERFACE for the zoneid.
         * Since we can have leftover routes after the IP addresses have
         * changed, the global zone will also match offlink routes where the
         * gateway is unreachable from any zone.
         */
        if ((ire->ire_type & IRE_OFFLINK) && zoneid != ALL_ZONES) {
                in6_addr_t gw_addr_v6;
                boolean_t reach;

                if (ire->ire_ipversion == IPV4_VERSION) {
                        reach = ire_gateway_ok_zone_v4(ire->ire_gateway_addr,
                            zoneid, dst_ill, NULL, ipst, B_FALSE);
                } else {
                        ASSERT(ire->ire_ipversion == IPV6_VERSION);
                        mutex_enter(&ire->ire_lock);
                        gw_addr_v6 = ire->ire_gateway_addr_v6;
                        mutex_exit(&ire->ire_lock);

                        reach = ire_gateway_ok_zone_v6(&gw_addr_v6, zoneid,
                            dst_ill, NULL, ipst, B_FALSE);
                }
                if (!reach) {
                        if (zoneid != GLOBAL_ZONEID)
                                return (B_FALSE);

                        /*
                         * Check if ALL_ZONES reachable - if not then let the
                         * global zone see it.
                         */
                        if (ire->ire_ipversion == IPV4_VERSION) {
                                reach = ire_gateway_ok_zone_v4(
                                    ire->ire_gateway_addr, ALL_ZONES,
                                    dst_ill, NULL, ipst, B_FALSE);
                        } else {
                                reach = ire_gateway_ok_zone_v6(&gw_addr_v6,
                                    ALL_ZONES, dst_ill, NULL, ipst, B_FALSE);
                        }
                        if (reach) {
                                /*
                                 * Some other zone could see it, hence hide it
                                 * in the global zone.
                                 */
                                return (B_FALSE);
                        }
                }
        }

        if (((!(match_flags & MATCH_IRE_TYPE)) ||
            (ire->ire_type & ire_type)) &&
            ((!(match_flags & MATCH_IRE_ILL)) ||
            (dst_ill == ill ||
            (dst_ill != NULL && IS_IN_SAME_ILLGRP(dst_ill, ill))))) {
                return (B_TRUE);
        }
        return (B_FALSE);
}

int
rtfunc(struct radix_node *rn, void *arg)
{
        struct rtfuncarg *rtf = arg;
        struct rt_entry *rt;
        irb_t *irb;
        ire_t *ire;
        boolean_t ret;

        rt = (struct rt_entry *)rn;
        ASSERT(rt != NULL);
        irb = &rt->rt_irb;
        for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
                if ((rtf->rt_match_flags != 0) ||
                    (rtf->rt_zoneid != ALL_ZONES)) {
                        ret = ire_walk_ill_match(rtf->rt_match_flags,
                            rtf->rt_ire_type, ire,
                            rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst);
                } else {
                        ret = B_TRUE;
                }
                if (ret)
                        (*rtf->rt_func)(ire, rtf->rt_arg);
        }
        return (0);
}

/*
 * Walk the ftable entries that match the ill.
 */
void
ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
    void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl,
    ill_t *ill, zoneid_t zoneid,
    ip_stack_t *ipst)
{
        irb_t   *irb_ptr;
        irb_t   *irb;
        ire_t   *ire;
        int i, j;
        boolean_t ret;
        struct rtfuncarg rtfarg;

        ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL));
        ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));

        /* The ipftbl check ensures this branch runs only for the v6 case. */
        if (ipftbl == ipst->ips_ip_forwarding_table_v6) {
                for (i = (ftbl_sz - 1); i >= 0; i--) {
                        if ((irb_ptr = ipftbl[i]) == NULL)
                                continue;
                        for (j = 0; j < htbl_sz; j++) {
                                irb = &irb_ptr[j];
                                if (irb->irb_ire == NULL)
                                        continue;

                                irb_refhold(irb);
                                for (ire = irb->irb_ire; ire != NULL;
                                    ire = ire->ire_next) {
                                        if (match_flags == 0 &&
                                            zoneid == ALL_ZONES) {
                                                ret = B_TRUE;
                                        } else {
                                                ret =
                                                    ire_walk_ill_match(
                                                    match_flags,
                                                    ire_type, ire, ill,
                                                    zoneid, ipst);
                                        }
                                        if (ret)
                                                (*func)(ire, arg);
                                }
                                irb_refrele(irb);
                        }
                }
        } else {
                bzero(&rtfarg, sizeof (rtfarg));
                rtfarg.rt_func = func;
                rtfarg.rt_arg = arg;
                if (match_flags != 0) {
                        rtfarg.rt_match_flags = match_flags;
                }
                rtfarg.rt_ire_type = ire_type;
                rtfarg.rt_ill = ill;
                rtfarg.rt_zoneid = zoneid;
                rtfarg.rt_ipst = ipst;  /* No netstack_hold */
                (void) ipst->ips_ip_ftable->rnh_walktree_mt(
                    ipst->ips_ip_ftable,
                    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
        }
}

/*
 * This function takes a mask and returns the
 * number of bits set in the mask. If no
 * bit is set it returns 0.
 * Assumes a contiguous mask.
 */
int
ip_mask_to_plen(ipaddr_t mask)
{
        return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) - 1));
}
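
/*
 * For example, the /24 netmask 255.255.255.0 is 0xffffff00 in host byte
 * order; ffs() reports its lowest set bit at position 9, so the result
 * is 32 - (9 - 1) = 24.
 */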

/*
 * Convert length for a mask to the mask.
 */
ipaddr_t
ip_plen_to_mask(uint_t masklen)
{
        if (masklen == 0)
                return (0);

        return (htonl(IP_HOST_MASK << (IP_ABITS - masklen)));
}
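
/*
 * For example, ip_plen_to_mask(24) shifts IP_HOST_MASK (0xffffffff) left
 * by 32 - 24 = 8 bits, yielding 0xffffff00, and returns it in network
 * byte order, i.e., the netmask 255.255.255.0.
 */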

void
ire_atomic_end(irb_t *irb_ptr, ire_t *ire)
{
        ill_t           *ill;

        ill = ire->ire_ill;
        if (ill != NULL)
                mutex_exit(&ill->ill_lock);
        rw_exit(&irb_ptr->irb_lock);
}

/*
 * ire_add_v[46] atomically make sure that the ill associated
 * with the new ire is not going away, i.e., we check ILL_CONDEMNED.
 */
int
ire_atomic_start(irb_t *irb_ptr, ire_t *ire)
{
        ill_t           *ill;

        ill = ire->ire_ill;

        rw_enter(&irb_ptr->irb_lock, RW_WRITER);
        if (ill != NULL) {
                mutex_enter(&ill->ill_lock);

                /*
                 * Don't allow IREs to be created on dying ills, on ills
                 * for which the last ipif is going down, or on ones which
                 * don't have even a single UP interface.
                 */
                if ((ill->ill_state_flags &
                    (ILL_CONDEMNED|ILL_DOWN_IN_PROGRESS)) != 0) {
                        ire_atomic_end(irb_ptr, ire);
                        DTRACE_PROBE1(ire__add__on__dying__ill, ire_t *, ire);
                        return (ENXIO);
                }

                if (IS_UNDER_IPMP(ill)) {
                        int     error = 0;

                        mutex_enter(&ill->ill_phyint->phyint_lock);
                        if (!ipmp_ill_is_active(ill) &&
                            IRE_HIDDEN_TYPE(ire->ire_type) &&
                            !ire->ire_testhidden) {
                                error = EINVAL;
                        }
                        mutex_exit(&ill->ill_phyint->phyint_lock);
                        if (error != 0) {
                                ire_atomic_end(irb_ptr, ire);
                                return (error);
                        }
                }
        }
        return (0);
}

/*
 * Add a fully initialized IRE to the forwarding table.
 * This returns NULL on failure, or a held IRE on success.
 * Normally the returned IRE is the same as the argument. But a different
 * IRE will be returned if the added IRE is deemed identical to an existing
 * one. In that case ire_identical_ref will be increased.
 * The caller always needs to do an ire_refrele() on the returned IRE.
 */
ire_t *
ire_add(ire_t *ire)
{
        if (IRE_HIDDEN_TYPE(ire->ire_type) &&
            ire->ire_ill != NULL && IS_UNDER_IPMP(ire->ire_ill)) {
                /*
                 * IREs hosted on interfaces that are under IPMP
                 * should be hidden so that applications don't
                 * accidentally end up sending packets with test
                 * addresses as their source addresses, or
                 * sending out interfaces that are e.g. IFF_INACTIVE.
                 * Hide them here.
                 */
                ire->ire_testhidden = B_TRUE;
        }

        if (ire->ire_ipversion == IPV6_VERSION)
                return (ire_add_v6(ire));
        else
                return (ire_add_v4(ire));
}
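
/*
 * For instance (a hypothetical caller), the contract above means the
 * returned IRE, not the argument, is what must be used and released:
 *
 *      nire = ire_add(ire);
 *      if (nire == NULL) {
 *              ... the add failed; "ire" has already been deleted ...
 *      } else {
 *              if (nire != ire)
 *                      ... an identical IRE existed; nire is that one ...
 *              ire_refrele(nire);      (always release the returned ire)
 *      }
 */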

/*
 * Add a fully initialized IPv4 IRE to the forwarding table.
 * This returns NULL on failure, or a held IRE on success.
 * Normally the returned IRE is the same as the argument. But a different
 * IRE will be returned if the added IRE is deemed identical to an existing
 * one. In that case ire_identical_ref will be increased.
 * The caller always needs to do an ire_refrele() on the returned IRE.
 */
static ire_t *
ire_add_v4(ire_t *ire)
{
        ire_t   *ire1;
        irb_t   *irb_ptr;
        ire_t   **irep;
        int     match_flags;
        int     error;
        ip_stack_t      *ipst = ire->ire_ipst;

        if (ire->ire_ill != NULL)
                ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock));
        ASSERT(ire->ire_ipversion == IPV4_VERSION);

        /* Make sure the address is properly masked. */
        ire->ire_addr &= ire->ire_mask;

        match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);

        if (ire->ire_ill != NULL) {
                match_flags |= MATCH_IRE_ILL;
        }
        irb_ptr = ire_get_bucket(ire);
        if (irb_ptr == NULL) {
                printf("no bucket for %p\n", (void *)ire);
                ire_delete(ire);
                return (NULL);
        }

        /*
         * Start the atomic add of the ire. Grab the ill lock,
         * the bucket lock. Check for condemned.
         */
        error = ire_atomic_start(irb_ptr, ire);
        if (error != 0) {
                printf("no ire_atomic_start for %p\n", (void *)ire);
                ire_delete(ire);
                irb_refrele(irb_ptr);
                return (NULL);
        }
        /*
         * If we are creating a hidden IRE, make sure we search for
         * hidden IREs when searching for duplicates below.
         * Otherwise, we might find an IRE on some other interface
         * that's not marked hidden.
         */
        if (ire->ire_testhidden)
                match_flags |= MATCH_IRE_TESTHIDDEN;

        /*
         * Atomically check for duplicate and insert in the table.
         */
        for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
                if (IRE_IS_CONDEMNED(ire1))
                        continue;
                /*
                 * Here we need an exact match on zoneid, i.e.,
                 * ire_match_args doesn't fit.
                 */
                if (ire1->ire_zoneid != ire->ire_zoneid)
                        continue;

                if (ire1->ire_type != ire->ire_type)
                        continue;

                /*
                 * Note: We do not allow multiple routes that differ only
                 * in the gateway security attributes; such routes are
                 * considered duplicates.
                 * To change that we explicitly have to treat them as
                 * different here.
                 */
                if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
                    ire->ire_gateway_addr, ire->ire_type, ire->ire_ill,
                    ire->ire_zoneid, NULL, match_flags)) {
                        /*
                         * Return the old ire after doing a REFHOLD.
                         * As most of the callers continue to use the IRE
                         * after adding, we return a held ire. This will
                         * avoid a lookup in the caller again. If the callers
                         * don't want to use it, they need to do a REFRELE.
                         *
                         * We only allow exactly one IRE_IF_CLONE for any dst,
                         * so, if this is an IF_CLONE, return the ire without
                         * an identical_ref, but with an ire_ref held.
                         */
                        if (ire->ire_type != IRE_IF_CLONE) {
                                atomic_inc_32(&ire1->ire_identical_ref);
                                DTRACE_PROBE2(ire__add__exist, ire_t *, ire1,
                                    ire_t *, ire);
                        }
                        ire_refhold(ire1);
                        ire_atomic_end(irb_ptr, ire);
                        ire_delete(ire);
                        irb_refrele(irb_ptr);
                        return (ire1);
                }
        }

        /*
         * Normally we do head insertion since most things do not care about
         * the order of the IREs in the bucket. Note that ip_cgtp_bcast_add
         * assumes we at least do head insertion so that its IRE_BROADCAST
         * entries arrive ahead of existing IRE_HOST entries for the same
         * address.
         * However, due to shared-IP zones (and restrict_interzone_loopback)
         * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
         * address. For that reason we do tail insertion for IRE_IF_CLONE.
         * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket,
         * we do tail insertion of IRE_BROADCASTs that do not have RTF_MULTIRT
         * set.
         */
        irep = (ire_t **)irb_ptr;
        if ((ire->ire_type & IRE_IF_CLONE) ||
            ((ire->ire_type & IRE_BROADCAST) &&
            !(ire->ire_flags & RTF_MULTIRT))) {
                while ((ire1 = *irep) != NULL)
                        irep = &ire1->ire_next;
        }
        /* Insert at *irep */
        ire1 = *irep;
        if (ire1 != NULL)
                ire1->ire_ptpn = &ire->ire_next;
        ire->ire_next = ire1;
        /* Link the new one in. */
        ire->ire_ptpn = irep;

        /*
         * ire_walk routines de-reference ire_next without holding
         * a lock. Before we point to the new ire, we want to make
         * sure the store that sets the ire_next of the new ire
         * reaches global visibility, so that ire_walk routines
         * don't see a truncated list of ires, i.e., if the ire_next
         * of the new ire gets set after we do "*irep = ire" due
         * to re-ordering, the ire_walk thread will see a NULL
         * once it accesses the ire_next of the new ire.
         * membar_producer() makes sure that the following store
         * happens *after* all of the above stores.
         */
        membar_producer();
        *irep = ire;
        ire->ire_bucket = irb_ptr;
        /*
         * We return a bumped up IRE above. Keep it symmetrical
         * so that the callers will always have to release. This
         * helps the callers of this function because they continue
         * to use the IRE after adding and hence they don't have to
         * lookup again after we return the IRE.
         *
         * NOTE: We don't have to use atomics as this is appearing
         * in the list for the first time and no one else can bump
         * up the reference count on this yet.
         */
        ire_refhold_locked(ire);
        BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);

        irb_ptr->irb_ire_cnt++;
        if (irb_ptr->irb_marks & IRB_MARK_DYNAMIC)
                irb_ptr->irb_nire++;

        if (ire->ire_ill != NULL) {
                ire->ire_ill->ill_ire_cnt++;
                ASSERT(ire->ire_ill->ill_ire_cnt != 0);   /* Wraparound */
        }

        ire_atomic_end(irb_ptr, ire);

        /* Make any caching of the IREs be notified or updated */
        ire_flush_cache_v4(ire, IRE_FLUSH_ADD);

        if (ire->ire_ill != NULL)
                ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock));
        irb_refrele(irb_ptr);
        return (ire);
}

/*
 * irb_refrele is the only caller of this function; it does the final
 * cleanup for the list of condemned ires returned by ire_unlink.
 */
1367 void
1368 ire_cleanup(ire_t *ire)
1369 {
1370         ire_t *ire_next;
1371         ip_stack_t *ipst = ire->ire_ipst;
1372 
1373         ASSERT(ire != NULL);
1374 
1375         while (ire != NULL) {
1376                 ire_next = ire->ire_next;
1377                 if (ire->ire_ipversion == IPV4_VERSION) {
1378                         ire_delete_v4(ire);
1379                         BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
1380                             ire_stats_deleted);
1381                 } else {
1382                         ASSERT(ire->ire_ipversion == IPV6_VERSION);
1383                         ire_delete_v6(ire);
1384                         BUMP_IRE_STATS(ipst->ips_ire_stats_v6,
1385                             ire_stats_deleted);
1386                 }
                /*
                 * Now it's really out of the list. Before doing the
                 * REFRELE, set ire_next to NULL since ire_inactive
                 * asserts that it is NULL.
                 */
1392                 ire->ire_next = NULL;
1393                 ire_refrele_notr(ire);
1394                 ire = ire_next;
1395         }
1396 }
1397 
/*
 * irb_refrele is the only caller of this function. It unlinks all the
 * CONDEMNED ires from this bucket and returns them as a list.
 */
1402 ire_t *
1403 ire_unlink(irb_t *irb)
1404 {
1405         ire_t *ire;
1406         ire_t *ire1;
1407         ire_t **ptpn;
1408         ire_t *ire_list = NULL;
1409 
1410         ASSERT(RW_WRITE_HELD(&irb->irb_lock));
1411         ASSERT(((irb->irb_marks & IRB_MARK_DYNAMIC) && irb->irb_refcnt == 1) ||
1412             (irb->irb_refcnt == 0));
1413         ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
1414         ASSERT(irb->irb_ire != NULL);
1415 
1416         for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
1417                 ire1 = ire->ire_next;
1418                 if (IRE_IS_CONDEMNED(ire)) {
1419                         ptpn = ire->ire_ptpn;
1420                         ire1 = ire->ire_next;
1421                         if (ire1)
1422                                 ire1->ire_ptpn = ptpn;
1423                         *ptpn = ire1;
1424                         ire->ire_ptpn = NULL;
1425                         ire->ire_next = NULL;
1426 
                        /*
                         * We need to call ire_delete_v4 or ire_delete_v6 to
                         * clean up dependents and the redirects pointing at
                         * the default gateway. Those calls require that we
                         * drop the lock (ire_flush_cache and
                         * ire_delete_host_redirects demand it), but we can't
                         * drop it here, as ire_unlink needs to atomically
                         * remove the ires from the list. So, create a
                         * temporary list of CONDEMNED ires for doing the
                         * ire_delete_v4/ire_delete_v6 operations later on.
                         */
1438                         ire->ire_next = ire_list;
1439                         ire_list = ire;
1440                 }
1441         }
1442         irb->irb_marks &= ~IRB_MARK_CONDEMNED;
1443         return (ire_list);
1444 }
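
/*
 * Illustrative sketch (assumed caller behavior, based on the comments
 * and asserts above): irb_refrele, on dropping the last reference to a
 * CONDEMNED bucket, does roughly:
 *
 *	rw_enter(&irb->irb_lock, RW_WRITER);
 *	if (--irb->irb_refcnt == 0 &&
 *	    (irb->irb_marks & IRB_MARK_CONDEMNED)) {
 *		ire_list = ire_unlink(irb);
 *		rw_exit(&irb->irb_lock);
 *		if (ire_list != NULL)
 *			ire_cleanup(ire_list);
 *	} else {
 *		rw_exit(&irb->irb_lock);
 *	}
 */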
1445 
1446 /*
1447  * Clean up the radix node for this ire. Must be called by irb_refrele
 * when there are no ires left in the bucket. Returns B_TRUE if the bucket
1449  * is deleted and freed.
1450  */
1451 boolean_t
1452 irb_inactive(irb_t *irb)
1453 {
1454         struct rt_entry *rt;
1455         struct radix_node *rn;
1456         ip_stack_t *ipst = irb->irb_ipst;
1457 
1458         ASSERT(irb->irb_ipst != NULL);
1459 
1460         rt = IRB2RT(irb);
1461         rn = (struct radix_node *)rt;
1462 
1463         /* first remove it from the radix tree. */
1464         RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
1465         rw_enter(&irb->irb_lock, RW_WRITER);
1466         if (irb->irb_refcnt == 1 && irb->irb_nire == 0) {
1467                 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask,
1468                     ipst->ips_ip_ftable);
1469                 DTRACE_PROBE1(irb__free, rt_t *,  rt);
1470                 ASSERT((void *)rn == (void *)rt);
1471                 Free(rt, rt_entry_cache);
1472                 /* irb_lock is freed */
1473                 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1474                 return (B_TRUE);
1475         }
1476         rw_exit(&irb->irb_lock);
1477         RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1478         return (B_FALSE);
1479 }
1480 
1481 /*
1482  * Delete the specified IRE.
1483  * We assume that if ire_bucket is not set then ire_ill->ill_ire_cnt was
 * not incremented, i.e., that the insertion in the bucket and the increment
 * of that counter are done atomically.
1486  */
1487 void
1488 ire_delete(ire_t *ire)
1489 {
1490         ire_t   *ire1;
1491         ire_t   **ptpn;
1492         irb_t   *irb;
1493         ip_stack_t      *ipst = ire->ire_ipst;
1494 
1495         if ((irb = ire->ire_bucket) == NULL) {
                /*
                 * It was never inserted in the list, so a REFRELE is all
                 * that is needed to free this IRE.
                 */
1500                 ire_make_condemned(ire);
1501                 ire_refrele_notr(ire);
1502                 return;
1503         }
1504 
1505         /*
1506          * Move the use counts from an IRE_IF_CLONE to its parent
1507          * IRE_INTERFACE.
1508          * We need to do this before acquiring irb_lock.
1509          */
1510         if (ire->ire_type & IRE_IF_CLONE) {
1511                 ire_t *parent;
1512 
1513                 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
1514                 if ((parent = ire->ire_dep_parent) != NULL) {
1515                         parent->ire_ob_pkt_count += ire->ire_ob_pkt_count;
1516                         parent->ire_ib_pkt_count += ire->ire_ib_pkt_count;
1517                         ire->ire_ob_pkt_count = 0;
1518                         ire->ire_ib_pkt_count = 0;
1519                 }
1520                 rw_exit(&ipst->ips_ire_dep_lock);
1521         }
1522 
1523         rw_enter(&irb->irb_lock, RW_WRITER);
1524         if (ire->ire_ptpn == NULL) {
1525                 /*
1526                  * Some other thread has removed us from the list.
1527                  * It should have done the REFRELE for us.
1528                  */
1529                 rw_exit(&irb->irb_lock);
1530                 return;
1531         }
1532 
1533         if (!IRE_IS_CONDEMNED(ire)) {
1534                 /* Is this an IRE representing multiple duplicate entries? */
1535                 ASSERT(ire->ire_identical_ref >= 1);
1536                 if (atomic_dec_32_nv(&ire->ire_identical_ref) != 0) {
1537                         /* Removed one of the identical parties */
1538                         rw_exit(&irb->irb_lock);
1539                         return;
1540                 }
1541 
1542                 irb->irb_ire_cnt--;
1543                 ire_make_condemned(ire);
1544         }
1545 
1546         if (irb->irb_refcnt != 0) {
1547                 /*
1548                  * The last thread to leave this bucket will
1549                  * delete this ire.
1550                  */
1551                 irb->irb_marks |= IRB_MARK_CONDEMNED;
1552                 rw_exit(&irb->irb_lock);
1553                 return;
1554         }
1555 
1556         /*
1557          * Normally to delete an ire, we walk the bucket. While we
1558          * walk the bucket, we normally bump up irb_refcnt and hence
1559          * we return from above where we mark CONDEMNED and the ire
1560          * gets deleted from ire_unlink. This case is where somebody
         * knows the ire, e.g., by doing a lookup, and wants to delete the
1562          * IRE. irb_refcnt would be 0 in this case if nobody is walking
1563          * the bucket.
1564          */
1565         ptpn = ire->ire_ptpn;
1566         ire1 = ire->ire_next;
1567         if (ire1 != NULL)
1568                 ire1->ire_ptpn = ptpn;
1569         ASSERT(ptpn != NULL);
1570         *ptpn = ire1;
1571         ire->ire_ptpn = NULL;
1572         ire->ire_next = NULL;
1573         if (ire->ire_ipversion == IPV6_VERSION) {
1574                 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted);
1575         } else {
1576                 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
1577         }
1578         rw_exit(&irb->irb_lock);
1579 
1580         /* Cleanup dependents and related stuff */
1581         if (ire->ire_ipversion == IPV6_VERSION) {
1582                 ire_delete_v6(ire);
1583         } else {
1584                 ire_delete_v4(ire);
1585         }
1586         /*
1587          * We removed it from the list. Decrement the
1588          * reference count.
1589          */
1590         ire_refrele_notr(ire);
1591 }
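
/*
 * Illustrative note (assumed behavior of the matching add path): adding
 * an identical route bumps ire_identical_ref instead of inserting a
 * duplicate entry, so deleting one of N identical routes above only
 * decrements that count; the ire itself is condemned and unlinked when
 * the last identical party is deleted.
 */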
1592 
1593 /*
1594  * Delete the specified IRE.
 * All callers should go through ire_delete().
1596  * Sometimes called as writer though not required by this function.
1597  *
 * NOTE : This function is called only if the ire was added
 * to the list.
1600  */
1601 static void
1602 ire_delete_v4(ire_t *ire)
1603 {
1604         ip_stack_t      *ipst = ire->ire_ipst;
1605 
1606         ASSERT(ire->ire_refcnt >= 1);
1607         ASSERT(ire->ire_ipversion == IPV4_VERSION);
1608 
1609         ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
1610         if (ire->ire_type == IRE_DEFAULT) {
1611                 /*
1612                  * when a default gateway is going away
1613                  * delete all the host redirects pointing at that
1614                  * gateway.
1615                  */
1616                 ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
1617         }
1618 
1619         /*
1620          * If we are deleting an IRE_INTERFACE then we make sure we also
1621          * delete any IRE_IF_CLONE that has been created from it.
1622          * Those are always in ire_dep_children.
1623          */
1624         if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != NULL)
1625                 ire_dep_delete_if_clone(ire);
1626 
1627         /* Remove from parent dependencies and child */
1628         rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
1629         if (ire->ire_dep_parent != NULL)
1630                 ire_dep_remove(ire);
1631 
1632         while (ire->ire_dep_children != NULL)
1633                 ire_dep_remove(ire->ire_dep_children);
1634         rw_exit(&ipst->ips_ire_dep_lock);
1635 }
1636 
/*
 * ire_refrele is the only caller of this function. It frees
 * the ire when the reference count goes to zero.
 */
1641 void
1642 ire_inactive(ire_t *ire)
1643 {
1644         ill_t   *ill;
1645         irb_t   *irb;
1646         ip_stack_t      *ipst = ire->ire_ipst;
1647 
1648         ASSERT(ire->ire_refcnt == 0);
1649         ASSERT(ire->ire_ptpn == NULL);
1650         ASSERT(ire->ire_next == NULL);
1651 
1652         /* Count how many condemned ires for kmem_cache callback */
1653         ASSERT(IRE_IS_CONDEMNED(ire));
1654         atomic_add_32(&ipst->ips_num_ire_condemned, -1);
1655 
1656         if (ire->ire_gw_secattr != NULL) {
1657                 ire_gw_secattr_free(ire->ire_gw_secattr);
1658                 ire->ire_gw_secattr = NULL;
1659         }
1660 
1661         /*
1662          * ire_nce_cache is cleared in ire_delete, and we make sure we don't
1663          * set it once the ire is marked condemned.
1664          */
1665         ASSERT(ire->ire_nce_cache == NULL);
1666 
1667         /*
1668          * Since any parent would have a refhold on us they would already
1669          * have been removed.
1670          */
1671         ASSERT(ire->ire_dep_parent == NULL);
1672         ASSERT(ire->ire_dep_sib_next == NULL);
1673         ASSERT(ire->ire_dep_sib_ptpn == NULL);
1674 
1675         /*
1676          * Since any children would have a refhold on us they should have
1677          * already been removed.
1678          */
1679         ASSERT(ire->ire_dep_children == NULL);
1680 
1681         /*
         * ill_ire_cnt is increased when the IRE is inserted in the
1683          * bucket - not when the IRE is created.
1684          */
1685         irb = ire->ire_bucket;
1686         ill = ire->ire_ill;
1687         if (irb != NULL && ill != NULL) {
1688                 mutex_enter(&ill->ill_lock);
1689                 ASSERT(ill->ill_ire_cnt != 0);
1690                 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
1691                     (char *), "ire", (void *), ire);
1692                 ill->ill_ire_cnt--;
1693                 if (ILL_DOWN_OK(ill)) {
1694                         /* Drops the ill lock */
1695                         ipif_ill_refrele_tail(ill);
1696                 } else {
1697                         mutex_exit(&ill->ill_lock);
1698                 }
1699         }
1700         ire->ire_ill = NULL;
1701 
1702         /* This should be true for both V4 and V6 */
1703         if (irb != NULL && (irb->irb_marks & IRB_MARK_DYNAMIC)) {
1704                 rw_enter(&irb->irb_lock, RW_WRITER);
1705                 irb->irb_nire--;
1706                 /*
1707                  * Instead of examining the conditions for freeing
1708                  * the radix node here, we do it by calling
1709                  * irb_refrele which is a single point in the code
1710                  * that embeds that logic. Bump up the refcnt to
                 * be able to call irb_refrele.
1712                  */
1713                 irb_refhold_locked(irb);
1714                 rw_exit(&irb->irb_lock);
1715                 irb_refrele(irb);
1716         }
1717 
1718 #ifdef DEBUG
1719         ire_trace_cleanup(ire);
1720 #endif
1721         mutex_destroy(&ire->ire_lock);
1722         if (ire->ire_ipversion == IPV6_VERSION) {
1723                 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed);
1724         } else {
1725                 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
1726         }
1727         kmem_cache_free(ire_cache, ire);
1728 }
1729 
1730 /*
1731  * ire_update_generation is the callback function provided by
1732  * ire_get_bucket() to update the generation number of any
1733  * matching shorter route when a new route is added.
1734  *
 * This function always returns failure (B_FALSE) to force the caller
 * (rn_matchaddr_args) to back-track up the tree looking for shorter
 * matches.
1738  */
1739 /* ARGSUSED */
1740 static boolean_t
1741 ire_update_generation(struct radix_node *rn, void *arg)
1742 {
1743         struct rt_entry *rt = (struct rt_entry *)rn;
1744 
        /* We need to handle all the ires in this bucket */
1746         irb_increment_generation(&rt->rt_irb);
1747         return (B_FALSE);
1748 }
1749 
1750 /*
1751  * Take care of all the generation numbers in the bucket.
1752  */
1753 void
1754 irb_increment_generation(irb_t *irb)
1755 {
1756         ire_t *ire;
1757         ip_stack_t *ipst;
1758 
1759         if (irb == NULL || irb->irb_ire_cnt == 0)
1760                 return;
1761 
1762         ipst = irb->irb_ipst;
1763         /*
1764          * we cannot do an irb_refhold/irb_refrele here as the caller
1765          * already has the global RADIX_NODE_HEAD_WLOCK, and the irb_refrele
1766          * may result in an attempt to free the irb_t, which also needs
1767          * the RADIX_NODE_HEAD lock. However, since we want to traverse the
1768          * irb_ire list without fear of having a condemned ire removed from
1769          * the list, we acquire the irb_lock as WRITER. Moreover, since
1770          * the ire_generation increments are done under the ire_dep_lock,
1771          * acquire the locks in the prescribed lock order first.
1772          */
1773         rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
1774         rw_enter(&irb->irb_lock, RW_WRITER);
1775         for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
1776                 if (!IRE_IS_CONDEMNED(ire))
1777                         ire_increment_generation(ire);  /* Ourselves */
                ire_dep_incr_generation_locked(ire);    /* Dependents */
1779         }
1780         rw_exit(&irb->irb_lock);
1781         rw_exit(&ipst->ips_ire_dep_lock);
1782 }
1783 
1784 /*
1785  * When an IRE is added or deleted this routine is called to make sure
1786  * any caching of IRE information is notified or updated.
1787  *
1788  * The flag argument indicates if the flush request is due to addition
1789  * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
1790  * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
1791  */
1792 void
1793 ire_flush_cache_v4(ire_t *ire, int flag)
1794 {
1795         irb_t *irb = ire->ire_bucket;
1796         struct rt_entry *rt = IRB2RT(irb);
1797         ip_stack_t *ipst = ire->ire_ipst;
1798 
1799         /*
         * IRE_IF_CLONE ires don't provide any new information
         * beyond the parent from which they are cloned, so don't
1802          * perturb the generation numbers.
1803          */
1804         if (ire->ire_type & IRE_IF_CLONE)
1805                 return;
1806 
1807         /*
1808          * Ensure that an ire_add during a lookup serializes the updates of the
1809          * generation numbers under the radix head lock so that the lookup gets
1810          * either the old ire and old generation number, or a new ire and new
1811          * generation number.
1812          */
1813         RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
1814 
1815         /*
1816          * If a route was just added, we need to notify everybody that
1817          * has cached an IRE_NOROUTE since there might now be a better
1818          * route for them.
1819          */
1820         if (flag == IRE_FLUSH_ADD) {
1821                 ire_increment_generation(ipst->ips_ire_reject_v4);
1822                 ire_increment_generation(ipst->ips_ire_blackhole_v4);
1823         }
1824 
1825         /* Adding a default can't otherwise provide a better route */
1826         if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
1827                 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1828                 return;
1829         }
1830 
1831         switch (flag) {
1832         case IRE_FLUSH_DELETE:
1833         case IRE_FLUSH_GWCHANGE:
1834                 /*
1835                  * Update ire_generation for all ire_dep_children chains
1836                  * starting with this IRE
1837                  */
1838                 ire_dep_incr_generation(ire);
1839                 break;
1840         case IRE_FLUSH_ADD:
1841                 /*
1842                  * Update the generation numbers of all shorter matching routes.
                 * ire_update_generation takes care of the dependents by
1844                  * using ire_dep_incr_generation.
1845                  */
1846                 (void) ipst->ips_ip_ftable->rnh_matchaddr_args(&rt->rt_dst,
1847                     ipst->ips_ip_ftable, ire_update_generation, NULL);
1848                 break;
1849         }
1850         RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1851 }
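
/*
 * Illustrative sketch (assumed consumer behavior): entities that cache
 * an IRE also record the ire_generation they saw, and revalidate lazily,
 * along the lines of:
 *
 *	if (cached_generation != ire->ire_generation)
 *		(redo the route lookup and update the cache)
 *
 * so bumping the generation numbers above is all that is needed to make
 * every cached copy notice an added, deleted, or changed route.
 */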
1852 
1853 /*
1854  * Matches the arguments passed with the values in the ire.
1855  *
 * Note: for match types that match using the "ill" passed in, the ill
 * must be checked for non-NULL before calling this routine.
1858  */
1859 boolean_t
1860 ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
1861     int type, const ill_t *ill, zoneid_t zoneid,
1862     const ts_label_t *tsl, int match_flags)
1863 {
1864         ill_t *ire_ill = NULL, *dst_ill;
1865         ip_stack_t *ipst = ire->ire_ipst;
1866 
1867         ASSERT(ire->ire_ipversion == IPV4_VERSION);
1868         ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
1869         ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
1870             (ill != NULL && !ill->ill_isv6));
1871 
1872         /*
1873          * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it is
1874          * in fact hidden, to ensure the caller gets the right one.
1875          */
1876         if (ire->ire_testhidden) {
1877                 if (!(match_flags & MATCH_IRE_TESTHIDDEN))
1878                         return (B_FALSE);
1879         }
1880 
1881         if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
1882             ire->ire_zoneid != ALL_ZONES) {
1883                 /*
1884                  * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
1885                  * does not match that of ire_zoneid, a failure to
1886                  * match is reported at this point. Otherwise, since some IREs
1887                  * that are available in the global zone can be used in local
1888                  * zones, additional checks need to be performed:
1889                  *
1890                  * IRE_LOOPBACK
1891                  *      entries should never be matched in this situation.
1892                  *      Each zone has its own IRE_LOOPBACK.
1893                  *
1894                  * IRE_LOCAL
1895                  *      We allow them for any zoneid. ire_route_recursive
1896                  *      does additional checks when
1897                  *      ip_restrict_interzone_loopback is set.
1898                  *
1899                  * If ill_usesrc_ifindex is set
1900                  *      Then we check if the zone has a valid source address
1901                  *      on the usesrc ill.
1902                  *
1903                  * If ire_ill is set, then check that the zone has an ipif
1904                  *      on that ill.
1905                  *
1906                  * Outside of this function (in ire_round_robin) we check
                 * that any IRE_OFFLINK has a gateway that is reachable from
1908                  * zone when we have multiple choices (ECMP).
1909                  */
1910                 if (match_flags & MATCH_IRE_ZONEONLY)
1911                         return (B_FALSE);
1912                 if (ire->ire_type & IRE_LOOPBACK)
1913                         return (B_FALSE);
1914 
1915                 if (ire->ire_type & IRE_LOCAL)
1916                         goto matchit;
1917 
1918                 /*
1919                  * The normal case of IRE_ONLINK has a matching zoneid.
1920                  * Here we handle the case when shared-IP zones have been
1921                  * configured with IP addresses on vniN. In that case it
1922                  * is ok for traffic from a zone to use IRE_ONLINK routes
                 * if the ill has a usesrc pointing at vniN.
1924                  */
1925                 dst_ill = ire->ire_ill;
1926                 if (ire->ire_type & IRE_ONLINK) {
1927                         uint_t  ifindex;
1928 
1929                         /*
                         * Note there is no IRE_INTERFACE on vniN, thus we
                         * can't do an IRE lookup for a matching route.
1932                          */
1933                         ifindex = dst_ill->ill_usesrc_ifindex;
1934                         if (ifindex == 0)
1935                                 return (B_FALSE);
1936 
1937                         /*
1938                          * If there is a usable source address in the
1939                          * zone, then it's ok to return this IRE_INTERFACE
1940                          */
1941                         if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
1942                             zoneid, ipst)) {
                                ip3dbg(("ire_match_args: no usesrc for zone"
1944                                     " dst_ill %p\n", (void *)dst_ill));
1945                                 return (B_FALSE);
1946                         }
1947                 }
1948                 /*
1949                  * For example, with
1950                  * route add 11.0.0.0 gw1 -ifp bge0
1951                  * route add 11.0.0.0 gw2 -ifp bge1
1952                  * this code would differentiate based on
1953                  * where the sending zone has addresses.
1954                  * Only if the zone has an address on bge0 can it use the first
1955                  * route. It isn't clear if this behavior is documented
1956                  * anywhere.
1957                  */
1958                 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
1959                         ipif_t  *tipif;
1960 
1961                         mutex_enter(&dst_ill->ill_lock);
1962                         for (tipif = dst_ill->ill_ipif;
1963                             tipif != NULL; tipif = tipif->ipif_next) {
1964                                 if (!IPIF_IS_CONDEMNED(tipif) &&
1965                                     (tipif->ipif_flags & IPIF_UP) &&
1966                                     (tipif->ipif_zoneid == zoneid ||
1967                                     tipif->ipif_zoneid == ALL_ZONES))
1968                                         break;
1969                         }
1970                         mutex_exit(&dst_ill->ill_lock);
1971                         if (tipif == NULL) {
1972                                 return (B_FALSE);
1973                         }
1974                 }
1975         }
1976 
1977 matchit:
1978         ire_ill = ire->ire_ill;
1979         if (match_flags & MATCH_IRE_ILL) {
1980 
                /*
                 * If asked to match an ill, we *must* match on the
                 * ire_ill for ipmp test addresses, or on any ill in the
                 * group for data addresses. If we don't, we may as well
                 * fail. However, we need an exception for IRE_LOCALs to
                 * ensure we loop back packets even when they are sent to
                 * test addresses on different interfaces in the group.
                 */
1990                 if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
1991                     !(ire->ire_type & IRE_LOCAL)) {
1992                         if (ire->ire_ill != ill)
1993                                 return (B_FALSE);
1994                 } else  {
1995                         match_flags &= ~MATCH_IRE_TESTHIDDEN;
1996                         /*
1997                          * We know that ill is not NULL, but ire_ill could be
                         * NULL.
1999                          */
2000                         if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
2001                                 return (B_FALSE);
2002                 }
2003         }
2004         if (match_flags & MATCH_IRE_SRC_ILL) {
2005                 if (ire_ill == NULL)
2006                         return (B_FALSE);
2007                 if (!IS_ON_SAME_LAN(ill, ire_ill)) {
2008                         if (ire_ill->ill_usesrc_ifindex == 0 ||
2009                             (ire_ill->ill_usesrc_ifindex !=
2010                             ill->ill_phyint->phyint_ifindex))
2011                                 return (B_FALSE);
2012                 }
2013         }
2014 
2015         if ((ire->ire_addr == (addr & mask)) &&
2016             ((!(match_flags & MATCH_IRE_GW)) ||
2017             (ire->ire_gateway_addr == gateway)) &&
2018             ((!(match_flags & MATCH_IRE_DIRECT)) ||
2019             !(ire->ire_flags & RTF_INDIRECT)) &&
2020             ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
2021             ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
2022             ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) &&
2023             ((!(match_flags & MATCH_IRE_SECATTR)) ||
2024             (!is_system_labeled()) ||
2025             (tsol_ire_match_gwattr(ire, tsl) == 0))) {
2026                 /* We found the matched IRE */
2027                 return (B_TRUE);
2028         }
2029         return (B_FALSE);
2030 }
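
/*
 * For illustration (hypothetical values): matching a 10.1.2.0/24
 * interface route for destination 10.1.2.3 could look like
 *
 *	ire_match_args(ire, htonl(0x0a010203), htonl(0xffffff00), 0,
 *	    IRE_INTERFACE, NULL, zoneid, NULL,
 *	    MATCH_IRE_TYPE | MATCH_IRE_MASK);
 *
 * which succeeds only if ire_addr == (addr & mask), the ire_type
 * intersects IRE_INTERFACE, and ire_mask equals the /24 mask.
 */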
2031 
2032 /*
2033  * Check if the IRE_LOCAL uses the same ill as another route would use.
2034  * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
2035  * then we don't allow this IRE_LOCAL to be used.
 * We always return an IRE; it will be RTF_REJECT if no route is available.
2037  */
2038 ire_t *
2039 ire_alt_local(ire_t *ire, zoneid_t zoneid, const ts_label_t *tsl,
2040     const ill_t *ill, uint_t *generationp)
2041 {
2042         ip_stack_t      *ipst = ire->ire_ipst;
2043         ire_t           *alt_ire;
2044         uint_t          ire_type;
2045         uint_t          generation;
2046         uint_t          match_flags;
2047 
2048         ASSERT(ire->ire_type & IRE_LOCAL);
2049         ASSERT(ire->ire_ill != NULL);
2050 
2051         /*
2052          * Need to match on everything but local.
         * This might result in the creation of an IRE_IF_CLONE for the
         * same address as the IRE_LOCAL when restrict_interzone_loopback is
         * set. ire_add_*() ensures that IRE_IF_CLONEs are tail inserted
2056          * to make sure the IRE_LOCAL is always found first.
2057          */
2058         ire_type = (IRE_ONLINK | IRE_OFFLINK) & ~(IRE_LOCAL|IRE_LOOPBACK);
2059         match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
2060         if (ill != NULL)
2061                 match_flags |= MATCH_IRE_ILL;
2062 
2063         if (ire->ire_ipversion == IPV4_VERSION) {
2064                 alt_ire = ire_route_recursive_v4(ire->ire_addr, ire_type,
2065                     ill, zoneid, tsl, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
2066                     NULL, &generation);
2067         } else {
2068                 alt_ire = ire_route_recursive_v6(&ire->ire_addr_v6, ire_type,
2069                     ill, zoneid, tsl, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
2070                     NULL, &generation);
2071         }
2072         ASSERT(alt_ire != NULL);
2073 
2074         if (alt_ire->ire_ill == ire->ire_ill) {
2075                 /* Going out the same ILL - ok to send to IRE_LOCAL */
2076                 ire_refrele(alt_ire);
2077         } else {
2078                 /* Different ill - ignore IRE_LOCAL */
2079                 ire_refrele(ire);
2080                 ire = alt_ire;
2081                 if (generationp != NULL)
2082                         *generationp = generation;
2083         }
2084         return (ire);
2085 }
2086 
2087 boolean_t
2088 ire_find_zoneid(struct radix_node *rn, void *arg)
2089 {
2090         struct rt_entry *rt = (struct rt_entry *)rn;
2091         irb_t *irb;
2092         ire_t *ire;
2093         ire_ftable_args_t *margs = arg;
2094 
2095         ASSERT(rt != NULL);
2096 
2097         irb = &rt->rt_irb;
2098 
2099         if (irb->irb_ire_cnt == 0)
2100                 return (B_FALSE);
2101 
2102         rw_enter(&irb->irb_lock, RW_READER);
2103         for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2104                 if (IRE_IS_CONDEMNED(ire))
2105                         continue;
2106 
2107                 if (!(ire->ire_type & IRE_INTERFACE))
2108                         continue;
2109 
2110                 if (ire->ire_zoneid != ALL_ZONES &&
2111                     ire->ire_zoneid != margs->ift_zoneid)
2112                         continue;
2113 
2114                 if (margs->ift_ill != NULL && margs->ift_ill != ire->ire_ill)
2115                         continue;
2116 
2117                 if (is_system_labeled() &&
2118                     tsol_ire_match_gwattr(ire, margs->ift_tsl) != 0)
2119                         continue;
2120 
2121                 rw_exit(&irb->irb_lock);
2122                 return (B_TRUE);
2123         }
2124         rw_exit(&irb->irb_lock);
2125         return (B_FALSE);
2126 }
2127 
2128 /*
2129  * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
2130  * gateway address. If ill is non-NULL we also match on it.
2131  * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
2132  */
2133 boolean_t
2134 ire_gateway_ok_zone_v4(ipaddr_t gateway, zoneid_t zoneid, ill_t *ill,
2135     const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
2136 {
2137         struct rt_sockaddr rdst;
2138         struct rt_entry *rt;
2139         ire_ftable_args_t margs;
2140 
2141         ASSERT(ill == NULL || !ill->ill_isv6);
2142         if (lock_held)
2143                 ASSERT(RW_READ_HELD(&ipst->ips_ip_ftable->rnh_lock));
2144         else
2145                 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
2146 
2147         bzero(&rdst, sizeof (rdst));
2148         rdst.rt_sin_len = sizeof (rdst);
2149         rdst.rt_sin_family = AF_INET;
2150         rdst.rt_sin_addr.s_addr = gateway;
2151 
2152         /*
2153          * We only use margs for ill, zoneid, and tsl matching in
2154          * ire_find_zoneid
2155          */
2156         bzero(&margs, sizeof (margs));
2157         margs.ift_ill = ill;
2158         margs.ift_zoneid = zoneid;
2159         margs.ift_tsl = tsl;
2160         rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
2161             ipst->ips_ip_ftable, ire_find_zoneid, (void *)&margs);
2162 
2163         if (!lock_held)
2164                 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
2165 
2166         return (rt != NULL);
2167 }
2168 
2169 /*
 * ire_walk routine to delete a fraction of redirect IREs and IRE_IF_CLONE
 * IREs. The fraction argument tells us what fraction of the IREs to delete.
 * Common for IPv4 and IPv6.
 * Used under memory backpressure.
2174  */
2175 static void
2176 ire_delete_reclaim(ire_t *ire, char *arg)
2177 {
2178         ip_stack_t      *ipst = ire->ire_ipst;
2179         uint_t          fraction = *(uint_t *)arg;
2180         uint_t          rand;
2181 
2182         if ((ire->ire_flags & RTF_DYNAMIC) ||
2183             (ire->ire_type & IRE_IF_CLONE)) {
2184 
2185                 /* Pick a random number */
2186                 rand = (uint_t)ddi_get_lbolt() +
2187                     IRE_ADDR_HASH_V6(ire->ire_addr_v6, 256);
2188 
                /*
                 * Use truncation: rand is an exact multiple of fraction
                 * for roughly one in every 'fraction' ires.
                 */
                if ((rand / fraction) * fraction == rand) {
2191                         IP_STAT(ipst, ip_ire_reclaim_deleted);
2192                         ire_delete(ire);
2193                 }
2194         }
}
2197 
2198 /*
2199  * kmem_cache callback to free up memory.
2200  *
 * Free a fraction (one in ips_ip_ire_reclaim_fraction) of the entries IP
 * added dynamically (RTF_DYNAMIC and IRE_IF_CLONE).
2203  */
2204 static void
2205 ip_ire_reclaim_stack(ip_stack_t *ipst)
2206 {
2207         uint_t  fraction = ipst->ips_ip_ire_reclaim_fraction;
2208 
2209         IP_STAT(ipst, ip_ire_reclaim_calls);
2210 
2211         ire_walk(ire_delete_reclaim, &fraction, ipst);
2212 
2213         /*
2214          * Walk all CONNs that can have a reference on an ire, nce or dce.
         * Get them to update any stale references, dropping any refholds
         * they have.
2217          */
2218         ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
2219 }
2220 
2221 /*
2222  * Called by the memory allocator subsystem directly, when the system
2223  * is running low on memory.
2224  */
2225 /* ARGSUSED */
2226 void
2227 ip_ire_reclaim(void *args)
2228 {
2229         netstack_handle_t nh;
2230         netstack_t *ns;
2231         ip_stack_t *ipst;
2232 
2233         netstack_next_init(&nh);
2234         while ((ns = netstack_next(&nh)) != NULL) {
2235                 /*
2236                  * netstack_next() can return a netstack_t with a NULL
2237                  * netstack_ip at boot time.
2238                  */
2239                 if ((ipst = ns->netstack_ip) == NULL) {
2240                         netstack_rele(ns);
2241                         continue;
2242                 }
2243                 ip_ire_reclaim_stack(ipst);
2244                 netstack_rele(ns);
2245         }
2246         netstack_next_fini(&nh);
2247 }
2248 
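/*
 * Round *value up to the next power of two, e.g., 33 becomes 64 while 32
 * is left unchanged. Used below to size the IPv6 forwarding table hash.
 */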
2249 static void
2250 power2_roundup(uint32_t *value)
2251 {
2252         int i;
2253 
2254         for (i = 1; i < 31; i++) {
2255                 if (*value <= (1 << i))
2256                         break;
2257         }
2258         *value = (1 << i);
2259 }
2260 
2261 /* Global init for all zones */
2262 void
2263 ip_ire_g_init()
2264 {
2265         /*
2266          * Create kmem_caches.  ip_ire_reclaim() and ip_nce_reclaim()
         * will give disposable IREs back to the system when needed.
2268          * This needs to be done here before anything else, since
2269          * ire_add() expects the cache to be created.
2270          */
2271         ire_cache = kmem_cache_create("ire_cache",
2272             sizeof (ire_t), 0, NULL, NULL,
2273             ip_ire_reclaim, NULL, NULL, 0);
2274 
2275         ncec_cache = kmem_cache_create("ncec_cache",
2276             sizeof (ncec_t), 0, NULL, NULL,
2277             ip_nce_reclaim, NULL, NULL, 0);
2278         nce_cache = kmem_cache_create("nce_cache",
2279             sizeof (nce_t), 0, NULL, NULL,
2280             NULL, NULL, NULL, 0);
2281 
2282         rt_entry_cache = kmem_cache_create("rt_entry",
2283             sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0);
2284 
2285         /*
         * Have the radix code set up its kmem caches etc.
2287          */
2288         rn_init();
2289 }
2290 
2291 void
2292 ip_ire_init(ip_stack_t *ipst)
2293 {
2294         ire_t   *ire;
2295         int     error;
2296 
2297         mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0);
2298 
2299         (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32);
2300 
2301         /*
2302          * Make sure that the forwarding table size is a power of 2.
         * The IRE*_ADDR_HASH() macros depend on that.
2304          */
2305         ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size;
2306         power2_roundup(&ipst->ips_ip6_ftable_hash_size);
2307 
2308         /*
2309          * Allocate/initialize a pair of IRE_NOROUTEs for each of IPv4 and IPv6.
2310          * The ire_reject_v* has RTF_REJECT set, and the ire_blackhole_v* has
2311          * RTF_BLACKHOLE set. We use the latter for transient errors such
2312          * as memory allocation failures and tripping on IRE_IS_CONDEMNED
2313          * entries.
2314          */
2315         ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
2316         *ire = ire_null;
2317         error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
2318             RTF_REJECT|RTF_UP, NULL, ipst);
2319         ASSERT(error == 0);
2320         ipst->ips_ire_reject_v4 = ire;
2321 
2322         ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
2323         *ire = ire_null;
2324         error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
2325             RTF_REJECT|RTF_UP, NULL, ipst);
2326         ASSERT(error == 0);
2327         ipst->ips_ire_reject_v6 = ire;
2328 
2329         ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
2330         *ire = ire_null;
2331         error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
2332             RTF_BLACKHOLE|RTF_UP, NULL, ipst);
2333         ASSERT(error == 0);
2334         ipst->ips_ire_blackhole_v4 = ire;
2335 
2336         ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
2337         *ire = ire_null;
2338         error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
2339             RTF_BLACKHOLE|RTF_UP, NULL, ipst);
2340         ASSERT(error == 0);
2341         ipst->ips_ire_blackhole_v6 = ire;
2342 
2343         rw_init(&ipst->ips_ip6_ire_head_lock, NULL, RW_DEFAULT, NULL);
2344         rw_init(&ipst->ips_ire_dep_lock, NULL, RW_DEFAULT, NULL);
2345 }
2346 
2347 void
2348 ip_ire_g_fini(void)
2349 {
2350         kmem_cache_destroy(ire_cache);
2351         kmem_cache_destroy(ncec_cache);
2352         kmem_cache_destroy(nce_cache);
2353         kmem_cache_destroy(rt_entry_cache);
2354 
2355         rn_fini();
2356 }
2357 
2358 void
2359 ip_ire_fini(ip_stack_t *ipst)
2360 {
2361         int i;
2362 
2363         ire_make_condemned(ipst->ips_ire_reject_v6);
2364         ire_refrele_notr(ipst->ips_ire_reject_v6);
2365         ipst->ips_ire_reject_v6 = NULL;
2366 
2367         ire_make_condemned(ipst->ips_ire_reject_v4);
2368         ire_refrele_notr(ipst->ips_ire_reject_v4);
2369         ipst->ips_ire_reject_v4 = NULL;
2370 
2371         ire_make_condemned(ipst->ips_ire_blackhole_v6);
2372         ire_refrele_notr(ipst->ips_ire_blackhole_v6);
2373         ipst->ips_ire_blackhole_v6 = NULL;
2374 
2375         ire_make_condemned(ipst->ips_ire_blackhole_v4);
2376         ire_refrele_notr(ipst->ips_ire_blackhole_v4);
2377         ipst->ips_ire_blackhole_v4 = NULL;
2378 
2379         /*
         * Delete all IREs - this assumes that the ill/ipifs have
         * been removed, so all that remains is the ftable to handle.
2382          */
2383         ire_walk(ire_delete, NULL, ipst);
2384 
2385         rn_freehead(ipst->ips_ip_ftable);
2386         ipst->ips_ip_ftable = NULL;
2387 
2388         rw_destroy(&ipst->ips_ire_dep_lock);
2389         rw_destroy(&ipst->ips_ip6_ire_head_lock);
2390 
2391         mutex_destroy(&ipst->ips_ire_ft_init_lock);
2392 
2393         for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) {
2394                 irb_t *ptr;
2395                 int j;
2396 
2397                 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL)
2398                         continue;
2399 
2400                 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
2401                         ASSERT(ptr[j].irb_ire == NULL);
2402                         rw_destroy(&ptr[j].irb_lock);
2403                 }
2404                 mi_free(ptr);
2405                 ipst->ips_ip_forwarding_table_v6[i] = NULL;
2406         }
2407 }
2408 
2409 #ifdef DEBUG
2410 void
2411 ire_trace_ref(ire_t *ire)
2412 {
2413         mutex_enter(&ire->ire_lock);
2414         if (ire->ire_trace_disable) {
2415                 mutex_exit(&ire->ire_lock);
2416                 return;
2417         }
2418 
2419         if (th_trace_ref(ire, ire->ire_ipst)) {
2420                 mutex_exit(&ire->ire_lock);
2421         } else {
2422                 ire->ire_trace_disable = B_TRUE;
2423                 mutex_exit(&ire->ire_lock);
2424                 ire_trace_cleanup(ire);
2425         }
2426 }
2427 
2428 void
2429 ire_untrace_ref(ire_t *ire)
2430 {
2431         mutex_enter(&ire->ire_lock);
2432         if (!ire->ire_trace_disable)
2433                 th_trace_unref(ire);
2434         mutex_exit(&ire->ire_lock);
2435 }
2436 
2437 static void
2438 ire_trace_cleanup(const ire_t *ire)
2439 {
2440         th_trace_cleanup(ire, ire->ire_trace_disable);
2441 }
2442 #endif /* DEBUG */
2443 
2444 /*
2445  * Find, or create if needed, the nce_t pointer to the neighbor cache
2446  * entry ncec_t for an IPv4 address. The nce_t will be created on the ill_t
2447  * in the non-IPMP case, or on the cast-ill in the IPMP bcast/mcast case, or
2448  * on the next available under-ill (selected by the IPMP rotor) in the
2449  * unicast IPMP case.
2450  *
2451  * If a neighbor-cache entry has to be created (i.e., one does not already
2452  * exist in the nce list) the ncec_lladdr and ncec_state of the neighbor cache
2453  * entry are initialized in nce_add_v4(). The broadcast, multicast, and
2454  * link-layer type determine the contents of {ncec_state, ncec_lladdr} of
2455  * the ncec_t created. The ncec_lladdr is non-null for all link types with
2456  * non-zero ill_phys_addr_length, though the contents may be zero in cases
2457  * where the link-layer type is not known at the time of creation
 * (e.g., IRE_IF_RESOLVER links).
2459  *
 * All IRE_BROADCAST entries have ncec_state = ND_REACHABLE, and the
 * ncec_lladdr has the physical broadcast address of the outgoing interface.
 * For unicast ire entries,
 *   - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
 *     ncec_t will have zero ncec_lladdr contents and will be in the
 *     ND_INITIAL state.
 *   - if the outgoing interface is an IRE_IF_NORESOLVER interface, no link
 *     layer resolution is necessary, so the ncec_t will be in the
 *     ND_REACHABLE state.
2468  *
2469  * The link layer information needed for broadcast addresses, and for
2470  * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that
2471  * never needs re-verification for the lifetime of the ncec_t. These are
2472  * therefore marked NCE_F_NONUD.
2473  *
2474  * The nce returned will be created such that the nce_ill == ill that
2475  * is passed in. Note that the nce itself may not have ncec_ill == ill
2476  * where IPMP links are involved.
2477  */
2478 static nce_t *
2479 ire_nce_init(ill_t *ill, const void *addr, int ire_type)
2480 {
2481         int             err;
2482         nce_t           *nce = NULL;
2483         uint16_t        ncec_flags;
2484         uchar_t         *hwaddr;
2485         boolean_t       need_refrele = B_FALSE;
2486         ill_t           *in_ill = ill;
2487         boolean_t       is_unicast;
2488         uint_t          hwaddr_len;
2489 
2490         is_unicast = ((ire_type & (IRE_MULTICAST|IRE_BROADCAST)) == 0);
2491         if (IS_IPMP(ill) ||
2492             ((ire_type & IRE_BROADCAST) && IS_UNDER_IPMP(ill))) {
2493                 if ((ill = ipmp_ill_hold_xmit_ill(ill, is_unicast)) == NULL)
2494                         return (NULL);
2495                 need_refrele = B_TRUE;
2496         }
2497         ncec_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
2498 
2499         switch (ire_type) {
2500         case IRE_BROADCAST:
2501                 ASSERT(!ill->ill_isv6);
2502                 ncec_flags |= (NCE_F_BCAST|NCE_F_NONUD);
2503                 break;
2504         case IRE_MULTICAST:
2505                 ncec_flags |= (NCE_F_MCAST|NCE_F_NONUD);
2506                 break;
2507         }
2508 
2509         if (ill->ill_net_type == IRE_IF_NORESOLVER && is_unicast) {
2510                 hwaddr = ill->ill_dest_addr;
2511         } else {
2512                 hwaddr = NULL;
2513         }
2514         hwaddr_len = ill->ill_phys_addr_length;
2515 
2516 retry:
2517         /* nce_state will be computed by nce_add_common() */
2518         if (!ill->ill_isv6) {
2519                 err = nce_lookup_then_add_v4(ill, hwaddr, hwaddr_len, addr,
2520                     ncec_flags, ND_UNCHANGED, &nce);
2521         } else {
2522                 err = nce_lookup_then_add_v6(ill, hwaddr, hwaddr_len, addr,
2523                     ncec_flags, ND_UNCHANGED, &nce);
2524         }
2525 
2526         switch (err) {
2527         case 0:
2528                 break;
2529         case EEXIST:
2530                 /*
                 * When subnets change or partially overlap, what was once
                 * a broadcast address could now be a unicast address, or
                 * vice versa.
2533                  */
2534                 if (((ncec_flags ^ nce->nce_common->ncec_flags) &
2535                     NCE_F_BCAST) != 0) {
2536                         ASSERT(!ill->ill_isv6);
2537                         ncec_delete(nce->nce_common);
2538                         nce_refrele(nce);
2539                         goto retry;
2540                 }
2541                 break;
2542         default:
2543                 DTRACE_PROBE2(nce__init__fail, ill_t *, ill, int, err);
2544                 if (need_refrele)
2545                         ill_refrele(ill);
2546                 return (NULL);
2547         }
2548         /*
2549          * If the ill was an under-ill of an IPMP group, we need to verify
2550          * that it is still active so that we select an active interface in
2551          * the group. However, since ipmp_ill_is_active ASSERTs for
2552          * IS_UNDER_IPMP(), we first need to verify that the ill is an
2553          * under-ill, and since this is being done in the data path, the
2554          * only way to ascertain this is by holding the ill_g_lock.
2555          */
2556         rw_enter(&ill->ill_ipst->ips_ill_g_lock, RW_READER);
2557         mutex_enter(&ill->ill_lock);
2558         mutex_enter(&ill->ill_phyint->phyint_lock);
2559         if (need_refrele && IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
2560                 /*
2561                  * need_refrele implies that the under ill was selected by
2562                  * ipmp_ill_hold_xmit_ill() because either the in_ill was an
2563                  * ipmp_ill, or we are sending a non-unicast packet on an
2564                  * under_ill. However, when we get here, the ill selected by
2565                  * ipmp_ill_hold_xmit_ill was pulled out of the active set
2566                  * (for unicast) or cast_ill nomination (for !unicast) after
2567                  * it was picked as the outgoing ill.  We have to pick an
2568                  * active interface and/or cast_ill in the group.
2569                  */
2570                 mutex_exit(&ill->ill_phyint->phyint_lock);
2571                 nce_delete(nce);
2572                 mutex_exit(&ill->ill_lock);
2573                 rw_exit(&ill->ill_ipst->ips_ill_g_lock);
2574                 nce_refrele(nce);
2575                 ill_refrele(ill);
2576                 if ((ill = ipmp_ill_hold_xmit_ill(in_ill, is_unicast)) == NULL)
2577                         return (NULL);
2578                 goto retry;
2579         } else {
2580                 mutex_exit(&ill->ill_phyint->phyint_lock);
2581                 mutex_exit(&ill->ill_lock);
2582                 rw_exit(&ill->ill_ipst->ips_ill_g_lock);
2583         }
2584 done:
2585         ASSERT(nce->nce_ill == ill);
2586         if (need_refrele)
2587                 ill_refrele(ill);
2588         return (nce);
2589 }
2590 
2591 nce_t *
2592 arp_nce_init(ill_t *ill, in_addr_t addr4, int ire_type)
2593 {
2594         return (ire_nce_init(ill, &addr4, ire_type));
2595 }
2596 
2597 nce_t *
2598 ndp_nce_init(ill_t *ill, const in6_addr_t *addr6, int ire_type)
2599 {
2600         ASSERT((ire_type & IRE_BROADCAST) == 0);
2601         return (ire_nce_init(ill, addr6, ire_type));
2602 }
2603 
2604 /*
2605  * The caller should hold irb_lock as a writer if the ire is in a bucket.
2606  * This routine will clear ire_nce_cache, and we make sure that we can never
2607  * set ire_nce_cache after the ire is marked condemned.
2608  */
2609 void
2610 ire_make_condemned(ire_t *ire)
2611 {
2612         ip_stack_t      *ipst = ire->ire_ipst;
2613         nce_t           *nce;
2614 
2615         mutex_enter(&ire->ire_lock);
2616         ASSERT(ire->ire_bucket == NULL ||
2617             RW_WRITE_HELD(&ire->ire_bucket->irb_lock));
2618         ASSERT(!IRE_IS_CONDEMNED(ire));
2619         ire->ire_generation = IRE_GENERATION_CONDEMNED;
2620         /* Count how many condemned ires for kmem_cache callback */
2621         atomic_inc_32(&ipst->ips_num_ire_condemned);
2622         nce = ire->ire_nce_cache;
2623         ire->ire_nce_cache = NULL;
2624         mutex_exit(&ire->ire_lock);
2625         if (nce != NULL)
2626                 nce_refrele(nce);
2627 }
2628 
2629 /*
2630  * Increment the generation avoiding the special condemned value
2631  */
2632 void
2633 ire_increment_generation(ire_t *ire)
2634 {
2635         uint_t generation;
2636 
2637         mutex_enter(&ire->ire_lock);
2638         /*
         * Even though the caller has a hold, it can't prevent a concurrent
         * ire_delete from marking the IRE condemned.
2641          */
2642         if (!IRE_IS_CONDEMNED(ire)) {
2643                 generation = ire->ire_generation + 1;
2644                 if (generation == IRE_GENERATION_CONDEMNED)
2645                         generation = IRE_GENERATION_INITIAL;
2646                 ASSERT(generation != IRE_GENERATION_VERIFY);
2647                 ire->ire_generation = generation;
2648         }
2649         mutex_exit(&ire->ire_lock);
2650 }
2651 
2652 /*
 * Increment ire_generation on all the IRE_MULTICASTs.
2654  * Used when the default multicast interface (as determined by
2655  * ill_lookup_multicast) might have changed.
2656  *
 * That includes the zoneid, IFF_ flags, the IPv6 scope of the address,
 * and an ill being unplumbed.
2659  */
2660 void
2661 ire_increment_multicast_generation(ip_stack_t *ipst, boolean_t isv6)
2662 {
2663         ill_t   *ill;
2664         ill_walk_context_t ctx;
2665 
2666         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2667         if (isv6)
2668                 ill = ILL_START_WALK_V6(&ctx, ipst);
2669         else
2670                 ill = ILL_START_WALK_V4(&ctx, ipst);
2671         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
2672                 if (ILL_IS_CONDEMNED(ill))
2673                         continue;
2674                 if (ill->ill_ire_multicast != NULL)
2675                         ire_increment_generation(ill->ill_ire_multicast);
2676         }
2677         rw_exit(&ipst->ips_ill_g_lock);
2678 }
2679 
2680 /*
2681  * Return a held IRE_NOROUTE with RTF_REJECT set
2682  */
2683 ire_t *
2684 ire_reject(ip_stack_t *ipst, boolean_t isv6)
2685 {
2686         ire_t *ire;
2687 
2688         if (isv6)
2689                 ire = ipst->ips_ire_reject_v6;
2690         else
2691                 ire = ipst->ips_ire_reject_v4;
2692 
2693         ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED);
2694         ire_refhold(ire);
2695         return (ire);
2696 }
2697 
2698 /*
2699  * Return a held IRE_NOROUTE with RTF_BLACKHOLE set
2700  */
2701 ire_t *
2702 ire_blackhole(ip_stack_t *ipst, boolean_t isv6)
2703 {
2704         ire_t *ire;
2705 
2706         if (isv6)
2707                 ire = ipst->ips_ire_blackhole_v6;
2708         else
2709                 ire = ipst->ips_ire_blackhole_v4;
2710 
2711         ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED);
2712         ire_refhold(ire);
2713         return (ire);
2714 }
2715 
2716 /*
2717  * Return a held IRE_MULTICAST.
2718  */
2719 ire_t *
2720 ire_multicast(ill_t *ill)
2721 {
2722         ire_t *ire = ill->ill_ire_multicast;
2723 
2724         ASSERT(ire == NULL || ire->ire_generation != IRE_GENERATION_CONDEMNED);
2725         if (ire == NULL)
2726                 ire = ire_blackhole(ill->ill_ipst, ill->ill_isv6);
2727         else
2728                 ire_refhold(ire);
2729         return (ire);
2730 }
2731 
2732 /*
 * Given an IRE, return its nexthop IRE. The nexthop IRE is an IRE_ONLINK
2734  * that is an exact match (i.e., a /32 for IPv4 and /128 for IPv6).
2735  * This can return an RTF_REJECT|RTF_BLACKHOLE.
2736  * The returned IRE is held.
2737  * The assumption is that ip_select_route() has been called and returned the
2738  * IRE (thus ip_select_route would have set up the ire_dep* information.)
 * If some IRE is deleted then ire_dep_remove() will have been called and
2740  * we might not find a nexthop IRE, in which case we return NULL.
2741  */
2742 ire_t *
2743 ire_nexthop(ire_t *ire)
2744 {
2745         ip_stack_t      *ipst = ire->ire_ipst;
2746 
2747         /* Acquire lock to walk ire_dep_parent */
2748         rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
2749         while (ire != NULL) {
2750                 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2751                         goto done;
2752                 }
2753                 /*
2754                  * If we find an IRE_ONLINK we are done. This includes
2755                  * the case of IRE_MULTICAST.
2756                  * Note that in order to send packets we need a host-specific
2757                  * IRE_IF_ALL first in the ire_dep_parent chain. Normally this
2758                  * is done by inserting an IRE_IF_CLONE if the IRE_INTERFACE
2759                  * was not host specific.
                 * However, ip_rts_request doesn't want to send packets,
                 * hence doesn't want to allocate an IRE_IF_CLONE. Yet
                 * it needs an IRE_IF_ALL to get to the ill. Thus we
                 * also return IRE_IF_ALL entries that are not host
                 * specific here.
2764                  */
2765                 if (ire->ire_type & IRE_ONLINK)
2766                         goto done;
2767                 ire = ire->ire_dep_parent;
2768         }
2769         rw_exit(&ipst->ips_ire_dep_lock);
2770         return (NULL);
2771 
2772 done:
2773         ire_refhold(ire);
2774         rw_exit(&ipst->ips_ire_dep_lock);
2775         return (ire);
2776 }
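
/*
 * For illustration (hypothetical addresses): a packet routed via a
 * default route 0.0.0.0/0 with gateway 10.1.1.1 has an ire_dep_parent
 * chain that ends in the IRE_IF_CLONE for 10.1.1.1 itself (a /32, hence
 * IRE_ONLINK); that clone is what ire_nexthop returns.
 */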

/*
 * Find the ill used to send packets. This will be NULL in case
 * of a reject or blackhole.
 * The returned ill is held; the caller must call ill_refrele() when done.
 */
ill_t *
ire_nexthop_ill(ire_t *ire)
{
        ill_t           *ill;

        ire = ire_nexthop(ire);
        if (ire == NULL)
                return (NULL);

        /* ire_ill cannot change for an existing ire */
        ill = ire->ire_ill;
        if (ill != NULL)
                ill_refhold(ill);
        ire_refrele(ire);
        return (ill);
}

#ifdef DEBUG
static boolean_t
parent_has_child(ire_t *parent, ire_t *child)
{
        ire_t   *ire;
        ire_t   *prev;

        ire = parent->ire_dep_children;
        prev = NULL;
        while (ire != NULL) {
                if (prev == NULL) {
                        ASSERT(ire->ire_dep_sib_ptpn ==
                            &(parent->ire_dep_children));
                } else {
                        ASSERT(ire->ire_dep_sib_ptpn ==
                            &(prev->ire_dep_sib_next));
                }
                if (ire == child)
                        return (B_TRUE);
                prev = ire;
                ire = ire->ire_dep_sib_next;
        }
        return (B_FALSE);
}

static void
ire_dep_verify(ire_t *ire)
{
        ire_t           *parent = ire->ire_dep_parent;
        ire_t           *child = ire->ire_dep_children;

        ASSERT(ire->ire_ipversion == IPV4_VERSION ||
            ire->ire_ipversion == IPV6_VERSION);
        if (parent != NULL) {
                ASSERT(parent->ire_ipversion == IPV4_VERSION ||
                    parent->ire_ipversion == IPV6_VERSION);
                ASSERT(parent->ire_refcnt >= 1);
                ASSERT(parent_has_child(parent, ire));
        }
        if (child != NULL) {
                ASSERT(child->ire_ipversion == IPV4_VERSION ||
                    child->ire_ipversion == IPV6_VERSION);
                ASSERT(child->ire_dep_parent == ire);
                ASSERT(child->ire_dep_sib_ptpn != NULL);
                ASSERT(parent_has_child(ire, child));
        }
}
#endif /* DEBUG */

/*
 * Assumes ire_dep_parent is set. Remove this child from its parent's linkage.
 */
void
ire_dep_remove(ire_t *ire)
{
        ip_stack_t      *ipst = ire->ire_ipst;
        ire_t           *parent = ire->ire_dep_parent;
        ire_t           *next;
        nce_t           *nce;

        ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
        ASSERT(ire->ire_dep_parent != NULL);
        ASSERT(ire->ire_dep_sib_ptpn != NULL);

#ifdef DEBUG
        ire_dep_verify(ire);
        ire_dep_verify(parent);
#endif

        next = ire->ire_dep_sib_next;
        if (next != NULL)
                next->ire_dep_sib_ptpn = ire->ire_dep_sib_ptpn;

        ASSERT(*(ire->ire_dep_sib_ptpn) == ire);
        *(ire->ire_dep_sib_ptpn) = ire->ire_dep_sib_next;

        ire->ire_dep_sib_ptpn = NULL;
        ire->ire_dep_sib_next = NULL;

        mutex_enter(&ire->ire_lock);
        parent = ire->ire_dep_parent;
        ire->ire_dep_parent = NULL;
        mutex_exit(&ire->ire_lock);

        /*
         * Make sure all our children, grandchildren, etc. set
         * ire_dep_parent_generation to IRE_GENERATION_VERIFY, since
         * we can no longer guarantee that the children have a current
         * ire_nce_cache and ire_nexthop_ill().
         */
        if (ire->ire_dep_children != NULL)
                ire_dep_invalidate_children(ire->ire_dep_children);

        /*
         * Since the parent is gone we make sure we clear ire_nce_cache.
         * We can clear it under ire_lock even if the IRE is in use.
         */
        mutex_enter(&ire->ire_lock);
        nce = ire->ire_nce_cache;
        ire->ire_nce_cache = NULL;
        mutex_exit(&ire->ire_lock);
        if (nce != NULL)
                nce_refrele(nce);

#ifdef DEBUG
        ire_dep_verify(ire);
        ire_dep_verify(parent);
#endif

        ire_refrele_notr(parent);
        ire_refrele_notr(ire);
}
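
/*
 * Illustrative locking sketch (hypothetical caller): ire_dep_remove()
 * asserts that ips_ire_dep_lock is write-held, so callers follow the
 * pattern used by ire_dep_unbuild() below:
 *
 *      rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
 *      if (ire->ire_dep_parent != NULL)
 *              ire_dep_remove(ire);
 *      rw_exit(&ipst->ips_ire_dep_lock);
 */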

/*
 * Insert the child in the linkage of the parent.
 */
static void
ire_dep_parent_insert(ire_t *child, ire_t *parent)
{
        ip_stack_t      *ipst = child->ire_ipst;
        ire_t           *next;

        ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
        ASSERT(child->ire_dep_parent == NULL);

#ifdef DEBUG
        ire_dep_verify(child);
        ire_dep_verify(parent);
#endif
        /* No parents => no siblings */
        ASSERT(child->ire_dep_sib_ptpn == NULL);
        ASSERT(child->ire_dep_sib_next == NULL);

        ire_refhold_notr(parent);
        ire_refhold_notr(child);

        /* Head insertion */
        next = parent->ire_dep_children;
        if (next != NULL) {
                ASSERT(next->ire_dep_sib_ptpn == &(parent->ire_dep_children));
                child->ire_dep_sib_next = next;
                next->ire_dep_sib_ptpn = &(child->ire_dep_sib_next);
        }
        parent->ire_dep_children = child;
        child->ire_dep_sib_ptpn = &(parent->ire_dep_children);

        mutex_enter(&child->ire_lock);
        child->ire_dep_parent = parent;
        mutex_exit(&child->ire_lock);

#ifdef DEBUG
        ire_dep_verify(child);
        ire_dep_verify(parent);
#endif
}

/*
 * Given count worth of ires and generations, build ire_dep_* relationships
 * from ires[0] to ires[count-1]. Record generations[i+1] in
 * ire_dep_parent_generation for ires[i].
 * We graft onto an existing parent chain by making sure that we don't
 * touch ire_dep_parent for ires[count-1].
 *
 * We check for any condemned IRE in the chain and return B_FALSE in
 * that case so that the caller can tear the chain apart.
 *
 * Note that generations[0] is not used. Caller handles that.
 */
boolean_t
ire_dep_build(ire_t *ires[], uint_t generations[], uint_t count)
{
        ire_t           *ire = ires[0];
        ip_stack_t      *ipst;
        uint_t          i;

        ASSERT(count > 0);
        if (count == 1) {
                /* No work to do */
                return (B_TRUE);
        }
        ipst = ire->ire_ipst;
        rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
        /*
         * Do not remove the linkage for any existing parent chain, i.e.,
         * ires[count-1] is left alone.
         */
        for (i = 0; i < count-1; i++) {
                /* Remove existing parent if we need to change it */
                if (ires[i]->ire_dep_parent != NULL &&
                    ires[i]->ire_dep_parent != ires[i+1])
                        ire_dep_remove(ires[i]);
        }

        for (i = 0; i < count - 1; i++) {
                ASSERT(ires[i]->ire_ipversion == IPV4_VERSION ||
                    ires[i]->ire_ipversion == IPV6_VERSION);
                /* Does it need to change? */
                if (ires[i]->ire_dep_parent != ires[i+1])
                        ire_dep_parent_insert(ires[i], ires[i+1]);

                mutex_enter(&ires[i+1]->ire_lock);
                if (IRE_IS_CONDEMNED(ires[i+1])) {
                        mutex_exit(&ires[i+1]->ire_lock);
                        rw_exit(&ipst->ips_ire_dep_lock);
                        return (B_FALSE);
                }
                mutex_exit(&ires[i+1]->ire_lock);

                mutex_enter(&ires[i]->ire_lock);
                ires[i]->ire_dep_parent_generation = generations[i+1];
                mutex_exit(&ires[i]->ire_lock);
        }
        rw_exit(&ipst->ips_ire_dep_lock);
        return (B_TRUE);
}
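
/*
 * Hedged sketch of an ire_dep_build() caller (hypothetical; the recursive
 * route lookup is the real user, and MAX_IRE_RECURSION is assumed here to
 * bound the chain length):
 *
 *      ire_t   *ires[MAX_IRE_RECURSION];
 *      uint_t  generations[MAX_IRE_RECURSION];
 *      uint_t  n;
 *
 *      ... fill ires[0..n-1] from the most-specific route down to the
 *      ... IRE_ONLINK, recording each ire_generation in generations[] ...
 *      if (!ire_dep_build(ires, generations, n))
 *              ire_dep_unbuild(ires, n);       ... hit a condemned IRE ...
 */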

/*
 * Given count worth of ires, unbuild ire_dep_* relationships
 * from ires[0] to ires[count-1].
 */
void
ire_dep_unbuild(ire_t *ires[], uint_t count)
{
        ip_stack_t      *ipst;
        uint_t          i;

        if (count == 0) {
                /* No work to do */
                return;
        }
        ipst = ires[0]->ire_ipst;
        rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
        for (i = 0; i < count; i++) {
                ASSERT(ires[i]->ire_ipversion == IPV4_VERSION ||
                    ires[i]->ire_ipversion == IPV6_VERSION);
                if (ires[i]->ire_dep_parent != NULL)
                        ire_dep_remove(ires[i]);
                mutex_enter(&ires[i]->ire_lock);
                ires[i]->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
                mutex_exit(&ires[i]->ire_lock);
        }
        rw_exit(&ipst->ips_ire_dep_lock);
}

/*
 * Both the forwarding and the outbound code paths can trip on
 * a condemned NCE, in which case we call this function.
 * We have two different behaviors: if the NCE was UNREACHABLE
 * it is an indication that something failed. In that case
 * we see if we should look for a different IRE (for example,
 * delete any matching redirect IRE, or try a different
 * IRE_DEFAULT (ECMP)). We mark the ire as bad so that, hopefully,
 * a different IRE will be picked the next time we send/forward.
 *
 * If we are called by the output path then fail_if_better is set
 * and we return NULL if there could be a better IRE. This is because the
 * output path retries the IRE lookup. (The input/forward path cannot retry.)
 *
 * If the NCE was not unreachable then we pick/allocate a
 * new (most likely ND_INITIAL) NCE and proceed with it.
 *
 * ipha/ip6h are needed for multicast packets; ipha needs to be
 * set for IPv4 and ip6h needs to be set for IPv6 packets.
 */
nce_t *
ire_handle_condemned_nce(nce_t *nce, ire_t *ire, ipha_t *ipha, ip6_t *ip6h,
    boolean_t fail_if_better)
{
        if (nce->nce_common->ncec_state == ND_UNREACHABLE) {
                if (ire_no_good(ire) && fail_if_better) {
                        /*
                         * We made some changes, or ECMP routes likely exist.
                         * Make ip_output look for a different IRE.
                         */
                        return (NULL);
                }
        }
        if (ire_revalidate_nce(ire) == ENETUNREACH) {
                /* The ire_dep_parent chain went bad, or no memory? */
                (void) ire_no_good(ire);
                return (NULL);
        }
        if (ire->ire_ipversion == IPV4_VERSION) {
                ASSERT(ipha != NULL);
                nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
        } else {
                ASSERT(ip6h != NULL);
                nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst);
        }

        if (nce == NULL)
                return (NULL);
        if (nce->nce_is_condemned) {
                nce_refrele(nce);
                return (NULL);
        }
        return (nce);
}
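
/*
 * Hedged sketch of the calling pattern (hypothetical; the real callers are
 * the output and forwarding paths). With fail_if_better set, a NULL return
 * tells the output path to redo ip_select_route():
 *
 *      if (nce->nce_is_condemned) {
 *              nce_t *nce1;
 *
 *              nce1 = ire_handle_condemned_nce(nce, ire, ipha, ip6h, B_TRUE);
 *              nce_refrele(nce);
 *              if (nce1 == NULL)
 *                      ... retry the IRE lookup (output path) or drop ...
 *              nce = nce1;
 *      }
 */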

/*
 * The caller has found that the ire is bad, either due to a reference to an NCE
 * in ND_UNREACHABLE state, or a MULTIRT route whose gateway can't be resolved.
 * We update things so that a subsequent attempt to send to the destination
 * is likely to find a different IRE, or so that a new NCE will be created.
 *
 * Returns B_TRUE if it is likely that a subsequent ire_ftable_lookup would
 * find a different route (either due to having deleted a redirect, or there
 * being ECMP routes.)
 *
 * If we have a redirect (RTF_DYNAMIC) we delete it.
 * Otherwise we increment ire_badcnt and increment the generation number so
 * that a cached ixa_ire will redo the route selection. ire_badcnt is taken
 * into account in the route selection when we have multiple choices (multiple
 * default routes or ECMP in general).
 * Any time ip_select_route finds an ire with a condemned ire_nce_cache
 * (e.g., if there is no equal-cost route to the bad one) it will make
 * sure the NCE is revalidated to avoid getting stuck on an
 * NCE_F_CONDEMNED ncec that caused ire_no_good to be called.
 */
boolean_t
ire_no_good(ire_t *ire)
{
        ip_stack_t      *ipst = ire->ire_ipst;
        ire_t           *ire2;
        nce_t           *nce;

        if (ire->ire_flags & RTF_DYNAMIC) {
                ire_delete(ire);
                return (B_TRUE);
        }
        if (ire->ire_flags & RTF_INDIRECT) {
                /* Check if next IRE is a redirect */
                rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
                if (ire->ire_dep_parent != NULL &&
                    (ire->ire_dep_parent->ire_flags & RTF_DYNAMIC)) {
                        ire2 = ire->ire_dep_parent;
                        ire_refhold(ire2);
                } else {
                        ire2 = NULL;
                }
                rw_exit(&ipst->ips_ire_dep_lock);
                if (ire2 != NULL) {
                        ire_delete(ire2);
                        ire_refrele(ire2);
                        return (B_TRUE);
                }
        }
        /*
         * No redirect involved. Increment badcnt so that if we have ECMP
         * routes we are likely to pick a different one for the next packet.
         *
         * If the NCE is unreachable and condemned we should drop the reference
         * to it so that a new NCE can be created.
         *
         * Finally we increment the generation number so that any ixa_ire
         * cache will be revalidated.
         */
        mutex_enter(&ire->ire_lock);
        ire->ire_badcnt++;
        ire->ire_last_badcnt = TICK_TO_SEC(ddi_get_lbolt64());
        nce = ire->ire_nce_cache;
        if (nce != NULL && nce->nce_is_condemned &&
            nce->nce_common->ncec_state == ND_UNREACHABLE)
                ire->ire_nce_cache = NULL;
        else
                nce = NULL;
        mutex_exit(&ire->ire_lock);
        if (nce != NULL)
                nce_refrele(nce);

        ire_increment_generation(ire);
        ire_dep_incr_generation(ire);

        return (ire->ire_bucket->irb_ire_cnt > 1);
}

/*
 * Walk the ire_dep_parent chain and validate that
 * ire_dep_parent->ire_generation == ire_dep_parent_generation.
 * If they all match we just return ire_generation from the topmost IRE.
 * Otherwise we propagate the mismatch by setting all ire_dep_parent_generation
 * above the mismatch to IRE_GENERATION_VERIFY and also returning
 * IRE_GENERATION_VERIFY.
 */
uint_t
ire_dep_validate_generations(ire_t *ire)
{
        ip_stack_t      *ipst = ire->ire_ipst;
        uint_t          generation;
        ire_t           *ire1;

        rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
        generation = ire->ire_generation;    /* Assuming things match */
        for (ire1 = ire; ire1 != NULL; ire1 = ire1->ire_dep_parent) {
                ASSERT(ire1->ire_ipversion == IPV4_VERSION ||
                    ire1->ire_ipversion == IPV6_VERSION);
                if (ire1->ire_dep_parent == NULL)
                        break;
                if (ire1->ire_dep_parent_generation !=
                    ire1->ire_dep_parent->ire_generation)
                        goto mismatch;
        }
        rw_exit(&ipst->ips_ire_dep_lock);
        return (generation);

mismatch:
        generation = IRE_GENERATION_VERIFY;
        /* Fill from top down to the mismatch with _VERIFY */
        while (ire != ire1) {
                ASSERT(ire->ire_ipversion == IPV4_VERSION ||
                    ire->ire_ipversion == IPV6_VERSION);
                mutex_enter(&ire->ire_lock);
                ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
                mutex_exit(&ire->ire_lock);
                ire = ire->ire_dep_parent;
        }
        rw_exit(&ipst->ips_ire_dep_lock);
        return (generation);
}
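
/*
 * Illustrative check (hypothetical caller): a cached route must be redone
 * when the validated generation comes back as IRE_GENERATION_VERIFY:
 *
 *      if (ire_dep_validate_generations(ire) == IRE_GENERATION_VERIFY)
 *              ... redo the recursive route lookup ...
 */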

/*
 * Used when we need to return an ire with ire_dep_parent, but we
 * know the chain is invalid (for instance, we didn't create an
 * IRE_IF_CLONE). Using IRE_GENERATION_VERIFY means that next time
 * we'll redo the recursive lookup.
 */
void
ire_dep_invalidate_generations(ire_t *ire)
{
        ip_stack_t      *ipst = ire->ire_ipst;

        rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
        while (ire != NULL) {
                ASSERT(ire->ire_ipversion == IPV4_VERSION ||
                    ire->ire_ipversion == IPV6_VERSION);
                mutex_enter(&ire->ire_lock);
                ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
                mutex_exit(&ire->ire_lock);
                ire = ire->ire_dep_parent;
        }
        rw_exit(&ipst->ips_ire_dep_lock);
}

/* Set _VERIFY ire_dep_parent_generation for all children recursively */
static void
ire_dep_invalidate_children(ire_t *child)
{
        ip_stack_t      *ipst = child->ire_ipst;

        ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
        /* Depth first */
        if (child->ire_dep_children != NULL)
                ire_dep_invalidate_children(child->ire_dep_children);

        while (child != NULL) {
                mutex_enter(&child->ire_lock);
                child->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
                mutex_exit(&child->ire_lock);
                child = child->ire_dep_sib_next;
        }
}

static void
ire_dep_increment_children(ire_t *child)
{
        ip_stack_t      *ipst = child->ire_ipst;

        ASSERT(RW_READ_HELD(&ipst->ips_ire_dep_lock));
        /* Depth first */
        if (child->ire_dep_children != NULL)
                ire_dep_increment_children(child->ire_dep_children);

        while (child != NULL) {
                if (!IRE_IS_CONDEMNED(child))
                        ire_increment_generation(child);
                child = child->ire_dep_sib_next;
        }
}

/*
 * Walk all the children of this ire recursively and increment their
 * generation number.
 */
static void
ire_dep_incr_generation_locked(ire_t *parent)
{
        ASSERT(RW_READ_HELD(&parent->ire_ipst->ips_ire_dep_lock));
        if (parent->ire_dep_children != NULL)
                ire_dep_increment_children(parent->ire_dep_children);
}

void
ire_dep_incr_generation(ire_t *parent)
{
        ip_stack_t      *ipst = parent->ire_ipst;

        rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
        ire_dep_incr_generation_locked(parent);
        rw_exit(&ipst->ips_ire_dep_lock);
}

/*
 * Get a new ire_nce_cache for this IRE as well as its nexthop.
 * Returns zero if it succeeds. Can fail due to lack of memory or when
 * the route has become unreachable. Returns ENOMEM and ENETUNREACH,
 * respectively, in those cases.
 *
 * In the in.mpathd case, the ire will have ire_testhidden
 * set, so we should create the ncec for the underlying ill.
 *
 * Note that the error returned by ire_revalidate_nce() is ignored by most
 * callers except ire_handle_condemned_nce(), which handles the ENETUNREACH
 * error to mark potentially bad IREs. For all the other callers, an
 * error return could indicate a transient condition like ENOMEM, or could
 * be the result of an interface that is going down/unplumbing. In the former
 * case (transient error), we would leave the old stale ire/ire_nce_cache
 * in place, and possibly use incorrect link-layer information to send packets
 * but would eventually recover. In the latter case (ill down/replumb),
 * ire_revalidate_nce() might return a condemned nce back, but we would then
 * recover in the packet output path.
 */
int
ire_revalidate_nce(ire_t *ire)
{
        nce_t           *nce, *old_nce;
        ire_t           *nexthop;

        /*
         * For multicast we conceptually have an NCE but we don't store it
         * in ire_nce_cache; when ire_to_nce is called we allocate the nce.
         */
        if (ire->ire_type & IRE_MULTICAST)
                return (0);

        /* ire_testhidden should only be set on under-interfaces */
        ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill));

        nexthop = ire_nexthop(ire);
        if (nexthop == NULL) {
                /* The route is potentially bad */
                (void) ire_no_good(ire);
                return (ENETUNREACH);
        }
        if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
                ASSERT(ire->ire_ill != NULL);

                if (ire->ire_ipversion == IPV4_VERSION)
                        nce = nce_lookup_v4(ire->ire_ill, &ire->ire_addr);
                else
                        nce = nce_lookup_v6(ire->ire_ill, &ire->ire_addr_v6);
        } else {
                ASSERT(nexthop->ire_type & IRE_ONLINK);
                if (ire->ire_ipversion == IPV4_VERSION) {
                        nce = arp_nce_init(nexthop->ire_ill, nexthop->ire_addr,
                            nexthop->ire_type);
                } else {
                        nce = ndp_nce_init(nexthop->ire_ill,
                            &nexthop->ire_addr_v6, nexthop->ire_type);
                }
        }
        if (nce == NULL) {
                /*
                 * Leave the old stale one in place to avoid a NULL
                 * ire_nce_cache.
                 */
                ire_refrele(nexthop);
                return (ENOMEM);
        }

        if (nexthop != ire) {
                /* Update the nexthop ire */
                mutex_enter(&nexthop->ire_lock);
                old_nce = nexthop->ire_nce_cache;
                if (!IRE_IS_CONDEMNED(nexthop)) {
                        nce_refhold(nce);
                        nexthop->ire_nce_cache = nce;
                } else {
                        nexthop->ire_nce_cache = NULL;
                }
                mutex_exit(&nexthop->ire_lock);
                if (old_nce != NULL)
                        nce_refrele(old_nce);
        }
        ire_refrele(nexthop);

        mutex_enter(&ire->ire_lock);
        old_nce = ire->ire_nce_cache;
        if (!IRE_IS_CONDEMNED(ire)) {
                nce_refhold(nce);
                ire->ire_nce_cache = nce;
        } else {
                ire->ire_nce_cache = NULL;
        }
        mutex_exit(&ire->ire_lock);
        if (old_nce != NULL)
                nce_refrele(old_nce);

        nce_refrele(nce);
        return (0);
}
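
/*
 * Per the note above, a tolerant caller (illustrative sketch) can simply
 * ignore the return value and keep the old, possibly stale, ire_nce_cache
 * on failure:
 *
 *      (void) ire_revalidate_nce(ire);
 *
 * whereas ire_handle_condemned_nce() above acts on ENETUNREACH.
 */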

/*
 * Get a held nce for a given ire.
 * In the common case this is just from ire_nce_cache.
 * For IRE_MULTICAST this needs to do an explicit lookup since we do not
 * have an IRE_MULTICAST per address.
 * Note that this explicitly returns CONDEMNED NCEs. The caller needs those
 * so they can check whether the NCE went unreachable (as opposed to was
 * condemned for some other reason).
 */
nce_t *
ire_to_nce(ire_t *ire, ipaddr_t v4nexthop, const in6_addr_t *v6nexthop)
{
        nce_t   *nce;

        if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
                return (NULL);

        /* ire_testhidden should only be set on under-interfaces */
        ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill));

        mutex_enter(&ire->ire_lock);
        nce = ire->ire_nce_cache;
        if (nce != NULL) {
                nce_refhold(nce);
                mutex_exit(&ire->ire_lock);
                return (nce);
        }
        mutex_exit(&ire->ire_lock);

        if (ire->ire_type & IRE_MULTICAST) {
                ASSERT(ire->ire_ill != NULL);

                if (ire->ire_ipversion == IPV4_VERSION) {
                        ASSERT(v6nexthop == NULL);

                        nce = arp_nce_init(ire->ire_ill, v4nexthop,
                            ire->ire_type);
                } else {
                        ASSERT(v6nexthop != NULL);
                        ASSERT(v4nexthop == 0);
                        nce = ndp_nce_init(ire->ire_ill, v6nexthop,
                            ire->ire_type);
                }
                return (nce);
        }
        return (NULL);
}
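
/*
 * Hedged transmit sketch (hypothetical caller; ire_to_nce_pkt() below is
 * the packet-based convenience wrapper):
 *
 *      nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
 *      if (nce == NULL) {
 *              ... reject/blackhole route, or allocation failure: drop ...
 *      } else {
 *              ... transmit using the nce ...
 *              nce_refrele(nce);
 *      }
 */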

nce_t *
ire_to_nce_pkt(ire_t *ire, mblk_t *mp)
{
        ipha_t          *ipha;
        ip6_t           *ip6h;

        if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
                ipha = (ipha_t *)mp->b_rptr;
                return (ire_to_nce(ire, ipha->ipha_dst, NULL));
        } else {
                ip6h = (ip6_t *)mp->b_rptr;
                return (ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst));
        }
}

/*
 * Given an IRE_INTERFACE (that matches more than one address) create
 * and return an IRE_IF_CLONE for the specific address.
 * Return the generation number through generationp.
 * Returns NULL if there is no memory for the IRE.
 * Handles both IPv4 and IPv6.
 *
 * IRE_IF_CLONE entries may only be created and added by calling
 * ire_create_if_clone(), and we depend on the fact that ire_add will
 * atomically ensure that attempts to add multiple identical IRE_IF_CLONE
 * entries will not result in duplicate (i.e., ire_identical_ref > 1)
 * CLONE entries, so that a single ire_delete is sufficient to remove the
 * CLONE.
 */
ire_t *
ire_create_if_clone(ire_t *ire_if, const in6_addr_t *addr, uint_t *generationp)
{
        ire_t           *ire;
        ire_t           *nire;

        if (ire_if->ire_ipversion == IPV4_VERSION) {
                ipaddr_t        v4addr;
                ipaddr_t        mask = IP_HOST_MASK;

                ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
                IN6_V4MAPPED_TO_IPADDR(addr, v4addr);

                ire = ire_create(
                    (uchar_t *)&v4addr,                 /* dest address */
                    (uchar_t *)&mask,                   /* mask */
                    (uchar_t *)&ire_if->ire_gateway_addr,
                    IRE_IF_CLONE,                       /* IRE type */
                    ire_if->ire_ill,
                    ire_if->ire_zoneid,
                    ire_if->ire_flags | RTF_HOST,
                    NULL,               /* No security attr for IRE_IF_ALL */
                    ire_if->ire_ipst);
        } else {
                ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
                ire = ire_create_v6(
                    addr,                               /* dest address */
                    &ipv6_all_ones,                     /* mask */
                    &ire_if->ire_gateway_addr_v6,       /* gateway addr */
                    IRE_IF_CLONE,                       /* IRE type */
                    ire_if->ire_ill,
                    ire_if->ire_zoneid,
                    ire_if->ire_flags | RTF_HOST,
                    NULL,               /* No security attr for IRE_IF_ALL */
                    ire_if->ire_ipst);
        }
        if (ire == NULL)
                return (NULL);

        /* Take the metrics, in particular the mtu, from the IRE_IF */
        ire->ire_metrics = ire_if->ire_metrics;

        nire = ire_add(ire);
        if (nire == NULL) /* Some failure */
                return (NULL);

        if (generationp != NULL)
                *generationp = nire->ire_generation;

        return (nire);
}
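
/*
 * Illustrative IPv4 use of ire_create_if_clone() (hypothetical caller):
 * the destination is passed as an IPv4-mapped IPv6 address, and the
 * result is held:
 *
 *      in6_addr_t      v6dst;
 *      uint_t          generation;
 *      ire_t           *clone;
 *
 *      IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 *      clone = ire_create_if_clone(ire_if, &v6dst, &generation);
 *      if (clone != NULL)
 *              ire_refrele(clone);
 */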

/*
 * The argument is an IRE_INTERFACE. Delete all of the IRE_IF_CLONE entries
 * in its ire_dep_children (just walk the ire_dep_sib_next since they are
 * all immediate children.)
 * Since we hold a lock while we remove them we need to defer the actual
 * calls to ire_delete() until we have dropped the lock. This makes things
 * less efficient since we restart at the top after dropping the lock. But
 * we only run when an IRE_INTERFACE is deleted which is infrequent.
 *
 * Note that ire_dep_children can be any mixture of offlink routes and
 * IRE_IF_CLONE entries.
 */
void
ire_dep_delete_if_clone(ire_t *parent)
{
        ip_stack_t      *ipst = parent->ire_ipst;
        ire_t           *child, *next;

restart:
        rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
        if (parent->ire_dep_children == NULL) {
                rw_exit(&ipst->ips_ire_dep_lock);
                return;
        }
        child = parent->ire_dep_children;
        while (child != NULL) {
                next = child->ire_dep_sib_next;
                if ((child->ire_type & IRE_IF_CLONE) &&
                    !IRE_IS_CONDEMNED(child)) {
                        ire_refhold(child);
                        rw_exit(&ipst->ips_ire_dep_lock);
                        ire_delete(child);
                        ASSERT(IRE_IS_CONDEMNED(child));
                        ire_refrele(child);
                        goto restart;
                }
                child = next;
        }
        rw_exit(&ipst->ips_ire_dep_lock);
}

/*
 * In the preferred/strict src multihoming modes, unbound routes (i.e.,
 * ire_t entries with ire_unbound set to B_TRUE) are bound to an interface
 * by selecting the first available interface that has an interface route for
 * the ire_gateway. If that interface is subsequently brought down, ill_downi()
 * will call ire_rebind() so that the unbound route can be bound to some other
 * matching interface, thereby preserving the intended reachability information
 * from the original unbound route.
 */
void
ire_rebind(ire_t *ire)
{
        ire_t   *gw_ire, *new_ire;
        int     match_flags = MATCH_IRE_TYPE;
        ill_t   *gw_ill;
        boolean_t isv6 = (ire->ire_ipversion == IPV6_VERSION);
        ip_stack_t *ipst = ire->ire_ipst;

        ASSERT(ire->ire_unbound);
again:
        if (isv6) {
                gw_ire = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, 0, 0,
                    IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0,
                    ipst, NULL);
        } else {
                gw_ire = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
                    IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0,
                    ipst, NULL);
        }
        if (gw_ire == NULL) {
                /* see comments in ip_rt_add[_v6]() for IPMP */
                if (match_flags & MATCH_IRE_TESTHIDDEN)
                        return;

                match_flags |= MATCH_IRE_TESTHIDDEN;
                goto again;
        }
        gw_ill = gw_ire->ire_ill;
        if (isv6) {
                new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
                    &ire->ire_gateway_addr_v6, ire->ire_type, gw_ill,
                    ire->ire_zoneid, ire->ire_flags, NULL, ipst);
        } else {
                new_ire = ire_create((uchar_t *)&ire->ire_addr,
                    (uchar_t *)&ire->ire_mask,
                    (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, gw_ill,
                    ire->ire_zoneid, ire->ire_flags, NULL, ipst);
        }
        ire_refrele(gw_ire);
        if (new_ire == NULL)
                return;
        new_ire->ire_unbound = B_TRUE;
        new_ire = ire_add(new_ire);
        if (new_ire != NULL)
                ire_refrele(new_ire);
}
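
/*
 * Illustrative sketch (hypothetical): code bringing down an interface would
 * rebind each unbound route that resolved through it, e.g.:
 *
 *      if (ire->ire_unbound)
 *              ire_rebind(ire);
 *
 * leaving bound routes to be handled by the normal deletion path.
 */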