1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include <sys/types.h>
  26 #include <sys/stream.h>
  27 #include <sys/stropts.h>
  28 #include <sys/strsun.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/errno.h>
  31 #include <sys/dlpi.h>
  32 #include <sys/socket.h>
  33 #include <sys/ddi.h>
  34 #include <sys/sunddi.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/debug.h>
  37 #include <sys/vtrace.h>
  38 #include <sys/kmem.h>
  39 #include <sys/zone.h>
  40 #include <sys/ethernet.h>
  41 #include <sys/sdt.h>
  42 #include <sys/mac.h>
  43 
  44 #include <net/if.h>
  45 #include <net/if_types.h>
  46 #include <net/if_dl.h>
  47 #include <net/route.h>
  48 #include <netinet/in.h>
  49 #include <netinet/ip6.h>
  50 #include <netinet/icmp6.h>
  51 
  52 #include <inet/common.h>
  53 #include <inet/mi.h>
  54 #include <inet/mib2.h>
  55 #include <inet/nd.h>
  56 #include <inet/ip.h>
  57 #include <inet/ip_impl.h>
  58 #include <inet/ipclassifier.h>
  59 #include <inet/ip_if.h>
  60 #include <inet/ip_ire.h>
  61 #include <inet/ip_rts.h>
  62 #include <inet/ip6.h>
  63 #include <inet/ip_ndp.h>
  64 #include <inet/sctp_ip.h>
  65 #include <inet/ip_arp.h>
  66 #include <inet/ip2mac_impl.h>
  67 
  68 #define ANNOUNCE_INTERVAL(isv6) \
  69         (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
  70         ipst->ips_ip_arp_publish_interval)
  71 
  72 #define DEFENSE_INTERVAL(isv6) \
  73         (isv6 ? ipst->ips_ndp_defend_interval : \
  74         ipst->ips_arp_defend_interval)
  75 
  76 /* Non-tunable probe interval, based on link capabilities */
  77 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
  78 
  79 /*
  80  * The IPv4 Link Local address space is special; we do extra duplicate checking
  81  * there, as the entire assignment mechanism rests on random numbers.
  82  */
  83 #define IS_IPV4_LL_SPACE(ptr)   (((uchar_t *)ptr)[0] == 169 && \
  84                                 ((uchar_t *)ptr)[1] == 254)
  85 
  86 /*
  87  * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
  88  * in to the ncec*add* functions.
  89  *
  90  * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
  91  * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
  92  * that we will respond to requests for the protocol address.
  93  */
  94 #define NCE_EXTERNAL_FLAGS_MASK \
  95         (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
  96         NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
  97         NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
  98 
  99 /*
 100  * Lock ordering:
 101  *
 102  *      ndp_g_lock -> ill_lock -> ncec_lock
 103  *
 104  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 105  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 106  * ncec_refcnt).
 107  */
 108 
 109 static  void    nce_cleanup_list(ncec_t *ncec);
 110 static  void    nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
 111 static  ncec_t  *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
 112     ncec_t *);
 113 static  nce_t   *nce_lookup_addr(ill_t *, const in6_addr_t *);
 114 static  int     nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
 115     uint16_t ncec_flags, nce_t **newnce);
 116 static  int     nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
 117     uint16_t ncec_flags, nce_t **newnce);
 118 static  boolean_t       ndp_xmit(ill_t *ill, uint32_t operation,
 119     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
 120     const in6_addr_t *target, int flag);
 121 static void     ncec_refhold_locked(ncec_t *);
 122 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
 123 static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
 124 static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 125     uint16_t, uint16_t, nce_t **);
 126 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
 127 static nce_t *nce_add(ill_t *, ncec_t *);
 128 static void nce_inactive(nce_t *);
 129 extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
 130 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
 131 static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 132     uint16_t, uint16_t, nce_t **);
 133 static int      nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
 134     uint16_t, uint16_t, nce_t **);
 135 static int  nce_add_v6_postprocess(nce_t *);
 136 static int  nce_add_v4_postprocess(nce_t *);
 137 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
 138 static clock_t nce_fuzz_interval(clock_t, boolean_t);
 139 static void nce_resolv_ipmp_ok(ncec_t *);
 140 static void nce_walk_common(ill_t *, pfi_t, void *);
 141 static void nce_start_timer(ncec_t *, uint_t);
 142 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
 143 static void nce_fastpath_trigger(nce_t *);
 144 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
 145 
 146 #ifdef DEBUG
 147 static void     ncec_trace_cleanup(const ncec_t *);
 148 #endif
 149 
 150 #define NCE_HASH_PTR_V4(ipst, addr)                                     \
 151         (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
 152 
 153 #define NCE_HASH_PTR_V6(ipst, addr)                              \
 154         (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
 155                 NCE_TABLE_SIZE)]))
 156 
 157 extern kmem_cache_t *ncec_cache;
 158 extern kmem_cache_t *nce_cache;
 159 
 160 /*
 161  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 162  * If src_ill is not null, the ncec_addr is bound to src_ill. The
 163  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 164  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 165  * IPMP cast_ill (in the IPMP case).
 166  *
 167  * Note that the probe interval is based on the src_ill for IPv6, and
 168  * the ncec_xmit_interval for IPv4.
 169  */
 170 static void
 171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
 172 {
 173         boolean_t dropped;
 174         uint32_t probe_interval;
 175 
 176         ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
 177         ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
 178         if (ncec->ncec_ipversion == IPV6_VERSION) {
 179                 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
 180                     ncec->ncec_lladdr, ncec->ncec_lladdr_length,
 181                     &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
 182                 probe_interval = ILL_PROBE_INTERVAL(src_ill);
 183         } else {
 184                 /* IPv4 DAD delay the initial probe. */
 185                 if (send_probe)
 186                         dropped = arp_probe(ncec);
 187                 else
 188                         dropped = B_TRUE;
 189                 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
 190                     !send_probe);
 191         }
 192         if (!dropped) {
 193                 mutex_enter(&ncec->ncec_lock);
 194                 ncec->ncec_pcnt--;
 195                 mutex_exit(&ncec->ncec_lock);
 196         }
 197         nce_restart_timer(ncec, probe_interval);
 198 }
 199 
 200 /*
 201  * Compute default flags to use for an advertisement of this ncec's address.
 202  */
 203 static int
 204 nce_advert_flags(const ncec_t *ncec)
 205 {
 206         int flag = 0;
 207 
 208         if (ncec->ncec_flags & NCE_F_ISROUTER)
 209                 flag |= NDP_ISROUTER;
 210         if (!(ncec->ncec_flags & NCE_F_ANYCAST))
 211                 flag |= NDP_ORIDE;
 212 
 213         return (flag);
 214 }
 215 
 216 /*
 217  * NDP Cache Entry creation routine.
 218  * This routine must always be called with ndp6->ndp_g_lock held.
 219  */
 220 int
 221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
 222     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
 223 {
 224         int             err;
 225         nce_t           *nce;
 226 
 227         ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
 228         ASSERT(ill != NULL && ill->ill_isv6);
 229 
 230         err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
 231             &nce);
 232         if (err != 0)
 233                 return (err);
 234         ASSERT(newnce != NULL);
 235         *newnce = nce;
 236         return (err);
 237 }
 238 
 239 /*
 240  * Post-processing routine to be executed after nce_add_v6(). This function
 241  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 242  * and must be called without any locks held.
 243  */
 244 int
 245 nce_add_v6_postprocess(nce_t *nce)
 246 {
 247         ncec_t          *ncec = nce->nce_common;
 248         boolean_t       dropped = B_FALSE;
 249         uchar_t         *hw_addr = ncec->ncec_lladdr;
 250         uint_t          hw_addr_len = ncec->ncec_lladdr_length;
 251         ill_t           *ill = ncec->ncec_ill;
 252         int             err = 0;
 253         uint16_t        flags = ncec->ncec_flags;
 254         ip_stack_t      *ipst = ill->ill_ipst;
 255         boolean_t       trigger_fastpath = B_TRUE;
 256 
 257         /*
 258          * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
 259          * we call nce_fastpath as soon as the ncec is resolved in nce_process.
 260          * We call nce_fastpath from nce_update if the link layer address of
 261          * the peer changes from nce_update
 262          */
 263         if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
 264             (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
 265                 trigger_fastpath = B_FALSE;
 266 
 267         if (trigger_fastpath)
 268                 nce_fastpath_trigger(nce);
 269         if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
 270                 ill_t *hwaddr_ill;
 271                 /*
 272                  * Unicast entry that needs DAD.
 273                  */
 274                 if (IS_IPMP(ill)) {
 275                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
 276                             hw_addr, hw_addr_len);
 277                 } else {
 278                         hwaddr_ill = ill;
 279                 }
 280                 nce_dad(ncec, hwaddr_ill, B_TRUE);
 281                 err = EINPROGRESS;
 282         } else if (flags & NCE_F_UNSOL_ADV) {
 283                 /*
 284                  * We account for the transmit below by assigning one
 285                  * less than the ndd variable. Subsequent decrements
 286                  * are done in nce_timer.
 287                  */
 288                 mutex_enter(&ncec->ncec_lock);
 289                 ncec->ncec_unsolicit_count =
 290                     ipst->ips_ip_ndp_unsolicit_count - 1;
 291                 mutex_exit(&ncec->ncec_lock);
 292                 dropped = ndp_xmit(ill,
 293                     ND_NEIGHBOR_ADVERT,
 294                     hw_addr,
 295                     hw_addr_len,
 296                     &ncec->ncec_addr,    /* Source and target of the adv */
 297                     &ipv6_all_hosts_mcast, /* Destination of the packet */
 298                     nce_advert_flags(ncec));
 299                 mutex_enter(&ncec->ncec_lock);
 300                 if (dropped)
 301                         ncec->ncec_unsolicit_count++;
 302                 else
 303                         ncec->ncec_last_time_defended = ddi_get_lbolt();
 304                 if (ncec->ncec_unsolicit_count != 0) {
 305                         nce_start_timer(ncec,
 306                             ipst->ips_ip_ndp_unsolicit_interval);
 307                 }
 308                 mutex_exit(&ncec->ncec_lock);
 309         }
 310         return (err);
 311 }
 312 
 313 /*
 314  * Atomically lookup and add (if needed) Neighbor Cache information for
 315  * an address.
 316  *
 317  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 318  * are always added pointing at the ipmp_ill. Thus, when the ill passed
 319  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 320  * entries will be created, both pointing at the same ncec_t. The nce_t
 321  * entries will have their nce_ill set to the ipmp_ill and the under_ill
 322  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 323  * Local addresses are always created on the ill passed to nce_add_v6.
 324  */
 325 int
 326 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
 327     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
 328 {
 329         int             err = 0;
 330         ip_stack_t      *ipst = ill->ill_ipst;
 331         nce_t           *nce, *upper_nce = NULL;
 332         ill_t           *in_ill = ill;
 333         boolean_t       need_ill_refrele = B_FALSE;
 334 
 335         if (flags & NCE_F_MCAST) {
 336                 /*
 337                  * hw_addr will be figured out in nce_set_multicast_v6;
 338                  * caller has to select the cast_ill
 339                  */
 340                 ASSERT(hw_addr == NULL);
 341                 ASSERT(!IS_IPMP(ill));
 342                 err = nce_set_multicast_v6(ill, addr, flags, newnce);
 343                 return (err);
 344         }
 345         ASSERT(ill->ill_isv6);
 346         if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
 347                 ill = ipmp_ill_hold_ipmp_ill(ill);
 348                 if (ill == NULL)
 349                         return (ENXIO);
 350                 need_ill_refrele = B_TRUE;
 351         }
 352 
 353         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 354         nce = nce_lookup_addr(ill, addr);
 355         if (nce == NULL) {
 356                 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
 357                     &nce);
 358         } else {
 359                 err = EEXIST;
 360         }
 361         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 362         if (err == 0)
 363                 err = nce_add_v6_postprocess(nce);
 364         if (in_ill != ill && nce != NULL) {
 365                 nce_t *under_nce = NULL;
 366 
 367                 /*
 368                  * in_ill was the under_ill. Try to create the under_nce.
 369                  * Hold the ill_g_lock to prevent changes to group membership
 370                  * until we are done.
 371                  */
 372                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 373                 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
 374                         DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
 375                             ill_t *, ill);
 376                         rw_exit(&ipst->ips_ill_g_lock);
 377                         err = ENXIO;
 378                         nce_refrele(nce);
 379                         nce = NULL;
 380                         goto bail;
 381                 }
 382                 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
 383                 if (under_nce == NULL) {
 384                         rw_exit(&ipst->ips_ill_g_lock);
 385                         err = EINVAL;
 386                         nce_refrele(nce);
 387                         nce = NULL;
 388                         goto bail;
 389                 }
 390                 rw_exit(&ipst->ips_ill_g_lock);
 391                 upper_nce = nce;
 392                 nce = under_nce; /* will be returned to caller */
 393                 if (NCE_ISREACHABLE(nce->nce_common))
 394                         nce_fastpath_trigger(under_nce);
 395         }
 396         /* nce_refrele is deferred until the lock is dropped  */
 397         if (nce != NULL) {
 398                 if (newnce != NULL)
 399                         *newnce = nce;
 400                 else
 401                         nce_refrele(nce);
 402         }
 403 bail:
 404         if (upper_nce != NULL)
 405                 nce_refrele(upper_nce);
 406         if (need_ill_refrele)
 407                 ill_refrele(ill);
 408         return (err);
 409 }
 410 
 411 /*
 412  * Remove all the CONDEMNED nces from the appropriate hash table.
 413  * We create a private list of NCEs, these may have ires pointing
 414  * to them, so the list will be passed through to clean up dependent
 415  * ires and only then we can do ncec_refrele() which can make NCE inactive.
 416  */
 417 static void
 418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
 419 {
 420         ncec_t *ncec1;
 421         ncec_t **ptpn;
 422 
 423         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
 424         ASSERT(ndp->ndp_g_walker == 0);
 425         for (; ncec; ncec = ncec1) {
 426                 ncec1 = ncec->ncec_next;
 427                 mutex_enter(&ncec->ncec_lock);
 428                 if (NCE_ISCONDEMNED(ncec)) {
 429                         ptpn = ncec->ncec_ptpn;
 430                         ncec1 = ncec->ncec_next;
 431                         if (ncec1 != NULL)
 432                                 ncec1->ncec_ptpn = ptpn;
 433                         *ptpn = ncec1;
 434                         ncec->ncec_ptpn = NULL;
 435                         ncec->ncec_next = NULL;
 436                         ncec->ncec_next = *free_nce_list;
 437                         *free_nce_list = ncec;
 438                 }
 439                 mutex_exit(&ncec->ncec_lock);
 440         }
 441 }
 442 
 443 /*
 444  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
 445  *    will return this NCE. Also no new timeouts will
 446  *    be started (See nce_restart_timer).
 447  * 2. Cancel any currently running timeouts.
 448  * 3. If there is an ndp walker, return. The walker will do the cleanup.
 449  *    This ensures that walkers see a consistent list of NCEs while walking.
 450  * 4. Otherwise remove the NCE from the list of NCEs
 451  */
 452 void
 453 ncec_delete(ncec_t *ncec)
 454 {
 455         ncec_t  **ptpn;
 456         ncec_t  *ncec1;
 457         int     ipversion = ncec->ncec_ipversion;
 458         ndp_g_t *ndp;
 459         ip_stack_t      *ipst = ncec->ncec_ipst;
 460 
 461         if (ipversion == IPV4_VERSION)
 462                 ndp = ipst->ips_ndp4;
 463         else
 464                 ndp = ipst->ips_ndp6;
 465 
 466         /* Serialize deletes */
 467         mutex_enter(&ncec->ncec_lock);
 468         if (NCE_ISCONDEMNED(ncec)) {
 469                 /* Some other thread is doing the delete */
 470                 mutex_exit(&ncec->ncec_lock);
 471                 return;
 472         }
 473         /*
 474          * Caller has a refhold. Also 1 ref for being in the list. Thus
 475          * refcnt has to be >= 2
 476          */
 477         ASSERT(ncec->ncec_refcnt >= 2);
 478         ncec->ncec_flags |= NCE_F_CONDEMNED;
 479         mutex_exit(&ncec->ncec_lock);
 480 
 481         /* Count how many condemned ires for kmem_cache callback */
 482         atomic_inc_32(&ipst->ips_num_nce_condemned);
 483         nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
 484 
 485         /* Complete any waiting callbacks */
 486         ncec_cb_dispatch(ncec);
 487 
 488         /*
 489          * Cancel any running timer. Timeout can't be restarted
 490          * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
 491          * Passing invalid timeout id is fine.
 492          */
 493         if (ncec->ncec_timeout_id != 0) {
 494                 (void) untimeout(ncec->ncec_timeout_id);
 495                 ncec->ncec_timeout_id = 0;
 496         }
 497 
 498         mutex_enter(&ndp->ndp_g_lock);
 499         if (ncec->ncec_ptpn == NULL) {
 500                 /*
 501                  * The last ndp walker has already removed this ncec from
 502                  * the list after we marked the ncec CONDEMNED and before
 503                  * we grabbed the global lock.
 504                  */
 505                 mutex_exit(&ndp->ndp_g_lock);
 506                 return;
 507         }
 508         if (ndp->ndp_g_walker > 0) {
 509                 /*
 510                  * Can't unlink. The walker will clean up
 511                  */
 512                 ndp->ndp_g_walker_cleanup = B_TRUE;
 513                 mutex_exit(&ndp->ndp_g_lock);
 514                 return;
 515         }
 516 
 517         /*
 518          * Now remove the ncec from the list. nce_restart_timer won't restart
 519          * the timer since it is marked CONDEMNED.
 520          */
 521         ptpn = ncec->ncec_ptpn;
 522         ncec1 = ncec->ncec_next;
 523         if (ncec1 != NULL)
 524                 ncec1->ncec_ptpn = ptpn;
 525         *ptpn = ncec1;
 526         ncec->ncec_ptpn = NULL;
 527         ncec->ncec_next = NULL;
 528         mutex_exit(&ndp->ndp_g_lock);
 529 
 530         /* Removed from ncec_ptpn/ncec_next list */
 531         ncec_refrele_notr(ncec);
 532 }
 533 
 534 void
 535 ncec_inactive(ncec_t *ncec)
 536 {
 537         mblk_t          **mpp;
 538         ill_t           *ill = ncec->ncec_ill;
 539         ip_stack_t      *ipst = ncec->ncec_ipst;
 540 
 541         ASSERT(ncec->ncec_refcnt == 0);
 542         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
 543 
 544         /* Count how many condemned nces for kmem_cache callback */
 545         if (NCE_ISCONDEMNED(ncec))
 546                 atomic_add_32(&ipst->ips_num_nce_condemned, -1);
 547 
 548         /* Free all allocated messages */
 549         mpp = &ncec->ncec_qd_mp;
 550         while (*mpp != NULL) {
 551                 mblk_t  *mp;
 552 
 553                 mp = *mpp;
 554                 *mpp = mp->b_next;
 555 
 556                 inet_freemsg(mp);
 557         }
 558         /*
 559          * must have been cleaned up in ncec_delete
 560          */
 561         ASSERT(list_is_empty(&ncec->ncec_cb));
 562         list_destroy(&ncec->ncec_cb);
 563         /*
 564          * free the ncec_lladdr if one was allocated in nce_add_common()
 565          */
 566         if (ncec->ncec_lladdr_length > 0)
 567                 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
 568 
 569 #ifdef DEBUG
 570         ncec_trace_cleanup(ncec);
 571 #endif
 572 
 573         mutex_enter(&ill->ill_lock);
 574         DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
 575             (char *), "ncec", (void *), ncec);
 576         ill->ill_ncec_cnt--;
 577         ncec->ncec_ill = NULL;
 578         /*
 579          * If the number of ncec's associated with this ill have dropped
 580          * to zero, check whether we need to restart any operation that
 581          * is waiting for this to happen.
 582          */
 583         if (ILL_DOWN_OK(ill)) {
 584                 /* ipif_ill_refrele_tail drops the ill_lock */
 585                 ipif_ill_refrele_tail(ill);
 586         } else {
 587                 mutex_exit(&ill->ill_lock);
 588         }
 589 
 590         mutex_destroy(&ncec->ncec_lock);
 591         kmem_cache_free(ncec_cache, ncec);
 592 }
 593 
 594 /*
 595  * ncec_walk routine.  Delete the ncec if it is associated with the ill
 596  * that is going away.  Always called as a writer.
 597  */
 598 void
 599 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
 600 {
 601         if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
 602                 ncec_delete(ncec);
 603         }
 604 }
 605 
 606 /*
 607  * Neighbor Cache cleanup logic for a list of ncec_t entries.
 608  */
 609 static void
 610 nce_cleanup_list(ncec_t *ncec)
 611 {
 612         ncec_t *ncec_next;
 613 
 614         ASSERT(ncec != NULL);
 615         while (ncec != NULL) {
 616                 ncec_next = ncec->ncec_next;
 617                 ncec->ncec_next = NULL;
 618 
 619                 /*
 620                  * It is possible for the last ndp walker (this thread)
 621                  * to come here after ncec_delete has marked the ncec CONDEMNED
 622                  * and before it has removed the ncec from the fastpath list
 623                  * or called untimeout. So we need to do it here. It is safe
 624                  * for both ncec_delete and this thread to do it twice or
 625                  * even simultaneously since each of the threads has a
 626                  * reference on the ncec.
 627                  */
 628                 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
 629                 /*
 630                  * Cancel any running timer. Timeout can't be restarted
 631                  * since CONDEMNED is set. The ncec_lock can't be
 632                  * held across untimeout though passing invalid timeout
 633                  * id is fine.
 634                  */
 635                 if (ncec->ncec_timeout_id != 0) {
 636                         (void) untimeout(ncec->ncec_timeout_id);
 637                         ncec->ncec_timeout_id = 0;
 638                 }
 639                 /* Removed from ncec_ptpn/ncec_next list */
 640                 ncec_refrele_notr(ncec);
 641                 ncec = ncec_next;
 642         }
 643 }
 644 
 645 /*
 646  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
 647  */
 648 boolean_t
 649 nce_restart_dad(ncec_t *ncec)
 650 {
 651         boolean_t started;
 652         ill_t *ill, *hwaddr_ill;
 653 
 654         if (ncec == NULL)
 655                 return (B_FALSE);
 656         ill = ncec->ncec_ill;
 657         mutex_enter(&ncec->ncec_lock);
 658         if (ncec->ncec_state == ND_PROBE) {
 659                 mutex_exit(&ncec->ncec_lock);
 660                 started = B_TRUE;
 661         } else if (ncec->ncec_state == ND_REACHABLE) {
 662                 ASSERT(ncec->ncec_lladdr != NULL);
 663                 ncec->ncec_state = ND_PROBE;
 664                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
 665                 /*
 666                  * Slight cheat here: we don't use the initial probe delay
 667                  * for IPv4 in this obscure case.
 668                  */
 669                 mutex_exit(&ncec->ncec_lock);
 670                 if (IS_IPMP(ill)) {
 671                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
 672                             ncec->ncec_lladdr, ncec->ncec_lladdr_length);
 673                 } else {
 674                         hwaddr_ill = ill;
 675                 }
 676                 nce_dad(ncec, hwaddr_ill, B_TRUE);
 677                 started = B_TRUE;
 678         } else {
 679                 mutex_exit(&ncec->ncec_lock);
 680                 started = B_FALSE;
 681         }
 682         return (started);
 683 }
 684 
 685 /*
 686  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 687  * If one is found, the refcnt on the ncec will be incremented.
 688  */
 689 ncec_t *
 690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
 691 {
 692         ncec_t          *ncec;
 693         ip_stack_t      *ipst = ill->ill_ipst;
 694 
 695         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 696         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 697 
 698         /* Get head of v6 hash table */
 699         ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
 700         ncec = ncec_lookup_illgrp(ill, addr, ncec);
 701         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 702         rw_exit(&ipst->ips_ill_g_lock);
 703         return (ncec);
 704 }
 705 /*
 706  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 707  * If one is found, the refcnt on the ncec will be incremented.
 708  */
 709 ncec_t *
 710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
 711 {
 712         ncec_t  *ncec = NULL;
 713         in6_addr_t addr6;
 714         ip_stack_t *ipst = ill->ill_ipst;
 715 
 716         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 717         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 718 
 719         /* Get head of v4 hash table */
 720         ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
 721         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
 722         ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
 723         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 724         rw_exit(&ipst->ips_ill_g_lock);
 725         return (ncec);
 726 }
 727 
 728 /*
 729  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
 730  * If an ncec is found, increment the hold count on that ncec.
 731  * The caller passes in the start of the appropriate hash table, and must
 732  * be holding the appropriate global lock (ndp_g_lock). In addition, since
 733  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
 734  * must be held as reader.
 735  *
 736  * This function always matches across the ipmp group.
 737  */
 738 ncec_t *
 739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
 740 {
 741         ndp_g_t         *ndp;
 742         ip_stack_t      *ipst = ill->ill_ipst;
 743 
 744         if (ill->ill_isv6)
 745                 ndp = ipst->ips_ndp6;
 746         else
 747                 ndp = ipst->ips_ndp4;
 748 
 749         ASSERT(ill != NULL);
 750         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
 751         if (IN6_IS_ADDR_UNSPECIFIED(addr))
 752                 return (NULL);
 753         for (; ncec != NULL; ncec = ncec->ncec_next) {
 754                 if (ncec->ncec_ill == ill ||
 755                     IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
 756                         if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
 757                                 mutex_enter(&ncec->ncec_lock);
 758                                 if (!NCE_ISCONDEMNED(ncec)) {
 759                                         ncec_refhold_locked(ncec);
 760                                         mutex_exit(&ncec->ncec_lock);
 761                                         break;
 762                                 }
 763                                 mutex_exit(&ncec->ncec_lock);
 764                         }
 765                 }
 766         }
 767         return (ncec);
 768 }
 769 
 770 /*
 771  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 772  * entries for ill only, i.e., when ill is part of an ipmp group,
 773  * nce_lookup_v4 will never try to match across the group.
 774  */
 775 nce_t *
 776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
 777 {
 778         nce_t *nce;
 779         in6_addr_t addr6;
 780         ip_stack_t *ipst = ill->ill_ipst;
 781 
 782         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 783         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
 784         nce = nce_lookup_addr(ill, &addr6);
 785         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 786         return (nce);
 787 }
 788 
 789 /*
 790  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 791  * entries for ill only, i.e., when ill is part of an ipmp group,
 792  * nce_lookup_v6 will never try to match across the group.
 793  */
 794 nce_t *
 795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
 796 {
 797         nce_t *nce;
 798         ip_stack_t *ipst = ill->ill_ipst;
 799 
 800         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 801         nce = nce_lookup_addr(ill, addr6);
 802         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 803         return (nce);
 804 }
 805 
 806 static nce_t *
 807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
 808 {
 809         nce_t *nce;
 810 
 811         ASSERT(ill != NULL);
 812 #ifdef DEBUG
 813         if (ill->ill_isv6)
 814                 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
 815         else
 816                 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
 817 #endif
 818         mutex_enter(&ill->ill_lock);
 819         nce = nce_lookup(ill, addr);
 820         mutex_exit(&ill->ill_lock);
 821         return (nce);
 822 }
 823 
 824 
 825 /*
 826  * Router turned to host.  We need to make sure that cached copies of the ncec
 827  * are not used for forwarding packets if they were derived from the default
 828  * route, and that the default route itself is removed, as  required by
 829  * section 7.2.5 of RFC 2461.
 830  *
 831  * Note that the ncec itself probably has valid link-layer information for the
 832  * nexthop, so that there is no reason to delete the ncec, as long as the
 833  * ISROUTER flag is turned off.
 834  */
 835 static void
 836 ncec_router_to_host(ncec_t *ncec)
 837 {
 838         ire_t           *ire;
 839         ip_stack_t      *ipst = ncec->ncec_ipst;
 840 
 841         mutex_enter(&ncec->ncec_lock);
 842         ncec->ncec_flags &= ~NCE_F_ISROUTER;
 843         mutex_exit(&ncec->ncec_lock);
 844 
 845         ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
 846             &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
 847             MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
 848         if (ire != NULL) {
 849                 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
 850                 ire_delete(ire);
 851                 ire_refrele(ire);
 852         }
 853 }
 854 
 855 /*
 856  * Process passed in parameters either from an incoming packet or via
 857  * user ioctl.
 858  */
 859 void
 860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
 861 {
 862         ill_t   *ill = ncec->ncec_ill;
 863         uint32_t hw_addr_len = ill->ill_phys_addr_length;
 864         boolean_t ll_updated = B_FALSE;
 865         boolean_t ll_changed;
 866         nce_t   *nce;
 867 
 868         ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
 869         /*
 870          * No updates of link layer address or the neighbor state is
 871          * allowed, when the cache is in NONUD state.  This still
 872          * allows for responding to reachability solicitation.
 873          */
 874         mutex_enter(&ncec->ncec_lock);
 875         if (ncec->ncec_state == ND_INCOMPLETE) {
 876                 if (hw_addr == NULL) {
 877                         mutex_exit(&ncec->ncec_lock);
 878                         return;
 879                 }
 880                 nce_set_ll(ncec, hw_addr);
 881                 /*
 882                  * Update ncec state and send the queued packets
 883                  * back to ip this time ire will be added.
 884                  */
 885                 if (flag & ND_NA_FLAG_SOLICITED) {
 886                         nce_update(ncec, ND_REACHABLE, NULL);
 887                 } else {
 888                         nce_update(ncec, ND_STALE, NULL);
 889                 }
 890                 mutex_exit(&ncec->ncec_lock);
 891                 nce = nce_fastpath(ncec, B_TRUE, NULL);
 892                 nce_resolv_ok(ncec);
 893                 if (nce != NULL)
 894                         nce_refrele(nce);
 895                 return;
 896         }
 897         ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
 898         if (!is_adv) {
 899                 /* If this is a SOLICITATION request only */
 900                 if (ll_changed)
 901                         nce_update(ncec, ND_STALE, hw_addr);
 902                 mutex_exit(&ncec->ncec_lock);
 903                 ncec_cb_dispatch(ncec);
 904                 return;
 905         }
 906         if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
 907                 /* If in any other state than REACHABLE, ignore */
 908                 if (ncec->ncec_state == ND_REACHABLE) {
 909                         nce_update(ncec, ND_STALE, NULL);
 910                 }
 911                 mutex_exit(&ncec->ncec_lock);
 912                 ncec_cb_dispatch(ncec);
 913                 return;
 914         } else {
 915                 if (ll_changed) {
 916                         nce_update(ncec, ND_UNCHANGED, hw_addr);
 917                         ll_updated = B_TRUE;
 918                 }
 919                 if (flag & ND_NA_FLAG_SOLICITED) {
 920                         nce_update(ncec, ND_REACHABLE, NULL);
 921                 } else {
 922                         if (ll_updated) {
 923                                 nce_update(ncec, ND_STALE, NULL);
 924                         }
 925                 }
 926                 mutex_exit(&ncec->ncec_lock);
 927                 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
 928                     NCE_F_ISROUTER)) {
 929                         ncec_router_to_host(ncec);
 930                 } else {
 931                         ncec_cb_dispatch(ncec);
 932                 }
 933         }
 934 }
 935 
 936 /*
 937  * Pass arg1 to the pfi supplied, along with each ncec in existence.
 938  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 939  * walking the hash list.
 940  */
 941 void
 942 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
 943     boolean_t trace)
 944 {
 945         ncec_t  *ncec;
 946         ncec_t  *ncec1;
 947         ncec_t  **ncep;
 948         ncec_t  *free_nce_list = NULL;
 949 
 950         mutex_enter(&ndp->ndp_g_lock);
 951         /* Prevent ncec_delete from unlink and free of NCE */
 952         ndp->ndp_g_walker++;
 953         mutex_exit(&ndp->ndp_g_lock);
 954         for (ncep = ndp->nce_hash_tbl;
 955             ncep < A_END(ndp->nce_hash_tbl); ncep++) {
 956                 for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
 957                         ncec1 = ncec->ncec_next;
 958                         if (ill == NULL || ncec->ncec_ill == ill) {
 959                                 if (trace) {
 960                                         ncec_refhold(ncec);
 961                                         (*pfi)(ncec, arg1);
 962                                         ncec_refrele(ncec);
 963                                 } else {
 964                                         ncec_refhold_notr(ncec);
 965                                         (*pfi)(ncec, arg1);
 966                                         ncec_refrele_notr(ncec);
 967                                 }
 968                         }
 969                 }
 970         }
 971         mutex_enter(&ndp->ndp_g_lock);
 972         ndp->ndp_g_walker--;
 973         if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
 974                 /* Time to delete condemned entries */
 975                 for (ncep = ndp->nce_hash_tbl;
 976                     ncep < A_END(ndp->nce_hash_tbl); ncep++) {
 977                         ncec = *ncep;
 978                         if (ncec != NULL) {
 979                                 nce_remove(ndp, ncec, &free_nce_list);
 980                         }
 981                 }
 982                 ndp->ndp_g_walker_cleanup = B_FALSE;
 983         }
 984 
 985         mutex_exit(&ndp->ndp_g_lock);
 986 
 987         if (free_nce_list != NULL) {
 988                 nce_cleanup_list(free_nce_list);
 989         }
 990 }
 991 
 992 /*
 993  * Walk everything.
 994  * Note that ill can be NULL hence can't derive the ipst from it.
 995  */
 996 void
 997 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
 998 {
 999         ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1000         ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1001 }
1002 
1003 /*
1004  * For each interface an entry is added for the unspecified multicast group.
1005  * Here that mapping is used to form the multicast cache entry for a particular
1006  * multicast destination.
1007  */
1008 static int
1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1010     uint16_t flags, nce_t **newnce)
1011 {
1012         uchar_t         *hw_addr;
1013         int             err = 0;
1014         ip_stack_t      *ipst = ill->ill_ipst;
1015         nce_t           *nce;
1016 
1017         ASSERT(ill != NULL);
1018         ASSERT(ill->ill_isv6);
1019         ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1020 
1021         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1022         nce = nce_lookup_addr(ill, dst);
1023         if (nce != NULL) {
1024                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1025                 goto done;
1026         }
1027         if (ill->ill_net_type == IRE_IF_RESOLVER) {
1028                 /*
1029                  * For IRE_IF_RESOLVER a hardware mapping can be
1030                  * generated.
1031                  */
1032                 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1033                 if (hw_addr == NULL) {
1034                         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1035                         return (ENOMEM);
1036                 }
1037                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1038         } else {
1039                 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1040                 hw_addr = NULL;
1041         }
1042         ASSERT((flags & NCE_F_MCAST) != 0);
1043         ASSERT((flags & NCE_F_NONUD) != 0);
1044         /* nce_state will be computed by nce_add_common() */
1045         err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1046             ND_UNCHANGED, &nce);
1047         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1048         if (err == 0)
1049                 err = nce_add_v6_postprocess(nce);
1050         if (hw_addr != NULL)
1051                 kmem_free(hw_addr, ill->ill_nd_lla_len);
1052         if (err != 0) {
1053                 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1054                 return (err);
1055         }
1056 done:
1057         ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1058         if (newnce != NULL)
1059                 *newnce = nce;
1060         else
1061                 nce_refrele(nce);
1062         return (0);
1063 }
1064 
1065 /*
1066  * Return the link layer address, and any flags of a ncec.
1067  */
1068 int
1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1070 {
1071         ncec_t          *ncec;
1072         in6_addr_t      *addr;
1073         sin6_t          *sin6;
1074 
1075         ASSERT(ill != NULL && ill->ill_isv6);
1076         sin6 = (sin6_t *)&lnr->lnr_addr;
1077         addr =  &sin6->sin6_addr;
1078 
1079         /*
1080          * NOTE: if the ill is an IPMP interface, then match against the whole
1081          * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1082          * addresses for the data addresses on an IPMP interface even though
1083          * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1084          */
1085         ncec = ncec_lookup_illgrp_v6(ill, addr);
1086         if (ncec == NULL)
1087                 return (ESRCH);
1088         /* If no link layer address is available yet, return ESRCH */
1089         if (!NCE_ISREACHABLE(ncec)) {
1090                 ncec_refrele(ncec);
1091                 return (ESRCH);
1092         }
1093         lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1094         bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1095             lnr->lnr_hdw_len);
1096         if (ncec->ncec_flags & NCE_F_ISROUTER)
1097                 lnr->lnr_flags = NDF_ISROUTER_ON;
1098         if (ncec->ncec_flags & NCE_F_ANYCAST)
1099                 lnr->lnr_flags |= NDF_ANYCAST_ON;
1100         ncec_refrele(ncec);
1101         return (0);
1102 }
1103 
1104 /*
1105  * Finish setting up the Enable/Disable multicast for the driver.
1106  */
1107 mblk_t *
1108 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1109     uint32_t hw_addr_offset, mblk_t *mp)
1110 {
1111         uchar_t         *hw_addr;
1112         ipaddr_t        v4group;
1113         uchar_t         *addr;
1114 
1115         ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1116         if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1117                 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1118 
1119                 ASSERT(CLASSD(v4group));
1120                 ASSERT(!(ill->ill_isv6));
1121 
1122                 addr = (uchar_t *)&v4group;
1123         } else {
1124                 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1125                 ASSERT(ill->ill_isv6);
1126 
1127                 addr = (uchar_t *)v6group;
1128         }
1129         hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1130         if (hw_addr == NULL) {
1131                 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1132                 freemsg(mp);
1133                 return (NULL);
1134         }
1135 
1136         ip_mcast_mapping(ill, addr, hw_addr);
1137         return (mp);
1138 }
1139 
1140 void
1141 ip_ndp_resolve(ncec_t *ncec)
1142 {
1143         in_addr_t       sender4 = INADDR_ANY;
1144         in6_addr_t      sender6 = ipv6_all_zeros;
1145         ill_t           *src_ill;
1146         uint32_t        ms;
1147 
1148         src_ill = nce_resolve_src(ncec, &sender6);
1149         if (src_ill == NULL) {
1150                 /* Make sure we try again later */
1151                 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1152                 nce_restart_timer(ncec, (clock_t)ms);
1153                 return;
1154         }
1155         if (ncec->ncec_ipversion == IPV4_VERSION)
1156                 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1157         mutex_enter(&ncec->ncec_lock);
1158         if (ncec->ncec_ipversion == IPV6_VERSION)
1159                 ms = ndp_solicit(ncec, sender6, src_ill);
1160         else
1161                 ms = arp_request(ncec, sender4, src_ill);
1162         mutex_exit(&ncec->ncec_lock);
1163         if (ms == 0) {
1164                 if (ncec->ncec_state != ND_REACHABLE) {
1165                         if (ncec->ncec_ipversion == IPV6_VERSION)
1166                                 ndp_resolv_failed(ncec);
1167                         else
1168                                 arp_resolv_failed(ncec);
1169                         ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1170                         nce_make_unreachable(ncec);
1171                         ncec_delete(ncec);
1172                 }
1173         } else {
1174                 nce_restart_timer(ncec, (clock_t)ms);
1175         }
1176 done:
1177         ill_refrele(src_ill);
1178 }
1179 
1180 /*
1181  * Send an IPv6 neighbor solicitation.
1182  * Returns number of milliseconds after which we should either rexmit or abort.
1183  * Return of zero means we should abort.
1184  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1185  * The optional source address is used as a hint to ndp_solicit for
1186  * which source to use in the packet.
1187  *
1188  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1189  * the packet.
1190  */
1191 uint32_t
1192 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1193 {
1194         in6_addr_t      dst;
1195         boolean_t       dropped = B_FALSE;
1196 
1197         ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1198         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1199 
1200         if (ncec->ncec_rcnt == 0)
1201                 return (0);
1202 
1203         dst = ncec->ncec_addr;
1204         ncec->ncec_rcnt--;
1205         mutex_exit(&ncec->ncec_lock);
1206         dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1207             ill->ill_phys_addr_length, &src, &dst, 0);
1208         mutex_enter(&ncec->ncec_lock);
1209         if (dropped)
1210                 ncec->ncec_rcnt++;
1211         return (ncec->ncec_ill->ill_reachable_retrans_time);
1212 }
1213 
1214 /*
1215  * Attempt to recover an address on an interface that's been marked as a
1216  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1217  * no easy way to just probe the address and have the right thing happen if
1218  * it's no longer in use.  Instead, we just bring it up normally and allow the
1219  * regular interface start-up logic to probe for a remaining duplicate and take
1220  * us back down if necessary.
1221  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1222  * ip_ndp_excl.
1223  */
1224 /* ARGSUSED */
1225 void
1226 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1227 {
1228         ill_t   *ill = rq->q_ptr;
1229         ipif_t  *ipif;
1230         in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1231         in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1232         boolean_t addr_equal;
1233 
1234         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1235                 /*
1236                  * We do not support recovery of proxy ARP'd interfaces,
1237                  * because the system lacks a complete proxy ARP mechanism.
1238                  */
1239                 if (ill->ill_isv6) {
1240                         addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1241                             addr6);
1242                 } else {
1243                         addr_equal = (ipif->ipif_lcl_addr == *addr4);
1244                 }
1245 
1246                 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1247                         continue;
1248 
1249                 /*
1250                  * If we have already recovered or if the interface is going
1251                  * away, then ignore.
1252                  */
1253                 mutex_enter(&ill->ill_lock);
1254                 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1255                     (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1256                         mutex_exit(&ill->ill_lock);
1257                         continue;
1258                 }
1259 
1260                 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1261                 ill->ill_ipif_dup_count--;
1262                 mutex_exit(&ill->ill_lock);
1263                 ipif->ipif_was_dup = B_TRUE;
1264 
1265                 if (ill->ill_isv6) {
1266                         VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1267                         (void) ipif_up_done_v6(ipif);
1268                 } else {
1269                         VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1270                             EINPROGRESS);
1271                         (void) ipif_up_done(ipif);
1272                 }
1273         }
1274         freeb(mp);
1275 }
1276 
1277 /*
1278  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1279  * As long as someone else holds the address, the interface will stay down.
1280  * When that conflict goes away, the interface is brought back up.  This is
1281  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1282  * server will recover from a failure.
1283  *
1284  * For DHCP and temporary addresses, recovery is not done in the kernel.
1285  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1286  *
1287  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1288  */
1289 void
1290 ipif_dup_recovery(void *arg)
1291 {
1292         ipif_t *ipif = arg;
1293 
1294         ipif->ipif_recovery_id = 0;
1295         if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1296                 return;
1297 
1298         /*
1299          * No lock, because this is just an optimization.
1300          */
1301         if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1302                 return;
1303 
1304         /* If the link is down, we'll retry this later */
1305         if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1306                 return;
1307 
1308         ipif_do_recovery(ipif);
1309 }
1310 
1311 /*
1312  * Perform interface recovery by forcing the duplicate interfaces up and
1313  * allowing the system to determine which ones should stay up.
1314  *
1315  * Called both by recovery timer expiry and link-up notification.
1316  */
1317 void
1318 ipif_do_recovery(ipif_t *ipif)
1319 {
1320         ill_t *ill = ipif->ipif_ill;
1321         mblk_t *mp;
1322         ip_stack_t *ipst = ill->ill_ipst;
1323         size_t mp_size;
1324 
1325         if (ipif->ipif_isv6)
1326                 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1327         else
1328                 mp_size = sizeof (ipif->ipif_lcl_addr);
1329         mp = allocb(mp_size, BPRI_MED);
1330         if (mp == NULL) {
1331                 mutex_enter(&ill->ill_lock);
1332                 if (ipst->ips_ip_dup_recovery > 0 &&
1333                     ipif->ipif_recovery_id == 0 &&
1334                     !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1335                         ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1336                             ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1337                 }
1338                 mutex_exit(&ill->ill_lock);
1339         } else {
1340                 /*
1341                  * A recovery timer may still be running if we got here from
1342                  * ill_restart_dad(); cancel that timer.
1343                  */
1344                 if (ipif->ipif_recovery_id != 0)
1345                         (void) untimeout(ipif->ipif_recovery_id);
1346                 ipif->ipif_recovery_id = 0;
1347 
1348                 if (ipif->ipif_isv6) {
1349                         bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1350                             sizeof (ipif->ipif_v6lcl_addr));
1351                 } else  {
1352                         bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1353                             sizeof (ipif->ipif_lcl_addr));
1354                 }
1355                 ill_refhold(ill);
1356                 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1357                     B_FALSE);
1358         }
1359 }
1360 
1361 /*
1362  * Find the MAC and IP addresses in an NA/NS message.
1363  */
1364 static void
1365 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1366     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1367 {
1368         icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1369         nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1370         uchar_t *addr;
1371         int alen;
1372 
1373         /* icmp_inbound_v6 ensures this */
1374         ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1375 
1376         addr = ira->ira_l2src;
1377         alen = ill->ill_phys_addr_length;
1378         if (alen > 0) {
1379                 *haddr = addr;
1380                 *haddrlenp = alen;
1381         } else {
1382                 *haddr = NULL;
1383                 *haddrlenp = 0;
1384         }
1385 
1386         /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1387         *targp = ns->nd_ns_target;
1388 }
1389 
1390 /*
1391  * This is for exclusive changes due to NDP duplicate address detection
1392  * failure.
1393  */
1394 /* ARGSUSED */
1395 static void
1396 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1397 {
1398         ill_t   *ill = rq->q_ptr;
1399         ipif_t  *ipif;
1400         uchar_t *haddr;
1401         uint_t  haddrlen;
1402         ip_stack_t *ipst = ill->ill_ipst;
1403         in6_addr_t targ;
1404         ip_recv_attr_t iras;
1405         mblk_t  *attrmp;
1406 
1407         attrmp = mp;
1408         mp = mp->b_cont;
1409         attrmp->b_cont = NULL;
1410         if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1411                 /* The ill or ip_stack_t disappeared on us */
1412                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1413                 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1414                 freemsg(mp);
1415                 ira_cleanup(&iras, B_TRUE);
1416                 return;
1417         }
1418 
1419         ASSERT(ill == iras.ira_rill);
1420 
1421         ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1422         if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1423                 /*
1424                  * Ignore conflicts generated by misbehaving switches that
1425                  * just reflect our own messages back to us.  For IPMP, we may
1426                  * see reflections across any ill in the illgrp.
1427                  *
1428                  * RFC2462 and revisions tried to detect both the case
1429                  * when a statically configured IPv6 address is a duplicate,
1430                  * and the case when the L2 address itself is a duplicate. The
1431                  * later is important because, with stateles address autoconf,
1432                  * if the L2 address is a duplicate, the resulting IPv6
1433                  * address(es) would also be duplicates. We rely on DAD of the
1434                  * IPv6 address itself to detect the latter case.
1435                  */
1436                 /* For an under ill_grp can change under lock */
1437                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1438                 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1439                     IS_UNDER_IPMP(ill) &&
1440                     ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1441                     haddrlen) != NULL) {
1442                         rw_exit(&ipst->ips_ill_g_lock);
1443                         goto ignore_conflict;
1444                 }
1445                 rw_exit(&ipst->ips_ill_g_lock);
1446         }
1447 
1448         /*
1449          * Look up the appropriate ipif.
1450          */
1451         ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1452         if (ipif == NULL)
1453                 goto ignore_conflict;
1454 
1455         /* Reload the ill to match the ipif */
1456         ill = ipif->ipif_ill;
1457 
1458         /* If it's already duplicate or ineligible, then don't do anything. */
1459         if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1460                 ipif_refrele(ipif);
1461                 goto ignore_conflict;
1462         }
1463 
1464         /*
1465          * If this is a failure during duplicate recovery, then don't
1466          * complain.  It may take a long time to recover.
1467          */
1468         if (!ipif->ipif_was_dup) {
1469                 char ibuf[LIFNAMSIZ];
1470                 char hbuf[MAC_STR_LEN];
1471                 char sbuf[INET6_ADDRSTRLEN];
1472 
1473                 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1474                 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1475                     " disabled", ibuf,
1476                     inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1477                     mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1478         }
1479         mutex_enter(&ill->ill_lock);
1480         ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1481         ipif->ipif_flags |= IPIF_DUPLICATE;
1482         ill->ill_ipif_dup_count++;
1483         mutex_exit(&ill->ill_lock);
1484         (void) ipif_down(ipif, NULL, NULL);
1485         (void) ipif_down_tail(ipif);
1486         mutex_enter(&ill->ill_lock);
1487         if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1488             ill->ill_net_type == IRE_IF_RESOLVER &&
1489             !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1490             ipst->ips_ip_dup_recovery > 0) {
1491                 ASSERT(ipif->ipif_recovery_id == 0);
1492                 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1493                     ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1494         }
1495         mutex_exit(&ill->ill_lock);
1496         ipif_refrele(ipif);
1497 
1498 ignore_conflict:
1499         freemsg(mp);
1500         ira_cleanup(&iras, B_TRUE);
1501 }
1502 
1503 /*
1504  * Handle failure by tearing down the ipifs with the specified address.  Note
1505  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1506  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1507  * we start a timer on the ipif.
1508  * Caller has to free mp;
1509  */
1510 static void
1511 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1512 {
1513         const uchar_t   *haddr;
1514         ill_t           *ill = ira->ira_rill;
1515 
1516         /*
1517          * Ignore conflicts generated by misbehaving switches that just
1518          * reflect our own messages back to us.
1519          */
1520 
1521         /* icmp_inbound_v6 ensures this */
1522         ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1523         haddr = ira->ira_l2src;
1524         if (haddr != NULL &&
1525             bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1526                 return;
1527         }
1528 
1529         if ((mp = copymsg(mp)) != NULL) {
1530                 mblk_t  *attrmp;
1531 
1532                 attrmp = ip_recv_attr_to_mblk(ira);
1533                 if (attrmp == NULL) {
1534                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1535                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1536                         freemsg(mp);
1537                 } else {
1538                         ASSERT(attrmp->b_cont == NULL);
1539                         attrmp->b_cont = mp;
1540                         mp = attrmp;
1541                         ill_refhold(ill);
1542                         qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1543                             B_FALSE);
1544                 }
1545         }
1546 }
1547 
1548 /*
1549  * Handle a discovered conflict: some other system is advertising that it owns
1550  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1551  * interface.
1552  *
1553  * Handles both IPv4 and IPv6
1554  */
1555 boolean_t
1556 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1557 {
1558         ipif_t          *ipif;
1559         clock_t         now;
1560         uint_t          maxdefense;
1561         uint_t          defs;
1562         ill_t           *ill = ira->ira_ill;
1563         ip_stack_t      *ipst = ill->ill_ipst;
1564         uint32_t        elapsed;
1565         boolean_t       isv6 = ill->ill_isv6;
1566         ipaddr_t        ncec_addr;
1567 
1568         if (isv6) {
1569                 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1570                     ipst);
1571         } else {
1572                 if (arp_no_defense) {
1573                         /*
1574                          * Yes, there is a conflict, but no, we do not
1575                          * defend ourself.
1576                          */
1577                         return (B_TRUE);
1578                 }
1579                 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1580                 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1581                     ipst);
1582         }
1583         if (ipif == NULL)
1584                 return (B_FALSE);
1585 
1586         /*
1587          * First, figure out if this address is disposable.
1588          */
1589         if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1590                 maxdefense = ipst->ips_ip_max_temp_defend;
1591         else
1592                 maxdefense = ipst->ips_ip_max_defend;
1593 
1594         /*
1595          * Now figure out how many times we've defended ourselves.  Ignore
1596          * defenses that happened long in the past.
1597          */
1598         now = ddi_get_lbolt();
1599         elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1600         mutex_enter(&ncec->ncec_lock);
1601         if ((defs = ncec->ncec_defense_count) > 0 &&
1602             elapsed > ipst->ips_ip_defend_interval) {
1603                 /*
1604                  * ip_defend_interval has elapsed.
1605                  * reset the defense count.
1606                  */
1607                 ncec->ncec_defense_count = defs = 0;
1608         }
1609         ncec->ncec_defense_count++;
1610         ncec->ncec_last_time_defended = now;
1611         mutex_exit(&ncec->ncec_lock);
1612         ipif_refrele(ipif);
1613 
1614         /*
1615          * If we've defended ourselves too many times already, then give up and
1616          * tear down the interface(s) using this address.
1617          * Otherwise, caller has to defend by sending out an announce.
1618          */
1619         if (defs >= maxdefense) {
1620                 if (isv6)
1621                         ndp_failure(mp, ira);
1622                 else
1623                         arp_failure(mp, ira);
1624         } else {
1625                 return (B_TRUE); /* caller must defend this address */
1626         }
1627         return (B_FALSE);
1628 }
1629 
1630 /*
1631  * Handle reception of Neighbor Solicitation messages.
1632  */
1633 static void
1634 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1635 {
1636         ill_t           *ill = ira->ira_ill, *under_ill;
1637         nd_neighbor_solicit_t *ns;
1638         uint32_t        hlen = ill->ill_phys_addr_length;
1639         uchar_t         *haddr = NULL;
1640         icmp6_t         *icmp_nd;
1641         ip6_t           *ip6h;
1642         ncec_t          *our_ncec = NULL;
1643         in6_addr_t      target;
1644         in6_addr_t      src;
1645         int             len;
1646         int             flag = 0;
1647         nd_opt_hdr_t    *opt = NULL;
1648         boolean_t       bad_solicit = B_FALSE;
1649         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
1650         boolean_t       need_ill_refrele = B_FALSE;
1651 
1652         ip6h = (ip6_t *)mp->b_rptr;
1653         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1654         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1655         src = ip6h->ip6_src;
1656         ns = (nd_neighbor_solicit_t *)icmp_nd;
1657         target = ns->nd_ns_target;
1658         if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1659             IN6_IS_ADDR_LOOPBACK(&target)) {
1660                 if (ip_debug > 2) {
1661                         /* ip1dbg */
1662                         pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1663                             AF_INET6, &target);
1664                 }
1665                 bad_solicit = B_TRUE;
1666                 goto done;
1667         }
1668         if (len > sizeof (nd_neighbor_solicit_t)) {
1669                 /* Options present */
1670                 opt = (nd_opt_hdr_t *)&ns[1];
1671                 len -= sizeof (nd_neighbor_solicit_t);
1672                 if (!ndp_verify_optlen(opt, len)) {
1673                         ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1674                         bad_solicit = B_TRUE;
1675                         goto done;
1676                 }
1677         }
1678         if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1679                 /* Check to see if this is a valid DAD solicitation */
1680                 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1681                         if (ip_debug > 2) {
1682                                 /* ip1dbg */
1683                                 pr_addr_dbg("ndp_input_solicit: IPv6 "
1684                                     "Destination is not solicited node "
1685                                     "multicast %s\n", AF_INET6,
1686                                     &ip6h->ip6_dst);
1687                         }
1688                         bad_solicit = B_TRUE;
1689                         goto done;
1690                 }
1691         }
1692 
1693         /*
1694          * NOTE: with IPMP, it's possible the nominated multicast ill (which
1695          * received this packet if it's multicast) is not the ill tied to
1696          * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1697          * to ensure we find the associated NCE.
1698          */
1699         our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1700         /*
1701          * If this is a valid Solicitation for an address we are publishing,
1702          * then a PUBLISH entry should exist in the cache
1703          */
1704         if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1705                 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1706                     "ifname=%s ", ill->ill_name));
1707                 if (ip_debug > 2) {
1708                         /* ip1dbg */
1709                         pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1710                 }
1711                 if (our_ncec == NULL)
1712                         bad_solicit = B_TRUE;
1713                 goto done;
1714         }
1715 
1716         /* At this point we should have a verified NS per spec */
1717         if (opt != NULL) {
1718                 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1719                 if (opt != NULL) {
1720                         haddr = (uchar_t *)&opt[1];
1721                         if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1722                             hlen == 0) {
1723                                 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1724                                 bad_solicit = B_TRUE;
1725                                 goto done;
1726                         }
1727                 }
1728         }
1729 
1730         /* If sending directly to peer, set the unicast flag */
1731         if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1732                 flag |= NDP_UNICAST;
1733 
1734         /*
1735          * Create/update the entry for the soliciting node on the ipmp_ill.
1736          * or respond to outstanding queries, don't if
1737          * the source is unspecified address.
1738          */
1739         if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1740                 int     err;
1741                 nce_t   *nnce;
1742 
1743                 ASSERT(ill->ill_isv6);
1744                 /*
1745                  * Regular solicitations *must* include the Source Link-Layer
1746                  * Address option.  Ignore messages that do not.
1747                  */
1748                 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1749                         ip1dbg(("ndp_input_solicit: source link-layer address "
1750                             "option missing with a specified source.\n"));
1751                         bad_solicit = B_TRUE;
1752                         goto done;
1753                 }
1754 
1755                 /*
1756                  * This is a regular solicitation.  If we're still in the
1757                  * process of verifying the address, then don't respond at all
1758                  * and don't keep track of the sender.
1759                  */
1760                 if (our_ncec->ncec_state == ND_PROBE)
1761                         goto done;
1762 
1763                 /*
1764                  * If the solicitation doesn't have sender hardware address
1765                  * (legal for unicast solicitation), then process without
1766                  * installing the return NCE.  Either we already know it, or
1767                  * we'll be forced to look it up when (and if) we reply to the
1768                  * packet.
1769                  */
1770                 if (haddr == NULL)
1771                         goto no_source;
1772 
1773                 under_ill = ill;
1774                 if (IS_UNDER_IPMP(under_ill)) {
1775                         ill = ipmp_ill_hold_ipmp_ill(under_ill);
1776                         if (ill == NULL)
1777                                 ill = under_ill;
1778                         else
1779                                 need_ill_refrele = B_TRUE;
1780                 }
1781                 err = nce_lookup_then_add_v6(ill,
1782                     haddr, hlen,
1783                     &src,   /* Soliciting nodes address */
1784                     0,
1785                     ND_STALE,
1786                     &nnce);
1787 
1788                 if (need_ill_refrele) {
1789                         ill_refrele(ill);
1790                         ill = under_ill;
1791                         need_ill_refrele =  B_FALSE;
1792                 }
1793                 switch (err) {
1794                 case 0:
1795                         /* done with this entry */
1796                         nce_refrele(nnce);
1797                         break;
1798                 case EEXIST:
1799                         /*
1800                          * B_FALSE indicates this is not an an advertisement.
1801                          */
1802                         nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1803                         nce_refrele(nnce);
1804                         break;
1805                 default:
1806                         ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1807                             err));
1808                         goto done;
1809                 }
1810 no_source:
1811                 flag |= NDP_SOLICITED;
1812         } else {
1813                 /*
1814                  * No source link layer address option should be present in a
1815                  * valid DAD request.
1816                  */
1817                 if (haddr != NULL) {
1818                         ip1dbg(("ndp_input_solicit: source link-layer address "
1819                             "option present with an unspecified source.\n"));
1820                         bad_solicit = B_TRUE;
1821                         goto done;
1822                 }
1823                 if (our_ncec->ncec_state == ND_PROBE) {
1824                         /*
1825                          * Internally looped-back probes will have
1826                          * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1827                          * transmissions.
1828                          */
1829                         if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1830                                 /*
1831                                  * If someone else is probing our address, then
1832                                  * we've crossed wires.  Declare failure.
1833                                  */
1834                                 ndp_failure(mp, ira);
1835                         }
1836                         goto done;
1837                 }
1838                 /*
1839                  * This is a DAD probe.  Multicast the advertisement to the
1840                  * all-nodes address.
1841                  */
1842                 src = ipv6_all_hosts_mcast;
1843         }
1844         flag |= nce_advert_flags(our_ncec);
1845         (void) ndp_xmit(ill,
1846             ND_NEIGHBOR_ADVERT,
1847             our_ncec->ncec_lladdr,
1848             our_ncec->ncec_lladdr_length,
1849             &target,        /* Source and target of the advertisement pkt */
1850             &src,   /* IP Destination (source of original pkt) */
1851             flag);
1852 done:
1853         if (bad_solicit)
1854                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1855         if (our_ncec != NULL)
1856                 ncec_refrele(our_ncec);
1857 }
1858 
1859 /*
1860  * Handle reception of Neighbor Solicitation messages
1861  */
1862 void
1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1864 {
1865         ill_t           *ill = ira->ira_ill;
1866         nd_neighbor_advert_t *na;
1867         uint32_t        hlen = ill->ill_phys_addr_length;
1868         uchar_t         *haddr = NULL;
1869         icmp6_t         *icmp_nd;
1870         ip6_t           *ip6h;
1871         ncec_t          *dst_ncec = NULL;
1872         in6_addr_t      target;
1873         nd_opt_hdr_t    *opt = NULL;
1874         int             len;
1875         ip_stack_t      *ipst = ill->ill_ipst;
1876         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
1877 
1878         ip6h = (ip6_t *)mp->b_rptr;
1879         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1880         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1881         na = (nd_neighbor_advert_t *)icmp_nd;
1882 
1883         if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1884             (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1885                 ip1dbg(("ndp_input_advert: Target is multicast but the "
1886                     "solicited flag is not zero\n"));
1887                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1888                 return;
1889         }
1890         target = na->nd_na_target;
1891         if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1892             IN6_IS_ADDR_LOOPBACK(&target)) {
1893                 if (ip_debug > 2) {
1894                         /* ip1dbg */
1895                         pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1896                             AF_INET6, &target);
1897                 }
1898                 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1899                 return;
1900         }
1901         if (len > sizeof (nd_neighbor_advert_t)) {
1902                 opt = (nd_opt_hdr_t *)&na[1];
1903                 if (!ndp_verify_optlen(opt,
1904                     len - sizeof (nd_neighbor_advert_t))) {
1905                         ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1906                         BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1907                         return;
1908                 }
1909                 /* At this point we have a verified NA per spec */
1910                 len -= sizeof (nd_neighbor_advert_t);
1911                 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1912                 if (opt != NULL) {
1913                         haddr = (uchar_t *)&opt[1];
1914                         if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1915                             hlen == 0) {
1916                                 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1917                                 BUMP_MIB(mib,
1918                                     ipv6IfIcmpInBadNeighborAdvertisements);
1919                                 return;
1920                         }
1921                 }
1922         }
1923 
1924         /*
1925          * NOTE: we match across the illgrp since we need to do DAD for all of
1926          * our local addresses, and those are spread across all the active
1927          * ills in the group.
1928          */
1929         if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1930                 return;
1931 
1932         if (NCE_PUBLISH(dst_ncec)) {
1933                 /*
1934                  * Someone just advertised an addresses that we publish. First,
1935                  * check it it was us -- if so, we can safely ignore it.
1936                  * We don't get the haddr from the ira_l2src because, in the
1937                  * case that the packet originated from us, on an IPMP group,
1938                  * the ira_l2src may would be the link-layer address of the
1939                  * cast_ill used to send the packet, which may not be the same
1940                  * as the dst_ncec->ncec_lladdr of the address.
1941                  */
1942                 if (haddr != NULL) {
1943                         if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1944                                 goto out;
1945 
1946                         if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1947                                 goto out;   /* from us -- no conflict */
1948 
1949                         /*
1950                          * If we're in an IPMP group, check if this is an echo
1951                          * from another ill in the group.  Use the double-
1952                          * checked locking pattern to avoid grabbing
1953                          * ill_g_lock in the non-IPMP case.
1954                          */
1955                         if (IS_UNDER_IPMP(ill)) {
1956                                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1957                                 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1958                                     ill->ill_grp, haddr, hlen) != NULL) {
1959                                         rw_exit(&ipst->ips_ill_g_lock);
1960                                         goto out;
1961                                 }
1962                                 rw_exit(&ipst->ips_ill_g_lock);
1963                         }
1964                 }
1965 
1966                 /*
1967                  * This appears to be a real conflict.  If we're trying to
1968                  * configure this NCE (ND_PROBE), then shut it down.
1969                  * Otherwise, handle the discovered conflict.
1970                  */
1971                 if (dst_ncec->ncec_state == ND_PROBE) {
1972                         ndp_failure(mp, ira);
1973                 } else {
1974                         if (ip_nce_conflict(mp, ira, dst_ncec)) {
1975                                 char hbuf[MAC_STR_LEN];
1976                                 char sbuf[INET6_ADDRSTRLEN];
1977 
1978                                 cmn_err(CE_WARN,
1979                                     "node '%s' is using %s on %s",
1980                                     inet_ntop(AF_INET6, &target, sbuf,
1981                                     sizeof (sbuf)),
1982                                     haddr == NULL ? "<none>" :
1983                                     mac_colon_addr(haddr, hlen, hbuf,
1984                                     sizeof (hbuf)), ill->ill_name);
1985                                 /*
1986                                  * RFC 4862, Section 5.4.4 does not mandate
1987                                  * any specific behavior when an NA matches
1988                                  * a non-tentative address assigned to the
1989                                  * receiver. We make the choice of defending
1990                                  * our address, based on the assumption that
1991                                  * the sender has not detected the Duplicate.
1992                                  *
1993                                  * ncec_last_time_defended has been adjusted
1994                                  * in ip_nce_conflict()
1995                                  */
1996                                 (void) ndp_announce(dst_ncec);
1997                         }
1998                 }
1999         } else {
2000                 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2001                         dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2002 
2003                 /* B_TRUE indicates this an advertisement */
2004                 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2005         }
2006 out:
2007         ncec_refrele(dst_ncec);
2008 }
2009 
2010 /*
2011  * Process NDP neighbor solicitation/advertisement messages.
2012  * The checksum has already checked o.k before reaching here.
2013  * Information about the datalink header is contained in ira_l2src, but
2014  * that should be ignored for loopback packets.
2015  */
2016 void
2017 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2018 {
2019         ill_t           *ill = ira->ira_rill;
2020         icmp6_t         *icmp_nd;
2021         ip6_t           *ip6h;
2022         int             len;
2023         mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
2024         ill_t           *orig_ill = NULL;
2025 
2026         /*
2027          * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2028          * and make it be the IPMP upper so avoid being confused by a packet
2029          * addressed to a unicast address on a different ill.
2030          */
2031         if (IS_UNDER_IPMP(ill)) {
2032                 orig_ill = ill;
2033                 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2034                 if (ill == NULL) {
2035                         ill = orig_ill;
2036                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2037                         ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2038                             mp, ill);
2039                         freemsg(mp);
2040                         return;
2041                 }
2042                 ASSERT(ill != orig_ill);
2043                 orig_ill = ira->ira_ill;
2044                 ira->ira_ill = ill;
2045                 mib = ill->ill_icmp6_mib;
2046         }
2047         if (!pullupmsg(mp, -1)) {
2048                 ip1dbg(("ndp_input: pullupmsg failed\n"));
2049                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2050                 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2051                 goto done;
2052         }
2053         ip6h = (ip6_t *)mp->b_rptr;
2054         if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2055                 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2056                 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2057                 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2058                 goto done;
2059         }
2060         /*
2061          * NDP does not accept any extension headers between the
2062          * IP header and the ICMP header since e.g. a routing
2063          * header could be dangerous.
2064          * This assumes that any AH or ESP headers are removed
2065          * by ip prior to passing the packet to ndp_input.
2066          */
2067         if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2068                 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2069                     ip6h->ip6_nxt));
2070                 ip_drop_input("Wrong next header", mp, ill);
2071                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2072                 goto done;
2073         }
2074         icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2075         ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2076             icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2077         if (icmp_nd->icmp6_code != 0) {
2078                 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2079                 ip_drop_input("code non-zero", mp, ill);
2080                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2081                 goto done;
2082         }
2083         len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2084         /*
2085          * Make sure packet length is large enough for either
2086          * a NS or a NA icmp packet.
2087          */
2088         if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2089                 ip1dbg(("ndp_input: packet too short\n"));
2090                 ip_drop_input("packet too short", mp, ill);
2091                 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2092                 goto done;
2093         }
2094         if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2095                 ndp_input_solicit(mp, ira);
2096         } else {
2097                 ndp_input_advert(mp, ira);
2098         }
2099 done:
2100         freemsg(mp);
2101         if (orig_ill != NULL) {
2102                 ill_refrele(ill);
2103                 ira->ira_ill = orig_ill;
2104         }
2105 }
2106 
2107 /*
2108  * ndp_xmit is called to form and transmit a ND solicitation or
2109  * advertisement ICMP packet.
2110  *
2111  * If the source address is unspecified and this isn't a probe (used for
2112  * duplicate address detection), an appropriate source address and link layer
2113  * address will be chosen here.  The link layer address option is included if
2114  * the source is specified (i.e., all non-probe packets), and omitted (per the
2115  * specification) otherwise.
2116  *
2117  * It returns B_FALSE only if it does a successful put() to the
2118  * corresponding ill's ill_wq otherwise returns B_TRUE.
2119  */
2120 static boolean_t
2121 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2122     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2123 {
2124         uint32_t        len;
2125         icmp6_t         *icmp6;
2126         mblk_t          *mp;
2127         ip6_t           *ip6h;
2128         nd_opt_hdr_t    *opt;
2129         uint_t          plen;
2130         zoneid_t        zoneid = GLOBAL_ZONEID;
2131         ill_t           *hwaddr_ill = ill;
2132         ip_xmit_attr_t  ixas;
2133         ip_stack_t      *ipst = ill->ill_ipst;
2134         boolean_t       need_refrele = B_FALSE;
2135         boolean_t       probe = B_FALSE;
2136 
2137         if (IS_UNDER_IPMP(ill)) {
2138                 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2139                 /*
2140                  * We send non-probe packets on the upper IPMP interface.
2141                  * ip_output_simple() will use cast_ill for sending any
2142                  * multicast packets. Note that we can't follow the same
2143                  * logic for probe packets because all interfaces in the ipmp
2144                  * group may have failed, so that we really want to only try
2145                  * to send the ND packet on the ill corresponding to the src
2146                  * address.
2147                  */
2148                 if (!probe) {
2149                         ill = ipmp_ill_hold_ipmp_ill(ill);
2150                         if (ill != NULL)
2151                                 need_refrele = B_TRUE;
2152                         else
2153                                 ill = hwaddr_ill;
2154                 }
2155         }
2156 
2157         /*
2158          * If we have a unspecified source(sender) address, select a
2159          * proper source address for the solicitation here itself so
2160          * that we can initialize the h/w address correctly.
2161          *
2162          * If the sender is specified then we use this address in order
2163          * to lookup the zoneid before calling ip_output_v6(). This is to
2164          * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2165          * by IP (we cannot guarantee that the global zone has an interface
2166          * route to the destination).
2167          *
2168          * Note that the NA never comes here with the unspecified source
2169          * address.
2170          */
2171 
2172         /*
2173          * Probes will have unspec src at this point.
2174          */
2175         if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2176                 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2177                 /*
2178                  * It's possible for ipif_lookup_addr_zoneid_v6() to return
2179                  * ALL_ZONES if it cannot find a matching ipif for the address
2180                  * we are trying to use. In this case we err on the side of
2181                  * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2182                  */
2183                 if (zoneid == ALL_ZONES)
2184                         zoneid = GLOBAL_ZONEID;
2185         }
2186 
2187         plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2188         len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2189         mp = allocb(len,  BPRI_LO);
2190         if (mp == NULL) {
2191                 if (need_refrele)
2192                         ill_refrele(ill);
2193                 return (B_TRUE);
2194         }
2195 
2196         bzero((char *)mp->b_rptr, len);
2197         mp->b_wptr = mp->b_rptr + len;
2198 
2199         bzero(&ixas, sizeof (ixas));
2200         ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2201 
2202         ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2203         ixas.ixa_ipst = ipst;
2204         ixas.ixa_cred = kcred;
2205         ixas.ixa_cpid = NOPID;
2206         ixas.ixa_tsl = NULL;
2207         ixas.ixa_zoneid = zoneid;
2208 
2209         ip6h = (ip6_t *)mp->b_rptr;
2210         ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2211         ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2212         ip6h->ip6_nxt = IPPROTO_ICMPV6;
2213         ip6h->ip6_hops = IPV6_MAX_HOPS;
2214         ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2215         ip6h->ip6_dst = *target;
2216         icmp6 = (icmp6_t *)&ip6h[1];
2217 
2218         if (hw_addr_len != 0) {
2219                 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2220                     sizeof (nd_neighbor_advert_t));
2221         } else {
2222                 opt = NULL;
2223         }
2224         if (operation == ND_NEIGHBOR_SOLICIT) {
2225                 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2226 
2227                 if (opt != NULL && !(flag & NDP_PROBE)) {
2228                         /*
2229                          * Note that we don't send out SLLA for ND probes
2230                          * per RFC 4862, even though we do send out the src
2231                          * haddr for IPv4 DAD probes, even though both IPv4
2232                          * and IPv6 go out with the unspecified/INADDR_ANY
2233                          * src IP addr.
2234                          */
2235                         opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2236                 }
2237                 ip6h->ip6_src = *sender;
2238                 ns->nd_ns_target = *target;
2239                 if (!(flag & NDP_UNICAST)) {
2240                         /* Form multicast address of the target */
2241                         ip6h->ip6_dst = ipv6_solicited_node_mcast;
2242                         ip6h->ip6_dst.s6_addr32[3] |=
2243                             ns->nd_ns_target.s6_addr32[3];
2244                 }
2245         } else {
2246                 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2247 
2248                 ASSERT(!(flag & NDP_PROBE));
2249                 if (opt != NULL)
2250                         opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2251                 ip6h->ip6_src = *sender;
2252                 na->nd_na_target = *sender;
2253                 if (flag & NDP_ISROUTER)
2254                         na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2255                 if (flag & NDP_SOLICITED)
2256                         na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2257                 if (flag & NDP_ORIDE)
2258                         na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2259         }
2260 
2261         if (!(flag & NDP_PROBE)) {
2262                 if (hw_addr != NULL && opt != NULL) {
2263                         /* Fill in link layer address and option len */
2264                         opt->nd_opt_len = (uint8_t)plen;
2265                         bcopy(hw_addr, &opt[1], hw_addr_len);
2266                 }
2267         }
2268         if (opt != NULL && opt->nd_opt_type == 0) {
2269                 /* If there's no link layer address option, then strip it. */
2270                 len -= plen * 8;
2271                 mp->b_wptr = mp->b_rptr + len;
2272                 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2273         }
2274 
2275         icmp6->icmp6_type = (uint8_t)operation;
2276         icmp6->icmp6_code = 0;
2277         /*
2278          * Prepare for checksum by putting icmp length in the icmp
2279          * checksum field. The checksum is calculated in ip_output.c.
2280          */
2281         icmp6->icmp6_cksum = ip6h->ip6_plen;
2282 
2283         (void) ip_output_simple(mp, &ixas);
2284         ixa_cleanup(&ixas);
2285         if (need_refrele)
2286                 ill_refrele(ill);
2287         return (B_FALSE);
2288 }
2289 
2290 /*
2291  * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2292  * The datapath uses this as an indication that there
2293  * is a problem (as opposed to a NCE that was just
2294  * reclaimed due to lack of memory.
2295  * Note that static ARP entries never become unreachable.
2296  */
2297 void
2298 nce_make_unreachable(ncec_t *ncec)
2299 {
2300         mutex_enter(&ncec->ncec_lock);
2301         ncec->ncec_state = ND_UNREACHABLE;
2302         mutex_exit(&ncec->ncec_lock);
2303 }
2304 
2305 /*
2306  * NCE retransmit timer. Common to IPv4 and IPv6.
2307  * This timer goes off when:
2308  * a. It is time to retransmit a resolution for resolver.
2309  * b. It is time to send reachability probes.
2310  */
2311 void
2312 nce_timer(void *arg)
2313 {
2314         ncec_t          *ncec = arg;
2315         ill_t           *ill = ncec->ncec_ill, *src_ill;
2316         char            addrbuf[INET6_ADDRSTRLEN];
2317         boolean_t       dropped = B_FALSE;
2318         ip_stack_t      *ipst = ncec->ncec_ipst;
2319         boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2320         in_addr_t       sender4 = INADDR_ANY;
2321         in6_addr_t      sender6 = ipv6_all_zeros;
2322 
2323         /*
2324          * The timer has to be cancelled by ncec_delete before doing the final
2325          * refrele. So the NCE is guaranteed to exist when the timer runs
2326          * until it clears the timeout_id. Before clearing the timeout_id
2327          * bump up the refcnt so that we can continue to use the ncec
2328          */
2329         ASSERT(ncec != NULL);
2330         mutex_enter(&ncec->ncec_lock);
2331         ncec_refhold_locked(ncec);
2332         ncec->ncec_timeout_id = 0;
2333         mutex_exit(&ncec->ncec_lock);
2334 
2335         src_ill = nce_resolve_src(ncec, &sender6);
2336         /* if we could not find a sender address, return */
2337         if (src_ill == NULL) {
2338                 if (!isv6) {
2339                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2340                         ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2341                             &sender4, addrbuf, sizeof (addrbuf))));
2342                 } else {
2343                         ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2344                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2345                 }
2346                 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2347                 ncec_refrele(ncec);
2348                 return;
2349         }
2350         if (!isv6)
2351                 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2352 
2353         mutex_enter(&ncec->ncec_lock);
2354         /*
2355          * Check the reachability state.
2356          */
2357         switch (ncec->ncec_state) {
2358         case ND_DELAY:
2359                 ASSERT(ncec->ncec_lladdr != NULL);
2360                 ncec->ncec_state = ND_PROBE;
2361                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2362                 if (isv6) {
2363                         mutex_exit(&ncec->ncec_lock);
2364                         dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2365                             src_ill->ill_phys_addr,
2366                             src_ill->ill_phys_addr_length,
2367                             &sender6, &ncec->ncec_addr,
2368                             NDP_UNICAST);
2369                 } else {
2370                         dropped = (arp_request(ncec, sender4, src_ill) == 0);
2371                         mutex_exit(&ncec->ncec_lock);
2372                 }
2373                 if (!dropped) {
2374                         mutex_enter(&ncec->ncec_lock);
2375                         ncec->ncec_pcnt--;
2376                         mutex_exit(&ncec->ncec_lock);
2377                 }
2378                 if (ip_debug > 3) {
2379                         /* ip2dbg */
2380                         pr_addr_dbg("nce_timer: state for %s changed "
2381                             "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2382                 }
2383                 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2384                 break;
2385         case ND_PROBE:
2386                 /* must be retransmit timer */
2387                 ASSERT(ncec->ncec_pcnt >= -1);
2388                 if (ncec->ncec_pcnt > 0) {
2389                         /*
2390                          * As per RFC2461, the ncec gets deleted after
2391                          * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2392                          * Note that the first unicast solicitation is sent
2393                          * during the DELAY state.
2394                          */
2395                         ip2dbg(("nce_timer: pcount=%x dst %s\n",
2396                             ncec->ncec_pcnt,
2397                             inet_ntop((isv6? AF_INET6 : AF_INET),
2398                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2399                         if (NCE_PUBLISH(ncec)) {
2400                                 mutex_exit(&ncec->ncec_lock);
2401                                 /*
2402                                  * send out a probe; note that src_ill
2403                                  * is ignored by nce_dad() for all
2404                                  * DAD message types other than IPv6
2405                                  * unicast probes
2406                                  */
2407                                 nce_dad(ncec, src_ill, B_TRUE);
2408                         } else {
2409                                 ASSERT(src_ill != NULL);
2410                                 if (isv6) {
2411                                         mutex_exit(&ncec->ncec_lock);
2412                                         dropped = ndp_xmit(src_ill,
2413                                             ND_NEIGHBOR_SOLICIT,
2414                                             src_ill->ill_phys_addr,
2415                                             src_ill->ill_phys_addr_length,
2416                                             &sender6, &ncec->ncec_addr,
2417                                             NDP_UNICAST);
2418                                 } else {
2419                                         /*
2420                                          * since the nce is REACHABLE,
2421                                          * the ARP request will be sent out
2422                                          * as a link-layer unicast.
2423                                          */
2424                                         dropped = (arp_request(ncec, sender4,
2425                                             src_ill) == 0);
2426                                         mutex_exit(&ncec->ncec_lock);
2427                                 }
2428                                 if (!dropped) {
2429                                         mutex_enter(&ncec->ncec_lock);
2430                                         ncec->ncec_pcnt--;
2431                                         mutex_exit(&ncec->ncec_lock);
2432                                 }
2433                                 nce_restart_timer(ncec,
2434                                     ill->ill_reachable_retrans_time);
2435                         }
2436                 } else if (ncec->ncec_pcnt < 0) {
2437                         /* No hope, delete the ncec */
2438                         /* Tell datapath it went bad */
2439                         ncec->ncec_state = ND_UNREACHABLE;
2440                         mutex_exit(&ncec->ncec_lock);
2441                         if (ip_debug > 2) {
2442                                 /* ip1dbg */
2443                                 pr_addr_dbg("nce_timer: Delete NCE for"
2444                                     " dst %s\n", (isv6? AF_INET6: AF_INET),
2445                                     &ncec->ncec_addr);
2446                         }
2447                         /* if static ARP can't delete. */
2448                         if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2449                                 ncec_delete(ncec);
2450 
2451                 } else if (!NCE_PUBLISH(ncec)) {
2452                         /*
2453                          * Probe count is 0 for a dynamic entry (one that we
2454                          * ourselves are not publishing). We should never get
2455                          * here if NONUD was requested, hence the ASSERT below.
2456                          */
2457                         ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2458                         ip2dbg(("nce_timer: pcount=%x dst %s\n",
2459                             ncec->ncec_pcnt, inet_ntop(AF_INET6,
2460                             &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2461                         ncec->ncec_pcnt--;
2462                         mutex_exit(&ncec->ncec_lock);
2463                         /* Wait one interval before killing */
2464                         nce_restart_timer(ncec,
2465                             ill->ill_reachable_retrans_time);
2466                 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2467                         ipif_t *ipif;
2468                         ipaddr_t ncec_addr;
2469 
2470                         /*
2471                          * We're done probing, and we can now declare this
2472                          * address to be usable.  Let IP know that it's ok to
2473                          * use.
2474                          */
2475                         ncec->ncec_state = ND_REACHABLE;
2476                         ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2477                         mutex_exit(&ncec->ncec_lock);
2478                         if (isv6) {
2479                                 ipif = ipif_lookup_addr_exact_v6(
2480                                     &ncec->ncec_addr, ill, ipst);
2481                         } else {
2482                                 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2483                                     ncec_addr);
2484                                 ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2485                                     ipst);
2486                         }
2487                         if (ipif != NULL) {
2488                                 if (ipif->ipif_was_dup) {
2489                                         char ibuf[LIFNAMSIZ];
2490                                         char sbuf[INET6_ADDRSTRLEN];
2491 
2492                                         ipif->ipif_was_dup = B_FALSE;
2493                                         (void) inet_ntop(AF_INET6,
2494                                             &ipif->ipif_v6lcl_addr,
2495                                             sbuf, sizeof (sbuf));
2496                                         ipif_get_name(ipif, ibuf,
2497                                             sizeof (ibuf));
2498                                         cmn_err(CE_NOTE, "recovered address "
2499                                             "%s on %s", sbuf, ibuf);
2500                                 }
2501                                 if ((ipif->ipif_flags & IPIF_UP) &&
2502                                     !ipif->ipif_addr_ready)
2503                                         ipif_up_notify(ipif);
2504                                 ipif->ipif_addr_ready = 1;
2505                                 ipif_refrele(ipif);
2506                         }
2507                         if (!isv6 && arp_no_defense)
2508                                 break;
2509                         /* Begin defending our new address */
2510                         if (ncec->ncec_unsolicit_count > 0) {
2511                                 ncec->ncec_unsolicit_count--;
2512                                 if (isv6) {
2513                                         dropped = ndp_announce(ncec);
2514                                 } else {
2515                                         dropped = arp_announce(ncec);
2516                                 }
2517 
2518                                 if (dropped)
2519                                         ncec->ncec_unsolicit_count++;
2520                                 else
2521                                         ncec->ncec_last_time_defended =
2522                                             ddi_get_lbolt();
2523                         }
2524                         if (ncec->ncec_unsolicit_count > 0) {
2525                                 nce_restart_timer(ncec,
2526                                     ANNOUNCE_INTERVAL(isv6));
2527                         } else if (DEFENSE_INTERVAL(isv6) != 0) {
2528                                 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2529                         }
2530                 } else {
2531                         /*
2532                          * This is an address we're probing to be our own, but
2533                          * the ill is down.  Wait until it comes back before
2534                          * doing anything, but switch to reachable state so
2535                          * that the restart will work.
2536                          */
2537                         ncec->ncec_state = ND_REACHABLE;
2538                         mutex_exit(&ncec->ncec_lock);
2539                 }
2540                 break;
2541         case ND_INCOMPLETE: {
2542                 mblk_t  *mp, *nextmp;
2543                 mblk_t  **prevmpp;
2544 
2545                 /*
2546                  * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2547                  * for any IPMP probe packets, and toss them.  IPMP probe
2548                  * packets will always be at the head of ncec_qd_mp, so that
2549                  * we can stop at the first queued ND packet that is
2550                  * not a probe packet.
2551                  */
2552                 prevmpp = &ncec->ncec_qd_mp;
2553                 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2554                         nextmp = mp->b_next;
2555 
2556                         if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2557                                 inet_freemsg(mp);
2558                                 ncec->ncec_nprobes--;
2559                                 *prevmpp = nextmp;
2560                         } else {
2561                                 prevmpp = &mp->b_next;
2562                         }
2563                 }
2564 
2565                 /*
2566                  * Must be resolver's retransmit timer.
2567                  */
2568                 mutex_exit(&ncec->ncec_lock);
2569                 ip_ndp_resolve(ncec);
2570                 break;
2571         }
2572         case ND_REACHABLE:
2573                 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2574                     ncec->ncec_unsolicit_count != 0) ||
2575                     (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2576                         if (ncec->ncec_unsolicit_count > 0) {
2577                                 ncec->ncec_unsolicit_count--;
2578                                 mutex_exit(&ncec->ncec_lock);
2579                                 /*
2580                                  * When we get to zero announcements left,
2581                                  * switch to address defense
2582                                  */
2583                         } else {
2584                                 boolean_t rate_limit;
2585 
2586                                 mutex_exit(&ncec->ncec_lock);
2587                                 rate_limit = ill_defend_rate_limit(ill, ncec);
2588                                 if (rate_limit) {
2589                                         nce_restart_timer(ncec,
2590                                             DEFENSE_INTERVAL(isv6));
2591                                         break;
2592                                 }
2593                         }
2594                         if (isv6) {
2595                                 dropped = ndp_announce(ncec);
2596                         } else {
2597                                 dropped = arp_announce(ncec);
2598                         }
2599                         mutex_enter(&ncec->ncec_lock);
2600                         if (dropped) {
2601                                 ncec->ncec_unsolicit_count++;
2602                         } else {
2603                                 ncec->ncec_last_time_defended =
2604                                     ddi_get_lbolt();
2605                         }
2606                         mutex_exit(&ncec->ncec_lock);
2607                         if (ncec->ncec_unsolicit_count != 0) {
2608                                 nce_restart_timer(ncec,
2609                                     ANNOUNCE_INTERVAL(isv6));
2610                         } else {
2611                                 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2612                         }
2613                 } else {
2614                         mutex_exit(&ncec->ncec_lock);
2615                 }
2616                 break;
2617         default:
2618                 mutex_exit(&ncec->ncec_lock);
2619                 break;
2620         }
2621 done:
2622         ncec_refrele(ncec);
2623         ill_refrele(src_ill);
2624 }
2625 
2626 /*
2627  * Set a link layer address from the ll_addr passed in.
2628  * Copy SAP from ill.
2629  */
2630 static void
2631 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2632 {
2633         ill_t   *ill = ncec->ncec_ill;
2634 
2635         ASSERT(ll_addr != NULL);
2636         if (ill->ill_phys_addr_length > 0) {
2637                 /*
2638                  * The bcopy() below used to be called for the physical address
2639                  * length rather than the link layer address length. For
2640                  * ethernet and many other media, the phys_addr and lla are
2641                  * identical.
2642                  *
2643                  * The phys_addr and lla may not be the same for devices that
2644                  * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2645                  * no known instances of these.
2646                  *
2647                  * For PPP or other interfaces with a zero length
2648                  * physical address, don't do anything here.
2649                  * The bcopy() with a zero phys_addr length was previously
2650                  * a no-op for interfaces with a zero-length physical address.
2651                  * Using the lla for them would change the way they operate.
2652                  * Doing nothing in such cases preserves expected behavior.
2653                  */
2654                 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2655         }
2656 }
2657 
2658 boolean_t
2659 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2660     uint32_t ll_addr_len)
2661 {
2662         ASSERT(ncec->ncec_lladdr != NULL);
2663         if (ll_addr == NULL)
2664                 return (B_FALSE);
2665         if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2666                 return (B_TRUE);
2667         return (B_FALSE);
2668 }
2669 
2670 /*
2671  * Updates the link layer address or the reachability state of
2672  * a cache entry.  Reset probe counter if needed.
2673  */
2674 void
2675 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2676 {
2677         ill_t   *ill = ncec->ncec_ill;
2678         boolean_t need_stop_timer = B_FALSE;
2679         boolean_t need_fastpath_update = B_FALSE;
2680         nce_t   *nce = NULL;
2681         timeout_id_t tid;
2682 
2683         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2684         /*
2685          * If this interface does not do NUD, there is no point
2686          * in allowing an update to the cache entry.  Although
2687          * we will respond to NS.
2688          * The only time we accept an update for a resolver when
2689          * NUD is turned off is when it has just been created.
2690          * Non-Resolvers will always be created as REACHABLE.
2691          */
2692         if (new_state != ND_UNCHANGED) {
2693                 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2694                     (ncec->ncec_state != ND_INCOMPLETE))
2695                         return;
2696                 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2697                 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2698                 need_stop_timer = B_TRUE;
2699                 if (new_state == ND_REACHABLE)
2700                         ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2701                 else {
2702                         /* We force NUD in this case */
2703                         ncec->ncec_last = 0;
2704                 }
2705                 ncec->ncec_state = new_state;
2706                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2707                 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2708                     new_state == ND_INCOMPLETE);
2709         }
2710         if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2711                 tid = ncec->ncec_timeout_id;
2712                 ncec->ncec_timeout_id = 0;
2713         }
2714         /*
2715          * Re-trigger fastpath probe and
2716          * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2717          * whatever packets that happens to be transmitting at the time.
2718          */
2719         if (new_ll_addr != NULL) {
2720                 bcopy(new_ll_addr, ncec->ncec_lladdr,
2721                     ill->ill_phys_addr_length);
2722                 need_fastpath_update = B_TRUE;
2723         }
2724         mutex_exit(&ncec->ncec_lock);
2725         if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2726                 if (tid != 0)
2727                         (void) untimeout(tid);
2728         }
2729         if (need_fastpath_update) {
2730                 /*
2731                  * Delete any existing existing dlur_mp and fp_mp information.
2732                  * For IPMP interfaces, all underlying ill's must be checked
2733                  * and purged.
2734                  */
2735                 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2736                 /*
2737                  * add the new dlur_mp and fp_mp
2738                  */
2739                 nce = nce_fastpath(ncec, B_TRUE, NULL);
2740                 if (nce != NULL)
2741                         nce_refrele(nce);
2742         }
2743         mutex_enter(&ncec->ncec_lock);
2744 }
2745 
2746 static void
2747 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2748 {
2749         uint_t  count = 0;
2750         mblk_t  **mpp, *tmp;
2751 
2752         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2753 
2754         for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2755                 if (++count > ncec->ncec_ill->ill_max_buf) {
2756                         tmp = ncec->ncec_qd_mp->b_next;
2757                         ncec->ncec_qd_mp->b_next = NULL;
2758                         /*
2759                          * if we never create data addrs on the under_ill
2760                          * does this matter?
2761                          */
2762                         BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2763                             ipIfStatsOutDiscards);
2764                         ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2765                             ncec->ncec_ill);
2766                         freemsg(ncec->ncec_qd_mp);
2767                         ncec->ncec_qd_mp = tmp;
2768                 }
2769         }
2770 
2771         if (head_insert) {
2772                 ncec->ncec_nprobes++;
2773                 mp->b_next = ncec->ncec_qd_mp;
2774                 ncec->ncec_qd_mp = mp;
2775         } else {
2776                 *mpp = mp;
2777         }
2778 }
2779 
2780 /*
2781  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2782  * queued at the head or tail of the queue based on the input argument
2783  * 'head_insert'. The caller should specify this argument as B_TRUE if this
2784  * packet is an IPMP probe packet, in which case the following happens:
2785  *
2786  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
2787  *      (non-ipmp_probe) load-speading case where the source address of the ND
2788  *      packet is not tied to ncec_ill. If the ill bound to the source address
2789  *      cannot receive, the response to the ND packet will not be received.
2790  *      However, if ND packets for ncec_ill's probes are queued behind that ND
2791  *      packet, those probes will also fail to be sent, and thus in.mpathd will
2792  *       erroneously conclude that ncec_ill has also failed.
2793  *
2794  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
2795  *      the first attempt.  This ensures that ND problems do not manifest as
2796  *      probe RTT spikes.
2797  *
2798  * We achieve this by inserting ipmp_probe() packets at the head of the
2799  * nce_queue.
2800  *
2801  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2802  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2803  */
2804 void
2805 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2806 {
2807         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2808         nce_queue_mp_common(ncec, mp, head_insert);
2809 }
2810 
2811 /*
2812  * Called when address resolution failed due to a timeout.
2813  * Send an ICMP unreachable in response to all queued packets.
2814  */
2815 void
2816 ndp_resolv_failed(ncec_t *ncec)
2817 {
2818         mblk_t  *mp, *nxt_mp;
2819         char    buf[INET6_ADDRSTRLEN];
2820         ill_t *ill = ncec->ncec_ill;
2821         ip_recv_attr_t  iras;
2822 
2823         bzero(&iras, sizeof (iras));
2824         iras.ira_flags = 0;
2825         /*
2826          * we are setting the ira_rill to the ipmp_ill (instead of
2827          * the actual ill on which the packet was received), but this
2828          * is ok because we don't actually need the real ira_rill.
2829          * to send the icmp unreachable to the sender.
2830          */
2831         iras.ira_ill = iras.ira_rill = ill;
2832         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2833         iras.ira_rifindex = iras.ira_ruifindex;
2834 
2835         ip1dbg(("ndp_resolv_failed: dst %s\n",
2836             inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2837         mutex_enter(&ncec->ncec_lock);
2838         mp = ncec->ncec_qd_mp;
2839         ncec->ncec_qd_mp = NULL;
2840         ncec->ncec_nprobes = 0;
2841         mutex_exit(&ncec->ncec_lock);
2842         while (mp != NULL) {
2843                 nxt_mp = mp->b_next;
2844                 mp->b_next = NULL;
2845 
2846                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2847                 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2848                     mp, ill);
2849                 icmp_unreachable_v6(mp,
2850                     ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2851                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2852                 mp = nxt_mp;
2853         }
2854         ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2855 }
2856 
2857 /*
2858  * Handle the completion of NDP and ARP resolution.
2859  */
2860 void
2861 nce_resolv_ok(ncec_t *ncec)
2862 {
2863         mblk_t *mp;
2864         uint_t pkt_len;
2865         iaflags_t ixaflags = IXAF_NO_TRACE;
2866         nce_t *nce;
2867         ill_t   *ill = ncec->ncec_ill;
2868         boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2869         ip_stack_t *ipst = ill->ill_ipst;
2870 
2871         if (IS_IPMP(ncec->ncec_ill)) {
2872                 nce_resolv_ipmp_ok(ncec);
2873                 return;
2874         }
2875         /* non IPMP case */
2876 
2877         mutex_enter(&ncec->ncec_lock);
2878         ASSERT(ncec->ncec_nprobes == 0);
2879         mp = ncec->ncec_qd_mp;
2880         ncec->ncec_qd_mp = NULL;
2881         mutex_exit(&ncec->ncec_lock);
2882 
2883         while (mp != NULL) {
2884                 mblk_t *nxt_mp;
2885 
2886                 if (ill->ill_isv6) {
2887                         ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2888 
2889                         pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2890                 } else {
2891                         ipha_t *ipha = (ipha_t *)mp->b_rptr;
2892 
2893                         ixaflags |= IXAF_IS_IPV4;
2894                         pkt_len = ntohs(ipha->ipha_length);
2895                 }
2896                 nxt_mp = mp->b_next;
2897                 mp->b_next = NULL;
2898                 /*
2899                  * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2900                  * longer available, but it's ok to drop this flag because TCP
2901                  * has its own flow-control in effect, so TCP packets
2902                  * are not likely to get here when flow-control is in effect.
2903                  */
2904                 mutex_enter(&ill->ill_lock);
2905                 nce = nce_lookup(ill, &ncec->ncec_addr);
2906                 mutex_exit(&ill->ill_lock);
2907 
2908                 if (nce == NULL) {
2909                         if (isv6) {
2910                                 BUMP_MIB(&ipst->ips_ip6_mib,
2911                                     ipIfStatsOutDiscards);
2912                         } else {
2913                                 BUMP_MIB(&ipst->ips_ip_mib,
2914                                     ipIfStatsOutDiscards);
2915                         }
2916                         ip_drop_output("ipIfStatsOutDiscards - no nce",
2917                             mp, NULL);
2918                         freemsg(mp);
2919                 } else {
2920                         /*
2921                          * We don't know the zoneid, but
2922                          * ip_xmit does not care since IXAF_NO_TRACE
2923                          * is set. (We traced the packet the first
2924                          * time through ip_xmit.)
2925                          */
2926                         (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2927                             ALL_ZONES, 0, NULL);
2928                         nce_refrele(nce);
2929                 }
2930                 mp = nxt_mp;
2931         }
2932 
2933         ncec_cb_dispatch(ncec); /* complete callbacks */
2934 }
2935 
2936 /*
2937  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2938  * and the corresponding attributes.
2939  * Disallow states other than ND_REACHABLE or ND_STALE.
2940  */
2941 int
2942 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2943 {
2944         sin6_t          *sin6;
2945         in6_addr_t      *addr;
2946         ncec_t          *ncec;
2947         nce_t           *nce;
2948         int             err = 0;
2949         uint16_t        new_flags = 0;
2950         uint16_t        old_flags = 0;
2951         int             inflags = lnr->lnr_flags;
2952         ip_stack_t      *ipst = ill->ill_ipst;
2953         boolean_t       do_postprocess = B_FALSE;
2954 
2955         ASSERT(ill->ill_isv6);
2956         if ((lnr->lnr_state_create != ND_REACHABLE) &&
2957             (lnr->lnr_state_create != ND_STALE))
2958                 return (EINVAL);
2959 
2960         sin6 = (sin6_t *)&lnr->lnr_addr;
2961         addr = &sin6->sin6_addr;
2962 
2963         mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2964         ASSERT(!IS_UNDER_IPMP(ill));
2965         nce = nce_lookup_addr(ill, addr);
2966         if (nce != NULL)
2967                 new_flags = nce->nce_common->ncec_flags;
2968 
2969         switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2970         case NDF_ISROUTER_ON:
2971                 new_flags |= NCE_F_ISROUTER;
2972                 break;
2973         case NDF_ISROUTER_OFF:
2974                 new_flags &= ~NCE_F_ISROUTER;
2975                 break;
2976         case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2977                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2978                 if (nce != NULL)
2979                         nce_refrele(nce);
2980                 return (EINVAL);
2981         }
2982         if (inflags & NDF_STATIC)
2983                 new_flags |= NCE_F_STATIC;
2984 
2985         switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2986         case NDF_ANYCAST_ON:
2987                 new_flags |= NCE_F_ANYCAST;
2988                 break;
2989         case NDF_ANYCAST_OFF:
2990                 new_flags &= ~NCE_F_ANYCAST;
2991                 break;
2992         case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2993                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2994                 if (nce != NULL)
2995                         nce_refrele(nce);
2996                 return (EINVAL);
2997         }
2998 
2999         if (nce == NULL) {
3000                 err = nce_add_v6(ill,
3001                     (uchar_t *)lnr->lnr_hdw_addr,
3002                     ill->ill_phys_addr_length,
3003                     addr,
3004                     new_flags,
3005                     lnr->lnr_state_create,
3006                     &nce);
3007                 if (err != 0) {
3008                         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3009                         ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3010                         return (err);
3011                 } else {
3012                         do_postprocess = B_TRUE;
3013                 }
3014         }
3015         ncec = nce->nce_common;
3016         old_flags = ncec->ncec_flags;
3017         if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3018                 ncec_router_to_host(ncec);
3019                 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3020                 if (do_postprocess)
3021                         err = nce_add_v6_postprocess(nce);
3022                 nce_refrele(nce);
3023                 return (0);
3024         }
3025         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3026 
3027         if (do_postprocess)
3028                 err = nce_add_v6_postprocess(nce);
3029         /*
3030          * err cannot be anything other than 0 because we don't support
3031          * proxy arp of static addresses.
3032          */
3033         ASSERT(err == 0);
3034 
3035         mutex_enter(&ncec->ncec_lock);
3036         ncec->ncec_flags = new_flags;
3037         mutex_exit(&ncec->ncec_lock);
3038         /*
3039          * Note that we ignore the state at this point, which
3040          * should be either STALE or REACHABLE.  Instead we let
3041          * the link layer address passed in to determine the state
3042          * much like incoming packets.
3043          */
3044         nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3045         nce_refrele(nce);
3046         return (0);
3047 }
3048 
3049 /*
3050  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3051  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3052  * be held to ensure that they are in the same group.
3053  */
3054 static nce_t *
3055 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3056 {
3057 
3058         nce_t *nce;
3059 
3060         nce = nce_ill_lookup_then_add(ill, ncec);
3061 
3062         if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3063                 return (nce);
3064 
3065         /*
3066          * hold the ncec_lock to synchronize with nce_update() so that,
3067          * at the end of this function, the contents of nce_dlur_mp are
3068          * consistent with ncec->ncec_lladdr, even though some intermediate
3069          * packet may have been sent out with a mangled address, which would
3070          * only be a transient condition.
3071          */
3072         mutex_enter(&ncec->ncec_lock);
3073         if (ncec->ncec_lladdr != NULL) {
3074                 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3075                     NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3076         } else {
3077                 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3078                     ill->ill_sap_length);
3079         }
3080         mutex_exit(&ncec->ncec_lock);
3081         return (nce);
3082 }
3083 
3084 /*
3085  * we make nce_fp_mp to have an M_DATA prepend.
3086  * The caller ensures there is hold on ncec for this function.
3087  * Note that since ill_fastpath_probe() copies the mblk there is
3088  * no need to hold the nce or ncec beyond this function.
3089  *
3090  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3091  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3092  * and will be returned back by this function, so that no extra nce_refrele
3093  * is required for the caller. The calls from nce_add_common() use this
3094  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3095  * nce_refrele of the returned nce (when it is non-null).
3096  */
3097 nce_t *
3098 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3099 {
3100         nce_t *nce;
3101         ill_t *ill = ncec->ncec_ill;
3102 
3103         ASSERT(ill != NULL);
3104 
3105         if (IS_IPMP(ill) && trigger_fp_req) {
3106                 trigger_fp_req = B_FALSE;
3107                 ipmp_ncec_refresh_nce(ncec);
3108         }
3109 
3110         /*
3111          * If the caller already has the nce corresponding to the ill, use
3112          * that one. Otherwise we have to lookup/add the nce. Calls from
3113          * nce_add_common() fall in the former category, and have just done
3114          * the nce lookup/add that can be reused.
3115          */
3116         if (ncec_nce == NULL)
3117                 nce = nce_fastpath_create(ill, ncec);
3118         else
3119                 nce = ncec_nce;
3120 
3121         if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3122                 return (nce);
3123 
3124         if (trigger_fp_req)
3125                 nce_fastpath_trigger(nce);
3126         return (nce);
3127 }
3128 
3129 /*
3130  * Trigger fastpath on nce. No locks may be held.
3131  */
3132 static void
3133 nce_fastpath_trigger(nce_t *nce)
3134 {
3135         int res;
3136         ill_t *ill = nce->nce_ill;
3137         ncec_t *ncec = nce->nce_common;
3138 
3139         res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3140         /*
3141          * EAGAIN is an indication of a transient error
3142          * i.e. allocation failure etc. leave the ncec in the list it
3143          * will be updated when another probe happens for another ire
3144          * if not it will be taken out of the list when the ire is
3145          * deleted.
3146          */
3147         if (res != 0 && res != EAGAIN && res != ENOTSUP)
3148                 nce_fastpath_list_delete(ill, ncec, NULL);
3149 }
3150 
3151 /*
3152  * Add ncec to the nce fastpath list on ill.
3153  */
3154 static nce_t *
3155 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3156 {
3157         nce_t *nce = NULL;
3158 
3159         ASSERT(MUTEX_HELD(&ill->ill_lock));
3160         /*
3161          * Atomically ensure that the ill is not CONDEMNED and is not going
3162          * down, before adding the NCE.
3163          */
3164         if (ill->ill_state_flags & ILL_CONDEMNED)
3165                 return (NULL);
3166         mutex_enter(&ncec->ncec_lock);
3167         /*
3168          * if ncec has not been deleted and
3169          * is not already in the list add it.
3170          */
3171         if (!NCE_ISCONDEMNED(ncec)) {
3172                 nce = nce_lookup(ill, &ncec->ncec_addr);
3173                 if (nce != NULL)
3174                         goto done;
3175                 nce = nce_add(ill, ncec);
3176         }
3177 done:
3178         mutex_exit(&ncec->ncec_lock);
3179         return (nce);
3180 }
3181 
3182 nce_t *
3183 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3184 {
3185         nce_t *nce;
3186 
3187         mutex_enter(&ill->ill_lock);
3188         nce = nce_ill_lookup_then_add_locked(ill, ncec);
3189         mutex_exit(&ill->ill_lock);
3190         return (nce);
3191 }
3192 
3193 
3194 /*
3195  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3196  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3197  * entry after all locks have been dropped.
3198  */
3199 void
3200 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3201 {
3202         nce_t *nce;
3203 
3204         ASSERT(ill != NULL);
3205 
3206         /* delete any nces referencing the ncec from underlying ills */
3207         if (IS_IPMP(ill))
3208                 ipmp_ncec_delete_nce(ncec);
3209 
3210         /* now the ill itself */
3211         mutex_enter(&ill->ill_lock);
3212         for (nce = list_head(&ill->ill_nce); nce != NULL;
3213             nce = list_next(&ill->ill_nce, nce)) {
3214                 if (nce->nce_common == ncec) {
3215                         nce_refhold(nce);
3216                         nce_delete(nce);
3217                         break;
3218                 }
3219         }
3220         mutex_exit(&ill->ill_lock);
3221         if (nce != NULL) {
3222                 if (dead == NULL)
3223                         nce_refrele(nce);
3224                 else
3225                         list_insert_tail(dead, nce);
3226         }
3227 }
3228 
3229 /*
3230  * when the fastpath response does not fit in the datab
3231  * associated with the existing nce_fp_mp, we delete and
3232  * add the nce to retrigger fastpath based on the information
3233  * in the ncec_t.
3234  */
3235 static nce_t *
3236 nce_delete_then_add(nce_t *nce)
3237 {
3238         ill_t           *ill = nce->nce_ill;
3239         nce_t           *newnce = NULL;
3240 
3241         ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3242             (void *)nce, ill->ill_name));
3243         mutex_enter(&ill->ill_lock);
3244         mutex_enter(&nce->nce_common->ncec_lock);
3245         nce_delete(nce);
3246         /*
3247          * Make sure that ncec is not condemned before adding. We hold the
3248          * ill_lock and ncec_lock to synchronize with ncec_delete() and
3249          * ipmp_ncec_delete_nce()
3250          */
3251         if (!NCE_ISCONDEMNED(nce->nce_common))
3252                 newnce = nce_add(ill, nce->nce_common);
3253         mutex_exit(&nce->nce_common->ncec_lock);
3254         mutex_exit(&ill->ill_lock);
3255         nce_refrele(nce);
3256         return (newnce); /* could be null if nomem */
3257 }
3258 
3259 typedef struct nce_fp_match_s {
3260         nce_t   *nce_fp_match_res;
3261         mblk_t  *nce_fp_match_ack_mp;
3262 } nce_fp_match_t;
3263 
3264 /* ARGSUSED */
3265 static int
3266 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3267 {
3268         nce_fp_match_t  *nce_fp_marg = arg;
3269         ncec_t          *ncec = nce->nce_common;
3270         mblk_t          *mp = nce_fp_marg->nce_fp_match_ack_mp;
3271         uchar_t *mp_rptr, *ud_mp_rptr;
3272         mblk_t          *ud_mp = nce->nce_dlur_mp;
3273         ptrdiff_t       cmplen;
3274 
3275         /*
3276          * mp is the mp associated with the fastpath ack.
3277          * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3278          * under consideration. If the contents match, then the
3279          * fastpath ack is used to update the nce.
3280          */
3281         if (ud_mp == NULL)
3282                 return (0);
3283         mp_rptr = mp->b_rptr;
3284         cmplen = mp->b_wptr - mp_rptr;
3285         ASSERT(cmplen >= 0);
3286 
3287         ud_mp_rptr = ud_mp->b_rptr;
3288         /*
3289          * The ncec is locked here to prevent any other threads from accessing
3290          * and changing nce_dlur_mp when the address becomes resolved to an
3291          * lla while we're in the middle of looking at and comparing the
3292          * hardware address (lla). It is also locked to prevent multiple
3293          * threads in nce_fastpath() from examining nce_dlur_mp at the same
3294          * time.
3295          */
3296         mutex_enter(&ncec->ncec_lock);
3297         if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3298             bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3299                 nce_fp_marg->nce_fp_match_res = nce;
3300                 mutex_exit(&ncec->ncec_lock);
3301                 nce_refhold(nce);
3302                 return (1);
3303         }
3304         mutex_exit(&ncec->ncec_lock);
3305         return (0);
3306 }
3307 
3308 /*
3309  * Update all NCE's that are not in fastpath mode and
3310  * have an nce_fp_mp that matches mp. mp->b_cont contains
3311  * the fastpath header.
3312  *
3313  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3314  */
3315 void
3316 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3317 {
3318         nce_fp_match_t nce_fp_marg;
3319         nce_t *nce;
3320         mblk_t *nce_fp_mp, *fp_mp;
3321 
3322         nce_fp_marg.nce_fp_match_res = NULL;
3323         nce_fp_marg.nce_fp_match_ack_mp = mp;
3324 
3325         nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3326 
3327         if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3328                 return;
3329 
3330         mutex_enter(&nce->nce_lock);
3331         nce_fp_mp = nce->nce_fp_mp;
3332 
3333         if (nce_fp_mp != NULL) {
3334                 fp_mp = mp->b_cont;
3335                 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3336                     nce_fp_mp->b_datap->db_lim) {
3337                         mutex_exit(&nce->nce_lock);
3338                         nce = nce_delete_then_add(nce);
3339                         if (nce == NULL) {
3340                                 return;
3341                         }
3342                         mutex_enter(&nce->nce_lock);
3343                         nce_fp_mp = nce->nce_fp_mp;
3344                 }
3345         }
3346 
3347         /* Matched - install mp as the fastpath mp */
3348         if (nce_fp_mp == NULL) {
3349                 fp_mp = dupb(mp->b_cont);
3350                 nce->nce_fp_mp = fp_mp;
3351         } else {
3352                 fp_mp = mp->b_cont;
3353                 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3354                 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3355                     + MBLKL(fp_mp);
3356         }
3357         mutex_exit(&nce->nce_lock);
3358         nce_refrele(nce);
3359 }
3360 
3361 /*
3362  * Return a pointer to a given option in the packet.
3363  * Assumes that option part of the packet have already been validated.
3364  */
3365 nd_opt_hdr_t *
3366 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3367 {
3368         while (optlen > 0) {
3369                 if (opt->nd_opt_type == opt_type)
3370                         return (opt);
3371                 optlen -= 8 * opt->nd_opt_len;
3372                 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3373         }
3374         return (NULL);
3375 }
3376 
3377 /*
3378  * Verify all option lengths present are > 0, also check to see
3379  * if the option lengths and packet length are consistent.
3380  */
3381 boolean_t
3382 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3383 {
3384         ASSERT(opt != NULL);
3385         while (optlen > 0) {
3386                 if (opt->nd_opt_len == 0)
3387                         return (B_FALSE);
3388                 optlen -= 8 * opt->nd_opt_len;
3389                 if (optlen < 0)
3390                         return (B_FALSE);
3391                 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3392         }
3393         return (B_TRUE);
3394 }
3395 
3396 /*
3397  * ncec_walk function.
3398  * Free a fraction of the NCE cache entries.
3399  *
3400  * A possible optimization here would be to use ncec_last where possible, and
3401  * delete the least-frequently used entry, which would require more complex
3402  * computation as we walk through the ncec's (e.g., track ncec entries by
3403  * order of ncec_last and/or maintain state)
3404  */
3405 static void
3406 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3407 {
3408         ip_stack_t      *ipst = ncec->ncec_ipst;
3409         uint_t          fraction = *(uint_t *)arg;
3410         uint_t          rand;
3411 
3412         if ((ncec->ncec_flags &
3413             (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3414                 return;
3415         }
3416 
3417         rand = (uint_t)ddi_get_lbolt() +
3418             NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3419         if ((rand/fraction)*fraction == rand) {
3420                 IP_STAT(ipst, ip_nce_reclaim_deleted);
3421                 ncec_delete(ncec);
3422         }
3423 }
3424 
3425 /*
3426  * kmem_cache callback to free up memory.
3427  *
3428  * For now we just delete a fixed fraction.
3429  */
3430 static void
3431 ip_nce_reclaim_stack(ip_stack_t *ipst)
3432 {
3433         uint_t          fraction = ipst->ips_ip_nce_reclaim_fraction;
3434 
3435         IP_STAT(ipst, ip_nce_reclaim_calls);
3436 
3437         ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3438 
3439         /*
3440          * Walk all CONNs that can have a reference on an ire, ncec or dce.
3441          * Get them to update any stale references to drop any refholds they
3442          * have.
3443          */
3444         ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3445 }
3446 
3447 /*
3448  * Called by the memory allocator subsystem directly, when the system
3449  * is running low on memory.
3450  */
3451 /* ARGSUSED */
3452 void
3453 ip_nce_reclaim(void *args)
3454 {
3455         netstack_handle_t nh;
3456         netstack_t *ns;
3457         ip_stack_t *ipst;
3458 
3459         netstack_next_init(&nh);
3460         while ((ns = netstack_next(&nh)) != NULL) {
3461                 /*
3462                  * netstack_next() can return a netstack_t with a NULL
3463                  * netstack_ip at boot time.
3464                  */
3465                 if ((ipst = ns->netstack_ip) == NULL) {
3466                         netstack_rele(ns);
3467                         continue;
3468                 }
3469                 ip_nce_reclaim_stack(ipst);
3470                 netstack_rele(ns);
3471         }
3472         netstack_next_fini(&nh);
3473 }
3474 
#ifdef DEBUG
/*
 * Record a reference on `ncec' for reference tracing.  The caller must
 * hold ncec_lock.  If th_trace_ref() fails, tracing is disabled for this
 * ncec and the trace state is cleaned up.
 */
void
ncec_trace_ref(ncec_t *ncec)
{
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));

        if (!ncec->ncec_trace_disable &&
            !th_trace_ref(ncec, ncec->ncec_ipst)) {
                ncec->ncec_trace_disable = B_TRUE;
                ncec_trace_cleanup(ncec);
        }
}

/*
 * Record the release of a reference on `ncec' for reference tracing,
 * unless tracing has been disabled for it.  The caller must hold
 * ncec_lock.
 */
void
ncec_untrace_ref(ncec_t *ncec)
{
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));

        if (ncec->ncec_trace_disable)
                return;

        th_trace_unref(ncec);
}

/*
 * Hand the trace state of `ncec' to th_trace_cleanup(), passing along
 * whether tracing is disabled for this ncec.
 */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
        th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif
3505 
3506 /*
3507  * Called when address resolution fails due to a timeout.
3508  * Send an ICMP unreachable in response to all queued packets.
3509  */
3510 void
3511 arp_resolv_failed(ncec_t *ncec)
3512 {
3513         mblk_t  *mp, *nxt_mp;
3514         char    buf[INET6_ADDRSTRLEN];
3515         struct in_addr ipv4addr;
3516         ill_t *ill = ncec->ncec_ill;
3517         ip_stack_t *ipst = ncec->ncec_ipst;
3518         ip_recv_attr_t  iras;
3519 
3520         bzero(&iras, sizeof (iras));
3521         iras.ira_flags = IRAF_IS_IPV4;
3522         /*
3523          * we are setting the ira_rill to the ipmp_ill (instead of
3524          * the actual ill on which the packet was received), but this
3525          * is ok because we don't actually need the real ira_rill.
3526          * to send the icmp unreachable to the sender.
3527          */
3528         iras.ira_ill = iras.ira_rill = ill;
3529         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3530         iras.ira_rifindex = iras.ira_ruifindex;
3531 
3532         IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3533         ip3dbg(("arp_resolv_failed: dst %s\n",
3534             inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3535         mutex_enter(&ncec->ncec_lock);
3536         mp = ncec->ncec_qd_mp;
3537         ncec->ncec_qd_mp = NULL;
3538         ncec->ncec_nprobes = 0;
3539         mutex_exit(&ncec->ncec_lock);
3540         while (mp != NULL) {
3541                 nxt_mp = mp->b_next;
3542                 mp->b_next = NULL;
3543 
3544                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3545                 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3546                     mp, ill);
3547                 if (ipst->ips_ip_arp_icmp_error) {
3548                         ip3dbg(("arp_resolv_failed: "
3549                             "Calling icmp_unreachable\n"));
3550                         icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3551                 } else {
3552                         freemsg(mp);
3553                 }
3554                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3555                 mp = nxt_mp;
3556         }
3557         ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3558 }
3559 
3560 /*
3561  * if ill is an under_ill, translate it to the ipmp_ill and add the
3562  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3563  * one on the underlying in_ill) will be created for the
3564  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3565  */
3566 int
3567 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3568     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3569 {
3570         int     err;
3571         in6_addr_t addr6;
3572         ip_stack_t *ipst = ill->ill_ipst;
3573         nce_t   *nce, *upper_nce = NULL;
3574         ill_t   *in_ill = ill, *under = NULL;
3575         boolean_t need_ill_refrele = B_FALSE;
3576 
3577         if (flags & NCE_F_MCAST) {
3578                 /*
3579                  * hw_addr will be figured out in nce_set_multicast_v4;
3580                  * caller needs to pass in the cast_ill for ipmp
3581                  */
3582                 ASSERT(hw_addr == NULL);
3583                 ASSERT(!IS_IPMP(ill));
3584                 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3585                 return (err);
3586         }
3587 
3588         if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3589                 ill = ipmp_ill_hold_ipmp_ill(ill);
3590                 if (ill == NULL)
3591                         return (ENXIO);
3592                 need_ill_refrele = B_TRUE;
3593         }
3594         if ((flags & NCE_F_BCAST) != 0) {
3595                 /*
3596                  * IPv4 broadcast ncec: compute the hwaddr.
3597                  */
3598                 if (IS_IPMP(ill)) {
3599                         under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3600                         if (under == NULL)  {
3601                                 if (need_ill_refrele)
3602                                         ill_refrele(ill);
3603                                 return (ENETDOWN);
3604                         }
3605                         hw_addr = under->ill_bcast_mp->b_rptr +
3606                             NCE_LL_ADDR_OFFSET(under);
3607                         hw_addr_len = under->ill_phys_addr_length;
3608                 } else {
3609                         hw_addr = ill->ill_bcast_mp->b_rptr +
3610                             NCE_LL_ADDR_OFFSET(ill),
3611                             hw_addr_len = ill->ill_phys_addr_length;
3612                 }
3613         }
3614 
3615         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3616         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3617         nce = nce_lookup_addr(ill, &addr6);
3618         if (nce == NULL) {
3619                 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3620                     state, &nce);
3621         } else {
3622                 err = EEXIST;
3623         }
3624         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3625         if (err == 0)
3626                 err = nce_add_v4_postprocess(nce);
3627 
3628         if (in_ill != ill && nce != NULL) {
3629                 nce_t *under_nce = NULL;
3630 
3631                 /*
3632                  * in_ill was the under_ill. Try to create the under_nce.
3633                  * Hold the ill_g_lock to prevent changes to group membership
3634                  * until we are done.
3635                  */
3636                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3637                 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3638                         DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3639                             ill_t *, ill);
3640                         rw_exit(&ipst->ips_ill_g_lock);
3641                         err = ENXIO;
3642                         nce_refrele(nce);
3643                         nce = NULL;
3644                         goto bail;
3645                 }
3646                 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3647                 if (under_nce == NULL) {
3648                         rw_exit(&ipst->ips_ill_g_lock);
3649                         err = EINVAL;
3650                         nce_refrele(nce);
3651                         nce = NULL;
3652                         goto bail;
3653                 }
3654                 rw_exit(&ipst->ips_ill_g_lock);
3655                 upper_nce = nce;
3656                 nce = under_nce; /* will be returned to caller */
3657                 if (NCE_ISREACHABLE(nce->nce_common))
3658                         nce_fastpath_trigger(under_nce);
3659         }
3660         if (nce != NULL) {
3661                 if (newnce != NULL)
3662                         *newnce = nce;
3663                 else
3664                         nce_refrele(nce);
3665         }
3666 bail:
3667         if (under != NULL)
3668                 ill_refrele(under);
3669         if (upper_nce != NULL)
3670                 nce_refrele(upper_nce);
3671         if (need_ill_refrele)
3672                 ill_refrele(ill);
3673 
3674         return (err);
3675 }
3676 
3677 /*
3678  * NDP Cache Entry creation routine for IPv4.
3679  * This routine must always be called with ndp4->ndp_g_lock held.
3680  * Prior to return, ncec_refcnt is incremented.
3681  *
3682  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3683  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3684  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3685  * entries will be created, both pointing at the same ncec_t. The nce_t
3686  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3687  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3688  * Local addresses are always created on the ill passed to nce_add_v4.
3689  */
3690 int
3691 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3692     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3693 {
3694         int             err;
3695         boolean_t       is_multicast = (flags & NCE_F_MCAST);
3696         struct in6_addr addr6;
3697         nce_t           *nce;
3698 
3699         ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3700         ASSERT(!ill->ill_isv6);
3701         ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3702 
3703         IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3704         err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3705             &nce);
3706         ASSERT(newnce != NULL);
3707         *newnce = nce;
3708         return (err);
3709 }
3710 
3711 /*
3712  * Post-processing routine to be executed after nce_add_v4(). This function
3713  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3714  * and must be called without any locks held.
3715  *
3716  * Always returns 0, but we return an int to keep this symmetric with the
3717  * IPv6 counter-part.
3718  */
3719 int
3720 nce_add_v4_postprocess(nce_t *nce)
3721 {
3722         ncec_t          *ncec = nce->nce_common;
3723         uint16_t        flags = ncec->ncec_flags;
3724         boolean_t       ndp_need_dad = B_FALSE;
3725         boolean_t       dropped;
3726         clock_t         delay;
3727         ip_stack_t      *ipst = ncec->ncec_ill->ill_ipst;
3728         uchar_t         *hw_addr = ncec->ncec_lladdr;
3729         boolean_t       trigger_fastpath = B_TRUE;
3730 
3731         /*
3732          * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3733          * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3734          * We call nce_fastpath from nce_update if the link layer address of
3735          * the peer changes from nce_update
3736          */
3737         if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3738             ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3739                 trigger_fastpath = B_FALSE;
3740 
3741         if (trigger_fastpath)
3742                 nce_fastpath_trigger(nce);
3743 
3744         if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3745                 /*
3746                  * Either the caller (by passing in ND_PROBE)
3747                  * or nce_add_common() (by the internally computed state
3748                  * based on ncec_addr and ill_net_type) has determined
3749                  * that this unicast entry needs DAD. Trigger DAD.
3750                  */
3751                 ndp_need_dad = B_TRUE;
3752         } else if (flags & NCE_F_UNSOL_ADV) {
3753                 /*
3754                  * We account for the transmit below by assigning one
3755                  * less than the ndd variable. Subsequent decrements
3756                  * are done in nce_timer.
3757                  */
3758                 mutex_enter(&ncec->ncec_lock);
3759                 ncec->ncec_unsolicit_count =
3760                     ipst->ips_ip_arp_publish_count - 1;
3761                 mutex_exit(&ncec->ncec_lock);
3762                 dropped = arp_announce(ncec);
3763                 mutex_enter(&ncec->ncec_lock);
3764                 if (dropped)
3765                         ncec->ncec_unsolicit_count++;
3766                 else
3767                         ncec->ncec_last_time_defended = ddi_get_lbolt();
3768                 if (ncec->ncec_unsolicit_count != 0) {
3769                         nce_start_timer(ncec,
3770                             ipst->ips_ip_arp_publish_interval);
3771                 }
3772                 mutex_exit(&ncec->ncec_lock);
3773         }
3774 
3775         /*
3776          * If ncec_xmit_interval is 0, user has configured us to send the first
3777          * probe right away.  Do so, and set up for the subsequent probes.
3778          */
3779         if (ndp_need_dad) {
3780                 mutex_enter(&ncec->ncec_lock);
3781                 if (ncec->ncec_pcnt == 0) {
3782                         /*
3783                          * DAD probes and announce can be
3784                          * administratively disabled by setting the
3785                          * probe_count to zero. Restart the timer in
3786                          * this case to mark the ipif as ready.
3787                          */
3788                         ncec->ncec_unsolicit_count = 0;
3789                         mutex_exit(&ncec->ncec_lock);
3790                         nce_restart_timer(ncec, 0);
3791                 } else {
3792                         mutex_exit(&ncec->ncec_lock);
3793                         delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3794                             ipst->ips_arp_probe_delay :
3795                             ipst->ips_arp_fastprobe_delay);
3796                         nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3797                 }
3798         }
3799         return (0);
3800 }
3801 
3802 /*
3803  * ncec_walk routine to update all entries that have a given destination or
3804  * gateway address and cached link layer (MAC) address.  This is used when ARP
3805  * informs us that a network-to-link-layer mapping may have changed.
3806  */
3807 void
3808 nce_update_hw_changed(ncec_t *ncec, void *arg)
3809 {
3810         nce_hw_map_t *hwm = arg;
3811         ipaddr_t ncec_addr;
3812 
3813         if (ncec->ncec_state != ND_REACHABLE)
3814                 return;
3815 
3816         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3817         if (ncec_addr != hwm->hwm_addr)
3818                 return;
3819 
3820         mutex_enter(&ncec->ncec_lock);
3821         if (hwm->hwm_flags != 0)
3822                 ncec->ncec_flags = hwm->hwm_flags;
3823         nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3824         mutex_exit(&ncec->ncec_lock);
3825 }
3826 
3827 void
3828 ncec_refhold(ncec_t *ncec)
3829 {
3830         mutex_enter(&(ncec)->ncec_lock);
3831         (ncec)->ncec_refcnt++;
3832         ASSERT((ncec)->ncec_refcnt != 0);
3833 #ifdef DEBUG
3834         ncec_trace_ref(ncec);
3835 #endif
3836         mutex_exit(&(ncec)->ncec_lock);
3837 }
3838 
3839 void
3840 ncec_refhold_notr(ncec_t *ncec)
3841 {
3842         mutex_enter(&(ncec)->ncec_lock);
3843         (ncec)->ncec_refcnt++;
3844         ASSERT((ncec)->ncec_refcnt != 0);
3845         mutex_exit(&(ncec)->ncec_lock);
3846 }
3847 
3848 static void
3849 ncec_refhold_locked(ncec_t *ncec)
3850 {
3851         ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3852         (ncec)->ncec_refcnt++;
3853 #ifdef DEBUG
3854         ncec_trace_ref(ncec);
3855 #endif
3856 }
3857 
3858 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3859 void
3860 ncec_refrele(ncec_t *ncec)
3861 {
3862         mutex_enter(&(ncec)->ncec_lock);
3863 #ifdef DEBUG
3864         ncec_untrace_ref(ncec);
3865 #endif
3866         ASSERT((ncec)->ncec_refcnt != 0);
3867         if (--(ncec)->ncec_refcnt == 0) {
3868                 ncec_inactive(ncec);
3869         } else {
3870                 mutex_exit(&(ncec)->ncec_lock);
3871         }
3872 }
3873 
3874 void
3875 ncec_refrele_notr(ncec_t *ncec)
3876 {
3877         mutex_enter(&(ncec)->ncec_lock);
3878         ASSERT((ncec)->ncec_refcnt != 0);
3879         if (--(ncec)->ncec_refcnt == 0) {
3880                 ncec_inactive(ncec);
3881         } else {
3882                 mutex_exit(&(ncec)->ncec_lock);
3883         }
3884 }
3885 
3886 /*
3887  * Common to IPv4 and IPv6.
3888  */
3889 void
3890 nce_restart_timer(ncec_t *ncec, uint_t ms)
3891 {
3892         timeout_id_t tid;
3893 
3894         ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3895 
3896         /* First cancel any running timer */
3897         mutex_enter(&ncec->ncec_lock);
3898         tid = ncec->ncec_timeout_id;
3899         ncec->ncec_timeout_id = 0;
3900         if (tid != 0) {
3901                 mutex_exit(&ncec->ncec_lock);
3902                 (void) untimeout(tid);
3903                 mutex_enter(&ncec->ncec_lock);
3904         }
3905 
3906         /* Restart timer */
3907         nce_start_timer(ncec, ms);
3908         mutex_exit(&ncec->ncec_lock);
3909 }
3910 
3911 static void
3912 nce_start_timer(ncec_t *ncec, uint_t ms)
3913 {
3914         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3915         /*
3916          * Don't start the timer if the ncec has been deleted, or if the timer
3917          * is already running
3918          */
3919         if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3920                 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3921                     MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3922         }
3923 }
3924 
3925 int
3926 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3927     uint16_t flags, nce_t **newnce)
3928 {
3929         uchar_t         *hw_addr;
3930         int             err = 0;
3931         ip_stack_t      *ipst = ill->ill_ipst;
3932         in6_addr_t      dst6;
3933         nce_t           *nce;
3934 
3935         ASSERT(!ill->ill_isv6);
3936 
3937         IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3938         mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3939         if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3940                 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3941                 goto done;
3942         }
3943         if (ill->ill_net_type == IRE_IF_RESOLVER) {
3944                 /*
3945                  * For IRE_IF_RESOLVER a hardware mapping can be
3946                  * generated, for IRE_IF_NORESOLVER, resolution cookie
3947                  * in the ill is copied in nce_add_v4().
3948                  */
3949                 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3950                 if (hw_addr == NULL) {
3951                         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3952                         return (ENOMEM);
3953                 }
3954                 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3955         } else {
3956                 /*
3957                  * IRE_IF_NORESOLVER type simply copies the resolution
3958                  * cookie passed in.  So no hw_addr is needed.
3959                  */
3960                 hw_addr = NULL;
3961         }
3962         ASSERT(flags & NCE_F_MCAST);
3963         ASSERT(flags & NCE_F_NONUD);
3964         /* nce_state will be computed by nce_add_common() */
3965         err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3966             ND_UNCHANGED, &nce);
3967         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3968         if (err == 0)
3969                 err = nce_add_v4_postprocess(nce);
3970         if (hw_addr != NULL)
3971                 kmem_free(hw_addr, ill->ill_phys_addr_length);
3972         if (err != 0) {
3973                 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3974                 return (err);
3975         }
3976 done:
3977         if (newnce != NULL)
3978                 *newnce = nce;
3979         else
3980                 nce_refrele(nce);
3981         return (0);
3982 }
3983 
3984 /*
3985  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3986  * don't want to have to walk the list for every single one, so we gather up
3987  * batches at a time.
3988  */
3989 #define NCE_RESCHED_LIST_LEN    8
3990 
3991 typedef struct {
3992         ill_t   *ncert_ill;
3993         uint_t  ncert_num;
3994         ncec_t  *ncert_nces[NCE_RESCHED_LIST_LEN];
3995 } nce_resched_t;
3996 
3997 /*
3998  * Pick the longest waiting NCEs for defense.
3999  */
4000 /* ARGSUSED */
4001 static int
4002 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4003 {
4004         nce_resched_t *ncert = arg;
4005         ncec_t **ncecs;
4006         ncec_t **ncec_max;
4007         ncec_t *ncec_temp;
4008         ncec_t *ncec = nce->nce_common;
4009 
4010         ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4011         /*
4012          * Only reachable entries that are ready for announcement are eligible.
4013          */
4014         if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4015                 return (0);
4016         if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4017                 ncec_refhold(ncec);
4018                 ncert->ncert_nces[ncert->ncert_num++] = ncec;
4019         } else {
4020                 ncecs = ncert->ncert_nces;
4021                 ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4022                 ncec_refhold(ncec);
4023                 for (; ncecs < ncec_max; ncecs++) {
4024                         ASSERT(ncec != NULL);
4025                         if ((*ncecs)->ncec_last_time_defended >
4026                             ncec->ncec_last_time_defended) {
4027                                 ncec_temp = *ncecs;
4028                                 *ncecs = ncec;
4029                                 ncec = ncec_temp;
4030                         }
4031                 }
4032                 ncec_refrele(ncec);
4033         }
4034         return (0);
4035 }
4036 
4037 /*
4038  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4039  * doesn't happen very often (if at all), and thus it needn't be highly
4040  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4041  * outer loop is bounded by a constant rather than by the length of the list.)
4042  */
4043 static void
4044 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4045 {
4046         ncec_t          *ncec;
4047         ip_stack_t      *ipst = ill->ill_ipst;
4048         uint_t          i, defend_rate;
4049 
4050         i = ill->ill_defend_count;
4051         ill->ill_defend_count = 0;
4052         if (ill->ill_isv6)
4053                 defend_rate = ipst->ips_ndp_defend_rate;
4054         else
4055                 defend_rate = ipst->ips_arp_defend_rate;
4056         /* If none could be sitting around, then don't reschedule */
4057         if (i < defend_rate) {
4058                 DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4059                 return;
4060         }
4061         ncert->ncert_ill = ill;
4062         while (ill->ill_defend_count < defend_rate) {
4063                 nce_walk_common(ill, ncec_reschedule, ncert);
4064                 for (i = 0; i < ncert->ncert_num; i++) {
4065 
4066                         ncec = ncert->ncert_nces[i];
4067                         mutex_enter(&ncec->ncec_lock);
4068                         ncec->ncec_flags |= NCE_F_DELAYED;
4069                         mutex_exit(&ncec->ncec_lock);
4070                         /*
4071                          * we plan to schedule this ncec, so incr the
4072                          * defend_count in anticipation.
4073                          */
4074                         if (++ill->ill_defend_count >= defend_rate)
4075                                 break;
4076                 }
4077                 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4078                         break;
4079         }
4080 }
4081 
4082 /*
4083  * Check if the current rate-limiting parameters permit the sending
4084  * of another address defense announcement for both IPv4 and IPv6.
4085  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4086  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4087  * determines how many address defense announcements are permitted
4088  * in any `defense_perio' interval.
4089  */
4090 static boolean_t
4091 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4092 {
4093         clock_t         now = ddi_get_lbolt();
4094         ip_stack_t      *ipst = ill->ill_ipst;
4095         clock_t         start = ill->ill_defend_start;
4096         uint32_t        elapsed, defend_period, defend_rate;
4097         nce_resched_t   ncert;
4098         boolean_t       ret;
4099         int             i;
4100 
4101         if (ill->ill_isv6) {
4102                 defend_period = ipst->ips_ndp_defend_period;
4103                 defend_rate = ipst->ips_ndp_defend_rate;
4104         } else {
4105                 defend_period = ipst->ips_arp_defend_period;
4106                 defend_rate = ipst->ips_arp_defend_rate;
4107         }
4108         if (defend_rate == 0)
4109                 return (B_TRUE);
4110         bzero(&ncert, sizeof (ncert));
4111         mutex_enter(&ill->ill_lock);
4112         if (start > 0) {
4113                 elapsed = now - start;
4114                 if (elapsed > SEC_TO_TICK(defend_period)) {
4115                         ill->ill_defend_start = now;
4116                         /*
4117                          * nce_ill_reschedule will attempt to
4118                          * prevent starvation by reschduling the
4119                          * oldest entries, which are marked with
4120                          * the NCE_F_DELAYED flag.
4121                          */
4122                         nce_ill_reschedule(ill, &ncert);
4123                 }
4124         } else {
4125                 ill->ill_defend_start = now;
4126         }
4127         ASSERT(ill->ill_defend_count <= defend_rate);
4128         mutex_enter(&ncec->ncec_lock);
4129         if (ncec->ncec_flags & NCE_F_DELAYED) {
4130                 /*
4131                  * This ncec was rescheduled as one of the really old
4132                  * entries needing on-going defense. The
4133                  * ill_defend_count was already incremented in
4134                  * nce_ill_reschedule. Go ahead and send the announce.
4135                  */
4136                 ncec->ncec_flags &= ~NCE_F_DELAYED;
4137                 mutex_exit(&ncec->ncec_lock);
4138                 ret = B_FALSE;
4139                 goto done;
4140         }
4141         mutex_exit(&ncec->ncec_lock);
4142         if (ill->ill_defend_count < defend_rate)
4143                 ill->ill_defend_count++;
4144         if (ill->ill_defend_count == defend_rate) {
4145                 /*
4146                  * we are no longer allowed to send unbidden defense
4147                  * messages. Wait for rescheduling.
4148                  */
4149                 ret = B_TRUE;
4150         } else {
4151                 ret = B_FALSE;
4152         }
4153 done:
4154         mutex_exit(&ill->ill_lock);
4155         /*
4156          * After all the locks have been dropped we can restart nce timer,
4157          * and refrele the delayed ncecs
4158          */
4159         for (i = 0; i < ncert.ncert_num; i++) {
4160                 clock_t xmit_interval;
4161                 ncec_t  *tmp;
4162 
4163                 tmp = ncert.ncert_nces[i];
4164                 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4165                     B_FALSE);
4166                 nce_restart_timer(tmp, xmit_interval);
4167                 ncec_refrele(tmp);
4168         }
4169         return (ret);
4170 }
4171 
4172 boolean_t
4173 ndp_announce(ncec_t *ncec)
4174 {
4175         return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4176             ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4177             nce_advert_flags(ncec)));
4178 }
4179 
4180 ill_t *
4181 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4182 {
4183         mblk_t          *mp;
4184         in6_addr_t      src6;
4185         ipaddr_t        src4;
4186         ill_t           *ill = ncec->ncec_ill;
4187         ill_t           *src_ill = NULL;
4188         ipif_t          *ipif = NULL;
4189         boolean_t       is_myaddr = NCE_MYADDR(ncec);
4190         boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4191 
4192         ASSERT(src != NULL);
4193         ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4194         src6 = *src;
4195         if (is_myaddr) {
4196                 src6 = ncec->ncec_addr;
4197                 if (!isv6)
4198                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4199         } else {
4200                 /*
4201                  * try to find one from the outgoing packet.
4202                  */
4203                 mutex_enter(&ncec->ncec_lock);
4204                 mp = ncec->ncec_qd_mp;
4205                 if (mp != NULL) {
4206                         if (isv6) {
4207                                 ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
4208 
4209                                 src6 = ip6h->ip6_src;
4210                         } else {
4211                                 ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4212 
4213                                 src4 = ipha->ipha_src;
4214                                 IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4215                         }
4216                 }
4217                 mutex_exit(&ncec->ncec_lock);
4218         }
4219 
4220         /*
4221          * For outgoing packets, if the src of outgoing packet is one
4222          * of the assigned interface addresses use it, otherwise we
4223          * will pick the source address below.
4224          * For local addresses (is_myaddr) doing DAD, NDP announce
4225          * messages are mcast. So we use the (IPMP) cast_ill or the
4226          * (non-IPMP) ncec_ill for these message types. The only case
4227          * of unicast DAD messages are for IPv6 ND probes, for which
4228          * we find the ipif_bound_ill corresponding to the ncec_addr.
4229          */
4230         if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4231                 if (isv6) {
4232                         ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4233                             ill->ill_ipst);
4234                 } else {
4235                         ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4236                             ill->ill_ipst);
4237                 }
4238 
4239                 /*
4240                  * If no relevant ipif can be found, then it's not one of our
4241                  * addresses.  Reset to :: and try to find a src for the NS or
4242                  * ARP request using ipif_select_source_v[4,6]  below.
4243                  * If an ipif can be found, but it's not yet done with
4244                  * DAD verification, and we are not being invoked for
4245                  * DAD (i.e., !is_myaddr), then just postpone this
4246                  * transmission until later.
4247                  */
4248                 if (ipif == NULL) {
4249                         src6 = ipv6_all_zeros;
4250                         src4 = INADDR_ANY;
4251                 } else if (!ipif->ipif_addr_ready && !is_myaddr) {
4252                         DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4253                             ncec_t *, ncec, ipif_t *, ipif);
4254                         ipif_refrele(ipif);
4255                         return (NULL);
4256                 }
4257         }
4258 
4259         if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4260                 /*
4261                  * Pick a source address for this solicitation, but
4262                  * restrict the selection to addresses assigned to the
4263                  * output interface.  We do this because the destination will
4264                  * create a neighbor cache entry for the source address of
4265                  * this packet, so the source address had better be a valid
4266                  * neighbor.
4267                  */
4268                 if (isv6) {
4269                         ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4270                             B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4271                             B_FALSE, NULL);
4272                 } else {
4273                         ipaddr_t nce_addr;
4274 
4275                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4276                         ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4277                             B_FALSE, NULL);
4278                 }
4279                 if (ipif == NULL && IS_IPMP(ill)) {
4280                         ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4281 
4282                         if (send_ill != NULL) {
4283                                 if (isv6) {
4284                                         ipif = ipif_select_source_v6(send_ill,
4285                                             &ncec->ncec_addr, B_TRUE,
4286                                             IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4287                                             B_FALSE, NULL);
4288                                 } else {
4289                                         IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4290                                             src4);
4291                                         ipif = ipif_select_source_v4(send_ill,
4292                                             src4, ALL_ZONES, B_TRUE, NULL);
4293                                 }
4294                                 ill_refrele(send_ill);
4295                         }
4296                 }
4297 
4298                 if (ipif == NULL) {
4299                         char buf[INET6_ADDRSTRLEN];
4300 
4301                         ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4302                             inet_ntop((isv6 ? AF_INET6 : AF_INET),
4303                             (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4304                         DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4305                         return (NULL);
4306                 }
4307                 src6 = ipif->ipif_v6lcl_addr;
4308         }
4309         *src = src6;
4310         if (ipif != NULL) {
4311                 src_ill = ipif->ipif_ill;
4312                 if (IS_IPMP(src_ill))
4313                         src_ill = ipmp_ipif_hold_bound_ill(ipif);
4314                 else
4315                         ill_refhold(src_ill);
4316                 ipif_refrele(ipif);
4317                 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4318                     ill_t *, src_ill);
4319         }
4320         return (src_ill);
4321 }
4322 
4323 void
4324 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4325     uchar_t *hwaddr, int hwaddr_len, int flags)
4326 {
4327         ill_t   *ill;
4328         ncec_t  *ncec;
4329         nce_t   *nce;
4330         uint16_t new_state;
4331 
4332         ill = (ipif ? ipif->ipif_ill : NULL);
4333         if (ill != NULL) {
4334                 /*
4335                  * only one ncec is possible
4336                  */
4337                 nce = nce_lookup_v4(ill, addr);
4338                 if (nce != NULL) {
4339                         ncec = nce->nce_common;
4340                         mutex_enter(&ncec->ncec_lock);
4341                         if (NCE_ISREACHABLE(ncec))
4342                                 new_state = ND_UNCHANGED;
4343                         else
4344                                 new_state = ND_STALE;
4345                         ncec->ncec_flags = flags;
4346                         nce_update(ncec, new_state, hwaddr);
4347                         mutex_exit(&ncec->ncec_lock);
4348                         nce_refrele(nce);
4349                         return;
4350                 }
4351         } else {
4352                 /*
4353                  * ill is wildcard; clean up all ncec's and ire's
4354                  * that match on addr.
4355                  */
4356                 nce_hw_map_t hwm;
4357 
4358                 hwm.hwm_addr = *addr;
4359                 hwm.hwm_hwlen = hwaddr_len;
4360                 hwm.hwm_hwaddr = hwaddr;
4361                 hwm.hwm_flags = flags;
4362 
4363                 ncec_walk_common(ipst->ips_ndp4, NULL,
4364                     (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4365         }
4366 }
4367 
4368 /*
4369  * Common function to add ncec entries.
4370  * we always add the ncec with ncec_ill == ill, and always create
4371  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4372  * ncec is !reachable.
4373  *
4374  * When the caller passes in an nce_state of ND_UNCHANGED,
4375  * nce_add_common() will determine the state of the created nce based
4376  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4377  * be created with state set to the passed in nce_state.
4378  */
4379 static int
4380 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4381     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4382 {
4383         static  ncec_t          nce_nil;
4384         uchar_t                 *template = NULL;
4385         int                     err;
4386         ncec_t                  *ncec;
4387         ncec_t                  **ncep;
4388         ip_stack_t              *ipst = ill->ill_ipst;
4389         uint16_t                state;
4390         boolean_t               fastprobe = B_FALSE;
4391         struct ndp_g_s          *ndp;
4392         nce_t                   *nce = NULL;
4393         mblk_t                  *dlur_mp = NULL;
4394 
4395         if (ill->ill_isv6)
4396                 ndp = ill->ill_ipst->ips_ndp6;
4397         else
4398                 ndp = ill->ill_ipst->ips_ndp4;
4399 
4400         *retnce = NULL;
4401 
4402         ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4403 
4404         if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4405                 ip0dbg(("nce_add_common: no addr\n"));
4406                 return (EINVAL);
4407         }
4408         if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4409                 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4410                 return (EINVAL);
4411         }
4412 
4413         if (ill->ill_isv6) {
4414                 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4415         } else {
4416                 ipaddr_t v4addr;
4417 
4418                 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4419                 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4420         }
4421 
4422         /*
4423          * The caller has ensured that there is no nce on ill, but there could
4424          * still be an nce_common_t for the address, so that we find exisiting
4425          * ncec_t strucutures first, and atomically add a new nce_t if
4426          * one is found. The ndp_g_lock ensures that we don't cross threads
4427          * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4428          * compare for matches across the illgrp because this function is
4429          * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4430          * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4431          * appropriate.
4432          */
4433         ncec = *ncep;
4434         for (; ncec != NULL; ncec = ncec->ncec_next) {
4435                 if (ncec->ncec_ill == ill) {
4436                         if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4437                                 /*
4438                                  * We should never find *retnce to be
4439                                  * MYADDR, since the caller may then
4440                                  * incorrectly restart a DAD timer that's
4441                                  * already running.  However, if we are in
4442                                  * forwarding mode, and the interface is
4443                                  * moving in/out of groups, the data
4444                                  * path ire lookup (e.g., ire_revalidate_nce)
4445                                  * may  have determined that some destination
4446                                  * is offlink while the control path is adding
4447                                  * that address as a local address.
4448                                  * Recover from  this case by failing the
4449                                  * lookup
4450                                  */
4451                                 if (NCE_MYADDR(ncec))
4452                                         return (ENXIO);
4453                                 *retnce = nce_ill_lookup_then_add(ill, ncec);
4454                                 if (*retnce != NULL)
4455                                         break;
4456                         }
4457                 }
4458         }
4459         if (*retnce != NULL) /* caller must trigger fastpath on nce */
4460                 return (0);
4461 
4462         ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4463         if (ncec == NULL)
4464                 return (ENOMEM);
4465         *ncec = nce_nil;
4466         ncec->ncec_ill = ill;
4467         ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4468         ncec->ncec_flags = flags;
4469         ncec->ncec_ipst = ipst;      /* No netstack_hold */
4470 
4471         if (!ill->ill_isv6) {
4472                 ipaddr_t addr4;
4473 
4474                 /*
4475                  * DAD probe interval and probe count are set based on
4476                  * fast/slow probe settings. If the underlying link doesn't
4477                  * have reliably up/down notifications or if we're working
4478                  * with IPv4 169.254.0.0/16 Link Local Address space, then
4479                  * don't use the fast timers.  Otherwise, use them.
4480                  */
4481                 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4482                 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4483                 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4484                         fastprobe = B_TRUE;
4485                 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4486                     !IS_IPV4_LL_SPACE(&addr4)) {
4487                         ill_t *hwaddr_ill;
4488 
4489                         hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4490                             hw_addr_len);
4491                         if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4492                                 fastprobe = B_TRUE;
4493                 }
4494                 if (fastprobe) {
4495                         ncec->ncec_xmit_interval =
4496                             ipst->ips_arp_fastprobe_interval;
4497                         ncec->ncec_pcnt =
4498                             ipst->ips_arp_fastprobe_count;
4499                         ncec->ncec_flags |= NCE_F_FAST;
4500                 } else {
4501                         ncec->ncec_xmit_interval =
4502                             ipst->ips_arp_probe_interval;
4503                         ncec->ncec_pcnt =
4504                             ipst->ips_arp_probe_count;
4505                 }
4506                 if (NCE_PUBLISH(ncec)) {
4507                         ncec->ncec_unsolicit_count =
4508                             ipst->ips_ip_arp_publish_count;
4509                 }
4510         } else {
4511                 /*
4512                  * probe interval is constant: ILL_PROBE_INTERVAL
4513                  * probe count is constant: ND_MAX_UNICAST_SOLICIT
4514                  */
4515                 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4516                 if (NCE_PUBLISH(ncec)) {
4517                         ncec->ncec_unsolicit_count =
4518                             ipst->ips_ip_ndp_unsolicit_count;
4519                 }
4520         }
4521         ncec->ncec_rcnt = ill->ill_xmit_count;
4522         ncec->ncec_addr = *addr;
4523         ncec->ncec_qd_mp = NULL;
4524         ncec->ncec_refcnt = 1; /* for ncec getting created */
4525         mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4526         ncec->ncec_trace_disable = B_FALSE;
4527 
4528         /*
4529          * ncec_lladdr holds link layer address
4530          */
4531         if (hw_addr_len > 0) {
4532                 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4533                 if (template == NULL) {
4534                         err = ENOMEM;
4535                         goto err_ret;
4536                 }
4537                 ncec->ncec_lladdr = template;
4538                 ncec->ncec_lladdr_length = hw_addr_len;
4539                 bzero(ncec->ncec_lladdr, hw_addr_len);
4540         }
4541         if ((flags & NCE_F_BCAST) != 0) {
4542                 state = ND_REACHABLE;
4543                 ASSERT(hw_addr_len > 0);
4544         } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4545                 state = ND_INITIAL;
4546         } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4547                 /*
4548                  * NORESOLVER entries are always created in the REACHABLE
4549                  * state.
4550                  */
4551                 state = ND_REACHABLE;
4552                 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4553                     ill->ill_mactype != DL_IPV4 &&
4554                     ill->ill_mactype != DL_6TO4) {
4555                         /*
4556                          * We create a nce_res_mp with the IP nexthop address
4557                          * as the destination address if the physical length
4558                          * is exactly 4 bytes for point-to-multipoint links
4559                          * that do their own resolution from IP to link-layer
4560                          * address (e.g. IP over X.25).
4561                          */
4562                         bcopy((uchar_t *)addr,
4563                             ncec->ncec_lladdr, ill->ill_phys_addr_length);
4564                 }
4565                 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4566                     ill->ill_mactype != DL_IPV6) {
4567                         /*
4568                          * We create a nce_res_mp with the IP nexthop address
                         * as the destination address if the physical length
4570                          * is exactly 16 bytes for point-to-multipoint links
4571                          * that do their own resolution from IP to link-layer
4572                          * address.
4573                          */
4574                         bcopy((uchar_t *)addr,
4575                             ncec->ncec_lladdr, ill->ill_phys_addr_length);
4576                 }
4577                 /*
4578                  * Since NUD is not part of the base IPv4 protocol definition,
4579                  * IPv4 neighbor entries on NORESOLVER interfaces will never
4580                  * age, and are marked NCE_F_NONUD.
4581                  */
4582                 if (!ill->ill_isv6)
4583                         ncec->ncec_flags |= NCE_F_NONUD;
4584         } else if (ill->ill_net_type == IRE_LOOPBACK) {
4585                 state = ND_REACHABLE;
4586         }
4587 
4588         if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4589                 /*
4590                  * We are adding an ncec with a deterministic hw_addr,
4591                  * so the state can only be one of {REACHABLE, STALE, PROBE}.
4592                  *
4593                  * if we are adding a unicast ncec for the local address
4594                  * it would be REACHABLE; we would be adding a ND_STALE entry
4595                  * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4596                  * addresses are added in PROBE to trigger DAD.
4597                  */
4598                 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4599                     ill->ill_net_type == IRE_IF_NORESOLVER)
4600                         state = ND_REACHABLE;
4601                 else if (!NCE_PUBLISH(ncec))
4602                         state = ND_STALE;
4603                 else
4604                         state = ND_PROBE;
4605                 if (hw_addr != NULL)
4606                         nce_set_ll(ncec, hw_addr);
4607         }
4608         /* caller overrides internally computed state */
4609         if (nce_state != ND_UNCHANGED)
4610                 state = nce_state;
4611 
4612         if (state == ND_PROBE)
4613                 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4614 
4615         ncec->ncec_state = state;
4616 
4617         if (state == ND_REACHABLE) {
4618                 ncec->ncec_last = ncec->ncec_init_time =
4619                     TICK_TO_MSEC(ddi_get_lbolt64());
4620         } else {
4621                 ncec->ncec_last = 0;
4622                 if (state == ND_INITIAL)
4623                         ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4624         }
4625         list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4626             offsetof(ncec_cb_t, ncec_cb_node));
4627         /*
4628          * have all the memory allocations out of the way before taking locks
4629          * and adding the nce.
4630          */
4631         nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4632         if (nce == NULL) {
4633                 err = ENOMEM;
4634                 goto err_ret;
4635         }
4636         if (ncec->ncec_lladdr != NULL ||
4637             ill->ill_net_type == IRE_IF_NORESOLVER) {
4638                 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4639                     ill->ill_phys_addr_length, ill->ill_sap,
4640                     ill->ill_sap_length);
4641                 if (dlur_mp == NULL) {
4642                         err = ENOMEM;
4643                         goto err_ret;
4644                 }
4645         }
4646 
4647         /*
4648          * Atomically ensure that the ill is not CONDEMNED, before
4649          * adding the NCE.
4650          */
4651         mutex_enter(&ill->ill_lock);
4652         if (ill->ill_state_flags & ILL_CONDEMNED) {
4653                 mutex_exit(&ill->ill_lock);
4654                 err = EINVAL;
4655                 goto err_ret;
4656         }
4657         if (!NCE_MYADDR(ncec) &&
4658             (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4659                 mutex_exit(&ill->ill_lock);
4660                 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4661                 err = EINVAL;
4662                 goto err_ret;
4663         }
4664         /*
4665          * Acquire the ncec_lock even before adding the ncec to the list
4666          * so that it cannot get deleted after the ncec is added, but
4667          * before we add the nce.
4668          */
4669         mutex_enter(&ncec->ncec_lock);
4670         if ((ncec->ncec_next = *ncep) != NULL)
4671                 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4672         *ncep = ncec;
4673         ncec->ncec_ptpn = ncep;
4674 
4675         /* Bump up the number of ncec's referencing this ill */
4676         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4677             (char *), "ncec", (void *), ncec);
4678         ill->ill_ncec_cnt++;
4679         /*
4680          * Since we hold the ncec_lock at this time, the ncec cannot be
4681          * condemned, and we can safely add the nce.
4682          */
4683         *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4684         mutex_exit(&ncec->ncec_lock);
4685         mutex_exit(&ill->ill_lock);
4686 
4687         /* caller must trigger fastpath on *retnce */
4688         return (0);
4689 
4690 err_ret:
4691         if (ncec != NULL)
4692                 kmem_cache_free(ncec_cache, ncec);
4693         if (nce != NULL)
4694                 kmem_cache_free(nce_cache, nce);
4695         freemsg(dlur_mp);
4696         if (template != NULL)
4697                 kmem_free(template, ill->ill_phys_addr_length);
4698         return (err);
4699 }
4700 
4701 /*
4702  * take a ref on the nce
4703  */
4704 void
4705 nce_refhold(nce_t *nce)
4706 {
4707         mutex_enter(&nce->nce_lock);
4708         nce->nce_refcnt++;
4709         ASSERT((nce)->nce_refcnt != 0);
4710         mutex_exit(&nce->nce_lock);
4711 }
4712 
4713 /*
4714  * release a ref on the nce; In general, this
4715  * cannot be called with locks held because nce_inactive
4716  * may result in nce_inactive which will take the ill_lock,
4717  * do ipif_ill_refrele_tail etc. Thus the one exception
4718  * where this can be called with locks held is when the caller
4719  * is certain that the nce_refcnt is sufficient to prevent
4720  * the invocation of nce_inactive.
4721  */
4722 void
4723 nce_refrele(nce_t *nce)
4724 {
4725         ASSERT((nce)->nce_refcnt != 0);
4726         mutex_enter(&nce->nce_lock);
4727         if (--nce->nce_refcnt == 0)
4728                 nce_inactive(nce); /* destroys the mutex */
4729         else
4730                 mutex_exit(&nce->nce_lock);
4731 }
4732 
4733 /*
4734  * free the nce after all refs have gone away.
4735  */
4736 static void
4737 nce_inactive(nce_t *nce)
4738 {
4739         ill_t *ill = nce->nce_ill;
4740 
4741         ASSERT(nce->nce_refcnt == 0);
4742 
4743         ncec_refrele_notr(nce->nce_common);
4744         nce->nce_common = NULL;
4745         freemsg(nce->nce_fp_mp);
4746         freemsg(nce->nce_dlur_mp);
4747 
4748         mutex_enter(&ill->ill_lock);
4749         DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4750             (char *), "nce", (void *), nce);
4751         ill->ill_nce_cnt--;
4752         nce->nce_ill = NULL;
4753         /*
4754          * If the number of ncec's associated with this ill have dropped
4755          * to zero, check whether we need to restart any operation that
4756          * is waiting for this to happen.
4757          */
4758         if (ILL_DOWN_OK(ill)) {
4759                 /* ipif_ill_refrele_tail drops the ill_lock */
4760                 ipif_ill_refrele_tail(ill);
4761         } else {
4762                 mutex_exit(&ill->ill_lock);
4763         }
4764 
4765         mutex_destroy(&nce->nce_lock);
4766         kmem_cache_free(nce_cache, nce);
4767 }
4768 
4769 /*
4770  * Add an nce to the ill_nce list.
4771  */
4772 static nce_t *
4773 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4774 {
4775         bzero(nce, sizeof (*nce));
4776         mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4777         nce->nce_common = ncec;
4778         nce->nce_addr = ncec->ncec_addr;
4779         nce->nce_ill = ill;
4780         DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4781             (char *), "nce", (void *), nce);
4782         ill->ill_nce_cnt++;
4783 
4784         nce->nce_refcnt = 1; /* for the thread */
4785         ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4786         nce->nce_dlur_mp = dlur_mp;
4787 
4788         /* add nce to the ill's fastpath list.  */
4789         nce->nce_refcnt++; /* for the list */
4790         list_insert_head(&ill->ill_nce, nce);
4791         return (nce);
4792 }
4793 
4794 static nce_t *
4795 nce_add(ill_t *ill, ncec_t *ncec)
4796 {
4797         nce_t   *nce;
4798         mblk_t  *dlur_mp = NULL;
4799 
4800         ASSERT(MUTEX_HELD(&ill->ill_lock));
4801         ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4802 
4803         nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4804         if (nce == NULL)
4805                 return (NULL);
4806         if (ncec->ncec_lladdr != NULL ||
4807             ill->ill_net_type == IRE_IF_NORESOLVER) {
4808                 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4809                     ill->ill_phys_addr_length, ill->ill_sap,
4810                     ill->ill_sap_length);
4811                 if (dlur_mp == NULL) {
4812                         kmem_cache_free(nce_cache, nce);
4813                         return (NULL);
4814                 }
4815         }
4816         return (nce_add_impl(ill, ncec, nce, dlur_mp));
4817 }
4818 
4819 /*
4820  * remove the nce from the ill_faspath list
4821  */
4822 void
4823 nce_delete(nce_t *nce)
4824 {
4825         ill_t   *ill = nce->nce_ill;
4826 
4827         ASSERT(MUTEX_HELD(&ill->ill_lock));
4828 
4829         mutex_enter(&nce->nce_lock);
4830         if (nce->nce_is_condemned) {
4831                 /*
4832                  * some other thread has removed this nce from the ill_nce list
4833                  */
4834                 mutex_exit(&nce->nce_lock);
4835                 return;
4836         }
4837         nce->nce_is_condemned = B_TRUE;
4838         mutex_exit(&nce->nce_lock);
4839 
4840         list_remove(&ill->ill_nce, nce);
4841         /*
4842          * even though we are holding the ill_lock, it is ok to
4843          * call nce_refrele here because we know that we should have
4844          * at least 2 refs on the nce: one for the thread, and one
4845          * for the list. The refrele below will release the one for
4846          * the list.
4847          */
4848         nce_refrele(nce);
4849 }
4850 
4851 nce_t *
4852 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4853 {
4854         nce_t *nce = NULL;
4855 
4856         ASSERT(ill != NULL);
4857         ASSERT(MUTEX_HELD(&ill->ill_lock));
4858 
4859         for (nce = list_head(&ill->ill_nce); nce != NULL;
4860             nce = list_next(&ill->ill_nce, nce)) {
4861                 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4862                         break;
4863         }
4864 
4865         /*
4866          * if we found the nce on the ill_nce list while holding
4867          * the ill_lock, then it cannot be condemned yet.
4868          */
4869         if (nce != NULL) {
4870                 ASSERT(!nce->nce_is_condemned);
4871                 nce_refhold(nce);
4872         }
4873         return (nce);
4874 }
4875 
4876 /*
4877  * Walk the ill_nce list on ill. The callback function func() cannot perform
4878  * any destructive actions.
4879  */
4880 static void
4881 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4882 {
4883         nce_t *nce = NULL, *nce_next;
4884 
4885         ASSERT(MUTEX_HELD(&ill->ill_lock));
4886         for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4887                 nce_next = list_next(&ill->ill_nce, nce);
4888                 if (func(ill, nce, arg) != 0)
4889                         break;
4890                 nce = nce_next;
4891         }
4892 }
4893 
/*
 * Walk the ill_nce list on ill, calling func for each nce with arg passed
 * through.  This is the locking wrapper around nce_walk_common(): it takes
 * ill_lock for the duration of the walk and drops it afterwards, so func
 * runs with ill_lock held.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
        mutex_enter(&ill->ill_lock);
        nce_walk_common(ill, func, arg);
        mutex_exit(&ill->ill_lock);
}
4901 
4902 void
4903 nce_flush(ill_t *ill, boolean_t flushall)
4904 {
4905         nce_t *nce, *nce_next;
4906         list_t dead;
4907 
4908         list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4909         mutex_enter(&ill->ill_lock);
4910         for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4911                 nce_next = list_next(&ill->ill_nce, nce);
4912                 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4913                         nce = nce_next;
4914                         continue;
4915                 }
4916                 /*
4917                  * nce_delete requires that the caller should either not
4918                  * be holding locks, or should hold a ref to ensure that
4919                  * we wont hit ncec_inactive. So take a ref and clean up
4920                  * after the list is flushed.
4921                  */
4922                 nce_refhold(nce);
4923                 nce_delete(nce);
4924                 list_insert_tail(&dead, nce);
4925                 nce = nce_next;
4926         }
4927         mutex_exit(&ill->ill_lock);
4928         while ((nce = list_head(&dead)) != NULL) {
4929                 list_remove(&dead, nce);
4930                 nce_refrele(nce);
4931         }
4932         ASSERT(list_is_empty(&dead));
4933         list_destroy(&dead);
4934 }
4935 
4936 /* Return an interval that is anywhere in the [1 .. intv] range */
4937 static clock_t
4938 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4939 {
4940         clock_t rnd, frac;
4941 
4942         (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4943         /* Note that clock_t is signed; must chop off bits */
4944         rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4945         if (initial_time) {
4946                 if (intv <= 0)
4947                         intv = 1;
4948                 else
4949                         intv = (rnd % intv) + 1;
4950         } else {
4951                 /* Compute 'frac' as 20% of the configured interval */
4952                 if ((frac = intv / 5) <= 1)
4953                         frac = 2;
4954                 /* Set intv randomly in the range [intv-frac .. intv+frac] */
4955                 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4956                 intv = 1;
4957         }
4958         return (intv);
4959 }
4960 
4961 void
4962 nce_resolv_ipmp_ok(ncec_t *ncec)
4963 {
4964         mblk_t *mp;
4965         uint_t pkt_len;
4966         iaflags_t ixaflags = IXAF_NO_TRACE;
4967         nce_t *under_nce;
4968         ill_t   *ill = ncec->ncec_ill;
4969         boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4970         ipif_t *src_ipif = NULL;
4971         ip_stack_t *ipst = ill->ill_ipst;
4972         ill_t *send_ill;
4973         uint_t nprobes;
4974 
4975         ASSERT(IS_IPMP(ill));
4976 
4977         mutex_enter(&ncec->ncec_lock);
4978         nprobes = ncec->ncec_nprobes;
4979         mp = ncec->ncec_qd_mp;
4980         ncec->ncec_qd_mp = NULL;
4981         ncec->ncec_nprobes = 0;
4982         mutex_exit(&ncec->ncec_lock);
4983 
4984         while (mp != NULL) {
4985                 mblk_t *nxt_mp;
4986 
4987                 nxt_mp = mp->b_next;
4988                 mp->b_next = NULL;
4989                 if (isv6) {
4990                         ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4991 
4992                         pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4993                         src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4994                             ill, ALL_ZONES, ipst);
4995                 } else {
4996                         ipha_t *ipha = (ipha_t *)mp->b_rptr;
4997 
4998                         ixaflags |= IXAF_IS_IPV4;
4999                         pkt_len = ntohs(ipha->ipha_length);
5000                         src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5001                             ill, ALL_ZONES, ipst);
5002                 }
5003 
5004                 /*
5005                  * find a new nce based on an under_ill. The first IPMP probe
5006                  * packet gets queued, so we could still find a src_ipif that
5007                  * matches an IPMP test address.
5008                  */
5009                 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5010                         /*
5011                          * if src_ipif is null, this could be either a
5012                          * forwarded packet or a probe whose src got deleted.
5013                          * We identify the former case by looking for the
5014                          * ncec_nprobes: the first ncec_nprobes packets are
5015                          * probes;
5016                          */
5017                         if (src_ipif == NULL && nprobes > 0)
5018                                 goto drop_pkt;
5019 
5020                         /*
5021                          * For forwarded packets, we use the ipmp rotor
5022                          * to find send_ill.
5023                          */
5024                         send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5025                             B_TRUE);
5026                 } else {
5027                         send_ill = src_ipif->ipif_ill;
5028                         ill_refhold(send_ill);
5029                 }
5030 
5031                 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5032                     (ncec_t *), ncec, (ipif_t *),
5033                     src_ipif, (ill_t *), send_ill);
5034 
5035                 if (send_ill == NULL) {
5036                         if (src_ipif != NULL)
5037                                 ipif_refrele(src_ipif);
5038                         goto drop_pkt;
5039                 }
5040                 /* create an under_nce on send_ill */
5041                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5042                 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5043                         under_nce = nce_fastpath_create(send_ill, ncec);
5044                 else
5045                         under_nce = NULL;
5046                 rw_exit(&ipst->ips_ill_g_lock);
5047                 if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5048                         nce_fastpath_trigger(under_nce);
5049 
5050                 ill_refrele(send_ill);
5051                 if (src_ipif != NULL)
5052                         ipif_refrele(src_ipif);
5053 
5054                 if (under_nce != NULL) {
5055                         (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5056                             ALL_ZONES, 0, NULL);
5057                         nce_refrele(under_nce);
5058                         if (nprobes > 0)
5059                                 nprobes--;
5060                         mp = nxt_mp;
5061                         continue;
5062                 }
5063 drop_pkt:
5064                 if (isv6) {
5065                         BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5066                 } else {
5067                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5068                 }
5069                 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5070                 freemsg(mp);
5071                 if (nprobes > 0)
5072                         nprobes--;
5073                 mp = nxt_mp;
5074         }
5075         ncec_cb_dispatch(ncec); /* complete callbacks */
5076 }