1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /* Copyright (c) 1990 Mentat Inc. */
  26 
  27 #include <sys/types.h>
  28 #include <sys/stream.h>
  29 #include <sys/strsubr.h>
  30 #include <sys/dlpi.h>
  31 #include <sys/strsun.h>
  32 #include <sys/zone.h>
  33 #include <sys/ddi.h>
  34 #include <sys/sunddi.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/debug.h>
  37 #include <sys/atomic.h>
  38 
  39 #include <sys/systm.h>
  40 #include <sys/param.h>
  41 #include <sys/kmem.h>
  42 #include <sys/sdt.h>
  43 #include <sys/socket.h>
  44 #include <sys/mac.h>
  45 #include <net/if.h>
  46 #include <net/if_arp.h>
  47 #include <net/route.h>
  48 #include <sys/sockio.h>
  49 #include <netinet/in.h>
  50 #include <net/if_dl.h>
  51 
  52 #include <inet/common.h>
  53 #include <inet/mi.h>
  54 #include <inet/mib2.h>
  55 #include <inet/nd.h>
  56 #include <inet/arp.h>
  57 #include <inet/snmpcom.h>
  58 #include <inet/kstatcom.h>
  59 
  60 #include <netinet/igmp_var.h>
  61 #include <netinet/ip6.h>
  62 #include <netinet/icmp6.h>
  63 #include <netinet/sctp.h>
  64 
  65 #include <inet/ip.h>
  66 #include <inet/ip_impl.h>
  67 #include <inet/ip6.h>
  68 #include <inet/ip6_asp.h>
  69 #include <inet/tcp.h>
  70 #include <inet/ip_multi.h>
  71 #include <inet/ip_if.h>
  72 #include <inet/ip_ire.h>
  73 #include <inet/ip_ftable.h>
  74 #include <inet/ip_rts.h>
  75 #include <inet/optcom.h>
  76 #include <inet/ip_ndp.h>
  77 #include <inet/ip_listutils.h>
  78 #include <netinet/igmp.h>
  79 #include <netinet/ip_mroute.h>
  80 #include <inet/ipp_common.h>
  81 
  82 #include <net/pfkeyv2.h>
  83 #include <inet/sadb.h>
  84 #include <inet/ipsec_impl.h>
  85 #include <inet/ipdrop.h>
  86 #include <inet/ip_netinfo.h>
  87 
  88 #include <sys/pattr.h>
  89 #include <inet/ipclassifier.h>
  90 #include <inet/sctp_ip.h>
  91 #include <inet/sctp/sctp_impl.h>
  92 #include <inet/udp_impl.h>
  93 #include <sys/sunddi.h>
  94 
  95 #include <sys/tsol/label.h>
  96 #include <sys/tsol/tnet.h>
  97 
  98 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */
  99 
 100 #ifdef  DEBUG
 101 extern boolean_t skip_sctp_cksum;
 102 #endif
 103 
 104 static int      ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
 105 static int      ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
 106 static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
 107 static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
 108 static void     ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
 109 
 110 /*
 111  * There are two types of output functions for IP used for different
 112  * purposes:
 113  *  - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there
 114  *     is no context in the form of a conn_t. However, there is a
 115  *     ip_xmit_attr_t that the callers use to influence interface selection
 116  *     (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
 117  *
 118  *  - conn_ip_output() is used when sending packets with a conn_t and
 119  *    ip_set_destination has been called to cache information. In that case
 120  *    various socket options are recorded in the ip_xmit_attr_t and should
 121  *    be taken into account.
 122  */
 123 
 124 /*
 125  * The caller *must* have called conn_connect() or ip_attr_connect()
 126  * before calling conn_ip_output(). The caller needs to redo that each time
 127  * the destination IP address or port changes, as well as each time there is
 128  * a change to any socket option that would modify how packets are routed out
 129  * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
 130  *
 131  * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
 132  * We assert for that here.
 133  */
 134 int
 135 conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
 136 {
 137         iaflags_t       ixaflags = ixa->ixa_flags;
 138         ire_t           *ire;
 139         nce_t           *nce;
 140         dce_t           *dce;
 141         ill_t           *ill;
 142         ip_stack_t      *ipst = ixa->ixa_ipst;
 143         int             error;
 144 
 145         /* We defer ipIfStatsHCOutRequests until an error or we have an ill */
 146 
 147         ASSERT(ixa->ixa_ire != NULL);
 148         /* Note there is no ixa_nce when reject and blackhole routes */
 149         ASSERT(ixa->ixa_dce != NULL);        /* Could be default dce */
 150 
 151 #ifdef DEBUG
 152         ASSERT(ixa->ixa_curthread == NULL);
 153         ixa->ixa_curthread = curthread;
 154 #endif
 155 
 156         /*
 157          * Even on labeled systems we can have a NULL ixa_tsl e.g.,
 158          * for IGMP/MLD traffic.
 159          */
 160 
 161         ire = ixa->ixa_ire;
 162 
 163         /*
 164          * If the ULP says the (old) IRE resulted in reachability we
 165          * record this before determine whether to use a new IRE.
 166          * No locking for performance reasons.
 167          */
 168         if (ixaflags & IXAF_REACH_CONF)
 169                 ire->ire_badcnt = 0;
 170 
 171         /*
 172          * Has routing changed since we cached the results of the lookup?
 173          *
 174          * This check captures all of:
 175          *  - the cached ire being deleted (by means of the special
 176          *    IRE_GENERATION_CONDEMNED)
 177          *  - A potentially better ire being added (ire_generation being
 178          *    increased)
 179          *  - A deletion of the nexthop ire that was used when we did the
 180          *    lookup.
 181          *  - An addition of a potentially better nexthop ire.
 182          * The last two are handled by walking and increasing the generation
 183          * number on all dependant IREs in ire_flush_cache().
 184          *
 185          * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
 186          * since we ensure that each time we set ixa_ire to such an IRE we
 187          * make sure the ixa_ire_generation does not match (by using
 188          * IRE_GENERATION_VERIFY).
 189          */
 190         if (ire->ire_generation != ixa->ixa_ire_generation) {
 191                 error = ip_verify_ire(mp, ixa);
 192                 if (error != 0) {
 193                         ip_drop_output("ipIfStatsOutDiscards - verify ire",
 194                             mp, NULL);
 195                         goto drop;
 196                 }
 197                 ire = ixa->ixa_ire;
 198                 ASSERT(ire != NULL);
 199                 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
 200 #ifdef DEBUG
 201                         ASSERT(ixa->ixa_curthread == curthread);
 202                         ixa->ixa_curthread = NULL;
 203 #endif
 204                         ire->ire_ob_pkt_count++;
 205                         /* ixa_dce might be condemned; use default one */
 206                         return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
 207                             &ipst->ips_dce_default->dce_ident));
 208                 }
 209                 /*
 210                  * If the ncec changed then ip_verify_ire already set
 211                  * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
 212                  * so we can recheck the interface mtu.
 213                  */
 214 
 215                 /*
 216                  * Note that ire->ire_generation could already have changed.
 217                  * We catch that next time we send a packet.
 218                  */
 219         }
 220 
 221         /*
 222          * No need to lock access to ixa_nce since the ip_xmit_attr usage
 223          * is single threaded.
 224          */
 225         ASSERT(ixa->ixa_nce != NULL);
 226         nce = ixa->ixa_nce;
 227         if (nce->nce_is_condemned) {
 228                 error = ip_verify_nce(mp, ixa);
 229                 /*
 230                  * In case ZEROCOPY capability become not available, we
 231                  * copy the message and free the original one. We might
 232                  * be copying more data than needed but it doesn't hurt
 233                  * since such change rarely happens.
 234                  */
 235                 switch (error) {
 236                 case 0:
 237                         break;
 238                 case ENOTSUP: { /* ZEROCOPY */
 239                         mblk_t *nmp;
 240 
 241                         if ((nmp = copymsg(mp)) != NULL) {
 242                                 freemsg(mp);
 243                                 mp = nmp;
 244 
 245                                 break;
 246                         }
 247                         /* FALLTHROUGH */
 248                 }
 249                 default:
 250                         ip_drop_output("ipIfStatsOutDiscards - verify nce",
 251                             mp, NULL);
 252                         goto drop;
 253                 }
 254                 ire = ixa->ixa_ire;
 255                 ASSERT(ire != NULL);
 256                 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
 257 #ifdef DEBUG
 258                         ASSERT(ixa->ixa_curthread == curthread);
 259                         ixa->ixa_curthread = NULL;
 260 #endif
 261                         ire->ire_ob_pkt_count++;
 262                         /* ixa_dce might be condemned; use default one */
 263                         return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
 264                             ixa, &ipst->ips_dce_default->dce_ident));
 265                 }
 266                 ASSERT(ixa->ixa_nce != NULL);
 267                 nce = ixa->ixa_nce;
 268 
 269                 /*
 270                  * Note that some other event could already have made
 271                  * the new nce condemned. We catch that next time we
 272                  * try to send a packet.
 273                  */
 274         }
 275         /*
 276          * If there is no per-destination dce_t then we have a reference to
 277          * the default dce_t (which merely contains the dce_ipid).
 278          * The generation check captures both the introduction of a
 279          * per-destination dce_t (e.g., due to ICMP packet too big) and
 280          * any change to the per-destination dce (including it becoming
 281          * condemned by use of the special DCE_GENERATION_CONDEMNED).
 282          */
 283         dce = ixa->ixa_dce;
 284 
 285         /*
 286          * To avoid a periodic timer to increase the path MTU we
 287          * look at dce_last_change_time each time we send a packet.
 288          */
 289         if (dce->dce_flags & DCEF_PMTU) {
 290                 int64_t         now = LBOLT_FASTPATH64;
 291 
 292                 if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
 293                     ipst->ips_ip_pathmtu_interval)) {
 294                         /*
 295                          * Older than 20 minutes. Drop the path MTU information.
 296                          * Since the path MTU changes as a result of this,
 297                          * twiddle ixa_dce_generation to make us go through the
 298                          * dce verification code in conn_ip_output.
 299                          */
 300                         mutex_enter(&dce->dce_lock);
 301                         dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
 302                         dce->dce_last_change_time = TICK_TO_SEC(now);
 303                         mutex_exit(&dce->dce_lock);
 304                         dce_increment_generation(dce);
 305                 }
 306         }
 307 
 308         if (dce->dce_generation != ixa->ixa_dce_generation) {
 309                 error = ip_verify_dce(mp, ixa);
 310                 if (error != 0) {
 311                         ip_drop_output("ipIfStatsOutDiscards - verify dce",
 312                             mp, NULL);
 313                         goto drop;
 314                 }
 315                 dce = ixa->ixa_dce;
 316 
 317                 /*
 318                  * Note that some other event could already have made the
 319                  * new dce's generation number change.
 320                  * We catch that next time we try to send a packet.
 321                  */
 322         }
 323 
 324         ill = nce->nce_ill;
 325 
 326         /*
 327          * An initial ixa_fragsize was set in ip_set_destination
 328          * and we update it if any routing changes above.
 329          * A change to ill_mtu with ifconfig will increase all dce_generation
 330          * so that we will detect that with the generation check. Ditto for
 331          * ill_mc_mtu.
 332          */
 333 
 334         /*
 335          * Caller needs to make sure IXAF_VERIFY_SRC is not set if
 336          * conn_unspec_src.
 337          */
 338         if ((ixaflags & IXAF_VERIFY_SOURCE) &&
 339             ixa->ixa_src_generation != ipst->ips_src_generation) {
 340                 /* Check if the IP source is still assigned to the host. */
 341                 uint_t gen;
 342 
 343                 if (!ip_verify_src(mp, ixa, &gen)) {
 344                         /* Don't send a packet with a source that isn't ours */
 345                         error = EADDRNOTAVAIL;
 346                         ip_drop_output("ipIfStatsOutDiscards - invalid src",
 347                             mp, NULL);
 348                         goto drop;
 349                 }
 350                 /* The source is still valid - update the generation number */
 351                 ixa->ixa_src_generation = gen;
 352         }
 353 
 354         /*
 355          * We don't have an IRE when we fragment, hence ire_ob_pkt_count
 356          * can only count the use prior to fragmentation. However the MIB
 357          * counters on the ill will be incremented in post fragmentation.
 358          */
 359         ire->ire_ob_pkt_count++;
 360         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
 361 
 362         /*
 363          * Based on ire_type and ire_flags call one of:
 364          *      ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
 365          *      ire_send_multirt_v* - if RTF_MULTIRT
 366          *      ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE
 367          *      ire_send_multicast_v* - for IRE_MULTICAST
 368          *      ire_send_broadcast_v4 - for IRE_BROADCAST
 369          *      ire_send_wire_v* - for the rest.
 370          */
 371 #ifdef DEBUG
 372         ASSERT(ixa->ixa_curthread == curthread);
 373         ixa->ixa_curthread = NULL;
 374 #endif
 375         return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));
 376 
 377 drop:
 378         if (ixaflags & IXAF_IS_IPV4) {
 379                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 380                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 381         } else {
 382                 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
 383                 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
 384         }
 385         freemsg(mp);
 386 #ifdef DEBUG
 387         ASSERT(ixa->ixa_curthread == curthread);
 388         ixa->ixa_curthread = NULL;
 389 #endif
 390         return (error);
 391 }
 392 
 393 /*
 394  * Handle both IPv4 and IPv6. Sets the generation number
 395  * to allow the caller to know when to call us again.
 396  * Returns true if the source address in the packet is a valid source.
 397  * We handle callers which try to send with a zero address (since we only
 398  * get here if UNSPEC_SRC is not set).
 399  */
 400 boolean_t
 401 ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
 402 {
 403         ip_stack_t      *ipst = ixa->ixa_ipst;
 404 
 405         /*
 406          * Need to grab the generation number before we check to
 407          * avoid a race with a change to the set of local addresses.
 408          * No lock needed since the thread which updates the set of local
 409          * addresses use ipif/ill locks and exit those (hence a store memory
 410          * barrier) before doing the atomic increase of ips_src_generation.
 411          */
 412         if (generationp != NULL)
 413                 *generationp = ipst->ips_src_generation;
 414 
 415         if (ixa->ixa_flags & IXAF_IS_IPV4) {
 416                 ipha_t  *ipha = (ipha_t *)mp->b_rptr;
 417 
 418                 if (ipha->ipha_src == INADDR_ANY)
 419                         return (B_FALSE);
 420 
 421                 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
 422                     ipst, B_FALSE) != IPVL_BAD);
 423         } else {
 424                 ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
 425                 uint_t  scopeid;
 426 
 427                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
 428                         return (B_FALSE);
 429 
 430                 if (ixa->ixa_flags & IXAF_SCOPEID_SET)
 431                         scopeid = ixa->ixa_scopeid;
 432                 else
 433                         scopeid = 0;
 434 
 435                 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
 436                     ipst, B_FALSE, scopeid) != IPVL_BAD);
 437         }
 438 }
 439 
 440 /*
 441  * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
 442  */
 443 int
 444 ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
 445 {
 446         uint_t          gen;
 447         ire_t           *ire;
 448         nce_t           *nce;
 449         int             error;
 450         boolean_t       multirt = B_FALSE;
 451 
 452         /*
 453          * Redo ip_select_route.
 454          * Need to grab generation number as part of the lookup to
 455          * avoid race.
 456          */
 457         error = 0;
 458         ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
 459         ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
 460         if (error != 0) {
 461                 ire_refrele(ire);
 462                 return (error);
 463         }
 464 
 465         if (ixa->ixa_ire != NULL)
 466                 ire_refrele_notr(ixa->ixa_ire);
 467 #ifdef DEBUG
 468         ire_refhold_notr(ire);
 469         ire_refrele(ire);
 470 #endif
 471         ixa->ixa_ire = ire;
 472         ixa->ixa_ire_generation = gen;
 473         if (multirt) {
 474                 if (ixa->ixa_flags & IXAF_IS_IPV4)
 475                         ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
 476                 else
 477                         ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
 478                 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
 479         } else {
 480                 ixa->ixa_postfragfn = ire->ire_postfragfn;
 481                 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
 482         }
 483 
 484         /*
 485          * Don't look for an nce for reject or blackhole.
 486          * They have ire_generation set to IRE_GENERATION_VERIFY which
 487          * makes conn_ip_output avoid references to ixa_nce.
 488          */
 489         if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
 490                 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
 491                 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
 492                 return (0);
 493         }
 494 
 495         /* The NCE could now be different */
 496         nce = ire_to_nce_pkt(ire, mp);
 497         if (nce == NULL) {
 498                 /*
 499                  * Allocation failure. Make sure we redo ire/nce selection
 500                  * next time we send.
 501                  */
 502                 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
 503                 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
 504                 return (ENOBUFS);
 505         }
 506         if (nce == ixa->ixa_nce) {
 507                 /* No change */
 508                 nce_refrele(nce);
 509                 return (0);
 510         }
 511 
 512         /*
 513          * Since the path MTU might change as a result of this
 514          * route change, we twiddle ixa_dce_generation to
 515          * make conn_ip_output go through the ip_verify_dce code.
 516          */
 517         ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
 518 
 519         if (ixa->ixa_nce != NULL)
 520                 nce_refrele(ixa->ixa_nce);
 521         ixa->ixa_nce = nce;
 522         return (0);
 523 }
 524 
 525 /*
 526  * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
 527  */
 528 static int
 529 ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
 530 {
 531         ire_t           *ire = ixa->ixa_ire;
 532         nce_t           *nce;
 533         int             error = 0;
 534         ipha_t          *ipha = NULL;
 535         ip6_t           *ip6h = NULL;
 536 
 537         if (ire->ire_ipversion == IPV4_VERSION)
 538                 ipha = (ipha_t *)mp->b_rptr;
 539         else
 540                 ip6h = (ip6_t *)mp->b_rptr;
 541 
 542         nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
 543         if (nce == NULL) {
 544                 /* Try to find a better ire */
 545                 return (ip_verify_ire(mp, ixa));
 546         }
 547 
 548         /*
 549          * The hardware offloading capabilities, for example LSO, of the
 550          * interface might have changed, so do sanity verification here.
 551          */
 552         if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
 553                 if (!ip_verify_lso(nce->nce_ill, ixa)) {
 554                         ASSERT(ixa->ixa_notify != NULL);
 555                         ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
 556                             IXAN_LSO, 0);
 557                         error = ENOTSUP;
 558                 }
 559         }
 560 
 561         /*
 562          * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
 563          * any ZEROCOPY changes. In case ZEROCOPY capability is not available
 564          * any more, return error so that conn_ip_output() can take care of
 565          * the ZEROCOPY message properly. It's safe to continue send the
 566          * message when ZEROCOPY newly become available.
 567          */
 568         if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
 569                 if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
 570                         ASSERT(ixa->ixa_notify != NULL);
 571                         ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
 572                             IXAN_ZCOPY, 0);
 573                         if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
 574                                 error = ENOTSUP;
 575                 }
 576         }
 577 
 578         /*
 579          * Since the path MTU might change as a result of this
 580          * change, we twiddle ixa_dce_generation to
 581          * make conn_ip_output go through the ip_verify_dce code.
 582          */
 583         ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
 584 
 585         nce_refrele(ixa->ixa_nce);
 586         ixa->ixa_nce = nce;
 587         return (error);
 588 }
 589 
 590 /*
 591  * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
 592  */
 593 static int
 594 ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
 595 {
 596         dce_t           *dce;
 597         uint_t          gen;
 598         uint_t          pmtu;
 599 
 600         dce = dce_lookup_pkt(mp, ixa, &gen);
 601         ASSERT(dce != NULL);
 602 
 603         dce_refrele_notr(ixa->ixa_dce);
 604 #ifdef DEBUG
 605         dce_refhold_notr(dce);
 606         dce_refrele(dce);
 607 #endif
 608         ixa->ixa_dce = dce;
 609         ixa->ixa_dce_generation = gen;
 610 
 611         /* Extract the (path) mtu from the dce, ncec_ill etc */
 612         pmtu = ip_get_pmtu(ixa);
 613 
 614         /*
 615          * Tell ULP about PMTU changes - increase or decrease - by returning
 616          * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
 617          * both ixa_pmtu and ixa_fragsize appropriately.
 618          *
 619          * If ULP doesn't set that flag then we need to update ixa_fragsize
 620          * since routing could have changed the ill after after ixa_fragsize
 621          * was set previously in the conn_ip_output path or in
 622          * ip_set_destination.
 623          *
 624          * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
 625          *
 626          * In the case of a path MTU increase we send the packet after the
 627          * notify to the ULP.
 628          */
 629         if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
 630                 if (ixa->ixa_pmtu != pmtu) {
 631                         uint_t oldmtu = ixa->ixa_pmtu;
 632 
 633                         DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
 634                             uint32_t, ixa->ixa_pmtu);
 635                         ASSERT(ixa->ixa_notify != NULL);
 636                         ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
 637                             IXAN_PMTU, pmtu);
 638                         if (pmtu < oldmtu)
 639                                 return (EMSGSIZE);
 640                 }
 641         } else {
 642                 ixa->ixa_fragsize = pmtu;
 643         }
 644         return (0);
 645 }
 646 
 647 /*
 648  * Verify LSO usability. Keep the return value simple to indicate whether
 649  * the LSO capability has changed. Handle both IPv4 and IPv6.
 650  */
 651 static boolean_t
 652 ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
 653 {
 654         ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab;
 655         ill_lso_capab_t *new_lsoc = ill->ill_lso_capab;
 656 
 657         if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
 658                 /*
 659                  * Not unsable any more.
 660                  */
 661                 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
 662                     (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
 663                     (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
 664                     ((ixa->ixa_flags & IXAF_IS_IPV4) ?
 665                     !ILL_LSO_TCP_IPV4_USABLE(ill) :
 666                     !ILL_LSO_TCP_IPV6_USABLE(ill))) {
 667                         ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
 668 
 669                         return (B_FALSE);
 670                 }
 671 
 672                 /*
 673                  * Capability has changed, refresh the copy in ixa.
 674                  */
 675                 if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) {
 676                         *lsoc = *new_lsoc;
 677 
 678                         return (B_FALSE);
 679                 }
 680         } else { /* Was not usable */
 681                 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
 682                     !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
 683                     !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
 684                     ((ixa->ixa_flags & IXAF_IS_IPV4) ?
 685                     ILL_LSO_TCP_IPV4_USABLE(ill) :
 686                     ILL_LSO_TCP_IPV6_USABLE(ill))) {
 687                         *lsoc = *new_lsoc;
 688                         ixa->ixa_flags |= IXAF_LSO_CAPAB;
 689 
 690                         return (B_FALSE);
 691                 }
 692         }
 693 
 694         return (B_TRUE);
 695 }
 696 
 697 /*
 698  * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
 699  * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
 700  */
 701 static boolean_t
 702 ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
 703 {
 704         if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
 705                 /*
 706                  * Not unsable any more.
 707                  */
 708                 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
 709                     (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
 710                     (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
 711                     !ILL_ZCOPY_USABLE(ill)) {
 712                         ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
 713 
 714                         return (B_FALSE);
 715                 }
 716         } else { /* Was not usable */
 717                 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
 718                     !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
 719                     !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
 720                     ILL_ZCOPY_USABLE(ill)) {
 721                         ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
 722 
 723                         return (B_FALSE);
 724                 }
 725         }
 726 
 727         return (B_TRUE);
 728 }
 729 
 730 
 731 /*
 732  * When there is no conn_t context, this will send a packet.
 733  * The caller must *not* have called conn_connect() or ip_attr_connect()
 734  * before calling ip_output_simple().
 735  * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
 736  * Honors IXAF_SET_SOURCE.
 737  *
 738  * We acquire the ire and after calling ire_sendfn we release
 739  * the hold on the ire. Ditto for the nce and dce.
 740  *
 741  * This assumes that the caller has set the following in ip_xmit_attr_t:
 742  *      ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
 743  *      If ixa_ifindex is non-zero it means send out that ill. (If it is
 744  *      an upper IPMP ill we load balance across the group; if a lower we send
 745  *      on that lower ill without load balancing.)
 746  *      IXAF_IS_IPV4 must be set correctly.
 747  *      If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
 748  *      If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
 749  *      If neither of those two are set we do an IPsec policy lookup.
 750  *
 751  * We handle setting things like
 752  *      ixa_pktlen
 753  *      ixa_ip_hdr_length
 754  *      ixa->ixa_protocol
 755  *
 756  * The caller may set ixa_xmit_hint, which is used for ECMP selection and
 757  * transmit ring selecting in GLD.
 758  *
 759  * The caller must do an ixa_cleanup() to release any IPsec references
 760  * after we return.
 761  */
 762 int
 763 ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
 764 {
 765         ts_label_t      *effective_tsl = NULL;
 766         int             err;
 767 
 768         ASSERT(ixa->ixa_ipst != NULL);
 769 
 770         if (is_system_labeled()) {
 771                 ip_stack_t *ipst = ixa->ixa_ipst;
 772 
 773                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
 774                         err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
 775                             &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
 776                             &effective_tsl);
 777                 } else {
 778                         err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
 779                             &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
 780                             &effective_tsl);
 781                 }
 782                 if (err != 0) {
 783                         ip2dbg(("tsol_check: label check failed (%d)\n", err));
 784                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 785                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 786                         ip_drop_output("tsol_check_label", mp, NULL);
 787                         freemsg(mp);
 788                         return (err);
 789                 }
 790                 if (effective_tsl != NULL) {
 791                         /* Update the label */
 792                         ip_xmit_attr_replace_tsl(ixa, effective_tsl);
 793                 }
 794         }
 795 
 796         if (ixa->ixa_flags & IXAF_IS_IPV4)
 797                 return (ip_output_simple_v4(mp, ixa));
 798         else
 799                 return (ip_output_simple_v6(mp, ixa));
 800 }
 801 
 802 int
 803 ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
 804 {
 805         ipha_t          *ipha;
 806         ipaddr_t        firsthop; /* In IP header */
 807         ipaddr_t        dst;    /* End of source route, or ipha_dst if none */
 808         ire_t           *ire;
 809         ipaddr_t        setsrc; /* RTF_SETSRC */
 810         int             error;
 811         ill_t           *ill = NULL;
 812         dce_t           *dce = NULL;
 813         nce_t           *nce;
 814         iaflags_t       ixaflags = ixa->ixa_flags;
 815         ip_stack_t      *ipst = ixa->ixa_ipst;
 816         boolean_t       repeat = B_FALSE;
 817         boolean_t       multirt = B_FALSE;
 818         int64_t         now;
 819 
 820         ipha = (ipha_t *)mp->b_rptr;
 821         ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
 822 
 823         /*
 824          * Even on labeled systems we can have a NULL ixa_tsl e.g.,
 825          * for IGMP/MLD traffic.
 826          */
 827 
 828         /* Caller already set flags */
 829         ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
 830 
 831         ASSERT(ixa->ixa_nce == NULL);
 832 
 833         ixa->ixa_pktlen = ntohs(ipha->ipha_length);
 834         ASSERT(ixa->ixa_pktlen == msgdsize(mp));
 835         ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
 836         ixa->ixa_protocol = ipha->ipha_protocol;
 837 
 838         /*
 839          * Assumes that source routed packets have already been massaged by
 840          * the ULP (ip_massage_options) and as a result ipha_dst is the next
 841          * hop in the source route. The final destination is used for IPsec
 842          * policy and DCE lookup.
 843          */
 844         firsthop = ipha->ipha_dst;
 845         dst = ip_get_dst(ipha);
 846 
 847 repeat_ire:
 848         error = 0;
 849         setsrc = INADDR_ANY;
 850         ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
 851             &setsrc, &error, &multirt);
 852         ASSERT(ire != NULL);    /* IRE_NOROUTE if none found */
 853         if (error != 0) {
 854                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 855                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 856                 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
 857                 freemsg(mp);
 858                 goto done;
 859         }
 860 
 861         if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
 862                 /* ire_ill might be NULL hence need to skip some code */
 863                 if (ixaflags & IXAF_SET_SOURCE)
 864                         ipha->ipha_src = htonl(INADDR_LOOPBACK);
 865                 ixa->ixa_fragsize = IP_MAXPACKET;
 866                 ill = NULL;
 867                 nce = NULL;
 868                 ire->ire_ob_pkt_count++;
 869                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 870                 /* No dce yet; use default one */
 871                 error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
 872                     &ipst->ips_dce_default->dce_ident);
 873                 goto done;
 874         }
 875 
 876         /* Note that ipha_dst is only used for IRE_MULTICAST */
 877         nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
 878         if (nce == NULL) {
 879                 /* Allocation failure? */
 880                 ip_drop_output("ire_to_nce", mp, ill);
 881                 freemsg(mp);
 882                 error = ENOBUFS;
 883                 goto done;
 884         }
 885         if (nce->nce_is_condemned) {
 886                 nce_t *nce1;
 887 
 888                 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
 889                 nce_refrele(nce);
 890                 if (nce1 == NULL) {
 891                         if (!repeat) {
 892                                 /* Try finding a better IRE */
 893                                 repeat = B_TRUE;
 894                                 ire_refrele(ire);
 895                                 goto repeat_ire;
 896                         }
 897                         /* Tried twice - drop packet */
 898                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 899                         ip_drop_output("No nce", mp, ill);
 900                         freemsg(mp);
 901                         error = ENOBUFS;
 902                         goto done;
 903                 }
 904                 nce = nce1;
 905         }
 906 
 907         /*
 908          * For multicast with multirt we have a flag passed back from
 909          * ire_lookup_multi_ill_v4 since we don't have an IRE for each
 910          * possible multicast address.
 911          * We also need a flag for multicast since we can't check
 912          * whether RTF_MULTIRT is set in ixa_ire for multicast.
 913          */
 914         if (multirt) {
 915                 ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
 916                 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
 917         } else {
 918                 ixa->ixa_postfragfn = ire->ire_postfragfn;
 919                 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
 920         }
 921         ASSERT(ixa->ixa_nce == NULL);
 922         ixa->ixa_nce = nce;
 923 
 924         /*
 925          * Check for a dce_t with a path mtu.
 926          */
 927         dce = dce_lookup_v4(dst, ipst, NULL);
 928         ASSERT(dce != NULL);
 929 
 930         if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
 931                 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 932         } else if (dce->dce_flags & DCEF_PMTU) {
 933                 /*
 934                  * To avoid a periodic timer to increase the path MTU we
 935                  * look at dce_last_change_time each time we send a packet.
 936                  */
 937                 now = ddi_get_lbolt64();
 938                 if (TICK_TO_SEC(now) - dce->dce_last_change_time >
 939                     ipst->ips_ip_pathmtu_interval) {
 940                         /*
 941                          * Older than 20 minutes. Drop the path MTU information.
 942                          */
 943                         mutex_enter(&dce->dce_lock);
 944                         dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
 945                         dce->dce_last_change_time = TICK_TO_SEC(now);
 946                         mutex_exit(&dce->dce_lock);
 947                         dce_increment_generation(dce);
 948                         ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 949                 } else {
 950                         uint_t fragsize;
 951 
 952                         fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 953                         if (fragsize > dce->dce_pmtu)
 954                                 fragsize = dce->dce_pmtu;
 955                         ixa->ixa_fragsize = fragsize;
 956                 }
 957         } else {
 958                 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
 959         }
 960 
 961         /*
 962          * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
 963          * interface for source address selection.
 964          */
 965         ill = ire_nexthop_ill(ire);
 966 
 967         if (ixaflags & IXAF_SET_SOURCE) {
 968                 ipaddr_t        src;
 969 
 970                 /*
 971                  * We use the final destination to get
 972                  * correct selection for source routed packets
 973                  */
 974 
 975                 /* If unreachable we have no ill but need some source */
 976                 if (ill == NULL) {
 977                         src = htonl(INADDR_LOOPBACK);
 978                         error = 0;
 979                 } else {
 980                         error = ip_select_source_v4(ill, setsrc, dst,
 981                             ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
 982                             &src, NULL, NULL);
 983                 }
 984                 if (error != 0) {
 985                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
 986                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
 987                         ip_drop_output("ipIfStatsOutDiscards - no source",
 988                             mp, ill);
 989                         freemsg(mp);
 990                         goto done;
 991                 }
 992                 ipha->ipha_src = src;
 993         } else if (ixaflags & IXAF_VERIFY_SOURCE) {
 994                 /* Check if the IP source is assigned to the host. */
 995                 if (!ip_verify_src(mp, ixa, NULL)) {
 996                         /* Don't send a packet with a source that isn't ours */
 997                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 998                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
 999                         ip_drop_output("ipIfStatsOutDiscards - invalid source",
1000                             mp, ill);
1001                         freemsg(mp);
1002                         error = EADDRNOTAVAIL;
1003                         goto done;
1004                 }
1005         }
1006 
1007 
1008         /*
1009          * Check against global IPsec policy to set the AH/ESP attributes.
1010          * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
1011          */
1012         if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1013                 ASSERT(ixa->ixa_ipsec_policy == NULL);
1014                 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
1015                 if (mp == NULL) {
1016                         /* MIB and ip_drop_packet already done */
1017                         return (EHOSTUNREACH);  /* IPsec policy failure */
1018                 }
1019         }
1020 
1021         if (ill != NULL) {
1022                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
1023         } else {
1024                 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1025         }
1026 
1027         /*
1028          * We update the statistics on the most specific IRE i.e., the first
1029          * one we found.
1030          * We don't have an IRE when we fragment, hence ire_ob_pkt_count
1031          * can only count the use prior to fragmentation. However the MIB
1032          * counters on the ill will be incremented in post fragmentation.
1033          */
1034         ire->ire_ob_pkt_count++;
1035 
1036         /*
1037          * Based on ire_type and ire_flags call one of:
1038          *      ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
1039          *      ire_send_multirt_v4 - if RTF_MULTIRT
1040          *      ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE
1041          *      ire_send_multicast_v4 - for IRE_MULTICAST
1042          *      ire_send_broadcast_v4 - for IRE_BROADCAST
1043          *      ire_send_wire_v4 - for the rest.
1044          */
1045         error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
1046 done:
1047         ire_refrele(ire);
1048         if (dce != NULL)
1049                 dce_refrele(dce);
1050         if (ill != NULL)
1051                 ill_refrele(ill);
1052         if (ixa->ixa_nce != NULL)
1053                 nce_refrele(ixa->ixa_nce);
1054         ixa->ixa_nce = NULL;
1055         return (error);
1056 }
1057 
1058 /*
1059  * ire_sendfn() functions.
1060  * These functions use the following xmit_attr:
1061  *  - ixa_fragsize - read to determine whether or not to fragment
1062  *  - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
1063  *  - ixa_ipsec_*  are used inside IPsec
1064  *  - IXAF_SET_SOURCE - replace IP source in broadcast case.
1065  *  - IXAF_LOOPBACK_COPY - for multicast and broadcast
1066  */
1067 
1068 
1069 /*
1070  * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
1071  *
1072  * The checks for restrict_interzone_loopback are done in ire_route_recursive.
1073  */
1074 /* ARGSUSED4 */
1075 int
1076 ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1077     ip_xmit_attr_t *ixa, uint32_t *identp)
1078 {
1079         ipha_t          *ipha = (ipha_t *)iph_arg;
1080         ip_stack_t      *ipst = ixa->ixa_ipst;
1081         ill_t           *ill = ire->ire_ill;
1082         ip_recv_attr_t  iras;   /* NOTE: No bzero for performance */
1083         uint_t          pktlen = ixa->ixa_pktlen;
1084 
1085         /*
1086          * No fragmentation, no nce, no application of IPsec,
1087          * and no ipha_ident assignment.
1088          *
1089          * Note different order between IP provider and FW_HOOKS than in
1090          * send_wire case.
1091          */
1092 
1093         /*
1094          * DTrace this as ip:::send.  A packet blocked by FW_HOOKS will fire the
1095          * send probe, but not the receive probe.
1096          */
1097         DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1098             ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1099             int, 1);
1100 
1101         if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
1102                 int error;
1103 
1104                 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
1105                     ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
1106                 FW_HOOKS(ipst->ips_ip4_loopback_out_event,
1107                     ipst->ips_ipv4firewall_loopback_out,
1108                     NULL, ill, ipha, mp, mp, 0, ipst, error);
1109                 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
1110                 if (mp == NULL)
1111                         return (error);
1112 
1113                 /*
1114                  * Even if the destination was changed by the filter we use the
1115                  * forwarding decision that was made based on the address
1116                  * in ip_output/ip_set_destination.
1117                  */
1118                 /* Length could be different */
1119                 ipha = (ipha_t *)mp->b_rptr;
1120                 pktlen = ntohs(ipha->ipha_length);
1121         }
1122 
1123         /*
1124          * If a callback is enabled then we need to know the
1125          * source and destination zoneids for the packet. We already
1126          * have those handy.
1127          */
1128         if (ipst->ips_ip4_observe.he_interested) {
1129                 zoneid_t szone, dzone;
1130                 zoneid_t stackzoneid;
1131 
1132                 stackzoneid = netstackid_to_zoneid(
1133                     ipst->ips_netstack->netstack_stackid);
1134 
1135                 if (stackzoneid == GLOBAL_ZONEID) {
1136                         /* Shared-IP zone */
1137                         dzone = ire->ire_zoneid;
1138                         szone = ixa->ixa_zoneid;
1139                 } else {
1140                         szone = dzone = stackzoneid;
1141                 }
1142                 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
1143         }
1144 
1145         /* Handle lo0 stats */
1146         ipst->ips_loopback_packets++;
1147 
1148         /* Map ixa to ira including IPsec policies */
1149         ipsec_out_to_in(ixa, ill, &iras);
1150         iras.ira_pktlen = pktlen;
1151 
1152         if (!IS_SIMPLE_IPH(ipha)) {
1153                 ip_output_local_options(ipha, ipst);
1154                 iras.ira_flags |= IRAF_IPV4_OPTIONS;
1155         }
1156 
1157         if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
1158                 int error;
1159 
1160                 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
1161                     ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
1162                 FW_HOOKS(ipst->ips_ip4_loopback_in_event,
1163                     ipst->ips_ipv4firewall_loopback_in,
1164                     ill, NULL, ipha, mp, mp, 0, ipst, error);
1165 
1166                 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
1167                 if (mp == NULL) {
1168                         ira_cleanup(&iras, B_FALSE);
1169                         return (error);
1170                 }
1171                 /*
1172                  * Even if the destination was changed by the filter we use the
1173                  * forwarding decision that was made based on the address
1174                  * in ip_output/ip_set_destination.
1175                  */
1176                 /* Length could be different */
1177                 ipha = (ipha_t *)mp->b_rptr;
1178                 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
1179         }
1180 
1181         DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1182             ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1183             int, 1);
1184 
1185         ire->ire_ib_pkt_count++;
1186         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
1187         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
1188 
1189         /* Destined to ire_zoneid - use that for fanout */
1190         iras.ira_zoneid = ire->ire_zoneid;
1191 
1192         if (is_system_labeled()) {
1193                 iras.ira_flags |= IRAF_SYSTEM_LABELED;
1194 
1195                 /*
1196                  * This updates ira_cred, ira_tsl and ira_free_flags based
1197                  * on the label. We don't expect this to ever fail for
1198                  * loopback packets, so we silently drop the packet should it
1199                  * fail.
1200                  */
1201                 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) {
1202                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1203                         ip_drop_input("tsol_get_pkt_label", mp, ill);
1204                         freemsg(mp);
1205                         return (0);
1206                 }
1207                 ASSERT(iras.ira_tsl != NULL);
1208 
1209                 /* tsol_get_pkt_label sometimes does pullupmsg */
1210                 ipha = (ipha_t *)mp->b_rptr;
1211         }
1212 
1213         ip_fanout_v4(mp, ipha, &iras);
1214 
1215         /* We moved any IPsec refs from ixa to iras */
1216         ira_cleanup(&iras, B_FALSE);
1217         return (0);
1218 }
1219 
1220 /*
1221  * ire_sendfn for IRE_BROADCAST
1222  * If the broadcast address is present on multiple ills and ixa_ifindex
1223  * isn't set, then we generate
1224  * a separate datagram (potentially with different source address) for
1225  * those ills. In any case, only one copy is looped back to ip_input_v4.
1226  */
1227 int
1228 ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1229     ip_xmit_attr_t *ixa, uint32_t *identp)
1230 {
1231         ipha_t          *ipha = (ipha_t *)iph_arg;
1232         ip_stack_t      *ipst = ixa->ixa_ipst;
1233         irb_t           *irb = ire->ire_bucket;
1234         ire_t           *ire1;
1235         mblk_t          *mp1;
1236         ipha_t          *ipha1;
1237         iaflags_t       ixaflags = ixa->ixa_flags;
1238         nce_t           *nce1, *nce_orig;
1239 
1240         /*
1241          * Unless ire_send_multirt_v4 already set a ttl, force the
1242          * ttl to a smallish value.
1243          */
1244         if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
1245                 /*
1246                  * To avoid broadcast storms, we usually set the TTL to 1 for
1247                  * broadcasts.  This can
1248                  * be overridden stack-wide through the ip_broadcast_ttl
1249                  * ndd tunable, or on a per-connection basis through the
1250                  * IP_BROADCAST_TTL socket option.
1251                  *
1252                  * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
1253                  * will force ttl to one after we've set this.
1254                  */
1255                 if (ixaflags & IXAF_BROADCAST_TTL_SET)
1256                         ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
1257                 else
1258                         ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
1259         }
1260         /*
1261          * Make sure we get a loopback copy (after IPsec and frag)
1262          * Skip hardware checksum so that loopback copy is checksumed.
1263          */
1264         ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1265 
1266         /* Do we need to potentially generate multiple copies? */
1267         if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
1268                 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1269 
1270         /*
1271          * Loop over all IRE_BROADCAST in the bucket (might only be one).
1272          * Note that everything in the bucket has the same destination address.
1273          */
1274         irb_refhold(irb);
1275         for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1276                 /* We do the main IRE after the end of the loop */
1277                 if (ire1 == ire)
1278                         continue;
1279 
1280                 /*
1281                  * Only IREs for the same IP address should be in the same
1282                  * bucket.
1283                  * But could have IRE_HOSTs in the case of CGTP.
1284                  * If we find any multirt routes we bail out of the loop
1285                  * and just do the single packet at the end; ip_postfrag_multirt
1286                  * will duplicate the packet.
1287                  */
1288                 ASSERT(ire1->ire_addr == ire->ire_addr);
1289                 if (!(ire1->ire_type & IRE_BROADCAST))
1290                         continue;
1291 
1292                 if (IRE_IS_CONDEMNED(ire1))
1293                         continue;
1294 
1295                 if (ixa->ixa_zoneid != ALL_ZONES &&
1296                     ire->ire_zoneid != ire1->ire_zoneid)
1297                         continue;
1298 
1299                 ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);
1300 
1301                 if (ire1->ire_flags & RTF_MULTIRT)
1302                         break;
1303 
1304                 /*
1305                  * For IPMP we only send for the ipmp_ill. arp_nce_init() will
1306                  * ensure that this goes out on the cast_ill.
1307                  */
1308                 if (IS_UNDER_IPMP(ire1->ire_ill))
1309                         continue;
1310 
1311                 mp1 = copymsg(mp);
1312                 if (mp1 == NULL) {
1313                         BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1314                             ipIfStatsOutDiscards);
1315                         ip_drop_output("ipIfStatsOutDiscards",
1316                             mp, ire1->ire_ill);
1317                         continue;
1318                 }
1319 
1320                 ipha1 = (ipha_t *)mp1->b_rptr;
1321                 if (ixa->ixa_flags & IXAF_SET_SOURCE) {
1322                         /*
1323                          * Need to pick a different source address for each
1324                          * interface. If we have a global IPsec policy and
1325                          * no per-socket policy then we punt to
1326                          * ip_output_simple_v4 using a separate ip_xmit_attr_t.
1327                          */
1328                         if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
1329                                 ip_output_simple_broadcast(ixa, mp1);
1330                                 continue;
1331                         }
1332                         /* Pick a new source address for each interface */
1333                         if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
1334                             ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
1335                             &ipha1->ipha_src, NULL, NULL) != 0) {
1336                                 BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1337                                     ipIfStatsOutDiscards);
1338                                 ip_drop_output("ipIfStatsOutDiscards - select "
1339                                     "broadcast source", mp1, ire1->ire_ill);
1340                                 freemsg(mp1);
1341                                 continue;
1342                         }
1343                         /*
1344                          * Check against global IPsec policy to set the AH/ESP
1345                          * attributes. IPsec will set IXAF_IPSEC_* and
1346                          * ixa_ipsec_* as appropriate.
1347                          */
1348                         if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1349                                 ASSERT(ixa->ixa_ipsec_policy == NULL);
1350                                 mp1 = ip_output_attach_policy(mp1, ipha, NULL,
1351                                     NULL, ixa);
1352                                 if (mp1 == NULL) {
1353                                         /*
1354                                          * MIB and ip_drop_packet already
1355                                          * done
1356                                          */
1357                                         continue;
1358                                 }
1359                         }
1360                 }
1361                 /* Make sure we have an NCE on this ill */
1362                 nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
1363                     ire1->ire_type);
1364                 if (nce1 == NULL) {
1365                         BUMP_MIB(ire1->ire_ill->ill_ip_mib,
1366                             ipIfStatsOutDiscards);
1367                         ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
1368                             mp1, ire1->ire_ill);
1369                         freemsg(mp1);
1370                         continue;
1371                 }
1372                 nce_orig = ixa->ixa_nce;
1373                 ixa->ixa_nce = nce1;
1374 
1375                 ire_refhold(ire1);
1376                 /*
1377                  * Ignore any errors here. We just collect the errno for
1378                  * the main ire below
1379                  */
1380                 (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
1381                 ire_refrele(ire1);
1382 
1383                 ixa->ixa_nce = nce_orig;
1384                 nce_refrele(nce1);
1385 
1386                 ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
1387         }
1388         irb_refrele(irb);
1389         /* Finally, the main one */
1390 
1391         /*
1392          * For IPMP we only send broadcasts on the ipmp_ill.
1393          */
1394         if (IS_UNDER_IPMP(ire->ire_ill)) {
1395                 freemsg(mp);
1396                 return (0);
1397         }
1398 
1399         return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1400 }
1401 
1402 /*
1403  * Send a packet using a different source address and different
1404  * IPsec policy.
1405  */
1406 static void
1407 ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
1408 {
1409         ip_xmit_attr_t ixas;
1410 
1411         bzero(&ixas, sizeof (ixas));
1412         ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1413         ixas.ixa_zoneid = ixa->ixa_zoneid;
1414         ixas.ixa_ifindex = 0;
1415         ixas.ixa_ipst = ixa->ixa_ipst;
1416         ixas.ixa_cred = ixa->ixa_cred;
1417         ixas.ixa_cpid = ixa->ixa_cpid;
1418         ixas.ixa_tsl = ixa->ixa_tsl;
1419         ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1420 
1421         (void) ip_output_simple(mp, &ixas);
1422         ixa_cleanup(&ixas);
1423 }
1424 
1425 
1426 static void
1427 multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
1428 {
1429         ip_stack_t      *ipst = ixa->ixa_ipst;
1430 
1431         /* Limit the TTL on multirt packets */
1432         if (ire->ire_type & IRE_MULTICAST) {
1433                 if (ipha->ipha_ttl > 1) {
1434                         ip2dbg(("ire_send_multirt_v4: forcing multicast "
1435                             "multirt TTL to 1 (was %d), dst 0x%08x\n",
1436                             ipha->ipha_ttl, ntohl(ire->ire_addr)));
1437                         ipha->ipha_ttl = 1;
1438                 }
1439                 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1440         } else if ((ipst->ips_ip_multirt_ttl > 0) &&
1441             (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
1442                 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
1443                 /*
1444                  * Need to ensure we don't increase the ttl should we go through
1445                  * ire_send_broadcast or multicast.
1446                  */
1447                 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1448         }
1449 }
1450 
1451 /*
1452  * ire_sendfn for IRE_MULTICAST
1453  */
1454 int
1455 ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1456     ip_xmit_attr_t *ixa, uint32_t *identp)
1457 {
1458         ipha_t          *ipha = (ipha_t *)iph_arg;
1459         ip_stack_t      *ipst = ixa->ixa_ipst;
1460         ill_t           *ill = ire->ire_ill;
1461         iaflags_t       ixaflags = ixa->ixa_flags;
1462 
1463         /*
1464          * The IRE_MULTICAST is the same whether or not multirt is in use.
1465          * Hence we need special-case code.
1466          */
1467         if (ixaflags & IXAF_MULTIRT_MULTICAST)
1468                 multirt_check_v4(ire, ipha, ixa);
1469 
1470         /*
1471          * Check if anything in ip_input_v4 wants a copy of the transmitted
1472          * packet (after IPsec and fragmentation)
1473          *
1474          * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
1475          *    RSVP and the rsvp daemon is an example of a
1476          *    protocol and user level process that
1477          *    handles it's own routing. Hence, it uses the
1478          *    SO_DONTROUTE option to accomplish this.
1479          * 2. If the sender has set IP_MULTICAST_LOOP, then we just
1480          *    check whether there are any receivers for the group on the ill
1481          *    (ignoring the zoneid).
1482          * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
1483          *    any members in other shared-IP zones.
1484          *    If such members exist, then we indicate that the sending zone
1485          *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
1486          *    behavior.
1487          *
1488          * When we loopback we skip hardware checksum to make sure loopback
1489          * copy is checksumed.
1490          *
1491          * Note that ire_ill is the upper in the case of IPMP.
1492          */
1493         ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
1494         if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
1495             !(ixaflags & IXAF_DONTROUTE)) {
1496                 ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1497         } else if (ixaflags & IXAF_MULTICAST_LOOP) {
1498                 /*
1499                  * If this zone or any other zone has members then loopback
1500                  * a copy.
1501                  */
1502                 if (ill_hasmembers_v4(ill, ipha->ipha_dst))
1503                         ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1504         } else if (ipst->ips_netstack->netstack_numzones > 1) {
1505                 /*
1506                  * This zone should not have a copy. But there are some other
1507                  * zones which might have members.
1508                  */
1509                 if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1510                     ixa->ixa_zoneid)) {
1511                         ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
1512                         ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
1513                         ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
1514                 }
1515         }
1516 
1517         /*
1518          * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
1519          * force the ttl to the IP_MULTICAST_TTL value
1520          */
1521         if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
1522                 ipha->ipha_ttl = ixa->ixa_multicast_ttl;
1523         }
1524 
1525         return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1526 }
1527 
1528 /*
1529  * ire_sendfn for IREs with RTF_MULTIRT
1530  */
1531 int
1532 ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1533     ip_xmit_attr_t *ixa, uint32_t *identp)
1534 {
1535         ipha_t          *ipha = (ipha_t *)iph_arg;
1536 
1537         multirt_check_v4(ire, ipha, ixa);
1538 
1539         if (ire->ire_type & IRE_MULTICAST)
1540                 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
1541         else if (ire->ire_type & IRE_BROADCAST)
1542                 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
1543         else
1544                 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1545 }
1546 
1547 /*
1548  * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
1549  */
1550 int
1551 ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1552     ip_xmit_attr_t *ixa, uint32_t *identp)
1553 {
1554         ip_stack_t      *ipst = ixa->ixa_ipst;
1555         ipha_t          *ipha = (ipha_t *)iph_arg;
1556         ill_t           *ill;
1557         ip_recv_attr_t  iras;
1558         boolean_t       dummy;
1559 
1560         /* We assign an IP ident for nice errors */
1561         ipha->ipha_ident = atomic_add_32_nv(identp, 1);
1562 
1563         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
1564 
1565         if (ire->ire_type & IRE_NOROUTE) {
1566                 /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
1567                 ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
1568                     RTA_DST, ipst);
1569         }
1570 
1571         if (ire->ire_flags & RTF_BLACKHOLE) {
1572                 ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
1573                 freemsg(mp);
1574                 /* No error even for local senders - silent blackhole */
1575                 return (0);
1576         }
1577         ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
1578 
1579         /*
1580          * We need an ill_t for the ip_recv_attr_t even though this packet
1581          * was never received and icmp_unreachable doesn't currently use
1582          * ira_ill.
1583          */
1584         ill = ill_lookup_on_name("lo0", B_FALSE,
1585             !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
1586         if (ill == NULL) {
1587                 freemsg(mp);
1588                 return (EHOSTUNREACH);
1589         }
1590 
1591         bzero(&iras, sizeof (iras));
1592         /* Map ixa to ira including IPsec policies */
1593         ipsec_out_to_in(ixa, ill, &iras);
1594 
1595         if (ip_source_routed(ipha, ipst)) {
1596                 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
1597         } else {
1598                 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
1599         }
1600         /* We moved any IPsec refs from ixa to iras */
1601         ira_cleanup(&iras, B_FALSE);
1602         ill_refrele(ill);
1603         return (EHOSTUNREACH);
1604 }
1605 
1606 /*
1607  * Calculate a checksum ignoring any hardware capabilities
1608  *
1609  * Returns B_FALSE if the packet was too short for the checksum. Caller
1610  * should free and do stats.
1611  */
1612 static boolean_t
1613 ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
1614 {
1615         ip_stack_t      *ipst = ixa->ixa_ipst;
1616         uint_t          pktlen = ixa->ixa_pktlen;
1617         uint16_t        *cksump;
1618         uint32_t        cksum;
1619         uint8_t         protocol = ixa->ixa_protocol;
1620         uint16_t        ip_hdr_length = ixa->ixa_ip_hdr_length;
1621         ipaddr_t        dst = ipha->ipha_dst;
1622         ipaddr_t        src = ipha->ipha_src;
1623 
1624         /* Just in case it contained garbage */
1625         DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1626 
1627         /*
1628          * Calculate ULP checksum
1629          */
1630         if (protocol == IPPROTO_TCP) {
1631                 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1632                 cksum = IP_TCP_CSUM_COMP;
1633         } else if (protocol == IPPROTO_UDP) {
1634                 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1635                 cksum = IP_UDP_CSUM_COMP;
1636         } else if (protocol == IPPROTO_SCTP) {
1637                 sctp_hdr_t      *sctph;
1638 
1639                 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1640                 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1641                 /*
1642                  * Zero out the checksum field to ensure proper
1643                  * checksum calculation.
1644                  */
1645                 sctph->sh_chksum = 0;
1646 #ifdef  DEBUG
1647                 if (!skip_sctp_cksum)
1648 #endif
1649                         sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1650                 goto ip_hdr_cksum;
1651         } else {
1652                 goto ip_hdr_cksum;
1653         }
1654 
1655         /* ULP puts the checksum field is in the first mblk */
1656         ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1657 
1658         /*
1659          * We accumulate the pseudo header checksum in cksum.
1660          * This is pretty hairy code, so watch close.  One
1661          * thing to keep in mind is that UDP and TCP have
1662          * stored their respective datagram lengths in their
1663          * checksum fields.  This lines things up real nice.
1664          */
1665         cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
1666 
1667         cksum = IP_CSUM(mp, ip_hdr_length, cksum);
1668         /*
1669          * For UDP/IPv4 a zero means that the packets wasn't checksummed.
1670          * Change to 0xffff
1671          */
1672         if (protocol == IPPROTO_UDP && cksum == 0)
1673                 *cksump = ~cksum;
1674         else
1675                 *cksump = cksum;
1676 
1677         IP_STAT(ipst, ip_out_sw_cksum);
1678         IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);
1679 
1680 ip_hdr_cksum:
1681         /* Calculate IPv4 header checksum */
1682         ipha->ipha_hdr_checksum = 0;
1683         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1684         return (B_TRUE);
1685 }
1686 
1687 /*
1688  * Calculate the ULP checksum - try to use hardware.
1689  * In the case of MULTIRT, broadcast or multicast the
1690  * IXAF_NO_HW_CKSUM is set in which case we use software.
1691  *
1692  * If the hardware supports IP header checksum offload; then clear the
1693  * contents of IP header checksum field as expected by NIC.
1694  * Do this only if we offloaded either full or partial sum.
1695  *
1696  * Returns B_FALSE if the packet was too short for the checksum. Caller
1697  * should free and do stats.
1698  */
1699 static boolean_t
1700 ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
1701     ip_xmit_attr_t *ixa, ill_t *ill)
1702 {
1703         uint_t          pktlen = ixa->ixa_pktlen;
1704         uint16_t        *cksump;
1705         uint16_t        hck_flags;
1706         uint32_t        cksum;
1707         uint8_t         protocol = ixa->ixa_protocol;
1708         uint16_t        ip_hdr_length = ixa->ixa_ip_hdr_length;
1709 
1710         if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
1711             !dohwcksum) {
1712                 return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1713         }
1714 
1715         /*
1716          * Calculate ULP checksum. Note that we don't use cksump and cksum
1717          * if the ill has FULL support.
1718          */
1719         if (protocol == IPPROTO_TCP) {
1720                 cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
1721                 cksum = IP_TCP_CSUM_COMP;       /* Pseudo-header cksum */
1722         } else if (protocol == IPPROTO_UDP) {
1723                 cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
1724                 cksum = IP_UDP_CSUM_COMP;       /* Pseudo-header cksum */
1725         } else if (protocol == IPPROTO_SCTP) {
1726                 sctp_hdr_t      *sctph;
1727 
1728                 ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
1729                 sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
1730                 /*
1731                  * Zero out the checksum field to ensure proper
1732                  * checksum calculation.
1733                  */
1734                 sctph->sh_chksum = 0;
1735 #ifdef  DEBUG
1736                 if (!skip_sctp_cksum)
1737 #endif
1738                         sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
1739                 goto ip_hdr_cksum;
1740         } else {
1741         ip_hdr_cksum:
1742                 /* Calculate IPv4 header checksum */
1743                 ipha->ipha_hdr_checksum = 0;
1744                 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1745                 return (B_TRUE);
1746         }
1747 
1748         /* ULP puts the checksum field is in the first mblk */
1749         ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
1750 
1751         /*
1752          * Underlying interface supports hardware checksum offload for
1753          * the payload; leave the payload checksum for the hardware to
1754          * calculate.  N.B: We only need to set up checksum info on the
1755          * first mblk.
1756          */
1757         hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
1758 
1759         DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
1760         if (hck_flags & HCKSUM_INET_FULL_V4) {
1761                 /*
1762                  * Hardware calculates pseudo-header, header and the
1763                  * payload checksums, so clear the checksum field in
1764                  * the protocol header.
1765                  */
1766                 *cksump = 0;
1767                 DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
1768 
1769                 ipha->ipha_hdr_checksum = 0;
1770                 if (hck_flags & HCKSUM_IPHDRCKSUM) {
1771                         DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1772                 } else {
1773                         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1774                 }
1775                 return (B_TRUE);
1776         }
1777         if ((hck_flags) & HCKSUM_INET_PARTIAL)  {
1778                 ipaddr_t        dst = ipha->ipha_dst;
1779                 ipaddr_t        src = ipha->ipha_src;
1780                 /*
1781                  * Partial checksum offload has been enabled.  Fill
1782                  * the checksum field in the protocol header with the
1783                  * pseudo-header checksum value.
1784                  *
1785                  * We accumulate the pseudo header checksum in cksum.
1786                  * This is pretty hairy code, so watch close.  One
1787                  * thing to keep in mind is that UDP and TCP have
1788                  * stored their respective datagram lengths in their
1789                  * checksum fields.  This lines things up real nice.
1790                  */
1791                 cksum += (dst >> 16) + (dst & 0xFFFF) +
1792                     (src >> 16) + (src & 0xFFFF);
1793                 cksum += *(cksump);
1794                 cksum = (cksum & 0xFFFF) + (cksum >> 16);
1795                 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
1796 
1797                 /*
1798                  * Offsets are relative to beginning of IP header.
1799                  */
1800                 DB_CKSUMSTART(mp) = ip_hdr_length;
1801                 DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
1802                 DB_CKSUMEND(mp) = pktlen;
1803                 DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
1804 
1805                 ipha->ipha_hdr_checksum = 0;
1806                 if (hck_flags & HCKSUM_IPHDRCKSUM) {
1807                         DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
1808                 } else {
1809                         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1810                 }
1811                 return (B_TRUE);
1812         }
1813         /* Hardware capabilities include neither full nor partial IPv4 */
1814         return (ip_output_sw_cksum_v4(mp, ipha, ixa));
1815 }
1816 
1817 /*
1818  * ire_sendfn for offlink and onlink destinations.
1819  * Also called from the multicast, broadcast, multirt send functions.
1820  *
1821  * Assumes that the caller has a hold on the ire.
1822  *
1823  * This function doesn't care if the IRE just became condemned since that
1824  * can happen at any time.
1825  */
1826 /* ARGSUSED */
1827 int
1828 ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1829     ip_xmit_attr_t *ixa, uint32_t *identp)
1830 {
1831         ip_stack_t      *ipst = ixa->ixa_ipst;
1832         ipha_t          *ipha = (ipha_t *)iph_arg;
1833         iaflags_t       ixaflags = ixa->ixa_flags;
1834         ill_t           *ill;
1835 
1836         ASSERT(ixa->ixa_nce != NULL);
1837         ill = ixa->ixa_nce->nce_ill;
1838 
1839         if (ixaflags & IXAF_DONTROUTE)
1840                 ipha->ipha_ttl = 1;
1841 
1842         /*
1843          * Assign an ident value for this packet. There could be other
1844          * threads targeting the same destination, so we have to arrange
1845          * for a atomic increment.  Note that we use a 32-bit atomic add
1846          * because it has better performance than its 16-bit sibling.
1847          *
1848          * Normally ixa_extra_ident is 0, but in the case of LSO it will
1849          * be the number of TCP segments  that the driver/hardware will
1850          * extraly construct.
1851          *
1852          * If running in cluster mode and if the source address
1853          * belongs to a replicated service then vector through
1854          * cl_inet_ipident vector to allocate ip identifier
1855          * NOTE: This is a contract private interface with the
1856          * clustering group.
1857          */
1858         if (cl_inet_ipident != NULL) {
1859                 ipaddr_t src = ipha->ipha_src;
1860                 ipaddr_t dst = ipha->ipha_dst;
1861                 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
1862 
1863                 ASSERT(cl_inet_isclusterwide != NULL);
1864                 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
1865                     AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
1866                         /*
1867                          * Note: not correct with LSO since we can't allocate
1868                          * ixa_extra_ident+1 consecutive values.
1869                          */
1870                         ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
1871                             IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
1872                             (uint8_t *)(uintptr_t)dst, NULL);
1873                 } else {
1874                         ipha->ipha_ident = atomic_add_32_nv(identp,
1875                             ixa->ixa_extra_ident + 1);
1876                 }
1877         } else {
1878                 ipha->ipha_ident = atomic_add_32_nv(identp,
1879                     ixa->ixa_extra_ident + 1);
1880         }
1881 #ifndef _BIG_ENDIAN
1882         ipha->ipha_ident = htons(ipha->ipha_ident);
1883 #endif
1884 
1885         /*
1886          * This might set b_band, thus the IPsec and fragmentation
1887          * code in IP ensures that b_band is updated in the first mblk.
1888          */
1889         if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
1890                 /* ip_process translates an IS_UNDER_IPMP */
1891                 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
1892                 if (mp == NULL) {
1893                         /* ip_drop_packet and MIB done */
1894                         return (0);     /* Might just be delayed */
1895                 }
1896         }
1897 
1898         /*
1899          * Verify any IPv4 options.
1900          *
1901          * The presense of IP options also forces the network stack to
1902          * calculate the checksum in software.  This is because:
1903          *
1904          * Wrap around: certain partial-checksum NICs (eri, ce) limit
1905          * the size of "start offset" width to 6-bit.  This effectively
1906          * sets the largest value of the offset to 64-bytes, starting
1907          * from the MAC header.  When the cumulative MAC and IP headers
1908          * exceed such limit, the offset will wrap around.  This causes
1909          * the checksum to be calculated at the wrong place.
1910          *
1911          * IPv4 source routing: none of the full-checksum capable NICs
1912          * is capable of correctly handling the IPv4 source-routing
1913          * option for purposes of calculating the pseudo-header; the
1914          * actual destination is different from the destination in the
1915          * header which is that of the next-hop.  (This case may not be
1916          * true for NICs which can parse IPv6 extension headers, but
1917          * we choose to simplify the implementation by not offloading
1918          * checksum when they are present.)
1919          */
1920         if (!IS_SIMPLE_IPH(ipha)) {
1921                 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
1922                 /* An IS_UNDER_IPMP ill is ok here */
1923                 if (ip_output_options(mp, ipha, ixa, ill)) {
1924                         /* Packet has been consumed and ICMP error sent */
1925                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1926                         return (EINVAL);
1927                 }
1928         }
1929 
1930         /*
1931          * To handle IPsec/iptun's labeling needs we need to tag packets
1932          * while we still have ixa_tsl
1933          */
1934         if (is_system_labeled() && ixa->ixa_tsl != NULL &&
1935             (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
1936             ill->ill_mactype == DL_IPV6)) {
1937                 cred_t *newcr;
1938 
1939                 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
1940                     KM_NOSLEEP);
1941                 if (newcr == NULL) {
1942                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1943                         ip_drop_output("ipIfStatsOutDiscards - newcr",
1944                             mp, ill);
1945                         freemsg(mp);
1946                         return (ENOBUFS);
1947                 }
1948                 mblk_setcred(mp, newcr, NOPID);
1949                 crfree(newcr);  /* mblk_setcred did its own crhold */
1950         }
1951 
1952         if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
1953             (ixaflags & IXAF_IPSEC_SECURE)) {
1954                 uint32_t pktlen;
1955 
1956                 pktlen = ixa->ixa_pktlen;
1957                 if (ixaflags & IXAF_IPSEC_SECURE)
1958                         pktlen += ipsec_out_extra_length(ixa);
1959 
1960                 if (pktlen > IP_MAXPACKET)
1961                         return (EMSGSIZE);
1962 
1963                 if (ixaflags & IXAF_SET_ULP_CKSUM) {
1964                         /*
1965                          * Compute ULP checksum and IP header checksum
1966                          * using software
1967                          */
1968                         if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1969                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1970                                 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1971                                 freemsg(mp);
1972                                 return (EINVAL);
1973                         }
1974                 } else {
1975                         /* Calculate IPv4 header checksum */
1976                         ipha->ipha_hdr_checksum = 0;
1977                         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1978                 }
1979 
1980                 /*
1981                  * If this packet would generate a icmp_frag_needed
1982                  * message, we need to handle it before we do the IPsec
1983                  * processing. Otherwise, we need to strip the IPsec
1984                  * headers before we send up the message to the ULPs
1985                  * which becomes messy and difficult.
1986                  *
1987                  * We check using IXAF_DONTFRAG. The DF bit in the header
1988                  * is not inspected - it will be copied to any generated
1989                  * fragments.
1990                  */
1991                 if ((pktlen > ixa->ixa_fragsize) &&
1992                     (ixaflags & IXAF_DONTFRAG)) {
1993                         /* Generate ICMP and return error */
1994                         ip_recv_attr_t  iras;
1995 
1996                         DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
1997                             uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
1998                             uint_t, ixa->ixa_pmtu);
1999 
2000                         bzero(&iras, sizeof (iras));
2001                         /* Map ixa to ira including IPsec policies */
2002                         ipsec_out_to_in(ixa, ill, &iras);
2003 
2004                         ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
2005                         icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
2006                         /* We moved any IPsec refs from ixa to iras */
2007                         ira_cleanup(&iras, B_FALSE);
2008                         return (EMSGSIZE);
2009                 }
2010                 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
2011                     uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2012                     uint_t, ixa->ixa_pmtu);
2013 
2014                 if (ixaflags & IXAF_IPSEC_SECURE) {
2015                         /*
2016                          * Pass in sufficient information so that
2017                          * IPsec can determine whether to fragment, and
2018                          * which function to call after fragmentation.
2019                          */
2020                         return (ipsec_out_process(mp, ixa));
2021                 }
2022                 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
2023                     ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
2024                     ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
2025                     ixa->ixa_postfragfn, &ixa->ixa_cookie));
2026         }
2027         if (ixaflags & IXAF_SET_ULP_CKSUM) {
2028                 /* Compute ULP checksum and IP header checksum */
2029                 /* An IS_UNDER_IPMP ill is ok here */
2030                 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
2031                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2032                         ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2033                         freemsg(mp);
2034                         return (EINVAL);
2035                 }
2036         } else {
2037                 /* Calculate IPv4 header checksum */
2038                 ipha->ipha_hdr_checksum = 0;
2039                 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2040         }
2041         return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
2042             ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
2043             ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
2044 }
2045 
2046 /*
2047  * Send mp into ip_input
2048  * Common for IPv4 and IPv6
2049  */
2050 void
2051 ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2052     uint_t pkt_len, zoneid_t nolzid)
2053 {
2054         rtc_t           rtc;
2055         ill_t           *ill = nce->nce_ill;
2056         ip_recv_attr_t  iras;   /* NOTE: No bzero for performance */
2057         ncec_t          *ncec;
2058 
2059         ncec = nce->nce_common;
2060         iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
2061             IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
2062         if (ncec->ncec_flags & NCE_F_BCAST)
2063                 iras.ira_flags |= IRAF_L2DST_BROADCAST;
2064         else if (ncec->ncec_flags & NCE_F_MCAST)
2065                 iras.ira_flags |= IRAF_L2DST_MULTICAST;
2066 
2067         iras.ira_free_flags = 0;
2068         iras.ira_cred = NULL;
2069         iras.ira_cpid = NOPID;
2070         iras.ira_tsl = NULL;
2071         iras.ira_zoneid = ALL_ZONES;
2072         iras.ira_pktlen = pkt_len;
2073         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
2074         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
2075 
2076         if (ixaflags & IXAF_IS_IPV4)
2077                 iras.ira_flags |= IRAF_IS_IPV4;
2078 
2079         iras.ira_ill = iras.ira_rill = ill;
2080         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2081         iras.ira_rifindex = iras.ira_ruifindex;
2082         iras.ira_mhip = NULL;
2083 
2084         iras.ira_flags |= ixaflags & IAF_MASK;
2085         iras.ira_no_loop_zoneid = nolzid;
2086 
2087         /* Broadcast and multicast doesn't care about the squeue */
2088         iras.ira_sqp = NULL;
2089 
2090         rtc.rtc_ire = NULL;
2091         if (ixaflags & IXAF_IS_IPV4) {
2092                 ipha_t          *ipha = (ipha_t *)mp->b_rptr;
2093 
2094                 rtc.rtc_ipaddr = INADDR_ANY;
2095 
2096                 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
2097                 if (rtc.rtc_ire != NULL) {
2098                         ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
2099                         ire_refrele(rtc.rtc_ire);
2100                 }
2101         } else {
2102                 ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
2103 
2104                 rtc.rtc_ip6addr = ipv6_all_zeros;
2105 
2106                 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
2107                 if (rtc.rtc_ire != NULL) {
2108                         ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
2109                         ire_refrele(rtc.rtc_ire);
2110                 }
2111         }
2112         /* Any references to clean up? No hold on ira */
2113         if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
2114                 ira_cleanup(&iras, B_FALSE);
2115 }
2116 
2117 /*
2118  * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
2119  * looks at the IXAF_LOOPBACK_COPY flag.
2120  * Common for IPv4 and IPv6.
2121  *
2122  * If the loopback copy fails (due to no memory) but we send the packet out
2123  * on the wire we return no failure. Only in the case we supress the wire
2124  * sending do we take the loopback failure into account.
2125  *
2126  * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
2127  * Those operations are performed on this packet in ip_xmit() and it would
2128  * be odd to do it twice for the same packet.
2129  */
2130 int
2131 ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2132     uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2133     uintptr_t *ixacookie)
2134 {
2135         ill_t           *ill = nce->nce_ill;
2136         int             error = 0;
2137 
2138         /*
2139          * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
2140          * had looped it back
2141          */
2142         if (ixaflags & IXAF_LOOPBACK_COPY) {
2143                 mblk_t          *mp1;
2144 
2145                 mp1 = copymsg(mp);
2146                 if (mp1 == NULL) {
2147                         /* Failed to deliver the loopback copy. */
2148                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2149                         ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2150                         error = ENOBUFS;
2151                 } else {
2152                         ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2153                             nolzid);
2154                 }
2155         }
2156 
2157         /*
2158          * If TTL = 0 then only do the loopback to this host i.e. we are
2159          * done. We are also done if this was the
2160          * loopback interface since it is sufficient
2161          * to loopback one copy of a multicast packet.
2162          */
2163         if (ixaflags & IXAF_IS_IPV4) {
2164                 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2165 
2166                 if (ipha->ipha_ttl == 0) {
2167                         ip_drop_output("multicast ipha_ttl not sent to wire",
2168                             mp, ill);
2169                         freemsg(mp);
2170                         return (error);
2171                 }
2172         } else {
2173                 ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
2174 
2175                 if (ip6h->ip6_hops == 0) {
2176                         ip_drop_output("multicast ipha_ttl not sent to wire",
2177                             mp, ill);
2178                         freemsg(mp);
2179                         return (error);
2180                 }
2181         }
2182         if (nce->nce_ill->ill_wq == NULL) {
2183                 /* Loopback interface */
2184                 ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
2185                 freemsg(mp);
2186                 return (error);
2187         }
2188 
2189         return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2190             ixacookie));
2191 }
2192 
2193 /*
2194  * Post fragmentation function for RTF_MULTIRT routes.
2195  * Since IRE_BROADCASTs can have RTF_MULTIRT, this function
2196  * checks IXAF_LOOPBACK_COPY.
2197  *
2198  * If no packet is sent due to failures then we return an errno, but if at
2199  * least one succeeded we return zero.
2200  */
2201 int
2202 ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2203     uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2204     uintptr_t *ixacookie)
2205 {
2206         irb_t           *irb;
2207         ipha_t          *ipha = (ipha_t *)mp->b_rptr;
2208         ire_t           *ire;
2209         ire_t           *ire1;
2210         mblk_t          *mp1;
2211         nce_t           *nce1;
2212         ill_t           *ill = nce->nce_ill;
2213         ill_t           *ill1;
2214         ip_stack_t      *ipst = ill->ill_ipst;
2215         int             error = 0;
2216         int             num_sent = 0;
2217         int             err;
2218         uint_t          ire_type;
2219         ipaddr_t        nexthop;
2220 
2221         ASSERT(ixaflags & IXAF_IS_IPV4);
2222 
2223         /* Check for IXAF_LOOPBACK_COPY */
2224         if (ixaflags & IXAF_LOOPBACK_COPY) {
2225                 mblk_t *mp1;
2226 
2227                 mp1 = copymsg(mp);
2228                 if (mp1 == NULL) {
2229                         /* Failed to deliver the loopback copy. */
2230                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2231                         ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2232                         error = ENOBUFS;
2233                 } else {
2234                         ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2235                             nolzid);
2236                 }
2237         }
2238 
2239         /*
2240          * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
2241          * a copy to each one.
2242          * Use the nce (nexthop) and ipha_dst to find the ire.
2243          *
2244          * MULTIRT is not designed to work with shared-IP zones thus we don't
2245          * need to pass a zoneid or a label to the IRE lookup.
2246          */
2247         if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
2248                 /* Broadcast and multicast case */
2249                 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
2250                     NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2251         } else {
2252                 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);
2253 
2254                 /* Unicast case */
2255                 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
2256                     NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
2257         }
2258 
2259         if (ire == NULL ||
2260             (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2261             !(ire->ire_flags & RTF_MULTIRT)) {
2262                 /* Drop */
2263                 ip_drop_output("ip_postfrag_multirt didn't find route",
2264                     mp, nce->nce_ill);
2265                 if (ire != NULL)
2266                         ire_refrele(ire);
2267                 return (ENETUNREACH);
2268         }
2269 
2270         irb = ire->ire_bucket;
2271         irb_refhold(irb);
2272         for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2273                 /*
2274                  * For broadcast we can have a mixture of IRE_BROADCAST and
2275                  * IRE_HOST due to the manually added IRE_HOSTs that are used
2276                  * to trigger the creation of the special CGTP broadcast routes.
2277                  * Thus we have to skip if ire_type doesn't match the original.
2278                  */
2279                 if (IRE_IS_CONDEMNED(ire1) ||
2280                     !(ire1->ire_flags & RTF_MULTIRT) ||
2281                     ire1->ire_type != ire->ire_type)
2282                         continue;
2283 
2284                 /* Do the ire argument one after the loop */
2285                 if (ire1 == ire)
2286                         continue;
2287 
2288                 ill1 = ire_nexthop_ill(ire1);
2289                 if (ill1 == NULL) {
2290                         /*
2291                          * This ire might not have been picked by
2292                          * ire_route_recursive, in which case ire_dep might
2293                          * not have been setup yet.
2294                          * We kick ire_route_recursive to try to resolve
2295                          * starting at ire1.
2296                          */
2297                         ire_t *ire2;
2298                         uint_t  match_flags = MATCH_IRE_DSTONLY;
2299 
2300                         if (ire1->ire_ill != NULL)
2301                                 match_flags |= MATCH_IRE_ILL;
2302                         ire2 = ire_route_recursive_impl_v4(ire1,
2303                             ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
2304                             ire1->ire_zoneid, NULL, match_flags,
2305                             IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2306                         if (ire2 != NULL)
2307                                 ire_refrele(ire2);
2308                         ill1 = ire_nexthop_ill(ire1);
2309                 }
2310 
2311                 if (ill1 == NULL) {
2312                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2313                         ip_drop_output("ipIfStatsOutDiscards - no ill",
2314                             mp, ill);
2315                         error = ENETUNREACH;
2316                         continue;
2317                 }
2318 
2319                 /* Pick the addr and type to use for arp_nce_init */
2320                 if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
2321                         ire_type = IRE_BROADCAST;
2322                         nexthop = ire1->ire_gateway_addr;
2323                 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
2324                         ire_type = IRE_MULTICAST;
2325                         nexthop = ipha->ipha_dst;
2326                 } else {
2327                         ire_type = ire1->ire_type;   /* Doesn't matter */
2328                         nexthop = ire1->ire_gateway_addr;
2329                 }
2330 
2331                 /* If IPMP meta or under, then we just drop */
2332                 if (ill1->ill_grp != NULL) {
2333                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2334                         ip_drop_output("ipIfStatsOutDiscards - IPMP",
2335                             mp, ill1);
2336                         ill_refrele(ill1);
2337                         error = ENETUNREACH;
2338                         continue;
2339                 }
2340 
2341                 nce1 = arp_nce_init(ill1, nexthop, ire_type);
2342                 if (nce1 == NULL) {
2343                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2344                         ip_drop_output("ipIfStatsOutDiscards - no nce",
2345                             mp, ill1);
2346                         ill_refrele(ill1);
2347                         error = ENETUNREACH;
2348                         continue;
2349                 }
2350                 mp1 = copymsg(mp);
2351                 if (mp1 == NULL) {
2352                         BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2353                         ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
2354                         nce_refrele(nce1);
2355                         ill_refrele(ill1);
2356                         error = ENOBUFS;
2357                         continue;
2358                 }
2359                 /* Preserve HW checksum for this copy */
2360                 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
2361                 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
2362                 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
2363                 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
2364                 DB_LSOMSS(mp1) = DB_LSOMSS(mp);
2365 
2366                 ire1->ire_ob_pkt_count++;
2367                 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
2368                     0, ixacookie);
2369                 if (err == 0)
2370                         num_sent++;
2371                 else
2372                         error = err;
2373                 nce_refrele(nce1);
2374                 ill_refrele(ill1);
2375         }
2376         irb_refrele(irb);
2377         ire_refrele(ire);
2378         /* Finally, the main one */
2379         err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2380             ixacookie);
2381         if (err == 0)
2382                 num_sent++;
2383         else
2384                 error = err;
2385         if (num_sent > 0)
2386                 return (0);
2387         else
2388                 return (error);
2389 }
2390 
2391 /*
2392  * Verify local connectivity. This check is called by ULP fusion code.
2393  * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
2394  * the interface is brought down and back up. So we simply fail the local
2395  * process. The caller, TCP Fusion, should unfuse the connection.
2396  */
2397 boolean_t
2398 ip_output_verify_local(ip_xmit_attr_t *ixa)
2399 {
2400         ire_t           *ire = ixa->ixa_ire;
2401 
2402         if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
2403                 return (B_FALSE);
2404 
2405         return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
2406 }
2407 
2408 /*
2409  * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
2410  *
2411  * The caller must call ip_output_verify_local() first. This function handles
2412  * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
2413  */
2414 mblk_t *
2415 ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
2416     boolean_t hooks_in, conn_t *peer_connp)
2417 {
2418         ill_t           *ill = ixa->ixa_ire->ire_ill;
2419         ipha_t          *ipha = NULL;
2420         ip6_t           *ip6h = NULL;
2421         ip_stack_t      *ipst = ixa->ixa_ipst;
2422         iaflags_t       ixaflags = ixa->ixa_flags;
2423         ip_recv_attr_t  iras;
2424         int             error;
2425 
2426         ASSERT(mp != NULL);
2427 
2428         if (ixaflags & IXAF_IS_IPV4) {
2429                 ipha = (ipha_t *)mp->b_rptr;
2430 
2431                 /*
2432                  * If a callback is enabled then we need to know the
2433                  * source and destination zoneids for the packet. We already
2434                  * have those handy.
2435                  */
2436                 if (ipst->ips_ip4_observe.he_interested) {
2437                         zoneid_t szone, dzone;
2438                         zoneid_t stackzoneid;
2439 
2440                         stackzoneid = netstackid_to_zoneid(
2441                             ipst->ips_netstack->netstack_stackid);
2442 
2443                         if (stackzoneid == GLOBAL_ZONEID) {
2444                                 /* Shared-IP zone */
2445                                 dzone = ixa->ixa_ire->ire_zoneid;
2446                                 szone = ixa->ixa_zoneid;
2447                         } else {
2448                                 szone = dzone = stackzoneid;
2449                         }
2450                         ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2451                             ipst);
2452                 }
2453                 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2454                     ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2455                     NULL, int, 1);
2456 
2457                 /* FW_HOOKS: LOOPBACK_OUT */
2458                 if (hooks_out) {
2459                         DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
2460                             ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
2461                         FW_HOOKS(ipst->ips_ip4_loopback_out_event,
2462                             ipst->ips_ipv4firewall_loopback_out,
2463                             NULL, ill, ipha, mp, mp, 0, ipst, error);
2464                         DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
2465                 }
2466                 if (mp == NULL)
2467                         return (NULL);
2468 
2469                 /* FW_HOOKS: LOOPBACK_IN */
2470                 if (hooks_in) {
2471                         DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
2472                             ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
2473                         FW_HOOKS(ipst->ips_ip4_loopback_in_event,
2474                             ipst->ips_ipv4firewall_loopback_in,
2475                             ill, NULL, ipha, mp, mp, 0, ipst, error);
2476                         DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
2477                 }
2478                 if (mp == NULL)
2479                         return (NULL);
2480 
2481                 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2482                     ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2483                     NULL, int, 1);
2484 
2485                 /* Inbound IPsec polocies */
2486                 if (peer_connp != NULL) {
2487                         /* Map ixa to ira including IPsec policies. */
2488                         ipsec_out_to_in(ixa, ill, &iras);
2489                         mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
2490                             NULL, &iras);
2491                 }
2492         } else {
2493                 ip6h = (ip6_t *)mp->b_rptr;
2494 
2495                 /*
2496                  * If a callback is enabled then we need to know the
2497                  * source and destination zoneids for the packet. We already
2498                  * have those handy.
2499                  */
2500                 if (ipst->ips_ip6_observe.he_interested) {
2501                         zoneid_t szone, dzone;
2502                         zoneid_t stackzoneid;
2503 
2504                         stackzoneid = netstackid_to_zoneid(
2505                             ipst->ips_netstack->netstack_stackid);
2506 
2507                         if (stackzoneid == GLOBAL_ZONEID) {
2508                                 /* Shared-IP zone */
2509                                 dzone = ixa->ixa_ire->ire_zoneid;
2510                                 szone = ixa->ixa_zoneid;
2511                         } else {
2512                                 szone = dzone = stackzoneid;
2513                         }
2514                         ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2515                             ipst);
2516                 }
2517                 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2518                     ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2519                     ip6h, int, 1);
2520 
2521                 /* FW_HOOKS: LOOPBACK_OUT */
2522                 if (hooks_out) {
2523                         DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
2524                             ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
2525                         FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
2526                             ipst->ips_ipv6firewall_loopback_out,
2527                             NULL, ill, ip6h, mp, mp, 0, ipst, error);
2528                         DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
2529                 }
2530                 if (mp == NULL)
2531                         return (NULL);
2532 
2533                 /* FW_HOOKS: LOOPBACK_IN */
2534                 if (hooks_in) {
2535                         DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
2536                             ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
2537                         FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
2538                             ipst->ips_ipv6firewall_loopback_in,
2539                             ill, NULL, ip6h, mp, mp, 0, ipst, error);
2540                         DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
2541                 }
2542                 if (mp == NULL)
2543                         return (NULL);
2544 
2545                 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2546                     ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2547                     ip6h, int, 1);
2548 
2549                 /* Inbound IPsec polocies */
2550                 if (peer_connp != NULL) {
2551                         /* Map ixa to ira including IPsec policies. */
2552                         ipsec_out_to_in(ixa, ill, &iras);
2553                         mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
2554                             ip6h, &iras);
2555                 }
2556         }
2557 
2558         if (mp == NULL) {
2559                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2560                 ip_drop_input("ipIfStatsInDiscards", NULL, ill);
2561         }
2562 
2563         return (mp);
2564 }