1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /* Copyright (c) 1990 Mentat Inc. */
  26 
  27 #include <sys/types.h>
  28 #include <sys/stream.h>
  29 #include <sys/strsun.h>
  30 #include <sys/zone.h>
  31 #include <sys/ddi.h>
  32 #include <sys/sunddi.h>
  33 #include <sys/cmn_err.h>
  34 #include <sys/debug.h>
  35 #include <sys/atomic.h>
  36 
  37 #include <sys/systm.h>
  38 #include <sys/param.h>
  39 #include <sys/kmem.h>
  40 #include <sys/sdt.h>
  41 #include <sys/socket.h>
  42 #include <sys/mac.h>
  43 #include <net/if.h>
  44 #include <net/if_arp.h>
  45 #include <net/route.h>
  46 #include <sys/sockio.h>
  47 #include <netinet/in.h>
  48 #include <net/if_dl.h>
  49 
  50 #include <inet/common.h>
  51 #include <inet/mi.h>
  52 #include <inet/mib2.h>
  53 #include <inet/nd.h>
  54 #include <inet/arp.h>
  55 #include <inet/snmpcom.h>
  56 #include <inet/kstatcom.h>
  57 
  58 #include <netinet/igmp_var.h>
  59 #include <netinet/ip6.h>
  60 #include <netinet/icmp6.h>
  61 #include <netinet/sctp.h>
  62 
  63 #include <inet/ip.h>
  64 #include <inet/ip_impl.h>
  65 #include <inet/ip6.h>
  66 #include <inet/ip6_asp.h>
  67 #include <inet/tcp.h>
  68 #include <inet/ip_multi.h>
  69 #include <inet/ip_if.h>
  70 #include <inet/ip_ire.h>
  71 #include <inet/ip_ftable.h>
  72 #include <inet/ip_rts.h>
  73 #include <inet/optcom.h>
  74 #include <inet/ip_ndp.h>
  75 #include <inet/ip_listutils.h>
  76 #include <netinet/igmp.h>
  77 #include <netinet/ip_mroute.h>
  78 #include <inet/ipp_common.h>
  79 
  80 #include <net/pfkeyv2.h>
  81 #include <inet/sadb.h>
  82 #include <inet/ipsec_impl.h>
  83 #include <inet/ipdrop.h>
  84 #include <inet/ip_netinfo.h>
  85 #include <sys/squeue_impl.h>
  86 #include <sys/squeue.h>
  87 
  88 #include <inet/ipclassifier.h>
  89 #include <inet/sctp_ip.h>
  90 #include <inet/sctp/sctp_impl.h>
  91 #include <inet/udp_impl.h>
  92 #include <sys/sunddi.h>
  93 
  94 #include <sys/tsol/label.h>
  95 #include <sys/tsol/tnet.h>
  96 
  97 /*
  98  * Release a reference on ip_xmit_attr.
  99  * The reference is acquired by conn_get_ixa()
 100  */
 101 #define IXA_REFRELE(ixa)                                        \
 102 {                                                               \
 103         if (atomic_add_32_nv(&(ixa)->ixa_refcnt, -1) == 0)       \
 104                 ixa_inactive(ixa);                              \
 105 }
 106 
 107 #define IXA_REFHOLD(ixa)                                        \
 108 {                                                               \
 109         ASSERT((ixa)->ixa_refcnt != 0);                              \
 110         atomic_add_32(&(ixa)->ixa_refcnt, 1);                    \
 111 }
 112 
 113 /*
 114  * When we need to handle a transmit side asynchronous operation, then we need
 115  * to save sufficient information so that we can call the fragment and postfrag
 116  * functions. That information is captured in an mblk containing this structure.
 117  *
 118  * Since this is currently only used for IPsec, we include information for
 119  * the kernel crypto framework.
 120  */
 121 typedef struct ixamblk_s {
 122         boolean_t       ixm_inbound;    /* B_FALSE */
 123         iaflags_t       ixm_flags;      /* ixa_flags */
 124         netstackid_t    ixm_stackid;    /* Verify it didn't go away */
 125         uint_t          ixm_ifindex;    /* Used to find the nce */
 126         in6_addr_t      ixm_nceaddr_v6; /* Used to find nce */
 127 #define ixm_nceaddr_v4  V4_PART_OF_V6(ixm_nceaddr_v6)
 128         uint32_t        ixm_fragsize;
 129         uint_t          ixm_pktlen;
 130         uint16_t        ixm_ip_hdr_length; /* Points to ULP header */
 131         uint8_t         ixm_protocol;   /* Protocol number for ULP cksum */
 132         pfirepostfrag_t ixm_postfragfn;
 133 
 134         zoneid_t        ixm_zoneid;             /* Needed for ipobs */
 135         zoneid_t        ixm_no_loop_zoneid;     /* IXAF_NO_LOOP_ZONEID_SET */
 136 
 137         uint_t          ixm_scopeid;            /* For IPv6 link-locals */
 138 
 139         uint32_t        ixm_ident;              /* For IPv6 fragment header */
 140         uint32_t        ixm_xmit_hint;
 141 
 142         uint64_t        ixm_conn_id;            /* Used by DTrace */
 143         cred_t          *ixm_cred;      /* For getpeerucred - refhold if set */
 144         pid_t           ixm_cpid;       /* For getpeerucred */
 145 
 146         ts_label_t      *ixm_tsl;       /* Refhold if set. */
 147 
 148         /*
 149          * When the pointers below are set they have a refhold on the struct.
 150          */
 151         ipsec_latch_t           *ixm_ipsec_latch;
 152         struct ipsa_s           *ixm_ipsec_ah_sa;       /* SA for AH */
 153         struct ipsa_s           *ixm_ipsec_esp_sa;      /* SA for ESP */
 154         struct ipsec_policy_s   *ixm_ipsec_policy;      /* why are we here? */
 155         struct ipsec_action_s   *ixm_ipsec_action; /* For reflected packets */
 156 
 157         ipsa_ref_t              ixm_ipsec_ref[2]; /* Soft reference to SA */
 158 
 159         /* Need these while waiting for SA */
 160         uint16_t ixm_ipsec_src_port;    /* Source port number of d-gram. */
 161         uint16_t ixm_ipsec_dst_port;    /* Destination port number of d-gram. */
 162         uint8_t  ixm_ipsec_icmp_type;   /* ICMP type of d-gram */
 163         uint8_t  ixm_ipsec_icmp_code;   /* ICMP code of d-gram */
 164 
 165         sa_family_t ixm_ipsec_inaf;     /* Inner address family */
 166         uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN];      /* Inner src address */
 167         uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN];      /* Inner dest address */
 168         uint8_t  ixm_ipsec_insrcpfx;    /* Inner source prefix */
 169         uint8_t  ixm_ipsec_indstpfx;    /* Inner destination prefix */
 170 
 171         uint8_t ixm_ipsec_proto;        /* IP protocol number for d-gram. */
 172 } ixamblk_t;
 173 
 174 
 175 /*
 176  * When we need to handle a receive side asynchronous operation, then we need
 177  * to save sufficient information so that we can call ip_fanout.
 178  * That information is captured in an mblk containing this structure.
 179  *
 180  * Since this is currently only used for IPsec, we include information for
 181  * the kernel crypto framework.
 182  */
 183 typedef struct iramblk_s {
 184         boolean_t       irm_inbound;    /* B_TRUE */
 185         iaflags_t       irm_flags;      /* ira_flags */
 186         netstackid_t    irm_stackid;    /* Verify it didn't go away */
 187         uint_t          irm_ifindex;    /* To find ira_ill */
 188 
 189         uint_t          irm_rifindex;   /* ira_rifindex */
 190         uint_t          irm_ruifindex;  /* ira_ruifindex */
 191         uint_t          irm_pktlen;
 192         uint16_t        irm_ip_hdr_length; /* Points to ULP header */
 193         uint8_t         irm_protocol;   /* Protocol number for ULP cksum */
 194         zoneid_t        irm_zoneid;     /* ALL_ZONES unless local delivery */
 195 
 196         squeue_t        *irm_sqp;
 197         ill_rx_ring_t   *irm_ring;
 198 
 199         ipaddr_t        irm_mroute_tunnel;      /* IRAF_MROUTE_TUNNEL_SET */
 200         zoneid_t        irm_no_loop_zoneid;     /* IRAF_NO_LOOP_ZONEID_SET */
 201         uint32_t        irm_esp_udp_ports;      /* IRAF_ESP_UDP_PORTS */
 202 
 203         char            irm_l2src[IRA_L2SRC_SIZE];      /* If IRAF_L2SRC_SET */
 204 
 205         cred_t          *irm_cred;      /* For getpeerucred - refhold if set */
 206         pid_t           irm_cpid;       /* For getpeerucred */
 207 
 208         ts_label_t      *irm_tsl;       /* Refhold if set. */
 209 
 210         /*
 211          * When set these correspond to a refhold on the object.
 212          */
 213         struct ipsa_s           *irm_ipsec_ah_sa;       /* SA for AH */
 214         struct ipsa_s           *irm_ipsec_esp_sa;      /* SA for ESP */
 215         struct ipsec_action_s   *irm_ipsec_action; /* For reflected packets */
 216 } iramblk_t;
 217 
 218 
 219 /*
 220  * Take the information in ip_xmit_attr_t and stick it in an mblk
 221  * that can later be passed to ip_xmit_attr_from_mblk to recreate the
 222  * ip_xmit_attr_t.
 223  *
 224  * Returns NULL on memory allocation failure.
 225  */
 226 mblk_t *
 227 ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa)
 228 {
 229         mblk_t          *ixamp;
 230         ixamblk_t       *ixm;
 231         nce_t           *nce = ixa->ixa_nce;
 232 
 233         ASSERT(nce != NULL);
 234         ixamp = allocb(sizeof (*ixm), BPRI_MED);
 235         if (ixamp == NULL)
 236                 return (NULL);
 237 
 238         ixamp->b_datap->db_type = M_BREAK;
 239         ixamp->b_wptr += sizeof (*ixm);
 240         ixm = (ixamblk_t *)ixamp->b_rptr;
 241 
 242         bzero(ixm, sizeof (*ixm));
 243         ixm->ixm_inbound = B_FALSE;
 244         ixm->ixm_flags = ixa->ixa_flags;
 245         ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid;
 246         ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex;
 247         ixm->ixm_nceaddr_v6 = nce->nce_addr;
 248         ixm->ixm_fragsize = ixa->ixa_fragsize;
 249         ixm->ixm_pktlen = ixa->ixa_pktlen;
 250         ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length;
 251         ixm->ixm_protocol = ixa->ixa_protocol;
 252         ixm->ixm_postfragfn = ixa->ixa_postfragfn;
 253         ixm->ixm_zoneid = ixa->ixa_zoneid;
 254         ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
 255         ixm->ixm_scopeid = ixa->ixa_scopeid;
 256         ixm->ixm_ident = ixa->ixa_ident;
 257         ixm->ixm_xmit_hint = ixa->ixa_xmit_hint;
 258 
 259         if (ixa->ixa_tsl != NULL) {
 260                 ixm->ixm_tsl = ixa->ixa_tsl;
 261                 label_hold(ixm->ixm_tsl);
 262         }
 263         if (ixa->ixa_cred != NULL) {
 264                 ixm->ixm_cred = ixa->ixa_cred;
 265                 crhold(ixa->ixa_cred);
 266         }
 267         ixm->ixm_cpid = ixa->ixa_cpid;
 268         ixm->ixm_conn_id = ixa->ixa_conn_id;
 269 
 270         if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
 271                 if (ixa->ixa_ipsec_ah_sa != NULL) {
 272                         ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa;
 273                         IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
 274                 }
 275                 if (ixa->ixa_ipsec_esp_sa != NULL) {
 276                         ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa;
 277                         IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
 278                 }
 279                 if (ixa->ixa_ipsec_policy != NULL) {
 280                         ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy;
 281                         IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
 282                 }
 283                 if (ixa->ixa_ipsec_action != NULL) {
 284                         ixm->ixm_ipsec_action = ixa->ixa_ipsec_action;
 285                         IPACT_REFHOLD(ixa->ixa_ipsec_action);
 286                 }
 287                 if (ixa->ixa_ipsec_latch != NULL) {
 288                         ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch;
 289                         IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
 290                 }
 291                 ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0];
 292                 ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1];
 293                 ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port;
 294                 ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port;
 295                 ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type;
 296                 ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code;
 297                 ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf;
 298                 ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0];
 299                 ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1];
 300                 ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2];
 301                 ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3];
 302                 ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0];
 303                 ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1];
 304                 ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2];
 305                 ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3];
 306                 ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx;
 307                 ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx;
 308                 ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
 309         }
 310         return (ixamp);
 311 }
 312 
 313 /*
 314  * Extract the ip_xmit_attr_t from the mblk, checking that the
 315  * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
 316  * not the case.
 317  *
 318  * Otherwise ixa is updated.
 319  * Caller needs to release references on the ixa by calling ixa_refrele()
 320  * which will imediately call ixa_inactive to release the references.
 321  */
 322 boolean_t
 323 ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
 324 {
 325         ixamblk_t       *ixm;
 326         netstack_t      *ns;
 327         ip_stack_t      *ipst;
 328         ill_t           *ill;
 329         nce_t           *nce;
 330 
 331         /* We assume the caller hasn't initialized ixa */
 332         bzero(ixa, sizeof (*ixa));
 333 
 334         ASSERT(DB_TYPE(ixamp) == M_BREAK);
 335         ASSERT(ixamp->b_cont == NULL);
 336 
 337         ixm = (ixamblk_t *)ixamp->b_rptr;
 338         ASSERT(!ixm->ixm_inbound);
 339 
 340         /* Verify the netstack is still around */
 341         ns = netstack_find_by_stackid(ixm->ixm_stackid);
 342         if (ns == NULL) {
 343                 /* Disappeared on us */
 344                 (void) ip_xmit_attr_free_mblk(ixamp);
 345                 return (B_FALSE);
 346         }
 347         ipst = ns->netstack_ip;
 348 
 349         /* Verify the ill is still around */
 350         ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
 351             !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);
 352 
 353         /* We have the ill, hence the netstack can't go away */
 354         netstack_rele(ns);
 355         if (ill == NULL) {
 356                 /* Disappeared on us */
 357                 (void) ip_xmit_attr_free_mblk(ixamp);
 358                 return (B_FALSE);
 359         }
 360         /*
 361          * Find the nce. We don't load-spread (only lookup nce's on the ill)
 362          * because we want to find the same nce as the one we had when
 363          * ip_xmit_attr_to_mblk was called.
 364          */
 365         if (ixm->ixm_flags & IXAF_IS_IPV4) {
 366                 nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
 367         } else {
 368                 nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
 369         }
 370 
 371         /* We have the nce, hence the ill can't go away */
 372         ill_refrele(ill);
 373         if (nce == NULL) {
 374                 /*
 375                  * Since this is unusual and we don't know what type of
 376                  * nce it was, we drop the packet.
 377                  */
 378                 (void) ip_xmit_attr_free_mblk(ixamp);
 379                 return (B_FALSE);
 380         }
 381 
 382         ixa->ixa_flags = ixm->ixm_flags;
 383         ixa->ixa_refcnt = 1;
 384         ixa->ixa_ipst = ipst;
 385         ixa->ixa_fragsize = ixm->ixm_fragsize;
 386         ixa->ixa_pktlen =  ixm->ixm_pktlen;
 387         ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length;
 388         ixa->ixa_protocol = ixm->ixm_protocol;
 389         ixa->ixa_nce = nce;
 390         ixa->ixa_postfragfn = ixm->ixm_postfragfn;
 391         ixa->ixa_zoneid = ixm->ixm_zoneid;
 392         ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid;
 393         ixa->ixa_scopeid = ixm->ixm_scopeid;
 394         ixa->ixa_ident = ixm->ixm_ident;
 395         ixa->ixa_xmit_hint = ixm->ixm_xmit_hint;
 396 
 397         if (ixm->ixm_tsl != NULL) {
 398                 ixa->ixa_tsl = ixm->ixm_tsl;
 399                 ixa->ixa_free_flags |= IXA_FREE_TSL;
 400                 ixm->ixm_tsl = NULL;
 401         }
 402         if (ixm->ixm_cred != NULL) {
 403                 ixa->ixa_cred = ixm->ixm_cred;
 404                 ixa->ixa_free_flags |= IXA_FREE_CRED;
 405                 ixm->ixm_cred = NULL;
 406         }
 407         ixa->ixa_cpid = ixm->ixm_cpid;
 408         ixa->ixa_conn_id = ixm->ixm_conn_id;
 409 
 410         ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa;
 411         ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa;
 412         ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy;
 413         ixa->ixa_ipsec_action = ixm->ixm_ipsec_action;
 414         ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch;
 415 
 416         ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0];
 417         ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1];
 418         ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port;
 419         ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port;
 420         ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type;
 421         ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code;
 422         ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf;
 423         ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0];
 424         ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1];
 425         ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2];
 426         ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3];
 427         ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0];
 428         ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1];
 429         ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2];
 430         ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3];
 431         ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx;
 432         ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx;
 433         ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto;
 434 
 435         freeb(ixamp);
 436         return (B_TRUE);
 437 }
 438 
 439 /*
 440  * Free the ixm mblk and any references it holds
 441  * Returns b_cont.
 442  */
 443 mblk_t *
 444 ip_xmit_attr_free_mblk(mblk_t *ixamp)
 445 {
 446         ixamblk_t       *ixm;
 447         mblk_t          *mp;
 448 
 449         /* Consume mp */
 450         ASSERT(DB_TYPE(ixamp) == M_BREAK);
 451         mp = ixamp->b_cont;
 452 
 453         ixm = (ixamblk_t *)ixamp->b_rptr;
 454         ASSERT(!ixm->ixm_inbound);
 455 
 456         if (ixm->ixm_ipsec_ah_sa != NULL) {
 457                 IPSA_REFRELE(ixm->ixm_ipsec_ah_sa);
 458                 ixm->ixm_ipsec_ah_sa = NULL;
 459         }
 460         if (ixm->ixm_ipsec_esp_sa != NULL) {
 461                 IPSA_REFRELE(ixm->ixm_ipsec_esp_sa);
 462                 ixm->ixm_ipsec_esp_sa = NULL;
 463         }
 464         if (ixm->ixm_ipsec_policy != NULL) {
 465                 IPPOL_REFRELE(ixm->ixm_ipsec_policy);
 466                 ixm->ixm_ipsec_policy = NULL;
 467         }
 468         if (ixm->ixm_ipsec_action != NULL) {
 469                 IPACT_REFRELE(ixm->ixm_ipsec_action);
 470                 ixm->ixm_ipsec_action = NULL;
 471         }
 472         if (ixm->ixm_ipsec_latch) {
 473                 IPLATCH_REFRELE(ixm->ixm_ipsec_latch);
 474                 ixm->ixm_ipsec_latch = NULL;
 475         }
 476 
 477         if (ixm->ixm_tsl != NULL) {
 478                 label_rele(ixm->ixm_tsl);
 479                 ixm->ixm_tsl = NULL;
 480         }
 481         if (ixm->ixm_cred != NULL) {
 482                 crfree(ixm->ixm_cred);
 483                 ixm->ixm_cred = NULL;
 484         }
 485         freeb(ixamp);
 486         return (mp);
 487 }
 488 
 489 /*
 490  * Take the information in ip_recv_attr_t and stick it in an mblk
 491  * that can later be passed to ip_recv_attr_from_mblk to recreate the
 492  * ip_recv_attr_t.
 493  *
 494  * Returns NULL on memory allocation failure.
 495  */
 496 mblk_t *
 497 ip_recv_attr_to_mblk(ip_recv_attr_t *ira)
 498 {
 499         mblk_t          *iramp;
 500         iramblk_t       *irm;
 501         ill_t           *ill = ira->ira_ill;
 502 
 503         ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0);
 504 
 505         iramp = allocb(sizeof (*irm), BPRI_MED);
 506         if (iramp == NULL)
 507                 return (NULL);
 508 
 509         iramp->b_datap->db_type = M_BREAK;
 510         iramp->b_wptr += sizeof (*irm);
 511         irm = (iramblk_t *)iramp->b_rptr;
 512 
 513         bzero(irm, sizeof (*irm));
 514         irm->irm_inbound = B_TRUE;
 515         irm->irm_flags = ira->ira_flags;
 516         if (ill != NULL) {
 517                 /* Internal to IP - preserve ip_stack_t, ill and rill */
 518                 irm->irm_stackid =
 519                     ill->ill_ipst->ips_netstack->netstack_stackid;
 520                 irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
 521                 ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex ==
 522                     ira->ira_rifindex);
 523         } else {
 524                 /* Let ip_recv_attr_from_stackid know there isn't one */
 525                 irm->irm_stackid = -1;
 526         }
 527         irm->irm_rifindex = ira->ira_rifindex;
 528         irm->irm_ruifindex = ira->ira_ruifindex;
 529         irm->irm_pktlen = ira->ira_pktlen;
 530         irm->irm_ip_hdr_length = ira->ira_ip_hdr_length;
 531         irm->irm_protocol = ira->ira_protocol;
 532 
 533         irm->irm_sqp = ira->ira_sqp;
 534         irm->irm_ring = ira->ira_ring;
 535 
 536         irm->irm_zoneid = ira->ira_zoneid;
 537         irm->irm_mroute_tunnel = ira->ira_mroute_tunnel;
 538         irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid;
 539         irm->irm_esp_udp_ports = ira->ira_esp_udp_ports;
 540 
 541         if (ira->ira_tsl != NULL) {
 542                 irm->irm_tsl = ira->ira_tsl;
 543                 label_hold(irm->irm_tsl);
 544         }
 545         if (ira->ira_cred != NULL) {
 546                 irm->irm_cred = ira->ira_cred;
 547                 crhold(ira->ira_cred);
 548         }
 549         irm->irm_cpid = ira->ira_cpid;
 550 
 551         if (ira->ira_flags & IRAF_L2SRC_SET)
 552                 bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE);
 553 
 554         if (ira->ira_flags & IRAF_IPSEC_SECURE) {
 555                 if (ira->ira_ipsec_ah_sa != NULL) {
 556                         irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa;
 557                         IPSA_REFHOLD(ira->ira_ipsec_ah_sa);
 558                 }
 559                 if (ira->ira_ipsec_esp_sa != NULL) {
 560                         irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa;
 561                         IPSA_REFHOLD(ira->ira_ipsec_esp_sa);
 562                 }
 563                 if (ira->ira_ipsec_action != NULL) {
 564                         irm->irm_ipsec_action = ira->ira_ipsec_action;
 565                         IPACT_REFHOLD(ira->ira_ipsec_action);
 566                 }
 567         }
 568         return (iramp);
 569 }
 570 
 571 /*
 572  * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
 573  * then irm_stackid is not -1, in which case we check that the
 574  * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
 575  * not the case.
 576  * If irm_stackid is zero then we are used by an ULP (e.g., squeue_enter)
 577  * and we just proceed with ira_ill and ira_rill as NULL.
 578  *
 579  * The caller needs to release any references on the pointers inside the ire
 580  * by calling ira_cleanup.
 581  */
 582 boolean_t
 583 ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
 584 {
 585         iramblk_t       *irm;
 586         netstack_t      *ns;
 587         ip_stack_t      *ipst = NULL;
 588         ill_t           *ill = NULL, *rill = NULL;
 589 
 590         /* We assume the caller hasn't initialized ira */
 591         bzero(ira, sizeof (*ira));
 592 
 593         ASSERT(DB_TYPE(iramp) == M_BREAK);
 594         ASSERT(iramp->b_cont == NULL);
 595 
 596         irm = (iramblk_t *)iramp->b_rptr;
 597         ASSERT(irm->irm_inbound);
 598 
 599         if (irm->irm_stackid != -1) {
 600                 /* Verify the netstack is still around */
 601                 ns = netstack_find_by_stackid(irm->irm_stackid);
 602                 if (ns == NULL) {
 603                         /* Disappeared on us */
 604                         (void) ip_recv_attr_free_mblk(iramp);
 605                         return (B_FALSE);
 606                 }
 607                 ipst = ns->netstack_ip;
 608 
 609                 /* Verify the ill is still around */
 610                 ill = ill_lookup_on_ifindex(irm->irm_ifindex,
 611                     !(irm->irm_flags & IRAF_IS_IPV4), ipst);
 612 
 613                 if (irm->irm_ifindex == irm->irm_rifindex) {
 614                         rill = ill;
 615                 } else {
 616                         rill = ill_lookup_on_ifindex(irm->irm_rifindex,
 617                             !(irm->irm_flags & IRAF_IS_IPV4), ipst);
 618                 }
 619 
 620                 /* We have the ill, hence the netstack can't go away */
 621                 netstack_rele(ns);
 622                 if (ill == NULL || rill == NULL) {
 623                         /* Disappeared on us */
 624                         if (ill != NULL)
 625                                 ill_refrele(ill);
 626                         if (rill != NULL && rill != ill)
 627                                 ill_refrele(rill);
 628                         (void) ip_recv_attr_free_mblk(iramp);
 629                         return (B_FALSE);
 630                 }
 631         }
 632 
 633         ira->ira_flags = irm->irm_flags;
 634         /* Caller must ill_refele(ira_ill) by using ira_cleanup() */
 635         ira->ira_ill = ill;
 636         ira->ira_rill = rill;
 637 
 638         ira->ira_rifindex = irm->irm_rifindex;
 639         ira->ira_ruifindex = irm->irm_ruifindex;
 640         ira->ira_pktlen = irm->irm_pktlen;
 641         ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
 642         ira->ira_protocol = irm->irm_protocol;
 643 
 644         ira->ira_sqp = irm->irm_sqp;
 645         /* The rest of IP assumes that the rings never go away. */
 646         ira->ira_ring = irm->irm_ring;
 647 
 648         ira->ira_zoneid = irm->irm_zoneid;
 649         ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
 650         ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
 651         ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;
 652 
 653         if (irm->irm_tsl != NULL) {
 654                 ira->ira_tsl = irm->irm_tsl;
 655                 ira->ira_free_flags |= IRA_FREE_TSL;
 656                 irm->irm_tsl = NULL;
 657         }
 658         if (irm->irm_cred != NULL) {
 659                 ira->ira_cred = irm->irm_cred;
 660                 ira->ira_free_flags |= IRA_FREE_CRED;
 661                 irm->irm_cred = NULL;
 662         }
 663         ira->ira_cpid = irm->irm_cpid;
 664 
 665         if (ira->ira_flags & IRAF_L2SRC_SET)
 666                 bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);
 667 
 668         ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
 669         ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
 670         ira->ira_ipsec_action = irm->irm_ipsec_action;
 671 
 672         freeb(iramp);
 673         return (B_TRUE);
 674 }
 675 
 676 /*
 677  * Free the irm mblk and any references it holds
 678  * Returns b_cont.
 679  */
 680 mblk_t *
 681 ip_recv_attr_free_mblk(mblk_t *iramp)
 682 {
 683         iramblk_t       *irm;
 684         mblk_t          *mp;
 685 
 686         /* Consume mp */
 687         ASSERT(DB_TYPE(iramp) == M_BREAK);
 688         mp = iramp->b_cont;
 689 
 690         irm = (iramblk_t *)iramp->b_rptr;
 691         ASSERT(irm->irm_inbound);
 692 
 693         if (irm->irm_ipsec_ah_sa != NULL) {
 694                 IPSA_REFRELE(irm->irm_ipsec_ah_sa);
 695                 irm->irm_ipsec_ah_sa = NULL;
 696         }
 697         if (irm->irm_ipsec_esp_sa != NULL) {
 698                 IPSA_REFRELE(irm->irm_ipsec_esp_sa);
 699                 irm->irm_ipsec_esp_sa = NULL;
 700         }
 701         if (irm->irm_ipsec_action != NULL) {
 702                 IPACT_REFRELE(irm->irm_ipsec_action);
 703                 irm->irm_ipsec_action = NULL;
 704         }
 705         if (irm->irm_tsl != NULL) {
 706                 label_rele(irm->irm_tsl);
 707                 irm->irm_tsl = NULL;
 708         }
 709         if (irm->irm_cred != NULL) {
 710                 crfree(irm->irm_cred);
 711                 irm->irm_cred = NULL;
 712         }
 713 
 714         freeb(iramp);
 715         return (mp);
 716 }
 717 
 718 /*
 719  * Returns true if the mblk contains an ip_recv_attr_t
 720  * For now we just check db_type.
 721  */
 722 boolean_t
 723 ip_recv_attr_is_mblk(mblk_t *mp)
 724 {
 725         /*
 726          * Need to handle the various forms of tcp_timermp which are tagged
 727          * with b_wptr and might have a NULL b_datap.
 728          */
 729         if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1)
 730                 return (B_FALSE);
 731 
 732 #ifdef  DEBUG
 733         iramblk_t       *irm;
 734 
 735         if (DB_TYPE(mp) != M_BREAK)
 736                 return (B_FALSE);
 737 
 738         irm = (iramblk_t *)mp->b_rptr;
 739         ASSERT(irm->irm_inbound);
 740         return (B_TRUE);
 741 #else
 742         return (DB_TYPE(mp) == M_BREAK);
 743 #endif
 744 }
 745 
 746 static ip_xmit_attr_t *
 747 conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag)
 748 {
 749         ip_xmit_attr_t  *ixa;
 750         ip_xmit_attr_t  *oldixa;
 751 
 752         mutex_enter(&connp->conn_lock);
 753         ixa = connp->conn_ixa;
 754 
 755         /* At least one references for the conn_t */
 756         ASSERT(ixa->ixa_refcnt >= 1);
 757         if (atomic_add_32_nv(&ixa->ixa_refcnt, 1) == 2) {
 758                 /* No other thread using conn_ixa */
 759                 mutex_exit(&connp->conn_lock);
 760                 return (ixa);
 761         }
 762         ixa = kmem_alloc(sizeof (*ixa), kmflag);
 763         if (ixa == NULL) {
 764                 mutex_exit(&connp->conn_lock);
 765                 ixa_refrele(connp->conn_ixa);
 766                 return (NULL);
 767         }
 768         ixa_safe_copy(connp->conn_ixa, ixa);
 769 
 770         /* Make sure we drop conn_lock before any refrele */
 771         if (replace) {
 772                 ixa->ixa_refcnt++;   /* No atomic needed - not visible */
 773                 oldixa = connp->conn_ixa;
 774                 connp->conn_ixa = ixa;
 775                 mutex_exit(&connp->conn_lock);
 776                 IXA_REFRELE(oldixa);    /* Undo refcnt from conn_t */
 777         } else {
 778                 oldixa = connp->conn_ixa;
 779                 mutex_exit(&connp->conn_lock);
 780         }
 781         IXA_REFRELE(oldixa);    /* Undo above atomic_add_32_nv */
 782 
 783         return (ixa);
 784 }
 785 
 786 /*
 787  * Return an ip_xmit_attr_t to use with a conn_t that ensures that only
 788  * the caller can access the ip_xmit_attr_t.
 789  *
 790  * If nobody else is using conn_ixa we return it.
 791  * Otherwise we make a "safe" copy of conn_ixa
 792  * and return it. The "safe" copy has the pointers set to NULL
 793  * (since the pointers might be changed by another thread using
 794  * conn_ixa). The caller needs to check for NULL pointers to see
 795  * if ip_set_destination needs to be called to re-establish the pointers.
 796  *
 797  * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t.
 798  * That is used when we connect() the ULP.
 799  */
 800 ip_xmit_attr_t *
 801 conn_get_ixa(conn_t *connp, boolean_t replace)
 802 {
 803         return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP));
 804 }
 805 
 806 /*
 807  * Used only when the option is to have the kernel hang due to not
 808  * cleaning up ixa references on ills etc.
 809  */
 810 ip_xmit_attr_t *
 811 conn_get_ixa_tryhard(conn_t *connp, boolean_t replace)
 812 {
 813         return (conn_get_ixa_impl(connp, replace, KM_SLEEP));
 814 }
 815 
 816 /*
 817  * Replace conn_ixa with the ixa argument.
 818  *
 819  * The caller must hold conn_lock.
 820  *
 821  * We return the old ixa; the caller must ixa_refrele that after conn_lock
 822  * has been dropped.
 823  */
 824 ip_xmit_attr_t *
 825 conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa)
 826 {
 827         ip_xmit_attr_t  *oldixa;
 828 
 829         ASSERT(MUTEX_HELD(&connp->conn_lock));
 830 
 831         oldixa = connp->conn_ixa;
 832         IXA_REFHOLD(ixa);
 833         ixa->ixa_conn_id = oldixa->ixa_conn_id;
 834         connp->conn_ixa = ixa;
 835         return (oldixa);
 836 }
 837 
 838 /*
 839  * Return a ip_xmit_attr_t to use with a conn_t that is based on but
 840  * separate from conn_ixa.
 841  *
 842  * This "safe" copy has the pointers set to NULL
 843  * (since the pointers might be changed by another thread using
 844  * conn_ixa). The caller needs to check for NULL pointers to see
 845  * if ip_set_destination needs to be called to re-establish the pointers.
 846  */
 847 ip_xmit_attr_t *
 848 conn_get_ixa_exclusive(conn_t *connp)
 849 {
 850         ip_xmit_attr_t *ixa;
 851 
 852         mutex_enter(&connp->conn_lock);
 853         ixa = connp->conn_ixa;
 854 
 855         /* At least one references for the conn_t */
 856         ASSERT(ixa->ixa_refcnt >= 1);
 857 
 858         /* Make sure conn_ixa doesn't disappear while we copy it */
 859         atomic_add_32(&ixa->ixa_refcnt, 1);
 860 
 861         ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
 862         if (ixa == NULL) {
 863                 mutex_exit(&connp->conn_lock);
 864                 ixa_refrele(connp->conn_ixa);
 865                 return (NULL);
 866         }
 867         ixa_safe_copy(connp->conn_ixa, ixa);
 868         mutex_exit(&connp->conn_lock);
 869         IXA_REFRELE(connp->conn_ixa);
 870         return (ixa);
 871 }
 872 
 873 void
 874 ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
 875 {
 876         bcopy(src, ixa, sizeof (*ixa));
 877         ixa->ixa_refcnt = 1;
 878         /*
 879          * Clear any pointers that have references and might be changed
 880          * by ip_set_destination or the ULP
 881          */
 882         ixa->ixa_ire = NULL;
 883         ixa->ixa_nce = NULL;
 884         ixa->ixa_dce = NULL;
 885         ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
 886         ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
 887 #ifdef DEBUG
 888         ixa->ixa_curthread = NULL;
 889 #endif
 890         /* Clear all the IPsec pointers and the flag as well. */
 891         ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
 892 
 893         ixa->ixa_ipsec_latch = NULL;
 894         ixa->ixa_ipsec_ah_sa = NULL;
 895         ixa->ixa_ipsec_esp_sa = NULL;
 896         ixa->ixa_ipsec_policy = NULL;
 897         ixa->ixa_ipsec_action = NULL;
 898 
 899         /*
 900          * We leave ixa_tsl unchanged, but if it has a refhold we need
 901          * to get an extra refhold.
 902          */
 903         if (ixa->ixa_free_flags & IXA_FREE_TSL)
 904                 label_hold(ixa->ixa_tsl);
 905 
 906         /*
 907          * We leave ixa_cred unchanged, but if it has a refhold we need
 908          * to get an extra refhold.
 909          */
 910         if (ixa->ixa_free_flags & IXA_FREE_CRED)
 911                 crhold(ixa->ixa_cred);
 912 }
 913 
 914 /*
 915  * Duplicate an ip_xmit_attr_t.
 916  * Assumes that the caller controls the ixa, hence we do not need to use
 917  * a safe copy. We just have to increase the refcnt on any pointers.
 918  */
 919 ip_xmit_attr_t *
 920 ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
 921 {
 922         ip_xmit_attr_t *ixa;
 923 
 924         ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
 925         if (ixa == NULL)
 926                 return (NULL);
 927         bcopy(src_ixa, ixa, sizeof (*ixa));
 928         ixa->ixa_refcnt = 1;
 929 
 930         if (ixa->ixa_ire != NULL)
 931                 ire_refhold_notr(ixa->ixa_ire);
 932         if (ixa->ixa_nce != NULL)
 933                 nce_refhold(ixa->ixa_nce);
 934         if (ixa->ixa_dce != NULL)
 935                 dce_refhold_notr(ixa->ixa_dce);
 936 
 937 #ifdef DEBUG
 938         ixa->ixa_curthread = NULL;
 939 #endif
 940 
 941         if (ixa->ixa_ipsec_latch != NULL)
 942                 IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
 943         if (ixa->ixa_ipsec_ah_sa != NULL)
 944                 IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
 945         if (ixa->ixa_ipsec_esp_sa != NULL)
 946                 IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
 947         if (ixa->ixa_ipsec_policy != NULL)
 948                 IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
 949         if (ixa->ixa_ipsec_action != NULL)
 950                 IPACT_REFHOLD(ixa->ixa_ipsec_action);
 951 
 952         if (ixa->ixa_tsl != NULL) {
 953                 label_hold(ixa->ixa_tsl);
 954                 ixa->ixa_free_flags |= IXA_FREE_TSL;
 955         }
 956         if (ixa->ixa_cred != NULL) {
 957                 crhold(ixa->ixa_cred);
 958                 ixa->ixa_free_flags |= IXA_FREE_CRED;
 959         }
 960         return (ixa);
 961 }
 962 
 963 /*
 964  * Used to replace the ixa_label field.
 965  * The caller should have a reference on the label, which we transfer to
 966  * the attributes so that when the attribute is freed/cleaned up
 967  * we will release that reference.
 968  */
 969 void
 970 ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
 971 {
 972         ASSERT(tsl != NULL);
 973 
 974         if (ixa->ixa_free_flags & IXA_FREE_TSL) {
 975                 ASSERT(ixa->ixa_tsl != NULL);
 976                 label_rele(ixa->ixa_tsl);
 977         } else {
 978                 ixa->ixa_free_flags |= IXA_FREE_TSL;
 979         }
 980         ixa->ixa_tsl = tsl;
 981 }
 982 
 983 /*
 984  * Replace the ip_recv_attr_t's label.
 985  * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
 986  * TCP/UDP uses ira_cred to set db_credp for non-socket users.
 987  * This can fail (and return B_FALSE) due to lack of memory.
 988  */
 989 boolean_t
 990 ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
 991 {
 992         cred_t  *newcr;
 993 
 994         if (ira->ira_free_flags & IRA_FREE_TSL) {
 995                 ASSERT(ira->ira_tsl != NULL);
 996                 label_rele(ira->ira_tsl);
 997         }
 998         label_hold(tsl);
 999         ira->ira_tsl = tsl;
1000         ira->ira_free_flags |= IRA_FREE_TSL;
1001 
1002         /*
1003          * Reset zoneid if we have a shared address. That allows
1004          * ip_fanout_tx_v4/v6 to determine the zoneid again.
1005          */
1006         if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
1007                 ira->ira_zoneid = ALL_ZONES;
1008 
1009         /* We update ira_cred for RPC */
1010         newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
1011         if (newcr == NULL)
1012                 return (B_FALSE);
1013         if (ira->ira_free_flags & IRA_FREE_CRED)
1014                 crfree(ira->ira_cred);
1015         ira->ira_cred = newcr;
1016         ira->ira_free_flags |= IRA_FREE_CRED;
1017         return (B_TRUE);
1018 }
1019 
1020 /*
1021  * This needs to be called after ip_set_destination/tsol_check_dest might
1022  * have changed ixa_tsl to be specific for a destination, and we now want to
1023  * send to a different destination.
1024  * We have to restart with crgetlabel() since ip_set_destination/
1025  * tsol_check_dest will start with ixa_tsl.
1026  */
1027 void
1028 ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
1029 {
1030         if (!is_system_labeled())
1031                 return;
1032 
1033         if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1034                 ASSERT(ixa->ixa_tsl != NULL);
1035                 label_rele(ixa->ixa_tsl);
1036                 ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1037         }
1038         ixa->ixa_tsl = crgetlabel(cr);
1039 }
1040 
1041 void
1042 ixa_refrele(ip_xmit_attr_t *ixa)
1043 {
1044         IXA_REFRELE(ixa);
1045 }
1046 
1047 void
1048 ixa_inactive(ip_xmit_attr_t *ixa)
1049 {
1050         ASSERT(ixa->ixa_refcnt == 0);
1051 
1052         ixa_cleanup(ixa);
1053         kmem_free(ixa, sizeof (*ixa));
1054 }
1055 
1056 /*
1057  * Release any references contained in the ixa.
1058  * Also clear any fields that are not controlled by ixa_flags.
1059  */
1060 void
1061 ixa_cleanup(ip_xmit_attr_t *ixa)
1062 {
1063         if (ixa->ixa_ire != NULL) {
1064                 ire_refrele_notr(ixa->ixa_ire);
1065                 ixa->ixa_ire = NULL;
1066         }
1067         if (ixa->ixa_dce != NULL) {
1068                 dce_refrele_notr(ixa->ixa_dce);
1069                 ixa->ixa_dce = NULL;
1070         }
1071         if (ixa->ixa_nce != NULL) {
1072                 nce_refrele(ixa->ixa_nce);
1073                 ixa->ixa_nce = NULL;
1074         }
1075         ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1076         ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1077         if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
1078                 ipsec_out_release_refs(ixa);
1079         }
1080         if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1081                 ASSERT(ixa->ixa_tsl != NULL);
1082                 label_rele(ixa->ixa_tsl);
1083                 ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1084         }
1085         ixa->ixa_tsl = NULL;
1086         if (ixa->ixa_free_flags & IXA_FREE_CRED) {
1087                 ASSERT(ixa->ixa_cred != NULL);
1088                 crfree(ixa->ixa_cred);
1089                 ixa->ixa_free_flags &= ~IXA_FREE_CRED;
1090         }
1091         ixa->ixa_cred = NULL;
1092         ixa->ixa_src_preferences = 0;
1093         ixa->ixa_ifindex = 0;
1094         ixa->ixa_multicast_ifindex = 0;
1095         ixa->ixa_multicast_ifaddr = INADDR_ANY;
1096 }
1097 
1098 /*
1099  * Release any references contained in the ira.
1100  * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
1101  * argument.
1102  */
1103 void
1104 ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
1105 {
1106         if (ira->ira_ill != NULL) {
1107                 if (ira->ira_rill != ira->ira_ill) {
1108                         /* Caused by async processing */
1109                         ill_refrele(ira->ira_rill);
1110                 }
1111                 if (refrele_ill)
1112                         ill_refrele(ira->ira_ill);
1113         }
1114         if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1115                 ipsec_in_release_refs(ira);
1116         }
1117         if (ira->ira_free_flags & IRA_FREE_TSL) {
1118                 ASSERT(ira->ira_tsl != NULL);
1119                 label_rele(ira->ira_tsl);
1120                 ira->ira_free_flags &= ~IRA_FREE_TSL;
1121         }
1122         ira->ira_tsl = NULL;
1123         if (ira->ira_free_flags & IRA_FREE_CRED) {
1124                 ASSERT(ira->ira_cred != NULL);
1125                 crfree(ira->ira_cred);
1126                 ira->ira_free_flags &= ~IRA_FREE_CRED;
1127         }
1128         ira->ira_cred = NULL;
1129 }
1130 
1131 /*
1132  * Function to help release any IRE, NCE, or DCEs that
1133  * have been deleted and are marked as condemned.
1134  * The caller is responsible for any serialization which is different
1135  * for TCP, SCTP, and others.
1136  */
1137 static void
1138 ixa_cleanup_stale(ip_xmit_attr_t *ixa)
1139 {
1140         ire_t           *ire;
1141         nce_t           *nce;
1142         dce_t           *dce;
1143 
1144         ire = ixa->ixa_ire;
1145         nce = ixa->ixa_nce;
1146         dce = ixa->ixa_dce;
1147 
1148         if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
1149                 ire_refrele_notr(ire);
1150                 ire = ire_blackhole(ixa->ixa_ipst,
1151                     !(ixa->ixa_flags & IXAF_IS_IPV4));
1152                 ASSERT(ire != NULL);
1153 #ifdef DEBUG
1154                 ire_refhold_notr(ire);
1155                 ire_refrele(ire);
1156 #endif
1157                 ixa->ixa_ire = ire;
1158                 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1159         }
1160         if (nce != NULL && nce->nce_is_condemned) {
1161                 /* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
1162                 nce_refrele(nce);
1163                 ixa->ixa_nce = NULL;
1164                 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1165         }
1166         if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
1167                 dce_refrele_notr(dce);
1168                 dce = dce_get_default(ixa->ixa_ipst);
1169                 ASSERT(dce != NULL);
1170 #ifdef DEBUG
1171                 dce_refhold_notr(dce);
1172                 dce_refrele(dce);
1173 #endif
1174                 ixa->ixa_dce = dce;
1175                 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1176         }
1177 }
1178 
1179 static mblk_t *
1180 tcp_ixa_cleanup_getmblk(conn_t *connp)
1181 {
1182         tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1183         int need_retry;
1184         mblk_t *mp;
1185 
1186         mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1187 
1188         /*
1189          * It's possible that someone else came in and started cleaning up
1190          * another connection between the time we verified this one is not being
1191          * cleaned up and the time we actually get the shared mblk.  If that's
1192          * the case, we've dropped the lock, and some other thread may have
1193          * cleaned up this connection again, and is still waiting for
1194          * notification of that cleanup's completion.  Therefore we need to
1195          * recheck.
1196          */
1197         do {
1198                 need_retry = 0;
1199                 while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) {
1200                         cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1201                             &tcps->tcps_ixa_cleanup_lock);
1202                 }
1203 
1204                 while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
1205                         /*
1206                          * Multiple concurrent cleanups; need to have the last
1207                          * one run since it could be an unplumb.
1208                          */
1209                         need_retry = 1;
1210                         cv_wait(&tcps->tcps_ixa_cleanup_ready_cv,
1211                             &tcps->tcps_ixa_cleanup_lock);
1212                 }
1213         } while (need_retry);
1214 
1215         /*
1216          * We now have the lock and the mblk; now make sure that no one else can
1217          * try to clean up this connection or enqueue it for cleanup, clear the
1218          * mblk pointer for this stack, drop the lock, and return the mblk.
1219          */
1220         ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock));
1221         ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE);
1222         ASSERT(tcps->tcps_ixa_cleanup_mp == mp);
1223         ASSERT(mp != NULL);
1224 
1225         connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS;
1226         tcps->tcps_ixa_cleanup_mp = NULL;
1227         mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1228 
1229         return (mp);
1230 }
1231 
1232 /*
1233  * Used to run ixa_cleanup_stale inside the tcp squeue.
1234  * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
1235  * and waking up the caller.
1236  */
1237 /* ARGSUSED2 */
1238 static void
1239 tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
1240     ip_recv_attr_t *dummy)
1241 {
1242         conn_t  *connp = (conn_t *)arg;
1243         tcp_stack_t     *tcps;
1244 
1245         tcps = connp->conn_netstack->netstack_tcp;
1246 
1247         ixa_cleanup_stale(connp->conn_ixa);
1248 
1249         mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1250         ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
1251         connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE;
1252         tcps->tcps_ixa_cleanup_mp = mp;
1253         cv_signal(&tcps->tcps_ixa_cleanup_ready_cv);
1254         /*
1255          * It is possible for any number of threads to be waiting for cleanup of
1256          * different connections.  Absent a per-connection (or per-IXA) CV, we
1257          * need to wake them all up even though only one can be waiting on this
1258          * particular cleanup.
1259          */
1260         cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1261         mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1262 }
1263 
1264 static void
1265 tcp_ixa_cleanup_wait_and_finish(conn_t *connp)
1266 {
1267         tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1268 
1269         mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1270 
1271         ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE);
1272 
1273         while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) {
1274                 cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1275                     &tcps->tcps_ixa_cleanup_lock);
1276         }
1277 
1278         ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE);
1279         connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE;
1280         cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1281 
1282         mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1283 }
1284 
1285 /*
1286  * ipcl_walk() function to help release any IRE, NCE, or DCEs that
1287  * have been deleted and are marked as condemned.
1288  * Note that we can't cleanup the pointers since there can be threads
1289  * in conn_ip_output() sending while we are called.
1290  */
1291 void
1292 conn_ixa_cleanup(conn_t *connp, void *arg)
1293 {
1294         boolean_t tryhard = (boolean_t)arg;
1295 
1296         if (IPCL_IS_TCP(connp)) {
1297                 mblk_t          *mp;
1298 
1299                 mp = tcp_ixa_cleanup_getmblk(connp);
1300 
1301                 if (connp->conn_sqp->sq_run == curthread) {
1302                         /* Already on squeue */
1303                         tcp_ixa_cleanup(connp, mp, NULL, NULL);
1304                 } else {
1305                         CONN_INC_REF(connp);
1306                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
1307                             connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
1308                 }
1309                 tcp_ixa_cleanup_wait_and_finish(connp);
1310         } else if (IPCL_IS_SCTP(connp)) {
1311                 sctp_t  *sctp;
1312                 sctp_faddr_t *fp;
1313 
1314                 sctp = CONN2SCTP(connp);
1315                 RUN_SCTP(sctp);
1316                 ixa_cleanup_stale(connp->conn_ixa);
1317                 for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->sf_next)
1318                         ixa_cleanup_stale(fp->sf_ixa);
1319                 WAKE_SCTP(sctp);
1320         } else {
1321                 ip_xmit_attr_t  *ixa;
1322 
1323                 /*
1324                  * If there is a different thread using conn_ixa then we get a
1325                  * new copy and cut the old one loose from conn_ixa. Otherwise
1326                  * we use conn_ixa and prevent any other thread from
1327                  * using/changing it. Anybody using conn_ixa (e.g., a thread in
1328                  * conn_ip_output) will do an ixa_refrele which will remove any
1329                  * references on the ire etc.
1330                  *
1331                  * Once we are done other threads can use conn_ixa since the
1332                  * refcnt will be back at one.
1333                  *
1334                  * We are called either because an ill is going away, or
1335                  * due to memory reclaim. In the former case we wait for
1336                  * memory since we must remove the refcnts on the ill.
1337                  */
1338                 if (tryhard) {
1339                         ixa = conn_get_ixa_tryhard(connp, B_TRUE);
1340                         ASSERT(ixa != NULL);
1341                 } else {
1342                         ixa = conn_get_ixa(connp, B_TRUE);
1343                         if (ixa == NULL) {
1344                                 /*
1345                                  * Somebody else was using it and kmem_alloc
1346                                  * failed! Next memory reclaim will try to
1347                                  * clean up.
1348                                  */
1349                                 DTRACE_PROBE1(conn__ixa__cleanup__bail,
1350                                     conn_t *, connp);
1351                                 return;
1352                         }
1353                 }
1354                 ixa_cleanup_stale(ixa);
1355                 ixa_refrele(ixa);
1356         }
1357 }
1358 
1359 /*
1360  * ixa needs to be an exclusive copy so that no one changes the cookie
1361  * or the ixa_nce.
1362  */
1363 boolean_t
1364 ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa)
1365 {
1366         uintptr_t cookie = ixa->ixa_cookie;
1367         ill_dld_direct_t *idd;
1368         idl_tx_list_t *idl_txl;
1369         ill_t *ill = ixa->ixa_nce->nce_ill;
1370         boolean_t inserted = B_FALSE;
1371 
1372         idd = &(ill)->ill_dld_capab->idc_direct;
1373         idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
1374         mutex_enter(&idl_txl->txl_lock);
1375 
1376         /*
1377          * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow
1378          * control is asserted on an ill that does not support direct calls.
1379          * Jump to insert.
1380          */
1381         if (cookie == 0)
1382                 goto tryinsert;
1383 
1384         ASSERT(ILL_DIRECT_CAPABLE(ill));
1385 
1386         if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) {
1387                 DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie);
1388         } else if (idl_txl->txl_cookie != NULL &&
1389             idl_txl->txl_cookie != ixa->ixa_cookie) {
1390                 DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie,
1391                     uintptr_t, idl_txl->txl_cookie);
1392                 /* TODO: bump kstat for cookie collision */
1393         } else {
1394                 /*
1395                  * Check/set conn_blocked under conn_lock.  Note that txl_lock
1396                  * will not suffice since two separate UDP threads may be
1397                  * racing to send to different destinations that are
1398                  * associated with different cookies and thus may not be
1399                  * holding the same txl_lock.  Further, since a given conn_t
1400                  * can only be on a single drain list, the conn_t will be
1401                  * enqueued on whichever thread wins this race.
1402                  */
1403 tryinsert:      mutex_enter(&connp->conn_lock);
1404                 if (connp->conn_blocked) {
1405                         DTRACE_PROBE1(ill__tx__conn__already__blocked,
1406                             conn_t *, connp);
1407                         mutex_exit(&connp->conn_lock);
1408                 } else {
1409                         connp->conn_blocked = B_TRUE;
1410                         mutex_exit(&connp->conn_lock);
1411                         idl_txl->txl_cookie = cookie;
1412                         conn_drain_insert(connp, idl_txl);
1413                         if (!IPCL_IS_NONSTR(connp))
1414                                 noenable(connp->conn_wq);
1415                         inserted = B_TRUE;
1416                 }
1417         }
1418         mutex_exit(&idl_txl->txl_lock);
1419         return (inserted);
1420 }