1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  *  This module supports AF_TRILL sockets and TRILL layer-2 forwarding.
  29  */
  30 
  31 #include <sys/strsubr.h>
  32 #include <sys/socket.h>
  33 #include <sys/socketvar.h>
  34 #include <sys/modctl.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/tihdr.h>
  37 #include <sys/strsun.h>
  38 #include <sys/policy.h>
  39 #include <sys/ethernet.h>
  40 #include <sys/vlan.h>
  41 #include <net/trill.h>
  42 #include <net/if_dl.h>
  43 #include <sys/mac.h>
  44 #include <sys/mac_client.h>
  45 #include <sys/mac_provider.h>
  46 #include <sys/mac_client_priv.h>
  47 #include <sys/sdt.h>
  48 #include <sys/dls.h>
  49 #include <sys/sunddi.h>
  50 
  51 #include "trill_impl.h"
  52 
/*
 * Prototypes for static functions that are referenced before their
 * definitions appear in this file.
 */
static void trill_del_all(trill_inst_t *, boolean_t);
static int trill_del_nick(trill_inst_t *, uint16_t, boolean_t);
static void trill_stop_recv(trill_sock_t *);
static void trill_ctrl_input(trill_sock_t *, mblk_t *, const uint8_t *,
    uint16_t);
static trill_node_t *trill_node_lookup(trill_inst_t *, uint16_t);
static void trill_node_unref(trill_inst_t *, trill_node_t *);
static void trill_sock_unref(trill_sock_t *);
static void trill_kstats_init(trill_sock_t *, const char *);

/*
 * List of TRILL instances (trill_inst_t) and the rwlock that is taken as
 * writer when an instance is removed from the list.
 */
static list_t trill_inst_list;
static krwlock_t trill_inst_rwlock;

/* Socket create routine; referenced from the sinfo registration record. */
static sock_lower_handle_t trill_create(int, int, int, sock_downcalls_t **,
    uint_t *, int *, int, cred_t *);
  68 
/*
 * Socket module registration record: names the module "trill", states the
 * upcall/downcall interface versions it was built against, and supplies
 * trill_create as the create routine.  The trailing NULL leaves the last
 * member unset (presumably the optional fallback hook -- confirm against
 * the smod_reg_t definition).
 */
static smod_reg_t sinfo = {
        SOCKMOD_VERSION,
        "trill",
        SOCK_UC_VERSION,
        SOCK_DC_VERSION,
        trill_create,
        NULL,
};
  77 
/*
 * modlsockmod structure: ties the generic socket-module ops vector
 * (mod_sockmodops) and a descriptive string to the sinfo registration.
 */
static struct modlsockmod sockmod = {
        &mod_sockmodops, "AF_TRILL socket module", &sinfo
};
  82 
/*
 * modlinkage structure: MODREV_1 revision followed by a NULL-terminated
 * list whose only element is the socket module linkage above.
 */
static struct modlinkage ml = {
        MODREV_1,
        &sockmod,
        NULL
};
  89 
/*
 * True when nickname n is neither RBRIDGE_NICKNAME_NONE nor
 * RBRIDGE_NICKNAME_UNUSED.  Note that n is evaluated twice.
 */
#define VALID_NICK(n)   ((n) != RBRIDGE_NICKNAME_NONE && \
                        (n) != RBRIDGE_NICKNAME_UNUSED)
  92 
  93 static mblk_t *
  94 create_trill_header(trill_sock_t *tsock, mblk_t *mp, const uint8_t *daddr,
  95     boolean_t trill_hdr_ok, boolean_t multidest, uint16_t tci,
  96     size_t msglen)
  97 {
  98         int extra_hdr_len;
  99         struct ether_vlan_header *ethvlanhdr;
 100         mblk_t *hdr_mp;
 101         uint16_t etype;
 102 
 103         etype = msglen > 0 ? (uint16_t)msglen : ETHERTYPE_TRILL;
 104 
 105         /* When sending on the PVID, we must not give a VLAN ID */
 106         if (tci == tsock->ts_link->bl_pvid)
 107                 tci = TRILL_NO_TCI;
 108 
 109         /*
 110          * Create new Ethernet header and include additional space
 111          * for writing TRILL header and/or VLAN tag.
 112          */
 113         extra_hdr_len = (trill_hdr_ok ? 0 : sizeof (trill_header_t)) +
 114             (tci != TRILL_NO_TCI ? sizeof (struct ether_vlan_extinfo) : 0);
 115         hdr_mp = mac_header(tsock->ts_link->bl_mh, daddr,
 116             tci != TRILL_NO_TCI ? ETHERTYPE_VLAN : etype, mp, extra_hdr_len);
 117         if (hdr_mp == NULL) {
 118                 freemsg(mp);
 119                 return (NULL);
 120         }
 121 
 122         if (tci != TRILL_NO_TCI) {
 123                 /* LINTED: alignment */
 124                 ethvlanhdr = (struct ether_vlan_header *)hdr_mp->b_rptr;
 125                 ethvlanhdr->ether_tci = htons(tci);
 126                 ethvlanhdr->ether_type = htons(etype);
 127                 hdr_mp->b_wptr += sizeof (struct ether_vlan_extinfo);
 128         }
 129 
 130         if (!trill_hdr_ok) {
 131                 trill_header_t *thp;
 132                 /* LINTED: alignment */
 133                 thp = (trill_header_t *)hdr_mp->b_wptr;
 134                 (void) memset(thp, 0, sizeof (trill_header_t));
 135                 thp->th_hopcount = TRILL_DEFAULT_HOPS;
 136                 thp->th_multidest = (multidest ? 1:0);
 137                 hdr_mp->b_wptr += sizeof (trill_header_t);
 138         }
 139 
 140         hdr_mp->b_cont = mp;
 141         return (hdr_mp);
 142 }
 143 
 144 /*
 145  * TRILL local recv function. TRILL data frames that should be received
 146  * by the local system are decapsulated here and passed to bridging for
 147  * learning and local system receive. Only called when we are the forwarder
 148  * on the link (multi-dest frames) or the frame was destined for us.
 149  */
 150 static void
 151 trill_recv_local(trill_sock_t *tsock, mblk_t *mp, uint16_t ingressnick)
 152 {
 153         struct ether_header *inner_ethhdr;
 154 
 155         /* LINTED: alignment */
 156         inner_ethhdr = (struct ether_header *)mp->b_rptr;
 157         DTRACE_PROBE1(trill__recv__local, struct ether_header *, inner_ethhdr);
 158 
 159         DB_CKSUMFLAGS(mp) = 0;
 160         /*
 161          * Transmit the decapsulated frame on the link via Bridging.
 162          * Bridging does source address learning and appropriate forwarding.
 163          */
 164         bridge_trill_decaps(tsock->ts_link, mp, ingressnick);
 165         KSPINCR(tks_decap);
 166 }
 167 
 168 /*
 169  * Determines the outgoing link to reach a RBridge having the given nick
 170  * Assumes caller has acquired the trill instance rwlock.
 171  */
 172 static trill_sock_t *
 173 find_trill_link(trill_inst_t *tip, datalink_id_t linkid)
 174 {
 175         trill_sock_t *tsp = NULL;
 176 
 177         ASSERT(RW_LOCK_HELD(&tip->ti_rwlock));
 178         for (tsp = list_head(&tip->ti_socklist); tsp != NULL;
 179             tsp = list_next(&tip->ti_socklist, tsp)) {
 180                 if (tsp->ts_link != NULL && tsp->ts_link->bl_linkid == linkid) {
 181                         ASSERT(tsp->ts_link->bl_mh != NULL);
 182                         ASSERT(!(tsp->ts_flags & TSF_SHUTDOWN));
 183                         atomic_inc_uint(&tsp->ts_refs);
 184                         break;
 185                 }
 186         }
 187         return (tsp);
 188 }
 189 
 190 /*
 191  * TRILL destination forwarding function. Transmits the TRILL data packet
 192  * to the next-hop, adjacent RBridge.  Consumes passed mblk_t.
 193  */
 194 static void
 195 trill_dest_fwd(trill_inst_t *tip, mblk_t *fwd_mp, uint16_t adj_nick,
 196     boolean_t has_trill_hdr, boolean_t multidest, uint16_t dtnick)
 197 {
 198         trill_node_t *adj;
 199         trill_sock_t *tsock = NULL;
 200         trill_header_t *trillhdr;
 201         struct ether_header *ethhdr;
 202         int ethtype;
 203         int ethhdrlen;
 204 
 205         adj = trill_node_lookup(tip, adj_nick);
 206         if (adj == NULL || ((tsock = adj->tn_tsp) == NULL))
 207                 goto dest_fwd_fail;
 208 
 209         ASSERT(tsock->ts_link != NULL);
 210         ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
 211         ASSERT(adj->tn_ni != NULL);
 212 
 213         DTRACE_PROBE3(trill__dest__fwd, uint16_t, adj_nick, trill_node_t,
 214             adj, trill_sock_t, tsock);
 215 
 216         /*
 217          * For broadcast links by using the dest address of
 218          * the RBridge to forward the frame should result in
 219          * savings. When the link is a bridged LAN or there are
 220          * many end stations the frame will not always be flooded.
 221          */
 222         fwd_mp = create_trill_header(tsock, fwd_mp, adj->tn_ni->tni_adjsnpa,
 223             has_trill_hdr, multidest, tsock->ts_desigvlan, 0);
 224         if (fwd_mp == NULL)
 225                 goto dest_fwd_fail;
 226 
 227         /* LINTED: alignment */
 228         ethhdr = (struct ether_header *)fwd_mp->b_rptr;
 229         ethtype = ntohs(ethhdr->ether_type);
 230         ASSERT(ethtype == ETHERTYPE_VLAN || ethtype == ETHERTYPE_TRILL);
 231 
 232         /* Pullup Ethernet and TRILL header (w/o TRILL options) */
 233         ethhdrlen = sizeof (struct ether_header) +
 234             (ethtype == ETHERTYPE_VLAN ? sizeof (struct ether_vlan_extinfo):0);
 235         if (!pullupmsg(fwd_mp, ethhdrlen + sizeof (trill_header_t)))
 236                 goto dest_fwd_fail;
 237         /* LINTED: alignment */
 238         trillhdr = (struct trill_header *)(fwd_mp->b_rptr + ethhdrlen);
 239 
 240         /* Update TRILL header with ingress and egress nicks for new frames */
 241         if (!has_trill_hdr) {
 242                 /* We are creating a new TRILL frame */
 243                 trillhdr->th_egressnick = (multidest ? dtnick:adj_nick);
 244                 rw_enter(&tip->ti_rwlock, RW_READER);
 245                 trillhdr->th_ingressnick = tip->ti_nick;
 246                 rw_exit(&tip->ti_rwlock);
 247                 if (!VALID_NICK(trillhdr->th_ingressnick))
 248                         goto dest_fwd_fail;
 249         }
 250 
 251         /* Set hop count and update header in packet */
 252         ASSERT(trillhdr->th_hopcount != 0);
 253         trillhdr->th_hopcount--;
 254 
 255         /* Clear checksum flag and transmit frame on the link */
 256         DB_CKSUMFLAGS(fwd_mp) = 0;
 257         DTRACE_PROBE1(trill__dest__fwd__tx, trill_header_t *, &trillhdr);
 258         fwd_mp = bridge_trill_output(tsock->ts_link, fwd_mp);
 259         if (fwd_mp == NULL) {
 260                 KSPINCR(tks_sent);
 261                 KSPINCR(tks_forward);
 262         } else {
 263                 freemsg(fwd_mp);
 264                 KSPINCR(tks_drops);
 265         }
 266         trill_node_unref(tip, adj);
 267         return;
 268 
 269 dest_fwd_fail:
 270         if (adj != NULL)
 271                 trill_node_unref(tip, adj);
 272         if (tsock != NULL)
 273                 KSPINCR(tks_drops);
 274         freemsg(fwd_mp);
 275 }
 276 
 277 /*
 278  * TRILL multi-destination forwarding. Transmits the packet to the adjacencies
 279  * on the distribution tree determined by the egress nick. Source addr (saddr)
 280  * is NULL for new TRILL packets originating from us.
 281  */
 282 static void
 283 trill_multidest_fwd(trill_inst_t *tip, mblk_t *mp, uint16_t egressnick,
 284     uint16_t ingressnick, boolean_t is_trill_pkt, const uint8_t *saddr,
 285     int inner_vlan, boolean_t free_mblk)
 286 {
 287         int idx;
 288         uint16_t adjnick;
 289         trill_node_t *dest;
 290         trill_node_t *adj;
 291         mblk_t *fwd_mp;
 292         boolean_t nicksaved = B_FALSE;
 293         uint16_t adjnicksaved;
 294 
 295         /* Lookup the egress nick info, this is the DT root */
 296         if ((dest = trill_node_lookup(tip, egressnick)) == NULL)
 297                 goto fail_multidest_fwd;
 298 
 299         /* Send a copy to all our adjacencies on the DT root  */
 300         ASSERT(dest->tn_ni);
 301         for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
 302 
 303                 /* Check for a valid adjacency node */
 304                 adjnick = TNI_ADJNICK(dest->tn_ni, idx);
 305                 if (!VALID_NICK(adjnick) || ingressnick == adjnick ||
 306                     ((adj = trill_node_lookup(tip, adjnick)) == NULL))
 307                         continue;
 308 
 309                 /* Do not forward back to adjacency that sent the pkt to us */
 310                 ASSERT(adj->tn_ni != NULL);
 311                 if ((saddr != NULL) &&
 312                     (memcmp(adj->tn_ni->tni_adjsnpa, saddr,
 313                     ETHERADDRL) == 0)) {
 314                         trill_node_unref(tip, adj);
 315                         continue;
 316                 }
 317 
 318                 /* Check if adj is marked as reaching inner VLAN downstream */
 319                 if ((inner_vlan != VLAN_ID_NONE) &&
 320                     !TRILL_VLANISSET(TNI_VLANFILTERMAP(dest->tn_ni, idx),
 321                     inner_vlan)) {
 322                         trill_node_unref(tip, adj);
 323                         DTRACE_PROBE4(trill__multi__dest__fwd__vlanfiltered,
 324                             uint16_t, adjnick, uint16_t, ingressnick,
 325                             uint16_t, egressnick, int, inner_vlan);
 326                         continue;
 327                 }
 328 
 329                 trill_node_unref(tip, adj);
 330 
 331                 /*
 332                  * Save the nick and look ahead to see if we should forward the
 333                  * frame to more adjacencies. We avoid doing a copy for this
 334                  * nick and use the passed mblk when we can consume the passed
 335                  * mblk.
 336                  */
 337                 if (free_mblk && !nicksaved) {
 338                         adjnicksaved = adjnick;
 339                         nicksaved = B_TRUE;
 340                         continue;
 341                 }
 342 
 343                 fwd_mp = copymsg(mp);
 344                 if (fwd_mp == NULL)
 345                         break;
 346                 DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
 347                     adjnick, uint16_t, ingressnick);
 348                 trill_dest_fwd(tip, fwd_mp, adjnick, is_trill_pkt,
 349                     B_TRUE, egressnick);
 350         }
 351         trill_node_unref(tip, dest);
 352 
 353         if (nicksaved) {
 354                 ASSERT(free_mblk);
 355                 DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
 356                     adjnicksaved, uint16_t, ingressnick);
 357                 trill_dest_fwd(tip, mp, adjnicksaved, is_trill_pkt,
 358                     B_TRUE, egressnick);
 359                 return;
 360         }
 361 
 362 fail_multidest_fwd:
 363         DTRACE_PROBE2(trill__multi__dest__fwd__fail, uint16_t,
 364             egressnick, uint16_t, ingressnick);
 365         if (free_mblk) {
 366                 freemsg(mp);
 367         }
 368 }
 369 
/*
 * TRILL data receive function. Forwards the received frame if necessary
 * and also determines if the received frame should be consumed locally.
 * Consumes passed mblk.
 *
 * tsock    socket (link) the frame arrived on
 * mp       frame; b_rptr is expected to point at the TRILL header
 * mpsaddr  outer Ethernet source address of the frame; copied to a local
 *          buffer up front because the message is modified below
 *
 * Single-destination frames are either decapsulated locally (egress nick
 * is ours) or forwarded toward the egress nick.  Multi-destination frames
 * must pass an adjacency check and a reverse-path check; they are then
 * forwarded along the distribution tree and also received locally.
 * Every rejected frame is freed and counted as a drop at the fail label.
 */
static void
trill_recv(trill_sock_t *tsock, mblk_t *mp, const uint8_t *mpsaddr)
{
        trill_header_t *trillhdr;
        trill_node_t *dest = NULL;
        trill_node_t *source = NULL;
        trill_node_t *adj;
        uint16_t ournick, adjnick, treeroot;
        struct ether_header *ethhdr;
        trill_inst_t *tip = tsock->ts_tip;
        uint8_t srcaddr[ETHERADDRL];
        size_t trillhdrlen;
        int inner_vlan = VLAN_ID_NONE;
        int tci;
        int idx;
        size_t min_size;

        /* Copy Ethernet source address before modifying packet */
        (void) memcpy(srcaddr, mpsaddr, ETHERADDRL);

        /*
         * Pull up TRILL header if necessary: either the first mblk is too
         * short to hold the fixed header or its start is not aligned to
         * TRILL_HDR_ALIGN.
         */
        min_size = sizeof (trill_header_t);
        if ((MBLKL(mp) < min_size ||
            !IS_P2ALIGNED(mp->b_rptr, TRILL_HDR_ALIGN)) &&
            !pullupmsg(mp, min_size))
                goto fail;

        /* LINTED: alignment */
        trillhdr = (trill_header_t *)mp->b_rptr;
        if (trillhdr->th_version != TRILL_PROTOCOL_VERS) {
                DTRACE_PROBE1(trill__recv__wrongversion,
                    trill_header_t *, trillhdr);
                goto fail;
        }

        /* Drop if unknown or invalid nickname */
        if (!VALID_NICK(trillhdr->th_egressnick) ||
            !VALID_NICK(trillhdr->th_ingressnick)) {
                DTRACE_PROBE1(trill__recv__invalidnick,
                    trill_header_t *, trillhdr);
                goto fail;
        }

        /* Snapshot our nickname and the tree root under the instance lock */
        rw_enter(&tip->ti_rwlock, RW_READER);
        ournick = tip->ti_nick;
        treeroot = tip->ti_treeroot;
        rw_exit(&tip->ti_rwlock);
        /* Drop if we received a packet with our nick as ingress */
        if (trillhdr->th_ingressnick == ournick)
                goto fail;

        /* Re-pull any TRILL options and inner Ethernet header */
        min_size += GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t) +
            sizeof (struct ether_header);
        if (MBLKL(mp) < min_size) {
                if (!pullupmsg(mp, min_size))
                        goto fail;
                /* Header pointer is re-read from b_rptr after the pullup */
                /* LINTED: alignment */
                trillhdr = (trill_header_t *)mp->b_rptr;
        }
        /* Fixed header plus options, the latter counted in 32-bit words */
        trillhdrlen = sizeof (trill_header_t) +
            (GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t));

        /*
         * Get the inner Ethernet header, plus the inner VLAN header if there
         * is one.
         */
        /* LINTED: alignment */
        ethhdr = (struct ether_header *)(mp->b_rptr + trillhdrlen);
        if (ethhdr->ether_type == htons(ETHERTYPE_VLAN)) {
                min_size += sizeof (struct ether_vlan_extinfo);
                if (MBLKL(mp) < min_size) {
                        if (!pullupmsg(mp, min_size))
                                goto fail;
                        /* Both header pointers are re-derived from b_rptr */
                        /* LINTED: alignment */
                        trillhdr = (trill_header_t *)mp->b_rptr;
                        /* LINTED: alignment */
                        ethhdr = (struct ether_header *)(mp->b_rptr +
                            trillhdrlen);
                }

                tci = ntohs(((struct ether_vlan_header *)ethhdr)->ether_tci);
                inner_vlan = VLAN_ID(tci);
        }

        /* Known/single destination forwarding. */
        if (!trillhdr->th_multidest) {

                /* Inner MacDA must be unicast */
                if (ethhdr->ether_dhost.ether_addr_octet[0] & 1)
                        goto fail;

                /* Ingress and Egress nicks must be different */
                if (trillhdr->th_egressnick == trillhdr->th_ingressnick)
                        goto fail;

                DTRACE_PROBE1(trill__recv__singledest,
                    trill_header_t *, trillhdr);
                if (trillhdr->th_egressnick == ournick) {
                        /* For us: strip the TRILL header and receive */
                        mp->b_rptr += trillhdrlen;
                        trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
                } else if (trillhdr->th_hopcount > 0) {
                        /* In transit: pass on toward the egress RBridge */
                        trill_dest_fwd(tip, mp, trillhdr->th_egressnick,
                            B_TRUE, B_FALSE, RBRIDGE_NICKNAME_NONE);
                } else {
                        /* Hop count exhausted */
                        goto fail;
                }
                return;
        }

        /*
         * Multi-destination frame: perform checks verifying we have
         * received a valid multi-destination frame before receiving the
         * frame locally and forwarding the frame to other RBridges.
         *
         * Check if we received this multi-destination frame on a
         * adjacency in the distribution tree indicated by the frame's
         * egress nickname.
         */
        if ((dest = trill_node_lookup(tip, trillhdr->th_egressnick)) == NULL)
                goto fail;
        for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
                adjnick = TNI_ADJNICK(dest->tn_ni, idx);
                if ((adj = trill_node_lookup(tip, adjnick)) == NULL)
                        continue;
                /* Match on the adjacency's address vs. the outer source */
                if (memcmp(adj->tn_ni->tni_adjsnpa, srcaddr, ETHERADDRL) == 0) {
                        trill_node_unref(tip, adj);
                        break;
                }
                trill_node_unref(tip, adj);
        }

        /* Loop ran to completion: the sender is not a tree adjacency */
        if (idx >= dest->tn_ni->tni_adjcount) {
                DTRACE_PROBE2(trill__recv__multidest__adjcheckfail,
                    trill_header_t *, trillhdr, trill_node_t *, dest);
                goto fail;
        }

        /*
         * Reverse path forwarding check. Check if the ingress RBridge
         * that has forwarded the frame advertised the use of the
         * distribution tree specified in the egress nick.
         */
        if ((source = trill_node_lookup(tip, trillhdr->th_ingressnick)) == NULL)
                goto fail;
        for (idx = 0; idx < source->tn_ni->tni_dtrootcount; idx++) {
                if (TNI_DTROOTNICK(source->tn_ni, idx) ==
                    trillhdr->th_egressnick)
                        break;
        }

        if (idx >= source->tn_ni->tni_dtrootcount) {
                /*
                 * Allow receipt of forwarded frame with the highest
                 * tree root RBridge as the egress RBridge when the
                 * ingress RBridge has not advertised the use of any
                 * distribution trees.
                 */
                if (source->tn_ni->tni_dtrootcount != 0 ||
                    trillhdr->th_egressnick != treeroot) {
                        DTRACE_PROBE3(
                            trill__recv__multidest__rpfcheckfail,
                            trill_header_t *, trillhdr, trill_node_t *,
                            source, trill_inst_t *, tip);
                        goto fail;
                }
        }

        /* Check hop count before doing any forwarding */
        if (trillhdr->th_hopcount == 0)
                goto fail;

        /*
         * Forward frame using the distribution tree specified by egress nick.
         * Both node references are released here; nothing below jumps to
         * the fail label, so they are not released a second time.
         */
        DTRACE_PROBE2(trill__recv__multidest, trill_header_t *,
            trillhdr, trill_node_t *, source);
        trill_node_unref(tip, source);
        trill_node_unref(tip, dest);

        /*
         * Tell forwarding not to free if we're the link forwarder: the
         * final B_FALSE keeps mp ours for the local receive below.
         */
        trill_multidest_fwd(tip, mp, trillhdr->th_egressnick,
            trillhdr->th_ingressnick, B_TRUE, srcaddr, inner_vlan,
            B_FALSE);

        /*
         * Send de-capsulated frame locally if we are the link forwarder (also
         * does bridge learning).
         */
        mp->b_rptr += trillhdrlen;
        trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
        KSPINCR(tks_recv);
        return;

fail:
        /* Shared drop path (the probe name notwithstanding) */
        DTRACE_PROBE2(trill__recv__multidest__fail, mblk_t *, mp,
            trill_sock_t *, tsock);
        /* Release whichever node references are still held */
        if (dest != NULL)
                trill_node_unref(tip, dest);
        if (source != NULL)
                trill_node_unref(tip, source);
        freemsg(mp);
        KSPINCR(tks_drops);
}
 577 
 578 static void
 579 trill_stop_recv(trill_sock_t *tsock)
 580 {
 581         mutex_enter(&tsock->ts_socklock);
 582 stop_retry:
 583         if (tsock->ts_state == TS_UNBND || tsock->ts_link == NULL) {
 584                 mutex_exit(&tsock->ts_socklock);
 585                 return;
 586         }
 587 
 588         /*
 589          * If another thread is closing the socket then wait. Our callers
 590          * expect us to return only after the socket is closed.
 591          */
 592         if (tsock->ts_flags & TSF_CLOSEWAIT) {
 593                 cv_wait(&tsock->ts_sockclosewait, &tsock->ts_socklock);
 594                 goto stop_retry;
 595         }
 596 
 597         /*
 598          * Set state and flags to block new bind or close calls
 599          * while we close the socket.
 600          */
 601         tsock->ts_flags |= TSF_CLOSEWAIT;
 602 
 603         /* Wait until all AF_TRILL socket transmit operations are done */
 604         while (tsock->ts_sockthreadcount > 0)
 605                 cv_wait(&tsock->ts_sockthreadwait, &tsock->ts_socklock);
 606 
 607         /*
 608          * We are guaranteed to be the only thread closing on the
 609          * socket while the TSF_CLOSEWAIT flag is set, all others cv_wait
 610          * for us to finish.
 611          */
 612         ASSERT(tsock->ts_link != NULL);
 613         if (tsock->ts_ksp != NULL)
 614                 kstat_delete(tsock->ts_ksp);
 615 
 616         /*
 617          * Release lock before bridge_trill_lnunref to prevent deadlock
 618          * between trill_ctrl_input thread waiting to acquire ts_socklock
 619          * and bridge_trill_lnunref waiting for the trill thread to finish.
 620          */
 621         mutex_exit(&tsock->ts_socklock);
 622 
 623         /*
 624          * Release TRILL link reference from Bridging. On return from
 625          * bridge_trill_lnunref we can be sure there are no active TRILL data
 626          * threads for this link.
 627          */
 628         bridge_trill_lnunref(tsock->ts_link);
 629 
 630         /* Set socket as unbound & wakeup threads waiting for socket to close */
 631         mutex_enter(&tsock->ts_socklock);
 632         ASSERT(tsock->ts_link != NULL);
 633         tsock->ts_link = NULL;
 634         tsock->ts_state = TS_UNBND;
 635         tsock->ts_flags &= ~TSF_CLOSEWAIT;
 636         cv_broadcast(&tsock->ts_sockclosewait);
 637         mutex_exit(&tsock->ts_socklock);
 638 }
 639 
 640 static int
 641 trill_start_recv(trill_sock_t *tsock, const struct sockaddr *sa, socklen_t len)
 642 {
 643         struct sockaddr_dl *lladdr = (struct sockaddr_dl *)sa;
 644         datalink_id_t linkid;
 645         int err = 0;
 646 
 647         if (len != sizeof (*lladdr))
 648                 return (EINVAL);
 649 
 650         mutex_enter(&tsock->ts_socklock);
 651         if (tsock->ts_tip == NULL || tsock->ts_state != TS_UNBND) {
 652                 err = EINVAL;
 653                 goto bind_error;
 654         }
 655 
 656         if (tsock->ts_flags & TSF_CLOSEWAIT || tsock->ts_link != NULL) {
 657                 err = EBUSY;
 658                 goto bind_error;
 659         }
 660 
 661         (void) memcpy(&(tsock->ts_lladdr), lladdr,
 662             sizeof (struct sockaddr_dl));
 663         (void) memcpy(&linkid, tsock->ts_lladdr.sdl_data,
 664             sizeof (datalink_id_t));
 665 
 666         tsock->ts_link = bridge_trill_lnref(tsock->ts_tip->ti_binst,
 667             linkid, tsock);
 668         if (tsock->ts_link == NULL) {
 669                 err = EINVAL;
 670                 goto bind_error;
 671         }
 672 
 673         trill_kstats_init(tsock, tsock->ts_tip->ti_bridgename);
 674         tsock->ts_state = TS_IDLE;
 675 
 676 bind_error:
 677         mutex_exit(&tsock->ts_socklock);
 678         return (err);
 679 }
 680 
 681 static int
 682 trill_do_unbind(trill_sock_t *tsock)
 683 {
 684         /* If a bind has not been done, we can't unbind. */
 685         if (tsock->ts_state != TS_IDLE)
 686                 return (EINVAL);
 687 
 688         trill_stop_recv(tsock);
 689         return (0);
 690 }
 691 
 692 static void
 693 trill_instance_unref(trill_inst_t *tip)
 694 {
 695         rw_enter(&trill_inst_rwlock, RW_WRITER);
 696         rw_enter(&tip->ti_rwlock, RW_WRITER);
 697         if (atomic_dec_uint_nv(&tip->ti_refs) == 0) {
 698                 list_remove(&trill_inst_list, tip);
 699                 rw_exit(&tip->ti_rwlock);
 700                 rw_exit(&trill_inst_rwlock);
 701                 if (tip->ti_binst != NULL)
 702                         bridge_trill_brunref(tip->ti_binst);
 703                 list_destroy(&tip->ti_socklist);
 704                 rw_destroy(&tip->ti_rwlock);
 705                 kmem_free(tip, sizeof (*tip));
 706         } else {
 707                 rw_exit(&tip->ti_rwlock);
 708                 rw_exit(&trill_inst_rwlock);
 709         }
 710 }
 711 
 712 /*
 713  * This is called when the bridge module receives a TRILL-encapsulated packet
 714  * on a given link or a packet identified as "TRILL control."  We must verify
 715  * that it's for us (it almost certainly will be), and then either decapsulate
 716  * (if it's to our nickname), forward (if it's to someone else), or send up one
 717  * of the sockets (if it's control traffic).
 718  *
 719  * Sadly, on Ethernet, the control traffic is identified by Outer.MacDA, and
 720  * not by TRILL header information.
 721  */
 722 static void
 723 trill_recv_pkt_cb(void *lptr, bridge_link_t *blp, mac_resource_handle_t rsrc,
 724     mblk_t *mp, mac_header_info_t *hdr_info)
 725 {
 726         trill_sock_t *tsock = lptr;
 727 
 728         _NOTE(ARGUNUSED(rsrc));
 729 
 730         ASSERT(tsock->ts_tip != NULL);
 731         ASSERT(tsock->ts_link != NULL);
 732         ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
 733 
 734         /*
 735          * Only receive packet if the source address is not multicast (which is
 736          * bogus).
 737          */
 738         if (hdr_info->mhi_saddr[0] & 1)
 739                 goto discard;
 740 
 741         /*
 742          * Check if this is our own packet reflected back.  It should not be.
 743          */
 744         if (bcmp(hdr_info->mhi_saddr, blp->bl_local_mac, ETHERADDRL) == 0)
 745                 goto discard;
 746 
 747         /* Only receive unicast packet if addressed to us */
 748         if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST &&
 749             bcmp(hdr_info->mhi_daddr, blp->bl_local_mac, ETHERADDRL) != 0)
 750                 goto discard;
 751 
 752         if (hdr_info->mhi_bindsap == ETHERTYPE_TRILL) {
 753                 /* TRILL data packets */
 754                 trill_recv(tsock, mp, hdr_info->mhi_saddr);
 755         } else {
 756                 /* Design constraint for cheap IS-IS/BPDU comparison */
 757                 ASSERT(all_isis_rbridges[4] != bridge_group_address[4]);
 758                 /* Send received control packet upstream */
 759                 trill_ctrl_input(tsock, mp, hdr_info->mhi_saddr,
 760                     hdr_info->mhi_daddr[4] == all_isis_rbridges[4] ?
 761                     hdr_info->mhi_tci : TRILL_TCI_BPDU);
 762         }
 763 
 764         return;
 765 
 766 discard:
 767         freemsg(mp);
 768         KSPINCR(tks_drops);
 769 }
 770 
 771 /*
 772  * This is called when the bridge module discovers that the destination address
 773  * for a packet is not local -- it's through some remote node.  We must verify
 774  * that the remote node isn't our nickname (it shouldn't be), add a TRILL
 775  * header, and then use the IS-IS data to determine which link and which
 776  * next-hop RBridge should be used for output.  We then transmit on that link.
 777  *
 778  * The egress_nick is RBRIDGE_NICKNAME_NONE for the "unknown destination" case.
 779  */
 780 static void
 781 trill_encap_pkt_cb(void *lptr, bridge_link_t *blp, mac_header_info_t *hdr_info,
 782     mblk_t *mp, uint16_t egress_nick)
 783 {
 784         uint16_t ournick;
 785         uint16_t dtnick;
 786         trill_node_t *self = NULL;
 787         trill_sock_t *tsock = lptr;
 788         trill_inst_t *tip = tsock->ts_tip;
 789         int vlan = VLAN_ID_NONE;
 790 
 791         _NOTE(ARGUNUSED(blp));
 792         ASSERT(hdr_info->mhi_bindsap != ETHERTYPE_TRILL);
 793 
 794         /* egress_nick = RBRIDGE_NICKNAME_NONE is valid */
 795         if (egress_nick != RBRIDGE_NICKNAME_NONE && !VALID_NICK(egress_nick))
 796                 goto discard;
 797 
 798         /* Check if our own nick is valid before we do any forwarding */
 799         rw_enter(&tip->ti_rwlock, RW_READER);
 800         ournick = tip->ti_nick;
 801         dtnick = tip->ti_treeroot;
 802         rw_exit(&tip->ti_rwlock);
 803         if (!VALID_NICK(ournick))
 804                 goto discard;
 805 
 806         /*
 807          * For Multi-Destination forwarding determine our choice of
 808          * root distribution tree. If we didn't choose a distribution
 809          * tree (dtroots_count=0) then we use the highest priority tree
 810          * root (t_treeroot) else we drop the packet without forwarding.
 811          */
 812         if (egress_nick == RBRIDGE_NICKNAME_NONE) {
 813                 if ((self = trill_node_lookup(tip, ournick)) == NULL)
 814                         goto discard;
 815 
 816                 /*
 817                  * Use the first DT configured for now. In future we
 818                  * should have DT selection code here.
 819                  */
 820                 if (self->tn_ni->tni_dtrootcount > 0) {
 821                         dtnick = TNI_DTROOTNICK(self->tn_ni, 0);
 822                 }
 823 
 824                 trill_node_unref(tip, self);
 825                 if (!VALID_NICK(dtnick)) {
 826                         DTRACE_PROBE(trill__fwd__packet__nodtroot);
 827                         goto discard;
 828                 }
 829         }
 830 
 831         /*
 832          * Retrieve VLAN ID of the native frame used for VLAN
 833          * pruning of multi-destination frames.
 834          */
 835         if (hdr_info->mhi_istagged) {
 836                 vlan = VLAN_ID(hdr_info->mhi_tci);
 837         }
 838 
 839         DTRACE_PROBE2(trill__fwd__packet, mac_header_info_t *, hdr_info,
 840             uint16_t, egress_nick);
 841         if (egress_nick == RBRIDGE_NICKNAME_NONE) {
 842                 trill_multidest_fwd(tip, mp, dtnick,
 843                     ournick, B_FALSE, NULL, vlan, B_TRUE);
 844         } else {
 845                 trill_dest_fwd(tip, mp, egress_nick, B_FALSE, B_FALSE,
 846                     RBRIDGE_NICKNAME_NONE);
 847         }
 848         KSPINCR(tks_encap);
 849         return;
 850 
 851 discard:
 852         freemsg(mp);
 853 }
 854 
 855 /*
 856  * This is called when the bridge module has completely torn down a bridge
 857  * instance and all of the attached links.  We need to make the TRILL instance
 858  * go away at this point.
 859  */
 860 static void
 861 trill_br_dstr_cb(void *bptr, bridge_inst_t *bip)
 862 {
 863         trill_inst_t *tip = bptr;
 864 
 865         _NOTE(ARGUNUSED(bip));
 866         rw_enter(&tip->ti_rwlock, RW_WRITER);
 867         if (tip->ti_binst != NULL)
 868                 bridge_trill_brunref(tip->ti_binst);
 869         tip->ti_binst = NULL;
 870         rw_exit(&tip->ti_rwlock);
 871 }
 872 
 873 /*
 874  * This is called when the bridge module is tearing down a link, but before the
 875  * actual tear-down starts.  When this function returns, we must make sure that
 876  * we will not initiate any new transmits on this link.
 877  */
 878 static void
 879 trill_ln_dstr_cb(void *lptr, bridge_link_t *blp)
 880 {
 881         trill_sock_t *tsock = lptr;
 882 
 883         _NOTE(ARGUNUSED(blp));
 884         trill_stop_recv(tsock);
 885 }
 886 
 887 static void
 888 trill_init(void)
 889 {
 890         list_create(&trill_inst_list, sizeof (trill_inst_t),
 891             offsetof(trill_inst_t, ti_instnode));
 892         rw_init(&trill_inst_rwlock, NULL, RW_DRIVER, NULL);
 893         bridge_trill_register_cb(trill_recv_pkt_cb, trill_encap_pkt_cb,
 894             trill_br_dstr_cb, trill_ln_dstr_cb);
 895 }
 896 
 897 static void
 898 trill_fini(void)
 899 {
 900         bridge_trill_register_cb(NULL, NULL, NULL, NULL);
 901         rw_destroy(&trill_inst_rwlock);
 902         list_destroy(&trill_inst_list);
 903 }
 904 
 905 /* Loadable module configuration entry points */
 906 int
 907 _init(void)
 908 {
 909         int rc;
 910 
 911         trill_init();
 912         if ((rc = mod_install(&ml)) != 0)
 913                 trill_fini();
 914         return (rc);
 915 }
 916 
 917 int
 918 _info(struct modinfo *modinfop)
 919 {
 920         return (mod_info(&ml, modinfop));
 921 }
 922 
 923 int
 924 _fini(void)
 925 {
 926         int rc;
 927 
 928         rw_enter(&trill_inst_rwlock, RW_READER);
 929         rc = list_is_empty(&trill_inst_list) ? 0 : EBUSY;
 930         rw_exit(&trill_inst_rwlock);
 931         if (rc == 0 && ((rc = mod_remove(&ml)) == 0))
 932                 trill_fini();
 933         return (rc);
 934 }
 935 
 936 static void
 937 trill_kstats_init(trill_sock_t *tsock, const char *bname)
 938 {
 939         int i;
 940         char kstatname[KSTAT_STRLEN];
 941         kstat_named_t  *knt;
 942         static const char *sock_kstats_list[] = { TRILL_KSSOCK_NAMES };
 943         char link_name[MAXNAMELEN];
 944         int num;
 945         int err;
 946 
 947         bzero(link_name, sizeof (link_name));
 948         if ((err = dls_mgmt_get_linkinfo(tsock->ts_link->bl_linkid, link_name,
 949             NULL, NULL, NULL)) != 0) {
 950                 cmn_err(CE_WARN, "%s: trill_kstats_init: error %d retrieving"
 951                     " linkinfo for linkid:%d", "trill", err,
 952                     tsock->ts_link->bl_linkid);
 953                 return;
 954         }
 955 
 956         bzero(kstatname, sizeof (kstatname));
 957         (void) snprintf(kstatname, sizeof (kstatname), "%s-%s",
 958             bname, link_name);
 959 
 960         num = sizeof (sock_kstats_list) / sizeof (*sock_kstats_list);
 961         for (i = 0; i < num; i++) {
 962                 knt = (kstat_named_t *)&(tsock->ts_kstats);
 963                 kstat_named_init(&knt[i], sock_kstats_list[i],
 964                     KSTAT_DATA_UINT64);
 965         }
 966 
 967         tsock->ts_ksp = kstat_create_zone("trill", 0, kstatname, "sock",
 968             KSTAT_TYPE_NAMED, num, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
 969         if (tsock->ts_ksp != NULL) {
 970                 tsock->ts_ksp->ks_data = &tsock->ts_kstats;
 971                 kstat_install(tsock->ts_ksp);
 972         }
 973 }
 974 
 975 static trill_sock_t *
 976 trill_do_open(int flags)
 977 {
 978         trill_sock_t *tsock;
 979         int kmflag = ((flags & SOCKET_NOSLEEP)) ? KM_NOSLEEP:KM_SLEEP;
 980 
 981         tsock = kmem_zalloc(sizeof (trill_sock_t), kmflag);
 982         if (tsock != NULL) {
 983                 tsock->ts_state = TS_UNBND;
 984                 tsock->ts_refs++;
 985                 mutex_init(&tsock->ts_socklock, NULL, MUTEX_DRIVER, NULL);
 986                 cv_init(&tsock->ts_sockthreadwait, NULL, CV_DRIVER, NULL);
 987                 cv_init(&tsock->ts_sockclosewait, NULL, CV_DRIVER, NULL);
 988         }
 989         return (tsock);
 990 }
 991 
 992 static int
 993 trill_find_bridge(trill_sock_t *tsock, const char *bname, boolean_t can_create)
 994 {
 995         trill_inst_t *tip, *newtip = NULL;
 996 
 997         /* Allocate some memory (speculatively) before taking locks */
 998         if (can_create)
 999                 newtip = kmem_zalloc(sizeof (*tip), KM_NOSLEEP);
1000 
1001         rw_enter(&trill_inst_rwlock, RW_WRITER);
1002         for (tip = list_head(&trill_inst_list); tip != NULL;
1003             tip = list_next(&trill_inst_list, tip)) {
1004                 if (strcmp(tip->ti_bridgename, bname) == 0)
1005                         break;
1006         }
1007         if (tip == NULL) {
1008                 if (!can_create || newtip == NULL) {
1009                         rw_exit(&trill_inst_rwlock);
1010                         return (can_create ? ENOMEM : ENOENT);
1011                 }
1012 
1013                 tip = newtip;
1014                 newtip = NULL;
1015                 (void) strcpy(tip->ti_bridgename, bname);
1016 
1017                 /* Register TRILL instance with bridging */
1018                 tip->ti_binst = bridge_trill_brref(bname, tip);
1019                 if (tip->ti_binst == NULL) {
1020                         rw_exit(&trill_inst_rwlock);
1021                         kmem_free(tip, sizeof (*tip));
1022                         return (ENOENT);
1023                 }
1024 
1025                 rw_init(&tip->ti_rwlock, NULL, RW_DRIVER, NULL);
1026                 list_create(&tip->ti_socklist, sizeof (trill_sock_t),
1027                     offsetof(trill_sock_t, ts_socklistnode));
1028                 list_insert_tail(&trill_inst_list, tip);
1029         }
1030         atomic_inc_uint(&tip->ti_refs);
1031         rw_exit(&trill_inst_rwlock);
1032 
1033         /* If we didn't need the preallocated memory, then discard now. */
1034         if (newtip != NULL)
1035                 kmem_free(newtip, sizeof (*newtip));
1036 
1037         rw_enter(&tip->ti_rwlock, RW_WRITER);
1038         list_insert_tail(&(tip->ti_socklist), tsock);
1039         tsock->ts_tip = tip;
1040         rw_exit(&tip->ti_rwlock);
1041         return (0);
1042 }
1043 
1044 static void
1045 trill_clear_bridge(trill_sock_t *tsock)
1046 {
1047         trill_inst_t *tip;
1048 
1049         if ((tip = tsock->ts_tip) == NULL)
1050                 return;
1051         rw_enter(&tip->ti_rwlock, RW_WRITER);
1052         list_remove(&tip->ti_socklist, tsock);
1053         if (list_is_empty(&tip->ti_socklist))
1054                 trill_del_all(tip, B_TRUE);
1055         rw_exit(&tip->ti_rwlock);
1056 }
1057 
1058 static void
1059 trill_sock_unref(trill_sock_t *tsock)
1060 {
1061         if (atomic_dec_uint_nv(&tsock->ts_refs) == 0) {
1062                 mutex_destroy(&tsock->ts_socklock);
1063                 cv_destroy(&tsock->ts_sockthreadwait);
1064                 cv_destroy(&tsock->ts_sockclosewait);
1065                 kmem_free(tsock, sizeof (trill_sock_t));
1066         }
1067 }
1068 
1069 static void
1070 trill_do_close(trill_sock_t *tsock)
1071 {
1072         trill_inst_t *tip;
1073 
1074         tip = tsock->ts_tip;
1075         trill_stop_recv(tsock);
1076         /* Remove socket from TRILL instance socket list */
1077         trill_clear_bridge(tsock);
1078         tsock->ts_flags |= TSF_SHUTDOWN;
1079         trill_sock_unref(tsock);
1080         if (tip != NULL)
1081                 trill_instance_unref(tip);
1082 }
1083 
1084 static void
1085 trill_del_all(trill_inst_t *tip, boolean_t lockheld)
1086 {
1087         int i;
1088 
1089         if (!lockheld)
1090                 rw_enter(&tip->ti_rwlock, RW_WRITER);
1091         for (i = RBRIDGE_NICKNAME_MIN; i < RBRIDGE_NICKNAME_MAX; i++) {
1092                 if (tip->ti_nodes[i] != NULL)
1093                         (void) trill_del_nick(tip, i, B_TRUE);
1094         }
1095         if (!lockheld)
1096                 rw_exit(&tip->ti_rwlock);
1097 }
1098 
1099 static void
1100 trill_node_free(trill_node_t *nick_entry)
1101 {
1102         trill_nickinfo_t *tni;
1103 
1104         tni = nick_entry->tn_ni;
1105         kmem_free(tni, TNI_TOTALSIZE(tni));
1106         kmem_free(nick_entry, sizeof (trill_node_t));
1107 }
1108 
1109 static void
1110 trill_node_unref(trill_inst_t *tip, trill_node_t *tnp)
1111 {
1112         if (atomic_dec_uint_nv(&tnp->tn_refs) == 0) {
1113                 if (tnp->tn_tsp != NULL)
1114                         trill_sock_unref(tnp->tn_tsp);
1115                 trill_node_free(tnp);
1116                 (void) atomic_dec_uint_nv(&tip->ti_nodecount);
1117         }
1118 }
1119 
1120 static trill_node_t *
1121 trill_node_lookup(trill_inst_t *tip, uint16_t nick)
1122 {
1123         trill_node_t *nick_entry;
1124 
1125         if (!VALID_NICK(nick))
1126                 return (NULL);
1127         rw_enter(&tip->ti_rwlock, RW_READER);
1128         nick_entry = tip->ti_nodes[nick];
1129         if (nick_entry != NULL) {
1130                 atomic_inc_uint(&nick_entry->tn_refs);
1131         }
1132         rw_exit(&tip->ti_rwlock);
1133         return (nick_entry);
1134 }
1135 
1136 static int
1137 trill_del_nick(trill_inst_t *tip, uint16_t nick, boolean_t lockheld)
1138 {
1139         trill_node_t *nick_entry;
1140         int rc = ENOENT;
1141 
1142         if (!lockheld)
1143                 rw_enter(&tip->ti_rwlock, RW_WRITER);
1144         if (VALID_NICK(nick)) {
1145                 nick_entry = tip->ti_nodes[nick];
1146                 if (nick_entry != NULL) {
1147                         trill_node_unref(tip, nick_entry);
1148                         tip->ti_nodes[nick] = NULL;
1149                         rc = 0;
1150                 }
1151         }
1152         if (!lockheld)
1153                 rw_exit(&tip->ti_rwlock);
1154         return (rc);
1155 }
1156 
1157 static int
1158 trill_add_nick(trill_inst_t *tip, void *arg, boolean_t self, int mode)
1159 {
1160         uint16_t nick;
1161         int size;
1162         trill_node_t *tnode;
1163         trill_nickinfo_t tnihdr;
1164 
1165         /* First make sure we have at least the header available */
1166         if (ddi_copyin(arg, &tnihdr, sizeof (trill_nickinfo_t), mode) != 0)
1167                 return (EFAULT);
1168 
1169         nick = tnihdr.tni_nick;
1170         if (!VALID_NICK(nick)) {
1171                 DTRACE_PROBE1(trill__add__nick__bad, trill_nickinfo_t *,
1172                     &tnihdr);
1173                 return (EINVAL);
1174         }
1175 
1176         size = TNI_TOTALSIZE(&tnihdr);
1177         if (size > TNI_MAXSIZE)
1178                 return (EINVAL);
1179         tnode = kmem_zalloc(sizeof (trill_node_t), KM_SLEEP);
1180         tnode->tn_ni = kmem_zalloc(size, KM_SLEEP);
1181         if (ddi_copyin(arg, tnode->tn_ni, size, mode) != 0) {
1182                 kmem_free(tnode->tn_ni, size);
1183                 kmem_free(tnode, sizeof (trill_node_t));
1184                 return (EFAULT);
1185         }
1186 
1187         tnode->tn_refs++;
1188         rw_enter(&tip->ti_rwlock, RW_WRITER);
1189         if (tip->ti_nodes[nick] != NULL)
1190                 (void) trill_del_nick(tip, nick, B_TRUE);
1191 
1192         if (self) {
1193                 tip->ti_nick = nick;
1194         } else {
1195                 tnode->tn_tsp = find_trill_link(tip,
1196                     tnode->tn_ni->tni_linkid);
1197         }
1198         DTRACE_PROBE2(trill__add__nick, trill_node_t *, tnode,
1199             uint16_t, nick);
1200         tip->ti_nodes[nick] = tnode;
1201         tip->ti_nodecount++;
1202         rw_exit(&tip->ti_rwlock);
1203         return (0);
1204 }
1205 
1206 static int
1207 trill_do_ioctl(trill_sock_t *tsock, int cmd, void *arg, int mode)
1208 {
1209         int error = 0;
1210         trill_inst_t *tip = tsock->ts_tip;
1211 
1212         switch (cmd) {
1213         case TRILL_DESIGVLAN: {
1214                 uint16_t desigvlan;
1215 
1216                 if (ddi_copyin(arg, &desigvlan, sizeof (desigvlan), mode) != 0)
1217                         return (EFAULT);
1218                 tsock->ts_desigvlan = desigvlan;
1219                 break;
1220         }
1221         case TRILL_VLANFWDER: {
1222                 uint8_t vlans[TRILL_VLANS_ARRSIZE];
1223 
1224                 if (tsock->ts_link == NULL)
1225                         return (EINVAL);
1226                 if ((ddi_copyin(arg, vlans, sizeof (vlans), mode)) != 0)
1227                         return (EFAULT);
1228                 bridge_trill_setvlans(tsock->ts_link, vlans);
1229                 break;
1230         }
1231         case TRILL_SETNICK:
1232                 if (tip == NULL)
1233                         return (EINVAL);
1234                 error = trill_add_nick(tip, arg, B_TRUE, mode);
1235                 break;
1236 
1237         case TRILL_GETNICK:
1238                 if (tip == NULL)
1239                         return (EINVAL);
1240                 rw_enter(&tip->ti_rwlock, RW_READER);
1241                 if (ddi_copyout(&tip->ti_nick, arg, sizeof (tip->ti_nick),
1242                     mode) != 0)
1243                         error = EFAULT;
1244                 rw_exit(&tip->ti_rwlock);
1245                 break;
1246 
1247         case TRILL_ADDNICK:
1248                 if (tip == NULL)
1249                         break;
1250                 error = trill_add_nick(tip, arg, B_FALSE, mode);
1251                 break;
1252 
1253         case TRILL_DELNICK: {
1254                 uint16_t delnick;
1255 
1256                 if (tip == NULL)
1257                         break;
1258                 if (ddi_copyin(arg, &delnick, sizeof (delnick), mode) != 0)
1259                         return (EFAULT);
1260                 error = trill_del_nick(tip, delnick, B_FALSE);
1261                 break;
1262         }
1263         case TRILL_DELALL:
1264                 if (tip == NULL)
1265                         break;
1266                 trill_del_all(tip, B_FALSE);
1267                 break;
1268 
1269         case TRILL_TREEROOT: {
1270                 uint16_t treeroot;
1271 
1272                 if (tip == NULL)
1273                         break;
1274                 if (ddi_copyin(arg, &treeroot, sizeof (treeroot), mode) != 0)
1275                         return (EFAULT);
1276                 if (!VALID_NICK(treeroot))
1277                         return (EINVAL);
1278                 rw_enter(&tip->ti_rwlock, RW_WRITER);
1279                 tip->ti_treeroot = treeroot;
1280                 rw_exit(&tip->ti_rwlock);
1281                 break;
1282         }
1283         case TRILL_HWADDR:
1284                 if (tsock->ts_link == NULL)
1285                         break;
1286                 if (ddi_copyout(tsock->ts_link->bl_local_mac, arg, ETHERADDRL,
1287                     mode) != 0)
1288                         return (EFAULT);
1289                 break;
1290 
1291         case TRILL_NEWBRIDGE: {
1292                 char bname[MAXLINKNAMELEN];
1293 
1294                 if (tsock->ts_state != TS_UNBND)
1295                         return (ENOTSUP);
1296                 /* ts_tip can only be set once */
1297                 if (tip != NULL)
1298                         return (EEXIST);
1299                 if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
1300                         return (EFAULT);
1301                 bname[MAXLINKNAMELEN-1] = '\0';
1302                 error = trill_find_bridge(tsock, bname, B_TRUE);
1303                 break;
1304         }
1305 
1306         case TRILL_GETBRIDGE: {
1307                 char bname[MAXLINKNAMELEN];
1308 
1309                 /* ts_tip can only be set once */
1310                 if (tip != NULL)
1311                         return (EEXIST);
1312                 if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
1313                         return (EFAULT);
1314                 bname[MAXLINKNAMELEN - 1] = '\0';
1315                 error = trill_find_bridge(tsock, bname, B_FALSE);
1316                 break;
1317         }
1318 
1319         case TRILL_LISTNICK: {
1320                 trill_listnick_t tln;
1321                 trill_node_t *tnp;
1322                 trill_nickinfo_t *tnip;
1323                 uint16_t nick;
1324 
1325                 if (tip == NULL)
1326                         return (EINVAL);
1327                 if (ddi_copyin(arg, &tln, sizeof (tln), mode) != 0)
1328                         return (EFAULT);
1329                 nick = tln.tln_nick;
1330                 if (nick >= RBRIDGE_NICKNAME_MAX) {
1331                         error = EINVAL;
1332                         break;
1333                 }
1334                 rw_enter(&tip->ti_rwlock, RW_READER);
1335                 while (++nick < RBRIDGE_NICKNAME_MAX) {
1336                         if ((tnp = tip->ti_nodes[nick]) != NULL) {
1337                                 tnip = tnp->tn_ni;
1338                                 ASSERT(nick == tnip->tni_nick);
1339                                 tln.tln_nick = nick;
1340                                 bcopy(tnip->tni_adjsnpa, tln.tln_nexthop,
1341                                     ETHERADDRL);
1342                                 tln.tln_ours = nick == tip->ti_nick;
1343                                 if (tln.tln_ours || tnp->tn_tsp == NULL) {
1344                                         tln.tln_linkid =
1345                                             DATALINK_INVALID_LINKID;
1346                                 } else {
1347                                         tln.tln_linkid =
1348                                             tnp->tn_tsp->ts_link->bl_linkid;
1349                                 }
1350                                 break;
1351                         }
1352                 }
1353                 rw_exit(&tip->ti_rwlock);
1354                 if (nick >= RBRIDGE_NICKNAME_MAX)
1355                         bzero(&tln, sizeof (tln));
1356                 if (ddi_copyout(&tln, arg, sizeof (tln), mode) != 0)
1357                         return (EFAULT);
1358                 break;
1359         }
1360 
1361         /*
1362          * Port flush: this is used when we lose AF on a port.  We must discard
1363          * all regular bridge forwarding entries on this port with the
1364          * indicated VLAN.
1365          */
1366         case TRILL_PORTFLUSH: {
1367                 uint16_t vlan = (uint16_t)(uintptr_t)arg;
1368 
1369                 if (tsock->ts_link == NULL)
1370                         return (EINVAL);
1371                 bridge_trill_flush(tsock->ts_link, vlan, B_FALSE);
1372                 break;
1373         }
1374 
1375         /*
1376          * Nick flush: this is used when we lose AF on a port.  We must discard
1377          * all bridge TRILL forwarding entries on this port with the indicated
1378          * VLAN.
1379          */
1380         case TRILL_NICKFLUSH: {
1381                 uint16_t vlan = (uint16_t)(uintptr_t)arg;
1382 
1383                 if (tsock->ts_link == NULL)
1384                         return (EINVAL);
1385                 bridge_trill_flush(tsock->ts_link, vlan, B_TRUE);
1386                 break;
1387         }
1388 
1389         case TRILL_GETMTU:
1390                 if (tsock->ts_link == NULL)
1391                         break;
1392                 if (ddi_copyout(&tsock->ts_link->bl_maxsdu, arg,
1393                     sizeof (uint_t), mode) != 0)
1394                         return (EFAULT);
1395                 break;
1396 
1397         default:
1398                 error = ENOTSUP;
1399                 break;
1400         }
1401 
1402         return (error);
1403 }
1404 
1405 /*
1406  * Sends received packet back upstream on the TRILL socket.
1407  * Consumes passed mblk_t.
1408  */
1409 static void
1410 trill_ctrl_input(trill_sock_t *tsock, mblk_t *mp, const uint8_t *saddr,
1411     uint16_t tci)
1412 {
1413         int udi_size;
1414         mblk_t *mp1;
1415         struct T_unitdata_ind *tudi;
1416         struct sockaddr_dl *sdl;
1417         char *lladdr;
1418         int error;
1419 
1420         ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1421         if (tsock->ts_flow_ctrld) {
1422                 freemsg(mp);
1423                 KSPINCR(tks_drops);
1424                 return;
1425         }
1426 
1427         udi_size =  sizeof (struct T_unitdata_ind) +
1428             sizeof (struct sockaddr_dl);
1429         mp1 = allocb(udi_size, BPRI_MED);
1430         if (mp1 == NULL) {
1431                 freemsg(mp);
1432                 KSPINCR(tks_drops);
1433                 return;
1434         }
1435 
1436         mp1->b_cont = mp;
1437         mp = mp1;
1438         mp->b_datap->db_type = M_PROTO;
1439         /* LINTED: alignment */
1440         tudi = (struct T_unitdata_ind *)mp->b_rptr;
1441         mp->b_wptr = (uchar_t *)tudi + udi_size;
1442 
1443         tudi->PRIM_type = T_UNITDATA_IND;
1444         tudi->SRC_length = sizeof (struct sockaddr_dl);
1445         tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1446         tudi->OPT_length = 0;
1447         tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
1448             sizeof (struct sockaddr_dl);
1449 
1450         /* Information of the link on which packet was received. */
1451         sdl = (struct sockaddr_dl *)&tudi[1];
1452         (void) memset(sdl, 0, sizeof (struct sockaddr_dl));
1453         sdl->sdl_family = AF_TRILL;
1454 
1455         /* LINTED: alignment */
1456         *(datalink_id_t *)sdl->sdl_data = tsock->ts_link->bl_linkid;
1457         sdl->sdl_nlen = sizeof (tsock->ts_link->bl_linkid);
1458 
1459         lladdr = LLADDR(sdl);
1460         (void) memcpy(lladdr, saddr, ETHERADDRL);
1461         lladdr += ETHERADDRL;
1462         sdl->sdl_alen = ETHERADDRL;
1463 
1464         /* LINTED: alignment */
1465         *(uint16_t *)lladdr = tci;
1466         sdl->sdl_slen = sizeof (uint16_t);
1467 
1468         DTRACE_PROBE2(trill__ctrl__input, trill_sock_t *, tsock, mblk_t *, mp);
1469         (*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1470             mp, msgdsize(mp), 0, &error, NULL);
1471 
1472         if (error == ENOSPC) {
1473                 mutex_enter(&tsock->ts_socklock);
1474                 (*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1475                     NULL, 0, 0, &error, NULL);
1476                 if (error == ENOSPC)
1477                         tsock->ts_flow_ctrld = B_TRUE;
1478                 mutex_exit(&tsock->ts_socklock);
1479                 KSPINCR(tks_drops);
1480         } else if (error != 0) {
1481                 KSPINCR(tks_drops);
1482         } else {
1483                 KSPINCR(tks_recv);
1484         }
1485 
1486         DTRACE_PROBE2(trill__ctrl__input__done, trill_sock_t *,
1487             tsock, int, error);
1488 }
1489 
1490 /* ARGSUSED */
1491 static void
1492 trill_activate(sock_lower_handle_t proto_handle,
1493     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
1494     int flags, cred_t *cr)
1495 {
1496         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1497         struct sock_proto_props sopp;
1498 
1499         tsock->ts_conn_upcalls = sock_upcalls;
1500         tsock->ts_conn_upper_handle = sock_handle;
1501 
1502         sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
1503             SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
1504             SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
1505         sopp.sopp_wroff = 0;
1506         sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
1507         sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
1508         sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
1509         sopp.sopp_maxpsz = INFPSZ;
1510         sopp.sopp_maxblk = INFPSZ;
1511         sopp.sopp_minpsz = 0;
1512         (*tsock->ts_conn_upcalls->su_set_proto_props)(
1513             tsock->ts_conn_upper_handle, &sopp);
1514 }
1515 
1516 /* ARGSUSED */
1517 static int
1518 trill_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
1519 {
1520         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1521 
1522         trill_do_close(tsock);
1523         return (0);
1524 }
1525 
1526 /* ARGSUSED */
1527 static int
1528 trill_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
1529     socklen_t len, cred_t *cr)
1530 {
1531         int error;
1532         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1533 
1534         if (sa == NULL)
1535                 error = trill_do_unbind(tsock);
1536         else
1537                 error = trill_start_recv(tsock, sa, len);
1538 
1539         return (error);
1540 }
1541 
1542 /* ARGSUSED */
1543 static int
1544 trill_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
1545     cred_t *cr)
1546 {
1547         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1548         struct sockaddr_dl *laddr;
1549         uint16_t tci;
1550 
1551         ASSERT(DB_TYPE(mp) == M_DATA);
1552         ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1553 
1554         if (msg->msg_name == NULL || msg->msg_namelen != sizeof (*laddr))
1555                 goto eproto;
1556 
1557         /*
1558          * The name is a datalink_id_t, the address is an Ethernet address, and
1559          * the selector value is the VLAN ID.
1560          */
1561         laddr = (struct sockaddr_dl *)msg->msg_name;
1562         if (laddr->sdl_nlen != sizeof (datalink_id_t) ||
1563             laddr->sdl_alen != ETHERADDRL ||
1564             (laddr->sdl_slen != sizeof (tci) && laddr->sdl_slen != 0))
1565                 goto eproto;
1566 
1567         mutex_enter(&tsock->ts_socklock);
1568         if (tsock->ts_state != TS_IDLE || tsock->ts_link == NULL) {
1569                 mutex_exit(&tsock->ts_socklock);
1570                 goto eproto;
1571         }
1572         atomic_inc_uint(&tsock->ts_sockthreadcount);
1573         mutex_exit(&tsock->ts_socklock);
1574 
1575         /*
1576          * Safe to dereference VLAN now, as we've checked the user's specified
1577          * values, and alignment is now guaranteed.
1578          */
1579         if (laddr->sdl_slen == 0) {
1580                 tci = TRILL_NO_TCI;
1581         } else {
1582                 /* LINTED: alignment */
1583                 tci = *(uint16_t *)(LLADDR(laddr) + ETHERADDRL);
1584         }
1585 
1586         mp = create_trill_header(tsock, mp, (const uchar_t *)LLADDR(laddr),
1587             B_TRUE, B_FALSE, tci, msgdsize(mp));
1588         if (mp != NULL) {
1589                 mp = bridge_trill_output(tsock->ts_link, mp);
1590                 if (mp == NULL) {
1591                         KSPINCR(tks_sent);
1592                 } else {
1593                         freemsg(mp);
1594                         KSPINCR(tks_drops);
1595                 }
1596         }
1597 
1598         /* Wake up any threads blocking on us */
1599         if (atomic_dec_uint_nv(&tsock->ts_sockthreadcount) == 0)
1600                 cv_broadcast(&tsock->ts_sockthreadwait);
1601         return (0);
1602 
1603 eproto:
1604         freemsg(mp);
1605         KSPINCR(tks_drops);
1606         return (EPROTO);
1607 }
1608 
1609 /* ARGSUSED */
1610 static int
1611 trill_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
1612     int mode, int32_t *rvalp, cred_t *cr)
1613 {
1614         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1615         int rc;
1616 
1617         switch (cmd) {
1618         /* List of unprivileged TRILL ioctls */
1619         case TRILL_GETNICK:
1620         case TRILL_GETBRIDGE:
1621         case TRILL_LISTNICK:
1622                 break;
1623         default:
1624                 if (secpolicy_dl_config(cr) != 0)
1625                         return (EPERM);
1626                 break;
1627         }
1628 
1629         /* Lock ensures socket state is unchanged during ioctl handling */
1630         mutex_enter(&tsock->ts_socklock);
1631         rc = trill_do_ioctl(tsock, cmd, (void *)arg, mode);
1632         mutex_exit(&tsock->ts_socklock);
1633         return (rc);
1634 }
1635 
1636 static void
1637 trill_clr_flowctrl(sock_lower_handle_t proto_handle)
1638 {
1639         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1640 
1641         mutex_enter(&tsock->ts_socklock);
1642         tsock->ts_flow_ctrld = B_FALSE;
1643         mutex_exit(&tsock->ts_socklock);
1644 }
1645 
/*
 * Downcall vector handed to the socket layer by trill_create().  TRILL
 * sockets implement activate, bind, send, flow-control clearing, ioctl and
 * close themselves; the remaining operations are either routed to the
 * generic "not supported" handlers or left NULL.  The initializer is
 * positional, so the order of the entries must not be changed.
 */
static sock_downcalls_t sock_trill_downcalls = {
        trill_activate,                 /* sd_activate */
        sock_accept_notsupp,            /* sd_accept */
        trill_bind,                     /* sd_bind */
        sock_listen_notsupp,            /* sd_listen */
        sock_connect_notsupp,           /* sd_connect */
        sock_getpeername_notsupp,       /* sd_getpeername */
        sock_getsockname_notsupp,       /* sd_getsockname */
        sock_getsockopt_notsupp,        /* sd_getsockopt */
        sock_setsockopt_notsupp,        /* sd_setsockopt */
        trill_send,                     /* sd_send */
        NULL,                           /* sd_send_uio */
        NULL,                           /* sd_recv_uio */
        NULL,                           /* sd_poll */
        sock_shutdown_notsupp,          /* sd_shutdown */
        trill_clr_flowctrl,             /* sd_setflowctrl */
        trill_ioctl,                    /* sd_ioctl */
        trill_close                     /* sd_close */
};
1665 
1666 /* ARGSUSED */
1667 static sock_lower_handle_t
1668 trill_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
1669     uint_t *smodep, int *errorp, int flags, cred_t *credp)
1670 {
1671         trill_sock_t *tsock;
1672 
1673         if (family != AF_TRILL || type != SOCK_DGRAM || proto != 0) {
1674                 *errorp = EPROTONOSUPPORT;
1675                 return (NULL);
1676         }
1677 
1678         *sock_downcalls = &sock_trill_downcalls;
1679         *smodep = SM_ATOMIC;
1680         tsock = trill_do_open(flags);
1681         *errorp = (tsock != NULL) ? 0:ENOMEM;
1682         return ((sock_lower_handle_t)tsock);
1683 }