1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * Data-Link Driver
  27  */
  28 
  29 #include        <inet/common.h>
  30 #include        <sys/strsubr.h>
  31 #include        <sys/stropts.h>
  32 #include        <sys/strsun.h>
  33 #include        <sys/vlan.h>
  34 #include        <sys/dld_impl.h>
  35 #include        <sys/cpuvar.h>
  36 #include        <sys/callb.h>
  37 #include        <sys/list.h>
  38 #include        <sys/mac_client.h>
  39 #include        <sys/mac_client_priv.h>
  40 #include        <sys/mac_flow.h>
  41 
/*
 * Forward declarations of file-local (static) functions.
 */
static int      str_constructor(void *, void *, int);
static void     str_destructor(void *, void *);
static mblk_t   *str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void     str_notify_promisc_on_phys(dld_str_t *);
static void     str_notify_promisc_off_phys(dld_str_t *);
static void     str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
static void     str_notify_link_up(dld_str_t *);
static void     str_notify_link_down(dld_str_t *);
static void     str_notify_capab_reneg(dld_str_t *);
static void     str_notify_speed(dld_str_t *, uint32_t);

static void     ioc_native(dld_str_t *,  mblk_t *);
static void     ioc_margin(dld_str_t *, mblk_t *);
static void     ioc_raw(dld_str_t *, mblk_t *);
static void     ioc_fast(dld_str_t *,  mblk_t *);
static void     ioc_lowlink(dld_str_t *,  mblk_t *);
static void     ioc(dld_str_t *, mblk_t *);
static void     dld_ioc(dld_str_t *, mblk_t *);
static void     dld_wput_nondata(dld_str_t *, mblk_t *);

static void     str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t   *i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
    link_tagmode_t);
static mblk_t   *i_dld_ether_header_strip_tag(mblk_t *, boolean_t);

/*
 * str_count:  number of dld_str_t objects currently outstanding; bumped in
 *             dld_str_create(), dropped in dld_str_destroy() and checked by
 *             dld_str_fini() before tearing the module state down.
 * str_cachep: kmem cache from which dld_str_t objects are allocated.
 * str_hashp:  hash of all dld_str_t objects, keyed by ds_minor.
 */
static uint32_t         str_count;
static kmem_cache_t     *str_cachep;
static mod_hash_t       *str_hashp;

/* Size handed to mod_hash_create_idhash() and key conversion for str_hashp. */
#define STR_HASHSZ              64
#define STR_HASH_KEY(key)       ((mod_hash_key_t)(uintptr_t)(key))

/* This file uses the system-wide taskq under the name dld_taskq. */
#define dld_taskq       system_taskq

/*
 * State shared with the dld_taskq_dispatch() thread that dld_str_init()
 * creates.  dld_str_fini() sets dld_taskq_quit and signals dld_taskq_cv
 * under dld_taskq_lock, then waits on the same cv until dld_taskq_done
 * becomes true (presumably set by the dispatch thread on exit -- its
 * definition is not in this part of the file).
 */
static kmutex_t         dld_taskq_lock;
static kcondvar_t       dld_taskq_cv;
static list_t           dld_taskq_list;         /* List of dld_str_t */
boolean_t               dld_taskq_quit;
boolean_t               dld_taskq_done;

static void             dld_taskq_dispatch(void);
  83 
  84 /*
  85  * Some notes on entry points, flow-control, queueing.
  86  *
  87  * This driver exports the traditional STREAMS put entry point as well as
  88  * the non-STREAMS fast-path transmit routine which is provided to IP via
  89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
  90  * and data operations, while the fast-path routine deals only with M_DATA
  91  * fast-path packets.  Regardless of the entry point, all outbound packets
  92  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
  93  *
  94  * The transmit logic operates in the following way: All packets coming
  95  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
  96  * happens when the MAC layer indicates the packets couldn't be
  97  * transmitted due to 1) lack of resources (e.g. running out of
  98  * descriptors),  or 2) reaching the allowed bandwidth limit for this
  99  * particular flow. The indication comes in the form of a Tx cookie that
 100  * identifies the blocked ring. In such case, DLD will place a
 101  * dummy message on its write-side STREAMS queue so that the queue is
 102  * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer, where they either get queued in the Tx
 * SRS or are discarded if the queue limit is exceeded. The write-side STREAMS
 105  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
 106  * When the write service procedure runs, it will remove the dummy
 107  * message from the write-side STREAMS queue; in effect this will trigger
 108  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
 109  * respectively, due to the above reasons.
 110  *
 111  * All non-data operations, both DLPI and ioctls are single threaded on a per
 112  * dld_str_t endpoint. This is done using a taskq so that the control operation
 113  * has kernel context and can cv_wait for resources. In addition all set type
 114  * operations that involve mac level state modification are serialized on a
 115  * per mac end point using the perimeter mechanism provided by the mac layer.
 116  * This serializes all mac clients trying to modify a single mac end point over
 117  * the entire sequence of mac calls made by that client as an atomic unit. The
 118  * mac framework locking is described in mac.c. A critical element is that
 119  * DLD/DLS does not hold any locks across the mac perimeter.
 120  *
 121  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
 122  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
 123  * match dev_t. If a stream is found and it is attached, its dev_info_t *
 124  * is returned. If the mac handle is non-null, it can be safely accessed
 125  * below. The mac handle won't be freed until the mac_unregister which
 126  * won't happen until the driver detaches. The DDI framework ensures that
 127  * the detach won't happen while a getinfo is in progress.
 128  */
/*
 * Search state passed to i_dld_str_walker() through mod_hash_walk().
 * The caller fills in the major/minor pair to look for and presets the
 * result fields to "not found" (-1 / NULL); the walker overwrites the
 * result fields when it finds a matching stream that has a mac handle.
 */
typedef struct i_dld_str_state_s {
        major_t         ds_major;       /* in: major number to match */
        minor_t         ds_minor;       /* in: minor number to match */
        int             ds_instance;    /* out: instance of matching stream */
        dev_info_t      *ds_dip;        /* out: devinfo of matching stream */
} i_dld_str_state_t;
 135 
 136 /* ARGSUSED */
 137 static uint_t
 138 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
 139 {
 140         i_dld_str_state_t       *statep = arg;
 141         dld_str_t               *dsp = (dld_str_t *)val;
 142         mac_handle_t            mh;
 143 
 144         if (statep->ds_major != dsp->ds_major)
 145                 return (MH_WALK_CONTINUE);
 146 
 147         ASSERT(statep->ds_minor != 0);
 148         mh = dsp->ds_mh;
 149 
 150         if (statep->ds_minor == dsp->ds_minor) {
 151                 /*
 152                  * Clone: a clone minor is unique. we can terminate the
 153                  * walk if we find a matching stream -- even if we fail
 154                  * to obtain the devinfo.
 155                  */
 156                 if (mh != NULL) {
 157                         statep->ds_dip = mac_devinfo_get(mh);
 158                         statep->ds_instance = DLS_MINOR2INST(mac_minor(mh));
 159                 }
 160                 return (MH_WALK_TERMINATE);
 161         }
 162         return (MH_WALK_CONTINUE);
 163 }
 164 
 165 static dev_info_t *
 166 dld_finddevinfo(dev_t dev)
 167 {
 168         dev_info_t              *dip;
 169         i_dld_str_state_t       state;
 170 
 171         if (getminor(dev) == 0)
 172                 return (NULL);
 173 
 174         /*
 175          * See if it's a minor node of a link
 176          */
 177         if ((dip = dls_link_devinfo(dev)) != NULL)
 178                 return (dip);
 179 
 180         state.ds_minor = getminor(dev);
 181         state.ds_major = getmajor(dev);
 182         state.ds_dip = NULL;
 183         state.ds_instance = -1;
 184 
 185         mod_hash_walk(str_hashp, i_dld_str_walker, &state);
 186         return (state.ds_dip);
 187 }
 188 
 189 int
 190 dld_devt_to_instance(dev_t dev)
 191 {
 192         minor_t                 minor;
 193         i_dld_str_state_t       state;
 194 
 195         /*
 196          * GLDv3 numbers DLPI style 1 node as the instance number + 1.
 197          * Minor number 0 is reserved for the DLPI style 2 unattached
 198          * node.
 199          */
 200 
 201         if ((minor = getminor(dev)) == 0)
 202                 return (-1);
 203 
 204         /*
 205          * Check for unopened style 1 node.
 206          * Note that this doesn't *necessarily* work for legacy
 207          * devices, but this code is only called within the
 208          * getinfo(9e) implementation for true GLDv3 devices, so it
 209          * doesn't matter.
 210          */
 211         if (minor > 0 && minor <= DLS_MAX_MINOR) {
 212                 return (DLS_MINOR2INST(minor));
 213         }
 214 
 215         state.ds_minor = getminor(dev);
 216         state.ds_major = getmajor(dev);
 217         state.ds_dip = NULL;
 218         state.ds_instance = -1;
 219 
 220         mod_hash_walk(str_hashp, i_dld_str_walker, &state);
 221         return (state.ds_instance);
 222 }
 223 
 224 /*
 225  * devo_getinfo: getinfo(9e)
 226  *
 227  * NB: This may be called for a provider before the provider's
 228  * instances are attached.  Hence, if a particular provider needs a
 229  * special mapping (the mac instance != ddi_get_instance()), then it
 230  * may need to provide its own implmentation using the
 231  * mac_devt_to_instance() function, and translating the returned mac
 232  * instance to a devinfo instance.  For dev_t's where the minor number
 233  * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
 234  * function indirectly via the mac_getinfo() function.
 235  */
 236 /*ARGSUSED*/
 237 int
 238 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
 239 {
 240         dev_info_t      *devinfo;
 241         minor_t         minor = getminor((dev_t)arg);
 242         int             rc = DDI_FAILURE;
 243 
 244         switch (cmd) {
 245         case DDI_INFO_DEVT2DEVINFO:
 246                 if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
 247                         *(dev_info_t **)resp = devinfo;
 248                         rc = DDI_SUCCESS;
 249                 }
 250                 break;
 251         case DDI_INFO_DEVT2INSTANCE:
 252                 if (minor > 0 && minor <= DLS_MAX_MINOR) {
 253                         *resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
 254                         rc = DDI_SUCCESS;
 255                 } else if (minor > DLS_MAX_MINOR &&
 256                     (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
 257                         *resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
 258                         rc = DDI_SUCCESS;
 259                 }
 260                 break;
 261         }
 262         return (rc);
 263 }
 264 
 265 void *
 266 dld_str_private(queue_t *q)
 267 {
 268         return (((dld_str_t *)(q->q_ptr))->ds_private);
 269 }
 270 
 271 int
 272 dld_str_open(queue_t *rq, dev_t *devp, void *private)
 273 {
 274         dld_str_t       *dsp;
 275         major_t         major;
 276         minor_t         minor;
 277         int             err;
 278 
 279         major = getmajor(*devp);
 280         minor = getminor(*devp);
 281 
 282         /*
 283          * Create a new dld_str_t for the stream. This will grab a new minor
 284          * number that will be handed back in the cloned dev_t.  Creation may
 285          * fail if we can't allocate the dummy mblk used for flow-control.
 286          */
 287         dsp = dld_str_create(rq, DLD_DLPI, major,
 288             ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
 289         if (dsp == NULL)
 290                 return (ENOSR);
 291 
 292         ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
 293         dsp->ds_private = private;
 294         if (minor != 0) {
 295                 /*
 296                  * Style 1 open
 297                  */
 298                 if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
 299                         goto failed;
 300 
 301                 ASSERT(dsp->ds_dlstate == DL_UNBOUND);
 302         } else {
 303                 (void) qassociate(rq, -1);
 304         }
 305 
 306         /*
 307          * Enable the queue srv(9e) routine.
 308          */
 309         qprocson(rq);
 310 
 311         /*
 312          * Construct a cloned dev_t to hand back.
 313          */
 314         *devp = makedevice(getmajor(*devp), dsp->ds_minor);
 315         return (0);
 316 
 317 failed:
 318         dld_str_destroy(dsp);
 319         return (err);
 320 }
 321 
 322 int
 323 dld_str_close(queue_t *rq)
 324 {
 325         dld_str_t       *dsp = rq->q_ptr;
 326 
 327         /*
 328          * All modules on top have been popped off. So there can't be any
 329          * threads from the top.
 330          */
 331         ASSERT(dsp->ds_datathr_cnt == 0);
 332 
 333         /*
 334          * Wait until pending DLPI requests are processed.
 335          */
 336         mutex_enter(&dsp->ds_lock);
 337         while (dsp->ds_dlpi_pending)
 338                 cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
 339         mutex_exit(&dsp->ds_lock);
 340 
 341 
 342         /*
 343          * This stream was open to a provider node. Check to see
 344          * if it has been cleanly shut down.
 345          */
 346         if (dsp->ds_dlstate != DL_UNATTACHED) {
 347                 /*
 348                  * The stream is either open to a style 1 provider or
 349                  * this is not clean shutdown. Detach from the PPA.
 350                  * (This is still ok even in the style 1 case).
 351                  */
 352                 dld_str_detach(dsp);
 353         }
 354 
 355         dld_str_destroy(dsp);
 356         return (0);
 357 }
 358 
 359 /*
 360  * qi_qopen: open(9e)
 361  */
 362 /*ARGSUSED*/
 363 int
 364 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
 365 {
 366         if (sflag == MODOPEN)
 367                 return (ENOTSUP);
 368 
 369         /*
 370          * This is a cloning driver and therefore each queue should only
 371          * ever get opened once.
 372          */
 373         if (rq->q_ptr != NULL)
 374                 return (EBUSY);
 375 
 376         return (dld_str_open(rq, devp, NULL));
 377 }
 378 
 379 /*
 380  * qi_qclose: close(9e)
 381  */
 382 int
 383 dld_close(queue_t *rq)
 384 {
 385         /*
 386          * Disable the queue srv(9e) routine.
 387          */
 388         qprocsoff(rq);
 389 
 390         return (dld_str_close(rq));
 391 }
 392 
 393 /*
 394  * qi_qputp: put(9e)
 395  */
 396 void
 397 dld_wput(queue_t *wq, mblk_t *mp)
 398 {
 399         dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
 400         dld_str_mode_t  mode;
 401 
 402         switch (DB_TYPE(mp)) {
 403         case M_DATA:
 404                 mutex_enter(&dsp->ds_lock);
 405                 mode = dsp->ds_mode;
 406                 if ((dsp->ds_dlstate != DL_IDLE) ||
 407                     (mode != DLD_FASTPATH && mode != DLD_RAW)) {
 408                         mutex_exit(&dsp->ds_lock);
 409                         freemsg(mp);
 410                         break;
 411                 }
 412 
 413                 DLD_DATATHR_INC(dsp);
 414                 mutex_exit(&dsp->ds_lock);
 415                 if (mode == DLD_FASTPATH) {
 416                         if (dsp->ds_mip->mi_media == DL_ETHER &&
 417                             (MBLKL(mp) < sizeof (struct ether_header))) {
 418                                 freemsg(mp);
 419                         } else {
 420                                 (void) str_mdata_fastpath_put(dsp, mp, 0, 0);
 421                         }
 422                 } else {
 423                         str_mdata_raw_put(dsp, mp);
 424                 }
 425                 DLD_DATATHR_DCR(dsp);
 426                 break;
 427         case M_PROTO:
 428         case M_PCPROTO: {
 429                 t_uscalar_t     prim;
 430 
 431                 if (MBLKL(mp) < sizeof (t_uscalar_t))
 432                         break;
 433 
 434                 prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
 435 
 436                 if (prim == DL_UNITDATA_REQ) {
 437                         proto_unitdata_req(dsp, mp);
 438                 } else {
 439                         dld_wput_nondata(dsp, mp);
 440                 }
 441                 break;
 442         }
 443 
 444         case M_IOCTL:
 445                 dld_wput_nondata(dsp, mp);
 446                 break;
 447 
 448         case M_FLUSH:
 449                 if (*mp->b_rptr & FLUSHW) {
 450                         DLD_CLRQFULL(dsp);
 451                         *mp->b_rptr &= ~FLUSHW;
 452                 }
 453 
 454                 if (*mp->b_rptr & FLUSHR) {
 455                         qreply(wq, mp);
 456                 } else {
 457                         freemsg(mp);
 458                 }
 459                 break;
 460 
 461         default:
 462                 freemsg(mp);
 463                 break;
 464         }
 465 }
 466 
/*
 * qi_srvp: srv(9e)
 *
 * Write-side service procedure.  All it does is invoke DLD_CLRQFULL() on
 * the stream; per the flow-control notes at the top of this file, that is
 * what removes the dummy message from the write-side queue and thereby
 * triggers backenabling.
 */
void
dld_wsrv(queue_t *wq)
{
        dld_str_t       *dsp = wq->q_ptr;

        DLD_CLRQFULL(dsp);
}
 477 
 478 void
 479 dld_init_ops(struct dev_ops *ops, const char *name)
 480 {
 481         struct streamtab *stream;
 482         struct qinit *rq, *wq;
 483         struct module_info *modinfo;
 484 
 485         modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
 486         modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
 487         (void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
 488         modinfo->mi_minpsz = 0;
 489         modinfo->mi_maxpsz = 64*1024;
 490         modinfo->mi_hiwat  = 1;
 491         modinfo->mi_lowat = 0;
 492 
 493         rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
 494         rq->qi_qopen = dld_open;
 495         rq->qi_qclose = dld_close;
 496         rq->qi_minfo = modinfo;
 497 
 498         wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
 499         wq->qi_putp = (pfi_t)dld_wput;
 500         wq->qi_srvp = (pfi_t)dld_wsrv;
 501         wq->qi_minfo = modinfo;
 502 
 503         stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
 504         stream->st_rdinit = rq;
 505         stream->st_wrinit = wq;
 506         ops->devo_cb_ops->cb_str = stream;
 507 
 508         if (ops->devo_getinfo == NULL)
 509                 ops->devo_getinfo = &dld_getinfo;
 510 }
 511 
 512 void
 513 dld_fini_ops(struct dev_ops *ops)
 514 {
 515         struct streamtab *stream;
 516         struct qinit *rq, *wq;
 517         struct module_info *modinfo;
 518 
 519         stream = ops->devo_cb_ops->cb_str;
 520         rq = stream->st_rdinit;
 521         wq = stream->st_wrinit;
 522         modinfo = rq->qi_minfo;
 523         ASSERT(wq->qi_minfo == modinfo);
 524 
 525         kmem_free(stream, sizeof (struct streamtab));
 526         kmem_free(wq, sizeof (struct qinit));
 527         kmem_free(rq, sizeof (struct qinit));
 528         kmem_free(modinfo->mi_idname, FMNAMESZ);
 529         kmem_free(modinfo, sizeof (struct module_info));
 530 }
 531 
 532 /*
 533  * Initialize this module's data structures.
 534  */
 535 void
 536 dld_str_init(void)
 537 {
 538         /*
 539          * Create dld_str_t object cache.
 540          */
 541         str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
 542             0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
 543         ASSERT(str_cachep != NULL);
 544 
 545         /*
 546          * Create a hash table for maintaining dld_str_t's.
 547          * The ds_minor field (the clone minor number) of a dld_str_t
 548          * is used as a key for this hash table because this number is
 549          * globally unique (allocated from "dls_minor_arena").
 550          */
 551         str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
 552             mod_hash_null_valdtor);
 553 
 554         mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
 555         cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
 556 
 557         dld_taskq_quit = B_FALSE;
 558         dld_taskq_done = B_FALSE;
 559         list_create(&dld_taskq_list, sizeof (dld_str_t),
 560             offsetof(dld_str_t, ds_tqlist));
 561         (void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
 562             &p0, TS_RUN, minclsyspri);
 563 }
 564 
 565 /*
 566  * Tear down this module's data structures.
 567  */
 568 int
 569 dld_str_fini(void)
 570 {
 571         /*
 572          * Make sure that there are no objects in use.
 573          */
 574         if (str_count != 0)
 575                 return (EBUSY);
 576 
 577         /*
 578          * Ask the dld_taskq thread to quit and wait for it to be done
 579          */
 580         mutex_enter(&dld_taskq_lock);
 581         dld_taskq_quit = B_TRUE;
 582         cv_signal(&dld_taskq_cv);
 583         while (!dld_taskq_done)
 584                 cv_wait(&dld_taskq_cv, &dld_taskq_lock);
 585         mutex_exit(&dld_taskq_lock);
 586         list_destroy(&dld_taskq_list);
 587         /*
 588          * Destroy object cache.
 589          */
 590         kmem_cache_destroy(str_cachep);
 591         mod_hash_destroy_idhash(str_hashp);
 592         return (0);
 593 }
 594 
 595 /*
 596  * Create a new dld_str_t object.
 597  */
 598 dld_str_t *
 599 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
 600 {
 601         dld_str_t       *dsp;
 602         int             err;
 603 
 604         /*
 605          * Allocate an object from the cache.
 606          */
 607         atomic_add_32(&str_count, 1);
 608         dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
 609 
 610         /*
 611          * Allocate the dummy mblk for flow-control.
 612          */
 613         dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
 614         if (dsp->ds_tx_flow_mp == NULL) {
 615                 kmem_cache_free(str_cachep, dsp);
 616                 atomic_add_32(&str_count, -1);
 617                 return (NULL);
 618         }
 619         dsp->ds_type = type;
 620         dsp->ds_major = major;
 621         dsp->ds_style = style;
 622 
 623         /*
 624          * Initialize the queue pointers.
 625          */
 626         ASSERT(RD(rq) == rq);
 627         dsp->ds_rq = rq;
 628         dsp->ds_wq = WR(rq);
 629         rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
 630 
 631         /*
 632          * We want explicit control over our write-side STREAMS queue
 633          * where the dummy mblk gets added/removed for flow-control.
 634          */
 635         noenable(WR(rq));
 636 
 637         err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
 638             (mod_hash_val_t)dsp);
 639         ASSERT(err == 0);
 640         return (dsp);
 641 }
 642 
 643 /*
 644  * Destroy a dld_str_t object.
 645  */
 646 void
 647 dld_str_destroy(dld_str_t *dsp)
 648 {
 649         queue_t         *rq;
 650         queue_t         *wq;
 651         mod_hash_val_t  val;
 652 
 653         /*
 654          * Clear the queue pointers.
 655          */
 656         rq = dsp->ds_rq;
 657         wq = dsp->ds_wq;
 658         ASSERT(wq == WR(rq));
 659         rq->q_ptr = wq->q_ptr = NULL;
 660         dsp->ds_rq = dsp->ds_wq = NULL;
 661 
 662         ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
 663         ASSERT(dsp->ds_sap == 0);
 664         ASSERT(dsp->ds_mh == NULL);
 665         ASSERT(dsp->ds_mch == NULL);
 666         ASSERT(dsp->ds_promisc == 0);
 667         ASSERT(dsp->ds_mph == NULL);
 668         ASSERT(dsp->ds_mip == NULL);
 669         ASSERT(dsp->ds_mnh == NULL);
 670 
 671         ASSERT(dsp->ds_polling == B_FALSE);
 672         ASSERT(dsp->ds_direct == B_FALSE);
 673         ASSERT(dsp->ds_lso == B_FALSE);
 674         ASSERT(dsp->ds_lso_max == 0);
 675         ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
 676 
 677         /*
 678          * Reinitialize all the flags.
 679          */
 680         dsp->ds_notifications = 0;
 681         dsp->ds_passivestate = DLD_UNINITIALIZED;
 682         dsp->ds_mode = DLD_UNITDATA;
 683         dsp->ds_native = B_FALSE;
 684         dsp->ds_nonip = B_FALSE;
 685 
 686         ASSERT(dsp->ds_datathr_cnt == 0);
 687         ASSERT(dsp->ds_pending_head == NULL);
 688         ASSERT(dsp->ds_pending_tail == NULL);
 689         ASSERT(!dsp->ds_dlpi_pending);
 690 
 691         ASSERT(dsp->ds_dlp == NULL);
 692         ASSERT(dsp->ds_dmap == NULL);
 693         ASSERT(dsp->ds_rx == NULL);
 694         ASSERT(dsp->ds_rx_arg == NULL);
 695         ASSERT(dsp->ds_next == NULL);
 696         ASSERT(dsp->ds_head == NULL);
 697 
 698         /*
 699          * Free the dummy mblk if exists.
 700          */
 701         if (dsp->ds_tx_flow_mp != NULL) {
 702                 freeb(dsp->ds_tx_flow_mp);
 703                 dsp->ds_tx_flow_mp = NULL;
 704         }
 705 
 706         (void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
 707         ASSERT(dsp == (dld_str_t *)val);
 708 
 709         /*
 710          * Free the object back to the cache.
 711          */
 712         kmem_cache_free(str_cachep, dsp);
 713         atomic_add_32(&str_count, -1);
 714 }
 715 
 716 /*
 717  * kmem_cache contructor function: see kmem_cache_create(9f).
 718  */
 719 /*ARGSUSED*/
 720 static int
 721 str_constructor(void *buf, void *cdrarg, int kmflags)
 722 {
 723         dld_str_t       *dsp = buf;
 724 
 725         bzero(buf, sizeof (dld_str_t));
 726 
 727         /*
 728          * Allocate a new minor number.
 729          */
 730         if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
 731                 return (-1);
 732 
 733         /*
 734          * Initialize the DLPI state machine.
 735          */
 736         dsp->ds_dlstate = DL_UNATTACHED;
 737 
 738         mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
 739         cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
 740         cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
 741 
 742         return (0);
 743 }
 744 
/*
 * kmem_cache destructor function.
 *
 * Undoes str_constructor(): gives back the minor number and destroys the
 * lock and condition variables.  The dummy flow-control mblk must already
 * have been freed (dld_str_destroy() does that) by the time an object gets
 * here.
 */
/*ARGSUSED*/
static void
str_destructor(void *buf, void *cdrarg)
{
        dld_str_t       *dsp = buf;

        /*
         * Release the minor number.
         */
        mac_minor_rele(dsp->ds_minor);

        ASSERT(dsp->ds_tx_flow_mp == NULL);

        mutex_destroy(&dsp->ds_lock);
        cv_destroy(&dsp->ds_datathr_cv);
        cv_destroy(&dsp->ds_dlpi_pending_cv);
}
 765 
 766 /*
 767  * Update the priority bits and VID (may need to insert tag if mp points
 768  * to an untagged packet.
 769  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
 770  */
 771 static mblk_t *
 772 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
 773     link_tagmode_t tagmode)
 774 {
 775         mblk_t *hmp;
 776         struct ether_vlan_header *evhp;
 777         struct ether_header *ehp;
 778         uint16_t old_tci = 0;
 779         size_t len;
 780 
 781         ASSERT(pri != 0 || vid != VLAN_ID_NONE);
 782 
 783         evhp = (struct ether_vlan_header *)mp->b_rptr;
 784         if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
 785                 /*
 786                  * Tagged packet, update the priority bits.
 787                  */
 788                 len = sizeof (struct ether_vlan_header);
 789 
 790                 if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
 791                         /*
 792                          * In case some drivers only check the db_ref
 793                          * count of the first mblk, we pullup the
 794                          * message into a single mblk.
 795                          */
 796                         hmp = msgpullup(mp, -1);
 797                         if ((hmp == NULL) || (MBLKL(hmp) < len)) {
 798                                 freemsg(hmp);
 799                                 return (NULL);
 800                         } else {
 801                                 freemsg(mp);
 802                                 mp = hmp;
 803                         }
 804                 }
 805 
 806                 evhp = (struct ether_vlan_header *)mp->b_rptr;
 807                 old_tci = ntohs(evhp->ether_tci);
 808         } else {
 809                 /*
 810                  * Untagged packet.  Two factors will cause us to insert a
 811                  * VLAN header:
 812                  * - This is a VLAN link (vid is specified)
 813                  * - The link supports user priority tagging and the priority
 814                  *   is non-zero.
 815                  */
 816                 if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
 817                         return (mp);
 818 
 819                 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
 820                 if (hmp == NULL)
 821                         return (NULL);
 822 
 823                 evhp = (struct ether_vlan_header *)hmp->b_rptr;
 824                 ehp = (struct ether_header *)mp->b_rptr;
 825 
 826                 /*
 827                  * Copy the MAC addresses and typelen
 828                  */
 829                 bcopy(ehp, evhp, (ETHERADDRL * 2));
 830                 evhp->ether_type = ehp->ether_type;
 831                 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
 832 
 833                 hmp->b_wptr += sizeof (struct ether_vlan_header);
 834                 mp->b_rptr += sizeof (struct ether_header);
 835 
 836                 /*
 837                  * Free the original message if it's now empty. Link the
 838                  * rest of the messages to the header message.
 839                  */
 840                 if (MBLKL(mp) == 0) {
 841                         hmp->b_cont = mp->b_cont;
 842                         freeb(mp);
 843                 } else {
 844                         hmp->b_cont = mp;
 845                 }
 846                 mp = hmp;
 847         }
 848 
 849         if (pri == 0)
 850                 pri = VLAN_PRI(old_tci);
 851         if (vid == VLAN_ID_NONE)
 852                 vid = VLAN_ID(old_tci);
 853         evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
 854         return (mp);
 855 }
 856 
 857 /*
 858  * M_DATA put (IP fast-path mode)
 859  */
 860 mac_tx_cookie_t
 861 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
 862     uint16_t flag)
 863 {
 864         boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
 865         mblk_t *newmp;
 866         uint_t pri;
 867         mac_tx_cookie_t cookie;
 868 
 869         if (is_ethernet) {
 870                 /*
 871                  * Update the priority bits to the assigned priority.
 872                  */
 873                 pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
 874 
 875                 if (pri != 0) {
 876                         newmp = i_dld_ether_header_update_tag(mp, pri,
 877                             VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
 878                         if (newmp == NULL)
 879                                 goto discard;
 880                         mp = newmp;
 881                 }
 882         }
 883 
 884         if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
 885                 DLD_SETQFULL(dsp);
 886         }
 887         return (cookie);
 888 
 889 discard:
 890         /* TODO: bump kstat? */
 891         freemsg(mp);
 892         return (NULL);
 893 }
 894 
 895 /*
 896  * M_DATA put (DLIOCRAW mode)
 897  */
 898 static void
 899 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
 900 {
 901         boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
 902         mblk_t *bp, *newmp;
 903         size_t size;
 904         mac_header_info_t mhi;
 905         uint_t pri, vid, dvid;
 906         uint_t max_sdu;
 907 
 908         /*
 909          * Certain MAC type plugins provide an illusion for raw DLPI
 910          * consumers.  They pretend that the MAC layer is something that
 911          * it's not for the benefit of observability tools.  For example,
 912          * mac_wifi pretends that it's Ethernet for such consumers.
 913          * Here, unless native mode is enabled, we call into the MAC layer so
 914          * that this illusion can be maintained.  The plugin will optionally
 915          * transform the MAC header here into something that can be passed
 916          * down.  The header goes from raw mode to "cooked" mode.
 917          */
 918         if (!dsp->ds_native) {
 919                 if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
 920                         goto discard;
 921                 mp = newmp;
 922         }
 923 
 924         size = MBLKL(mp);
 925 
 926         /*
 927          * Check the packet is not too big and that any remaining
 928          * fragment list is composed entirely of M_DATA messages. (We
 929          * know the first fragment was M_DATA otherwise we could not
 930          * have got here).
 931          */
 932         for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
 933                 if (DB_TYPE(bp) != M_DATA)
 934                         goto discard;
 935                 size += MBLKL(bp);
 936         }
 937 
 938         if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
 939                 goto discard;
 940 
 941         mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
 942         /*
 943          * If LSO is enabled, check the size against lso_max. Otherwise,
 944          * compare the packet size with max_sdu.
 945          */
 946         max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
 947         if (size > max_sdu + mhi.mhi_hdrsize)
 948                 goto discard;
 949 
 950         if (is_ethernet) {
 951                 dvid = mac_client_vid(dsp->ds_mch);
 952 
 953                 /*
 954                  * Discard the packet if this is a VLAN stream but the VID in
 955                  * the packet is not correct.
 956                  */
 957                 vid = VLAN_ID(mhi.mhi_tci);
 958                 if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
 959                         goto discard;
 960 
 961                 /*
 962                  * Discard the packet if this packet is a tagged packet
 963                  * but both pri and VID are 0.
 964                  */
 965                 pri = VLAN_PRI(mhi.mhi_tci);
 966                 if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
 967                     vid == VLAN_ID_NONE)
 968                         goto discard;
 969 
 970                 /*
 971                  * Update the priority bits to the per-stream priority if
 972                  * priority is not set in the packet. Update the VID for
 973                  * packets on a VLAN stream.
 974                  */
 975                 pri = (pri == 0) ? dsp->ds_pri : 0;
 976                 if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
 977                         if ((newmp = i_dld_ether_header_update_tag(mp, pri,
 978                             dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
 979                                 goto discard;
 980                         }
 981                         mp = newmp;
 982                 }
 983         }
 984 
 985         if (DLD_TX(dsp, mp, 0, 0) != NULL) {
 986                 /* Turn on flow-control for dld */
 987                 DLD_SETQFULL(dsp);
 988         }
 989         return;
 990 
 991 discard:
 992         /* TODO: bump kstat? */
 993         freemsg(mp);
 994 }
 995 
 996 /*
 997  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
 998  */
 999 int
1000 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1001 {
1002         dev_t                   dev;
1003         int                     err;
1004         const char              *drvname;
1005         mac_perim_handle_t      mph = NULL;
1006         boolean_t               qassociated = B_FALSE;
1007         dls_link_t              *dlp = NULL;
1008         dls_dl_handle_t         ddp = NULL;
1009 
1010         if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1011                 return (EINVAL);
1012 
1013         if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
1014                 return (ENOTSUP);
1015 
1016         /*
1017          * /dev node access. This will still be supported for backward
1018          * compatibility reason.
1019          */
1020         if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
1021             (strcmp(drvname, "vnic") != 0)) {
1022                 if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1023                         return (EINVAL);
1024                 qassociated = B_TRUE;
1025         }
1026 
1027         dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
1028         if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
1029                 goto failed;
1030 
1031         if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
1032                 goto failed;
1033 
1034         /*
1035          * Open a channel.
1036          */
1037         if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
1038                 goto failed;
1039 
1040         if ((err = dls_open(dlp, ddp, dsp)) != 0)
1041                 goto failed;
1042 
1043         /*
1044          * Set the default packet priority.
1045          */
1046         dsp->ds_pri = 0;
1047 
1048         /*
1049          * Add a notify function so that the we get updates from the MAC.
1050          */
1051         dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
1052         dsp->ds_dlstate = DL_UNBOUND;
1053         mac_perim_exit(mph);
1054         return (0);
1055 
1056 failed:
1057         if (dlp != NULL)
1058                 dls_link_rele(dlp);
1059         if (mph != NULL)
1060                 mac_perim_exit(mph);
1061         if (ddp != NULL)
1062                 dls_devnet_rele(ddp);
1063         if (qassociated)
1064                 (void) qassociate(dsp->ds_wq, -1);
1065 
1066         return (err);
1067 }
1068 
/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 *
 * Undoes dld_str_attach(): removes the MAC notify callback, disables the
 * capabilities, clears the LSO state, closes the DLS channel, drops the
 * devnet hold and returns the stream to the DL_UNATTACHED state.  The
 * caller must ensure that no data threads are active (ds_datathr_cnt is
 * asserted to be zero).
 */
void
dld_str_detach(dld_str_t *dsp)
{
        mac_perim_handle_t      mph;
        int                     err;

        ASSERT(dsp->ds_datathr_cnt == 0);

        mac_perim_enter_by_mh(dsp->ds_mh, &mph);
        /*
         * Remove the notify function.
         *
         * Note that we cannot wait for the notification callback to be
         * removed here, since that could deadlock with str_notify(): both
         * need the mac perimeter. If the callback cannot be removed right
         * now, remember that in err, continue, and wait for the removal
         * after we have left the perimeter.
         */
        err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
        dsp->ds_mnh = NULL;

        /*
         * Disable the capabilities
         */
        dld_capabilities_disable(dsp);

        /*
         * Clear LSO flags.
         */
        dsp->ds_lso = B_FALSE;
        dsp->ds_lso_max = 0;

        dls_close(dsp);
        mac_perim_exit(mph);

        /*
         * We have now left the mac perimeter. If mac_notify_remove() failed
         * because the notification callback was in progress, wait for
         * it to finish before we proceed.
         */
        if (err != 0)
                mac_notify_remove_wait(dsp->ds_mh);

        /*
         * An unreferenced tagged (non-persistent) vlan gets destroyed
         * automatically in the call to dls_devnet_rele.
         */
        dls_devnet_rele(dsp->ds_ddh);

        /*
         * Forget the per-attachment state; ds_mh must not be cleared before
         * this point because it is still used above.
         */
        dsp->ds_sap = 0;
        dsp->ds_mh = NULL;
        dsp->ds_mch = NULL;
        dsp->ds_mip = NULL;

        /*
         * Drop the queue association for style 2 streams.
         */
        if (dsp->ds_style == DL_STYLE2)
                (void) qassociate(dsp->ds_wq, -1);

        /*
         * Re-initialize the DLPI state machine.
         */
        dsp->ds_dlstate = DL_UNATTACHED;
}
1135 
1136 /*
1137  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1138  * tags before sending packets up to the DLS clients, with the exception of
1139  * special priority tagged packets, in that case, we set the VID to 0.
1140  * mp must be a VLAN tagged packet.
1141  */
1142 static mblk_t *
1143 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1144 {
1145         mblk_t *newmp;
1146         struct ether_vlan_header *evhp;
1147         uint16_t tci, new_tci;
1148 
1149         ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1150         if (DB_REF(mp) > 1) {
1151                 newmp = copymsg(mp);
1152                 if (newmp == NULL)
1153                         return (NULL);
1154                 freemsg(mp);
1155                 mp = newmp;
1156         }
1157         evhp = (struct ether_vlan_header *)mp->b_rptr;
1158 
1159         tci = ntohs(evhp->ether_tci);
1160         if (VLAN_PRI(tci) == 0 || !keep_pri) {
1161                 /*
1162                  * Priority is 0, strip the tag.
1163                  */
1164                 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1165                 mp->b_rptr += VLAN_TAGSZ;
1166         } else {
1167                 /*
1168                  * Priority is not 0, update the VID to 0.
1169                  */
1170                 new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1171                 evhp->ether_tci = htons(new_tci);
1172         }
1173         return (mp);
1174 }
1175 
1176 /*
1177  * Raw mode receive function.
1178  */
1179 /*ARGSUSED*/
1180 void
1181 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1182     mac_header_info_t *mhip)
1183 {
1184         dld_str_t *dsp = (dld_str_t *)arg;
1185         boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1186         mblk_t *next, *newmp;
1187 
1188         ASSERT(mp != NULL);
1189         do {
1190                 /*
1191                  * Get the pointer to the next packet in the chain and then
1192                  * clear b_next before the packet gets passed on.
1193                  */
1194                 next = mp->b_next;
1195                 mp->b_next = NULL;
1196 
1197                 /*
1198                  * Wind back b_rptr to point at the MAC header.
1199                  */
1200                 ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1201                 mp->b_rptr -= mhip->mhi_hdrsize;
1202 
1203                 /*
1204                  * Certain MAC type plugins provide an illusion for raw
1205                  * DLPI consumers.  They pretend that the MAC layer is
1206                  * something that it's not for the benefit of observability
1207                  * tools.  For example, mac_wifi pretends that it's Ethernet
1208                  * for such consumers.  Here, unless native mode is enabled,
1209                  * we call into the MAC layer so that this illusion can be
1210                  * maintained.  The plugin will optionally transform the MAC
1211                  * header here into something that can be passed up to raw
1212                  * consumers.  The header goes from "cooked" mode to raw mode.
1213                  */
1214                 if (!dsp->ds_native) {
1215                         newmp = mac_header_uncook(dsp->ds_mh, mp);
1216                         if (newmp == NULL) {
1217                                 freemsg(mp);
1218                                 goto next;
1219                         }
1220                         mp = newmp;
1221                 }
1222 
1223                 /*
1224                  * Strip the VLAN tag for VLAN streams.
1225                  */
1226                 if (is_ethernet &&
1227                     mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1228                         /*
1229                          * The priority should be kept only for VLAN
1230                          * data-links.
1231                          */
1232                         newmp = i_dld_ether_header_strip_tag(mp,
1233                             mac_client_is_vlan_vnic(dsp->ds_mch));
1234                         if (newmp == NULL) {
1235                                 freemsg(mp);
1236                                 goto next;
1237                         }
1238                         mp = newmp;
1239                 }
1240 
1241                 /*
1242                  * Pass the packet on.
1243                  */
1244                 if (canputnext(dsp->ds_rq))
1245                         putnext(dsp->ds_rq, mp);
1246                 else
1247                         freemsg(mp);
1248 
1249 next:
1250                 /*
1251                  * Move on to the next packet in the chain.
1252                  */
1253                 mp = next;
1254         } while (mp != NULL);
1255 }
1256 
1257 /*
1258  * Fast-path receive function.
1259  */
1260 /*ARGSUSED*/
1261 void
1262 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1263     mac_header_info_t *mhip)
1264 {
1265         dld_str_t *dsp = (dld_str_t *)arg;
1266         mblk_t *next;
1267         size_t offset = 0;
1268 
1269         /*
1270          * MAC header stripping rules:
1271          *    - Tagged packets:
1272          *      a. VLAN streams. Strip the whole VLAN header including the tag.
1273          *      b. Physical streams
1274          *      - VLAN packets (non-zero VID). The stream must be either a
1275          *        DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1276          *        Strip the Ethernet header but keep the VLAN header.
1277          *      - Special tagged packets (zero VID)
1278          *        * The stream is either a DL_PROMISC_SAP listener or a
1279          *          ETHERTYPE_VLAN listener, strip the Ethernet header but
1280          *          keep the VLAN header.
1281          *        * Otherwise, strip the whole VLAN header.
1282          *    - Untagged packets. Strip the whole MAC header.
1283          */
1284         if (mhip->mhi_istagged &&
1285             (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1286             ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1287             (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1288                 offset = VLAN_TAGSZ;
1289         }
1290 
1291         ASSERT(mp != NULL);
1292         do {
1293                 /*
1294                  * Get the pointer to the next packet in the chain and then
1295                  * clear b_next before the packet gets passed on.
1296                  */
1297                 next = mp->b_next;
1298                 mp->b_next = NULL;
1299 
1300                 /*
1301                  * Wind back b_rptr to point at the VLAN header.
1302                  */
1303                 ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1304                 mp->b_rptr -= offset;
1305 
1306                 /*
1307                  * Pass the packet on.
1308                  */
1309                 if (canputnext(dsp->ds_rq))
1310                         putnext(dsp->ds_rq, mp);
1311                 else
1312                         freemsg(mp);
1313                 /*
1314                  * Move on to the next packet in the chain.
1315                  */
1316                 mp = next;
1317         } while (mp != NULL);
1318 }
1319 
1320 /*
1321  * Default receive function (send DL_UNITDATA_IND messages).
1322  */
1323 /*ARGSUSED*/
1324 void
1325 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1326     mac_header_info_t *mhip)
1327 {
1328         dld_str_t               *dsp = (dld_str_t *)arg;
1329         mblk_t                  *ud_mp;
1330         mblk_t                  *next;
1331         size_t                  offset = 0;
1332         boolean_t               strip_vlan = B_TRUE;
1333 
1334         /*
1335          * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1336          */
1337         if (mhip->mhi_istagged &&
1338             (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1339             ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1340             (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1341                 offset = VLAN_TAGSZ;
1342                 strip_vlan = B_FALSE;
1343         }
1344 
1345         ASSERT(mp != NULL);
1346         do {
1347                 /*
1348                  * Get the pointer to the next packet in the chain and then
1349                  * clear b_next before the packet gets passed on.
1350                  */
1351                 next = mp->b_next;
1352                 mp->b_next = NULL;
1353 
1354                 /*
1355                  * Wind back b_rptr to point at the MAC header.
1356                  */
1357                 ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1358                 mp->b_rptr -= mhip->mhi_hdrsize;
1359 
1360                 /*
1361                  * Create the DL_UNITDATA_IND M_PROTO.
1362                  */
1363                 if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1364                         freemsgchain(mp);
1365                         return;
1366                 }
1367 
1368                 /*
1369                  * Advance b_rptr to point at the payload (or the VLAN header).
1370                  */
1371                 mp->b_rptr += (mhip->mhi_hdrsize - offset);
1372 
1373                 /*
1374                  * Prepend the DL_UNITDATA_IND.
1375                  */
1376                 ud_mp->b_cont = mp;
1377 
1378                 /*
1379                  * Send the message.
1380                  */
1381                 if (canputnext(dsp->ds_rq))
1382                         putnext(dsp->ds_rq, ud_mp);
1383                 else
1384                         freemsg(ud_mp);
1385 
1386                 /*
1387                  * Move on to the next packet in the chain.
1388                  */
1389                 mp = next;
1390         } while (mp != NULL);
1391 }
1392 
1393 /*
1394  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1395  */
1396 static void
1397 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu, uint_t multicast_sdu)
1398 {
1399         mblk_t          *mp;
1400         dl_notify_ind_t *dlip;
1401 
1402         if (!(dsp->ds_notifications & (DL_NOTE_SDU_SIZE|DL_NOTE_SDU_SIZE2)))
1403                 return;
1404 
1405         if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1406             M_PROTO, 0)) == NULL)
1407                 return;
1408 
1409         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1410         dlip = (dl_notify_ind_t *)mp->b_rptr;
1411         dlip->dl_primitive = DL_NOTIFY_IND;
1412         if (dsp->ds_notifications & DL_NOTE_SDU_SIZE2) {
1413                 dlip->dl_notification = DL_NOTE_SDU_SIZE2;
1414                 dlip->dl_data1 = max_sdu;
1415                 dlip->dl_data2 = multicast_sdu;
1416         } else {
1417                 dlip->dl_notification = DL_NOTE_SDU_SIZE;
1418                 dlip->dl_data = max_sdu;
1419         }
1420 
1421         qreply(dsp->ds_wq, mp);
1422 }
1423 
1424 /*
1425  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1426  * current state of the interface.
1427  */
1428 void
1429 dld_str_notify_ind(dld_str_t *dsp)
1430 {
1431         mac_notify_type_t       type;
1432 
1433         for (type = 0; type < MAC_NNOTE; type++)
1434                 str_notify(dsp, type);
1435 }
1436 
/*
 * Layout of the M_PROTO message built by str_unitdata_ind(): the
 * DL_UNITDATA_IND primitive followed by fixed-size buffers for the
 * destination and source addresses.  Each buffer has room for a MAC
 * address of up to MAXMACADDRLEN bytes followed by a 16-bit SAP.
 */
typedef struct dl_unitdata_ind_wrapper {
        /* Must stay first: str_unitdata_ind() asserts it starts the message. */
        dl_unitdata_ind_t       dl_unitdata;
        uint8_t                 dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
        uint8_t                 dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1442 
1443 /*
1444  * Create a DL_UNITDATA_IND M_PROTO message.
1445  */
1446 static mblk_t *
1447 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1448 {
1449         mblk_t                          *nmp;
1450         dl_unitdata_ind_wrapper_t       *dlwp;
1451         dl_unitdata_ind_t               *dlp;
1452         mac_header_info_t               mhi;
1453         uint_t                          addr_length;
1454         uint8_t                         *daddr;
1455         uint8_t                         *saddr;
1456 
1457         /*
1458          * Get the packet header information.
1459          */
1460         if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
1461                 return (NULL);
1462 
1463         /*
1464          * Allocate a message large enough to contain the wrapper structure
1465          * defined above.
1466          */
1467         if ((nmp = mexchange(dsp->ds_wq, NULL,
1468             sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1469             DL_UNITDATA_IND)) == NULL)
1470                 return (NULL);
1471 
1472         dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1473 
1474         dlp = &(dlwp->dl_unitdata);
1475         ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1476         ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1477 
1478         /*
1479          * Copy in the destination address.
1480          */
1481         addr_length = dsp->ds_mip->mi_addr_length;
1482         daddr = dlwp->dl_dest_addr;
1483         dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1484         bcopy(mhi.mhi_daddr, daddr, addr_length);
1485 
1486         /*
1487          * Set the destination DLSAP to the SAP value encoded in the packet.
1488          */
1489         if (mhi.mhi_istagged && !strip_vlan)
1490                 *(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1491         else
1492                 *(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1493         dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1494 
1495         /*
1496          * If the destination address was multicast or broadcast then the
1497          * dl_group_address field should be non-zero.
1498          */
1499         dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1500             (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1501 
1502         /*
1503          * Copy in the source address if one exists.  Some MAC types (DL_IB
1504          * for example) may not have access to source information.
1505          */
1506         if (mhi.mhi_saddr == NULL) {
1507                 dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1508         } else {
1509                 saddr = dlwp->dl_src_addr;
1510                 dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1511                 bcopy(mhi.mhi_saddr, saddr, addr_length);
1512 
1513                 /*
1514                  * Set the source DLSAP to the packet ethertype.
1515                  */
1516                 *(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1517                 dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1518         }
1519 
1520         return (nmp);
1521 }
1522 
1523 /*
1524  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1525  */
1526 static void
1527 str_notify_promisc_on_phys(dld_str_t *dsp)
1528 {
1529         mblk_t          *mp;
1530         dl_notify_ind_t *dlip;
1531 
1532         if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1533                 return;
1534 
1535         if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1536             M_PROTO, 0)) == NULL)
1537                 return;
1538 
1539         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1540         dlip = (dl_notify_ind_t *)mp->b_rptr;
1541         dlip->dl_primitive = DL_NOTIFY_IND;
1542         dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1543 
1544         qreply(dsp->ds_wq, mp);
1545 }
1546 
1547 /*
1548  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1549  */
1550 static void
1551 str_notify_promisc_off_phys(dld_str_t *dsp)
1552 {
1553         mblk_t          *mp;
1554         dl_notify_ind_t *dlip;
1555 
1556         if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1557                 return;
1558 
1559         if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1560             M_PROTO, 0)) == NULL)
1561                 return;
1562 
1563         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1564         dlip = (dl_notify_ind_t *)mp->b_rptr;
1565         dlip->dl_primitive = DL_NOTIFY_IND;
1566         dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1567 
1568         qreply(dsp->ds_wq, mp);
1569 }
1570 
1571 /*
1572  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1573  */
1574 static void
1575 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1576 {
1577         mblk_t          *mp;
1578         dl_notify_ind_t *dlip;
1579         uint_t          addr_length;
1580         uint16_t        ethertype;
1581 
1582         if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1583                 return;
1584 
1585         addr_length = dsp->ds_mip->mi_addr_length;
1586         if ((mp = mexchange(dsp->ds_wq, NULL,
1587             sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1588             M_PROTO, 0)) == NULL)
1589                 return;
1590 
1591         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1592         dlip = (dl_notify_ind_t *)mp->b_rptr;
1593         dlip->dl_primitive = DL_NOTIFY_IND;
1594         dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1595         dlip->dl_data = addr_type;
1596         dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1597         dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1598 
1599         bcopy(addr, &dlip[1], addr_length);
1600 
1601         ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1602         *(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1603 
1604         qreply(dsp->ds_wq, mp);
1605 }
1606 
1607 /*
1608  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1609  */
1610 static void
1611 str_notify_link_up(dld_str_t *dsp)
1612 {
1613         mblk_t          *mp;
1614         dl_notify_ind_t *dlip;
1615 
1616         if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1617                 return;
1618 
1619         if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1620             M_PROTO, 0)) == NULL)
1621                 return;
1622 
1623         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1624         dlip = (dl_notify_ind_t *)mp->b_rptr;
1625         dlip->dl_primitive = DL_NOTIFY_IND;
1626         dlip->dl_notification = DL_NOTE_LINK_UP;
1627 
1628         qreply(dsp->ds_wq, mp);
1629 }
1630 
1631 /*
1632  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1633  */
1634 static void
1635 str_notify_link_down(dld_str_t *dsp)
1636 {
1637         mblk_t          *mp;
1638         dl_notify_ind_t *dlip;
1639 
1640         if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1641                 return;
1642 
1643         if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1644             M_PROTO, 0)) == NULL)
1645                 return;
1646 
1647         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1648         dlip = (dl_notify_ind_t *)mp->b_rptr;
1649         dlip->dl_primitive = DL_NOTIFY_IND;
1650         dlip->dl_notification = DL_NOTE_LINK_DOWN;
1651 
1652         qreply(dsp->ds_wq, mp);
1653 }
1654 
1655 /*
1656  * DL_NOTIFY_IND: DL_NOTE_SPEED
1657  */
1658 static void
1659 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1660 {
1661         mblk_t          *mp;
1662         dl_notify_ind_t *dlip;
1663 
1664         if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1665                 return;
1666 
1667         if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1668             M_PROTO, 0)) == NULL)
1669                 return;
1670 
1671         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1672         dlip = (dl_notify_ind_t *)mp->b_rptr;
1673         dlip->dl_primitive = DL_NOTIFY_IND;
1674         dlip->dl_notification = DL_NOTE_SPEED;
1675         dlip->dl_data = speed;
1676 
1677         qreply(dsp->ds_wq, mp);
1678 }
1679 
1680 /*
1681  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1682  */
1683 static void
1684 str_notify_capab_reneg(dld_str_t *dsp)
1685 {
1686         mblk_t          *mp;
1687         dl_notify_ind_t *dlip;
1688 
1689         if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1690                 return;
1691 
1692         if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1693             M_PROTO, 0)) == NULL)
1694                 return;
1695 
1696         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1697         dlip = (dl_notify_ind_t *)mp->b_rptr;
1698         dlip->dl_primitive = DL_NOTIFY_IND;
1699         dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1700 
1701         qreply(dsp->ds_wq, mp);
1702 }
1703 
1704 /*
1705  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1706  */
1707 static void
1708 str_notify_fastpath_flush(dld_str_t *dsp)
1709 {
1710         mblk_t          *mp;
1711         dl_notify_ind_t *dlip;
1712 
1713         if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1714                 return;
1715 
1716         if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1717             M_PROTO, 0)) == NULL)
1718                 return;
1719 
1720         bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1721         dlip = (dl_notify_ind_t *)mp->b_rptr;
1722         dlip->dl_primitive = DL_NOTIFY_IND;
1723         dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1724 
1725         qreply(dsp->ds_wq, mp);
1726 }
1727 
1728 static void
1729 str_notify_allowed_ips(dld_str_t *dsp)
1730 {
1731         mblk_t          *mp;
1732         dl_notify_ind_t *dlip;
1733         size_t          mp_size;
1734         mac_protect_t   *mrp;
1735 
1736         if (!(dsp->ds_notifications & DL_NOTE_ALLOWED_IPS))
1737                 return;
1738 
1739         mp_size = sizeof (mac_protect_t) + sizeof (dl_notify_ind_t);
1740         if ((mp = mexchange(dsp->ds_wq, NULL, mp_size, M_PROTO, 0)) == NULL)
1741                 return;
1742 
1743         mrp = mac_protect_get(dsp->ds_mh);
1744         bzero(mp->b_rptr, mp_size);
1745         dlip = (dl_notify_ind_t *)mp->b_rptr;
1746         dlip->dl_primitive = DL_NOTIFY_IND;
1747         dlip->dl_notification = DL_NOTE_ALLOWED_IPS;
1748         dlip->dl_data = 0;
1749         dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1750         dlip->dl_addr_length = sizeof (mac_protect_t);
1751         bcopy(mrp, mp->b_rptr + sizeof (dl_notify_ind_t),
1752             sizeof (mac_protect_t));
1753 
1754         qreply(dsp->ds_wq, mp);
1755 }
1756 
1757 /*
1758  * MAC notification callback.
1759  */
1760 void
1761 str_notify(void *arg, mac_notify_type_t type)
1762 {
1763         dld_str_t               *dsp = (dld_str_t *)arg;
1764         queue_t                 *q = dsp->ds_wq;
1765         mac_handle_t            mh = dsp->ds_mh;
1766         mac_client_handle_t     mch = dsp->ds_mch;
1767         uint8_t                 addr[MAXMACADDRLEN];
1768 
1769         switch (type) {
1770         case MAC_NOTE_TX:
1771                 qenable(q);
1772                 break;
1773 
1774         case MAC_NOTE_DEVPROMISC:
1775                 /*
1776                  * Send the appropriate DL_NOTIFY_IND.
1777                  */
1778                 if (mac_promisc_get(mh))
1779                         str_notify_promisc_on_phys(dsp);
1780                 else
1781                         str_notify_promisc_off_phys(dsp);
1782                 break;
1783 
1784         case MAC_NOTE_UNICST:
1785                 /*
1786                  * This notification is sent whenever the MAC unicast
1787                  * address changes.
1788                  */
1789                 mac_unicast_primary_get(mh, addr);
1790 
1791                 /*
1792                  * Send the appropriate DL_NOTIFY_IND.
1793                  */
1794                 str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1795                 break;
1796 
1797         case MAC_NOTE_DEST:
1798                 /*
1799                  * Only send up DL_NOTE_DEST_ADDR if the link has a
1800                  * destination address.
1801                  */
1802                 if (mac_dst_get(dsp->ds_mh, addr))
1803                         str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1804                 break;
1805 
1806         case MAC_NOTE_LOWLINK:
1807         case MAC_NOTE_LINK:
1808                 /*
1809                  * LOWLINK refers to the actual link status. For links that
1810                  * are not part of a bridge instance LOWLINK and LINK state
1811                  * are the same. But for a link part of a bridge instance
1812                  * LINK state refers to the aggregate link status: "up" when
1813                  * at least one link part of the bridge is up and is "down"
1814                  * when all links part of the bridge are down.
1815                  *
1816                  * Clients can request to be notified of the LOWLINK state
1817                  * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1818                  * daemon request lowlink state changes and upper layer clients
1819                  * receive notifications of the aggregate link state changes
1820                  * which is the default when requesting LINK UP/DOWN state
1821                  * notifications.
1822                  */
1823 
1824                 /*
1825                  * Check that the notification type matches the one that we
1826                  * want.  If we want lower-level link notifications, and this
1827                  * is upper, or if we want upper and this is lower, then
1828                  * ignore.
1829                  */
1830                 if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1831                         break;
1832                 /*
1833                  * This notification is sent every time the MAC driver
1834                  * updates the link state.
1835                  */
1836                 switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1837                     MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1838                 case LINK_STATE_UP: {
1839                         uint64_t speed;
1840                         /*
1841                          * The link is up so send the appropriate
1842                          * DL_NOTIFY_IND.
1843                          */
1844                         str_notify_link_up(dsp);
1845 
1846                         speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1847                         str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1848                         break;
1849                 }
1850                 case LINK_STATE_DOWN:
1851                         /*
1852                          * The link is down so send the appropriate
1853                          * DL_NOTIFY_IND.
1854                          */
1855                         str_notify_link_down(dsp);
1856                         break;
1857 
1858                 default:
1859                         break;
1860                 }
1861                 break;
1862 
1863         case MAC_NOTE_CAPAB_CHG:
1864                 /*
1865                  * This notification is sent whenever the MAC resources
1866                  * change or capabilities change. We need to renegotiate
1867                  * the capabilities. Send the appropriate DL_NOTIFY_IND.
1868                  */
1869                 str_notify_capab_reneg(dsp);
1870                 break;
1871 
1872         case MAC_NOTE_SDU_SIZE: {
1873                 uint_t  max_sdu;
1874                 uint_t  multicast_sdu;
1875                 mac_sdu_get2(dsp->ds_mh, NULL, &max_sdu, &multicast_sdu);
1876                 str_notify_sdu_size(dsp, max_sdu, multicast_sdu);
1877                 break;
1878         }
1879 
1880         case MAC_NOTE_FASTPATH_FLUSH:
1881                 str_notify_fastpath_flush(dsp);
1882                 break;
1883 
1884         /* Unused notifications */
1885         case MAC_NOTE_MARGIN:
1886                 break;
1887 
1888         case MAC_NOTE_ALLOWED_IPS:
1889                 str_notify_allowed_ips(dsp);
1890                 break;
1891 
1892         default:
1893                 ASSERT(B_FALSE);
1894                 break;
1895         }
1896 }
1897 
1898 /*
1899  * This function is called via a taskq mechansim to process all control
1900  * messages on a per 'dsp' end point.
1901  */
1902 static void
1903 dld_wput_nondata_task(void *arg)
1904 {
1905         dld_str_t       *dsp = arg;
1906         mblk_t          *mp;
1907 
1908         mutex_enter(&dsp->ds_lock);
1909         while (dsp->ds_pending_head != NULL) {
1910                 mp = dsp->ds_pending_head;
1911                 dsp->ds_pending_head = mp->b_next;
1912                 mp->b_next = NULL;
1913                 if (dsp->ds_pending_head == NULL)
1914                         dsp->ds_pending_tail = NULL;
1915                 mutex_exit(&dsp->ds_lock);
1916 
1917                 switch (DB_TYPE(mp)) {
1918                 case M_PROTO:
1919                 case M_PCPROTO:
1920                         dld_proto(dsp, mp);
1921                         break;
1922                 case M_IOCTL:
1923                         dld_ioc(dsp, mp);
1924                         break;
1925                 default:
1926                         ASSERT(0);
1927                 }
1928 
1929                 mutex_enter(&dsp->ds_lock);
1930         }
1931         ASSERT(dsp->ds_pending_tail == NULL);
1932         dsp->ds_dlpi_pending = 0;
1933         cv_broadcast(&dsp->ds_dlpi_pending_cv);
1934         mutex_exit(&dsp->ds_lock);
1935 }
1936 
1937 /*
1938  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1939  * thread is started at boot time.
1940  */
1941 static void
1942 dld_taskq_dispatch(void)
1943 {
1944         callb_cpr_t     cprinfo;
1945         dld_str_t       *dsp;
1946 
1947         CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1948             "dld_taskq_dispatch");
1949         mutex_enter(&dld_taskq_lock);
1950 
1951         while (!dld_taskq_quit) {
1952                 dsp = list_head(&dld_taskq_list);
1953                 while (dsp != NULL) {
1954                         list_remove(&dld_taskq_list, dsp);
1955                         mutex_exit(&dld_taskq_lock);
1956                         VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1957                             dsp, TQ_SLEEP) != 0);
1958                         mutex_enter(&dld_taskq_lock);
1959                         dsp = list_head(&dld_taskq_list);
1960                 }
1961 
1962                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1963                 cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1964                 CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1965         }
1966 
1967         dld_taskq_done = B_TRUE;
1968         cv_signal(&dld_taskq_cv);
1969         CALLB_CPR_EXIT(&cprinfo);
1970         thread_exit();
1971 }
1972 
1973 /*
1974  * All control operations are serialized on the 'dsp' and are also funneled
1975  * through a taskq mechanism to ensure that subsequent processing has kernel
1976  * context and can safely use cv_wait.
1977  *
1978  * Mechanisms to handle taskq dispatch failures
1979  *
1980  * The only way to be sure that taskq dispatch does not fail is to either
1981  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1982  * some number of entries and make sure that the number of outstanding requests
1983  * are less than that number. We can't use TQ_SLEEP since we don't know the
1984  * context. Nor can we bound the total number of 'dsp' end points. So we are
1985  * unable to use either of the above schemes, and are forced to deal with
1986  * taskq dispatch failures. Note that even dynamic taskq could fail in
1987  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1988  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1989  * framework.
1990  *
1991  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1992  * We also have a single global thread to retry the taskq dispatch. This
1993  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1994  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1995  */
1996 static void
1997 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1998 {
1999         ASSERT(mp->b_next == NULL);
2000         mutex_enter(&dsp->ds_lock);
2001         if (dsp->ds_pending_head != NULL) {
2002                 ASSERT(dsp->ds_dlpi_pending);
2003                 dsp->ds_pending_tail->b_next = mp;
2004                 dsp->ds_pending_tail = mp;
2005                 mutex_exit(&dsp->ds_lock);
2006                 return;
2007         }
2008         ASSERT(dsp->ds_pending_tail == NULL);
2009         dsp->ds_pending_head = dsp->ds_pending_tail = mp;
2010         /*
2011          * At this point if ds_dlpi_pending is set, it implies that the taskq
2012          * thread is still active and is processing the last message, though
2013          * the pending queue has been emptied.
2014          */
2015         if (dsp->ds_dlpi_pending) {
2016                 mutex_exit(&dsp->ds_lock);
2017                 return;
2018         }
2019 
2020         dsp->ds_dlpi_pending = 1;
2021         mutex_exit(&dsp->ds_lock);
2022 
2023         if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
2024             TQ_NOSLEEP) != 0)
2025                 return;
2026 
2027         mutex_enter(&dld_taskq_lock);
2028         list_insert_tail(&dld_taskq_list, dsp);
2029         cv_signal(&dld_taskq_cv);
2030         mutex_exit(&dld_taskq_lock);
2031 }
2032 
2033 /*
2034  * Process an M_IOCTL message.
2035  */
2036 static void
2037 dld_ioc(dld_str_t *dsp, mblk_t *mp)
2038 {
2039         uint_t                  cmd;
2040 
2041         cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
2042         ASSERT(dsp->ds_type == DLD_DLPI);
2043 
2044         switch (cmd) {
2045         case DLIOCNATIVE:
2046                 ioc_native(dsp, mp);
2047                 break;
2048         case DLIOCMARGININFO:
2049                 ioc_margin(dsp, mp);
2050                 break;
2051         case DLIOCRAW:
2052                 ioc_raw(dsp, mp);
2053                 break;
2054         case DLIOCHDRINFO:
2055                 ioc_fast(dsp, mp);
2056                 break;
2057         case DLIOCLOWLINK:
2058                 ioc_lowlink(dsp, mp);
2059                 break;
2060         default:
2061                 ioc(dsp, mp);
2062         }
2063 }
2064 
2065 /*
2066  * DLIOCNATIVE
2067  */
2068 static void
2069 ioc_native(dld_str_t *dsp, mblk_t *mp)
2070 {
2071         queue_t *q = dsp->ds_wq;
2072         const mac_info_t *mip = dsp->ds_mip;
2073 
2074         /*
2075          * Native mode can be enabled if it's disabled and if the
2076          * native media type is different.
2077          */
2078         if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2079                 dsp->ds_native = B_TRUE;
2080 
2081         if (dsp->ds_native)
2082                 miocack(q, mp, 0, mip->mi_nativemedia);
2083         else
2084                 miocnak(q, mp, 0, ENOTSUP);
2085 }
2086 
2087 /*
2088  * DLIOCMARGININFO
2089  */
2090 static void
2091 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2092 {
2093         queue_t *q = dsp->ds_wq;
2094         uint32_t margin;
2095         int err;
2096 
2097         if (dsp->ds_dlstate == DL_UNATTACHED) {
2098                 err = EINVAL;
2099                 goto failed;
2100         }
2101         if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2102                 goto failed;
2103 
2104         mac_margin_get(dsp->ds_mh, &margin);
2105         *((uint32_t *)mp->b_cont->b_rptr) = margin;
2106         miocack(q, mp, sizeof (uint32_t), 0);
2107         return;
2108 
2109 failed:
2110         miocnak(q, mp, 0, err);
2111 }
2112 
2113 /*
2114  * DLIOCRAW
2115  */
2116 static void
2117 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2118 {
2119         queue_t *q = dsp->ds_wq;
2120         mac_perim_handle_t      mph;
2121 
2122         if (dsp->ds_mh == NULL) {
2123                 dsp->ds_mode = DLD_RAW;
2124                 miocack(q, mp, 0, 0);
2125                 return;
2126         }
2127 
2128         mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2129         if (dsp->ds_polling || dsp->ds_direct) {
2130                 mac_perim_exit(mph);
2131                 miocnak(q, mp, 0, EPROTO);
2132                 return;
2133         }
2134 
2135         if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2136                 /*
2137                  * Set the receive callback.
2138                  */
2139                 dls_rx_set(dsp, dld_str_rx_raw, dsp);
2140         }
2141 
2142         /*
2143          * Note that raw mode is enabled.
2144          */
2145         dsp->ds_mode = DLD_RAW;
2146         mac_perim_exit(mph);
2147 
2148         miocack(q, mp, 0, 0);
2149 }
2150 
2151 /*
2152  * DLIOCHDRINFO
2153  */
2154 static void
2155 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2156 {
2157         dl_unitdata_req_t *dlp;
2158         off_t           off;
2159         size_t          len;
2160         const uint8_t   *addr;
2161         uint16_t        sap;
2162         mblk_t          *nmp;
2163         mblk_t          *hmp;
2164         uint_t          addr_length;
2165         queue_t         *q = dsp->ds_wq;
2166         int             err;
2167         mac_perim_handle_t      mph;
2168 
2169         if (dld_opt & DLD_OPT_NO_FASTPATH) {
2170                 err = ENOTSUP;
2171                 goto failed;
2172         }
2173 
2174         /*
2175          * DLIOCHDRINFO should only come from IP. The one initiated from
2176          * user-land should not be allowed.
2177          */
2178         if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2179                 err = EINVAL;
2180                 goto failed;
2181         }
2182 
2183         nmp = mp->b_cont;
2184         if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2185             (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2186             dlp->dl_primitive != DL_UNITDATA_REQ)) {
2187                 err = EINVAL;
2188                 goto failed;
2189         }
2190 
2191         off = dlp->dl_dest_addr_offset;
2192         len = dlp->dl_dest_addr_length;
2193 
2194         if (!MBLKIN(nmp, off, len)) {
2195                 err = EINVAL;
2196                 goto failed;
2197         }
2198 
2199         if (dsp->ds_dlstate != DL_IDLE) {
2200                 err = ENOTSUP;
2201                 goto failed;
2202         }
2203 
2204         addr_length = dsp->ds_mip->mi_addr_length;
2205         if (len != addr_length + sizeof (uint16_t)) {
2206                 err = EINVAL;
2207                 goto failed;
2208         }
2209 
2210         addr = nmp->b_rptr + off;
2211         sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2212 
2213         if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2214                 err = ENOMEM;
2215                 goto failed;
2216         }
2217 
2218         /*
2219          * This ioctl might happen concurrently with a direct call to dld_capab
2220          * that tries to enable direct and/or poll capabilities. Since the
2221          * stack does not serialize them, we do so here to avoid mixing
2222          * the callbacks.
2223          */
2224         mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2225         if (dsp->ds_mode != DLD_FASTPATH) {
2226                 /*
2227                  * Set the receive callback (unless polling is enabled).
2228                  */
2229                 if (!dsp->ds_polling && !dsp->ds_direct)
2230                         dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2231 
2232                 /*
2233                  * Note that fast-path mode is enabled.
2234                  */
2235                 dsp->ds_mode = DLD_FASTPATH;
2236         }
2237         mac_perim_exit(mph);
2238 
2239         freemsg(nmp->b_cont);
2240         nmp->b_cont = hmp;
2241 
2242         miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2243         return;
2244 failed:
2245         miocnak(q, mp, 0, err);
2246 }
2247 
2248 /*
2249  * DLIOCLOWLINK: request actual link state changes. When the
2250  * link is part of a bridge instance the client receives actual
2251  * link state changes and not the aggregate link status. Used by
2252  * the bridging daemon (bridged) for proper RSTP operation.
2253  */
2254 static void
2255 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
2256 {
2257         queue_t *q = dsp->ds_wq;
2258         int err;
2259 
2260         if ((err = miocpullup(mp, sizeof (int))) != 0) {
2261                 miocnak(q, mp, 0, err);
2262         } else {
2263                 /* LINTED: alignment */
2264                 dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
2265                 miocack(q, mp, 0, 0);
2266         }
2267 }
2268 
2269 /*
2270  * Catch-all handler.
2271  */
2272 static void
2273 ioc(dld_str_t *dsp, mblk_t *mp)
2274 {
2275         queue_t *q = dsp->ds_wq;
2276 
2277         if (dsp->ds_dlstate == DL_UNATTACHED) {
2278                 miocnak(q, mp, 0, EINVAL);
2279                 return;
2280         }
2281         mac_ioctl(dsp->ds_mh, q, mp);
2282 }