1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * MAC Services Module
  28  *
  29  * The GLDv3 framework locking -  The MAC layer
  30  * --------------------------------------------
  31  *
  32  * The MAC layer is central to the GLD framework and can provide the locking
  33  * framework needed for itself and for the use of MAC clients. MAC end points
  34  * are fairly disjoint and don't share a lot of state. So a coarse grained
  35  * multi-threading scheme is to single thread all create/modify/delete or set
  36  * type of control operations on a per mac end point while allowing data threads
  37  * concurrently.
  38  *
  39  * Control operations (set) that modify a mac end point are always serialized on
   40  * a per mac end point basis. We have at most 1 such thread per mac end point
  41  * at a time.
  42  *
  43  * All other operations that are not serialized are essentially multi-threaded.
  44  * For example a control operation (get) like getting statistics which may not
  45  * care about reading values atomically or data threads sending or receiving
  46  * data. Mostly these type of operations don't modify the control state. Any
  47  * state these operations care about are protected using traditional locks.
  48  *
  49  * The perimeter only serializes serial operations. It does not imply there
  50  * aren't any other concurrent operations. However a serialized operation may
  51  * sometimes need to make sure it is the only thread. In this case it needs
  52  * to use reference counting mechanisms to cv_wait until any current data
  53  * threads are done.
  54  *
  55  * The mac layer itself does not hold any locks across a call to another layer.
  56  * The perimeter is however held across a down call to the driver to make the
  57  * whole control operation atomic with respect to other control operations.
  58  * Also the data path and get type control operations may proceed concurrently.
  59  * These operations synchronize with the single serial operation on a given mac
  60  * end point using regular locks. The perimeter ensures that conflicting
  61  * operations like say a mac_multicast_add and a mac_multicast_remove on the
  62  * same mac end point don't interfere with each other and also ensures that the
  63  * changes in the mac layer and the call to the underlying driver to say add a
  64  * multicast address are done atomically without interference from a thread
  65  * trying to delete the same address.
  66  *
  67  * For example, consider
   68  * mac_multicast_add()
  69  * {
  70  *      mac_perimeter_enter();  serialize all control operations
  71  *
  72  *      grab list lock          protect against access by data threads
  73  *      add to list
  74  *      drop list lock
  75  *
  76  *      call driver's mi_multicst
  77  *
  78  *      mac_perimeter_exit();
  79  * }
  80  *
  81  * To lessen the number of serialization locks and simplify the lock hierarchy,
  82  * we serialize all the control operations on a per mac end point by using a
  83  * single serialization lock called the perimeter. We allow recursive entry into
  84  * the perimeter to facilitate use of this mechanism by both the mac client and
  85  * the MAC layer itself.
  86  *
  87  * MAC client means an entity that does an operation on a mac handle
  88  * obtained from a mac_open/mac_client_open. Similarly MAC driver means
  89  * an entity that does an operation on a mac handle obtained from a
  90  * mac_register. An entity could be both client and driver but on different
  91  * handles eg. aggr. and should only make the corresponding mac interface calls
  92  * i.e. mac driver interface or mac client interface as appropriate for that
  93  * mac handle.
  94  *
  95  * General rules.
  96  * -------------
  97  *
   98  * R1. The lock order of upcall threads is naturally opposite to downcall
  99  * threads. Hence upcalls must not hold any locks across layers for fear of
 100  * recursive lock enter and lock order violation. This applies to all layers.
 101  *
 102  * R2. The perimeter is just another lock. Since it is held in the down
 103  * direction, acquiring the perimeter in an upcall is prohibited as it would
 104  * cause a deadlock. This applies to all layers.
 105  *
 106  * Note that upcalls that need to grab the mac perimeter (for example
 107  * mac_notify upcalls) can still achieve that by posting the request to a
 108  * thread, which can then grab all the required perimeters and locks in the
  109  * right global order. Note that in the above example the mac layer itself
 110  * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
 111  * to the client must do that. Please see the aggr code for an example.
 112  *
 113  * MAC client rules
 114  * ----------------
 115  *
 116  * R3. A MAC client may use the MAC provided perimeter facility to serialize
  117  * control operations on a per mac end point. It does this by acquiring
 118  * and holding the perimeter across a sequence of calls to the mac layer.
 119  * This ensures atomicity across the entire block of mac calls. In this
 120  * model the MAC client must not hold any client locks across the calls to
 121  * the mac layer. This model is the preferred solution.
 122  *
 123  * R4. However if a MAC client has a lot of global state across all mac end
 124  * points the per mac end point serialization may not be sufficient. In this
 125  * case the client may choose to use global locks or use its own serialization.
 126  * To avoid deadlocks, these client layer locks held across the mac calls
 127  * in the control path must never be acquired by the data path for the reason
 128  * mentioned below.
 129  *
 130  * (Assume that a control operation that holds a client lock blocks in the
 131  * mac layer waiting for upcall reference counts to drop to zero. If an upcall
 132  * data thread that holds this reference count, tries to acquire the same
 133  * client lock subsequently it will deadlock).
 134  *
 135  * A MAC client may follow either the R3 model or the R4 model, but can't
 136  * mix both. In the former, the hierarchy is Perim -> client locks, but in
 137  * the latter it is client locks -> Perim.
 138  *
 139  * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
 140  * context since they may block while trying to acquire the perimeter.
 141  * In addition some calls may block waiting for upcall refcnts to come down to
 142  * zero.
 143  *
 144  * R6. MAC clients must make sure that they are single threaded and all threads
 145  * from the top (in particular data threads) have finished before calling
 146  * mac_client_close. The MAC framework does not track the number of client
 147  * threads using the mac client handle. Also mac clients must make sure
 148  * they have undone all the control operations before calling mac_client_close.
 149  * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
 150  * mac_unicast_add/mac_multicast_add.
 151  *
 152  * MAC framework rules
 153  * -------------------
 154  *
 155  * R7. The mac layer itself must not hold any mac layer locks (except the mac
 156  * perimeter) across a call to any other layer from the mac layer. The call to
 157  * any other layer could be via mi_* entry points, classifier entry points into
 158  * the driver or via upcall pointers into layers above. The mac perimeter may
 159  * be acquired or held only in the down direction, for e.g. when calling into
  160  * a mi_* driver entry point to provide atomicity of the operation.
 161  *
 162  * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
 163  * mac driver interfaces, the MAC layer must provide a cut out for control
 164  * interfaces like upcall notifications and start them in a separate thread.
 165  *
 166  * R9. Note that locking order also implies a plumbing order. For example
 167  * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
 168  * to plumb in any other order must be failed at mac_open time, otherwise it
 169  * could lead to deadlocks due to inverse locking order.
 170  *
 171  * R10. MAC driver interfaces must not block since the driver could call them
 172  * in interrupt context.
 173  *
 174  * R11. Walkers must preferably not hold any locks while calling walker
 175  * callbacks. Instead these can operate on reference counts. In simple
 176  * callbacks it may be ok to hold a lock and call the callbacks, but this is
 177  * harder to maintain in the general case of arbitrary callbacks.
 178  *
 179  * R12. The MAC layer must protect upcall notification callbacks using reference
 180  * counts rather than holding locks across the callbacks.
 181  *
 182  * R13. Given the variety of drivers, it is preferable if the MAC layer can make
 183  * sure that any pointers (such as mac ring pointers) it passes to the driver
 184  * remain valid until mac unregister time. Currently the mac layer achieves
 185  * this by using generation numbers for rings and freeing the mac rings only
 186  * at unregister time.  The MAC layer must provide a layer of indirection and
 187  * must not expose underlying driver rings or driver data structures/pointers
 188  * directly to MAC clients.
 189  *
 190  * MAC driver rules
 191  * ----------------
 192  *
 193  * R14. It would be preferable if MAC drivers don't hold any locks across any
 194  * mac call. However at a minimum they must not hold any locks across data
 195  * upcalls. They must also make sure that all references to mac data structures
 196  * are cleaned up and that it is single threaded at mac_unregister time.
 197  *
 198  * R15. MAC driver interfaces don't block and so the action may be done
 199  * asynchronously in a separate thread as for example handling notifications.
 200  * The driver must not assume that the action is complete when the call
 201  * returns.
 202  *
 203  * R16. Drivers must maintain a generation number per Rx ring, and pass it
 204  * back to mac_rx_ring(); They are expected to increment the generation
 205  * number whenever the ring's stop routine is invoked.
 206  * See comments in mac_rx_ring();
 207  *
  208  * R17. Similarly mi_stop is another synchronization point and the driver must
 209  * ensure that all upcalls are done and there won't be any future upcall
 210  * before returning from mi_stop.
 211  *
 212  * R18. The driver may assume that all set/modify control operations via
 213  * the mi_* entry points are single threaded on a per mac end point.
 214  *
 215  * Lock and Perimeter hierarchy scenarios
 216  * ---------------------------------------
 217  *
 218  * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
 219  *
 220  * ft_lock -> fe_lock [mac_flow_lookup]
 221  *
 222  * mi_rw_lock -> fe_lock [mac_bcast_send]
 223  *
 224  * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
 225  *
 226  * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
 227  *
 228  * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
 229  *
 230  * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
  231  * client to driver. In the case of clients that explicitly use the mac provided
  232  * perimeter mechanism for their serialization, the hierarchy is
  233  * Perimeter -> mac layer locks, since the client never holds any locks across
  234  * the mac calls. In the case of clients that use their own locks the hierarchy
 235  * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
 236  * calls mac_perim_enter/exit in this case.
 237  *
 238  * Subflow creation rules
 239  * ---------------------------
 240  * o In case of a user specified cpulist present on underlying link and flows,
 241  * the flows cpulist must be a subset of the underlying link.
 242  * o In case of a user specified fanout mode present on link and flow, the
 243  * subflow fanout count has to be less than or equal to that of the
 244  * underlying link. The cpu-bindings for the subflows will be a subset of
 245  * the underlying link.
  246  * o If no cpulist is specified on either the underlying link or the flow,
  247  * the underlying link relies on a MAC tunable to provide out of box fanout.
  248  * The subflow will have no cpulist (the subflow will be unbound).
  249  * o If no cpulist is specified on the underlying link, a subflow can
  250  * carry either a user-specified cpulist or fanout count. The cpu-bindings
  251  * for the subflow will not adhere to the restriction that they need to be a
  252  * subset of the underlying link.
 253  * o In case where the underlying link is carrying either a user specified
 254  * cpulist or fanout mode and for a unspecified subflow, the subflow will be
 255  * created unbound.
 256  * o While creating unbound subflows, bandwidth mode changes attempt to
 257  * figure a right fanout count. In such cases the fanout count will override
 258  * the unbound cpu-binding behavior.
 259  * o In addition to this, while cycling between flow and link properties, we
 260  * impose a restriction that if a link property has a subflow with
 261  * user-specified attributes, we will not allow changing the link property.
 262  * The administrator needs to reset all the user specified properties for the
 263  * subflows before attempting a link property change.
 264  * Some of the above rules can be overridden by specifying additional command
 265  * line options while creating or modifying link or subflow properties.
 266  */
 267 
 268 #include <sys/types.h>
 269 #include <sys/conf.h>
 270 #include <sys/id_space.h>
 271 #include <sys/esunddi.h>
 272 #include <sys/stat.h>
 273 #include <sys/mkdev.h>
 274 #include <sys/stream.h>
 275 #include <sys/strsun.h>
 276 #include <sys/strsubr.h>
 277 #include <sys/dlpi.h>
 278 #include <sys/list.h>
 279 #include <sys/modhash.h>
 280 #include <sys/mac_provider.h>
 281 #include <sys/mac_client_impl.h>
 282 #include <sys/mac_soft_ring.h>
 283 #include <sys/mac_stat.h>
 284 #include <sys/mac_impl.h>
 285 #include <sys/mac.h>
 286 #include <sys/dls.h>
 287 #include <sys/dld.h>
 288 #include <sys/modctl.h>
 289 #include <sys/fs/dv_node.h>
 290 #include <sys/thread.h>
 291 #include <sys/proc.h>
 292 #include <sys/callb.h>
 293 #include <sys/cpuvar.h>
 294 #include <sys/atomic.h>
 295 #include <sys/bitmap.h>
 296 #include <sys/sdt.h>
 297 #include <sys/mac_flow.h>
 298 #include <sys/ddi_intr_impl.h>
 299 #include <sys/disp.h>
 300 #include <sys/sdt.h>
 301 #include <sys/vnic.h>
 302 #include <sys/vnic_impl.h>
 303 #include <sys/vlan.h>
 304 #include <inet/ip.h>
 305 #include <inet/ip6.h>
 306 #include <sys/exacct.h>
 307 #include <sys/exacct_impl.h>
 308 #include <inet/nd.h>
 309 #include <sys/ethernet.h>
 310 #include <sys/pool.h>
 311 #include <sys/pool_pset.h>
 312 #include <sys/cpupart.h>
 313 #include <inet/wifi_ioctl.h>
 314 #include <net/wpa.h>
 315 
  316 #define IMPL_HASHSZ     67      /* prime */
  317 
      /* kmem cache of mac_impl_t objects; created in mac_init(). */
  318 kmem_cache_t            *i_mac_impl_cachep;
      /* String-keyed hash sized by IMPL_HASHSZ; created in mac_init(). */
  319 mod_hash_t              *i_mac_impl_hash;
      /* NOTE(review): presumably guards i_mac_impl_hash/i_mac_impl_count. */
  320 krwlock_t               i_mac_impl_lock;
      /* mac_fini() returns EBUSY while this is non-zero. */
  321 uint_t                  i_mac_impl_count;
      /* kmem cache of mac_ring_t objects; created in mac_init(). */
  322 static kmem_cache_t     *mac_ring_cache;
      /* id space for minors MAC_MAX_MINOR+1 .. MAC_PRIVATE_MINOR-1. */
  323 static id_space_t       *minor_ids;
      /* mac_fini() returns EBUSY while this is non-zero. */
  324 static uint32_t         minor_count;
      /* Pool event registration; mac_init() points it at mac_pool_event_cb(). */
  325 static pool_event_cb_t  mac_pool_event_reg;
 326 
  327 /*
  328  * Logging stuff. Perhaps mac_logging_interval could be broken into
  329  * mac_flow_log_interval and mac_link_log_interval if we want to be
  330  * able to schedule them differently.
       *
       * mac_init() sets the defaults: an interval of 20 (seconds, per the
       * comment there), flow and link logging both disabled, and no timer.
  331  */
  332 uint_t                  mac_logging_interval;
  333 boolean_t               mac_flow_log_enable;
  334 boolean_t               mac_link_log_enable;
  335 timeout_id_t            mac_logging_timer;
  336 
  337 /* for debugging, see MAC_DBG_PRT() in mac_impl.h */
  338 int mac_dbg = 0;
 339 
  340 #define MACTYPE_KMODDIR "mac"   /* presumably the plugin module subdir */
  341 #define MACTYPE_HASHSZ  67      /* prime, same value as IMPL_HASHSZ */
      /* String-keyed hash sized by MACTYPE_HASHSZ; created in mac_init(). */
  342 static mod_hash_t       *i_mactype_hash;
  343 /*
  344  * i_mactype_lock synchronizes threads that obtain references to mactype_t
  345  * structures through i_mactype_getplugin().
  346  */
  347 static kmutex_t         i_mactype_lock;
 348 
  349 /*
  350  * mac_tx_percpu_cnt
  351  *
  352  * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
  353  * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
  354  * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
  355  * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
       *
       * Note that mac_init rounds the count up to a power of 2 and then
       * decrements it, so the value stored here has the form 2**N - 1, in the
       * range [0 .. mac_tx_percpu_cnt_max - 1], rather than the raw count.
  356  */
  357 int mac_tx_percpu_cnt;
  358 int mac_tx_percpu_cnt_max = 128;
 359 
  360 /*
  361  * Callback functions for the bridge module.  These are guaranteed to be valid
  362  * when holding a reference on a link or when holding mip->mi_bridge_lock and
  363  * mi_bridge_link is non-NULL.
  364  */
  365 mac_bridge_tx_t mac_bridge_tx_cb;
  366 mac_bridge_rx_t mac_bridge_rx_cb;
  367 mac_bridge_ref_t mac_bridge_ref_cb;
  368 mac_bridge_ls_t mac_bridge_ls_cb;
 369 
  370 static int i_mac_constructor(void *, void *, int);
  371 static void i_mac_destructor(void *, void *);
      /* The two pairs around this comment are passed to kmem_cache_create(). */
  372 static int i_mac_ring_ctor(void *, void *, int);
  373 static void i_mac_ring_dtor(void *, void *);
  374 static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
      /* The next two are not static; their definitions are outside this section. */
  375 void mac_tx_client_flush(mac_client_impl_t *);
  376 void mac_tx_client_block(mac_client_impl_t *);
  377 static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
  378 static int mac_start_group_and_rings(mac_group_t *);
  379 static void mac_stop_group_and_rings(mac_group_t *);
      /* Registered with the pools framework by mac_init(). */
  380 static void mac_pool_event_cb(pool_event_t, int, void *);
 381 
  382 typedef struct netinfo_s {
              /*
               * NOTE(review): no user of netinfo_t is visible in this part of
               * the file; presumably a list-linked record used by the logging
               * code. Field meanings below are inferred - confirm against users.
               */
  383         list_node_t     ni_link;        /* list linkage */
  384         void            *ni_record;     /* opaque record pointer */
  385         int             ni_size;        /* presumably size of ni_record */
  386         int             ni_type;        /* record type tag */
  387 } netinfo_t;
 388 
 389 /*
 390  * Module initialization functions.
 391  */
 392 
  393 void
  394 mac_init(void)
  395 {
              /*
               * One-time setup of the module's global state: size the per-cpu
               * Tx lock count, create the kmem caches and hashes, initialize the
               * flow, soft ring, broadcast and client sub-modules, create the
               * minor number id space, set the logging defaults and register for
               * pool events. mac_fini() is the counterpart.
               */
  396         mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
  397             boot_max_ncpus);
  398 
  399         /* Upper bound is mac_tx_percpu_cnt_max */
  400         if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
  401                 mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
  402 
  403         if (mac_tx_percpu_cnt < 1) {
  404                 /* Someone set mac_tx_percpu_cnt_max to 0 or less */
  405                 mac_tx_percpu_cnt = 1;
  406         }
  407 
  408         ASSERT(mac_tx_percpu_cnt >= 1);
              /*
               * Round up to a power of 2. NOTE(review): this relies on highbit()
               * returning the 1-based index of the highest set bit (0 for 0) -
               * confirm.
               */
  409         mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
  410         /*
  411          * Make it of the form 2**N - 1 in the range
  412          * [0 .. mac_tx_percpu_cnt_max - 1]
  413          */
  414         mac_tx_percpu_cnt--;
  415 
              /* Object cache for mac_impl_t */
  416         i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
  417             sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
  418             NULL, NULL, NULL, 0);
  419         ASSERT(i_mac_impl_cachep != NULL);
  420 
              /* Object cache for mac_ring_t */
  421         mac_ring_cache = kmem_cache_create("mac_ring_cache",
  422             sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
  423             NULL, NULL, 0);
  424         ASSERT(mac_ring_cache != NULL);
  425 
              /*
               * String-keyed hash with null key/value destructors, so the hash
               * does not free the keys or values stored in it.
               */
  426         i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
  427             IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
  428             mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
  429         rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
  430 
              /* Sub-module initialization */
  431         mac_flow_init();
  432         mac_soft_ring_init();
  433         mac_bcast_init();
  434         mac_client_init();
  435 
  436         i_mac_impl_count = 0;
  437 
              /* String-keyed hash of mactype plugins, same setup as above */
  438         i_mactype_hash = mod_hash_create_extended("mactype_hash",
  439             MACTYPE_HASHSZ,
  440             mod_hash_null_keydtor, mod_hash_null_valdtor,
  441             mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
  442 
  443         /*
  444          * Allocate an id space to manage minor numbers. The range of the
  445          * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1.  This
  446          * leaves half of the 32-bit minors available for driver private use.
  447          */
  448         minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
  449             MAC_PRIVATE_MINOR-1);
  450         ASSERT(minor_ids != NULL);
  451         minor_count = 0;
  452 
  453         /* Let's default to 20 seconds */
  454         mac_logging_interval = 20;
  455         mac_flow_log_enable = B_FALSE;
  456         mac_link_log_enable = B_FALSE;
  457         mac_logging_timer = 0;
  458 
  459         /* Register to be notified of noteworthy pools events */
  460         mac_pool_event_reg.pec_func =  mac_pool_event_cb;
  461         mac_pool_event_reg.pec_arg = NULL;
  462         pool_event_cb_register(&mac_pool_event_reg);
  463 }
 464 
 465 int
 466 mac_fini(void)
 467 {
 468 
 469         if (i_mac_impl_count > 0 || minor_count > 0)
 470                 return (EBUSY);
 471 
 472         pool_event_cb_unregister(&mac_pool_event_reg);
 473 
 474         id_space_destroy(minor_ids);
 475         mac_flow_fini();
 476 
 477         mod_hash_destroy_hash(i_mac_impl_hash);
 478         rw_destroy(&i_mac_impl_lock);
 479 
 480         mac_client_fini();
 481         kmem_cache_destroy(mac_ring_cache);
 482 
 483         mod_hash_destroy_hash(i_mactype_hash);
 484         mac_soft_ring_finish();
 485 
 486 
 487         return (0);
 488 }
 489 
 490 /*
 491  * Initialize a GLDv3 driver's device ops.  A driver that manages its own ops
 492  * (e.g. softmac) may pass in a NULL ops argument.
 493  */
 494 void
 495 mac_init_ops(struct dev_ops *ops, const char *name)
 496 {
 497         major_t major = ddi_name_to_major((char *)name);
 498 
 499         /*
 500          * By returning on error below, we are not letting the driver continue
 501          * in an undefined context.  The mac_register() function will faill if
 502          * DN_GLDV3_DRIVER isn't set.
 503          */
 504         if (major == DDI_MAJOR_T_NONE)
 505                 return;
 506         LOCK_DEV_OPS(&devnamesp[major].dn_lock);
 507         devnamesp[major].dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
 508         UNLOCK_DEV_OPS(&devnamesp[major].dn_lock);
 509         if (ops != NULL)
 510                 dld_init_ops(ops, name);
 511 }
 512 
 513 void
 514 mac_fini_ops(struct dev_ops *ops)
 515 {
 516         dld_fini_ops(ops);
 517 }
 518 
 519 /*ARGSUSED*/
 520 static int
 521 i_mac_constructor(void *buf, void *arg, int kmflag)
 522 {
 523         mac_impl_t      *mip = buf;
 524 
 525         bzero(buf, sizeof (mac_impl_t));
 526 
 527         mip->mi_linkstate = LINK_STATE_UNKNOWN;
 528 
 529         rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
 530         mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
 531         mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
 532         mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
 533 
 534         mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
 535         cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
 536         mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
 537         cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
 538 
 539         mutex_init(&mip->mi_bridge_lock, NULL, MUTEX_DEFAULT, NULL);
 540 
 541         return (0);
 542 }
 543 
 544 /*ARGSUSED*/
 545 static void
 546 i_mac_destructor(void *buf, void *arg)
 547 {
 548         mac_impl_t      *mip = buf;
 549         mac_cb_info_t   *mcbi;
 550 
 551         ASSERT(mip->mi_ref == 0);
 552         ASSERT(mip->mi_active == 0);
 553         ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
 554         ASSERT(mip->mi_devpromisc == 0);
 555         ASSERT(mip->mi_ksp == NULL);
 556         ASSERT(mip->mi_kstat_count == 0);
 557         ASSERT(mip->mi_nclients == 0);
 558         ASSERT(mip->mi_nactiveclients == 0);
 559         ASSERT(mip->mi_single_active_client == NULL);
 560         ASSERT(mip->mi_state_flags == 0);
 561         ASSERT(mip->mi_factory_addr == NULL);
 562         ASSERT(mip->mi_factory_addr_num == 0);
 563         ASSERT(mip->mi_default_tx_ring == NULL);
 564 
 565         mcbi = &mip->mi_notify_cb_info;
 566         ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
 567         ASSERT(mip->mi_notify_bits == 0);
 568         ASSERT(mip->mi_notify_thread == NULL);
 569         ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
 570         mcbi->mcbi_lockp = NULL;
 571 
 572         mcbi = &mip->mi_promisc_cb_info;
 573         ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
 574         ASSERT(mip->mi_promisc_list == NULL);
 575         ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
 576         mcbi->mcbi_lockp = NULL;
 577 
 578         ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
 579         ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
 580 
 581         rw_destroy(&mip->mi_rw_lock);
 582 
 583         mutex_destroy(&mip->mi_promisc_lock);
 584         cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
 585         mutex_destroy(&mip->mi_notify_lock);
 586         cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
 587         mutex_destroy(&mip->mi_ring_lock);
 588 
 589         ASSERT(mip->mi_bridge_link == NULL);
 590 }
 591 
 592 /* ARGSUSED */
 593 static int
 594 i_mac_ring_ctor(void *buf, void *arg, int kmflag)
 595 {
 596         mac_ring_t *ring = (mac_ring_t *)buf;
 597 
 598         bzero(ring, sizeof (mac_ring_t));
 599         cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
 600         mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
 601         ring->mr_state = MR_FREE;
 602         return (0);
 603 }
 604 
 605 /* ARGSUSED */
 606 static void
 607 i_mac_ring_dtor(void *buf, void *arg)
 608 {
 609         mac_ring_t *ring = (mac_ring_t *)buf;
 610 
 611         cv_destroy(&ring->mr_cv);
 612         mutex_destroy(&ring->mr_lock);
 613 }
 614 
 615 /*
 616  * Common functions to do mac callback addition and deletion. Currently this is
 617  * used by promisc callbacks and notify callbacks. List addition and deletion
 618  * need to take care of list walkers. List walkers in general, can't hold list
 619  * locks and make upcall callbacks due to potential lock order and recursive
 620  * reentry issues. Instead list walkers increment the list walker count to mark
 621  * the presence of a walker thread. Addition can be carefully done to ensure
 622  * that the list walker always sees either the old list or the new list.
 623  * However the deletion can't be done while the walker is active, instead the
 624  * deleting thread simply marks the entry as logically deleted. The last walker
 625  * physically deletes and frees up the logically deleted entries when the walk
 626  * is complete.
 627  */
 628 void
 629 mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
 630     mac_cb_t *mcb_elem)
 631 {
 632         mac_cb_t        *p;
 633         mac_cb_t        **pp;
 634 
 635         /* Verify it is not already in the list */
 636         for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
 637                 if (p == mcb_elem)
 638                         break;
 639         }
 640         VERIFY(p == NULL);
 641 
 642         /*
 643          * Add it to the head of the callback list. The membar ensures that
 644          * the following list pointer manipulations reach global visibility
 645          * in exactly the program order below.
 646          */
 647         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 648 
 649         mcb_elem->mcb_nextp = *mcb_head;
 650         membar_producer();
 651         *mcb_head = mcb_elem;
 652 }
 653 
 654 /*
 655  * Mark the entry as logically deleted. If there aren't any walkers unlink
 656  * from the list. In either case return the corresponding status.
 657  */
 658 boolean_t
 659 mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
 660     mac_cb_t *mcb_elem)
 661 {
 662         mac_cb_t        *p;
 663         mac_cb_t        **pp;
 664 
 665         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 666         /*
 667          * Search the callback list for the entry to be removed
 668          */
 669         for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
 670                 if (p == mcb_elem)
 671                         break;
 672         }
 673         VERIFY(p != NULL);
 674 
 675         /*
 676          * If there are walkers just mark it as deleted and the last walker
 677          * will remove from the list and free it.
 678          */
 679         if (mcbi->mcbi_walker_cnt != 0) {
 680                 p->mcb_flags |= MCB_CONDEMNED;
 681                 mcbi->mcbi_del_cnt++;
 682                 return (B_FALSE);
 683         }
 684 
 685         ASSERT(mcbi->mcbi_del_cnt == 0);
 686         *pp = p->mcb_nextp;
 687         p->mcb_nextp = NULL;
 688         return (B_TRUE);
 689 }
 690 
 691 /*
 692  * Wait for all pending callback removals to be completed
 693  */
 694 void
 695 mac_callback_remove_wait(mac_cb_info_t *mcbi)
 696 {
 697         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 698         while (mcbi->mcbi_del_cnt != 0) {
 699                 DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
 700                 cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
 701         }
 702 }
 703 
 704 /*
 705  * The last mac callback walker does the cleanup. Walk the list and unlik
 706  * all the logically deleted entries and construct a temporary list of
 707  * removed entries. Return the list of removed entries to the caller.
 708  */
 709 mac_cb_t *
 710 mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
 711 {
 712         mac_cb_t        *p;
 713         mac_cb_t        **pp;
 714         mac_cb_t        *rmlist = NULL;         /* List of removed elements */
 715         int     cnt = 0;
 716 
 717         ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
 718         ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
 719 
 720         pp = mcb_head;
 721         while (*pp != NULL) {
 722                 if ((*pp)->mcb_flags & MCB_CONDEMNED) {
 723                         p = *pp;
 724                         *pp = p->mcb_nextp;
 725                         p->mcb_nextp = rmlist;
 726                         rmlist = p;
 727                         cnt++;
 728                         continue;
 729                 }
 730                 pp = &(*pp)->mcb_nextp;
 731         }
 732 
 733         ASSERT(mcbi->mcbi_del_cnt == cnt);
 734         mcbi->mcbi_del_cnt = 0;
 735         return (rmlist);
 736 }
 737 
 738 boolean_t
 739 mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
 740 {
 741         mac_cb_t        *mcb;
 742 
 743         /* Verify it is not already in the list */
 744         for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
 745                 if (mcb == mcb_elem)
 746                         return (B_TRUE);
 747         }
 748 
 749         return (B_FALSE);
 750 }
 751 
 752 boolean_t
 753 mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
 754 {
 755         boolean_t       found;
 756 
 757         mutex_enter(mcbi->mcbi_lockp);
 758         found = mac_callback_lookup(mcb_headp, mcb_elem);
 759         mutex_exit(mcbi->mcbi_lockp);
 760 
 761         return (found);
 762 }
 763 
 764 /* Free the list of removed callbacks */
 765 void
 766 mac_callback_free(mac_cb_t *rmlist)
 767 {
 768         mac_cb_t        *mcb;
 769         mac_cb_t        *mcb_next;
 770 
 771         for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
 772                 mcb_next = mcb->mcb_nextp;
 773                 kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
 774         }
 775 }
 776 
 777 /*
 778  * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
 779  * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
 780  * is only a single shared total walker count, and an entry can't be physically
 781  * unlinked if a walker is active on either list. The last walker does this
 782  * cleanup of logically deleted entries.
 783  */
 784 void
 785 i_mac_promisc_walker_cleanup(mac_impl_t *mip)
 786 {
 787         mac_cb_t        *rmlist;
 788         mac_cb_t        *mcb;
 789         mac_cb_t        *mcb_next;
 790         mac_promisc_impl_t      *mpip;
 791 
 792         /*
 793          * Construct a temporary list of deleted callbacks by walking the
 794          * the mi_promisc_list. Then for each entry in the temporary list,
 795          * remove it from the mci_promisc_list and free the entry.
 796          */
 797         rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
 798             &mip->mi_promisc_list);
 799 
 800         for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
 801                 mcb_next = mcb->mcb_nextp;
 802                 mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
 803                 VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
 804                     &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
 805                 mcb->mcb_flags = 0;
 806                 mcb->mcb_nextp = NULL;
 807                 kmem_cache_free(mac_promisc_impl_cache, mpip);
 808         }
 809 }
 810 
 811 void
 812 i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
 813 {
 814         mac_cb_info_t   *mcbi;
 815 
 816         /*
 817          * Signal the notify thread even after mi_ref has become zero and
 818          * mi_disabled is set. The synchronization with the notify thread
 819          * happens in mac_unregister and that implies the driver must make
 820          * sure it is single-threaded (with respect to mac calls) and that
 821          * all pending mac calls have returned before it calls mac_unregister
 822          */
 823         rw_enter(&i_mac_impl_lock, RW_READER);
 824         if (mip->mi_state_flags & MIS_DISABLED)
 825                 goto exit;
 826 
 827         /*
 828          * Guard against incorrect notifications.  (Running a newer
 829          * mac client against an older implementation?)
 830          */
 831         if (type >= MAC_NNOTE)
 832                 goto exit;
 833 
 834         mcbi = &mip->mi_notify_cb_info;
 835         mutex_enter(mcbi->mcbi_lockp);
 836         mip->mi_notify_bits |= (1 << type);
 837         cv_broadcast(&mcbi->mcbi_cv);
 838         mutex_exit(mcbi->mcbi_lockp);
 839 
 840 exit:
 841         rw_exit(&i_mac_impl_lock);
 842 }
 843 
 844 /*
 845  * Mac serialization primitives. Please see the block comment at the
 846  * top of the file.
 847  */
 848 void
 849 i_mac_perim_enter(mac_impl_t *mip)
 850 {
 851         mac_client_impl_t       *mcip;
 852 
 853         if (mip->mi_state_flags & MIS_IS_VNIC) {
 854                 /*
 855                  * This is a VNIC. Return the lower mac since that is what
 856                  * we want to serialize on.
 857                  */
 858                 mcip = mac_vnic_lower(mip);
 859                 mip = mcip->mci_mip;
 860         }
 861 
 862         mutex_enter(&mip->mi_perim_lock);
 863         if (mip->mi_perim_owner == curthread) {
 864                 mip->mi_perim_ocnt++;
 865                 mutex_exit(&mip->mi_perim_lock);
 866                 return;
 867         }
 868 
 869         while (mip->mi_perim_owner != NULL)
 870                 cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
 871 
 872         mip->mi_perim_owner = curthread;
 873         ASSERT(mip->mi_perim_ocnt == 0);
 874         mip->mi_perim_ocnt++;
 875 #ifdef DEBUG
 876         mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
 877             MAC_PERIM_STACK_DEPTH);
 878 #endif
 879         mutex_exit(&mip->mi_perim_lock);
 880 }
 881 
 882 int
 883 i_mac_perim_enter_nowait(mac_impl_t *mip)
 884 {
 885         /*
 886          * The vnic is a special case, since the serialization is done based
 887          * on the lower mac. If the lower mac is busy, it does not imply the
 888          * vnic can't be unregistered. But in the case of other drivers,
 889          * a busy perimeter or open mac handles implies that the mac is busy
 890          * and can't be unregistered.
 891          */
 892         if (mip->mi_state_flags & MIS_IS_VNIC) {
 893                 i_mac_perim_enter(mip);
 894                 return (0);
 895         }
 896 
 897         mutex_enter(&mip->mi_perim_lock);
 898         if (mip->mi_perim_owner != NULL) {
 899                 mutex_exit(&mip->mi_perim_lock);
 900                 return (EBUSY);
 901         }
 902         ASSERT(mip->mi_perim_ocnt == 0);
 903         mip->mi_perim_owner = curthread;
 904         mip->mi_perim_ocnt++;
 905         mutex_exit(&mip->mi_perim_lock);
 906 
 907         return (0);
 908 }
 909 
 910 void
 911 i_mac_perim_exit(mac_impl_t *mip)
 912 {
 913         mac_client_impl_t *mcip;
 914 
 915         if (mip->mi_state_flags & MIS_IS_VNIC) {
 916                 /*
 917                  * This is a VNIC. Return the lower mac since that is what
 918                  * we want to serialize on.
 919                  */
 920                 mcip = mac_vnic_lower(mip);
 921                 mip = mcip->mci_mip;
 922         }
 923 
 924         ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);
 925 
 926         mutex_enter(&mip->mi_perim_lock);
 927         if (--mip->mi_perim_ocnt == 0) {
 928                 mip->mi_perim_owner = NULL;
 929                 cv_signal(&mip->mi_perim_cv);
 930         }
 931         mutex_exit(&mip->mi_perim_lock);
 932 }
 933 
 934 /*
 935  * Returns whether the current thread holds the mac perimeter. Used in making
 936  * assertions.
 937  */
 938 boolean_t
 939 mac_perim_held(mac_handle_t mh)
 940 {
 941         mac_impl_t      *mip = (mac_impl_t *)mh;
 942         mac_client_impl_t *mcip;
 943 
 944         if (mip->mi_state_flags & MIS_IS_VNIC) {
 945                 /*
 946                  * This is a VNIC. Return the lower mac since that is what
 947                  * we want to serialize on.
 948                  */
 949                 mcip = mac_vnic_lower(mip);
 950                 mip = mcip->mci_mip;
 951         }
 952         return (mip->mi_perim_owner == curthread);
 953 }
 954 
 955 /*
 956  * mac client interfaces to enter the mac perimeter of a mac end point, given
 957  * its mac handle, or macname or linkid.
 958  */
 959 void
 960 mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
 961 {
 962         mac_impl_t      *mip = (mac_impl_t *)mh;
 963 
 964         i_mac_perim_enter(mip);
 965         /*
 966          * The mac_perim_handle_t returned encodes the 'mip' and whether a
 967          * mac_open has been done internally while entering the perimeter.
 968          * This information is used in mac_perim_exit
 969          */
 970         MAC_ENCODE_MPH(*mphp, mip, 0);
 971 }
 972 
 973 int
 974 mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
 975 {
 976         int     err;
 977         mac_handle_t    mh;
 978 
 979         if ((err = mac_open(name, &mh)) != 0)
 980                 return (err);
 981 
 982         mac_perim_enter_by_mh(mh, mphp);
 983         MAC_ENCODE_MPH(*mphp, mh, 1);
 984         return (0);
 985 }
 986 
 987 int
 988 mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
 989 {
 990         int     err;
 991         mac_handle_t    mh;
 992 
 993         if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
 994                 return (err);
 995 
 996         mac_perim_enter_by_mh(mh, mphp);
 997         MAC_ENCODE_MPH(*mphp, mh, 1);
 998         return (0);
 999 }
1000 
1001 void
1002 mac_perim_exit(mac_perim_handle_t mph)
1003 {
1004         mac_impl_t      *mip;
1005         boolean_t       need_close;
1006 
1007         MAC_DECODE_MPH(mph, mip, need_close);
1008         i_mac_perim_exit(mip);
1009         if (need_close)
1010                 mac_close((mac_handle_t)mip);
1011 }
1012 
1013 int
1014 mac_hold(const char *macname, mac_impl_t **pmip)
1015 {
1016         mac_impl_t      *mip;
1017         int             err;
1018 
1019         /*
1020          * Check the device name length to make sure it won't overflow our
1021          * buffer.
1022          */
1023         if (strlen(macname) >= MAXNAMELEN)
1024                 return (EINVAL);
1025 
1026         /*
1027          * Look up its entry in the global hash table.
1028          */
1029         rw_enter(&i_mac_impl_lock, RW_WRITER);
1030         err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
1031             (mod_hash_val_t *)&mip);
1032 
1033         if (err != 0) {
1034                 rw_exit(&i_mac_impl_lock);
1035                 return (ENOENT);
1036         }
1037 
1038         if (mip->mi_state_flags & MIS_DISABLED) {
1039                 rw_exit(&i_mac_impl_lock);
1040                 return (ENOENT);
1041         }
1042 
1043         if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
1044                 rw_exit(&i_mac_impl_lock);
1045                 return (EBUSY);
1046         }
1047 
1048         mip->mi_ref++;
1049         rw_exit(&i_mac_impl_lock);
1050 
1051         *pmip = mip;
1052         return (0);
1053 }
1054 
1055 void
1056 mac_rele(mac_impl_t *mip)
1057 {
1058         rw_enter(&i_mac_impl_lock, RW_WRITER);
1059         ASSERT(mip->mi_ref != 0);
1060         if (--mip->mi_ref == 0) {
1061                 ASSERT(mip->mi_nactiveclients == 0 &&
1062                     !(mip->mi_state_flags & MIS_EXCLUSIVE));
1063         }
1064         rw_exit(&i_mac_impl_lock);
1065 }
1066 
/*
 * Private GLDv3 function to start a MAC instance.
 *
 * mi_active counts the outstanding starts. Only the transition from zero
 * actually starts the device, the default tx ring (if any, and if not
 * already MR_INUSE) and the default RX group (if any). Any failure undoes
 * the mi_active increment and returns the error; 0 is returned otherwise.
 * The caller must hold the mac perimeter.
 */
int
mac_start(mac_handle_t mh)
{
        mac_impl_t      *mip = (mac_impl_t *)mh;
        int             err = 0;
        mac_group_t     *defgrp;

        ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
        ASSERT(mip->mi_start != NULL);

        /*
         * Check whether the device is already started.
         */
        if (mip->mi_active++ == 0) {
                mac_ring_t *ring = NULL;

                /*
                 * Start the device.
                 */
                err = mip->mi_start(mip->mi_driver);
                if (err != 0) {
                        mip->mi_active--;
                        return (err);
                }

                /*
                 * Start the default tx ring.
                 */
                if (mip->mi_default_tx_ring != NULL) {

                        ring = (mac_ring_t *)mip->mi_default_tx_ring;
                        if (ring->mr_state != MR_INUSE) {
                                err = mac_start_ring(ring);
                                if (err != 0) {
                                        /*
                                         * NOTE(review): mi_start() succeeded
                                         * above but mi_stop() is not called
                                         * on this path - confirm that is
                                         * intentional.
                                         */
                                        mip->mi_active--;
                                        return (err);
                                }
                        }
                }

                if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
                        /*
                         * Start the default ring, since it will be needed
                         * to receive broadcast and multicast traffic for
                         * both primary and non-primary MAC clients.
                         */
                        ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED);
                        err = mac_start_group_and_rings(defgrp);
                        if (err != 0) {
                                /*
                                 * Undo the count and stop the default tx
                                 * ring if it is in use. NOTE(review): the
                                 * device itself is not stopped here either
                                 * - confirm.
                                 */
                                mip->mi_active--;
                                if ((ring != NULL) &&
                                    (ring->mr_state == MR_INUSE))
                                        mac_stop_ring(ring);
                                return (err);
                        }
                        mac_set_group_state(defgrp, MAC_GROUP_STATE_SHARED);
                }
        }

        return (err);
}
1131 
1132 /*
1133  * Private GLDv3 function to stop a MAC instance.
1134  */
1135 void
1136 mac_stop(mac_handle_t mh)
1137 {
1138         mac_impl_t      *mip = (mac_impl_t *)mh;
1139         mac_group_t     *grp;
1140 
1141         ASSERT(mip->mi_stop != NULL);
1142         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1143 
1144         /*
1145          * Check whether the device is still needed.
1146          */
1147         ASSERT(mip->mi_active != 0);
1148         if (--mip->mi_active == 0) {
1149                 if ((grp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
1150                         /*
1151                          * There should be no more active clients since the
1152                          * MAC is being stopped. Stop the default RX group
1153                          * and transition it back to registered state.
1154                          *
1155                          * When clients are torn down, the groups
1156                          * are release via mac_release_rx_group which
1157                          * knows the the default group is always in
1158                          * started mode since broadcast uses it. So
1159                          * we can assert that their are no clients
1160                          * (since mac_bcast_add doesn't register itself
1161                          * as a client) and group is in SHARED state.
1162                          */
1163                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
1164                         ASSERT(MAC_GROUP_NO_CLIENT(grp) &&
1165                             mip->mi_nactiveclients == 0);
1166                         mac_stop_group_and_rings(grp);
1167                         mac_set_group_state(grp, MAC_GROUP_STATE_REGISTERED);
1168                 }
1169 
1170                 if (mip->mi_default_tx_ring != NULL) {
1171                         mac_ring_t *ring;
1172 
1173                         ring = (mac_ring_t *)mip->mi_default_tx_ring;
1174                         if (ring->mr_state == MR_INUSE) {
1175                                 mac_stop_ring(ring);
1176                                 ring->mr_flag = 0;
1177                         }
1178                 }
1179 
1180                 /*
1181                  * Stop the device.
1182                  */
1183                 mip->mi_stop(mip->mi_driver);
1184         }
1185 }
1186 
1187 int
1188 i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
1189 {
1190         int             err = 0;
1191 
1192         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1193         ASSERT(mip->mi_setpromisc != NULL);
1194 
1195         if (on) {
1196                 /*
1197                  * Enable promiscuous mode on the device if not yet enabled.
1198                  */
1199                 if (mip->mi_devpromisc++ == 0) {
1200                         err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
1201                         if (err != 0) {
1202                                 mip->mi_devpromisc--;
1203                                 return (err);
1204                         }
1205                         i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1206                 }
1207         } else {
1208                 if (mip->mi_devpromisc == 0)
1209                         return (EPROTO);
1210 
1211                 /*
1212                  * Disable promiscuous mode on the device if this is the last
1213                  * enabling.
1214                  */
1215                 if (--mip->mi_devpromisc == 0) {
1216                         err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
1217                         if (err != 0) {
1218                                 mip->mi_devpromisc++;
1219                                 return (err);
1220                         }
1221                         i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1222                 }
1223         }
1224 
1225         return (0);
1226 }
1227 
1228 /*
1229  * The promiscuity state can change any time. If the caller needs to take
1230  * actions that are atomic with the promiscuity state, then the caller needs
1231  * to bracket the entire sequence with mac_perim_enter/exit
1232  */
1233 boolean_t
1234 mac_promisc_get(mac_handle_t mh)
1235 {
1236         mac_impl_t              *mip = (mac_impl_t *)mh;
1237 
1238         /*
1239          * Return the current promiscuity.
1240          */
1241         return (mip->mi_devpromisc != 0);
1242 }
1243 
1244 /*
1245  * Invoked at MAC instance attach time to initialize the list
1246  * of factory MAC addresses supported by a MAC instance. This function
1247  * builds a local cache in the mac_impl_t for the MAC addresses
1248  * supported by the underlying hardware. The MAC clients themselves
1249  * use the mac_addr_factory*() functions to query and reserve
1250  * factory MAC addresses.
1251  */
1252 void
1253 mac_addr_factory_init(mac_impl_t *mip)
1254 {
1255         mac_capab_multifactaddr_t capab;
1256         uint8_t *addr;
1257         int i;
1258 
1259         /*
1260          * First round to see how many factory MAC addresses are available.
1261          */
1262         bzero(&capab, sizeof (capab));
1263         if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
1264             &capab) || (capab.mcm_naddr == 0)) {
1265                 /*
1266                  * The MAC instance doesn't support multiple factory
1267                  * MAC addresses, we're done here.
1268                  */
1269                 return;
1270         }
1271 
1272         /*
1273          * Allocate the space and get all the factory addresses.
1274          */
1275         addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
1276         capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);
1277 
1278         mip->mi_factory_addr_num = capab.mcm_naddr;
1279         mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
1280             sizeof (mac_factory_addr_t), KM_SLEEP);
1281 
1282         for (i = 0; i < capab.mcm_naddr; i++) {
1283                 bcopy(addr + i * MAXMACADDRLEN,
1284                     mip->mi_factory_addr[i].mfa_addr,
1285                     mip->mi_type->mt_addr_length);
1286                 mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
1287         }
1288 
1289         kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
1290 }
1291 
1292 void
1293 mac_addr_factory_fini(mac_impl_t *mip)
1294 {
1295         if (mip->mi_factory_addr == NULL) {
1296                 ASSERT(mip->mi_factory_addr_num == 0);
1297                 return;
1298         }
1299 
1300         kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1301             sizeof (mac_factory_addr_t));
1302 
1303         mip->mi_factory_addr = NULL;
1304         mip->mi_factory_addr_num = 0;
1305 }
1306 
1307 /*
1308  * Reserve a factory MAC address. If *slot is set to -1, the function
1309  * attempts to reserve any of the available factory MAC addresses and
1310  * returns the reserved slot id. If no slots are available, the function
1311  * returns ENOSPC. If *slot is not set to -1, the function reserves
1312  * the specified slot if it is available, or returns EBUSY is the slot
1313  * is already used. Returns ENOTSUP if the underlying MAC does not
1314  * support multiple factory addresses. If the slot number is not -1 but
1315  * is invalid, returns EINVAL.
1316  */
1317 int
1318 mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
1319 {
1320         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1321         mac_impl_t *mip = mcip->mci_mip;
1322         int i, ret = 0;
1323 
1324         i_mac_perim_enter(mip);
1325         /*
1326          * Protect against concurrent readers that may need a self-consistent
1327          * view of the factory addresses
1328          */
1329         rw_enter(&mip->mi_rw_lock, RW_WRITER);
1330 
1331         if (mip->mi_factory_addr_num == 0) {
1332                 ret = ENOTSUP;
1333                 goto bail;
1334         }
1335 
1336         if (*slot != -1) {
1337                 /* check the specified slot */
1338                 if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
1339                         ret = EINVAL;
1340                         goto bail;
1341                 }
1342                 if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
1343                         ret = EBUSY;
1344                         goto bail;
1345                 }
1346         } else {
1347                 /* pick the next available slot */
1348                 for (i = 0; i < mip->mi_factory_addr_num; i++) {
1349                         if (!mip->mi_factory_addr[i].mfa_in_use)
1350                                 break;
1351                 }
1352 
1353                 if (i == mip->mi_factory_addr_num) {
1354                         ret = ENOSPC;
1355                         goto bail;
1356                 }
1357                 *slot = i+1;
1358         }
1359 
1360         mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
1361         mip->mi_factory_addr[*slot-1].mfa_client = mcip;
1362 
1363 bail:
1364         rw_exit(&mip->mi_rw_lock);
1365         i_mac_perim_exit(mip);
1366         return (ret);
1367 }
1368 
1369 /*
1370  * Release the specified factory MAC address slot.
1371  */
1372 void
1373 mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
1374 {
1375         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1376         mac_impl_t *mip = mcip->mci_mip;
1377 
1378         i_mac_perim_enter(mip);
1379         /*
1380          * Protect against concurrent readers that may need a self-consistent
1381          * view of the factory addresses
1382          */
1383         rw_enter(&mip->mi_rw_lock, RW_WRITER);
1384 
1385         ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1386         ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
1387 
1388         mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
1389 
1390         rw_exit(&mip->mi_rw_lock);
1391         i_mac_perim_exit(mip);
1392 }
1393 
1394 /*
1395  * Stores in mac_addr the value of the specified MAC address. Returns
1396  * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1397  * The caller must provide a string of at least MAXNAMELEN bytes.
1398  */
1399 void
1400 mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
1401     uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
1402 {
1403         mac_impl_t *mip = (mac_impl_t *)mh;
1404         boolean_t in_use;
1405 
1406         ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1407 
1408         /*
1409          * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
1410          * and mi_rw_lock
1411          */
1412         rw_enter(&mip->mi_rw_lock, RW_READER);
1413         bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
1414         *addr_len = mip->mi_type->mt_addr_length;
1415         in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
1416         if (in_use && client_name != NULL) {
1417                 bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
1418                     client_name, MAXNAMELEN);
1419         }
1420         if (in_use_arg != NULL)
1421                 *in_use_arg = in_use;
1422         rw_exit(&mip->mi_rw_lock);
1423 }
1424 
1425 /*
1426  * Returns the number of factory MAC addresses (in addition to the
1427  * primary MAC address), 0 if the underlying MAC doesn't support
1428  * that feature.
1429  */
1430 uint_t
1431 mac_addr_factory_num(mac_handle_t mh)
1432 {
1433         mac_impl_t *mip = (mac_impl_t *)mh;
1434 
1435         return (mip->mi_factory_addr_num);
1436 }
1437 
1438 
1439 void
1440 mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1441 {
1442         mac_ring_t      *ring;
1443 
1444         for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1445                 ring->mr_flag &= ~flag;
1446 }
1447 
1448 /*
1449  * The following mac_hwrings_xxx() functions are private mac client functions
1450  * used by the aggr driver to access and control the underlying HW Rx group
1451  * and rings. In this case, the aggr driver has exclusive control of the
1452  * underlying HW Rx group/rings, it calls the following functions to
1453  * start/stop the HW Rx rings, disable/enable polling, add/remove mac'
1454  * addresses, or set up the Rx callback.
1455  */
1456 /* ARGSUSED */
1457 static void
1458 mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1459     mblk_t *mp_chain, boolean_t loopback)
1460 {
1461         mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
1462         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1463         mac_direct_rx_t         proc;
1464         void                    *arg1;
1465         mac_resource_handle_t   arg2;
1466 
1467         proc = srs_rx->sr_func;
1468         arg1 = srs_rx->sr_arg1;
1469         arg2 = mac_srs->srs_mrh;
1470 
1471         proc(arg1, arg2, mp_chain, NULL);
1472 }
1473 
1474 /*
1475  * This function is called to get the list of HW rings that are reserved by
1476  * an exclusive mac client.
1477  *
1478  * Return value: the number of HW rings.
1479  */
1480 int
1481 mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
1482     mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
1483 {
1484         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1485         flow_entry_t            *flent = mcip->mci_flent;
1486         mac_group_t             *grp;
1487         mac_ring_t              *ring;
1488         int                     cnt = 0;
1489 
1490         if (rtype == MAC_RING_TYPE_RX) {
1491                 grp = flent->fe_rx_ring_group;
1492         } else if (rtype == MAC_RING_TYPE_TX) {
1493                 grp = flent->fe_tx_ring_group;
1494         } else {
1495                 ASSERT(B_FALSE);
1496                 return (-1);
1497         }
1498         /*
1499          * The mac client did not reserve any RX group, return directly.
1500          * This is probably because the underlying MAC does not support
1501          * any groups.
1502          */
1503         if (hwgh != NULL)
1504                 *hwgh = NULL;
1505         if (grp == NULL)
1506                 return (0);
1507         /*
1508          * This group must be reserved by this mac client.
1509          */
1510         ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
1511             (mcip == MAC_GROUP_ONLY_CLIENT(grp)));
1512 
1513         for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) {
1514                 ASSERT(cnt < MAX_RINGS_PER_GROUP);
1515                 hwrh[cnt] = (mac_ring_handle_t)ring;
1516         }
1517         if (hwgh != NULL)
1518                 *hwgh = (mac_group_handle_t)grp;
1519 
1520         return (cnt);
1521 }
1522 
1523 /*
1524  * This function is called to get info about Tx/Rx rings.
1525  *
1526  * Return value: returns uint_t which will have various bits set
1527  * that indicates different properties of the ring.
1528  */
1529 uint_t
1530 mac_hwring_getinfo(mac_ring_handle_t rh)
1531 {
1532         mac_ring_t *ring = (mac_ring_t *)rh;
1533         mac_ring_info_t *info = &ring->mr_info;
1534 
1535         return (info->mri_flags);
1536 }
1537 
1538 /*
1539  * Export ddi interrupt handles from the HW ring to the pseudo ring and
1540  * setup the RX callback of the mac client which exclusively controls
1541  * HW ring.
1542  */
1543 void
1544 mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh,
1545     mac_ring_handle_t pseudo_rh)
1546 {
1547         mac_ring_t              *hw_ring = (mac_ring_t *)hwrh;
1548         mac_ring_t              *pseudo_ring;
1549         mac_soft_ring_set_t     *mac_srs = hw_ring->mr_srs;
1550 
1551         if (pseudo_rh != NULL) {
1552                 pseudo_ring = (mac_ring_t *)pseudo_rh;
1553                 /* Export the ddi handles to pseudo ring */
1554                 pseudo_ring->mr_info.mri_intr.mi_ddi_handle =
1555                     hw_ring->mr_info.mri_intr.mi_ddi_handle;
1556                 pseudo_ring->mr_info.mri_intr.mi_ddi_shared =
1557                     hw_ring->mr_info.mri_intr.mi_ddi_shared;
1558                 /*
1559                  * Save a pointer to pseudo ring in the hw ring. If
1560                  * interrupt handle changes, the hw ring will be
1561                  * notified of the change (see mac_ring_intr_set())
1562                  * and the appropriate change has to be made to
1563                  * the pseudo ring that has exported the ddi handle.
1564                  */
1565                 hw_ring->mr_prh = pseudo_rh;
1566         }
1567 
1568         if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
1569                 ASSERT(!(mac_srs->srs_type & SRST_TX));
1570                 mac_srs->srs_mrh = prh;
1571                 mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
1572         }
1573 }
1574 
1575 void
1576 mac_hwring_teardown(mac_ring_handle_t hwrh)
1577 {
1578         mac_ring_t              *hw_ring = (mac_ring_t *)hwrh;
1579         mac_soft_ring_set_t     *mac_srs;
1580 
1581         if (hw_ring == NULL)
1582                 return;
1583         hw_ring->mr_prh = NULL;
1584         if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
1585                 mac_srs = hw_ring->mr_srs;
1586                 ASSERT(!(mac_srs->srs_type & SRST_TX));
1587                 mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
1588                 mac_srs->srs_mrh = NULL;
1589         }
1590 }
1591 
1592 int
1593 mac_hwring_disable_intr(mac_ring_handle_t rh)
1594 {
1595         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1596         mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1597 
1598         return (intr->mi_disable(intr->mi_handle));
1599 }
1600 
1601 int
1602 mac_hwring_enable_intr(mac_ring_handle_t rh)
1603 {
1604         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1605         mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1606 
1607         return (intr->mi_enable(intr->mi_handle));
1608 }
1609 
/*
 * Start a HW ring on behalf of its exclusive client by removing the
 * MR_QUIESCE mark from it (see mac_hwring_stop() for the counterpart).
 * Always returns 0.
 */
int
mac_hwring_start(mac_ring_handle_t rh)
{
        mac_ring_t *rr_ring = (mac_ring_t *)rh;

        /* MAC_RING_UNMARK is a macro defined outside this file section. */
        MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
        return (0);
}
1618 
1619 void
1620 mac_hwring_stop(mac_ring_handle_t rh)
1621 {
1622         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1623 
1624         mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1625 }
1626 
1627 mblk_t *
1628 mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1629 {
1630         mac_ring_t *rr_ring = (mac_ring_t *)rh;
1631         mac_ring_info_t *info = &rr_ring->mr_info;
1632 
1633         return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1634 }
1635 
1636 /*
1637  * Send packets through a selected tx ring.
1638  */
1639 mblk_t *
1640 mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
1641 {
1642         mac_ring_t *ring = (mac_ring_t *)rh;
1643         mac_ring_info_t *info = &ring->mr_info;
1644 
1645         ASSERT(ring->mr_type == MAC_RING_TYPE_TX &&
1646             ring->mr_state >= MR_INUSE);
1647         return (info->mri_tx(info->mri_driver, mp));
1648 }
1649 
1650 /*
1651  * Query stats for a particular rx/tx ring
1652  */
1653 int
1654 mac_hwring_getstat(mac_ring_handle_t rh, uint_t stat, uint64_t *val)
1655 {
1656         mac_ring_t      *ring = (mac_ring_t *)rh;
1657         mac_ring_info_t *info = &ring->mr_info;
1658 
1659         return (info->mri_stat(info->mri_driver, stat, val));
1660 }
1661 
1662 /*
1663  * Private function that is only used by aggr to send packets through
1664  * a port/Tx ring. Since aggr exposes a pseudo Tx ring even for ports
1665  * that does not expose Tx rings, aggr_ring_tx() entry point needs
1666  * access to mac_impl_t to send packets through m_tx() entry point.
1667  * It accomplishes this by calling mac_hwring_send_priv() function.
1668  */
1669 mblk_t *
1670 mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp)
1671 {
1672         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1673         mac_impl_t *mip = mcip->mci_mip;
1674 
1675         MAC_TX(mip, rh, mp, mcip);
1676         return (mp);
1677 }
1678 
1679 int
1680 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1681 {
1682         mac_group_t *group = (mac_group_t *)gh;
1683 
1684         return (mac_group_addmac(group, addr));
1685 }
1686 
1687 int
1688 mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1689 {
1690         mac_group_t *group = (mac_group_t *)gh;
1691 
1692         return (mac_group_remmac(group, addr));
1693 }
1694 
1695 /*
1696  * Set the RX group to be shared/reserved. Note that the group must be
1697  * started/stopped outside of this function.
1698  */
1699 void
1700 mac_set_group_state(mac_group_t *grp, mac_group_state_t state)
1701 {
1702         /*
1703          * If there is no change in the group state, just return.
1704          */
1705         if (grp->mrg_state == state)
1706                 return;
1707 
1708         switch (state) {
1709         case MAC_GROUP_STATE_RESERVED:
1710                 /*
1711                  * Successfully reserved the group.
1712                  *
1713                  * Given that there is an exclusive client controlling this
1714                  * group, we enable the group level polling when available,
1715                  * so that SRSs get to turn on/off individual rings they's
1716                  * assigned to.
1717                  */
1718                 ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1719 
1720                 if (grp->mrg_type == MAC_RING_TYPE_RX &&
1721                     GROUP_INTR_DISABLE_FUNC(grp) != NULL) {
1722                         GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1723                 }
1724                 break;
1725 
1726         case MAC_GROUP_STATE_SHARED:
1727                 /*
1728                  * Set all rings of this group to software classified.
1729                  * If the group has an overriding interrupt, then re-enable it.
1730                  */
1731                 ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1732 
1733                 if (grp->mrg_type == MAC_RING_TYPE_RX &&
1734                     GROUP_INTR_ENABLE_FUNC(grp) != NULL) {
1735                         GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1736                 }
1737                 /* The ring is not available for reservations any more */
1738                 break;
1739 
1740         case MAC_GROUP_STATE_REGISTERED:
1741                 /* Also callable from mac_register, perim is not held */
1742                 break;
1743 
1744         default:
1745                 ASSERT(B_FALSE);
1746                 break;
1747         }
1748 
1749         grp->mrg_state = state;
1750 }
1751 
1752 /*
1753  * Quiesce future hardware classified packets for the specified Rx ring
1754  */
1755 static void
1756 mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
1757 {
1758         ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
1759         ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);
1760 
1761         mutex_enter(&rx_ring->mr_lock);
1762         rx_ring->mr_flag |= ring_flag;
1763         while (rx_ring->mr_refcnt != 0)
1764                 cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
1765         mutex_exit(&rx_ring->mr_lock);
1766 }
1767 
1768 /*
1769  * Please see mac_tx for details about the per cpu locking scheme
1770  */
1771 static void
1772 mac_tx_lock_all(mac_client_impl_t *mcip)
1773 {
1774         int     i;
1775 
1776         for (i = 0; i <= mac_tx_percpu_cnt; i++)
1777                 mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1778 }
1779 
1780 static void
1781 mac_tx_unlock_all(mac_client_impl_t *mcip)
1782 {
1783         int     i;
1784 
1785         for (i = mac_tx_percpu_cnt; i >= 0; i--)
1786                 mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1787 }
1788 
1789 static void
1790 mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1791 {
1792         int     i;
1793 
1794         for (i = mac_tx_percpu_cnt; i > 0; i--)
1795                 mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1796 }
1797 
1798 static int
1799 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1800 {
1801         int     i;
1802         int     refcnt = 0;
1803 
1804         for (i = 0; i <= mac_tx_percpu_cnt; i++)
1805                 refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1806 
1807         return (refcnt);
1808 }
1809 
1810 /*
1811  * Stop future Tx packets coming down from the client in preparation for
1812  * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1813  * of rings between clients
1814  */
1815 void
1816 mac_tx_client_block(mac_client_impl_t *mcip)
1817 {
1818         mac_tx_lock_all(mcip);
1819         mcip->mci_tx_flag |= MCI_TX_QUIESCE;
1820         while (mac_tx_sum_refcnt(mcip) != 0) {
1821                 mac_tx_unlock_allbutzero(mcip);
1822                 cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1823                 mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1824                 mac_tx_lock_all(mcip);
1825         }
1826         mac_tx_unlock_all(mcip);
1827 }
1828 
1829 void
1830 mac_tx_client_unblock(mac_client_impl_t *mcip)
1831 {
1832         mac_tx_lock_all(mcip);
1833         mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
1834         mac_tx_unlock_all(mcip);
1835         /*
1836          * We may fail to disable flow control for the last MAC_NOTE_TX
1837          * notification because the MAC client is quiesced. Send the
1838          * notification again.
1839          */
1840         i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
1841 }
1842 
1843 /*
1844  * Wait for an SRS to quiesce. The SRS worker will signal us when the
1845  * quiesce is done.
1846  */
1847 static void
1848 mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
1849 {
1850         mutex_enter(&srs->srs_lock);
1851         while (!(srs->srs_state & srs_flag))
1852                 cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
1853         mutex_exit(&srs->srs_lock);
1854 }
1855 
1856 /*
1857  * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1858  * works bottom up by cutting off packet flow from the bottommost point in the
1859  * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1860  * mechanism. One is a temporary quiesce of the SRS, such as say while changing
1861  * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1862  * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1863  * for the SRS and MR flags. In the former case the threads pause waiting for
1864  * a restart, while in the latter case the threads exit. The Tx SRS teardown
1865  * is also mostly similar to the above.
1866  *
1867  * 1. Stop future hardware classified packets at the lowest level in the mac.
1868  *    Remove any hardware classification rule (CONDEMNED case) and mark the
1869  *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1870  *    from increasing. Upcalls from the driver that come through hardware
1871  *    classification will be dropped in mac_rx from now on. Then we wait for
1872  *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1873  *    sure there aren't any upcall threads from the driver through hardware
1874  *    classification. In the case of SRS teardown we also remove the
1875  *    classification rule in the driver.
1876  *
1877  * 2. Stop future software classified packets by marking the flow entry with
1878  *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1879  *    increasing. We also remove the flow entry from the table in the latter
1880  *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1881  *    that indicates there aren't any active threads using that flow entry.
1882  *
1883  * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1884  *    SRS worker thread, and the soft ring threads are quiesced in sequence
1885  *    with the SRS worker thread serving as a master controller. This
1886  *    mechansim is explained in mac_srs_worker_quiesce().
1887  *
1888  * The restart mechanism to reactivate the SRS and softrings is explained
1889  * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1890  * restart sequence.
1891  */
1892 void
1893 mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1894 {
1895         flow_entry_t    *flent = srs->srs_flent;
1896         uint_t  mr_flag, srs_done_flag;
1897 
1898         ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1899         ASSERT(!(srs->srs_type & SRST_TX));
1900 
1901         if (srs_quiesce_flag == SRS_CONDEMNED) {
1902                 mr_flag = MR_CONDEMNED;
1903                 srs_done_flag = SRS_CONDEMNED_DONE;
1904                 if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1905                         mac_srs_client_poll_disable(srs->srs_mcip, srs);
1906         } else {
1907                 ASSERT(srs_quiesce_flag == SRS_QUIESCE);
1908                 mr_flag = MR_QUIESCE;
1909                 srs_done_flag = SRS_QUIESCE_DONE;
1910                 if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1911                         mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
1912         }
1913 
1914         if (srs->srs_ring != NULL) {
1915                 mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
1916         } else {
1917                 /*
1918                  * SRS is driven by software classification. In case
1919                  * of CONDEMNED, the top level teardown functions will
1920                  * deal with flow removal.
1921                  */
1922                 if (srs_quiesce_flag != SRS_CONDEMNED) {
1923                         FLOW_MARK(flent, FE_QUIESCE);
1924                         mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1925                 }
1926         }
1927 
1928         /*
1929          * Signal the SRS to quiesce itself, and then cv_wait for the
1930          * SRS quiesce to complete. The SRS worker thread will wake us
1931          * up when the quiesce is complete
1932          */
1933         mac_srs_signal(srs, srs_quiesce_flag);
1934         mac_srs_quiesce_wait(srs, srs_done_flag);
1935 }
1936 
1937 /*
1938  * Remove an SRS.
1939  */
1940 void
1941 mac_rx_srs_remove(mac_soft_ring_set_t *srs)
1942 {
1943         flow_entry_t *flent = srs->srs_flent;
1944         int i;
1945 
1946         mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
1947         /*
1948          * Locate and remove our entry in the fe_rx_srs[] array, and
1949          * adjust the fe_rx_srs array entries and array count by
1950          * moving the last entry into the vacated spot.
1951          */
1952         mutex_enter(&flent->fe_lock);
1953         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1954                 if (flent->fe_rx_srs[i] == srs)
1955                         break;
1956         }
1957 
1958         ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
1959         if (i != flent->fe_rx_srs_cnt - 1) {
1960                 flent->fe_rx_srs[i] =
1961                     flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
1962                 i = flent->fe_rx_srs_cnt - 1;
1963         }
1964 
1965         flent->fe_rx_srs[i] = NULL;
1966         flent->fe_rx_srs_cnt--;
1967         mutex_exit(&flent->fe_lock);
1968 
1969         mac_srs_free(srs);
1970 }
1971 
1972 static void
1973 mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
1974 {
1975         mutex_enter(&srs->srs_lock);
1976         srs->srs_state &= ~flag;
1977         mutex_exit(&srs->srs_lock);
1978 }
1979 
1980 void
1981 mac_rx_srs_restart(mac_soft_ring_set_t *srs)
1982 {
1983         flow_entry_t    *flent = srs->srs_flent;
1984         mac_ring_t      *mr;
1985 
1986         ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1987         ASSERT((srs->srs_type & SRST_TX) == 0);
1988 
1989         /*
1990          * This handles a change in the number of SRSs between the quiesce and
1991          * and restart operation of a flow.
1992          */
1993         if (!SRS_QUIESCED(srs))
1994                 return;
1995 
1996         /*
1997          * Signal the SRS to restart itself. Wait for the restart to complete
1998          * Note that we only restart the SRS if it is not marked as
1999          * permanently quiesced.
2000          */
2001         if (!SRS_QUIESCED_PERMANENT(srs)) {
2002                 mac_srs_signal(srs, SRS_RESTART);
2003                 mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2004                 mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2005 
2006                 mac_srs_client_poll_restart(srs->srs_mcip, srs);
2007         }
2008 
2009         /* Finally clear the flags to let the packets in */
2010         mr = srs->srs_ring;
2011         if (mr != NULL) {
2012                 MAC_RING_UNMARK(mr, MR_QUIESCE);
2013                 /* In case the ring was stopped, safely restart it */
2014                 if (mr->mr_state != MR_INUSE)
2015                         (void) mac_start_ring(mr);
2016         } else {
2017                 FLOW_UNMARK(flent, FE_QUIESCE);
2018         }
2019 }
2020 
2021 /*
2022  * Temporary quiesce of a flow and associated Rx SRS.
2023  * Please see block comment above mac_rx_classify_flow_rem.
2024  */
2025 /* ARGSUSED */
2026 int
2027 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
2028 {
2029         int             i;
2030 
2031         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2032                 mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
2033                     SRS_QUIESCE);
2034         }
2035         return (0);
2036 }
2037 
2038 /*
2039  * Restart a flow and associated Rx SRS that has been quiesced temporarily
2040  * Please see block comment above mac_rx_classify_flow_rem
2041  */
2042 /* ARGSUSED */
2043 int
2044 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
2045 {
2046         int             i;
2047 
2048         for (i = 0; i < flent->fe_rx_srs_cnt; i++)
2049                 mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
2050 
2051         return (0);
2052 }
2053 
2054 void
2055 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
2056 {
2057         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2058         flow_entry_t            *flent = mcip->mci_flent;
2059         mac_impl_t              *mip = mcip->mci_mip;
2060         mac_soft_ring_set_t     *mac_srs;
2061         int                     i;
2062 
2063         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2064 
2065         if (flent == NULL)
2066                 return;
2067 
2068         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2069                 mac_srs = flent->fe_rx_srs[i];
2070                 mutex_enter(&mac_srs->srs_lock);
2071                 if (on)
2072                         mac_srs->srs_state |= SRS_QUIESCE_PERM;
2073                 else
2074                         mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
2075                 mutex_exit(&mac_srs->srs_lock);
2076         }
2077 }
2078 
2079 void
2080 mac_rx_client_quiesce(mac_client_handle_t mch)
2081 {
2082         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2083         mac_impl_t              *mip = mcip->mci_mip;
2084 
2085         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2086 
2087         if (MCIP_DATAPATH_SETUP(mcip)) {
2088                 (void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
2089                     NULL);
2090                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2091                     mac_rx_classify_flow_quiesce, NULL);
2092         }
2093 }
2094 
2095 void
2096 mac_rx_client_restart(mac_client_handle_t mch)
2097 {
2098         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2099         mac_impl_t              *mip = mcip->mci_mip;
2100 
2101         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2102 
2103         if (MCIP_DATAPATH_SETUP(mcip)) {
2104                 (void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
2105                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2106                     mac_rx_classify_flow_restart, NULL);
2107         }
2108 }
2109 
2110 /*
2111  * This function only quiesces the Tx SRS and softring worker threads. Callers
2112  * need to make sure that there aren't any mac client threads doing current or
2113  * future transmits in the mac before calling this function.
2114  */
2115 void
2116 mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2117 {
2118         mac_client_impl_t       *mcip = srs->srs_mcip;
2119 
2120         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2121 
2122         ASSERT(srs->srs_type & SRST_TX);
2123         ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2124             srs_quiesce_flag == SRS_QUIESCE);
2125 
2126         /*
2127          * Signal the SRS to quiesce itself, and then cv_wait for the
2128          * SRS quiesce to complete. The SRS worker thread will wake us
2129          * up when the quiesce is complete
2130          */
2131         mac_srs_signal(srs, srs_quiesce_flag);
2132         mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2133             SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2134 }
2135 
2136 void
2137 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2138 {
2139         /*
2140          * Resizing the fanout could result in creation of new SRSs.
2141          * They may not necessarily be in the quiesced state in which
2142          * case it need be restarted
2143          */
2144         if (!SRS_QUIESCED(srs))
2145                 return;
2146 
2147         mac_srs_signal(srs, SRS_RESTART);
2148         mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2149         mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2150 }
2151 
2152 /*
2153  * Temporary quiesce of a flow and associated Rx SRS.
2154  * Please see block comment above mac_rx_srs_quiesce
2155  */
2156 /* ARGSUSED */
2157 int
2158 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2159 {
2160         /*
2161          * The fe_tx_srs is null for a subflow on an interface that is
2162          * not plumbed
2163          */
2164         if (flent->fe_tx_srs != NULL)
2165                 mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2166         return (0);
2167 }
2168 
2169 /* ARGSUSED */
2170 int
2171 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2172 {
2173         /*
2174          * The fe_tx_srs is null for a subflow on an interface that is
2175          * not plumbed
2176          */
2177         if (flent->fe_tx_srs != NULL)
2178                 mac_tx_srs_restart(flent->fe_tx_srs);
2179         return (0);
2180 }
2181 
2182 static void
2183 i_mac_tx_client_quiesce(mac_client_handle_t mch, uint_t srs_quiesce_flag)
2184 {
2185         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
2186 
2187         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2188 
2189         mac_tx_client_block(mcip);
2190         if (MCIP_TX_SRS(mcip) != NULL) {
2191                 mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2192                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2193                     mac_tx_flow_quiesce, NULL);
2194         }
2195 }
2196 
2197 void
2198 mac_tx_client_quiesce(mac_client_handle_t mch)
2199 {
2200         i_mac_tx_client_quiesce(mch, SRS_QUIESCE);
2201 }
2202 
2203 void
2204 mac_tx_client_condemn(mac_client_handle_t mch)
2205 {
2206         i_mac_tx_client_quiesce(mch, SRS_CONDEMNED);
2207 }
2208 
2209 void
2210 mac_tx_client_restart(mac_client_handle_t mch)
2211 {
2212         mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2213 
2214         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2215 
2216         mac_tx_client_unblock(mcip);
2217         if (MCIP_TX_SRS(mcip) != NULL) {
2218                 mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2219                 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2220                     mac_tx_flow_restart, NULL);
2221         }
2222 }
2223 
2224 void
2225 mac_tx_client_flush(mac_client_impl_t *mcip)
2226 {
2227         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2228 
2229         mac_tx_client_quiesce((mac_client_handle_t)mcip);
2230         mac_tx_client_restart((mac_client_handle_t)mcip);
2231 }
2232 
2233 void
2234 mac_client_quiesce(mac_client_impl_t *mcip)
2235 {
2236         mac_rx_client_quiesce((mac_client_handle_t)mcip);
2237         mac_tx_client_quiesce((mac_client_handle_t)mcip);
2238 }
2239 
2240 void
2241 mac_client_restart(mac_client_impl_t *mcip)
2242 {
2243         mac_rx_client_restart((mac_client_handle_t)mcip);
2244         mac_tx_client_restart((mac_client_handle_t)mcip);
2245 }
2246 
2247 /*
2248  * Allocate a minor number.
2249  */
2250 minor_t
2251 mac_minor_hold(boolean_t sleep)
2252 {
2253         minor_t minor;
2254 
2255         /*
2256          * Grab a value from the arena.
2257          */
2258         atomic_add_32(&minor_count, 1);
2259 
2260         if (sleep)
2261                 minor = (uint_t)id_alloc(minor_ids);
2262         else
2263                 minor = (uint_t)id_alloc_nosleep(minor_ids);
2264 
2265         if (minor == 0) {
2266                 atomic_add_32(&minor_count, -1);
2267                 return (0);
2268         }
2269 
2270         return (minor);
2271 }
2272 
2273 /*
2274  * Release a previously allocated minor number.
2275  */
2276 void
2277 mac_minor_rele(minor_t minor)
2278 {
2279         /*
2280          * Return the value to the arena.
2281          */
2282         id_free(minor_ids, minor);
2283         atomic_add_32(&minor_count, -1);
2284 }
2285 
2286 uint32_t
2287 mac_no_notification(mac_handle_t mh)
2288 {
2289         mac_impl_t *mip = (mac_impl_t *)mh;
2290 
2291         return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2292             mip->mi_capab_legacy.ml_unsup_note : 0);
2293 }
2294 
2295 /*
2296  * Prevent any new opens of this mac in preparation for unregister
2297  */
2298 int
2299 i_mac_disable(mac_impl_t *mip)
2300 {
2301         mac_client_impl_t       *mcip;
2302 
2303         rw_enter(&i_mac_impl_lock, RW_WRITER);
2304         if (mip->mi_state_flags & MIS_DISABLED) {
2305                 /* Already disabled, return success */
2306                 rw_exit(&i_mac_impl_lock);
2307                 return (0);
2308         }
2309         /*
2310          * See if there are any other references to this mac_t (e.g., VLAN's).
2311          * If so return failure. If all the other checks below pass, then
2312          * set mi_disabled atomically under the i_mac_impl_lock to prevent
2313          * any new VLAN's from being created or new mac client opens of this
2314          * mac end point.
2315          */
2316         if (mip->mi_ref > 0) {
2317                 rw_exit(&i_mac_impl_lock);
2318                 return (EBUSY);
2319         }
2320 
2321         /*
2322          * mac clients must delete all multicast groups they join before
2323          * closing. bcast groups are reference counted, the last client
2324          * to delete the group will wait till the group is physically
2325          * deleted. Since all clients have closed this mac end point
2326          * mi_bcast_ngrps must be zero at this point
2327          */
2328         ASSERT(mip->mi_bcast_ngrps == 0);
2329 
2330         /*
2331          * Don't let go of this if it has some flows.
2332          * All other code guarantees no flows are added to a disabled
2333          * mac, therefore it is sufficient to check for the flow table
2334          * only here.
2335          */
2336         mcip = mac_primary_client_handle(mip);
2337         if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2338                 rw_exit(&i_mac_impl_lock);
2339                 return (ENOTEMPTY);
2340         }
2341 
2342         mip->mi_state_flags |= MIS_DISABLED;
2343         rw_exit(&i_mac_impl_lock);
2344         return (0);
2345 }
2346 
2347 int
2348 mac_disable_nowait(mac_handle_t mh)
2349 {
2350         mac_impl_t      *mip = (mac_impl_t *)mh;
2351         int err;
2352 
2353         if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2354                 return (err);
2355         err = i_mac_disable(mip);
2356         i_mac_perim_exit(mip);
2357         return (err);
2358 }
2359 
2360 int
2361 mac_disable(mac_handle_t mh)
2362 {
2363         mac_impl_t      *mip = (mac_impl_t *)mh;
2364         int err;
2365 
2366         i_mac_perim_enter(mip);
2367         err = i_mac_disable(mip);
2368         i_mac_perim_exit(mip);
2369 
2370         /*
2371          * Clean up notification thread and wait for it to exit.
2372          */
2373         if (err == 0)
2374                 i_mac_notify_exit(mip);
2375 
2376         return (err);
2377 }
2378 
2379 /*
2380  * Called when the MAC instance has a non empty flow table, to de-multiplex
2381  * incoming packets to the right flow.
2382  * The MAC's rw lock is assumed held as a READER.
2383  */
2384 /* ARGSUSED */
2385 static mblk_t *
2386 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2387 {
2388         flow_entry_t    *flent = NULL;
2389         uint_t          flags = FLOW_INBOUND;
2390         int             err;
2391 
2392         /*
2393          * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2394          * to mac_flow_lookup() so that the VLAN packets can be successfully
2395          * passed to the non-VLAN aggregation flows.
2396          *
2397          * Note that there is possibly a race between this and
2398          * mac_unicast_remove/add() and VLAN packets could be incorrectly
2399          * classified to non-VLAN flows of non-aggregation mac clients. These
2400          * VLAN packets will be then filtered out by the mac module.
2401          */
2402         if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2403                 flags |= FLOW_IGNORE_VLAN;
2404 
2405         err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2406         if (err != 0) {
2407                 /* no registered receive function */
2408                 return (mp);
2409         } else {
2410                 mac_client_impl_t       *mcip;
2411 
2412                 /*
2413                  * This flent might just be an additional one on the MAC client,
2414                  * i.e. for classification purposes (different fdesc), however
2415                  * the resources, SRS et. al., are in the mci_flent, so if
2416                  * this isn't the mci_flent, we need to get it.
2417                  */
2418                 if ((mcip = flent->fe_mcip) != NULL &&
2419                     mcip->mci_flent != flent) {
2420                         FLOW_REFRELE(flent);
2421                         flent = mcip->mci_flent;
2422                         FLOW_TRY_REFHOLD(flent, err);
2423                         if (err != 0)
2424                                 return (mp);
2425                 }
2426                 (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2427                     B_FALSE);
2428                 FLOW_REFRELE(flent);
2429         }
2430         return (NULL);
2431 }
2432 
2433 mblk_t *
2434 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2435 {
2436         mac_impl_t      *mip = (mac_impl_t *)mh;
2437         mblk_t          *bp, *bp1, **bpp, *list = NULL;
2438 
2439         /*
2440          * We walk the chain and attempt to classify each packet.
2441          * The packets that couldn't be classified will be returned
2442          * back to the caller.
2443          */
2444         bp = mp_chain;
2445         bpp = &list;
2446         while (bp != NULL) {
2447                 bp1 = bp;
2448                 bp = bp->b_next;
2449                 bp1->b_next = NULL;
2450 
2451                 if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2452                         *bpp = bp1;
2453                         bpp = &bp1->b_next;
2454                 }
2455         }
2456         return (list);
2457 }
2458 
2459 static int
2460 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2461 {
2462         mac_ring_handle_t ring = arg;
2463 
2464         if (flent->fe_tx_srs)
2465                 mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2466         return (0);
2467 }
2468 
2469 void
2470 i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2471 {
2472         mac_client_impl_t       *cclient;
2473         mac_soft_ring_set_t     *mac_srs;
2474 
2475         /*
2476          * After grabbing the mi_rw_lock, the list of clients can't change.
2477          * If there are any clients mi_disabled must be B_FALSE and can't
2478          * get set since there are clients. If there aren't any clients we
2479          * don't do anything. In any case the mip has to be valid. The driver
2480          * must make sure that it goes single threaded (with respect to mac
2481          * calls) and wait for all pending mac calls to finish before calling
2482          * mac_unregister.
2483          */
2484         rw_enter(&i_mac_impl_lock, RW_READER);
2485         if (mip->mi_state_flags & MIS_DISABLED) {
2486                 rw_exit(&i_mac_impl_lock);
2487                 return;
2488         }
2489 
2490         /*
2491          * Get MAC tx srs from walking mac_client_handle list.
2492          */
2493         rw_enter(&mip->mi_rw_lock, RW_READER);
2494         for (cclient = mip->mi_clients_list; cclient != NULL;
2495             cclient = cclient->mci_client_next) {
2496                 if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) {
2497                         mac_tx_srs_wakeup(mac_srs, ring);
2498                 } else {
2499                         /*
2500                          * Aggr opens underlying ports in exclusive mode
2501                          * and registers flow control callbacks using
2502                          * mac_tx_client_notify(). When opened in
2503                          * exclusive mode, Tx SRS won't be created
2504                          * during mac_unicast_add().
2505                          */
2506                         if (cclient->mci_state_flags & MCIS_EXCLUSIVE) {
2507                                 mac_tx_invoke_callbacks(cclient,
2508                                     (mac_tx_cookie_t)ring);
2509                         }
2510                 }
2511                 (void) mac_flow_walk(cclient->mci_subflow_tab,
2512                     mac_tx_flow_srs_wakeup, ring);
2513         }
2514         rw_exit(&mip->mi_rw_lock);
2515         rw_exit(&i_mac_impl_lock);
2516 }
2517 
2518 /* ARGSUSED */
2519 void
2520 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2521     boolean_t add)
2522 {
2523         mac_impl_t *mip = (mac_impl_t *)mh;
2524 
2525         i_mac_perim_enter((mac_impl_t *)mh);
2526         /*
2527          * If no specific refresh function was given then default to the
2528          * driver's m_multicst entry point.
2529          */
2530         if (refresh == NULL) {
2531                 refresh = mip->mi_multicst;
2532                 arg = mip->mi_driver;
2533         }
2534 
2535         mac_bcast_refresh(mip, refresh, arg, add);
2536         i_mac_perim_exit((mac_impl_t *)mh);
2537 }
2538 
2539 void
2540 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2541 {
2542         mac_impl_t      *mip = (mac_impl_t *)mh;
2543 
2544         /*
2545          * If no specific refresh function was given then default to the
2546          * driver's m_promisc entry point.
2547          */
2548         if (refresh == NULL) {
2549                 refresh = mip->mi_setpromisc;
2550                 arg = mip->mi_driver;
2551         }
2552         ASSERT(refresh != NULL);
2553 
2554         /*
2555          * Call the refresh function with the current promiscuity.
2556          */
2557         refresh(arg, (mip->mi_devpromisc != 0));
2558 }
2559 
2560 /*
2561  * The mac client requests that the mac not to change its margin size to
2562  * be less than the specified value.  If "current" is B_TRUE, then the client
2563  * requests the mac not to change its margin size to be smaller than the
2564  * current size. Further, return the current margin size value in this case.
2565  *
2566  * We keep every requested size in an ordered list from largest to smallest.
2567  */
2568 int
2569 mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2570 {
2571         mac_impl_t              *mip = (mac_impl_t *)mh;
2572         mac_margin_req_t        **pp, *p;
2573         int                     err = 0;
2574 
2575         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2576         if (current)
2577                 *marginp = mip->mi_margin;
2578 
2579         /*
2580          * If the current margin value cannot satisfy the margin requested,
2581          * return ENOTSUP directly.
2582          */
2583         if (*marginp > mip->mi_margin) {
2584                 err = ENOTSUP;
2585                 goto done;
2586         }
2587 
2588         /*
2589          * Check whether the given margin is already in the list. If so,
2590          * bump the reference count.
2591          */
2592         for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2593                 if (p->mmr_margin == *marginp) {
2594                         /*
2595                          * The margin requested is already in the list,
2596                          * so just bump the reference count.
2597                          */
2598                         p->mmr_ref++;
2599                         goto done;
2600                 }
2601                 if (p->mmr_margin < *marginp)
2602                         break;
2603         }
2604 
2605 
2606         p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2607         p->mmr_margin = *marginp;
2608         p->mmr_ref++;
2609         p->mmr_nextp = *pp;
2610         *pp = p;
2611 
2612 done:
2613         rw_exit(&(mip->mi_rw_lock));
2614         return (err);
2615 }
2616 
2617 /*
2618  * The mac client requests to cancel its previous mac_margin_add() request.
2619  * We remove the requested margin size from the list.
2620  */
2621 int
2622 mac_margin_remove(mac_handle_t mh, uint32_t margin)
2623 {
2624         mac_impl_t              *mip = (mac_impl_t *)mh;
2625         mac_margin_req_t        **pp, *p;
2626         int                     err = 0;
2627 
2628         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2629         /*
2630          * Find the entry in the list for the given margin.
2631          */
2632         for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2633                 if (p->mmr_margin == margin) {
2634                         if (--p->mmr_ref == 0)
2635                                 break;
2636 
2637                         /*
2638                          * There is still a reference to this address so
2639                          * there's nothing more to do.
2640                          */
2641                         goto done;
2642                 }
2643         }
2644 
2645         /*
2646          * We did not find an entry for the given margin.
2647          */
2648         if (p == NULL) {
2649                 err = ENOENT;
2650                 goto done;
2651         }
2652 
2653         ASSERT(p->mmr_ref == 0);
2654 
2655         /*
2656          * Remove it from the list.
2657          */
2658         *pp = p->mmr_nextp;
2659         kmem_free(p, sizeof (mac_margin_req_t));
2660 done:
2661         rw_exit(&(mip->mi_rw_lock));
2662         return (err);
2663 }
2664 
2665 boolean_t
2666 mac_margin_update(mac_handle_t mh, uint32_t margin)
2667 {
2668         mac_impl_t      *mip = (mac_impl_t *)mh;
2669         uint32_t        margin_needed = 0;
2670 
2671         rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2672 
2673         if (mip->mi_mmrp != NULL)
2674                 margin_needed = mip->mi_mmrp->mmr_margin;
2675 
2676         if (margin_needed <= margin)
2677                 mip->mi_margin = margin;
2678 
2679         rw_exit(&(mip->mi_rw_lock));
2680 
2681         if (margin_needed <= margin)
2682                 i_mac_notify(mip, MAC_NOTE_MARGIN);
2683 
2684         return (margin_needed <= margin);
2685 }
2686 
2687 /*
2688  * MAC Type Plugin functions.
2689  */
2690 
2691 mactype_t *
2692 mactype_getplugin(const char *pname)
2693 {
2694         mactype_t       *mtype = NULL;
2695         boolean_t       tried_modload = B_FALSE;
2696 
2697         mutex_enter(&i_mactype_lock);
2698 
2699 find_registered_mactype:
2700         if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
2701             (mod_hash_val_t *)&mtype) != 0) {
2702                 if (!tried_modload) {
2703                         /*
2704                          * If the plugin has not yet been loaded, then
2705                          * attempt to load it now.  If modload() succeeds,
2706                          * the plugin should have registered using
2707                          * mactype_register(), in which case we can go back
2708                          * and attempt to find it again.
2709                          */
2710                         if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
2711                                 tried_modload = B_TRUE;
2712                                 goto find_registered_mactype;
2713                         }
2714                 }
2715         } else {
2716                 /*
2717                  * Note that there's no danger that the plugin we've loaded
2718                  * could be unloaded between the modload() step and the
2719                  * reference count bump here, as we're holding
2720                  * i_mactype_lock, which mactype_unregister() also holds.
2721                  */
2722                 atomic_inc_32(&mtype->mt_ref);
2723         }
2724 
2725         mutex_exit(&i_mactype_lock);
2726         return (mtype);
2727 }
2728 
2729 mactype_register_t *
2730 mactype_alloc(uint_t mactype_version)
2731 {
2732         mactype_register_t *mtrp;
2733 
2734         /*
2735          * Make sure there isn't a version mismatch between the plugin and
2736          * the framework.  In the future, if multiple versions are
2737          * supported, this check could become more sophisticated.
2738          */
2739         if (mactype_version != MACTYPE_VERSION)
2740                 return (NULL);
2741 
2742         mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2743         mtrp->mtr_version = mactype_version;
2744         return (mtrp);
2745 }
2746 
2747 void
2748 mactype_free(mactype_register_t *mtrp)
2749 {
2750         kmem_free(mtrp, sizeof (mactype_register_t));
2751 }
2752 
2753 int
2754 mactype_register(mactype_register_t *mtrp)
2755 {
2756         mactype_t       *mtp;
2757         mactype_ops_t   *ops = mtrp->mtr_ops;
2758 
2759         /* Do some sanity checking before we register this MAC type. */
2760         if (mtrp->mtr_ident == NULL || ops == NULL)
2761                 return (EINVAL);
2762 
2763         /*
2764          * Verify that all mandatory callbacks are set in the ops
2765          * vector.
2766          */
2767         if (ops->mtops_unicst_verify == NULL ||
2768             ops->mtops_multicst_verify == NULL ||
2769             ops->mtops_sap_verify == NULL ||
2770             ops->mtops_header == NULL ||
2771             ops->mtops_header_info == NULL) {
2772                 return (EINVAL);
2773         }
2774 
2775         mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2776         mtp->mt_ident = mtrp->mtr_ident;
2777         mtp->mt_ops = *ops;
2778         mtp->mt_type = mtrp->mtr_mactype;
2779         mtp->mt_nativetype = mtrp->mtr_nativetype;
2780         mtp->mt_addr_length = mtrp->mtr_addrlen;
2781         if (mtrp->mtr_brdcst_addr != NULL) {
2782                 mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2783                 bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2784                     mtrp->mtr_addrlen);
2785         }
2786 
2787         mtp->mt_stats = mtrp->mtr_stats;
2788         mtp->mt_statcount = mtrp->mtr_statcount;
2789 
2790         mtp->mt_mapping = mtrp->mtr_mapping;
2791         mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2792 
2793         if (mod_hash_insert(i_mactype_hash,
2794             (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2795                 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2796                 kmem_free(mtp, sizeof (*mtp));
2797                 return (EEXIST);
2798         }
2799         return (0);
2800 }
2801 
2802 int
2803 mactype_unregister(const char *ident)
2804 {
2805         mactype_t       *mtp;
2806         mod_hash_val_t  val;
2807         int             err;
2808 
2809         /*
2810          * Let's not allow MAC drivers to use this plugin while we're
2811          * trying to unregister it.  Holding i_mactype_lock also prevents a
2812          * plugin from unregistering while a MAC driver is attempting to
2813          * hold a reference to it in i_mactype_getplugin().
2814          */
2815         mutex_enter(&i_mactype_lock);
2816 
2817         if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
2818             (mod_hash_val_t *)&mtp)) != 0) {
2819                 /* A plugin is trying to unregister, but it never registered. */
2820                 err = ENXIO;
2821                 goto done;
2822         }
2823 
2824         if (mtp->mt_ref != 0) {
2825                 err = EBUSY;
2826                 goto done;
2827         }
2828 
2829         err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
2830         ASSERT(err == 0);
2831         if (err != 0) {
2832                 /* This should never happen, thus the ASSERT() above. */
2833                 err = EINVAL;
2834                 goto done;
2835         }
2836         ASSERT(mtp == (mactype_t *)val);
2837 
2838         if (mtp->mt_brdcst_addr != NULL)
2839                 kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2840         kmem_free(mtp, sizeof (mactype_t));
2841 done:
2842         mutex_exit(&i_mactype_lock);
2843         return (err);
2844 }
2845 
2846 /*
2847  * Checks the size of the value size specified for a property as
2848  * part of a property operation. Returns B_TRUE if the size is
2849  * correct, B_FALSE otherwise.
2850  */
2851 boolean_t
2852 mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range)
2853 {
2854         uint_t minsize = 0;
2855 
2856         if (is_range)
2857                 return (valsize >= sizeof (mac_propval_range_t));
2858 
2859         switch (id) {
2860         case MAC_PROP_ZONE:
2861                 minsize = sizeof (dld_ioc_zid_t);
2862                 break;
2863         case MAC_PROP_AUTOPUSH:
2864                 if (valsize != 0)
2865                         minsize = sizeof (struct dlautopush);
2866                 break;
2867         case MAC_PROP_TAGMODE:
2868                 minsize = sizeof (link_tagmode_t);
2869                 break;
2870         case MAC_PROP_RESOURCE:
2871         case MAC_PROP_RESOURCE_EFF:
2872                 minsize = sizeof (mac_resource_props_t);
2873                 break;
2874         case MAC_PROP_DUPLEX:
2875                 minsize = sizeof (link_duplex_t);
2876                 break;
2877         case MAC_PROP_SPEED:
2878                 minsize = sizeof (uint64_t);
2879                 break;
2880         case MAC_PROP_STATUS:
2881                 minsize = sizeof (link_state_t);
2882                 break;
2883         case MAC_PROP_AUTONEG:
2884         case MAC_PROP_EN_AUTONEG:
2885                 minsize = sizeof (uint8_t);
2886                 break;
2887         case MAC_PROP_MTU:
2888         case MAC_PROP_LLIMIT:
2889         case MAC_PROP_LDECAY:
2890                 minsize = sizeof (uint32_t);
2891                 break;
2892         case MAC_PROP_FLOWCTRL:
2893                 minsize = sizeof (link_flowctrl_t);
2894                 break;
2895         case MAC_PROP_ADV_10GFDX_CAP:
2896         case MAC_PROP_EN_10GFDX_CAP:
2897         case MAC_PROP_ADV_1000HDX_CAP:
2898         case MAC_PROP_EN_1000HDX_CAP:
2899         case MAC_PROP_ADV_100FDX_CAP:
2900         case MAC_PROP_EN_100FDX_CAP:
2901         case MAC_PROP_ADV_100HDX_CAP:
2902         case MAC_PROP_EN_100HDX_CAP:
2903         case MAC_PROP_ADV_10FDX_CAP:
2904         case MAC_PROP_EN_10FDX_CAP:
2905         case MAC_PROP_ADV_10HDX_CAP:
2906         case MAC_PROP_EN_10HDX_CAP:
2907         case MAC_PROP_ADV_100T4_CAP:
2908         case MAC_PROP_EN_100T4_CAP:
2909                 minsize = sizeof (uint8_t);
2910                 break;
2911         case MAC_PROP_PVID:
2912                 minsize = sizeof (uint16_t);
2913                 break;
2914         case MAC_PROP_IPTUN_HOPLIMIT:
2915                 minsize = sizeof (uint32_t);
2916                 break;
2917         case MAC_PROP_IPTUN_ENCAPLIMIT:
2918                 minsize = sizeof (uint32_t);
2919                 break;
2920         case MAC_PROP_MAX_TX_RINGS_AVAIL:
2921         case MAC_PROP_MAX_RX_RINGS_AVAIL:
2922         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
2923         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
2924                 minsize = sizeof (uint_t);
2925                 break;
2926         case MAC_PROP_WL_ESSID:
2927                 minsize = sizeof (wl_linkstatus_t);
2928                 break;
2929         case MAC_PROP_WL_BSSID:
2930                 minsize = sizeof (wl_bssid_t);
2931                 break;
2932         case MAC_PROP_WL_BSSTYPE:
2933                 minsize = sizeof (wl_bss_type_t);
2934                 break;
2935         case MAC_PROP_WL_LINKSTATUS:
2936                 minsize = sizeof (wl_linkstatus_t);
2937                 break;
2938         case MAC_PROP_WL_DESIRED_RATES:
2939                 minsize = sizeof (wl_rates_t);
2940                 break;
2941         case MAC_PROP_WL_SUPPORTED_RATES:
2942                 minsize = sizeof (wl_rates_t);
2943                 break;
2944         case MAC_PROP_WL_AUTH_MODE:
2945                 minsize = sizeof (wl_authmode_t);
2946                 break;
2947         case MAC_PROP_WL_ENCRYPTION:
2948                 minsize = sizeof (wl_encryption_t);
2949                 break;
2950         case MAC_PROP_WL_RSSI:
2951                 minsize = sizeof (wl_rssi_t);
2952                 break;
2953         case MAC_PROP_WL_PHY_CONFIG:
2954                 minsize = sizeof (wl_phy_conf_t);
2955                 break;
2956         case MAC_PROP_WL_CAPABILITY:
2957                 minsize = sizeof (wl_capability_t);
2958                 break;
2959         case MAC_PROP_WL_WPA:
2960                 minsize = sizeof (wl_wpa_t);
2961                 break;
2962         case MAC_PROP_WL_SCANRESULTS:
2963                 minsize = sizeof (wl_wpa_ess_t);
2964                 break;
2965         case MAC_PROP_WL_POWER_MODE:
2966                 minsize = sizeof (wl_ps_mode_t);
2967                 break;
2968         case MAC_PROP_WL_RADIO:
2969                 minsize = sizeof (wl_radio_t);
2970                 break;
2971         case MAC_PROP_WL_ESS_LIST:
2972                 minsize = sizeof (wl_ess_list_t);
2973                 break;
2974         case MAC_PROP_WL_KEY_TAB:
2975                 minsize = sizeof (wl_wep_key_tab_t);
2976                 break;
2977         case MAC_PROP_WL_CREATE_IBSS:
2978                 minsize = sizeof (wl_create_ibss_t);
2979                 break;
2980         case MAC_PROP_WL_SETOPTIE:
2981                 minsize = sizeof (wl_wpa_ie_t);
2982                 break;
2983         case MAC_PROP_WL_DELKEY:
2984                 minsize = sizeof (wl_del_key_t);
2985                 break;
2986         case MAC_PROP_WL_KEY:
2987                 minsize = sizeof (wl_key_t);
2988                 break;
2989         case MAC_PROP_WL_MLME:
2990                 minsize = sizeof (wl_mlme_t);
2991                 break;
2992         }
2993 
2994         return (valsize >= minsize);
2995 }
2996 
2997 /*
2998  * mac_set_prop() sets MAC or hardware driver properties:
2999  *
3000  * - MAC-managed properties such as resource properties include maxbw,
3001  *   priority, and cpu binding list, as well as the default port VID
3002  *   used by bridging. These properties are consumed by the MAC layer
3003  *   itself and not passed down to the driver. For resource control
3004  *   properties, this function invokes mac_set_resources() which will
3005  *   cache the property value in mac_impl_t and may call
3006  *   mac_client_set_resource() to update property value of the primary
3007  *   mac client, if it exists.
3008  *
3009  * - Properties which act on the hardware and must be passed to the
3010  *   driver, such as MTU, through the driver's mc_setprop() entry point.
3011  */
3012 int
3013 mac_set_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3014     uint_t valsize)
3015 {
3016         int err = ENOTSUP;
3017         mac_impl_t *mip = (mac_impl_t *)mh;
3018 
3019         ASSERT(MAC_PERIM_HELD(mh));
3020 
3021         switch (id) {
3022         case MAC_PROP_RESOURCE: {
3023                 mac_resource_props_t *mrp;
3024 
3025                 /* call mac_set_resources() for MAC properties */
3026                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3027                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3028                 bcopy(val, mrp, sizeof (*mrp));
3029                 err = mac_set_resources(mh, mrp);
3030                 kmem_free(mrp, sizeof (*mrp));
3031                 break;
3032         }
3033 
3034         case MAC_PROP_PVID:
3035                 ASSERT(valsize >= sizeof (uint16_t));
3036                 if (mip->mi_state_flags & MIS_IS_VNIC)
3037                         return (EINVAL);
3038                 err = mac_set_pvid(mh, *(uint16_t *)val);
3039                 break;
3040 
3041         case MAC_PROP_MTU: {
3042                 uint32_t mtu;
3043 
3044                 ASSERT(valsize >= sizeof (uint32_t));
3045                 bcopy(val, &mtu, sizeof (mtu));
3046                 err = mac_set_mtu(mh, mtu, NULL);
3047                 break;
3048         }
3049 
3050         case MAC_PROP_LLIMIT:
3051         case MAC_PROP_LDECAY: {
3052                 uint32_t learnval;
3053 
3054                 if (valsize < sizeof (learnval) ||
3055                     (mip->mi_state_flags & MIS_IS_VNIC))
3056                         return (EINVAL);
3057                 bcopy(val, &learnval, sizeof (learnval));
3058                 if (learnval == 0 && id == MAC_PROP_LDECAY)
3059                         return (EINVAL);
3060                 if (id == MAC_PROP_LLIMIT)
3061                         mip->mi_llimit = learnval;
3062                 else
3063                         mip->mi_ldecay = learnval;
3064                 err = 0;
3065                 break;
3066         }
3067 
3068         default:
3069                 /* For other driver properties, call driver's callback */
3070                 if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
3071                         err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
3072                             name, id, valsize, val);
3073                 }
3074         }
3075         return (err);
3076 }
3077 
3078 /*
3079  * mac_get_prop() gets MAC or device driver properties.
3080  *
3081  * If the property is a driver property, mac_get_prop() calls driver's callback
3082  * entry point to get it.
3083  * If the property is a MAC property, mac_get_prop() invokes mac_get_resources()
3084  * which returns the cached value in mac_impl_t.
3085  */
3086 int
3087 mac_get_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3088     uint_t valsize)
3089 {
3090         int err = ENOTSUP;
3091         mac_impl_t *mip = (mac_impl_t *)mh;
3092         uint_t  rings;
3093         uint_t  vlinks;
3094 
3095         bzero(val, valsize);
3096 
3097         switch (id) {
3098         case MAC_PROP_RESOURCE: {
3099                 mac_resource_props_t *mrp;
3100 
3101                 /* If mac property, read from cache */
3102                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3103                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3104                 mac_get_resources(mh, mrp);
3105                 bcopy(mrp, val, sizeof (*mrp));
3106                 kmem_free(mrp, sizeof (*mrp));
3107                 return (0);
3108         }
3109         case MAC_PROP_RESOURCE_EFF: {
3110                 mac_resource_props_t *mrp;
3111 
3112                 /* If mac effective property, read from client */
3113                 ASSERT(valsize >= sizeof (mac_resource_props_t));
3114                 mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3115                 mac_get_effective_resources(mh, mrp);
3116                 bcopy(mrp, val, sizeof (*mrp));
3117                 kmem_free(mrp, sizeof (*mrp));
3118                 return (0);
3119         }
3120 
3121         case MAC_PROP_PVID:
3122                 ASSERT(valsize >= sizeof (uint16_t));
3123                 if (mip->mi_state_flags & MIS_IS_VNIC)
3124                         return (EINVAL);
3125                 *(uint16_t *)val = mac_get_pvid(mh);
3126                 return (0);
3127 
3128         case MAC_PROP_LLIMIT:
3129         case MAC_PROP_LDECAY:
3130                 ASSERT(valsize >= sizeof (uint32_t));
3131                 if (mip->mi_state_flags & MIS_IS_VNIC)
3132                         return (EINVAL);
3133                 if (id == MAC_PROP_LLIMIT)
3134                         bcopy(&mip->mi_llimit, val, sizeof (mip->mi_llimit));
3135                 else
3136                         bcopy(&mip->mi_ldecay, val, sizeof (mip->mi_ldecay));
3137                 return (0);
3138 
3139         case MAC_PROP_MTU: {
3140                 uint32_t sdu;
3141 
3142                 ASSERT(valsize >= sizeof (uint32_t));
3143                 mac_sdu_get2(mh, NULL, &sdu, NULL);
3144                 bcopy(&sdu, val, sizeof (sdu));
3145 
3146                 return (0);
3147         }
3148         case MAC_PROP_STATUS: {
3149                 link_state_t link_state;
3150 
3151                 if (valsize < sizeof (link_state))
3152                         return (EINVAL);
3153                 link_state = mac_link_get(mh);
3154                 bcopy(&link_state, val, sizeof (link_state));
3155 
3156                 return (0);
3157         }
3158 
3159         case MAC_PROP_MAX_RX_RINGS_AVAIL:
3160         case MAC_PROP_MAX_TX_RINGS_AVAIL:
3161                 ASSERT(valsize >= sizeof (uint_t));
3162                 rings = id == MAC_PROP_MAX_RX_RINGS_AVAIL ?
3163                     mac_rxavail_get(mh) : mac_txavail_get(mh);
3164                 bcopy(&rings, val, sizeof (uint_t));
3165                 return (0);
3166 
3167         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3168         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3169                 ASSERT(valsize >= sizeof (uint_t));
3170                 vlinks = id == MAC_PROP_MAX_RXHWCLNT_AVAIL ?
3171                     mac_rxhwlnksavail_get(mh) : mac_txhwlnksavail_get(mh);
3172                 bcopy(&vlinks, val, sizeof (uint_t));
3173                 return (0);
3174 
3175         case MAC_PROP_RXRINGSRANGE:
3176         case MAC_PROP_TXRINGSRANGE:
3177                 /*
3178                  * The value for these properties are returned through
3179                  * the MAC_PROP_RESOURCE property.
3180                  */
3181                 return (0);
3182 
3183         default:
3184                 break;
3185 
3186         }
3187 
3188         /* If driver property, request from driver */
3189         if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
3190                 err = mip->mi_callbacks->mc_getprop(mip->mi_driver, name, id,
3191                     valsize, val);
3192         }
3193 
3194         return (err);
3195 }
3196 
3197 /*
3198  * Helper function to initialize the range structure for use in
3199  * mac_get_prop. If the type can be other than uint32, we can
3200  * pass that as an arg.
3201  */
3202 static void
3203 _mac_set_range(mac_propval_range_t *range, uint32_t min, uint32_t max)
3204 {
3205         range->mpr_count = 1;
3206         range->mpr_type = MAC_PROPVAL_UINT32;
3207         range->mpr_range_uint32[0].mpur_min = min;
3208         range->mpr_range_uint32[0].mpur_max = max;
3209 }
3210 
3211 /*
3212  * Returns information about the specified property, such as default
3213  * values or permissions.
3214  */
3215 int
3216 mac_prop_info(mac_handle_t mh, mac_prop_id_t id, char *name,
3217     void *default_val, uint_t default_size, mac_propval_range_t *range,
3218     uint_t *perm)
3219 {
3220         mac_prop_info_state_t state;
3221         mac_impl_t *mip = (mac_impl_t *)mh;
3222         uint_t  max;
3223 
3224         /*
3225          * A property is read/write by default unless the driver says
3226          * otherwise.
3227          */
3228         if (perm != NULL)
3229                 *perm = MAC_PROP_PERM_RW;
3230 
3231         if (default_val != NULL)
3232                 bzero(default_val, default_size);
3233 
3234         /*
3235          * First, handle framework properties for which we don't need to
3236          * involve the driver.
3237          */
3238         switch (id) {
3239         case MAC_PROP_RESOURCE:
3240         case MAC_PROP_PVID:
3241         case MAC_PROP_LLIMIT:
3242         case MAC_PROP_LDECAY:
3243                 return (0);
3244 
3245         case MAC_PROP_MAX_RX_RINGS_AVAIL:
3246         case MAC_PROP_MAX_TX_RINGS_AVAIL:
3247         case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3248         case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3249                 if (perm != NULL)
3250                         *perm = MAC_PROP_PERM_READ;
3251                 return (0);
3252 
3253         case MAC_PROP_RXRINGSRANGE:
3254         case MAC_PROP_TXRINGSRANGE:
3255                 /*
3256                  * Currently, we support range for RX and TX rings properties.
3257                  * When we extend this support to maxbw, cpus and priority,
3258                  * we should move this to mac_get_resources.
3259                  * There is no default value for RX or TX rings.
3260                  */
3261                 if ((mip->mi_state_flags & MIS_IS_VNIC) &&
3262                     mac_is_vnic_primary(mh)) {
3263                         /*
3264                          * We don't support setting rings for a VLAN
3265                          * data link because it shares its ring with the
3266                          * primary MAC client.
3267                          */
3268                         if (perm != NULL)
3269                                 *perm = MAC_PROP_PERM_READ;
3270                         if (range != NULL)
3271                                 range->mpr_count = 0;
3272                 } else if (range != NULL) {
3273                         if (mip->mi_state_flags & MIS_IS_VNIC)
3274                                 mh = mac_get_lower_mac_handle(mh);
3275                         mip = (mac_impl_t *)mh;
3276                         if ((id == MAC_PROP_RXRINGSRANGE &&
3277                             mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) ||
3278                             (id == MAC_PROP_TXRINGSRANGE &&
3279                             mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC)) {
3280                                 if (id == MAC_PROP_RXRINGSRANGE) {
3281                                         if ((mac_rxhwlnksavail_get(mh) +
3282                                             mac_rxhwlnksrsvd_get(mh)) <= 1) {
3283                                                 /*
3284                                                  * doesn't support groups or
3285                                                  * rings
3286                                                  */
3287                                                 range->mpr_count = 0;
3288                                         } else {
3289                                                 /*
3290                                                  * supports specifying groups,
3291                                                  * but not rings
3292                                                  */
3293                                                 _mac_set_range(range, 0, 0);
3294                                         }
3295                                 } else {
3296                                         if ((mac_txhwlnksavail_get(mh) +
3297                                             mac_txhwlnksrsvd_get(mh)) <= 1) {
3298                                                 /*
3299                                                  * doesn't support groups or
3300                                                  * rings
3301                                                  */
3302                                                 range->mpr_count = 0;
3303                                         } else {
3304                                                 /*
3305                                                  * supports specifying groups,
3306                                                  * but not rings
3307                                                  */
3308                                                 _mac_set_range(range, 0, 0);
3309                                         }
3310                                 }
3311                         } else {
3312                                 max = id == MAC_PROP_RXRINGSRANGE ?
3313                                     mac_rxavail_get(mh) + mac_rxrsvd_get(mh) :
3314                                     mac_txavail_get(mh) + mac_txrsvd_get(mh);
3315                                 if (max <= 1) {
3316                                         /*
3317                                          * doesn't support groups or
3318                                          * rings
3319                                          */
3320                                         range->mpr_count = 0;
3321                                 } else  {
3322                                         /*
3323                                          * -1 because we have to leave out the
3324                                          * default ring.
3325                                          */
3326                                         _mac_set_range(range, 1, max - 1);
3327                                 }
3328                         }
3329                 }
3330                 return (0);
3331 
3332         case MAC_PROP_STATUS:
3333                 if (perm != NULL)
3334                         *perm = MAC_PROP_PERM_READ;
3335                 return (0);
3336         }
3337 
3338         /*
3339          * Get the property info from the driver if it implements the
3340          * property info entry point.
3341          */
3342         bzero(&state, sizeof (state));
3343 
3344         if (mip->mi_callbacks->mc_callbacks & MC_PROPINFO) {
3345                 state.pr_default = default_val;
3346                 state.pr_default_size = default_size;
3347 
                /*
                 * The caller specifies the maximum number of ranges
                 * it can accommodate using mpr_count. We don't touch
                 * this value until the driver returns from its
                 * mc_propinfo() callback, and ensure we don't exceed
                 * this number of ranges as the driver defines
                 * supported ranges from its mc_propinfo().
                 *
                 * pr_range_cur_count keeps track of how many ranges
                 * were defined by the driver from its mc_propinfo()
                 * entry point.
                 *
                 * On exit, the user-specified range mpr_count returns
                 * the number of ranges specified by the driver on
                 * success, or the number of ranges it wanted to
                 * define if that number of ranges could not be
                 * accommodated by the specified range structure.  In
                 * the latter case, the caller will be able to
                 * allocate a larger range structure, and query the
                 * property again.
                 */
3369                 state.pr_range_cur_count = 0;
3370                 state.pr_range = range;
3371 
3372                 mip->mi_callbacks->mc_propinfo(mip->mi_driver, name, id,
3373                     (mac_prop_info_handle_t)&state);
3374 
3375                 if (state.pr_flags & MAC_PROP_INFO_RANGE)
3376                         range->mpr_count = state.pr_range_cur_count;
3377 
3378                 /*
3379                  * The operation could fail if the buffer supplied by
3380                  * the user was too small for the range or default
3381                  * value of the property.
3382                  */
3383                 if (state.pr_errno != 0)
3384                         return (state.pr_errno);
3385 
3386                 if (perm != NULL && state.pr_flags & MAC_PROP_INFO_PERM)
3387                         *perm = state.pr_perm;
3388         }
3389 
3390         /*
3391          * The MAC layer may want to provide default values or allowed
3392          * ranges for properties if the driver does not provide a
3393          * property info entry point, or that entry point exists, but
3394          * it did not provide a default value or allowed ranges for
3395          * that property.
3396          */
3397         switch (id) {
3398         case MAC_PROP_MTU: {
3399                 uint32_t sdu;
3400 
3401                 mac_sdu_get2(mh, NULL, &sdu, NULL);
3402 
3403                 if (range != NULL && !(state.pr_flags &
3404                     MAC_PROP_INFO_RANGE)) {
3405                         /* MTU range */
3406                         _mac_set_range(range, sdu, sdu);
3407                 }
3408 
3409                 if (default_val != NULL && !(state.pr_flags &
3410                     MAC_PROP_INFO_DEFAULT)) {
3411                         if (mip->mi_info.mi_media == DL_ETHER)
3412                                 sdu = ETHERMTU;
3413                         /* default MTU value */
3414                         bcopy(&sdu, default_val, sizeof (sdu));
3415                 }
3416         }
3417         }
3418 
3419         return (0);
3420 }
3421 
3422 int
3423 mac_fastpath_disable(mac_handle_t mh)
3424 {
3425         mac_impl_t      *mip = (mac_impl_t *)mh;
3426 
3427         if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3428                 return (0);
3429 
3430         return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
3431 }
3432 
3433 void
3434 mac_fastpath_enable(mac_handle_t mh)
3435 {
3436         mac_impl_t      *mip = (mac_impl_t *)mh;
3437 
3438         if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3439                 return;
3440 
3441         mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
3442 }
3443 
3444 void
3445 mac_register_priv_prop(mac_impl_t *mip, char **priv_props)
3446 {
3447         uint_t nprops, i;
3448 
3449         if (priv_props == NULL)
3450                 return;
3451 
3452         nprops = 0;
3453         while (priv_props[nprops] != NULL)
3454                 nprops++;
3455         if (nprops == 0)
3456                 return;
3457 
3458 
3459         mip->mi_priv_prop = kmem_zalloc(nprops * sizeof (char *), KM_SLEEP);
3460 
3461         for (i = 0; i < nprops; i++) {
3462                 mip->mi_priv_prop[i] = kmem_zalloc(MAXLINKPROPNAME, KM_SLEEP);
3463                 (void) strlcpy(mip->mi_priv_prop[i], priv_props[i],
3464                     MAXLINKPROPNAME);
3465         }
3466 
3467         mip->mi_priv_prop_count = nprops;
3468 }
3469 
3470 void
3471 mac_unregister_priv_prop(mac_impl_t *mip)
3472 {
3473         uint_t i;
3474 
3475         if (mip->mi_priv_prop_count == 0) {
3476                 ASSERT(mip->mi_priv_prop == NULL);
3477                 return;
3478         }
3479 
3480         for (i = 0; i < mip->mi_priv_prop_count; i++)
3481                 kmem_free(mip->mi_priv_prop[i], MAXLINKPROPNAME);
3482         kmem_free(mip->mi_priv_prop, mip->mi_priv_prop_count *
3483             sizeof (char *));
3484 
3485         mip->mi_priv_prop = NULL;
3486         mip->mi_priv_prop_count = 0;
3487 }
3488 
3489 /*
3490  * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
3491  * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
3492  * cases if MAC free's the ring structure after mac_stop_ring(), any
3493  * illegal access to the ring structure coming from the driver will panic
3494  * the system. In order to protect the system from such inadverent access,
3495  * we maintain a cache of rings in the mac_impl_t after they get free'd up.
3496  * When packets are received on free'd up rings, MAC (through the generation
3497  * count mechanism) will drop such packets.
3498  */
3499 static mac_ring_t *
3500 mac_ring_alloc(mac_impl_t *mip)
3501 {
3502         mac_ring_t *ring;
3503 
3504         mutex_enter(&mip->mi_ring_lock);
3505         if (mip->mi_ring_freelist != NULL) {
3506                 ring = mip->mi_ring_freelist;
3507                 mip->mi_ring_freelist = ring->mr_next;
3508                 bzero(ring, sizeof (mac_ring_t));
3509                 mutex_exit(&mip->mi_ring_lock);
3510         } else {
3511                 mutex_exit(&mip->mi_ring_lock);
3512                 ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
3513         }
3514         ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
3515         return (ring);
3516 }
3517 
3518 static void
3519 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
3520 {
3521         ASSERT(ring->mr_state == MR_FREE);
3522 
3523         mutex_enter(&mip->mi_ring_lock);
3524         ring->mr_state = MR_FREE;
3525         ring->mr_flag = 0;
3526         ring->mr_next = mip->mi_ring_freelist;
3527         ring->mr_mip = NULL;
3528         mip->mi_ring_freelist = ring;
3529         mac_ring_stat_delete(ring);
3530         mutex_exit(&mip->mi_ring_lock);
3531 }
3532 
3533 static void
3534 mac_ring_freeall(mac_impl_t *mip)
3535 {
3536         mac_ring_t *ring_next;
3537         mutex_enter(&mip->mi_ring_lock);
3538         mac_ring_t *ring = mip->mi_ring_freelist;
3539         while (ring != NULL) {
3540                 ring_next = ring->mr_next;
3541                 kmem_cache_free(mac_ring_cache, ring);
3542                 ring = ring_next;
3543         }
3544         mip->mi_ring_freelist = NULL;
3545         mutex_exit(&mip->mi_ring_lock);
3546 }
3547 
3548 int
3549 mac_start_ring(mac_ring_t *ring)
3550 {
3551         int rv = 0;
3552 
3553         ASSERT(ring->mr_state == MR_FREE);
3554 
3555         if (ring->mr_start != NULL) {
3556                 rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
3557                 if (rv != 0)
3558                         return (rv);
3559         }
3560 
3561         ring->mr_state = MR_INUSE;
3562         return (rv);
3563 }
3564 
3565 void
3566 mac_stop_ring(mac_ring_t *ring)
3567 {
3568         ASSERT(ring->mr_state == MR_INUSE);
3569 
3570         if (ring->mr_stop != NULL)
3571                 ring->mr_stop(ring->mr_driver);
3572 
3573         ring->mr_state = MR_FREE;
3574 
3575         /*
3576          * Increment the ring generation number for this ring.
3577          */
3578         ring->mr_gen_num++;
3579 }
3580 
3581 int
3582 mac_start_group(mac_group_t *group)
3583 {
3584         int rv = 0;
3585 
3586         if (group->mrg_start != NULL)
3587                 rv = group->mrg_start(group->mrg_driver);
3588 
3589         return (rv);
3590 }
3591 
3592 void
3593 mac_stop_group(mac_group_t *group)
3594 {
3595         if (group->mrg_stop != NULL)
3596                 group->mrg_stop(group->mrg_driver);
3597 }
3598 
3599 /*
3600  * Called from mac_start() on the default Rx group. Broadcast and multicast
3601  * packets are received only on the default group. Hence the default group
3602  * needs to be up even if the primary client is not up, for the other groups
3603  * to be functional. We do this by calling this function at mac_start time
3604  * itself. However the broadcast packets that are received can't make their
3605  * way beyond mac_rx until a mac client creates a broadcast flow.
3606  */
3607 static int
3608 mac_start_group_and_rings(mac_group_t *group)
3609 {
3610         mac_ring_t      *ring;
3611         int             rv = 0;
3612 
3613         ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
3614         if ((rv = mac_start_group(group)) != 0)
3615                 return (rv);
3616 
3617         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3618                 ASSERT(ring->mr_state == MR_FREE);
3619                 if ((rv = mac_start_ring(ring)) != 0)
3620                         goto error;
3621                 ring->mr_classify_type = MAC_SW_CLASSIFIER;
3622         }
3623         return (0);
3624 
3625 error:
3626         mac_stop_group_and_rings(group);
3627         return (rv);
3628 }
3629 
3630 /* Called from mac_stop on the default Rx group */
3631 static void
3632 mac_stop_group_and_rings(mac_group_t *group)
3633 {
3634         mac_ring_t      *ring;
3635 
3636         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3637                 if (ring->mr_state != MR_FREE) {
3638                         mac_stop_ring(ring);
3639                         ring->mr_flag = 0;
3640                         ring->mr_classify_type = MAC_NO_CLASSIFIER;
3641                 }
3642         }
3643         mac_stop_group(group);
3644 }
3645 
3646 
3647 static mac_ring_t *
3648 mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
3649     mac_capab_rings_t *cap_rings)
3650 {
3651         mac_ring_t *ring, *rnext;
3652         mac_ring_info_t ring_info;
3653         ddi_intr_handle_t ddi_handle;
3654 
3655         ring = mac_ring_alloc(mip);
3656 
3657         /* Prepare basic information of ring */
3658 
3659         /*
3660          * Ring index is numbered to be unique across a particular device.
3661          * Ring index computation makes following assumptions:
3662          *      - For drivers with static grouping (e.g. ixgbe, bge),
3663          *      ring index exchanged with the driver (e.g. during mr_rget)
3664          *      is unique only across the group the ring belongs to.
3665          *      - Drivers with dynamic grouping (e.g. nxge), start
3666          *      with single group (mrg_index = 0).
3667          */
3668         ring->mr_index = group->mrg_index * group->mrg_info.mgi_count + index;
3669         ring->mr_type = group->mrg_type;
3670         ring->mr_gh = (mac_group_handle_t)group;
3671 
3672         /* Insert the new ring to the list. */
3673         ring->mr_next = group->mrg_rings;
3674         group->mrg_rings = ring;
3675 
3676         /* Zero to reuse the info data structure */
3677         bzero(&ring_info, sizeof (ring_info));
3678 
3679         /* Query ring information from driver */
3680         cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
3681             index, &ring_info, (mac_ring_handle_t)ring);
3682 
3683         ring->mr_info = ring_info;
3684 
3685         /*
3686          * The interrupt handle could be shared among multiple rings.
3687          * Thus if there is a bunch of rings that are sharing an
3688          * interrupt, then only one ring among the bunch will be made
3689          * available for interrupt re-targeting; the rest will have
3690          * ddi_shared flag set to TRUE and would not be available for
3691          * be interrupt re-targeting.
3692          */
3693         if ((ddi_handle = ring_info.mri_intr.mi_ddi_handle) != NULL) {
3694                 rnext = ring->mr_next;
3695                 while (rnext != NULL) {
3696                         if (rnext->mr_info.mri_intr.mi_ddi_handle ==
3697                             ddi_handle) {
3698                                 /*
3699                                  * If default ring (mr_index == 0) is part
3700                                  * of a group of rings sharing an
3701                                  * interrupt, then set ddi_shared flag for
3702                                  * the default ring and give another ring
3703                                  * the chance to be re-targeted.
3704                                  */
3705                                 if (rnext->mr_index == 0 &&
3706                                     !rnext->mr_info.mri_intr.mi_ddi_shared) {
3707                                         rnext->mr_info.mri_intr.mi_ddi_shared =
3708                                             B_TRUE;
3709                                 } else {
3710                                         ring->mr_info.mri_intr.mi_ddi_shared =
3711                                             B_TRUE;
3712                                 }
3713                                 break;
3714                         }
3715                         rnext = rnext->mr_next;
3716                 }
3717                 /*
3718                  * If rnext is NULL, then no matching ddi_handle was found.
3719                  * Rx rings get registered first. So if this is a Tx ring,
3720                  * then go through all the Rx rings and see if there is a
3721                  * matching ddi handle.
3722                  */
3723                 if (rnext == NULL && ring->mr_type == MAC_RING_TYPE_TX) {
3724                         mac_compare_ddi_handle(mip->mi_rx_groups,
3725                             mip->mi_rx_group_count, ring);
3726                 }
3727         }
3728 
3729         /* Update ring's status */
3730         ring->mr_state = MR_FREE;
3731         ring->mr_flag = 0;
3732 
3733         /* Update the ring count of the group */
3734         group->mrg_cur_count++;
3735 
3736         /* Create per ring kstats */
3737         if (ring->mr_stat != NULL) {
3738                 ring->mr_mip = mip;
3739                 mac_ring_stat_create(ring);
3740         }
3741 
3742         return (ring);
3743 }
3744 
3745 /*
3746  * Rings are chained together for easy regrouping.
3747  */
3748 static void
3749 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3750     mac_capab_rings_t *cap_rings)
3751 {
3752         int index;
3753 
3754         /*
3755          * Initialize all ring members of this group. Size of zero will not
3756          * enter the loop, so it's safe for initializing an empty group.
3757          */
3758         for (index = size - 1; index >= 0; index--)
3759                 (void) mac_init_ring(mip, group, index, cap_rings);
3760 }
3761 
3762 int
3763 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3764 {
3765         mac_capab_rings_t       *cap_rings;
3766         mac_group_t             *group;
3767         mac_group_t             *groups;
3768         mac_group_info_t        group_info;
3769         uint_t                  group_free = 0;
3770         uint_t                  ring_left;
3771         mac_ring_t              *ring;
3772         int                     g;
3773         int                     err = 0;
3774         uint_t                  grpcnt;
3775         boolean_t               pseudo_txgrp = B_FALSE;
3776 
3777         switch (rtype) {
3778         case MAC_RING_TYPE_RX:
3779                 ASSERT(mip->mi_rx_groups == NULL);
3780 
3781                 cap_rings = &mip->mi_rx_rings_cap;
3782                 cap_rings->mr_type = MAC_RING_TYPE_RX;
3783                 break;
3784         case MAC_RING_TYPE_TX:
3785                 ASSERT(mip->mi_tx_groups == NULL);
3786 
3787                 cap_rings = &mip->mi_tx_rings_cap;
3788                 cap_rings->mr_type = MAC_RING_TYPE_TX;
3789                 break;
3790         default:
3791                 ASSERT(B_FALSE);
3792         }
3793 
3794         if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, cap_rings))
3795                 return (0);
3796         grpcnt = cap_rings->mr_gnum;
3797 
3798         /*
3799          * If we have multiple TX rings, but only one TX group, we can
3800          * create pseudo TX groups (one per TX ring) in the MAC layer,
3801          * except for an aggr. For an aggr currently we maintain only
3802          * one group with all the rings (for all its ports), going
3803          * forwards we might change this.
3804          */
3805         if (rtype == MAC_RING_TYPE_TX &&
3806             cap_rings->mr_gnum == 0 && cap_rings->mr_rnum >  0 &&
3807             (mip->mi_state_flags & MIS_IS_AGGR) == 0) {
3808                 /*
3809                  * The -1 here is because we create a default TX group
3810                  * with all the rings in it.
3811                  */
3812                 grpcnt = cap_rings->mr_rnum - 1;
3813                 pseudo_txgrp = B_TRUE;
3814         }
3815 
3816         /*
3817          * Allocate a contiguous buffer for all groups.
3818          */
3819         groups = kmem_zalloc(sizeof (mac_group_t) * (grpcnt+ 1), KM_SLEEP);
3820 
3821         ring_left = cap_rings->mr_rnum;
3822 
3823         /*
3824          * Get all ring groups if any, and get their ring members
3825          * if any.
3826          */
3827         for (g = 0; g < grpcnt; g++) {
3828                 group = groups + g;
3829 
3830                 /* Prepare basic information of the group */
3831                 group->mrg_index = g;
3832                 group->mrg_type = rtype;
3833                 group->mrg_state = MAC_GROUP_STATE_UNINIT;
3834                 group->mrg_mh = (mac_handle_t)mip;
3835                 group->mrg_next = group + 1;
3836 
3837                 /* Zero to reuse the info data structure */
3838                 bzero(&group_info, sizeof (group_info));
3839 
3840                 if (pseudo_txgrp) {
3841                         /*
3842                          * This is a pseudo group that we created, apart
3843                          * from setting the state there is nothing to be
3844                          * done.
3845                          */
3846                         group->mrg_state = MAC_GROUP_STATE_REGISTERED;
3847                         group_free++;
3848                         continue;
3849                 }
3850                 /* Query group information from driver */
3851                 cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3852                     (mac_group_handle_t)group);
3853 
3854                 switch (cap_rings->mr_group_type) {
3855                 case MAC_GROUP_TYPE_DYNAMIC:
3856                         if (cap_rings->mr_gaddring == NULL ||
3857                             cap_rings->mr_gremring == NULL) {
3858                                 DTRACE_PROBE3(
3859                                     mac__init__rings_no_addremring,
3860                                     char *, mip->mi_name,
3861                                     mac_group_add_ring_t,
3862                                     cap_rings->mr_gaddring,
3863                                     mac_group_add_ring_t,
3864                                     cap_rings->mr_gremring);
3865                                 err = EINVAL;
3866                                 goto bail;
3867                         }
3868 
3869                         switch (rtype) {
3870                         case MAC_RING_TYPE_RX:
3871                                 /*
3872                                  * The first RX group must have non-zero
3873                                  * rings, and the following groups must
3874                                  * have zero rings.
3875                                  */
3876                                 if (g == 0 && group_info.mgi_count == 0) {
3877                                         DTRACE_PROBE1(
3878                                             mac__init__rings__rx__def__zero,
3879                                             char *, mip->mi_name);
3880                                         err = EINVAL;
3881                                         goto bail;
3882                                 }
3883                                 if (g > 0 && group_info.mgi_count != 0) {
3884                                         DTRACE_PROBE3(
3885                                             mac__init__rings__rx__nonzero,
3886                                             char *, mip->mi_name,
3887                                             int, g, int, group_info.mgi_count);
3888                                         err = EINVAL;
3889                                         goto bail;
3890                                 }
3891                                 break;
3892                         case MAC_RING_TYPE_TX:
3893                                 /*
3894                                  * All TX ring groups must have zero rings.
3895                                  */
3896                                 if (group_info.mgi_count != 0) {
3897                                         DTRACE_PROBE3(
3898                                             mac__init__rings__tx__nonzero,
3899                                             char *, mip->mi_name,
3900                                             int, g, int, group_info.mgi_count);
3901                                         err = EINVAL;
3902                                         goto bail;
3903                                 }
3904                                 break;
3905                         }
3906                         break;
3907                 case MAC_GROUP_TYPE_STATIC:
3908                         /*
3909                          * Note that an empty group is allowed, e.g., an aggr
3910                          * would start with an empty group.
3911                          */
3912                         break;
3913                 default:
3914                         /* unknown group type */
3915                         DTRACE_PROBE2(mac__init__rings__unknown__type,
3916                             char *, mip->mi_name,
3917                             int, cap_rings->mr_group_type);
3918                         err = EINVAL;
3919                         goto bail;
3920                 }
3921 
3922 
3923                 /*
3924                  * Driver must register group->mgi_addmac/remmac() for rx groups
3925                  * to support multiple MAC addresses.
3926                  */
3927                 if (rtype == MAC_RING_TYPE_RX) {
3928                         if ((group_info.mgi_addmac == NULL) ||
3929                             (group_info.mgi_addmac == NULL)) {
3930                                 goto bail;
3931                         }
3932                 }
3933 
3934                 /* Cache driver-supplied information */
3935                 group->mrg_info = group_info;
3936 
3937                 /* Update the group's status and group count. */
3938                 mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
3939                 group_free++;
3940 
3941                 group->mrg_rings = NULL;
3942                 group->mrg_cur_count = 0;
3943                 mac_init_group(mip, group, group_info.mgi_count, cap_rings);
3944                 ring_left -= group_info.mgi_count;
3945 
3946                 /* The current group size should be equal to default value */
3947                 ASSERT(group->mrg_cur_count == group_info.mgi_count);
3948         }
3949 
3950         /* Build up a dummy group for free resources as a pool */
3951         group = groups + grpcnt;
3952 
3953         /* Prepare basic information of the group */
3954         group->mrg_index = -1;
3955         group->mrg_type = rtype;
3956         group->mrg_state = MAC_GROUP_STATE_UNINIT;
3957         group->mrg_mh = (mac_handle_t)mip;
3958         group->mrg_next = NULL;
3959 
3960         /*
3961          * If there are ungrouped rings, allocate a continuous buffer for
3962          * remaining resources.
3963          */
3964         if (ring_left != 0) {
3965                 group->mrg_rings = NULL;
3966                 group->mrg_cur_count = 0;
3967                 mac_init_group(mip, group, ring_left, cap_rings);
3968 
3969                 /* The current group size should be equal to ring_left */
3970                 ASSERT(group->mrg_cur_count == ring_left);
3971 
3972                 ring_left = 0;
3973 
3974                 /* Update this group's status */
3975                 mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
3976         } else
3977                 group->mrg_rings = NULL;
3978 
3979         ASSERT(ring_left == 0);
3980 
3981 bail:
3982 
3983         /* Cache other important information to finalize the initialization */
3984         switch (rtype) {
3985         case MAC_RING_TYPE_RX:
3986                 mip->mi_rx_group_type = cap_rings->mr_group_type;
3987                 mip->mi_rx_group_count = cap_rings->mr_gnum;
3988                 mip->mi_rx_groups = groups;
3989                 mip->mi_rx_donor_grp = groups;
3990                 if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
3991                         /*
3992                          * The default ring is reserved since it is
3993                          * used for sending the broadcast etc. packets.
3994                          */
3995                         mip->mi_rxrings_avail =
3996                             mip->mi_rx_groups->mrg_cur_count - 1;
3997                         mip->mi_rxrings_rsvd = 1;
3998                 }
3999                 /*
4000                  * The default group cannot be reserved. It is used by
4001                  * all the clients that do not have an exclusive group.
4002                  */
4003                 mip->mi_rxhwclnt_avail = mip->mi_rx_group_count - 1;
4004                 mip->mi_rxhwclnt_used = 1;
4005                 break;
4006         case MAC_RING_TYPE_TX:
4007                 mip->mi_tx_group_type = pseudo_txgrp ? MAC_GROUP_TYPE_DYNAMIC :
4008                     cap_rings->mr_group_type;
4009                 mip->mi_tx_group_count = grpcnt;
4010                 mip->mi_tx_group_free = group_free;
4011                 mip->mi_tx_groups = groups;
4012 
4013                 group = groups + grpcnt;
4014                 ring = group->mrg_rings;
4015                 /*
4016                  * The ring can be NULL in the case of aggr. Aggr will
4017                  * have an empty Tx group which will get populated
4018                  * later when pseudo Tx rings are added after
4019                  * mac_register() is done.
4020                  */
4021                 if (ring == NULL) {
4022                         ASSERT(mip->mi_state_flags & MIS_IS_AGGR);
4023                         /*
4024                          * pass the group to aggr so it can add Tx
4025                          * rings to the group later.
4026                          */
4027                         cap_rings->mr_gget(mip->mi_driver, rtype, 0, NULL,
4028                             (mac_group_handle_t)group);
4029                         /*
4030                          * Even though there are no rings at this time
4031                          * (rings will come later), set the group
4032                          * state to registered.
4033                          */
4034                         group->mrg_state = MAC_GROUP_STATE_REGISTERED;
4035                 } else {
4036                         /*
4037                          * Ring 0 is used as the default one and it could be
4038                          * assigned to a client as well.
4039                          */
4040                         while ((ring->mr_index != 0) && (ring->mr_next != NULL))
4041                                 ring = ring->mr_next;
4042                         ASSERT(ring->mr_index == 0);
4043                         mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4044                 }
4045                 if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC)
4046                         mip->mi_txrings_avail = group->mrg_cur_count - 1;
4047                         /*
4048                          * The default ring cannot be reserved.
4049                          */
4050                         mip->mi_txrings_rsvd = 1;
4051                 /*
4052                  * The default group cannot be reserved. It will be shared
4053                  * by clients that do not have an exclusive group.
4054                  */
4055                 mip->mi_txhwclnt_avail = mip->mi_tx_group_count;
4056                 mip->mi_txhwclnt_used = 1;
4057                 break;
4058         default:
4059                 ASSERT(B_FALSE);
4060         }
4061 
4062         if (err != 0)
4063                 mac_free_rings(mip, rtype);
4064 
4065         return (err);
4066 }
4067 
4068 /*
4069  * The ddi interrupt handle could be shared amoung rings. If so, compare
4070  * the new ring's ddi handle with the existing ones and set ddi_shared
4071  * flag.
4072  */
4073 void
4074 mac_compare_ddi_handle(mac_group_t *groups, uint_t grpcnt, mac_ring_t *cring)
4075 {
4076         mac_group_t *group;
4077         mac_ring_t *ring;
4078         ddi_intr_handle_t ddi_handle;
4079         int g;
4080 
4081         ddi_handle = cring->mr_info.mri_intr.mi_ddi_handle;
4082         for (g = 0; g < grpcnt; g++) {
4083                 group = groups + g;
4084                 for (ring = group->mrg_rings; ring != NULL;
4085                     ring = ring->mr_next) {
4086                         if (ring == cring)
4087                                 continue;
4088                         if (ring->mr_info.mri_intr.mi_ddi_handle ==
4089                             ddi_handle) {
4090                                 if (cring->mr_type == MAC_RING_TYPE_RX &&
4091                                     ring->mr_index == 0 &&
4092                                     !ring->mr_info.mri_intr.mi_ddi_shared) {
4093                                         ring->mr_info.mri_intr.mi_ddi_shared =
4094                                             B_TRUE;
4095                                 } else {
4096                                         cring->mr_info.mri_intr.mi_ddi_shared =
4097                                             B_TRUE;
4098                                 }
4099                                 return;
4100                         }
4101                 }
4102         }
4103 }
4104 
4105 /*
4106  * Called to free all groups of particular type (RX or TX). It's assumed that
4107  * no clients are using these groups.
4108  */
4109 void
4110 mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
4111 {
4112         mac_group_t *group, *groups;
4113         uint_t group_count;
4114 
4115         switch (rtype) {
4116         case MAC_RING_TYPE_RX:
4117                 if (mip->mi_rx_groups == NULL)
4118                         return;
4119 
4120                 groups = mip->mi_rx_groups;
4121                 group_count = mip->mi_rx_group_count;
4122 
4123                 mip->mi_rx_groups = NULL;
4124                 mip->mi_rx_donor_grp = NULL;
4125                 mip->mi_rx_group_count = 0;
4126                 break;
4127         case MAC_RING_TYPE_TX:
4128                 ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
4129 
4130                 if (mip->mi_tx_groups == NULL)
4131                         return;
4132 
4133                 groups = mip->mi_tx_groups;
4134                 group_count = mip->mi_tx_group_count;
4135 
4136                 mip->mi_tx_groups = NULL;
4137                 mip->mi_tx_group_count = 0;
4138                 mip->mi_tx_group_free = 0;
4139                 mip->mi_default_tx_ring = NULL;
4140                 break;
4141         default:
4142                 ASSERT(B_FALSE);
4143         }
4144 
4145         for (group = groups; group != NULL; group = group->mrg_next) {
4146                 mac_ring_t *ring;
4147 
4148                 if (group->mrg_cur_count == 0)
4149                         continue;
4150 
4151                 ASSERT(group->mrg_rings != NULL);
4152 
4153                 while ((ring = group->mrg_rings) != NULL) {
4154                         group->mrg_rings = ring->mr_next;
4155                         mac_ring_free(mip, ring);
4156                 }
4157         }
4158 
4159         /* Free all the cached rings */
4160         mac_ring_freeall(mip);
4161         /* Free the block of group data strutures */
4162         kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
4163 }
4164 
4165 /*
4166  * Associate a MAC address with a receive group.
4167  *
4168  * The return value of this function should always be checked properly, because
4169  * any type of failure could cause unexpected results. A group can be added
4170  * or removed with a MAC address only after it has been reserved. Ideally,
4171  * a successful reservation always leads to calling mac_group_addmac() to
4172  * steer desired traffic. Failure of adding an unicast MAC address doesn't
4173  * always imply that the group is functioning abnormally.
4174  *
4175  * Currently this function is called everywhere, and it reflects assumptions
4176  * about MAC addresses in the implementation. CR 6735196.
4177  */
4178 int
4179 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
4180 {
4181         ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4182         ASSERT(group->mrg_info.mgi_addmac != NULL);
4183 
4184         return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
4185 }
4186 
4187 /*
4188  * Remove the association between MAC address and receive group.
4189  */
4190 int
4191 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
4192 {
4193         ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
4194         ASSERT(group->mrg_info.mgi_remmac != NULL);
4195 
4196         return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
4197 }
4198 
4199 /*
4200  * This is the entry point for packets transmitted through the bridging code.
4201  * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh'
4202  * pointer may be NULL to select the default ring.
4203  */
4204 mblk_t *
4205 mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
4206 {
4207         mac_handle_t mh;
4208 
4209         /*
4210          * Once we take a reference on the bridge link, the bridge
4211          * module itself can't unload, so the callback pointers are
4212          * stable.
4213          */
4214         mutex_enter(&mip->mi_bridge_lock);
4215         if ((mh = mip->mi_bridge_link) != NULL)
4216                 mac_bridge_ref_cb(mh, B_TRUE);
4217         mutex_exit(&mip->mi_bridge_lock);
4218         if (mh == NULL) {
4219                 MAC_RING_TX(mip, rh, mp, mp);
4220         } else {
4221                 mp = mac_bridge_tx_cb(mh, rh, mp);
4222                 mac_bridge_ref_cb(mh, B_FALSE);
4223         }
4224 
4225         return (mp);
4226 }
4227 
4228 /*
4229  * Find a ring from its index.
4230  */
4231 mac_ring_handle_t
4232 mac_find_ring(mac_group_handle_t gh, int index)
4233 {
4234         mac_group_t *group = (mac_group_t *)gh;
4235         mac_ring_t *ring = group->mrg_rings;
4236 
4237         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
4238                 if (ring->mr_index == index)
4239                         break;
4240 
4241         return ((mac_ring_handle_t)ring);
4242 }
4243 /*
4244  * Add a ring to an existing group.
4245  *
4246  * The ring must be either passed directly (for example if the ring
4247  * movement is initiated by the framework), or specified through a driver
4248  * index (for example when the ring is added by the driver.
4249  *
4250  * The caller needs to call mac_perim_enter() before calling this function.
4251  */
4252 int
4253 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
4254 {
4255         mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4256         mac_capab_rings_t *cap_rings;
4257         boolean_t driver_call = (ring == NULL);
4258         mac_group_type_t group_type;
4259         int ret = 0;
4260         flow_entry_t *flent;
4261 
4262         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4263 
4264         switch (group->mrg_type) {
4265         case MAC_RING_TYPE_RX:
4266                 cap_rings = &mip->mi_rx_rings_cap;
4267                 group_type = mip->mi_rx_group_type;
4268                 break;
4269         case MAC_RING_TYPE_TX:
4270                 cap_rings = &mip->mi_tx_rings_cap;
4271                 group_type = mip->mi_tx_group_type;
4272                 break;
4273         default:
4274                 ASSERT(B_FALSE);
4275         }
4276 
4277         /*
4278          * There should be no ring with the same ring index in the target
4279          * group.
4280          */
4281         ASSERT(mac_find_ring((mac_group_handle_t)group,
4282             driver_call ? index : ring->mr_index) == NULL);
4283 
4284         if (driver_call) {
4285                 /*
4286                  * The function is called as a result of a request from
4287                  * a driver to add a ring to an existing group, for example
4288                  * from the aggregation driver. Allocate a new mac_ring_t
4289                  * for that ring.
4290                  */
4291                 ring = mac_init_ring(mip, group, index, cap_rings);
4292                 ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
4293         } else {
4294                 /*
4295                  * The function is called as a result of a MAC layer request
4296                  * to add a ring to an existing group. In this case the
4297                  * ring is being moved between groups, which requires
4298                  * the underlying driver to support dynamic grouping,
4299                  * and the mac_ring_t already exists.
4300                  */
4301                 ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4302                 ASSERT(group->mrg_driver == NULL ||
4303                     cap_rings->mr_gaddring != NULL);
4304                 ASSERT(ring->mr_gh == NULL);
4305         }
4306 
4307         /*
4308          * At this point the ring should not be in use, and it should be
4309          * of the right for the target group.
4310          */
4311         ASSERT(ring->mr_state < MR_INUSE);
4312         ASSERT(ring->mr_srs == NULL);
4313         ASSERT(ring->mr_type == group->mrg_type);
4314 
4315         if (!driver_call) {
4316                 /*
4317                  * Add the driver level hardware ring if the process was not
4318                  * initiated by the driver, and the target group is not the
4319                  * group.
4320                  */
4321                 if (group->mrg_driver != NULL) {
4322                         cap_rings->mr_gaddring(group->mrg_driver,
4323                             ring->mr_driver, ring->mr_type);
4324                 }
4325 
4326                 /*
4327                  * Insert the ring ahead existing rings.
4328                  */
4329                 ring->mr_next = group->mrg_rings;
4330                 group->mrg_rings = ring;
4331                 ring->mr_gh = (mac_group_handle_t)group;
4332                 group->mrg_cur_count++;
4333         }
4334 
4335         /*
4336          * If the group has not been actively used, we're done.
4337          */
4338         if (group->mrg_index != -1 &&
4339             group->mrg_state < MAC_GROUP_STATE_RESERVED)
4340                 return (0);
4341 
4342         /*
4343          * Start the ring if needed. Failure causes to undo the grouping action.
4344          */
4345         if (ring->mr_state != MR_INUSE) {
4346                 if ((ret = mac_start_ring(ring)) != 0) {
4347                         if (!driver_call) {
4348                                 cap_rings->mr_gremring(group->mrg_driver,
4349                                     ring->mr_driver, ring->mr_type);
4350                         }
4351                         group->mrg_cur_count--;
4352                         group->mrg_rings = ring->mr_next;
4353 
4354                         ring->mr_gh = NULL;
4355 
4356                         if (driver_call)
4357                                 mac_ring_free(mip, ring);
4358 
4359                         return (ret);
4360                 }
4361         }
4362 
4363         /*
4364          * Set up SRS/SR according to the ring type.
4365          */
4366         switch (ring->mr_type) {
4367         case MAC_RING_TYPE_RX:
4368                 /*
4369                  * Setup SRS on top of the new ring if the group is
4370                  * reserved for someones exclusive use.
4371                  */
4372                 if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
4373                         mac_client_impl_t *mcip;
4374 
4375                         mcip = MAC_GROUP_ONLY_CLIENT(group);
4376                         /*
4377                          * Even though this group is reserved we migth still
4378                          * have multiple clients, i.e a VLAN shares the
4379                          * group with the primary mac client.
4380                          */
4381                         if (mcip != NULL) {
4382                                 flent = mcip->mci_flent;
4383                                 ASSERT(flent->fe_rx_srs_cnt > 0);
4384                                 mac_rx_srs_group_setup(mcip, flent, SRST_LINK);
4385                                 mac_fanout_setup(mcip, flent,
4386                                     MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
4387                                     mcip, NULL, NULL);
4388                         } else {
4389                                 ring->mr_classify_type = MAC_SW_CLASSIFIER;
4390                         }
4391                 }
4392                 break;
4393         case MAC_RING_TYPE_TX:
4394         {
4395                 mac_grp_client_t        *mgcp = group->mrg_clients;
4396                 mac_client_impl_t       *mcip;
4397                 mac_soft_ring_set_t     *mac_srs;
4398                 mac_srs_tx_t            *tx;
4399 
4400                 if (MAC_GROUP_NO_CLIENT(group)) {
4401                         if (ring->mr_state == MR_INUSE)
4402                                 mac_stop_ring(ring);
4403                         ring->mr_flag = 0;
4404                         break;
4405                 }
4406                 /*
4407                  * If the rings are being moved to a group that has
4408                  * clients using it, then add the new rings to the
4409                  * clients SRS.
4410                  */
4411                 while (mgcp != NULL) {
4412                         boolean_t       is_aggr;
4413 
4414                         mcip = mgcp->mgc_client;
4415                         flent = mcip->mci_flent;
4416                         is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR);
4417                         mac_srs = MCIP_TX_SRS(mcip);
4418                         tx = &mac_srs->srs_tx;
4419                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4420                         /*
4421                          * If we are  growing from 1 to multiple rings.
4422                          */
4423                         if (tx->st_mode == SRS_TX_BW ||
4424                             tx->st_mode == SRS_TX_SERIALIZE ||
4425                             tx->st_mode == SRS_TX_DEFAULT) {
4426                                 mac_ring_t      *tx_ring = tx->st_arg2;
4427 
4428                                 tx->st_arg2 = NULL;
4429                                 mac_tx_srs_stat_recreate(mac_srs, B_TRUE);
4430                                 mac_tx_srs_add_ring(mac_srs, tx_ring);
4431                                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
4432                                         tx->st_mode = is_aggr ? SRS_TX_BW_AGGR :
4433                                             SRS_TX_BW_FANOUT;
4434                                 } else {
4435                                         tx->st_mode = is_aggr ? SRS_TX_AGGR :
4436                                             SRS_TX_FANOUT;
4437                                 }
4438                                 tx->st_func = mac_tx_get_func(tx->st_mode);
4439                         }
4440                         mac_tx_srs_add_ring(mac_srs, ring);
4441                         mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
4442                             mac_rx_deliver, mcip, NULL, NULL);
4443                         mac_tx_client_restart((mac_client_handle_t)mcip);
4444                         mgcp = mgcp->mgc_next;
4445                 }
4446                 break;
4447         }
4448         default:
4449                 ASSERT(B_FALSE);
4450         }
4451         /*
4452          * For aggr, the default ring will be NULL to begin with. If it
4453          * is NULL, then pick the first ring that gets added as the
4454          * default ring. Any ring in an aggregation can be removed at
4455          * any time (by the user action of removing a link) and if the
4456          * current default ring gets removed, then a new one gets
4457          * picked (see i_mac_group_rem_ring()).
4458          */
4459         if (mip->mi_state_flags & MIS_IS_AGGR &&
4460             mip->mi_default_tx_ring == NULL &&
4461             ring->mr_type == MAC_RING_TYPE_TX) {
4462                 mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4463         }
4464 
4465         MAC_RING_UNMARK(ring, MR_INCIPIENT);
4466         return (0);
4467 }
4468 
4469 /*
4470  * Remove a ring from it's current group. MAC internal function for dynamic
4471  * grouping.
4472  *
4473  * The caller needs to call mac_perim_enter() before calling this function.
4474  */
4475 void
4476 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
4477     boolean_t driver_call)
4478 {
4479         mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4480         mac_capab_rings_t *cap_rings = NULL;
4481         mac_group_type_t group_type;
4482 
4483         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4484 
4485         ASSERT(mac_find_ring((mac_group_handle_t)group,
4486             ring->mr_index) == (mac_ring_handle_t)ring);
4487         ASSERT((mac_group_t *)ring->mr_gh == group);
4488         ASSERT(ring->mr_type == group->mrg_type);
4489 
4490         if (ring->mr_state == MR_INUSE)
4491                 mac_stop_ring(ring);
4492         switch (ring->mr_type) {
4493         case MAC_RING_TYPE_RX:
4494                 group_type = mip->mi_rx_group_type;
4495                 cap_rings = &mip->mi_rx_rings_cap;
4496 
4497                 /*
4498                  * Only hardware classified packets hold a reference to the
4499                  * ring all the way up the Rx path. mac_rx_srs_remove()
4500                  * will take care of quiescing the Rx path and removing the
4501                  * SRS. The software classified path neither holds a reference
4502                  * nor any association with the ring in mac_rx.
4503                  */
4504                 if (ring->mr_srs != NULL) {
4505                         mac_rx_srs_remove(ring->mr_srs);
4506                         ring->mr_srs = NULL;
4507                 }
4508 
4509                 break;
4510         case MAC_RING_TYPE_TX:
4511         {
4512                 mac_grp_client_t        *mgcp;
4513                 mac_client_impl_t       *mcip;
4514                 mac_soft_ring_set_t     *mac_srs;
4515                 mac_srs_tx_t            *tx;
4516                 mac_ring_t              *rem_ring;
4517                 mac_group_t             *defgrp;
4518                 uint_t                  ring_info = 0;
4519 
4520                 /*
4521                  * For TX this function is invoked in three
4522                  * cases:
4523                  *
4524                  * 1) In the case of a failure during the
4525                  * initial creation of a group when a share is
4526                  * associated with a MAC client. So the SRS is not
4527                  * yet setup, and will be setup later after the
4528                  * group has been reserved and populated.
4529                  *
4530                  * 2) From mac_release_tx_group() when freeing
4531                  * a TX SRS.
4532                  *
4533                  * 3) In the case of aggr, when a port gets removed,
4534                  * the pseudo Tx rings that it exposed gets removed.
4535                  *
4536                  * In the first two cases the SRS and its soft
4537                  * rings are already quiesced.
4538                  */
4539                 if (driver_call) {
4540                         mac_client_impl_t *mcip;
4541                         mac_soft_ring_set_t *mac_srs;
4542                         mac_soft_ring_t *sringp;
4543                         mac_srs_tx_t *srs_tx;
4544 
4545                         if (mip->mi_state_flags & MIS_IS_AGGR &&
4546                             mip->mi_default_tx_ring ==
4547                             (mac_ring_handle_t)ring) {
4548                                 /* pick a new default Tx ring */
4549                                 mip->mi_default_tx_ring =
4550                                     (group->mrg_rings != ring) ?
4551                                     (mac_ring_handle_t)group->mrg_rings :
4552                                     (mac_ring_handle_t)(ring->mr_next);
4553                         }
4554                         /* Presently only aggr case comes here */
4555                         if (group->mrg_state != MAC_GROUP_STATE_RESERVED)
4556                                 break;
4557 
4558                         mcip = MAC_GROUP_ONLY_CLIENT(group);
4559                         ASSERT(mcip != NULL);
4560                         ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR);
4561                         mac_srs = MCIP_TX_SRS(mcip);
4562                         ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
4563                             mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
4564                         srs_tx = &mac_srs->srs_tx;
4565                         /*
4566                          * Wakeup any callers blocked on this
4567                          * Tx ring due to flow control.
4568                          */
4569                         sringp = srs_tx->st_soft_rings[ring->mr_index];
4570                         ASSERT(sringp != NULL);
4571                         mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)sringp);
4572                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4573                         mac_tx_srs_del_ring(mac_srs, ring);
4574                         mac_tx_client_restart((mac_client_handle_t)mcip);
4575                         break;
4576                 }
4577                 ASSERT(ring != (mac_ring_t *)mip->mi_default_tx_ring);
4578                 group_type = mip->mi_tx_group_type;
4579                 cap_rings = &mip->mi_tx_rings_cap;
4580                 /*
4581                  * See if we need to take it out of the MAC clients using
4582                  * this group
4583                  */
4584                 if (MAC_GROUP_NO_CLIENT(group))
4585                         break;
4586                 mgcp = group->mrg_clients;
4587                 defgrp = MAC_DEFAULT_TX_GROUP(mip);
4588                 while (mgcp != NULL) {
4589                         mcip = mgcp->mgc_client;
4590                         mac_srs = MCIP_TX_SRS(mcip);
4591                         tx = &mac_srs->srs_tx;
4592                         mac_tx_client_quiesce((mac_client_handle_t)mcip);
4593                         /*
4594                          * If we are here when removing rings from the
4595                          * defgroup, mac_reserve_tx_ring would have
4596                          * already deleted the ring from the MAC
4597                          * clients in the group.
4598                          */
4599                         if (group != defgrp) {
4600                                 mac_tx_invoke_callbacks(mcip,
4601                                     (mac_tx_cookie_t)
4602                                     mac_tx_srs_get_soft_ring(mac_srs, ring));
4603                                 mac_tx_srs_del_ring(mac_srs, ring);
4604                         }
4605                         /*
4606                          * Additionally, if  we are left with only
4607                          * one ring in the group after this, we need
4608                          * to modify the mode etc. to. (We haven't
4609                          * yet taken the ring out, so we check with 2).
4610                          */
4611                         if (group->mrg_cur_count == 2) {
4612                                 if (ring->mr_next == NULL)
4613                                         rem_ring = group->mrg_rings;
4614                                 else
4615                                         rem_ring = ring->mr_next;
4616                                 mac_tx_invoke_callbacks(mcip,
4617                                     (mac_tx_cookie_t)
4618                                     mac_tx_srs_get_soft_ring(mac_srs,
4619                                     rem_ring));
4620                                 mac_tx_srs_del_ring(mac_srs, rem_ring);
4621                                 if (rem_ring->mr_state != MR_INUSE) {
4622                                         (void) mac_start_ring(rem_ring);
4623                                 }
4624                                 tx->st_arg2 = (void *)rem_ring;
4625                                 mac_tx_srs_stat_recreate(mac_srs, B_FALSE);
4626                                 ring_info = mac_hwring_getinfo(
4627                                     (mac_ring_handle_t)rem_ring);
4628                                 /*
4629                                  * We are  shrinking from multiple
4630                                  * to 1 ring.
4631                                  */
4632                                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
4633                                         tx->st_mode = SRS_TX_BW;
4634                                 } else if (mac_tx_serialize ||
4635                                     (ring_info & MAC_RING_TX_SERIALIZE)) {
4636                                         tx->st_mode = SRS_TX_SERIALIZE;
4637                                 } else {
4638                                         tx->st_mode = SRS_TX_DEFAULT;
4639                                 }
4640                                 tx->st_func = mac_tx_get_func(tx->st_mode);
4641                         }
4642                         mac_tx_client_restart((mac_client_handle_t)mcip);
4643                         mgcp = mgcp->mgc_next;
4644                 }
4645                 break;
4646         }
4647         default:
4648                 ASSERT(B_FALSE);
4649         }
4650 
4651         /*
4652          * Remove the ring from the group.
4653          */
4654         if (ring == group->mrg_rings)
4655                 group->mrg_rings = ring->mr_next;
4656         else {
4657                 mac_ring_t *pre;
4658 
4659                 pre = group->mrg_rings;
4660                 while (pre->mr_next != ring)
4661                         pre = pre->mr_next;
4662                 pre->mr_next = ring->mr_next;
4663         }
4664         group->mrg_cur_count--;
4665 
4666         if (!driver_call) {
4667                 ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4668                 ASSERT(group->mrg_driver == NULL ||
4669                     cap_rings->mr_gremring != NULL);
4670 
4671                 /*
4672                  * Remove the driver level hardware ring.
4673                  */
4674                 if (group->mrg_driver != NULL) {
4675                         cap_rings->mr_gremring(group->mrg_driver,
4676                             ring->mr_driver, ring->mr_type);
4677                 }
4678         }
4679 
4680         ring->mr_gh = NULL;
4681         if (driver_call)
4682                 mac_ring_free(mip, ring);
4683         else
4684                 ring->mr_flag = 0;
4685 }
4686 
4687 /*
4688  * Move a ring to the target group. If needed, remove the ring from the group
4689  * that it currently belongs to.
4690  *
4691  * The caller need to enter MAC's perimeter by calling mac_perim_enter().
4692  */
4693 static int
4694 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
4695 {
4696         mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
4697         int rv;
4698 
4699         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4700         ASSERT(d_group != NULL);
4701         ASSERT(s_group->mrg_mh == d_group->mrg_mh);
4702 
4703         if (s_group == d_group)
4704                 return (0);
4705 
4706         /*
4707          * Remove it from current group first.
4708          */
4709         if (s_group != NULL)
4710                 i_mac_group_rem_ring(s_group, ring, B_FALSE);
4711 
4712         /*
4713          * Add it to the new group.
4714          */
4715         rv = i_mac_group_add_ring(d_group, ring, 0);
4716         if (rv != 0) {
4717                 /*
4718                  * Failed to add ring back to source group. If
4719                  * that fails, the ring is stuck in limbo, log message.
4720                  */
4721                 if (i_mac_group_add_ring(s_group, ring, 0)) {
4722                         cmn_err(CE_WARN, "%s: failed to move ring %p\n",
4723                             mip->mi_name, (void *)ring);
4724                 }
4725         }
4726 
4727         return (rv);
4728 }
4729 
4730 /*
4731  * Find a MAC address according to its value.
4732  */
4733 mac_address_t *
4734 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
4735 {
4736         mac_address_t *map;
4737 
4738         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4739 
4740         for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
4741                 if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
4742                         break;
4743         }
4744 
4745         return (map);
4746 }
4747 
4748 /*
4749  * Check whether the MAC address is shared by multiple clients.
4750  */
4751 boolean_t
4752 mac_check_macaddr_shared(mac_address_t *map)
4753 {
4754         ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
4755 
4756         return (map->ma_nusers > 1);
4757 }
4758 
4759 /*
4760  * Remove the specified MAC address from the MAC address list and free it.
4761  */
4762 static void
4763 mac_free_macaddr(mac_address_t *map)
4764 {
4765         mac_impl_t *mip = map->ma_mip;
4766 
4767         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4768         ASSERT(mip->mi_addresses != NULL);
4769 
4770         map = mac_find_macaddr(mip, map->ma_addr);
4771 
4772         ASSERT(map != NULL);
4773         ASSERT(map->ma_nusers == 0);
4774 
4775         if (map == mip->mi_addresses) {
4776                 mip->mi_addresses = map->ma_next;
4777         } else {
4778                 mac_address_t *pre;
4779 
4780                 pre = mip->mi_addresses;
4781                 while (pre->ma_next != map)
4782                         pre = pre->ma_next;
4783                 pre->ma_next = map->ma_next;
4784         }
4785 
4786         kmem_free(map, sizeof (mac_address_t));
4787 }
4788 
4789 /*
4790  * Add a MAC address reference for a client. If the desired MAC address
4791  * exists, add a reference to it. Otherwise, add the new address by adding
4792  * it to a reserved group or setting promiscuous mode. Won't try different
4793  * group is the group is non-NULL, so the caller must explictly share
4794  * default group when needed.
4795  *
4796  * Note, the primary MAC address is initialized at registration time, so
4797  * to add it to default group only need to activate it if its reference
4798  * count is still zero. Also, some drivers may not have advertised RINGS
4799  * capability.
4800  */
4801 int
4802 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
4803     boolean_t use_hw)
4804 {
4805         mac_address_t *map;
4806         int err = 0;
4807         boolean_t allocated_map = B_FALSE;
4808 
4809         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4810 
4811         map = mac_find_macaddr(mip, mac_addr);
4812 
4813         /*
4814          * If the new MAC address has not been added. Allocate a new one
4815          * and set it up.
4816          */
4817         if (map == NULL) {
4818                 map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4819                 map->ma_len = mip->mi_type->mt_addr_length;
4820                 bcopy(mac_addr, map->ma_addr, map->ma_len);
4821                 map->ma_nusers = 0;
4822                 map->ma_group = group;
4823                 map->ma_mip = mip;
4824 
4825                 /* add the new MAC address to the head of the address list */
4826                 map->ma_next = mip->mi_addresses;
4827                 mip->mi_addresses = map;
4828 
4829                 allocated_map = B_TRUE;
4830         }
4831 
4832         ASSERT(map->ma_group == NULL || map->ma_group == group);
4833         if (map->ma_group == NULL)
4834                 map->ma_group = group;
4835 
4836         /*
4837          * If the MAC address is already in use, simply account for the
4838          * new client.
4839          */
4840         if (map->ma_nusers++ > 0)
4841                 return (0);
4842 
4843         /*
4844          * Activate this MAC address by adding it to the reserved group.
4845          */
4846         if (group != NULL) {
4847                 err = mac_group_addmac(group, (const uint8_t *)mac_addr);
4848                 if (err == 0) {
4849                         map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4850                         return (0);
4851                 }
4852         }
4853 
4854         /*
4855          * The MAC address addition failed. If the client requires a
4856          * hardware classified MAC address, fail the operation.
4857          */
4858         if (use_hw) {
4859                 err = ENOSPC;
4860                 goto bail;
4861         }
4862 
4863         /*
4864          * Try promiscuous mode.
4865          *
4866          * For drivers that don't advertise RINGS capability, do
4867          * nothing for the primary address.
4868          */
4869         if ((group == NULL) &&
4870             (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
4871                 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4872                 return (0);
4873         }
4874 
4875         /*
4876          * Enable promiscuous mode in order to receive traffic
4877          * to the new MAC address.
4878          */
4879         if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
4880                 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
4881                 return (0);
4882         }
4883 
4884         /*
4885          * Free the MAC address that could not be added. Don't free
4886          * a pre-existing address, it could have been the entry
4887          * for the primary MAC address which was pre-allocated by
4888          * mac_init_macaddr(), and which must remain on the list.
4889          */
4890 bail:
4891         map->ma_nusers--;
4892         if (allocated_map)
4893                 mac_free_macaddr(map);
4894         return (err);
4895 }
4896 
4897 /*
4898  * Remove a reference to a MAC address. This may cause to remove the MAC
4899  * address from an associated group or to turn off promiscuous mode.
4900  * The caller needs to handle the failure properly.
4901  */
4902 int
4903 mac_remove_macaddr(mac_address_t *map)
4904 {
4905         mac_impl_t *mip = map->ma_mip;
4906         int err = 0;
4907 
4908         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4909 
4910         ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
4911 
4912         /*
4913          * If it's not the last client using this MAC address, only update
4914          * the MAC clients count.
4915          */
4916         if (--map->ma_nusers > 0)
4917                 return (0);
4918 
4919         /*
4920          * The MAC address is no longer used by any MAC client, so remove
4921          * it from its associated group, or turn off promiscuous mode
4922          * if it was enabled for the MAC address.
4923          */
4924         switch (map->ma_type) {
4925         case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4926                 /*
4927                  * Don't free the preset primary address for drivers that
4928                  * don't advertise RINGS capability.
4929                  */
4930                 if (map->ma_group == NULL)
4931                         return (0);
4932 
4933                 err = mac_group_remmac(map->ma_group, map->ma_addr);
4934                 if (err == 0)
4935                         map->ma_group = NULL;
4936                 break;
4937         case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4938                 err = i_mac_promisc_set(mip, B_FALSE);
4939                 break;
4940         default:
4941                 ASSERT(B_FALSE);
4942         }
4943 
4944         if (err != 0)
4945                 return (err);
4946 
4947         /*
4948          * We created MAC address for the primary one at registration, so we
4949          * won't free it here. mac_fini_macaddr() will take care of it.
4950          */
4951         if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
4952                 mac_free_macaddr(map);
4953 
4954         return (0);
4955 }
4956 
4957 /*
4958  * Update an existing MAC address. The caller need to make sure that the new
4959  * value has not been used.
4960  */
4961 int
4962 mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
4963 {
4964         mac_impl_t *mip = map->ma_mip;
4965         int err = 0;
4966 
4967         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4968         ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4969 
4970         switch (map->ma_type) {
4971         case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4972                 /*
4973                  * Update the primary address for drivers that are not
4974                  * RINGS capable.
4975                  */
4976                 if (mip->mi_rx_groups == NULL) {
4977                         err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
4978                             mac_addr);
4979                         if (err != 0)
4980                                 return (err);
4981                         break;
4982                 }
4983 
4984                 /*
4985                  * If this MAC address is not currently in use,
4986                  * simply break out and update the value.
4987                  */
4988                 if (map->ma_nusers == 0)
4989                         break;
4990 
4991                 /*
4992                  * Need to replace the MAC address associated with a group.
4993                  */
4994                 err = mac_group_remmac(map->ma_group, map->ma_addr);
4995                 if (err != 0)
4996                         return (err);
4997 
4998                 err = mac_group_addmac(map->ma_group, mac_addr);
4999 
5000                 /*
5001                  * Failure hints hardware error. The MAC layer needs to
5002                  * have error notification facility to handle this.
5003                  * Now, simply try to restore the value.
5004                  */
5005                 if (err != 0)
5006                         (void) mac_group_addmac(map->ma_group, map->ma_addr);
5007 
5008                 break;
5009         case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5010                 /*
5011                  * Need to do nothing more if in promiscuous mode.
5012                  */
5013                 break;
5014         default:
5015                 ASSERT(B_FALSE);
5016         }
5017 
5018         /*
5019          * Successfully replaced the MAC address.
5020          */
5021         if (err == 0)
5022                 bcopy(mac_addr, map->ma_addr, map->ma_len);
5023 
5024         return (err);
5025 }
5026 
5027 /*
5028  * Freshen the MAC address with new value. Its caller must have updated the
5029  * hardware MAC address before calling this function.
5030  * This funcitons is supposed to be used to handle the MAC address change
5031  * notification from underlying drivers.
5032  */
5033 void
5034 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
5035 {
5036         mac_impl_t *mip = map->ma_mip;
5037 
5038         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5039         ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5040 
5041         /*
5042          * Freshen the MAC address with new value.
5043          */
5044         bcopy(mac_addr, map->ma_addr, map->ma_len);
5045         bcopy(mac_addr, mip->mi_addr, map->ma_len);
5046 
5047         /*
5048          * Update all MAC clients that share this MAC address.
5049          */
5050         mac_unicast_update_clients(mip, map);
5051 }
5052 
5053 /*
5054  * Set up the primary MAC address.
5055  */
5056 void
5057 mac_init_macaddr(mac_impl_t *mip)
5058 {
5059         mac_address_t *map;
5060 
5061         /*
5062          * The reference count is initialized to zero, until it's really
5063          * activated.
5064          */
5065         map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
5066         map->ma_len = mip->mi_type->mt_addr_length;
5067         bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
5068 
5069         /*
5070          * If driver advertises RINGS capability, it shouldn't have initialized
5071          * its primary MAC address. For other drivers, including VNIC, the
5072          * primary address must work after registration.
5073          */
5074         if (mip->mi_rx_groups == NULL)
5075                 map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5076 
5077         map->ma_mip = mip;
5078 
5079         mip->mi_addresses = map;
5080 }
5081 
5082 /*
5083  * Clean up the primary MAC address. Note, only one primary MAC address
5084  * is allowed. All other MAC addresses must have been freed appropriately.
5085  */
5086 void
5087 mac_fini_macaddr(mac_impl_t *mip)
5088 {
5089         mac_address_t *map = mip->mi_addresses;
5090 
5091         if (map == NULL)
5092                 return;
5093 
5094         /*
5095          * If mi_addresses is initialized, there should be exactly one
5096          * entry left on the list with no users.
5097          */
5098         ASSERT(map->ma_nusers == 0);
5099         ASSERT(map->ma_next == NULL);
5100 
5101         kmem_free(map, sizeof (mac_address_t));
5102         mip->mi_addresses = NULL;
5103 }
5104 
5105 /*
5106  * Logging related functions.
5107  *
5108  * Note that Kernel statistics have been extended to maintain fine
5109  * granularity of statistics viz. hardware lane, software lane, fanout
5110  * stats etc. However, extended accounting continues to support only
5111  * aggregate statistics like before.
5112  */
5113 
5114 /* Write the flow description to a netinfo_t record */
5115 static netinfo_t *
5116 mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
5117 {
5118         netinfo_t               *ninfo;
5119         net_desc_t              *ndesc;
5120         flow_desc_t             *fdesc;
5121         mac_resource_props_t    *mrp;
5122 
5123         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5124         if (ninfo == NULL)
5125                 return (NULL);
5126         ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5127         if (ndesc == NULL) {
5128                 kmem_free(ninfo, sizeof (netinfo_t));
5129                 return (NULL);
5130         }
5131 
5132         /*
5133          * Grab the fe_lock to see a self-consistent fe_flow_desc.
5134          * Updates to the fe_flow_desc are done under the fe_lock
5135          */
5136         mutex_enter(&flent->fe_lock);
5137         fdesc = &flent->fe_flow_desc;
5138         mrp = &flent->fe_resource_props;
5139 
5140         ndesc->nd_name = flent->fe_flow_name;
5141         ndesc->nd_devname = mcip->mci_name;
5142         bcopy(fdesc->fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5143         bcopy(fdesc->fd_dst_mac, ndesc->nd_edest, ETHERADDRL);
5144         ndesc->nd_sap = htonl(fdesc->fd_sap);
5145         ndesc->nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
5146         ndesc->nd_bw_limit = mrp->mrp_maxbw;
5147         if (ndesc->nd_isv4) {
5148                 ndesc->nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
5149                 ndesc->nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
5150         } else {
5151                 bcopy(&fdesc->fd_local_addr, ndesc->nd_saddr, IPV6_ADDR_LEN);
5152                 bcopy(&fdesc->fd_remote_addr, ndesc->nd_daddr, IPV6_ADDR_LEN);
5153         }
5154         ndesc->nd_sport = htons(fdesc->fd_local_port);
5155         ndesc->nd_dport = htons(fdesc->fd_remote_port);
5156         ndesc->nd_protocol = (uint8_t)fdesc->fd_protocol;
5157         mutex_exit(&flent->fe_lock);
5158 
5159         ninfo->ni_record = ndesc;
5160         ninfo->ni_size = sizeof (net_desc_t);
5161         ninfo->ni_type = EX_NET_FLDESC_REC;
5162 
5163         return (ninfo);
5164 }
5165 
5166 /* Write the flow statistics to a netinfo_t record */
5167 static netinfo_t *
5168 mac_write_flow_stats(flow_entry_t *flent)
5169 {
5170         netinfo_t               *ninfo;
5171         net_stat_t              *nstat;
5172         mac_soft_ring_set_t     *mac_srs;
5173         mac_rx_stats_t          *mac_rx_stat;
5174         mac_tx_stats_t          *mac_tx_stat;
5175         int                     i;
5176 
5177         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5178         if (ninfo == NULL)
5179                 return (NULL);
5180         nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5181         if (nstat == NULL) {
5182                 kmem_free(ninfo, sizeof (netinfo_t));
5183                 return (NULL);
5184         }
5185 
5186         nstat->ns_name = flent->fe_flow_name;
5187         for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5188                 mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5189                 mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5190 
5191                 nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5192                     mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
5193                 nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5194                     mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5195                 nstat->ns_oerrors += mac_rx_stat->mrs_ierrors;
5196         }
5197 
5198         mac_srs = (mac_soft_ring_set_t *)(flent->fe_tx_srs);
5199         if (mac_srs != NULL) {
5200                 mac_tx_stat = &mac_srs->srs_tx.st_stat;
5201 
5202                 nstat->ns_obytes = mac_tx_stat->mts_obytes;
5203                 nstat->ns_opackets = mac_tx_stat->mts_opackets;
5204                 nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5205         }
5206 
5207         ninfo->ni_record = nstat;
5208         ninfo->ni_size = sizeof (net_stat_t);
5209         ninfo->ni_type = EX_NET_FLSTAT_REC;
5210 
5211         return (ninfo);
5212 }
5213 
5214 /* Write the link description to a netinfo_t record */
5215 static netinfo_t *
5216 mac_write_link_desc(mac_client_impl_t *mcip)
5217 {
5218         netinfo_t               *ninfo;
5219         net_desc_t              *ndesc;
5220         flow_entry_t            *flent = mcip->mci_flent;
5221 
5222         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5223         if (ninfo == NULL)
5224                 return (NULL);
5225         ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5226         if (ndesc == NULL) {
5227                 kmem_free(ninfo, sizeof (netinfo_t));
5228                 return (NULL);
5229         }
5230 
5231         ndesc->nd_name = mcip->mci_name;
5232         ndesc->nd_devname = mcip->mci_name;
5233         ndesc->nd_isv4 = B_TRUE;
5234         /*
5235          * Grab the fe_lock to see a self-consistent fe_flow_desc.
5236          * Updates to the fe_flow_desc are done under the fe_lock
5237          * after removing the flent from the flow table.
5238          */
5239         mutex_enter(&flent->fe_lock);
5240         bcopy(flent->fe_flow_desc.fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5241         mutex_exit(&flent->fe_lock);
5242 
5243         ninfo->ni_record = ndesc;
5244         ninfo->ni_size = sizeof (net_desc_t);
5245         ninfo->ni_type = EX_NET_LNDESC_REC;
5246 
5247         return (ninfo);
5248 }
5249 
5250 /* Write the link statistics to a netinfo_t record */
5251 static netinfo_t *
5252 mac_write_link_stats(mac_client_impl_t *mcip)
5253 {
5254         netinfo_t               *ninfo;
5255         net_stat_t              *nstat;
5256         flow_entry_t            *flent;
5257         mac_soft_ring_set_t     *mac_srs;
5258         mac_rx_stats_t          *mac_rx_stat;
5259         mac_tx_stats_t          *mac_tx_stat;
5260         int                     i;
5261 
5262         ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5263         if (ninfo == NULL)
5264                 return (NULL);
5265         nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5266         if (nstat == NULL) {
5267                 kmem_free(ninfo, sizeof (netinfo_t));
5268                 return (NULL);
5269         }
5270 
5271         nstat->ns_name = mcip->mci_name;
5272         flent = mcip->mci_flent;
5273         if (flent != NULL)  {
5274                 for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5275                         mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5276                         mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5277 
5278                         nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5279                             mac_rx_stat->mrs_pollbytes +
5280                             mac_rx_stat->mrs_lclbytes;
5281                         nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5282                             mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5283                         nstat->ns_oerrors += mac_rx_stat->mrs_ierrors;
5284                 }
5285         }
5286 
5287         mac_srs = (mac_soft_ring_set_t *)(mcip->mci_flent->fe_tx_srs);
5288         if (mac_srs != NULL) {
5289                 mac_tx_stat = &mac_srs->srs_tx.st_stat;
5290 
5291                 nstat->ns_obytes = mac_tx_stat->mts_obytes;
5292                 nstat->ns_opackets = mac_tx_stat->mts_opackets;
5293                 nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5294         }
5295 
5296         ninfo->ni_record = nstat;
5297         ninfo->ni_size = sizeof (net_stat_t);
5298         ninfo->ni_type = EX_NET_LNSTAT_REC;
5299 
5300         return (ninfo);
5301 }
5302 
/*
 * State handed to the link/flow logging walkers for one logging pass.
 * It is filled in by mac_log_linkinfo() (periodic pass) and by
 * mac_stop_logusage() (final pass).
 */
typedef struct i_mac_log_state_s {
        boolean_t       mi_last;        /* B_TRUE on the final pass only */
        int             mi_fenable;     /* copy of mac_flow_log_enable */
        int             mi_lenable;     /* copy of mac_link_log_enable */
        list_t          *mi_list;       /* netinfo_t records collected */
} i_mac_log_state_t;
5309 
5310 /*
5311  * For a given flow, if the description has not been logged before, do it now.
5312  * If it is a VNIC, then we have collected information about it from the MAC
5313  * table, so skip it.
5314  *
5315  * Called through mac_flow_walk_nolock()
5316  *
5317  * Return 0 if successful.
5318  */
5319 static int
5320 mac_log_flowinfo(flow_entry_t *flent, void *arg)
5321 {
5322         mac_client_impl_t       *mcip = flent->fe_mcip;
5323         i_mac_log_state_t       *lstate = arg;
5324         netinfo_t               *ninfo;
5325 
5326         if (mcip == NULL)
5327                 return (0);
5328 
5329         /*
5330          * If the name starts with "vnic", and fe_user_generated is true (to
5331          * exclude the mcast and active flow entries created implicitly for
5332          * a vnic, it is a VNIC flow.  i.e. vnic1 is a vnic flow,
5333          * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
5334          */
5335         if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
5336             (flent->fe_type & FLOW_USER) != 0) {
5337                 return (0);
5338         }
5339 
5340         if (!flent->fe_desc_logged) {
5341                 /*
5342                  * We don't return error because we want to continue the
5343                  * walk in case this is the last walk which means we
5344                  * need to reset fe_desc_logged in all the flows.
5345                  */
5346                 if ((ninfo = mac_write_flow_desc(flent, mcip)) == NULL)
5347                         return (0);
5348                 list_insert_tail(lstate->mi_list, ninfo);
5349                 flent->fe_desc_logged = B_TRUE;
5350         }
5351 
5352         /*
5353          * Regardless of the error, we want to proceed in case we have to
5354          * reset fe_desc_logged.
5355          */
5356         ninfo = mac_write_flow_stats(flent);
5357         if (ninfo == NULL)
5358                 return (-1);
5359 
5360         list_insert_tail(lstate->mi_list, ninfo);
5361 
5362         if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
5363                 flent->fe_desc_logged = B_FALSE;
5364 
5365         return (0);
5366 }
5367 
5368 /*
5369  * Log the description for each mac client of this mac_impl_t, if it
5370  * hasn't already been done. Additionally, log statistics for the link as
5371  * well. Walk the flow table and log information for each flow as well.
5372  * If it is the last walk (mci_last), then we turn off mci_desc_logged (and
5373  * also fe_desc_logged, if flow logging is on) since we want to log the
5374  * description if and when logging is restarted.
5375  *
5376  * Return 0 upon success or -1 upon failure
5377  */
5378 static int
5379 i_mac_impl_log(mac_impl_t *mip, i_mac_log_state_t *lstate)
5380 {
5381         mac_client_impl_t       *mcip;
5382         netinfo_t               *ninfo;
5383 
5384         i_mac_perim_enter(mip);
5385         /*
5386          * Only walk the client list for NIC and etherstub
5387          */
5388         if ((mip->mi_state_flags & MIS_DISABLED) ||
5389             ((mip->mi_state_flags & MIS_IS_VNIC) &&
5390             (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL))) {
5391                 i_mac_perim_exit(mip);
5392                 return (0);
5393         }
5394 
5395         for (mcip = mip->mi_clients_list; mcip != NULL;
5396             mcip = mcip->mci_client_next) {
5397                 if (!MCIP_DATAPATH_SETUP(mcip))
5398                         continue;
5399                 if (lstate->mi_lenable) {
5400                         if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
5401                                 ninfo = mac_write_link_desc(mcip);
5402                                 if (ninfo == NULL) {
5403                                 /*
5404                                  * We can't terminate it if this is the last
5405                                  * walk, else there might be some links with
5406                                  * mi_desc_logged set to true, which means
5407                                  * their description won't be logged the next
5408                                  * time logging is started (similarly for the
5409                                  * flows within such links). We can continue
5410                                  * without walking the flow table (i.e. to
5411                                  * set fe_desc_logged to false) because we
5412                                  * won't have written any flow stuff for this
5413                                  * link as we haven't logged the link itself.
5414                                  */
5415                                         i_mac_perim_exit(mip);
5416                                         if (lstate->mi_last)
5417                                                 return (0);
5418                                         else
5419                                                 return (-1);
5420                                 }
5421                                 mcip->mci_state_flags |= MCIS_DESC_LOGGED;
5422                                 list_insert_tail(lstate->mi_list, ninfo);
5423                         }
5424                 }
5425 
5426                 ninfo = mac_write_link_stats(mcip);
5427                 if (ninfo == NULL && !lstate->mi_last) {
5428                         i_mac_perim_exit(mip);
5429                         return (-1);
5430                 }
5431                 list_insert_tail(lstate->mi_list, ninfo);
5432 
5433                 if (lstate->mi_last)
5434                         mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
5435 
5436                 if (lstate->mi_fenable) {
5437                         if (mcip->mci_subflow_tab != NULL) {
5438                                 (void) mac_flow_walk_nolock(
5439                                     mcip->mci_subflow_tab, mac_log_flowinfo,
5440                                     lstate);
5441                         }
5442                 }
5443         }
5444         i_mac_perim_exit(mip);
5445         return (0);
5446 }
5447 
5448 /*
5449  * modhash walker function to add a mac_impl_t to a list
5450  */
5451 /*ARGSUSED*/
5452 static uint_t
5453 i_mac_impl_list_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
5454 {
5455         list_t                  *list = (list_t *)arg;
5456         mac_impl_t              *mip = (mac_impl_t *)val;
5457 
5458         if ((mip->mi_state_flags & MIS_DISABLED) == 0) {
5459                 list_insert_tail(list, mip);
5460                 mip->mi_ref++;
5461         }
5462 
5463         return (MH_WALK_CONTINUE);
5464 }
5465 
5466 void
5467 i_mac_log_info(list_t *net_log_list, i_mac_log_state_t *lstate)
5468 {
5469         list_t                  mac_impl_list;
5470         mac_impl_t              *mip;
5471         netinfo_t               *ninfo;
5472 
5473         /* Create list of mac_impls */
5474         ASSERT(RW_LOCK_HELD(&i_mac_impl_lock));
5475         list_create(&mac_impl_list, sizeof (mac_impl_t), offsetof(mac_impl_t,
5476             mi_node));
5477         mod_hash_walk(i_mac_impl_hash, i_mac_impl_list_walker, &mac_impl_list);
5478         rw_exit(&i_mac_impl_lock);
5479 
5480         /* Create log entries for each mac_impl */
5481         for (mip = list_head(&mac_impl_list); mip != NULL;
5482             mip = list_next(&mac_impl_list, mip)) {
5483                 if (i_mac_impl_log(mip, lstate) != 0)
5484                         continue;
5485         }
5486 
5487         /* Remove elements and destroy list of mac_impls */
5488         rw_enter(&i_mac_impl_lock, RW_WRITER);
5489         while ((mip = list_remove_tail(&mac_impl_list)) != NULL) {
5490                 mip->mi_ref--;
5491         }
5492         rw_exit(&i_mac_impl_lock);
5493         list_destroy(&mac_impl_list);
5494 
5495         /*
5496          * Write log entries to files outside of locks, free associated
5497          * structures, and remove entries from the list.
5498          */
5499         while ((ninfo = list_head(net_log_list)) != NULL) {
5500                 (void) exacct_commit_netinfo(ninfo->ni_record, ninfo->ni_type);
5501                 list_remove(net_log_list, ninfo);
5502                 kmem_free(ninfo->ni_record, ninfo->ni_size);
5503                 kmem_free(ninfo, sizeof (*ninfo));
5504         }
5505         list_destroy(net_log_list);
5506 }
5507 
5508 /*
5509  * The timer thread that runs every mac_logging_interval seconds and logs
5510  * link and/or flow information.
5511  */
5512 /* ARGSUSED */
5513 void
5514 mac_log_linkinfo(void *arg)
5515 {
5516         i_mac_log_state_t       lstate;
5517         list_t                  net_log_list;
5518 
5519         list_create(&net_log_list, sizeof (netinfo_t),
5520             offsetof(netinfo_t, ni_link));
5521 
5522         rw_enter(&i_mac_impl_lock, RW_READER);
5523         if (!mac_flow_log_enable && !mac_link_log_enable) {
5524                 rw_exit(&i_mac_impl_lock);
5525                 return;
5526         }
5527         lstate.mi_fenable = mac_flow_log_enable;
5528         lstate.mi_lenable = mac_link_log_enable;
5529         lstate.mi_last = B_FALSE;
5530         lstate.mi_list = &net_log_list;
5531 
5532         /* Write log entries for each mac_impl in the list */
5533         i_mac_log_info(&net_log_list, &lstate);
5534 
5535         if (mac_flow_log_enable || mac_link_log_enable) {
5536                 mac_logging_timer = timeout(mac_log_linkinfo, NULL,
5537                     SEC_TO_TICK(mac_logging_interval));
5538         }
5539 }
5540 
/*
 * Argument block for i_mac_fastpath_walker().
 */
typedef struct i_mac_fastpath_state_s {
        boolean_t       mf_disable;     /* B_TRUE: disable, else enable */
        int             mf_err;         /* result of mac_fastpath_disable() */
} i_mac_fastpath_state_t;
5545 
5546 /* modhash walker function to enable or disable fastpath */
5547 /*ARGSUSED*/
5548 static uint_t
5549 i_mac_fastpath_walker(mod_hash_key_t key, mod_hash_val_t *val,
5550     void *arg)
5551 {
5552         i_mac_fastpath_state_t  *state = arg;
5553         mac_handle_t            mh = (mac_handle_t)val;
5554 
5555         if (state->mf_disable)
5556                 state->mf_err = mac_fastpath_disable(mh);
5557         else
5558                 mac_fastpath_enable(mh);
5559 
5560         return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
5561 }
5562 
5563 /*
5564  * Start the logging timer.
5565  */
5566 int
5567 mac_start_logusage(mac_logtype_t type, uint_t interval)
5568 {
5569         i_mac_fastpath_state_t  dstate = {B_TRUE, 0};
5570         i_mac_fastpath_state_t  estate = {B_FALSE, 0};
5571         int                     err;
5572 
5573         rw_enter(&i_mac_impl_lock, RW_WRITER);
5574         switch (type) {
5575         case MAC_LOGTYPE_FLOW:
5576                 if (mac_flow_log_enable) {
5577                         rw_exit(&i_mac_impl_lock);
5578                         return (0);
5579                 }
5580                 /* FALLTHRU */
5581         case MAC_LOGTYPE_LINK:
5582                 if (mac_link_log_enable) {
5583                         rw_exit(&i_mac_impl_lock);
5584                         return (0);
5585                 }
5586                 break;
5587         default:
5588                 ASSERT(0);
5589         }
5590 
5591         /* Disable fastpath */
5592         mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &dstate);
5593         if ((err = dstate.mf_err) != 0) {
5594                 /* Reenable fastpath  */
5595                 mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5596                 rw_exit(&i_mac_impl_lock);
5597                 return (err);
5598         }
5599 
5600         switch (type) {
5601         case MAC_LOGTYPE_FLOW:
5602                 mac_flow_log_enable = B_TRUE;
5603                 /* FALLTHRU */
5604         case MAC_LOGTYPE_LINK:
5605                 mac_link_log_enable = B_TRUE;
5606                 break;
5607         }
5608 
5609         mac_logging_interval = interval;
5610         rw_exit(&i_mac_impl_lock);
5611         mac_log_linkinfo(NULL);
5612         return (0);
5613 }
5614 
5615 /*
5616  * Stop the logging timer if both link and flow logging are turned off.
5617  */
5618 void
5619 mac_stop_logusage(mac_logtype_t type)
5620 {
5621         i_mac_log_state_t       lstate;
5622         i_mac_fastpath_state_t  estate = {B_FALSE, 0};
5623         list_t                  net_log_list;
5624 
5625         list_create(&net_log_list, sizeof (netinfo_t),
5626             offsetof(netinfo_t, ni_link));
5627 
5628         rw_enter(&i_mac_impl_lock, RW_WRITER);
5629 
5630         lstate.mi_fenable = mac_flow_log_enable;
5631         lstate.mi_lenable = mac_link_log_enable;
5632         lstate.mi_list = &net_log_list;
5633 
5634         /* Last walk */
5635         lstate.mi_last = B_TRUE;
5636 
5637         switch (type) {
5638         case MAC_LOGTYPE_FLOW:
5639                 if (lstate.mi_fenable) {
5640                         ASSERT(mac_link_log_enable);
5641                         mac_flow_log_enable = B_FALSE;
5642                         mac_link_log_enable = B_FALSE;
5643                         break;
5644                 }
5645                 /* FALLTHRU */
5646         case MAC_LOGTYPE_LINK:
5647                 if (!lstate.mi_lenable || mac_flow_log_enable) {
5648                         rw_exit(&i_mac_impl_lock);
5649                         return;
5650                 }
5651                 mac_link_log_enable = B_FALSE;
5652                 break;
5653         default:
5654                 ASSERT(0);
5655         }
5656 
5657         /* Reenable fastpath */
5658         mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
5659 
5660         (void) untimeout(mac_logging_timer);
5661         mac_logging_timer = 0;
5662 
5663         /* Write log entries for each mac_impl in the list */
5664         i_mac_log_info(&net_log_list, &lstate);
5665 }
5666 
5667 /*
5668  * Walk the rx and tx SRS/SRs for a flow and update the priority value.
5669  */
5670 void
5671 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
5672 {
5673         pri_t                   pri;
5674         int                     count;
5675         mac_soft_ring_set_t     *mac_srs;
5676 
5677         if (flent->fe_rx_srs_cnt <= 0)
5678                 return;
5679 
5680         if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
5681             SRST_FLOW) {
5682                 pri = FLOW_PRIORITY(mcip->mci_min_pri,
5683                     mcip->mci_max_pri,
5684                     flent->fe_resource_props.mrp_priority);
5685         } else {
5686                 pri = mcip->mci_max_pri;
5687         }
5688 
5689         for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
5690                 mac_srs = flent->fe_rx_srs[count];
5691                 mac_update_srs_priority(mac_srs, pri);
5692         }
5693         /*
5694          * If we have a Tx SRS, we need to modify all the threads associated
5695          * with it.
5696          */
5697         if (flent->fe_tx_srs != NULL)
5698                 mac_update_srs_priority(flent->fe_tx_srs, pri);
5699 }
5700 
/*
 * RX and TX rings are reserved according to different semantics depending
 * on the requests from the MAC clients and type of rings:
 *
 * On the Tx side, by default we reserve individual rings, independently from
 * the groups.
 *
 * On the Rx side, the reservation is at the granularity of the group
 * of rings, and used for v12n level 1 only. It has a special case for the
 * primary client.
 *
 * If a share is allocated to a MAC client, we allocate a TX group and an
 * RX group to the client, and assign TX rings and RX rings to these
 * groups according to information gathered from the driver through
 * the share capability.
 *
 * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
 * to allocate individual rings out of a group and program the hw classifier
 * based on IP address or higher level criteria.
 */
5721 
5722 /*
5723  * mac_reserve_tx_ring()
5724  * Reserve a unused ring by marking it with MR_INUSE state.
5725  * As reserved, the ring is ready to function.
5726  *
5727  * Notes for Hybrid I/O:
5728  *
5729  * If a specific ring is needed, it is specified through the desired_ring
5730  * argument. Otherwise that argument is set to NULL.
5731  * If the desired ring was previous allocated to another client, this
5732  * function swaps it with a new ring from the group of unassigned rings.
5733  */
5734 mac_ring_t *
5735 mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
5736 {
5737         mac_group_t             *group;
5738         mac_grp_client_t        *mgcp;
5739         mac_client_impl_t       *mcip;
5740         mac_soft_ring_set_t     *srs;
5741 
5742         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5743 
5744         /*
5745          * Find an available ring and start it before changing its status.
5746          * The unassigned rings are at the end of the mi_tx_groups
5747          * array.
5748          */
5749         group = MAC_DEFAULT_TX_GROUP(mip);
5750 
5751         /* Can't take the default ring out of the default group */
5752         ASSERT(desired_ring != (mac_ring_t *)mip->mi_default_tx_ring);
5753 
5754         if (desired_ring->mr_state == MR_FREE) {
5755                 ASSERT(MAC_GROUP_NO_CLIENT(group));
5756                 if (mac_start_ring(desired_ring) != 0)
5757                         return (NULL);
5758                 return (desired_ring);
5759         }
5760         /*
5761          * There are clients using this ring, so let's move the clients
5762          * away from using this ring.
5763          */
5764         for (mgcp = group->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
5765                 mcip = mgcp->mgc_client;
5766                 mac_tx_client_quiesce((mac_client_handle_t)mcip);
5767                 srs = MCIP_TX_SRS(mcip);
5768                 ASSERT(mac_tx_srs_ring_present(srs, desired_ring));
5769                 mac_tx_invoke_callbacks(mcip,
5770                     (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(srs,
5771                     desired_ring));
5772                 mac_tx_srs_del_ring(srs, desired_ring);
5773                 mac_tx_client_restart((mac_client_handle_t)mcip);
5774         }
5775         return (desired_ring);
5776 }
5777 
5778 /*
5779  * For a reserved group with multiple clients, return the primary client.
5780  */
5781 static mac_client_impl_t *
5782 mac_get_grp_primary(mac_group_t *grp)
5783 {
5784         mac_grp_client_t        *mgcp = grp->mrg_clients;
5785         mac_client_impl_t       *mcip;
5786 
5787         while (mgcp != NULL) {
5788                 mcip = mgcp->mgc_client;
5789                 if (mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC)
5790                         return (mcip);
5791                 mgcp = mgcp->mgc_next;
5792         }
5793         return (NULL);
5794 }
5795 
5796 /*
5797  * Hybrid I/O specifies the ring that should be given to a share.
5798  * If the ring is already used by clients, then we need to release
5799  * the ring back to the default group so that we can give it to
5800  * the share. This means the clients using this ring now get a
5801  * replacement ring. If there aren't any replacement rings, this
5802  * function returns a failure.
5803  */
5804 static int
5805 mac_reclaim_ring_from_grp(mac_impl_t *mip, mac_ring_type_t ring_type,
5806     mac_ring_t *ring, mac_ring_t **rings, int nrings)
5807 {
5808         mac_group_t             *group = (mac_group_t *)ring->mr_gh;
5809         mac_resource_props_t    *mrp;
5810         mac_client_impl_t       *mcip;
5811         mac_group_t             *defgrp;
5812         mac_ring_t              *tring;
5813         mac_group_t             *tgrp;
5814         int                     i;
5815         int                     j;
5816 
5817         mcip = MAC_GROUP_ONLY_CLIENT(group);
5818         if (mcip == NULL)
5819                 mcip = mac_get_grp_primary(group);
5820         ASSERT(mcip != NULL);
5821         ASSERT(mcip->mci_share == NULL);
5822 
5823         mrp = MCIP_RESOURCE_PROPS(mcip);
5824         if (ring_type == MAC_RING_TYPE_RX) {
5825                 defgrp = mip->mi_rx_donor_grp;
5826                 if ((mrp->mrp_mask & MRP_RX_RINGS) == 0) {
5827                         /* Need to put this mac client in the default group */
5828                         if (mac_rx_switch_group(mcip, group, defgrp) != 0)
5829                                 return (ENOSPC);
5830                 } else {
5831                         /*
5832                          * Switch this ring with some other ring from
5833                          * the default group.
5834                          */
5835                         for (tring = defgrp->mrg_rings; tring != NULL;
5836                             tring = tring->mr_next) {
5837                                 if (tring->mr_index == 0)
5838                                         continue;
5839                                 for (j = 0; j < nrings; j++) {
5840                                         if (rings[j] == tring)
5841                                                 break;
5842                                 }
5843                                 if (j >= nrings)
5844                                         break;
5845                         }
5846                         if (tring == NULL)
5847                                 return (ENOSPC);
5848                         if (mac_group_mov_ring(mip, group, tring) != 0)
5849                                 return (ENOSPC);
5850                         if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
5851                                 (void) mac_group_mov_ring(mip, defgrp, tring);
5852                                 return (ENOSPC);
5853                         }
5854                 }
5855                 ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
5856                 return (0);
5857         }
5858 
5859         defgrp = MAC_DEFAULT_TX_GROUP(mip);
5860         if (ring == (mac_ring_t *)mip->mi_default_tx_ring) {
5861                 /*
5862                  * See if we can get a spare ring to replace the default
5863                  * ring.
5864                  */
5865                 if (defgrp->mrg_cur_count == 1) {
5866                         /*
5867                          * Need to get a ring from another client, see if
5868                          * there are any clients that can be moved to
5869                          * the default group, thereby freeing some rings.
5870                          */
5871                         for (i = 0; i < mip->mi_tx_group_count; i++) {
5872                                 tgrp = &mip->mi_tx_groups[i];
5873                                 if (tgrp->mrg_state ==
5874                                     MAC_GROUP_STATE_REGISTERED) {
5875                                         continue;
5876                                 }
5877                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
5878                                 if (mcip == NULL)
5879                                         mcip = mac_get_grp_primary(tgrp);
5880                                 ASSERT(mcip != NULL);
5881                                 mrp = MCIP_RESOURCE_PROPS(mcip);
5882                                 if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
5883                                         ASSERT(tgrp->mrg_cur_count == 1);
5884                                         /*
5885                                          * If this ring is part of the
5886                                          * rings asked by the share we cannot
5887                                          * use it as the default ring.
5888                                          */
5889                                         for (j = 0; j < nrings; j++) {
5890                                                 if (rings[j] == tgrp->mrg_rings)
5891                                                         break;
5892                                         }
5893                                         if (j < nrings)
5894                                                 continue;
5895                                         mac_tx_client_quiesce(
5896                                             (mac_client_handle_t)mcip);
5897                                         mac_tx_switch_group(mcip, tgrp,
5898                                             defgrp);
5899                                         mac_tx_client_restart(
5900                                             (mac_client_handle_t)mcip);
5901                                         break;
5902                                 }
5903                         }
5904                         /*
5905                          * All the rings are reserved, can't give up the
5906                          * default ring.
5907                          */
5908                         if (defgrp->mrg_cur_count <= 1)
5909                                 return (ENOSPC);
5910                 }
5911                 /*
5912                  * Swap the default ring with another.
5913                  */
5914                 for (tring = defgrp->mrg_rings; tring != NULL;
5915                     tring = tring->mr_next) {
5916                         /*
5917                          * If this ring is part of the rings asked by the
5918                          * share we cannot use it as the default ring.
5919                          */
5920                         for (j = 0; j < nrings; j++) {
5921                                 if (rings[j] == tring)
5922                                         break;
5923                         }
5924                         if (j >= nrings)
5925                                 break;
5926                 }
5927                 ASSERT(tring != NULL);
5928                 mip->mi_default_tx_ring = (mac_ring_handle_t)tring;
5929                 return (0);
5930         }
5931         /*
5932          * The Tx ring is with a group reserved by a MAC client. See if
5933          * we can swap it.
5934          */
5935         ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
5936         mcip = MAC_GROUP_ONLY_CLIENT(group);
5937         if (mcip == NULL)
5938                 mcip = mac_get_grp_primary(group);
5939         ASSERT(mcip !=  NULL);
5940         mrp = MCIP_RESOURCE_PROPS(mcip);
5941         mac_tx_client_quiesce((mac_client_handle_t)mcip);
5942         if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
5943                 ASSERT(group->mrg_cur_count == 1);
5944                 /* Put this mac client in the default group */
5945                 mac_tx_switch_group(mcip, group, defgrp);
5946         } else {
5947                 /*
5948                  * Switch this ring with some other ring from
5949                  * the default group.
5950                  */
5951                 for (tring = defgrp->mrg_rings; tring != NULL;
5952                     tring = tring->mr_next) {
5953                         if (tring == (mac_ring_t *)mip->mi_default_tx_ring)
5954                                 continue;
5955                         /*
5956                          * If this ring is part of the rings asked by the
5957                          * share we cannot use it for swapping.
5958                          */
5959                         for (j = 0; j < nrings; j++) {
5960                                 if (rings[j] == tring)
5961                                         break;
5962                         }
5963                         if (j >= nrings)
5964                                 break;
5965                 }
5966                 if (tring == NULL) {
5967                         mac_tx_client_restart((mac_client_handle_t)mcip);
5968                         return (ENOSPC);
5969                 }
5970                 if (mac_group_mov_ring(mip, group, tring) != 0) {
5971                         mac_tx_client_restart((mac_client_handle_t)mcip);
5972                         return (ENOSPC);
5973                 }
5974                 if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
5975                         (void) mac_group_mov_ring(mip, defgrp, tring);
5976                         mac_tx_client_restart((mac_client_handle_t)mcip);
5977                         return (ENOSPC);
5978                 }
5979         }
5980         mac_tx_client_restart((mac_client_handle_t)mcip);
5981         ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
5982         return (0);
5983 }
5984 
5985 /*
5986  * Populate a zero-ring group with rings. If the share is non-NULL,
5987  * the rings are chosen according to that share.
5988  * Invoked after allocating a new RX or TX group through
5989  * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
5990  * Returns zero on success, an errno otherwise.
5991  */
5992 int
5993 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
5994     mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share,
5995     uint32_t ringcnt)
5996 {
5997         mac_ring_t **rings, *ring;
5998         uint_t nrings;
5999         int rv = 0, i = 0, j;
6000 
6001         ASSERT((ring_type == MAC_RING_TYPE_RX &&
6002             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) ||
6003             (ring_type == MAC_RING_TYPE_TX &&
6004             mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC));
6005 
6006         /*
6007          * First find the rings to allocate to the group.
6008          */
6009         if (share != NULL) {
6010                 /* get rings through ms_squery() */
6011                 mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
6012                 ASSERT(nrings != 0);
6013                 rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
6014                     KM_SLEEP);
6015                 mip->mi_share_capab.ms_squery(share, ring_type,
6016                     (mac_ring_handle_t *)rings, &nrings);
6017                 for (i = 0; i < nrings; i++) {
6018                         /*
6019                          * If we have given this ring to a non-default
6020                          * group, we need to check if we can get this
6021                          * ring.
6022                          */
6023                         ring = rings[i];
6024                         if (ring->mr_gh != (mac_group_handle_t)src_group ||
6025                             ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6026                                 if (mac_reclaim_ring_from_grp(mip, ring_type,
6027                                     ring, rings, nrings) != 0) {
6028                                         rv = ENOSPC;
6029                                         goto bail;
6030                                 }
6031                         }
6032                 }
6033         } else {
6034                 /*
6035                  * Pick one ring from default group.
6036                  *
6037                  * for now pick the second ring which requires the first ring
6038                  * at index 0 to stay in the default group, since it is the
6039                  * ring which carries the multicast traffic.
6040                  * We need a better way for a driver to indicate this,
6041                  * for example a per-ring flag.
6042                  */
6043                 rings = kmem_alloc(ringcnt * sizeof (mac_ring_handle_t),
6044                     KM_SLEEP);
6045                 for (ring = src_group->mrg_rings; ring != NULL;
6046                     ring = ring->mr_next) {
6047                         if (ring_type == MAC_RING_TYPE_RX &&
6048                             ring->mr_index == 0) {
6049                                 continue;
6050                         }
6051                         if (ring_type == MAC_RING_TYPE_TX &&
6052                             ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6053                                 continue;
6054                         }
6055                         rings[i++] = ring;
6056                         if (i == ringcnt)
6057                                 break;
6058                 }
6059                 ASSERT(ring != NULL);
6060                 nrings = i;
6061                 /* Not enough rings as required */
6062                 if (nrings != ringcnt) {
6063                         rv = ENOSPC;
6064                         goto bail;
6065                 }
6066         }
6067 
6068         switch (ring_type) {
6069         case MAC_RING_TYPE_RX:
6070                 if (src_group->mrg_cur_count - nrings < 1) {
6071                         /* we ran out of rings */
6072                         rv = ENOSPC;
6073                         goto bail;
6074                 }
6075 
6076                 /* move receive rings to new group */
6077                 for (i = 0; i < nrings; i++) {
6078                         rv = mac_group_mov_ring(mip, new_group, rings[i]);
6079                         if (rv != 0) {
6080                                 /* move rings back on failure */
6081                                 for (j = 0; j < i; j++) {
6082                                         (void) mac_group_mov_ring(mip,
6083                                             src_group, rings[j]);
6084                                 }
6085                                 goto bail;
6086                         }
6087                 }
6088                 break;
6089 
6090         case MAC_RING_TYPE_TX: {
6091                 mac_ring_t *tmp_ring;
6092 
6093                 /* move the TX rings to the new group */
6094                 for (i = 0; i < nrings; i++) {
6095                         /* get the desired ring */
6096                         tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
6097                         if (tmp_ring == NULL) {
6098                                 rv = ENOSPC;
6099                                 goto bail;
6100                         }
6101                         ASSERT(tmp_ring == rings[i]);
6102                         rv = mac_group_mov_ring(mip, new_group, rings[i]);
6103                         if (rv != 0) {
6104                                 /* cleanup on failure */
6105                                 for (j = 0; j < i; j++) {
6106                                         (void) mac_group_mov_ring(mip,
6107                                             MAC_DEFAULT_TX_GROUP(mip),
6108                                             rings[j]);
6109                                 }
6110                                 goto bail;
6111                         }
6112                 }
6113                 break;
6114         }
6115         }
6116 
6117         /* add group to share */
6118         if (share != NULL)
6119                 mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
6120 
6121 bail:
6122         /* free temporary array of rings */
6123         kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
6124 
6125         return (rv);
6126 }
6127 
6128 void
6129 mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
6130 {
6131         mac_grp_client_t *mgcp;
6132 
6133         for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
6134                 if (mgcp->mgc_client == mcip)
6135                         break;
6136         }
6137 
6138         VERIFY(mgcp == NULL);
6139 
6140         mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
6141         mgcp->mgc_client = mcip;
6142         mgcp->mgc_next = grp->mrg_clients;
6143         grp->mrg_clients = mgcp;
6144 
6145 }
6146 
6147 void
6148 mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
6149 {
6150         mac_grp_client_t *mgcp, **pprev;
6151 
6152         for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
6153             pprev = &mgcp->mgc_next, mgcp = *pprev) {
6154                 if (mgcp->mgc_client == mcip)
6155                         break;
6156         }
6157 
6158         ASSERT(mgcp != NULL);
6159 
6160         *pprev = mgcp->mgc_next;
6161         kmem_free(mgcp, sizeof (mac_grp_client_t));
6162 }
6163 
/*
 * mac_reserve_rx_group()
 *
 * Finds an available group and exclusively reserves it for a client.
 * The group is chosen to suit the flow's resource controls (bandwidth and
 * fanout requirements) and the address type.
 * If the requestor is the primary MAC then return the group with the
 * largest number of rings, otherwise the default ring when available.
 */
6173 mac_group_t *
6174 mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move)
6175 {
6176         mac_share_handle_t      share = mcip->mci_share;
6177         mac_impl_t              *mip = mcip->mci_mip;
6178         mac_group_t             *grp = NULL;
6179         int                     i;
6180         int                     err = 0;
6181         mac_address_t           *map;
6182         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
6183         int                     nrings;
6184         int                     donor_grp_rcnt;
6185         boolean_t               need_exclgrp = B_FALSE;
6186         int                     need_rings = 0;
6187         mac_group_t             *candidate_grp = NULL;
6188         mac_client_impl_t       *gclient;
6189         mac_resource_props_t    *gmrp;
6190         mac_group_t             *donorgrp = NULL;
6191         boolean_t               rxhw = mrp->mrp_mask & MRP_RX_RINGS;
6192         boolean_t               unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC;
6193         boolean_t               isprimary;
6194 
6195         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
6196 
6197         isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6198 
6199         /*
6200          * Check if a group already has this mac address (case of VLANs)
6201          * unless we are moving this MAC client from one group to another.
6202          */
6203         if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) {
6204                 if (map->ma_group != NULL)
6205                         return (map->ma_group);
6206         }
6207         if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0)
6208                 return (NULL);
6209         /*
6210          * If exclusive open, return NULL which will enable the
6211          * caller to use the default group.
6212          */
6213         if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
6214                 return (NULL);
6215 
6216         /* For dynamic groups default unspecified to 1 */
6217         if (rxhw && unspec &&
6218             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6219                 mrp->mrp_nrxrings = 1;
6220         }
6221         /*
6222          * For static grouping we allow only specifying rings=0 and
6223          * unspecified
6224          */
6225         if (rxhw && mrp->mrp_nrxrings > 0 &&
6226             mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) {
6227                 return (NULL);
6228         }
6229         if (rxhw) {
6230                 /*
6231                  * We have explicitly asked for a group (with nrxrings,
6232                  * if unspec).
6233                  */
6234                 if (unspec || mrp->mrp_nrxrings > 0) {
6235                         need_exclgrp = B_TRUE;
6236                         need_rings = mrp->mrp_nrxrings;
6237                 } else if (mrp->mrp_nrxrings == 0) {
6238                         /*
6239                          * We have asked for a software group.
6240                          */
6241                         return (NULL);
6242                 }
6243         } else if (isprimary && mip->mi_nactiveclients == 1 &&
6244             mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6245                 /*
6246                  * If the primary is the only active client on this
6247                  * mip and we have not asked for any rings, we give
6248                  * it the default group so that the primary gets to
6249                  * use all the rings.
6250                  */
6251                 return (NULL);
6252         }
6253 
6254         /* The group that can donate rings */
6255         donorgrp = mip->mi_rx_donor_grp;
6256 
6257         /*
6258          * The number of rings that the default group can donate.
6259          * We need to leave at least one ring.
6260          */
6261         donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6262 
6263         /*
6264          * Try to exclusively reserve a RX group.
6265          *
6266          * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
6267          * client), try to reserve the a non-default RX group and give
6268          * it all the rings from the donor group, except the default ring
6269          *
6270          * For flows requiring HW_RING (unicast flow of other clients), try
6271          * to reserve non-default RX group with the specified number of
6272          * rings, if available.
6273          *
6274          * For flows that have not asked for software or hardware ring,
6275          * try to reserve a non-default group with 1 ring, if available.
6276          */
6277         for (i = 1; i < mip->mi_rx_group_count; i++) {
6278                 grp = &mip->mi_rx_groups[i];
6279 
6280                 DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
6281                     int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
6282 
6283                 /*
6284                  * Check if this group could be a candidate group for
6285                  * eviction if we need a group for this MAC client,
6286                  * but there aren't any. A candidate group is one
6287                  * that didn't ask for an exclusive group, but got
6288                  * one and it has enough rings (combined with what
6289                  * the donor group can donate) for the new MAC
6290                  * client
6291                  */
6292                 if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) {
6293                         /*
6294                          * If the primary/donor group is not the default
6295                          * group, don't bother looking for a candidate group.
6296                          * If we don't have enough rings we will check
6297                          * if the primary group can be vacated.
6298                          */
6299                         if (candidate_grp == NULL &&
6300                             donorgrp == MAC_DEFAULT_RX_GROUP(mip)) {
6301                                 ASSERT(!MAC_GROUP_NO_CLIENT(grp));
6302                                 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6303                                 if (gclient == NULL)
6304                                         gclient = mac_get_grp_primary(grp);
6305                                 ASSERT(gclient != NULL);
6306                                 gmrp = MCIP_RESOURCE_PROPS(gclient);
6307                                 if (gclient->mci_share == NULL &&
6308                                     (gmrp->mrp_mask & MRP_RX_RINGS) == 0 &&
6309                                     (unspec ||
6310                                     (grp->mrg_cur_count + donor_grp_rcnt >=
6311                                     need_rings))) {
6312                                         candidate_grp = grp;
6313                                 }
6314                         }
6315                         continue;
6316                 }
6317                 /*
6318                  * This group could already be SHARED by other multicast
6319                  * flows on this client. In that case, the group would
6320                  * be shared and has already been started.
6321                  */
6322                 ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
6323 
6324                 if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
6325                     (mac_start_group(grp) != 0)) {
6326                         continue;
6327                 }
6328 
6329                 if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6330                         break;
6331                 ASSERT(grp->mrg_cur_count == 0);
6332 
6333                 /*
6334                  * Populate the group. Rings should be taken
6335                  * from the donor group.
6336                  */
6337                 nrings = rxhw ? need_rings : isprimary ? donor_grp_rcnt: 1;
6338 
6339                 /*
6340                  * If the donor group can't donate, let's just walk and
6341                  * see if someone can vacate a group, so that we have
6342                  * enough rings for this, unless we already have
6343                  * identified a candiate group..
6344                  */
6345                 if (nrings <= donor_grp_rcnt) {
6346                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6347                             donorgrp, grp, share, nrings);
6348                         if (err == 0) {
6349                                 /*
6350                                  * For a share i_mac_group_allocate_rings gets
6351                                  * the rings from the driver, let's populate
6352                                  * the property for the client now.
6353                                  */
6354                                 if (share != NULL) {
6355                                         mac_client_set_rings(
6356                                             (mac_client_handle_t)mcip,
6357                                             grp->mrg_cur_count, -1);
6358                                 }
6359                                 if (mac_is_primary_client(mcip) && !rxhw)
6360                                         mip->mi_rx_donor_grp = grp;
6361                                 break;
6362                         }
6363                 }
6364 
6365                 DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6366                     mip->mi_name, int, grp->mrg_index, int, err);
6367 
6368                 /*
6369                  * It's a dynamic group but the grouping operation
6370                  * failed.
6371                  */
6372                 mac_stop_group(grp);
6373         }
6374         /* We didn't find an exclusive group for this MAC client */
6375         if (i >= mip->mi_rx_group_count) {
6376 
6377                 if (!need_exclgrp)
6378                         return (NULL);
6379 
6380                 /*
6381                  * If we found a candidate group then we switch the
6382                  * MAC client from the candidate_group to the default
6383                  * group and give the group to this MAC client. If
6384                  * we didn't find a candidate_group, check if the
6385                  * primary is in its own group and if it can make way
6386                  * for this MAC client.
6387                  */
6388                 if (candidate_grp == NULL &&
6389                     donorgrp != MAC_DEFAULT_RX_GROUP(mip) &&
6390                     donorgrp->mrg_cur_count >= need_rings) {
6391                         candidate_grp = donorgrp;
6392                 }
6393                 if (candidate_grp != NULL) {
6394                         boolean_t       prim_grp = B_FALSE;
6395 
6396                         /*
6397                          * Switch the MAC client from the candidate group
6398                          * to the default group.. If this group was the
6399                          * donor group, then after the switch we need
6400                          * to update the donor group too.
6401                          */
6402                         grp = candidate_grp;
6403                         gclient = MAC_GROUP_ONLY_CLIENT(grp);
6404                         if (gclient == NULL)
6405                                 gclient = mac_get_grp_primary(grp);
6406                         if (grp == mip->mi_rx_donor_grp)
6407                                 prim_grp = B_TRUE;
6408                         if (mac_rx_switch_group(gclient, grp,
6409                             MAC_DEFAULT_RX_GROUP(mip)) != 0) {
6410                                 return (NULL);
6411                         }
6412                         if (prim_grp) {
6413                                 mip->mi_rx_donor_grp =
6414                                     MAC_DEFAULT_RX_GROUP(mip);
6415                                 donorgrp = MAC_DEFAULT_RX_GROUP(mip);
6416                         }
6417 
6418 
6419                         /*
6420                          * Now give this group with the required rings
6421                          * to this MAC client.
6422                          */
6423                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
6424                         if (mac_start_group(grp) != 0)
6425                                 return (NULL);
6426 
6427                         if (mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6428                                 return (grp);
6429 
6430                         donor_grp_rcnt = donorgrp->mrg_cur_count - 1;
6431                         ASSERT(grp->mrg_cur_count == 0);
6432                         ASSERT(donor_grp_rcnt >= need_rings);
6433                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
6434                             donorgrp, grp, share, need_rings);
6435                         if (err == 0) {
6436                                 /*
6437                                  * For a share i_mac_group_allocate_rings gets
6438                                  * the rings from the driver, let's populate
6439                                  * the property for the client now.
6440                                  */
6441                                 if (share != NULL) {
6442                                         mac_client_set_rings(
6443                                             (mac_client_handle_t)mcip,
6444                                             grp->mrg_cur_count, -1);
6445                                 }
6446                                 DTRACE_PROBE2(rx__group__reserved,
6447                                     char *, mip->mi_name, int, grp->mrg_index);
6448                                 return (grp);
6449                         }
6450                         DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
6451                             mip->mi_name, int, grp->mrg_index, int, err);
6452                         mac_stop_group(grp);
6453                 }
6454                 return (NULL);
6455         }
6456         ASSERT(grp != NULL);
6457 
6458         DTRACE_PROBE2(rx__group__reserved,
6459             char *, mip->mi_name, int, grp->mrg_index);
6460         return (grp);
6461 }
6462 
6463 /*
6464  * mac_rx_release_group()
6465  *
6466  * This is called when there are no clients left for the group.
6467  * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
6468  * and if it is a non default group, the shares are removed and
6469  * all rings are assigned back to default group.
6470  */
6471 void
6472 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
6473 {
6474         mac_impl_t              *mip = mcip->mci_mip;
6475         mac_ring_t              *ring;
6476 
6477         ASSERT(group != MAC_DEFAULT_RX_GROUP(mip));
6478 
6479         if (mip->mi_rx_donor_grp == group)
6480                 mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip);
6481 
6482         /*
6483          * This is the case where there are no clients left. Any
6484          * SRS etc on this group have also be quiesced.
6485          */
6486         for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
6487                 if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
6488                         ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
6489                         /*
6490                          * Remove the SRS associated with the HW ring.
6491                          * As a result, polling will be disabled.
6492                          */
6493                         ring->mr_srs = NULL;
6494                 }
6495                 ASSERT(group->mrg_state < MAC_GROUP_STATE_RESERVED ||
6496                     ring->mr_state == MR_INUSE);
6497                 if (ring->mr_state == MR_INUSE) {
6498                         mac_stop_ring(ring);
6499                         ring->mr_flag = 0;
6500                 }
6501         }
6502 
6503         /* remove group from share */
6504         if (mcip->mci_share != NULL) {
6505                 mip->mi_share_capab.ms_sremove(mcip->mci_share,
6506                     group->mrg_driver);
6507         }
6508 
6509         if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6510                 mac_ring_t *ring;
6511 
6512                 /*
6513                  * Rings were dynamically allocated to group.
6514                  * Move rings back to default group.
6515                  */
6516                 while ((ring = group->mrg_rings) != NULL) {
6517                         (void) mac_group_mov_ring(mip, mip->mi_rx_donor_grp,
6518                             ring);
6519                 }
6520         }
6521         mac_stop_group(group);
6522         /*
6523          * Possible improvement: See if we can assign the group just released
6524          * to a another client of the mip
6525          */
6526 }
6527 
6528 /*
6529  * When we move the primary's mac address between groups, we need to also
6530  * take all the clients sharing the same mac address along with it (VLANs)
6531  * We remove the mac address for such clients from the group after quiescing
6532  * them. When we add the mac address we restart the client. Note that
6533  * the primary's mac address is removed from the group after all the
6534  * other clients sharing the address are removed. Similarly, the primary's
6535  * mac address is added before all the other client's mac address are
6536  * added. While grp is the group where the clients reside, tgrp is
6537  * the group where the addresses have to be added.
6538  */
6539 static void
6540 mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp,
6541     mac_group_t *tgrp, uint8_t *maddr, boolean_t add)
6542 {
6543         mac_impl_t              *mip = mcip->mci_mip;
6544         mac_grp_client_t        *mgcp = grp->mrg_clients;
6545         mac_client_impl_t       *gmcip;
6546         boolean_t               prim;
6547 
6548         prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6549 
6550         /*
6551          * If the clients are in a non-default group, we just have to
6552          * walk the group's client list. If it is in the default group
6553          * (which will be shared by other clients as well, we need to
6554          * check if the unicast address matches mcip's unicast.
6555          */
6556         while (mgcp != NULL) {
6557                 gmcip = mgcp->mgc_client;
6558                 if (gmcip != mcip &&
6559                     (grp != MAC_DEFAULT_RX_GROUP(mip) ||
6560                     mcip->mci_unicast == gmcip->mci_unicast)) {
6561                         if (!add) {
6562                                 mac_rx_client_quiesce(
6563                                     (mac_client_handle_t)gmcip);
6564                                 (void) mac_remove_macaddr(mcip->mci_unicast);
6565                         } else {
6566                                 (void) mac_add_macaddr(mip, tgrp, maddr, prim);
6567                                 mac_rx_client_restart(
6568                                     (mac_client_handle_t)gmcip);
6569                         }
6570                 }
6571                 mgcp = mgcp->mgc_next;
6572         }
6573 }
6574 
6575 
6576 /*
6577  * Move the MAC address from fgrp to tgrp. If this is the primary client,
6578  * we need to take any VLANs etc. together too.
6579  */
6580 static int
6581 mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp,
6582     mac_group_t *tgrp)
6583 {
6584         mac_impl_t              *mip = mcip->mci_mip;
6585         uint8_t                 maddr[MAXMACADDRLEN];
6586         int                     err = 0;
6587         boolean_t               prim;
6588         boolean_t               multiclnt = B_FALSE;
6589 
6590         mac_rx_client_quiesce((mac_client_handle_t)mcip);
6591         ASSERT(mcip->mci_unicast != NULL);
6592         bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len);
6593 
6594         prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
6595         if (mcip->mci_unicast->ma_nusers > 1) {
6596                 mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE);
6597                 multiclnt = B_TRUE;
6598         }
6599         ASSERT(mcip->mci_unicast->ma_nusers == 1);
6600         err = mac_remove_macaddr(mcip->mci_unicast);
6601         if (err != 0) {
6602                 mac_rx_client_restart((mac_client_handle_t)mcip);
6603                 if (multiclnt) {
6604                         mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6605                             B_TRUE);
6606                 }
6607                 return (err);
6608         }
6609         /*
6610          * Program the H/W Classifier first, if this fails we need
6611          * not proceed with the other stuff.
6612          */
6613         if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) {
6614                 /* Revert back the H/W Classifier */
6615                 if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) {
6616                         /*
6617                          * This should not fail now since it worked earlier,
6618                          * should we panic?
6619                          */
6620                         cmn_err(CE_WARN,
6621                             "mac_rx_switch_group: switching %p back"
6622                             " to group %p failed!!", (void *)mcip,
6623                             (void *)fgrp);
6624                 }
6625                 mac_rx_client_restart((mac_client_handle_t)mcip);
6626                 if (multiclnt) {
6627                         mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr,
6628                             B_TRUE);
6629                 }
6630                 return (err);
6631         }
6632         mcip->mci_unicast = mac_find_macaddr(mip, maddr);
6633         mac_rx_client_restart((mac_client_handle_t)mcip);
6634         if (multiclnt)
6635                 mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE);
6636         return (err);
6637 }
6638 
6639 /*
6640  * Switch the MAC client from one group to another. This means we need
6641  * to remove the MAC address from the group, remove the MAC client,
6642  * teardown the SRSs and revert the group state. Then, we add the client
6643  * to the destination group, set the SRSs, and add the MAC address to the
6644  * group.
6645  */
6646 int
6647 mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
6648     mac_group_t *tgrp)
6649 {
6650         int                     err;
6651         mac_group_state_t       next_state;
6652         mac_client_impl_t       *group_only_mcip;
6653         mac_client_impl_t       *gmcip;
6654         mac_impl_t              *mip = mcip->mci_mip;
6655         mac_grp_client_t        *mgcp;
6656 
6657         ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group);
6658 
6659         if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0)
6660                 return (err);
6661 
6662         /*
6663          * The group might be reserved, but SRSs may not be set up, e.g.
6664          * primary and its vlans using a reserved group.
6665          */
6666         if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6667             MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
6668                 mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE);
6669         }
6670         if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) {
6671                 mgcp = fgrp->mrg_clients;
6672                 while (mgcp != NULL) {
6673                         gmcip = mgcp->mgc_client;
6674                         mgcp = mgcp->mgc_next;
6675                         mac_group_remove_client(fgrp, gmcip);
6676                         mac_group_add_client(tgrp, gmcip);
6677                         gmcip->mci_flent->fe_rx_ring_group = tgrp;
6678                 }
6679                 mac_release_rx_group(mcip, fgrp);
6680                 ASSERT(MAC_GROUP_NO_CLIENT(fgrp));
6681                 mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED);
6682         } else {
6683                 mac_group_remove_client(fgrp, mcip);
6684                 mac_group_add_client(tgrp, mcip);
6685                 mcip->mci_flent->fe_rx_ring_group = tgrp;
6686                 /*
6687                  * If there are other clients (VLANs) sharing this address
6688                  * we should be here only for the primary.
6689                  */
6690                 if (mcip->mci_unicast->ma_nusers > 1) {
6691                         /*
6692                          * We need to move all the clients that are using
6693                          * this h/w address.
6694                          */
6695                         mgcp = fgrp->mrg_clients;
6696                         while (mgcp != NULL) {
6697                                 gmcip = mgcp->mgc_client;
6698                                 mgcp = mgcp->mgc_next;
6699                                 if (mcip->mci_unicast == gmcip->mci_unicast) {
6700                                         mac_group_remove_client(fgrp, gmcip);
6701                                         mac_group_add_client(tgrp, gmcip);
6702                                         gmcip->mci_flent->fe_rx_ring_group =
6703                                             tgrp;
6704                                 }
6705                         }
6706                 }
6707                 /*
6708                  * The default group will still take the multicast,
6709                  * broadcast traffic etc., so it won't go to
6710                  * MAC_GROUP_STATE_REGISTERED.
6711                  */
6712                 if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED)
6713                         mac_rx_group_unmark(fgrp, MR_CONDEMNED);
6714                 mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED);
6715         }
6716         next_state = mac_group_next_state(tgrp, &group_only_mcip,
6717             MAC_DEFAULT_RX_GROUP(mip), B_TRUE);
6718         mac_set_group_state(tgrp, next_state);
6719         /*
6720          * If the destination group is reserved, setup the SRSs etc.
6721          */
6722         if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
6723                 mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK);
6724                 mac_fanout_setup(mcip, mcip->mci_flent,
6725                     MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, mcip, NULL,
6726                     NULL);
6727                 mac_rx_group_unmark(tgrp, MR_INCIPIENT);
6728         } else {
6729                 mac_rx_switch_grp_to_sw(tgrp);
6730         }
6731         return (0);
6732 }
6733 
6734 /*
6735  * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
6736  * when a share was allocated to the client.
6737  */
6738 mac_group_t *
6739 mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move)
6740 {
6741         mac_impl_t              *mip = mcip->mci_mip;
6742         mac_group_t             *grp = NULL;
6743         int                     rv;
6744         int                     i;
6745         int                     err;
6746         mac_group_t             *defgrp;
6747         mac_share_handle_t      share = mcip->mci_share;
6748         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
6749         int                     nrings;
6750         int                     defnrings;
6751         boolean_t               need_exclgrp = B_FALSE;
6752         int                     need_rings = 0;
6753         mac_group_t             *candidate_grp = NULL;
6754         mac_client_impl_t       *gclient;
6755         mac_resource_props_t    *gmrp;
6756         boolean_t               txhw = mrp->mrp_mask & MRP_TX_RINGS;
6757         boolean_t               unspec = mrp->mrp_mask & MRP_TXRINGS_UNSPEC;
6758         boolean_t               isprimary;
6759 
6760         isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC;
6761         /*
6762          * When we come here for a VLAN on the primary (dladm create-vlan),
6763          * we need to pair it along with the primary (to keep it consistent
6764          * with the RX side). So, we check if the primary is already assigned
6765          * to a group and return the group if so. The other way is also
6766          * true, i.e. the VLAN is already created and now we are plumbing
6767          * the primary.
6768          */
6769         if (!move && isprimary) {
6770                 for (gclient = mip->mi_clients_list; gclient != NULL;
6771                     gclient = gclient->mci_client_next) {
6772                         if (gclient->mci_flent->fe_type & FLOW_PRIMARY_MAC &&
6773                             gclient->mci_flent->fe_tx_ring_group != NULL) {
6774                                 return (gclient->mci_flent->fe_tx_ring_group);
6775                         }
6776                 }
6777         }
6778 
6779         if (mip->mi_tx_groups == NULL || mip->mi_tx_group_count == 0)
6780                 return (NULL);
6781 
6782         /* For dynamic groups, default unspec to 1 */
6783         if (txhw && unspec &&
6784             mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6785                 mrp->mrp_ntxrings = 1;
6786         }
6787         /*
6788          * For static grouping we allow only specifying rings=0 and
6789          * unspecified
6790          */
6791         if (txhw && mrp->mrp_ntxrings > 0 &&
6792             mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC) {
6793                 return (NULL);
6794         }
6795 
6796         if (txhw) {
6797                 /*
6798                  * We have explicitly asked for a group (with ntxrings,
6799                  * if unspec).
6800                  */
6801                 if (unspec || mrp->mrp_ntxrings > 0) {
6802                         need_exclgrp = B_TRUE;
6803                         need_rings = mrp->mrp_ntxrings;
6804                 } else if (mrp->mrp_ntxrings == 0) {
6805                         /*
6806                          * We have asked for a software group.
6807                          */
6808                         return (NULL);
6809                 }
6810         }
6811         defgrp = MAC_DEFAULT_TX_GROUP(mip);
6812         /*
6813          * The number of rings that the default group can donate.
6814          * We need to leave at least one ring - the default ring - in
6815          * this group.
6816          */
6817         defnrings = defgrp->mrg_cur_count - 1;
6818 
6819         /*
6820          * Primary gets default group unless explicitly told not
6821          * to  (i.e. rings > 0).
6822          */
6823         if (isprimary && !need_exclgrp)
6824                 return (NULL);
6825 
6826         nrings = (mrp->mrp_mask & MRP_TX_RINGS) != 0 ? mrp->mrp_ntxrings : 1;
6827         for (i = 0; i <  mip->mi_tx_group_count; i++) {
6828                 grp = &mip->mi_tx_groups[i];
6829                 if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
6830                     (grp->mrg_state == MAC_GROUP_STATE_UNINIT)) {
6831                         /*
6832                          * Select a candidate for replacement if we don't
6833                          * get an exclusive group. A candidate group is one
6834                          * that didn't ask for an exclusive group, but got
6835                          * one and it has enough rings (combined with what
6836                          * the default group can donate) for the new MAC
6837                          * client.
6838                          */
6839                         if (grp->mrg_state == MAC_GROUP_STATE_RESERVED &&
6840                             candidate_grp == NULL) {
6841                                 gclient = MAC_GROUP_ONLY_CLIENT(grp);
6842                                 if (gclient == NULL)
6843                                         gclient = mac_get_grp_primary(grp);
6844                                 gmrp = MCIP_RESOURCE_PROPS(gclient);
6845                                 if (gclient->mci_share == NULL &&
6846                                     (gmrp->mrp_mask & MRP_TX_RINGS) == 0 &&
6847                                     (unspec ||
6848                                     (grp->mrg_cur_count + defnrings) >=
6849                                     need_rings)) {
6850                                         candidate_grp = grp;
6851                                 }
6852                         }
6853                         continue;
6854                 }
6855                 /*
6856                  * If the default can't donate let's just walk and
6857                  * see if someone can vacate a group, so that we have
6858                  * enough rings for this.
6859                  */
6860                 if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC ||
6861                     nrings <= defnrings) {
6862                         if (grp->mrg_state == MAC_GROUP_STATE_REGISTERED) {
6863                                 rv = mac_start_group(grp);
6864                                 ASSERT(rv == 0);
6865                         }
6866                         break;
6867                 }
6868         }
6869 
6870         /* The default group */
6871         if (i >= mip->mi_tx_group_count) {
6872                 /*
6873                  * If we need an exclusive group and have identified a
6874                  * candidate group we switch the MAC client from the
6875                  * candidate group to the default group and give the
6876                  * candidate group to this client.
6877                  */
6878                 if (need_exclgrp && candidate_grp != NULL) {
6879                         /*
6880                          * Switch the MAC client from the candidate group
6881                          * to the default group.
6882                          */
6883                         grp = candidate_grp;
6884                         gclient = MAC_GROUP_ONLY_CLIENT(grp);
6885                         if (gclient == NULL)
6886                                 gclient = mac_get_grp_primary(grp);
6887                         mac_tx_client_quiesce((mac_client_handle_t)gclient);
6888                         mac_tx_switch_group(gclient, grp, defgrp);
6889                         mac_tx_client_restart((mac_client_handle_t)gclient);
6890 
6891                         /*
6892                          * Give the candidate group with the specified number
6893                          * of rings to this MAC client.
6894                          */
6895                         ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
6896                         rv = mac_start_group(grp);
6897                         ASSERT(rv == 0);
6898 
6899                         if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC)
6900                                 return (grp);
6901 
6902                         ASSERT(grp->mrg_cur_count == 0);
6903                         ASSERT(defgrp->mrg_cur_count > need_rings);
6904 
6905                         err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX,
6906                             defgrp, grp, share, need_rings);
6907                         if (err == 0) {
6908                                 /*
6909                                  * For a share i_mac_group_allocate_rings gets
6910                                  * the rings from the driver, let's populate
6911                                  * the property for the client now.
6912                                  */
6913                                 if (share != NULL) {
6914                                         mac_client_set_rings(
6915                                             (mac_client_handle_t)mcip, -1,
6916                                             grp->mrg_cur_count);
6917                                 }
6918                                 mip->mi_tx_group_free--;
6919                                 return (grp);
6920                         }
6921                         DTRACE_PROBE3(tx__group__reserve__alloc__rings, char *,
6922                             mip->mi_name, int, grp->mrg_index, int, err);
6923                         mac_stop_group(grp);
6924                 }
6925                 return (NULL);
6926         }
6927         /*
6928          * We got an exclusive group, but it is not dynamic.
6929          */
6930         if (mip->mi_tx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
6931                 mip->mi_tx_group_free--;
6932                 return (grp);
6933         }
6934 
6935         rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, defgrp, grp,
6936             share, nrings);
6937         if (rv != 0) {
6938                 DTRACE_PROBE3(tx__group__reserve__alloc__rings,
6939                     char *, mip->mi_name, int, grp->mrg_index, int, rv);
6940                 mac_stop_group(grp);
6941                 return (NULL);
6942         }
6943         /*
6944          * For a share i_mac_group_allocate_rings gets the rings from the
6945          * driver, let's populate the property for the client now.
6946          */
6947         if (share != NULL) {
6948                 mac_client_set_rings((mac_client_handle_t)mcip, -1,
6949                     grp->mrg_cur_count);
6950         }
6951         mip->mi_tx_group_free--;
6952         return (grp);
6953 }
6954 
6955 void
6956 mac_release_tx_group(mac_client_impl_t *mcip, mac_group_t *grp)
6957 {
6958         mac_impl_t              *mip = mcip->mci_mip;
6959         mac_share_handle_t      share = mcip->mci_share;
6960         mac_ring_t              *ring;
6961         mac_soft_ring_set_t     *srs = MCIP_TX_SRS(mcip);
6962         mac_group_t             *defgrp;
6963 
6964         defgrp = MAC_DEFAULT_TX_GROUP(mip);
6965         if (srs != NULL) {
6966                 if (srs->srs_soft_ring_count > 0) {
6967                         for (ring = grp->mrg_rings; ring != NULL;
6968                             ring = ring->mr_next) {
6969                                 ASSERT(mac_tx_srs_ring_present(srs, ring));
6970                                 mac_tx_invoke_callbacks(mcip,
6971                                     (mac_tx_cookie_t)
6972                                     mac_tx_srs_get_soft_ring(srs, ring));
6973                                 mac_tx_srs_del_ring(srs, ring);
6974                         }
6975                 } else {
6976                         ASSERT(srs->srs_tx.st_arg2 != NULL);
6977                         srs->srs_tx.st_arg2 = NULL;
6978                         mac_srs_stat_delete(srs);
6979                 }
6980         }
6981         if (share != NULL)
6982                 mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
6983 
6984         /* move the ring back to the pool */
6985         if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
6986                 while ((ring = grp->mrg_rings) != NULL)
6987                         (void) mac_group_mov_ring(mip, defgrp, ring);
6988         }
6989         mac_stop_group(grp);
6990         mip->mi_tx_group_free++;
6991 }
6992 
6993 /*
6994  * Disassociate a MAC client from a group, i.e go through the rings in the
6995  * group and delete all the soft rings tied to them.
6996  */
6997 static void
6998 mac_tx_dismantle_soft_rings(mac_group_t *fgrp, flow_entry_t *flent)
6999 {
7000         mac_client_impl_t       *mcip = flent->fe_mcip;
7001         mac_soft_ring_set_t     *tx_srs;
7002         mac_srs_tx_t            *tx;
7003         mac_ring_t              *ring;
7004 
7005         tx_srs = flent->fe_tx_srs;
7006         tx = &tx_srs->srs_tx;
7007 
7008         /* Single ring case we haven't created any soft rings */
7009         if (tx->st_mode == SRS_TX_BW || tx->st_mode == SRS_TX_SERIALIZE ||
7010             tx->st_mode == SRS_TX_DEFAULT) {
7011                 tx->st_arg2 = NULL;
7012                 mac_srs_stat_delete(tx_srs);
7013         /* Fanout case, where we have to dismantle the soft rings */
7014         } else {
7015                 for (ring = fgrp->mrg_rings; ring != NULL;
7016                     ring = ring->mr_next) {
7017                         ASSERT(mac_tx_srs_ring_present(tx_srs, ring));
7018                         mac_tx_invoke_callbacks(mcip,
7019                             (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(tx_srs,
7020                             ring));
7021                         mac_tx_srs_del_ring(tx_srs, ring);
7022                 }
7023                 ASSERT(tx->st_arg2 == NULL);
7024         }
7025 }
7026 
/*
 * Switch the MAC client from one group to another. This means we need
 * to remove the MAC client, tear down the SRSs and revert the group state.
 * Then, we add the client to the destination group, set the SRSs etc.
 */
7032 void
7033 mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp,
7034     mac_group_t *tgrp)
7035 {
7036         mac_client_impl_t       *group_only_mcip;
7037         mac_impl_t              *mip = mcip->mci_mip;
7038         flow_entry_t            *flent = mcip->mci_flent;
7039         mac_group_t             *defgrp;
7040         mac_grp_client_t        *mgcp;
7041         mac_client_impl_t       *gmcip;
7042         flow_entry_t            *gflent;
7043 
7044         defgrp = MAC_DEFAULT_TX_GROUP(mip);
7045         ASSERT(fgrp == flent->fe_tx_ring_group);
7046 
7047         if (fgrp == defgrp) {
7048                 /*
7049                  * If this is the primary we need to find any VLANs on
7050                  * the primary and move them too.
7051                  */
7052                 mac_group_remove_client(fgrp, mcip);
7053                 mac_tx_dismantle_soft_rings(fgrp, flent);
7054                 if (mcip->mci_unicast->ma_nusers > 1) {
7055                         mgcp = fgrp->mrg_clients;
7056                         while (mgcp != NULL) {
7057                                 gmcip = mgcp->mgc_client;
7058                                 mgcp = mgcp->mgc_next;
7059                                 if (mcip->mci_unicast != gmcip->mci_unicast)
7060                                         continue;
7061                                 mac_tx_client_quiesce(
7062                                     (mac_client_handle_t)gmcip);
7063 
7064                                 gflent = gmcip->mci_flent;
7065                                 mac_group_remove_client(fgrp, gmcip);
7066                                 mac_tx_dismantle_soft_rings(fgrp, gflent);
7067 
7068                                 mac_group_add_client(tgrp, gmcip);
7069                                 gflent->fe_tx_ring_group = tgrp;
7070                                 /* We could directly set this to SHARED */
7071                                 tgrp->mrg_state = mac_group_next_state(tgrp,
7072                                     &group_only_mcip, defgrp, B_FALSE);
7073 
7074                                 mac_tx_srs_group_setup(gmcip, gflent,
7075                                     SRST_LINK);
7076                                 mac_fanout_setup(gmcip, gflent,
7077                                     MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7078                                     gmcip, NULL, NULL);
7079 
7080                                 mac_tx_client_restart(
7081                                     (mac_client_handle_t)gmcip);
7082                         }
7083                 }
7084                 if (MAC_GROUP_NO_CLIENT(fgrp)) {
7085                         mac_ring_t      *ring;
7086                         int             cnt;
7087                         int             ringcnt;
7088 
7089                         fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7090                         /*
7091                          * Additionally, we also need to stop all
7092                          * the rings in the default group, except
7093                          * the default ring. The reason being
7094                          * this group won't be released since it is
7095                          * the default group, so the rings won't
7096                          * be stopped otherwise.
7097                          */
7098                         ringcnt = fgrp->mrg_cur_count;
7099                         ring = fgrp->mrg_rings;
7100                         for (cnt = 0; cnt < ringcnt; cnt++) {
7101                                 if (ring->mr_state == MR_INUSE &&
7102                                     ring !=
7103                                     (mac_ring_t *)mip->mi_default_tx_ring) {
7104                                         mac_stop_ring(ring);
7105                                         ring->mr_flag = 0;
7106                                 }
7107                                 ring = ring->mr_next;
7108                         }
7109                 } else if (MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) {
7110                         fgrp->mrg_state = MAC_GROUP_STATE_RESERVED;
7111                 } else {
7112                         ASSERT(fgrp->mrg_state == MAC_GROUP_STATE_SHARED);
7113                 }
7114         } else {
7115                 /*
7116                  * We could have VLANs sharing the non-default group with
7117                  * the primary.
7118                  */
7119                 mgcp = fgrp->mrg_clients;
7120                 while (mgcp != NULL) {
7121                         gmcip = mgcp->mgc_client;
7122                         mgcp = mgcp->mgc_next;
7123                         if (gmcip == mcip)
7124                                 continue;
7125                         mac_tx_client_quiesce((mac_client_handle_t)gmcip);
7126                         gflent = gmcip->mci_flent;
7127 
7128                         mac_group_remove_client(fgrp, gmcip);
7129                         mac_tx_dismantle_soft_rings(fgrp, gflent);
7130 
7131                         mac_group_add_client(tgrp, gmcip);
7132                         gflent->fe_tx_ring_group = tgrp;
7133                         /* We could directly set this to SHARED */
7134                         tgrp->mrg_state = mac_group_next_state(tgrp,
7135                             &group_only_mcip, defgrp, B_FALSE);
7136                         mac_tx_srs_group_setup(gmcip, gflent, SRST_LINK);
7137                         mac_fanout_setup(gmcip, gflent,
7138                             MCIP_RESOURCE_PROPS(gmcip), mac_rx_deliver,
7139                             gmcip, NULL, NULL);
7140 
7141                         mac_tx_client_restart((mac_client_handle_t)gmcip);
7142                 }
7143                 mac_group_remove_client(fgrp, mcip);
7144                 mac_release_tx_group(mcip, fgrp);
7145                 fgrp->mrg_state = MAC_GROUP_STATE_REGISTERED;
7146         }
7147 
7148         /* Add it to the tgroup */
7149         mac_group_add_client(tgrp, mcip);
7150         flent->fe_tx_ring_group = tgrp;
7151         tgrp->mrg_state = mac_group_next_state(tgrp, &group_only_mcip,
7152             defgrp, B_FALSE);
7153 
7154         mac_tx_srs_group_setup(mcip, flent, SRST_LINK);
7155         mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
7156             mac_rx_deliver, mcip, NULL, NULL);
7157 }
7158 
7159 /*
7160  * This is a 1-time control path activity initiated by the client (IP).
7161  * The mac perimeter protects against other simultaneous control activities,
7162  * for example an ioctl that attempts to change the degree of fanout and
7163  * increase or decrease the number of softrings associated with this Tx SRS.
7164  */
7165 static mac_tx_notify_cb_t *
7166 mac_client_tx_notify_add(mac_client_impl_t *mcip,
7167     mac_tx_notify_t notify, void *arg)
7168 {
7169         mac_cb_info_t *mcbi;
7170         mac_tx_notify_cb_t *mtnfp;
7171 
7172         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7173 
7174         mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
7175         mtnfp->mtnf_fn = notify;
7176         mtnfp->mtnf_arg = arg;
7177         mtnfp->mtnf_link.mcb_objp = mtnfp;
7178         mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
7179         mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
7180 
7181         mcbi = &mcip->mci_tx_notify_cb_info;
7182         mutex_enter(mcbi->mcbi_lockp);
7183         mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
7184         mutex_exit(mcbi->mcbi_lockp);
7185         return (mtnfp);
7186 }
7187 
7188 static void
7189 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
7190 {
7191         mac_cb_info_t   *mcbi;
7192         mac_cb_t        **cblist;
7193 
7194         ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
7195 
7196         if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
7197             &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
7198                 cmn_err(CE_WARN,
7199                     "mac_client_tx_notify_remove: callback not "
7200                     "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
7201                 return;
7202         }
7203 
7204         mcbi = &mcip->mci_tx_notify_cb_info;
7205         cblist = &mcip->mci_tx_notify_cb_list;
7206         mutex_enter(mcbi->mcbi_lockp);
7207         if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
7208                 kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
7209         else
7210                 mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
7211         mutex_exit(mcbi->mcbi_lockp);
7212 }
7213 
7214 /*
7215  * mac_client_tx_notify():
7216  * call to add and remove flow control callback routine.
7217  */
7218 mac_tx_notify_handle_t
7219 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
7220     void *ptr)
7221 {
7222         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
7223         mac_tx_notify_cb_t      *mtnfp = NULL;
7224 
7225         i_mac_perim_enter(mcip->mci_mip);
7226 
7227         if (callb_func != NULL) {
7228                 /* Add a notify callback */
7229                 mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
7230         } else {
7231                 mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
7232         }
7233         i_mac_perim_exit(mcip->mci_mip);
7234 
7235         return ((mac_tx_notify_handle_t)mtnfp);
7236 }
7237 
7238 void
7239 mac_bridge_vectors(mac_bridge_tx_t txf, mac_bridge_rx_t rxf,
7240     mac_bridge_ref_t reff, mac_bridge_ls_t lsf)
7241 {
7242         mac_bridge_tx_cb = txf;
7243         mac_bridge_rx_cb = rxf;
7244         mac_bridge_ref_cb = reff;
7245         mac_bridge_ls_cb = lsf;
7246 }
7247 
7248 int
7249 mac_bridge_set(mac_handle_t mh, mac_handle_t link)
7250 {
7251         mac_impl_t *mip = (mac_impl_t *)mh;
7252         int retv;
7253 
7254         mutex_enter(&mip->mi_bridge_lock);
7255         if (mip->mi_bridge_link == NULL) {
7256                 mip->mi_bridge_link = link;
7257                 retv = 0;
7258         } else {
7259                 retv = EBUSY;
7260         }
7261         mutex_exit(&mip->mi_bridge_lock);
7262         if (retv == 0) {
7263                 mac_poll_state_change(mh, B_FALSE);
7264                 mac_capab_update(mh);
7265         }
7266         return (retv);
7267 }
7268 
7269 /*
7270  * Disable bridging on the indicated link.
7271  */
7272 void
7273 mac_bridge_clear(mac_handle_t mh, mac_handle_t link)
7274 {
7275         mac_impl_t *mip = (mac_impl_t *)mh;
7276 
7277         mutex_enter(&mip->mi_bridge_lock);
7278         ASSERT(mip->mi_bridge_link == link);
7279         mip->mi_bridge_link = NULL;
7280         mutex_exit(&mip->mi_bridge_lock);
7281         mac_poll_state_change(mh, B_TRUE);
7282         mac_capab_update(mh);
7283 }
7284 
7285 void
7286 mac_no_active(mac_handle_t mh)
7287 {
7288         mac_impl_t *mip = (mac_impl_t *)mh;
7289 
7290         i_mac_perim_enter(mip);
7291         mip->mi_state_flags |= MIS_NO_ACTIVE;
7292         i_mac_perim_exit(mip);
7293 }
7294 
7295 /*
7296  * Walk the primary VLAN clients whenever the primary's rings property
7297  * changes and update the mac_resource_props_t for the VLAN's client.
7298  * We need to do this since we don't support setting these properties
7299  * on the primary's VLAN clients, but the VLAN clients have to
7300  * follow the primary w.r.t the rings property;
7301  */
7302 void
7303 mac_set_prim_vlan_rings(mac_impl_t  *mip, mac_resource_props_t *mrp)
7304 {
7305         mac_client_impl_t       *vmcip;
7306         mac_resource_props_t    *vmrp;
7307 
7308         for (vmcip = mip->mi_clients_list; vmcip != NULL;
7309             vmcip = vmcip->mci_client_next) {
7310                 if (!(vmcip->mci_flent->fe_type & FLOW_PRIMARY_MAC) ||
7311                     mac_client_vid((mac_client_handle_t)vmcip) ==
7312                     VLAN_ID_NONE) {
7313                         continue;
7314                 }
7315                 vmrp = MCIP_RESOURCE_PROPS(vmcip);
7316 
7317                 vmrp->mrp_nrxrings =  mrp->mrp_nrxrings;
7318                 if (mrp->mrp_mask & MRP_RX_RINGS)
7319                         vmrp->mrp_mask |= MRP_RX_RINGS;
7320                 else if (vmrp->mrp_mask & MRP_RX_RINGS)
7321                         vmrp->mrp_mask &= ~MRP_RX_RINGS;
7322 
7323                 vmrp->mrp_ntxrings =  mrp->mrp_ntxrings;
7324                 if (mrp->mrp_mask & MRP_TX_RINGS)
7325                         vmrp->mrp_mask |= MRP_TX_RINGS;
7326                 else if (vmrp->mrp_mask & MRP_TX_RINGS)
7327                         vmrp->mrp_mask &= ~MRP_TX_RINGS;
7328 
7329                 if (mrp->mrp_mask & MRP_RXRINGS_UNSPEC)
7330                         vmrp->mrp_mask |= MRP_RXRINGS_UNSPEC;
7331                 else
7332                         vmrp->mrp_mask &= ~MRP_RXRINGS_UNSPEC;
7333 
7334                 if (mrp->mrp_mask & MRP_TXRINGS_UNSPEC)
7335                         vmrp->mrp_mask |= MRP_TXRINGS_UNSPEC;
7336                 else
7337                         vmrp->mrp_mask &= ~MRP_TXRINGS_UNSPEC;
7338         }
7339 }
7340 
7341 /*
7342  * We are adding or removing ring(s) from a group. The source for taking
7343  * rings is the default group. The destination for giving rings back is
7344  * the default group.
7345  */
7346 int
7347 mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group,
7348     mac_group_t *defgrp)
7349 {
7350         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
7351         uint_t                  modify;
7352         int                     count;
7353         mac_ring_t              *ring;
7354         mac_ring_t              *next;
7355         mac_impl_t              *mip = mcip->mci_mip;
7356         mac_ring_t              **rings;
7357         uint_t                  ringcnt;
7358         int                     i = 0;
7359         boolean_t               rx_group = group->mrg_type == MAC_RING_TYPE_RX;
7360         int                     start;
7361         int                     end;
7362         mac_group_t             *tgrp;
7363         int                     j;
7364         int                     rv = 0;
7365 
7366         /*
7367          * If we are asked for just a group, we give 1 ring, else
7368          * the specified number of rings.
7369          */
7370         if (rx_group) {
7371                 ringcnt = (mrp->mrp_mask & MRP_RXRINGS_UNSPEC) ? 1:
7372                     mrp->mrp_nrxrings;
7373         } else {
7374                 ringcnt = (mrp->mrp_mask & MRP_TXRINGS_UNSPEC) ? 1:
7375                     mrp->mrp_ntxrings;
7376         }
7377 
7378         /* don't allow modifying rings for a share for now. */
7379         ASSERT(mcip->mci_share == NULL);
7380 
7381         if (ringcnt == group->mrg_cur_count)
7382                 return (0);
7383 
7384         if (group->mrg_cur_count > ringcnt) {
7385                 modify = group->mrg_cur_count - ringcnt;
7386                 if (rx_group) {
7387                         if (mip->mi_rx_donor_grp == group) {
7388                                 ASSERT(mac_is_primary_client(mcip));
7389                                 mip->mi_rx_donor_grp = defgrp;
7390                         } else {
7391                                 defgrp = mip->mi_rx_donor_grp;
7392                         }
7393                 }
7394                 ring = group->mrg_rings;
7395                 rings = kmem_alloc(modify * sizeof (mac_ring_handle_t),
7396                     KM_SLEEP);
7397                 j = 0;
7398                 for (count = 0; count < modify; count++) {
7399                         next = ring->mr_next;
7400                         rv = mac_group_mov_ring(mip, defgrp, ring);
7401                         if (rv != 0) {
7402                                 /* cleanup on failure */
7403                                 for (j = 0; j < count; j++) {
7404                                         (void) mac_group_mov_ring(mip, group,
7405                                             rings[j]);
7406                                 }
7407                                 break;
7408                         }
7409                         rings[j++] = ring;
7410                         ring = next;
7411                 }
7412                 kmem_free(rings, modify * sizeof (mac_ring_handle_t));
7413                 return (rv);
7414         }
7415         if (ringcnt >= MAX_RINGS_PER_GROUP)
7416                 return (EINVAL);
7417 
7418         modify = ringcnt - group->mrg_cur_count;
7419 
7420         if (rx_group) {
7421                 if (group != mip->mi_rx_donor_grp)
7422                         defgrp = mip->mi_rx_donor_grp;
7423                 else
7424                         /*
7425                          * This is the donor group with all the remaining
7426                          * rings. Default group now gets to be the donor
7427                          */
7428                         mip->mi_rx_donor_grp = defgrp;
7429                 start = 1;
7430                 end = mip->mi_rx_group_count;
7431         } else {
7432                 start = 0;
7433                 end = mip->mi_tx_group_count - 1;
7434         }
7435         /*
7436          * If the default doesn't have any rings, lets see if we can
7437          * take rings given to an h/w client that doesn't need it.
7438          * For now, we just see if there is  any one client that can donate
7439          * all the required rings.
7440          */
7441         if (defgrp->mrg_cur_count < (modify + 1)) {
7442                 for (i = start; i < end; i++) {
7443                         if (rx_group) {
7444                                 tgrp = &mip->mi_rx_groups[i];
7445                                 if (tgrp == group || tgrp->mrg_state <
7446                                     MAC_GROUP_STATE_RESERVED) {
7447                                         continue;
7448                                 }
7449                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
7450                                 if (mcip == NULL)
7451                                         mcip = mac_get_grp_primary(tgrp);
7452                                 ASSERT(mcip != NULL);
7453                                 mrp = MCIP_RESOURCE_PROPS(mcip);
7454                                 if ((mrp->mrp_mask & MRP_RX_RINGS) != 0)
7455                                         continue;
7456                                 if ((tgrp->mrg_cur_count +
7457                                     defgrp->mrg_cur_count) < (modify + 1)) {
7458                                         continue;
7459                                 }
7460                                 if (mac_rx_switch_group(mcip, tgrp,
7461                                     defgrp) != 0) {
7462                                         return (ENOSPC);
7463                                 }
7464                         } else {
7465                                 tgrp = &mip->mi_tx_groups[i];
7466                                 if (tgrp == group || tgrp->mrg_state <
7467                                     MAC_GROUP_STATE_RESERVED) {
7468                                         continue;
7469                                 }
7470                                 mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
7471                                 if (mcip == NULL)
7472                                         mcip = mac_get_grp_primary(tgrp);
7473                                 mrp = MCIP_RESOURCE_PROPS(mcip);
7474                                 if ((mrp->mrp_mask & MRP_TX_RINGS) != 0)
7475                                         continue;
7476                                 if ((tgrp->mrg_cur_count +
7477                                     defgrp->mrg_cur_count) < (modify + 1)) {
7478                                         continue;
7479                                 }
7480                                 /* OK, we can switch this to s/w */
7481                                 mac_tx_client_quiesce(
7482                                     (mac_client_handle_t)mcip);
7483                                 mac_tx_switch_group(mcip, tgrp, defgrp);
7484                                 mac_tx_client_restart(
7485                                     (mac_client_handle_t)mcip);
7486                         }
7487                 }
7488                 if (defgrp->mrg_cur_count < (modify + 1))
7489                         return (ENOSPC);
7490         }
7491         if ((rv = i_mac_group_allocate_rings(mip, group->mrg_type, defgrp,
7492             group, mcip->mci_share, modify)) != 0) {
7493                 return (rv);
7494         }
7495         return (0);
7496 }
7497 
7498 /*
7499  * Given the poolname in mac_resource_props, find the cpupart
7500  * that is associated with this pool.  The cpupart will be used
7501  * later for finding the cpus to be bound to the networking threads.
7502  *
7503  * use_default is set B_TRUE if pools are enabled and pool_default
7504  * is returned.  This avoids a 2nd lookup to set the poolname
7505  * for pool-effective.
7506  *
7507  * returns:
7508  *
7509  *    NULL -   pools are disabled or if the 'cpus' property is set.
7510  *    cpupart of pool_default  - pools are enabled and the pool
7511  *             is not available or poolname is blank
7512  *    cpupart of named pool    - pools are enabled and the pool
7513  *             is available.
7514  */
7515 cpupart_t *
7516 mac_pset_find(mac_resource_props_t *mrp, boolean_t *use_default)
7517 {
7518         pool_t          *pool;
7519         cpupart_t       *cpupart;
7520 
7521         *use_default = B_FALSE;
7522 
7523         /* CPUs property is set */
7524         if (mrp->mrp_mask & MRP_CPUS)
7525                 return (NULL);
7526 
7527         ASSERT(pool_lock_held());
7528 
7529         /* Pools are disabled, no pset */
7530         if (pool_state == POOL_DISABLED)
7531                 return (NULL);
7532 
7533         /* Pools property is set */
7534         if (mrp->mrp_mask & MRP_POOL) {
7535                 if ((pool = pool_lookup_pool_by_name(mrp->mrp_pool)) == NULL) {
7536                         /* Pool not found */
7537                         DTRACE_PROBE1(mac_pset_find_no_pool, char *,
7538                             mrp->mrp_pool);
7539                         *use_default = B_TRUE;
7540                         pool = pool_default;
7541                 }
7542         /* Pools property is not set */
7543         } else {
7544                 *use_default = B_TRUE;
7545                 pool = pool_default;
7546         }
7547 
7548         /* Find the CPU pset that corresponds to the pool */
7549         mutex_enter(&cpu_lock);
7550         if ((cpupart = cpupart_find(pool->pool_pset->pset_id)) == NULL) {
7551                 DTRACE_PROBE1(mac_find_pset_no_pset, psetid_t,
7552                     pool->pool_pset->pset_id);
7553         }
7554         mutex_exit(&cpu_lock);
7555 
7556         return (cpupart);
7557 }
7558 
7559 void
7560 mac_set_pool_effective(boolean_t use_default, cpupart_t *cpupart,
7561     mac_resource_props_t *mrp, mac_resource_props_t *emrp)
7562 {
7563         ASSERT(pool_lock_held());
7564 
7565         if (cpupart != NULL) {
7566                 emrp->mrp_mask |= MRP_POOL;
7567                 if (use_default) {
7568                         (void) strcpy(emrp->mrp_pool,
7569                             "pool_default");
7570                 } else {
7571                         ASSERT(strlen(mrp->mrp_pool) != 0);
7572                         (void) strcpy(emrp->mrp_pool,
7573                             mrp->mrp_pool);
7574                 }
7575         } else {
7576                 emrp->mrp_mask &= ~MRP_POOL;
7577                 bzero(emrp->mrp_pool, MAXPATHLEN);
7578         }
7579 }
7580 
/*
 * Argument handed from mac_pool_event_cb() through mac_pool_update() to
 * the per-mac walker mac_pool_link_update().
 */
struct mac_pool_arg {
        /* name of the pool; only filled in for POOL_E_CHANGE events */
        char            mpa_poolname[MAXPATHLEN];
        /* the pool event being reported */
        pool_event_t    mpa_what;
};
7585 
7586 /*ARGSUSED*/
7587 static uint_t
7588 mac_pool_link_update(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
7589 {
7590         struct mac_pool_arg     *mpa = arg;
7591         mac_impl_t              *mip = (mac_impl_t *)val;
7592         mac_client_impl_t       *mcip;
7593         mac_resource_props_t    *mrp, *emrp;
7594         boolean_t               pool_update = B_FALSE;
7595         boolean_t               pool_clear = B_FALSE;
7596         boolean_t               use_default = B_FALSE;
7597         cpupart_t               *cpupart = NULL;
7598 
7599         mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
7600         i_mac_perim_enter(mip);
7601         for (mcip = mip->mi_clients_list; mcip != NULL;
7602             mcip = mcip->mci_client_next) {
7603                 pool_update = B_FALSE;
7604                 pool_clear = B_FALSE;
7605                 use_default = B_FALSE;
7606                 mac_client_get_resources((mac_client_handle_t)mcip, mrp);
7607                 emrp = MCIP_EFFECTIVE_PROPS(mcip);
7608 
7609                 /*
7610                  * When pools are enabled
7611                  */
7612                 if ((mpa->mpa_what == POOL_E_ENABLE) &&
7613                     ((mrp->mrp_mask & MRP_CPUS) == 0)) {
7614                         mrp->mrp_mask |= MRP_POOL;
7615                         pool_update = B_TRUE;
7616                 }
7617 
7618                 /*
7619                  * When pools are disabled
7620                  */
7621                 if ((mpa->mpa_what == POOL_E_DISABLE) &&
7622                     ((mrp->mrp_mask & MRP_CPUS) == 0)) {
7623                         mrp->mrp_mask |= MRP_POOL;
7624                         pool_clear = B_TRUE;
7625                 }
7626 
7627                 /*
7628                  * Look for links with the pool property set and the poolname
7629                  * matching the one which is changing.
7630                  */
7631                 if (strcmp(mrp->mrp_pool, mpa->mpa_poolname) == 0) {
7632                         /*
7633                          * The pool associated with the link has changed.
7634                          */
7635                         if (mpa->mpa_what == POOL_E_CHANGE) {
7636                                 mrp->mrp_mask |= MRP_POOL;
7637                                 pool_update = B_TRUE;
7638                         }
7639                 }
7640 
7641                 /*
7642                  * This link is associated with pool_default and
7643                  * pool_default has changed.
7644                  */
7645                 if ((mpa->mpa_what == POOL_E_CHANGE) &&
7646                     (strcmp(emrp->mrp_pool, "pool_default") == 0) &&
7647                     (strcmp(mpa->mpa_poolname, "pool_default") == 0)) {
7648                         mrp->mrp_mask |= MRP_POOL;
7649                         pool_update = B_TRUE;
7650                 }
7651 
7652                 /*
7653                  * Get new list of cpus for the pool, bind network
7654                  * threads to new list of cpus and update resources.
7655                  */
7656                 if (pool_update) {
7657                         if (MCIP_DATAPATH_SETUP(mcip)) {
7658                                 pool_lock();
7659                                 cpupart = mac_pset_find(mrp, &use_default);
7660                                 mac_fanout_setup(mcip, mcip->mci_flent, mrp,
7661                                     mac_rx_deliver, mcip, NULL, cpupart);
7662                                 mac_set_pool_effective(use_default, cpupart,
7663                                     mrp, emrp);
7664                                 pool_unlock();
7665                         }
7666                         mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
7667                             B_FALSE);
7668                 }
7669 
7670                 /*
7671                  * Clear the effective pool and bind network threads
7672                  * to any available CPU.
7673                  */
7674                 if (pool_clear) {
7675                         if (MCIP_DATAPATH_SETUP(mcip)) {
7676                                 emrp->mrp_mask &= ~MRP_POOL;
7677                                 bzero(emrp->mrp_pool, MAXPATHLEN);
7678                                 mac_fanout_setup(mcip, mcip->mci_flent, mrp,
7679                                     mac_rx_deliver, mcip, NULL, NULL);
7680                         }
7681                         mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip),
7682                             B_FALSE);
7683                 }
7684         }
7685         i_mac_perim_exit(mip);
7686         kmem_free(mrp, sizeof (*mrp));
7687         return (MH_WALK_CONTINUE);
7688 }
7689 
7690 static void
7691 mac_pool_update(void *arg)
7692 {
7693         mod_hash_walk(i_mac_impl_hash, mac_pool_link_update, arg);
7694         kmem_free(arg, sizeof (struct mac_pool_arg));
7695 }
7696 
7697 /*
7698  * Callback function to be executed when a noteworthy pool event
7699  * takes place.
7700  */
7701 /* ARGSUSED */
7702 static void
7703 mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg)
7704 {
7705         pool_t                  *pool;
7706         char                    *poolname = NULL;
7707         struct mac_pool_arg     *mpa;
7708 
7709         pool_lock();
7710         mpa = kmem_zalloc(sizeof (struct mac_pool_arg), KM_SLEEP);
7711 
7712         switch (what) {
7713         case POOL_E_ENABLE:
7714         case POOL_E_DISABLE:
7715                 break;
7716 
7717         case POOL_E_CHANGE:
7718                 pool = pool_lookup_pool_by_id(id);
7719                 if (pool == NULL) {
7720                         kmem_free(mpa, sizeof (struct mac_pool_arg));
7721                         pool_unlock();
7722                         return;
7723                 }
7724                 pool_get_name(pool, &poolname);
7725                 (void) strlcpy(mpa->mpa_poolname, poolname,
7726                     sizeof (mpa->mpa_poolname));
7727                 break;
7728 
7729         default:
7730                 kmem_free(mpa, sizeof (struct mac_pool_arg));
7731                 pool_unlock();
7732                 return;
7733         }
7734         pool_unlock();
7735 
7736         mpa->mpa_what = what;
7737 
7738         mac_pool_update(mpa);
7739 }
7740 
/*
 * Set effective rings property. This could be called from datapath_setup/
 * datapath_teardown or set-linkprop.
 * If the group is reserved we just go ahead and set the effective rings.
 * Additionally, for TX this could mean the default group has lost/gained
 * some rings, so if the default group is reserved, we need to adjust the
 * effective rings for the default group clients. For RX, if we are working
 * with the non-default group, we just need to reset the effective props
 * for the default group clients.
 */
7751 void
7752 mac_set_rings_effective(mac_client_impl_t *mcip)
7753 {
7754         mac_impl_t              *mip = mcip->mci_mip;
7755         mac_group_t             *grp;
7756         mac_group_t             *defgrp;
7757         flow_entry_t            *flent = mcip->mci_flent;
7758         mac_resource_props_t    *emrp = MCIP_EFFECTIVE_PROPS(mcip);
7759         mac_grp_client_t        *mgcp;
7760         mac_client_impl_t       *gmcip;
7761 
7762         grp = flent->fe_rx_ring_group;
7763         if (grp != NULL) {
7764                 defgrp = MAC_DEFAULT_RX_GROUP(mip);
7765                 /*
7766                  * If we have reserved a group, set the effective rings
7767                  * to the ring count in the group.
7768                  */
7769                 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7770                         emrp->mrp_mask |= MRP_RX_RINGS;
7771                         emrp->mrp_nrxrings = grp->mrg_cur_count;
7772                 }
7773 
7774                 /*
7775                  * We go through the clients in the shared group and
7776                  * reset the effective properties. It is possible this
7777                  * might have already been done for some client (i.e.
7778                  * if some client is being moved to a group that is
7779                  * already shared). The case where the default group is
7780                  * RESERVED is taken care of above (note in the RX side if
7781                  * there is a non-default group, the default group is always
7782                  * SHARED).
7783                  */
7784                 if (grp != defgrp || grp->mrg_state == MAC_GROUP_STATE_SHARED) {
7785                         if (grp->mrg_state == MAC_GROUP_STATE_SHARED)
7786                                 mgcp = grp->mrg_clients;
7787                         else
7788                                 mgcp = defgrp->mrg_clients;
7789                         while (mgcp != NULL) {
7790                                 gmcip = mgcp->mgc_client;
7791                                 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7792                                 if (emrp->mrp_mask & MRP_RX_RINGS) {
7793                                         emrp->mrp_mask &= ~MRP_RX_RINGS;
7794                                         emrp->mrp_nrxrings = 0;
7795                                 }
7796                                 mgcp = mgcp->mgc_next;
7797                         }
7798                 }
7799         }
7800 
7801         /* Now the TX side */
7802         grp = flent->fe_tx_ring_group;
7803         if (grp != NULL) {
7804                 defgrp = MAC_DEFAULT_TX_GROUP(mip);
7805 
7806                 if (grp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7807                         emrp->mrp_mask |= MRP_TX_RINGS;
7808                         emrp->mrp_ntxrings = grp->mrg_cur_count;
7809                 } else if (grp->mrg_state == MAC_GROUP_STATE_SHARED) {
7810                         mgcp = grp->mrg_clients;
7811                         while (mgcp != NULL) {
7812                                 gmcip = mgcp->mgc_client;
7813                                 emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7814                                 if (emrp->mrp_mask & MRP_TX_RINGS) {
7815                                         emrp->mrp_mask &= ~MRP_TX_RINGS;
7816                                         emrp->mrp_ntxrings = 0;
7817                                 }
7818                                 mgcp = mgcp->mgc_next;
7819                         }
7820                 }
7821 
7822                 /*
7823                  * If the group is not the default group and the default
7824                  * group is reserved, the ring count in the default group
7825                  * might have changed, update it.
7826                  */
7827                 if (grp != defgrp &&
7828                     defgrp->mrg_state == MAC_GROUP_STATE_RESERVED) {
7829                         gmcip = MAC_GROUP_ONLY_CLIENT(defgrp);
7830                         emrp = MCIP_EFFECTIVE_PROPS(gmcip);
7831                         emrp->mrp_ntxrings = defgrp->mrg_cur_count;
7832                 }
7833         }
7834         emrp = MCIP_EFFECTIVE_PROPS(mcip);
7835 }
7836 
7837 /*
7838  * Check if the primary is in the default group. If so, see if we
7839  * can give it a an exclusive group now that another client is
7840  * being configured. We take the primary out of the default group
7841  * because the multicast/broadcast packets for the all the clients
7842  * will land in the default ring in the default group which means
7843  * any client in the default group, even if it is the only on in
7844  * the group, will lose exclusive access to the rings, hence
7845  * polling.
7846  */
7847 mac_client_impl_t *
7848 mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw)
7849 {
7850         mac_impl_t              *mip = mcip->mci_mip;
7851         mac_group_t             *defgrp = MAC_DEFAULT_RX_GROUP(mip);
7852         flow_entry_t            *flent = mcip->mci_flent;
7853         mac_resource_props_t    *mrp = MCIP_RESOURCE_PROPS(mcip);
7854         uint8_t                 *mac_addr;
7855         mac_group_t             *ngrp;
7856 
7857         /*
7858          * Check if the primary is in the default group, if not
7859          * or if it is explicitly configured to be in the default
7860          * group OR set the RX rings property, return.
7861          */
7862         if (flent->fe_rx_ring_group != defgrp || mrp->mrp_mask & MRP_RX_RINGS)
7863                 return (NULL);
7864 
7865         /*
7866          * If the new client needs an exclusive group and we
7867          * don't have another for the primary, return.
7868          */
7869         if (rxhw && mip->mi_rxhwclnt_avail < 2)
7870                 return (NULL);
7871 
7872         mac_addr = flent->fe_flow_desc.fd_dst_mac;
7873         /*
7874          * We call this when we are setting up the datapath for
7875          * the first non-primary.
7876          */
7877         ASSERT(mip->mi_nactiveclients == 2);
7878         /*
7879          * OK, now we have the primary that needs to be relocated.
7880          */
7881         ngrp =  mac_reserve_rx_group(mcip, mac_addr, B_TRUE);
7882         if (ngrp == NULL)
7883                 return (NULL);
7884         if (mac_rx_switch_group(mcip, defgrp, ngrp) != 0) {
7885                 mac_stop_group(ngrp);
7886                 return (NULL);
7887         }
7888         return (mcip);
7889 }