1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include <sys/note.h>
  26 #include <sys/sysmacros.h>
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/systm.h>
  30 #include <sys/kmem.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/debug.h>
  33 #include <sys/ddi.h>
  34 #include <sys/sunndi.h>
  35 #include <sys/ndi_impldefs.h>     /* include prototypes */
  36 
  37 #if defined(__i386) || defined(__amd64)
  38 /*
  39  * MSI-X allocation limit.
  40  */
  41 extern uint_t           ddi_msix_alloc_limit;
  42 #endif
  43 
  44 /*
  45  * Interrupt Resource Management (IRM).
  46  */
  47 
  48 #define DDI_IRM_BALANCE_DELAY   (60)    /* In seconds */
  49 
  50 #define DDI_IRM_HAS_CB(c)       ((c) && (c->cb_flags & DDI_CB_FLAG_INTR))
  51 
  52 #define DDI_IRM_IS_REDUCIBLE(r) (((r->ireq_flags & DDI_IRM_FLAG_CALLBACK) && \
  53                                 (r->ireq_type == DDI_INTR_TYPE_MSIX)) || \
  54                                 (r->ireq_flags & DDI_IRM_FLAG_NEW))
  55 
  56 extern pri_t    minclsyspri;
  57 
  58 /* Global policies */
  59 int             irm_enable = 1;
  60 boolean_t       irm_active = B_FALSE;
  61 int             irm_default_policy = DDI_IRM_POLICY_LARGE;
  62 uint_t          irm_balance_delay = DDI_IRM_BALANCE_DELAY;
  63 
  64 /* Global list of interrupt pools */
  65 kmutex_t        irm_pools_lock;
  66 list_t          irm_pools_list;
  67 
  68 /* Global debug tunables */
  69 #ifdef  DEBUG
  70 int             irm_debug_policy = 0;
  71 uint_t          irm_debug_size = 0;
  72 #endif  /* DEBUG */
  73 
  74 static void     irm_balance_thread(ddi_irm_pool_t *);
  75 static void     i_ddi_irm_balance(ddi_irm_pool_t *);
  76 static void     i_ddi_irm_enqueue(ddi_irm_pool_t *, boolean_t);
  77 static void     i_ddi_irm_reduce(ddi_irm_pool_t *pool);
  78 static int      i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *, int, int);
  79 static void     i_ddi_irm_reduce_new(ddi_irm_pool_t *, int);
  80 static void     i_ddi_irm_insertion_sort(list_t *, ddi_irm_req_t *);
  81 static int      i_ddi_irm_notify(ddi_irm_pool_t *, ddi_irm_req_t *);
  82 static int      i_ddi_irm_modify_increase(ddi_irm_req_t *, int);
  83 
  84 /*
  85  * OS Initialization Routines
  86  */
  87 
  88 /*
  89  * irm_init()
  90  *
  91  *      Initialize IRM subsystem before any drivers are attached.
  92  */
  93 void
  94 irm_init(void)
  95 {
  96         /* Do nothing if IRM is disabled */
  97         if (!irm_enable)
  98                 return;
  99 
 100         /* Verify that the default balancing policy is valid */
 101         if (!DDI_IRM_POLICY_VALID(irm_default_policy))
 102                 irm_default_policy = DDI_IRM_POLICY_LARGE;
 103 
 104         /* Initialize the global list of interrupt pools */
 105         mutex_init(&irm_pools_lock, NULL, MUTEX_DRIVER, NULL);
 106         list_create(&irm_pools_list, sizeof (ddi_irm_pool_t),
 107             offsetof(ddi_irm_pool_t, ipool_link));
 108 }
 109 
 110 /*
 111  * i_ddi_irm_poststartup()
 112  *
 113  *      IRM is not activated until after the IO subsystem is initialized.
 114  *      When activated, per-pool balancing threads are spawned and a flag
 115  *      is set so that all future pools will be activated when created.
 116  *
 117  *      NOTE: the global variable 'irm_enable' disables IRM if zero.
 118  */
 119 void
 120 i_ddi_irm_poststartup(void)
 121 {
 122         ddi_irm_pool_t  *pool_p;
 123 
 124         /* Do nothing if IRM is disabled */
 125         if (!irm_enable)
 126                 return;
 127 
 128         /* Lock the global list */
 129         mutex_enter(&irm_pools_lock);
 130 
 131         /* Activate all defined pools */
 132         for (pool_p = list_head(&irm_pools_list); pool_p;
 133             pool_p = list_next(&irm_pools_list, pool_p))
 134                 pool_p->ipool_thread = thread_create(NULL, 0,
 135                     irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
 136 
 137         /* Set future pools to be active */
 138         irm_active = B_TRUE;
 139 
 140         /* Unlock the global list */
 141         mutex_exit(&irm_pools_lock);
 142 }
 143 
 144 /*
 145  * NDI interfaces for creating/destroying IRM pools.
 146  */
 147 
 148 /*
 149  * ndi_irm_create()
 150  *
 151  *      Nexus interface to create an IRM pool.  Create the new
 152  *      pool and add it to the global list of interrupt pools.
 153  */
 154 int
 155 ndi_irm_create(dev_info_t *dip, ddi_irm_params_t *paramsp,
 156     ddi_irm_pool_t **pool_retp)
 157 {
 158         ddi_irm_pool_t  *pool_p;
 159 
 160         ASSERT(dip != NULL);
 161         ASSERT(paramsp != NULL);
 162         ASSERT(pool_retp != NULL);
 163         ASSERT(paramsp->iparams_total >= 1);
 164         ASSERT(paramsp->iparams_types != 0);
 165 
 166         DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_create: dip %p\n", (void *)dip));
 167 
 168         /* Check if IRM is enabled */
 169         if (!irm_enable)
 170                 return (NDI_FAILURE);
 171 
 172         /* Validate parameters */
 173         if ((dip == NULL) || (paramsp == NULL) || (pool_retp == NULL) ||
 174             (paramsp->iparams_total < 1) || (paramsp->iparams_types == 0))
 175                 return (NDI_FAILURE);
 176 
 177         /* Allocate and initialize the pool */
 178         pool_p = kmem_zalloc(sizeof (ddi_irm_pool_t), KM_SLEEP);
 179         pool_p->ipool_owner = dip;
 180         pool_p->ipool_policy = irm_default_policy;
 181         pool_p->ipool_types = paramsp->iparams_types;
 182         pool_p->ipool_totsz = paramsp->iparams_total;
 183         pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC, MAX(DDI_MIN_MSIX_ALLOC,
 184             paramsp->iparams_total / DDI_MSIX_ALLOC_DIVIDER));
 185         list_create(&pool_p->ipool_req_list, sizeof (ddi_irm_req_t),
 186             offsetof(ddi_irm_req_t, ireq_link));
 187         list_create(&pool_p->ipool_scratch_list, sizeof (ddi_irm_req_t),
 188             offsetof(ddi_irm_req_t, ireq_scratch_link));
 189         cv_init(&pool_p->ipool_cv, NULL, CV_DRIVER, NULL);
 190         mutex_init(&pool_p->ipool_lock, NULL, MUTEX_DRIVER, NULL);
 191         mutex_init(&pool_p->ipool_navail_lock, NULL, MUTEX_DRIVER, NULL);
 192 
 193         /* Add to global list of pools */
 194         mutex_enter(&irm_pools_lock);
 195         list_insert_tail(&irm_pools_list, pool_p);
 196         mutex_exit(&irm_pools_lock);
 197 
 198         /* If IRM is active, then activate the pool */
 199         if (irm_active)
 200                 pool_p->ipool_thread = thread_create(NULL, 0,
 201                     irm_balance_thread, pool_p, 0, &p0, TS_RUN, minclsyspri);
 202 
 203         *pool_retp = pool_p;
 204         return (NDI_SUCCESS);
 205 }
 206 
 207 /*
 208  * ndi_irm_resize_pool()
 209  *
 210  *      Nexus interface to resize IRM pool. If the pool size drops
 211  *      below  the allocated number of vectors then initiate rebalance
 212  *      operation before resizing the pool. If rebalance operation fails
 213  *      then return NDI_FAILURE.
 214  */
 215 int
 216 ndi_irm_resize_pool(ddi_irm_pool_t *pool_p, uint_t new_size)
 217 {
 218         uint_t prev_size;
 219 
 220         ASSERT(pool_p != NULL);
 221 
 222         DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
 223             " current-size 0x%x new-size 0x%x\n",
 224             (void *)pool_p, pool_p->ipool_totsz, new_size));
 225 
 226         if (pool_p == NULL)
 227                 return (NDI_EINVAL);
 228 
 229         /* Check if IRM is enabled */
 230         if (!irm_enable)
 231                 return (NDI_FAILURE);
 232 
 233         mutex_enter(&pool_p->ipool_lock);
 234 
 235         /*
 236          * If we are increasing the pool size or if the reserved
 237          * number of vectors is <= the new pool size then simply
 238          * update the pool size and enqueue a reblance operation
 239          * if necessary to use the new vectors.
 240          */
 241         if ((pool_p->ipool_totsz < new_size) ||
 242             (pool_p->ipool_resno <= new_size)) {
 243                 /* set new pool size */
 244                 pool_p->ipool_totsz = new_size;
 245                 /* adjust the default allocation limit */
 246                 pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
 247                     MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
 248                 /* queue a rebalance operation to use the new vectors */
 249                 if (pool_p->ipool_reqno > pool_p->ipool_resno)
 250                         i_ddi_irm_enqueue(pool_p, B_FALSE);
 251                 mutex_exit(&pool_p->ipool_lock);
 252                 return (NDI_SUCCESS);
 253         }
 254 
 255         DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
 256             " needs a rebalance operation\n", (void *)pool_p));
 257 
 258         /*
 259          * requires a rebalance operation
 260          */
 261         /* save the current pool size */
 262         prev_size = pool_p->ipool_totsz;
 263         /* set the pool size to the desired new value */
 264         pool_p->ipool_totsz = new_size;
 265         /* perform the rebalance operation */
 266         i_ddi_irm_enqueue(pool_p, B_TRUE);
 267 
 268         /*
 269          * If rebalance operation couldn't free up enough
 270          * vectors then fail the resize operation.
 271          */
 272         if (pool_p->ipool_resno > new_size) { /* rebalance failed */
 273                 /* restore the pool size to the previous value */
 274                 pool_p->ipool_totsz = prev_size;
 275                 /* enqueue a rebalance operation for the original pool size */
 276                 i_ddi_irm_enqueue(pool_p, B_FALSE);
 277                 mutex_exit(&pool_p->ipool_lock);
 278                 return (NDI_FAILURE);
 279         } else { /* rebalance worked */
 280                 /* adjust the default allocation limit */
 281                 pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
 282                     MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
 283                 mutex_exit(&pool_p->ipool_lock);
 284                 DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
 285                     " resized from %x to %x\n",
 286                     (void *)pool_p, prev_size, pool_p->ipool_totsz));
 287                 return (NDI_SUCCESS);
 288         }
 289 }
 290 
 291 /*
 292  * ndi_irm_destroy()
 293  *
 294  *      Nexus interface to destroy an IRM pool.  Destroy the pool
 295  *      and remove it from the global list of interrupt pools.
 296  */
 297 int
 298 ndi_irm_destroy(ddi_irm_pool_t *pool_p)
 299 {
 300         ASSERT(pool_p != NULL);
 301         ASSERT(pool_p->ipool_resno == 0);
 302 
 303         DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_destroy: pool_p %p\n",
 304             (void *)pool_p));
 305 
 306         /* Validate parameters */
 307         if (pool_p == NULL)
 308                 return (NDI_FAILURE);
 309 
 310         /* Validate that pool is empty */
 311         if (pool_p->ipool_resno != 0)
 312                 return (NDI_BUSY);
 313 
 314         /* Remove the pool from the global list */
 315         mutex_enter(&irm_pools_lock);
 316         list_remove(&irm_pools_list, pool_p);
 317         mutex_exit(&irm_pools_lock);
 318 
 319         /* Terminate the balancing thread */
 320         mutex_enter(&pool_p->ipool_lock);
 321         if (pool_p->ipool_thread &&
 322             (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) {
 323                 pool_p->ipool_flags |= DDI_IRM_FLAG_EXIT;
 324                 cv_signal(&pool_p->ipool_cv);
 325                 mutex_exit(&pool_p->ipool_lock);
 326                 thread_join(pool_p->ipool_thread->t_did);
 327         } else
 328                 mutex_exit(&pool_p->ipool_lock);
 329 
 330         /* Destroy the pool */
 331         cv_destroy(&pool_p->ipool_cv);
 332         mutex_destroy(&pool_p->ipool_lock);
 333         mutex_destroy(&pool_p->ipool_navail_lock);
 334         list_destroy(&pool_p->ipool_req_list);
 335         list_destroy(&pool_p->ipool_scratch_list);
 336         kmem_free(pool_p, sizeof (ddi_irm_pool_t));
 337 
 338         return (NDI_SUCCESS);
 339 }
 340 
 341 /*
 342  * Insert/Modify/Remove Interrupt Requests
 343  */
 344 
 345 /*
 346  * i_ddi_irm_insert()
 347  *
 348  *      Insert a new request into an interrupt pool, and balance the pool.
 349  */
 350 int
 351 i_ddi_irm_insert(dev_info_t *dip, int type, int count)
 352 {
 353         ddi_irm_req_t   *req_p;
 354         devinfo_intr_t  *intr_p;
 355         ddi_irm_pool_t  *pool_p;
 356         uint_t          nreq, nmin, npartial;
 357         boolean_t       irm_flag = B_FALSE;
 358 
 359         ASSERT(dip != NULL);
 360         ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
 361         ASSERT(count > 0);
 362 
 363         DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: dip %p type %d count %d\n",
 364             (void *)dip, type, count));
 365 
 366         /* Validate parameters */
 367         if ((dip == NULL) || (count < 1) || !DDI_INTR_TYPE_FLAG_VALID(type)) {
 368                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: invalid args\n"));
 369                 return (DDI_EINVAL);
 370         }
 371 
 372         /* Check for an existing request */
 373         if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
 374             (intr_p->devi_irm_req_p != NULL))
 375                 return (DDI_SUCCESS);
 376 
 377         /* Check for IRM support from the system */
 378         if ((pool_p = i_ddi_intr_get_pool(dip, type)) == NULL) {
 379                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: not supported\n"));
 380                 return (DDI_ENOTSUP);
 381         }
 382 
 383         /* Check for IRM support from the driver */
 384         if (i_ddi_irm_supported(dip, type) == DDI_SUCCESS)
 385                 irm_flag = B_TRUE;
 386 
 387         /* Determine request size */
 388         nreq = (irm_flag) ? count :
 389             MIN(count, i_ddi_intr_get_limit(dip, type, pool_p));
 390         nmin = (irm_flag) ? 1 : nreq;
 391         npartial = MIN(nreq, pool_p->ipool_defsz);
 392 
 393         /* Allocate and initialize the request */
 394         req_p = kmem_zalloc(sizeof (ddi_irm_req_t), KM_SLEEP);
 395         req_p->ireq_type = type;
 396         req_p->ireq_dip = dip;
 397         req_p->ireq_pool_p = pool_p;
 398         req_p->ireq_nreq = nreq;
 399         req_p->ireq_flags = DDI_IRM_FLAG_NEW;
 400         if (irm_flag)
 401                 req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
 402 
 403         /* Lock the pool */
 404         mutex_enter(&pool_p->ipool_lock);
 405 
 406         /* Check for minimal fit before inserting */
 407         if ((pool_p->ipool_minno + nmin) > pool_p->ipool_totsz) {
 408                 cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
 409                     ddi_driver_name(dip), ddi_get_instance(dip));
 410                 mutex_exit(&pool_p->ipool_lock);
 411                 kmem_free(req_p, sizeof (ddi_irm_req_t));
 412                 return (DDI_EAGAIN);
 413         }
 414 
 415         /* Insert the request into the pool */
 416         pool_p->ipool_reqno += nreq;
 417         pool_p->ipool_minno += nmin;
 418         i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
 419 
 420         /*
 421          * Try to fulfill the request.
 422          *
 423          * If all the interrupts are available, and either the request
 424          * is static or the pool is active, then just take them directly.
 425          *
 426          * If only some of the interrupts are available, and the request
 427          * can receive future callbacks, then take some now but queue the
 428          * pool to be rebalanced later.
 429          *
 430          * Otherwise, immediately rebalance the pool and wait.
 431          */
 432         if ((!irm_flag || (pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE)) &&
 433             ((pool_p->ipool_resno + nreq) <= pool_p->ipool_totsz)) {
 434 
 435                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
 436                     "request completely fulfilled.\n"));
 437                 pool_p->ipool_resno += nreq;
 438                 req_p->ireq_navail = nreq;
 439                 req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
 440 
 441         } else if (irm_flag &&
 442             ((pool_p->ipool_resno + npartial) <= pool_p->ipool_totsz)) {
 443 
 444                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
 445                     "request partially fulfilled.\n"));
 446                 pool_p->ipool_resno += npartial;
 447                 req_p->ireq_navail = npartial;
 448                 req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
 449                 i_ddi_irm_enqueue(pool_p, B_FALSE);
 450 
 451         } else {
 452 
 453                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_insert: "
 454                     "request needs immediate rebalance.\n"));
 455                 i_ddi_irm_enqueue(pool_p, B_TRUE);
 456                 req_p->ireq_flags &= ~(DDI_IRM_FLAG_NEW);
 457         }
 458 
 459         /* Fail if the request cannot be fulfilled at all */
 460         if (req_p->ireq_navail == 0) {
 461                 cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
 462                     ddi_driver_name(dip), ddi_get_instance(dip));
 463                 pool_p->ipool_reqno -= nreq;
 464                 pool_p->ipool_minno -= nmin;
 465                 list_remove(&pool_p->ipool_req_list, req_p);
 466                 mutex_exit(&pool_p->ipool_lock);
 467                 kmem_free(req_p, sizeof (ddi_irm_req_t));
 468                 return (DDI_EAGAIN);
 469         }
 470 
 471         /* Unlock the pool */
 472         mutex_exit(&pool_p->ipool_lock);
 473 
 474         intr_p->devi_irm_req_p = req_p;
 475         return (DDI_SUCCESS);
 476 }
 477 
 478 /*
 479  * i_ddi_irm_modify()
 480  *
 481  *      Modify an existing request in an interrupt pool, and balance the pool.
 482  */
 483 int
 484 i_ddi_irm_modify(dev_info_t *dip, int nreq)
 485 {
 486         devinfo_intr_t  *intr_p;
 487         ddi_irm_req_t   *req_p;
 488         ddi_irm_pool_t  *pool_p;
 489         int             type;
 490         int             retval = DDI_SUCCESS;
 491 
 492         ASSERT(dip != NULL);
 493         ASSERT(nreq > 0);
 494 
 495         DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: dip %p nreq %d\n",
 496             (void *)dip, nreq));
 497 
 498         /* Validate parameters */
 499         if ((dip == NULL) || (nreq < 1)) {
 500                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
 501                 return (DDI_EINVAL);
 502         }
 503 
 504         /* Do nothing if not mapped to an IRM pool */
 505         if (((intr_p = DEVI(dip)->devi_intr_p) == NULL) ||
 506             ((req_p = intr_p->devi_irm_req_p) == NULL))
 507                 return (DDI_SUCCESS);
 508 
 509         /* Do nothing if new size is the same */
 510         if (nreq == req_p->ireq_nreq)
 511                 return (DDI_SUCCESS);
 512 
 513         /* Do not allow MSI requests to be resized */
 514         if ((type = req_p->ireq_type) == DDI_INTR_TYPE_MSI) {
 515                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid type\n"));
 516                 return (DDI_ENOTSUP);
 517         }
 518 
 519         /* Select the pool */
 520         if ((pool_p = req_p->ireq_pool_p) == NULL) {
 521                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: missing pool\n"));
 522                 return (DDI_FAILURE);
 523         }
 524 
 525         /* Validate request size is not too large */
 526         if (nreq > i_ddi_intr_get_limit(dip, type, pool_p)) {
 527                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: invalid args\n"));
 528                 return (DDI_EINVAL);
 529         }
 530 
 531         /* Lock the pool */
 532         mutex_enter(&pool_p->ipool_lock);
 533 
 534         /*
 535          * Process the modification.
 536          *
 537          *      - To increase a non-IRM request, call the implementation in
 538          *        i_ddi_irm_modify_increase().
 539          *
 540          *      - To decrease a non-IRM request, directly update the pool and
 541          *        request, then queue the pool for later rebalancing.
 542          *
 543          *      - To modify an IRM request, always queue the pool for later
 544          *        rebalancing.  IRM consumers rely upon callbacks for changes.
 545          */
 546         if ((nreq > req_p->ireq_nreq) &&
 547             (i_ddi_irm_supported(dip, type) != DDI_SUCCESS)) {
 548 
 549                 retval = i_ddi_irm_modify_increase(req_p, nreq);
 550 
 551         } else {
 552 
 553                 /* Update pool and request */
 554                 pool_p->ipool_reqno -= req_p->ireq_nreq;
 555                 pool_p->ipool_reqno += nreq;
 556                 if (i_ddi_irm_supported(dip, type) != DDI_SUCCESS) {
 557                         pool_p->ipool_minno -= req_p->ireq_navail;
 558                         pool_p->ipool_resno -= req_p->ireq_navail;
 559                         pool_p->ipool_minno += nreq;
 560                         pool_p->ipool_resno += nreq;
 561                         req_p->ireq_navail = nreq;
 562                 }
 563                 req_p->ireq_nreq = nreq;
 564 
 565                 /* Re-sort request into the pool */
 566                 list_remove(&pool_p->ipool_req_list, req_p);
 567                 i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
 568 
 569                 /* Queue pool for asynchronous rebalance */
 570                 i_ddi_irm_enqueue(pool_p, B_FALSE);
 571         }
 572 
 573         /* Unlock the pool */
 574         mutex_exit(&pool_p->ipool_lock);
 575 
 576         return (retval);
 577 }
 578 
 579 /*
 580  * i_ddi_irm_modify_increase()
 581  *
 582  *      Increase a non-IRM request.  The additional interrupts are
 583  *      directly taken from the pool when possible.  Otherwise, an
 584  *      immediate, synchronous rebalance is performed.  A temporary
 585  *      proxy request is used for any rebalance operation to ensure
 586  *      the request is not reduced below its current allocation.
 587  *
 588  *      NOTE: pool must already be locked.
 589  */
 590 static int
 591 i_ddi_irm_modify_increase(ddi_irm_req_t *req_p, int nreq)
 592 {
 593         dev_info_t      *dip = req_p->ireq_dip;
 594         ddi_irm_pool_t  *pool_p = req_p->ireq_pool_p;
 595         ddi_irm_req_t   new_req;
 596         int             count, delta;
 597 
 598         ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
 599 
 600         /* Compute number of additional vectors */
 601         count = nreq - req_p->ireq_nreq;
 602 
 603         /* Check for minimal fit */
 604         if ((pool_p->ipool_minno + count) > pool_p->ipool_totsz) {
 605                 cmn_err(CE_WARN, "%s%d: interrupt pool too full.\n",
 606                     ddi_driver_name(dip), ddi_get_instance(dip));
 607                 return (DDI_EAGAIN);
 608         }
 609 
 610         /* Update the pool */
 611         pool_p->ipool_reqno += count;
 612         pool_p->ipool_minno += count;
 613 
 614         /* Attempt direct implementation */
 615         if ((pool_p->ipool_resno + count) <= pool_p->ipool_totsz) {
 616                 req_p->ireq_nreq += count;
 617                 req_p->ireq_navail += count;
 618                 pool_p->ipool_resno += count;
 619                 return (DDI_SUCCESS);
 620         }
 621 
 622         /* Rebalance required: fail if pool is not active */
 623         if ((pool_p->ipool_flags & DDI_IRM_FLAG_ACTIVE) == 0) {
 624                 pool_p->ipool_reqno -= count;
 625                 pool_p->ipool_minno -= count;
 626                 return (DDI_EAGAIN);
 627         }
 628 
 629         /* Insert temporary proxy request */
 630         bzero(&new_req, sizeof (ddi_irm_req_t));
 631         new_req.ireq_dip = dip;
 632         new_req.ireq_nreq = count;
 633         new_req.ireq_pool_p = pool_p;
 634         new_req.ireq_type = req_p->ireq_type;
 635         new_req.ireq_flags = DDI_IRM_FLAG_NEW;
 636         i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, &new_req);
 637 
 638         /* Synchronously rebalance */
 639         i_ddi_irm_enqueue(pool_p, B_TRUE);
 640 
 641         /* Remove proxy request, and merge into original request */
 642         req_p->ireq_nreq += count;
 643         if ((delta = (count - new_req.ireq_navail)) > 0) {
 644                 req_p->ireq_nreq -= delta;
 645                 pool_p->ipool_reqno -= delta;
 646                 pool_p->ipool_minno -= delta;
 647         }
 648         req_p->ireq_navail += new_req.ireq_navail;
 649         list_remove(&pool_p->ipool_req_list, req_p);
 650         list_remove(&pool_p->ipool_req_list, &new_req);
 651         i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
 652 
 653         return (DDI_SUCCESS);
 654 }
 655 
 656 /*
 657  * i_ddi_irm_remove()
 658  *
 659  *      Remove a request from an interrupt pool, and balance the pool.
 660  */
 661 int
 662 i_ddi_irm_remove(dev_info_t *dip)
 663 {
 664         devinfo_intr_t  *intr_p;
 665         ddi_irm_pool_t  *pool_p;
 666         ddi_irm_req_t   *req_p;
 667         uint_t          nmin;
 668 
 669         ASSERT(dip != NULL);
 670 
 671         DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: dip %p\n", (void *)dip));
 672 
 673         /* Validate parameters */
 674         if (dip == NULL) {
 675                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_remove: invalid args\n"));
 676                 return (DDI_EINVAL);
 677         }
 678 
 679         /* Check if the device has a request */
 680         if (!(intr_p = DEVI(dip)->devi_intr_p) ||
 681             !(req_p = intr_p->devi_irm_req_p)) {
 682                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_modify: not found\n"));
 683                 return (DDI_EINVAL);
 684         }
 685 
 686         /* Lock the pool */
 687         pool_p = req_p->ireq_pool_p;
 688         mutex_enter(&pool_p->ipool_lock);
 689 
 690         /* Remove request */
 691         nmin = DDI_IRM_IS_REDUCIBLE(req_p) ? 1 : req_p->ireq_nreq;
 692         pool_p->ipool_minno -= nmin;
 693         pool_p->ipool_reqno -= req_p->ireq_nreq;
 694         pool_p->ipool_resno -= req_p->ireq_navail;
 695         list_remove(&pool_p->ipool_req_list, req_p);
 696 
 697         /* Queue pool to be rebalanced */
 698         i_ddi_irm_enqueue(pool_p, B_FALSE);
 699 
 700         /* Unlock the pool */
 701         mutex_exit(&pool_p->ipool_lock);
 702 
 703         /* Destroy the request */
 704         intr_p->devi_irm_req_p = NULL;
 705         kmem_free(req_p, sizeof (ddi_irm_req_t));
 706 
 707         return (DDI_SUCCESS);
 708 }
 709 
 710 /*
 711  * i_ddi_irm_set_cb()
 712  *
 713  *      Change the callback flag for a request, in response to
 714  *      a change in its callback registration.  Then rebalance
 715  *      the interrupt pool.
 716  *
 717  *      NOTE: the request is not locked because the navail value
 718  *            is not directly affected.  The balancing thread may
 719  *            modify the navail value in the background after it
 720  *            locks the request itself.
 721  */
 722 void
 723 i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag)
 724 {
 725         devinfo_intr_t  *intr_p;
 726         ddi_irm_pool_t  *pool_p;
 727         ddi_irm_req_t   *req_p;
 728         uint_t          nreq;
 729 
 730         ASSERT(dip != NULL);
 731 
 732         DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: dip %p has_cb_flag %d\n",
 733             (void *)dip, (int)has_cb_flag));
 734 
 735         /* Validate parameters */
 736         if (dip == NULL)
 737                 return;
 738 
 739         /* Check for association with interrupt pool */
 740         if (!(intr_p = DEVI(dip)->devi_intr_p) ||
 741             !(req_p = intr_p->devi_irm_req_p)) {
 742                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_set_cb: not in pool\n"));
 743                 return;
 744         }
 745 
 746         /* Lock the pool */
 747         pool_p = req_p->ireq_pool_p;
 748         mutex_enter(&pool_p->ipool_lock);
 749 
 750         /*
 751          * Update the request and the pool
 752          */
 753         if (has_cb_flag) {
 754 
 755                 /* Update pool statistics */
 756                 if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
 757                         pool_p->ipool_minno -= (req_p->ireq_nreq - 1);
 758 
 759                 /* Update request */
 760                 req_p->ireq_flags |= DDI_IRM_FLAG_CALLBACK;
 761 
 762                 /* Rebalance in background */
 763                 i_ddi_irm_enqueue(pool_p, B_FALSE);
 764 
 765         } else {
 766 
 767                 /* Determine new request size */
 768                 nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz);
 769 
 770 #if defined(__i386) || defined(__amd64)
 771                 /* Use the default static limit for non-IRM drivers */
 772                 if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
 773                         nreq = MIN(nreq, ddi_msix_alloc_limit);
 774 #endif
 775 
 776                 /* Update pool statistics */
 777                 pool_p->ipool_reqno -= req_p->ireq_nreq;
 778                 pool_p->ipool_reqno += nreq;
 779                 if (req_p->ireq_type == DDI_INTR_TYPE_MSIX) {
 780                         pool_p->ipool_minno -= 1;
 781                         pool_p->ipool_minno += nreq;
 782                 } else {
 783                         pool_p->ipool_minno -= req_p->ireq_nreq;
 784                         pool_p->ipool_minno += nreq;
 785                 }
 786 
 787                 /* Update request size, and re-sort in pool */
 788                 req_p->ireq_nreq = nreq;
 789                 list_remove(&pool_p->ipool_req_list, req_p);
 790                 i_ddi_irm_insertion_sort(&pool_p->ipool_req_list, req_p);
 791 
 792                 /* Rebalance synchronously, before losing callback */
 793                 i_ddi_irm_enqueue(pool_p, B_TRUE);
 794 
 795                 /* Remove callback flag */
 796                 req_p->ireq_flags &= ~(DDI_IRM_FLAG_CALLBACK);
 797         }
 798 
 799         /* Unlock the pool */
 800         mutex_exit(&pool_p->ipool_lock);
 801 }
 802 
 803 /*
 804  * i_ddi_irm_supported()
 805  *
 806  *      Query if IRM is supported by a driver using a specific interrupt type.
 807  *      Notice that IRM is limited to MSI-X users with registered callbacks.
 808  */
 809 int
 810 i_ddi_irm_supported(dev_info_t *dip, int type)
 811 {
 812         ddi_cb_t        *cb_p = DEVI(dip)->devi_cb_p;
 813 
 814         return ((DDI_IRM_HAS_CB(cb_p) && (type == DDI_INTR_TYPE_MSIX)) ?
 815             DDI_SUCCESS : DDI_ENOTSUP);
 816 }
 817 
 818 /*
 819  * Interrupt Pool Balancing
 820  */
 821 
 822 /*
 823  * irm_balance_thread()
 824  *
 825  *      One instance of this thread operates per each defined IRM pool.
 826  *      It does the initial activation of the pool, as well as balancing
 827  *      any requests that were queued up before the pool was active.
 828  *      Once active, it waits forever to service balance operations.
 829  */
 830 static void
 831 irm_balance_thread(ddi_irm_pool_t *pool_p)
 832 {
 833         clock_t         interval;
 834 
 835         DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: pool_p %p\n",
 836             (void *)pool_p));
 837 
 838         /* Lock the pool */
 839         mutex_enter(&pool_p->ipool_lock);
 840 
 841         /* Perform initial balance if required */
 842         if (pool_p->ipool_reqno > pool_p->ipool_resno)
 843                 i_ddi_irm_balance(pool_p);
 844 
 845         /* Activate the pool */
 846         pool_p->ipool_flags |= DDI_IRM_FLAG_ACTIVE;
 847 
 848         /*
 849          * Main loop.
 850          * Iterate once first before wait on signal, in case there is signal
 851          * sent before this thread being created
 852          */
 853         for (;;) {
 854 
 855                 /* Compute the delay interval */
 856                 interval = drv_sectohz(irm_balance_delay);
 857 
 858                 /* Wait one interval, or until there are waiters */
 859                 if ((interval > 0) &&
 860                     !(pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) &&
 861                     !(pool_p->ipool_flags & DDI_IRM_FLAG_EXIT)) {
 862                         (void) cv_reltimedwait(&pool_p->ipool_cv,
 863                             &pool_p->ipool_lock, interval, TR_CLOCK_TICK);
 864                 }
 865 
 866                 /* Check if awakened to exit */
 867                 if (pool_p->ipool_flags & DDI_IRM_FLAG_EXIT) {
 868                         DDI_INTR_IRMDBG((CE_CONT,
 869                             "irm_balance_thread: exiting...\n"));
 870                         mutex_exit(&pool_p->ipool_lock);
 871                         thread_exit();
 872                 }
 873 
 874                 /* Balance the pool */
 875                 i_ddi_irm_balance(pool_p);
 876 
 877                 /* Notify waiters */
 878                 if (pool_p->ipool_flags & DDI_IRM_FLAG_WAITERS) {
 879                         cv_broadcast(&pool_p->ipool_cv);
 880                         pool_p->ipool_flags &= ~(DDI_IRM_FLAG_WAITERS);
 881                 }
 882 
 883                 /* Clear QUEUED condition */
 884                 pool_p->ipool_flags &= ~(DDI_IRM_FLAG_QUEUED);
 885 
 886                 /* Sleep until queued */
 887                 cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
 888 
 889                 DDI_INTR_IRMDBG((CE_CONT, "irm_balance_thread: signaled.\n"));
 890         }
 891 }
 892 
 893 /*
 894  * i_ddi_irm_balance()
 895  *
 896  *      Balance a pool.  The general algorithm is to first reset all
 897  *      requests to their maximum size, use reduction algorithms to
 898  *      solve any imbalance, and then notify affected drivers.
 899  */
 900 static void
 901 i_ddi_irm_balance(ddi_irm_pool_t *pool_p)
 902 {
 903         ddi_irm_req_t   *req_p;
 904 
 905 #ifdef  DEBUG
 906         uint_t          debug_totsz = 0;
 907         int             debug_policy = 0;
 908 #endif  /* DEBUG */
 909 
 910         ASSERT(pool_p != NULL);
 911         ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
 912 
 913         DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: pool_p %p\n",
 914             (void *)pool_p));
 915 
 916 #ifndef DEBUG
 917         if ((pool_p->ipool_reqno == pool_p->ipool_resno)) {
 918 #else
 919         if ((pool_p->ipool_reqno == pool_p->ipool_resno) && !irm_debug_size) {
 920 #endif  /* DEBUG */
 921                 DDI_INTR_IRMDBG((CE_CONT,
 922                     "i_ddi_irm_balance: pool already balanced\n"));
 923                 return;
 924         }
 925 
 926 #ifdef  DEBUG   /* Adjust size and policy settings */
 927         if (irm_debug_size > pool_p->ipool_minno) {
 928                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_balance: debug size %d\n",
 929                     irm_debug_size));
 930                 debug_totsz = pool_p->ipool_totsz;
 931                 pool_p->ipool_totsz = irm_debug_size;
 932         }
 933         if (DDI_IRM_POLICY_VALID(irm_debug_policy)) {
 934                 DDI_INTR_IRMDBG((CE_CONT,
 935                     "i_ddi_irm_balance: debug policy %d\n", irm_debug_policy));
 936                 debug_policy = pool_p->ipool_policy;
 937                 pool_p->ipool_policy = irm_debug_policy;
 938         }
 939 #endif  /* DEBUG */
 940 
 941         /* Lock the availability lock */
 942         mutex_enter(&pool_p->ipool_navail_lock);
 943 
 944         /*
 945          * Put all of the reducible requests into a scratch list.
 946          * Reset each one of them to their maximum availability.
 947          */
 948         for (req_p = list_head(&pool_p->ipool_req_list); req_p;
 949             req_p = list_next(&pool_p->ipool_req_list, req_p)) {
 950                 if (DDI_IRM_IS_REDUCIBLE(req_p)) {
 951                         pool_p->ipool_resno -= req_p->ireq_navail;
 952                         req_p->ireq_scratch = req_p->ireq_navail;
 953                         req_p->ireq_navail = req_p->ireq_nreq;
 954                         pool_p->ipool_resno += req_p->ireq_navail;
 955                         list_insert_tail(&pool_p->ipool_scratch_list, req_p);
 956                 }
 957         }
 958 
 959         /* Balance the requests */
 960         i_ddi_irm_reduce(pool_p);
 961 
 962         /* Unlock the availability lock */
 963         mutex_exit(&pool_p->ipool_navail_lock);
 964 
 965         /*
 966          * Process REMOVE notifications.
 967          *
 968          * If a driver fails to release interrupts: exclude it from
 969          * further processing, correct the resulting imbalance, and
 970          * start over again at the head of the scratch list.
 971          */
 972         req_p = list_head(&pool_p->ipool_scratch_list);
 973         while (req_p) {
 974                 if ((req_p->ireq_navail < req_p->ireq_scratch) &&
 975                     (i_ddi_irm_notify(pool_p, req_p) != DDI_SUCCESS)) {
 976                         list_remove(&pool_p->ipool_scratch_list, req_p);
 977                         mutex_enter(&pool_p->ipool_navail_lock);
 978                         i_ddi_irm_reduce(pool_p);
 979                         mutex_exit(&pool_p->ipool_navail_lock);
 980                         req_p = list_head(&pool_p->ipool_scratch_list);
 981                 } else {
 982                         req_p = list_next(&pool_p->ipool_scratch_list, req_p);
 983                 }
 984         }
 985 
 986         /*
 987          * Process ADD notifications.
 988          *
 989          * This is the last use of the scratch list, so empty it.
 990          */
 991         while (req_p = list_remove_head(&pool_p->ipool_scratch_list)) {
 992                 if (req_p->ireq_navail > req_p->ireq_scratch) {
 993                         (void) i_ddi_irm_notify(pool_p, req_p);
 994                 }
 995         }
 996 
 997 #ifdef  DEBUG   /* Restore size and policy settings */
 998         if (debug_totsz != 0)
 999                 pool_p->ipool_totsz = debug_totsz;
1000         if (debug_policy != 0)
1001                 pool_p->ipool_policy = debug_policy;
1002 #endif  /* DEBUG */
1003 }
1004 
1005 /*
1006  * i_ddi_irm_reduce()
1007  *
1008  *      Use reduction algorithms to correct an imbalance in a pool.
1009  */
1010 static void
1011 i_ddi_irm_reduce(ddi_irm_pool_t *pool_p)
1012 {
1013         int     imbalance;
1014 
1015         ASSERT(pool_p != NULL);
1016         ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1017         ASSERT(DDI_IRM_POLICY_VALID(pool_p->ipool_policy));
1018 
1019         DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_reduce: pool_p %p\n",
1020             (void *)pool_p));
1021 
1022         /* Compute the imbalance.  Do nothing if already balanced. */
1023         if ((imbalance = pool_p->ipool_resno - pool_p->ipool_totsz) <= 0)
1024                 return;
1025 
1026         /*
1027          * Try policy based reduction first. If it failed, then
1028          * possibly reduce new requests as a last resort.
1029          */
1030         if (i_ddi_irm_reduce_by_policy(pool_p, imbalance, pool_p->ipool_policy)
1031             != DDI_SUCCESS) {
1032 
1033                 DDI_INTR_IRMDBG((CE_CONT,
1034                     "i_ddi_irm_reduce: policy reductions failed.\n"));
1035 
1036                 /* Compute remaining imbalance */
1037                 imbalance = pool_p->ipool_resno - pool_p->ipool_totsz;
1038 
1039                 ASSERT(imbalance > 0);
1040 
1041                 i_ddi_irm_reduce_new(pool_p, imbalance);
1042         }
1043 }
1044 
1045 /*
1046  * i_ddi_irm_enqueue()
1047  *
1048  *      Queue a pool to be balanced.  Signals the balancing thread to wake
1049  *      up and process the pool.  If 'wait_flag' is true, then the current
1050  *      thread becomes a waiter and blocks until the balance is completed.
1051  */
1052 static void
1053 i_ddi_irm_enqueue(ddi_irm_pool_t *pool_p, boolean_t wait_flag)
1054 {
1055         ASSERT(pool_p != NULL);
1056         ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1057 
1058         DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool_p %p wait_flag %d\n",
1059             (void *)pool_p, (int)wait_flag));
1060 
1061         /* Do nothing if pool is already balanced */
1062 #ifndef DEBUG
1063         if ((pool_p->ipool_reqno == pool_p->ipool_resno)) {
1064 #else
1065         if ((pool_p->ipool_reqno == pool_p->ipool_resno) && !irm_debug_size) {
1066 #endif  /* DEBUG */
1067                 DDI_INTR_IRMDBG((CE_CONT,
1068                     "i_ddi_irm_enqueue: pool already balanced\n"));
1069                 return;
1070         }
1071 
1072         /* Avoid deadlocks when IRM is not active */
1073         if (!irm_active && wait_flag) {
1074                 DDI_INTR_IRMDBG((CE_CONT,
1075                     "i_ddi_irm_enqueue: pool not active.\n"));
1076                 return;
1077         }
1078 
1079         if (wait_flag)
1080                 pool_p->ipool_flags |= DDI_IRM_FLAG_WAITERS;
1081 
1082         if (wait_flag || !(pool_p->ipool_flags & DDI_IRM_FLAG_QUEUED)) {
1083                 pool_p->ipool_flags |= DDI_IRM_FLAG_QUEUED;
1084                 cv_signal(&pool_p->ipool_cv);
1085                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: pool queued.\n"));
1086         }
1087 
1088         if (wait_flag) {
1089                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_enqueue: waiting...\n"));
1090                 cv_wait(&pool_p->ipool_cv, &pool_p->ipool_lock);
1091         }
1092 }
1093 
/*
 * i_ddi_irm_reduce_by_policy()
 *
 *      Reduces requests based on reduction policies.
 *
 *      For the DDI_IRM_POLICY_LARGE reduction policy, the algorithm
 *      generally reduces larger requests first, before advancing
 *      to smaller requests.
 *      For the DDI_IRM_POLICY_EVEN reduction policy, the algorithm
 *      reduces requests evenly, without giving a specific preference
 *      to smaller or larger requests. Each iteration reduces all
 *      reducible requests by the same amount until the imbalance is
 *      corrected.
 *
 *      The scratch list is initially sorted in descending order by current
 *      navail values, which are maximized prior to reduction. This sorted
 *      order is preserved.  It avoids reducing requests below the threshold
 *      of the interrupt pool's default allocation size.
 *
 *      Optimizations in this algorithm include trying to reduce multiple
 *      requests together.  And the algorithm attempts to reduce in larger
 *      increments when possible to minimize the total number of iterations.
 *
 *      Arguments:
 *              pool_p          pool whose scratch list is reduced; the pool
 *                              lock must be held by the caller
 *              imbalance       number of interrupts to take back (> 0)
 *              policy          reduction policy; DDI_IRM_POLICY_LARGE is
 *                              handled specially, any other value takes
 *                              the "even" path
 *
 *      Returns DDI_SUCCESS once the whole imbalance has been removed.
 *      Returns DDI_FAILURE when the scratch list is empty or its head
 *      request is already at or below the pool's default size; any
 *      reductions made by earlier iterations remain applied.
 *
 *      Each reduction of a request's navail is mirrored by the same
 *      reduction of the pool's ipool_resno.
 */
static int
i_ddi_irm_reduce_by_policy(ddi_irm_pool_t *pool_p, int imbalance, int policy)
{
        ASSERT(pool_p != NULL);
        ASSERT(imbalance > 0);
        ASSERT(MUTEX_HELD(&pool_p->ipool_lock));

        /* One pass per group of requests, until nothing is left to remove */
        while (imbalance > 0) {
                list_t          *slist_p = &pool_p->ipool_scratch_list;
                ddi_irm_req_t   *req_p = list_head(slist_p), *last_p;
                uint_t          nreduce = 0, nremain = 0, stop_navail;
                uint_t          pool_defsz = pool_p->ipool_defsz;
                uint_t          reduction, max_redu;

                /*
                 * Fail if none are reducible.  Only the head is checked;
                 * this relies on the descending sort order described above.
                 */
                if (!req_p || req_p->ireq_navail <= pool_defsz) {
                        DDI_INTR_IRMDBG((CE_CONT,
                            "i_ddi_irm_reduce_by_policy: Failure. "
                            "All requests have downsized to low limit.\n"));
                        return (DDI_FAILURE);
                }

                /*
                 * Count reducible requests.
                 *
                 * LARGE policy: stop at the first request smaller than the
                 * head, so only the requests tied for the largest navail
                 * are counted.  Otherwise: stop at the first request that
                 * is not above the default size, so every request above it
                 * is counted.  The head always qualifies, hence nreduce >= 1.
                 */
                stop_navail = (policy == DDI_IRM_POLICY_LARGE) ?
                    req_p->ireq_navail - 1 : pool_defsz;
                for (; req_p; req_p = list_next(slist_p, req_p)) {
                        if (req_p->ireq_navail <= stop_navail)
                                break;
                        nreduce++;
                }

                /*
                 * Compute reduction.
                 *
                 * last_p is the last counted request: the one preceding the
                 * request the scan stopped at, or the list tail if the scan
                 * ran off the end.  Under the LARGE policy, when the next
                 * (uncounted) request is still above the default size, the
                 * counted group is lowered to that request's level; in all
                 * other cases it is lowered to the default size.
                 */
                last_p = req_p ? list_prev(slist_p, req_p) : list_tail(slist_p);
                if ((policy == DDI_IRM_POLICY_LARGE) && req_p &&
                    req_p->ireq_navail > pool_defsz)
                        reduction = last_p->ireq_navail - req_p->ireq_navail;
                else
                        reduction = last_p->ireq_navail - pool_defsz;

                /*
                 * If the full reduction would remove more than needed,
                 * split the imbalance evenly across the counted requests
                 * and keep the remainder in nremain; this ends the outer
                 * loop.  Otherwise apply the full reduction and continue.
                 * (imbalance is positive here, so comparing it with the
                 * unsigned max_redu is well defined.)
                 */
                if ((max_redu = reduction * nreduce) > imbalance) {
                        reduction = imbalance / nreduce;
                        nremain = imbalance % nreduce;
                        pool_p->ipool_resno -= imbalance;
                        imbalance = 0;
                } else {
                        pool_p->ipool_resno -= max_redu;
                        imbalance -= max_redu;
                }

                /*
                 * Reduce the first nreduce requests by 'reduction' each.
                 * The loop is skipped entirely when the per-request share
                 * is zero (imbalance smaller than nreduce).
                 */
                for (req_p = list_head(slist_p); (reduction != 0) && nreduce--;
                    req_p = list_next(slist_p, req_p)) {
                        req_p->ireq_navail -= reduction;
                }

                /*
                 * Hand out the remainder one interrupt at a time, walking
                 * backwards from the last counted request.
                 */
                for (req_p = last_p; nremain--;
                    req_p = list_prev(slist_p, req_p)) {
                        req_p->ireq_navail--;
                }
        }

        return (DDI_SUCCESS);
}
1180 
1181 /*
1182  * i_ddi_irm_reduce_new()
1183  *
1184  *      Reduces new requests.  This is only used as a last resort
1185  *      after another reduction algorithm failed.
1186  *
1187  *      NOTE: The pool locking in i_ddi_irm_insert() ensures
1188  *      there can be only one new request at a time in a pool.
1189  */
1190 static void
1191 i_ddi_irm_reduce_new(ddi_irm_pool_t *pool_p, int imbalance)
1192 {
1193         ddi_irm_req_t   *req_p;
1194 
1195         ASSERT(pool_p != NULL);
1196         ASSERT(imbalance > 0);
1197         ASSERT(MUTEX_HELD(&pool_p->ipool_lock));
1198 
1199         DDI_INTR_IRMDBG((CE_CONT,
1200             "i_ddi_irm_reduce_new: pool_p %p imbalance %d\n",
1201             (void *)pool_p, imbalance));
1202 
1203         for (req_p = list_head(&pool_p->ipool_scratch_list); req_p;
1204             req_p = list_next(&pool_p->ipool_scratch_list, req_p)) {
1205                 if (req_p->ireq_flags & DDI_IRM_FLAG_NEW) {
1206                         ASSERT(req_p->ireq_navail >= imbalance);
1207                         req_p->ireq_navail -= imbalance;
1208                         pool_p->ipool_resno -= imbalance;
1209                         return;
1210                 }
1211         }
1212 
1213         /* should never go here */
1214         ASSERT(B_FALSE);
1215 }
1216 
1217 /*
1218  * Miscellaneous Helper Functions
1219  */
1220 
1221 /*
1222  * i_ddi_intr_get_pool()
1223  *
1224  *      Get an IRM pool that supplies interrupts of a specified type.
1225  *      Invokes a DDI_INTROP_GETPOOL to the bus nexus driver.  Fails
1226  *      if no pool exists.
1227  */
1228 ddi_irm_pool_t *
1229 i_ddi_intr_get_pool(dev_info_t *dip, int type)
1230 {
1231         devinfo_intr_t          *intr_p;
1232         ddi_irm_pool_t          *pool_p;
1233         ddi_irm_req_t           *req_p;
1234         ddi_intr_handle_impl_t  hdl;
1235 
1236         ASSERT(dip != NULL);
1237         ASSERT(DDI_INTR_TYPE_FLAG_VALID(type));
1238 
1239         if (((intr_p = DEVI(dip)->devi_intr_p) != NULL) &&
1240             ((req_p = intr_p->devi_irm_req_p) != NULL) &&
1241             ((pool_p = req_p->ireq_pool_p) != NULL) &&
1242             (pool_p->ipool_types & type)) {
1243                 return (pool_p);
1244         }
1245 
1246         bzero(&hdl, sizeof (ddi_intr_handle_impl_t));
1247         hdl.ih_dip = dip;
1248         hdl.ih_type = type;
1249 
1250         if (i_ddi_intr_ops(dip, dip, DDI_INTROP_GETPOOL,
1251             &hdl, (void *)&pool_p) == DDI_SUCCESS)
1252                 return (pool_p);
1253 
1254         return (NULL);
1255 }
1256 
1257 /*
1258  * i_ddi_irm_insertion_sort()
1259  *
1260  *      Use the insertion sort method to insert a request into a list.
1261  *      The list is sorted in descending order by request size.
1262  */
1263 static void
1264 i_ddi_irm_insertion_sort(list_t *req_list, ddi_irm_req_t *req_p)
1265 {
1266         ddi_irm_req_t   *next_p;
1267 
1268         next_p = list_head(req_list);
1269 
1270         while (next_p && (next_p->ireq_nreq > req_p->ireq_nreq))
1271                 next_p = list_next(req_list, next_p);
1272 
1273         list_insert_before(req_list, next_p, req_p);
1274 }
1275 
1276 /*
1277  * i_ddi_irm_notify()
1278  *
1279  *      Notify a driver of changes to its interrupt request using the
1280  *      generic callback mechanism.  Checks for errors in processing.
1281  */
1282 static int
1283 i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
1284 {
1285         ddi_cb_action_t action;
1286         ddi_cb_t        *cb_p;
1287         uint_t          nintrs;
1288         int             ret, count;
1289 
1290         DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: pool_p %p req_p %p\n",
1291             (void *)pool_p, (void *)req_p));
1292 
1293         /* Do not notify new or unchanged requests */
1294         if ((req_p->ireq_navail == req_p->ireq_scratch) ||
1295             (req_p->ireq_flags & DDI_IRM_FLAG_NEW))
1296                 return (DDI_SUCCESS);
1297 
1298         /* Determine action and count */
1299         if (req_p->ireq_navail > req_p->ireq_scratch) {
1300                 action = DDI_CB_INTR_ADD;
1301                 count = req_p->ireq_navail - req_p->ireq_scratch;
1302                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: adding %d\n",
1303                     count));
1304         } else {
1305                 action = DDI_CB_INTR_REMOVE;
1306                 count = req_p->ireq_scratch - req_p->ireq_navail;
1307                 DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_notify: removing %d\n",
1308                     count));
1309         }
1310 
1311         /* Lookup driver callback */
1312         if ((cb_p = DEVI(req_p->ireq_dip)->devi_cb_p) == NULL) {
1313                 DDI_INTR_IRMDBG((CE_WARN, "i_ddi_irm_notify: no callback!\n"));
1314                 return (DDI_FAILURE);
1315         }
1316 
1317         /* Do callback */
1318         ret = cb_p->cb_func(req_p->ireq_dip, action, (void *)(uintptr_t)count,
1319             cb_p->cb_arg1, cb_p->cb_arg2);
1320 
1321         /* Log callback errors */
1322         if (ret != DDI_SUCCESS) {
1323                 cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
1324                     ddi_driver_name(req_p->ireq_dip),
1325                     ddi_get_instance(req_p->ireq_dip), (int)action, ret);
1326         }
1327 
1328         /* Check if the driver exceeds its availability */
1329         nintrs = i_ddi_intr_get_current_nintrs(req_p->ireq_dip);
1330         if (nintrs > req_p->ireq_navail) {
1331                 cmn_err(CE_WARN, "%s%d: failed to release interrupts "
1332                     "(nintrs=%d, navail=%d).\n",
1333                     ddi_driver_name(req_p->ireq_dip),
1334                     ddi_get_instance(req_p->ireq_dip), nintrs,
1335                     req_p->ireq_navail);
1336                 pool_p->ipool_resno += (nintrs - req_p->ireq_navail);
1337                 req_p->ireq_navail = nintrs;
1338                 return (DDI_FAILURE);
1339         }
1340 
1341         /* Update request */
1342         req_p->ireq_scratch = req_p->ireq_navail;
1343 
1344         return (DDI_SUCCESS);
1345 }
1346 
/*
 * i_ddi_irm_debug_balance()
 *
 *      Debug/test only (DEBUG kernels): queue the pool that serves a
 *      device's current interrupt type for balancing.  'wait_flag' is
 *      passed through to i_ddi_irm_enqueue() and selects a synchronous
 *      request.  Does nothing if the device has no current interrupt
 *      type or no pool can be found for it.
 */
#ifdef  DEBUG
void
i_ddi_irm_debug_balance(dev_info_t *dip, boolean_t wait_flag)
{
        ddi_irm_pool_t  *pool_p;
        int             type;

        DDI_INTR_IRMDBG((CE_CONT, "i_ddi_irm_debug_balance: dip %p wait %d\n",
            (void *)dip, (int)wait_flag));

        /* The device must currently be using some interrupt type */
        type = i_ddi_intr_get_current_type(dip);
        if (type == 0)
                return;

        /* ... and a pool must exist for that type */
        pool_p = i_ddi_intr_get_pool(dip, type);
        if (pool_p == NULL)
                return;

        /* Enqueueing requires the pool lock */
        mutex_enter(&pool_p->ipool_lock);
        i_ddi_irm_enqueue(pool_p, wait_flag);
        mutex_exit(&pool_p->ipool_lock);
}
#endif