1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/pool.h>
  27 #include <sys/pool_impl.h>
  28 #include <sys/pool_pset.h>
  29 #include <sys/id_space.h>
  30 #include <sys/mutex.h>
  31 #include <sys/nvpair.h>
  32 #include <sys/cpuvar.h>
  33 #include <sys/errno.h>
  34 #include <sys/cmn_err.h>
  35 #include <sys/systm.h>
  36 #include <sys/proc.h>
  37 #include <sys/fss.h>
  38 #include <sys/class.h>
  39 #include <sys/exacct.h>
  40 #include <sys/utsname.h>
  41 #include <sys/procset.h>
  42 #include <sys/atomic.h>
  43 #include <sys/zone.h>
  44 #include <sys/policy.h>
  45 #include <sys/schedctl.h>
  46 #include <sys/taskq.h>
  47 
  48 /*
  49  * RESOURCE POOLS
  50  *
 * The resource pools facility brings together process-bindable resources into
  52  * a common abstraction called a pool. Processor sets and other entities can
  53  * be configured, grouped, and labelled such that workload components can be
  54  * associated with a subset of a system's total resources.
  55  *
  56  * When disabled, the pools facility is "invisible".  All processes belong
  57  * to the same pool (pool_default), and processor sets can be managed through
  58  * the old pset() system call.  When enabled, processor sets can only be
  59  * managed via the pools facility.  New pools can be created and associated
  60  * with processor sets.  Processes can be bound to pools which have non-empty
  61  * resource sets.
  62  *
  63  * Locking: pool_lock() protects global pools state and must be called
  64  * before modifying the configuration, or when taking a snapshot of the
  65  * configuration.  If pool_lock_intr() is used, the operation may be
  66  * interrupted by a signal or a request.
  67  *
 * To prevent processes from being rebound between pools while they are in
 * the middle of an operation which affects resource set bindings, such
 * operations must be surrounded by calls to pool_barrier_enter() and
 * pool_barrier_exit().  This mechanism guarantees that such processes will
 * be stopped either at the beginning or at the end of the barrier so that
 * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
  75  *
  76  * Lock ordering with respect to other locks is as follows:
  77  *
  78  *      pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
  79  *
  80  * Most static and global variables defined in this file are protected
  81  * by calling pool_lock().
  82  *
  83  * The operation that binds tasks and projects to pools is atomic.  That is,
  84  * either all processes in a given task or a project will be bound to a
 * new pool, or (in case of an error) they will all be left bound to the
  86  * old pool. Processes in a given task or a given project can only be bound to
  87  * different pools if they were rebound individually one by one as single
  88  * processes.  Threads or LWPs of the same process do not have pool bindings,
  89  * and are bound to the same resource sets associated with the resource pool
  90  * of that process.
  91  *
  92  * The following picture shows one possible pool configuration with three
  93  * pools and three processor sets.  Note that processor set "foo" is not
  94  * associated with any pools and therefore cannot have any processes
  95  * bound to it.  Two pools (default and foo) are associated with the
  96  * same processor set (default).  Also, note that processes in Task 2
  97  * are bound to different pools.
  98  *
  99  *
 100  *                                                             Processor Sets
 101  *                                                              +---------+
 102  *                     +--------------+========================>| default |
 103  *                    a|              |                         +---------+
 104  *                    s|              |                             ||
 105  *                    s|              |                         +---------+
 106  *                    o|              |                         |   foo   |
 107  *                    c|              |                         +---------+
 108  *                    i|              |                             ||
 109  *                    a|              |                         +---------+
 110  *                    t|              |                 +------>|   bar   |
 111  *                    e|              |                 |       +---------+
 112  *                    d|              |                 |
 113  *                     |              |                 |
 114  *             +---------+      +---------+      +---------+
 115  *     Pools   | default |======|   foo   |======|   bar   |
 116  *             +---------+      +---------+      +---------+
 117  *                 @  @            @              @ @   @
 118  *                b|  |            |              | |   |
 119  *                o|  |            |              | |   |
 120  *                u|  +-----+      |      +-------+ |   +---+
 121  *                n|        |      |      |         |       |
 122  *            ....d|........|......|......|.........|.......|....
 123  *            :    |   ::   |      |      |    ::   |       |   :
 124  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
 125  *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
 126  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
 127  *            :........::......................::...............:
 128  *              Task 1            Task 2              Task N
 129  *                 |                 |                  |
 130  *                 |                 |                  |
 131  *                 |  +-----------+  |             +-----------+
 132  *                 +--| Project 1 |--+             | Project N |
 133  *                    +-----------+                +-----------+
 134  *
 135  * This is just an illustration of relationships between processes, tasks,
 136  * projects, pools, and processor sets. New types of resource sets will be
 137  * added in the future.
 138  */
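
/*
 * For illustration, a minimal sketch of how a configuration-changing
 * caller is expected to use the facility.  The real callers are the
 * /dev/pool ioctls, processor_bind(), and pset_*(); the body shown here
 * is hypothetical:
 *
 *      pool_lock();
 *      if (pool_state == POOL_ENABLED) {
 *              mutex_enter(&cpu_lock);   (pool_lock() -> cpu_lock ordering)
 *              ... inspect or modify pools and processor sets ...
 *              mutex_exit(&cpu_lock);
 *      }
 *      pool_unlock();
 */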
 139 
 140 pool_t          *pool_default;  /* default pool which always exists */
 141 int             pool_count;     /* number of pools created on this system */
 142 int             pool_state;     /* pools state -- enabled/disabled */
 143 void            *pool_buf;      /* pre-commit snapshot of the pools state */
 144 size_t          pool_bufsz;     /* size of pool_buf */
 145 static hrtime_t pool_pool_mod;  /* last modification time for pools */
 146 static hrtime_t pool_sys_mod;   /* last modification time for system */
 147 static nvlist_t *pool_sys_prop; /* system properties */
 148 static id_space_t *pool_ids;    /* pool ID space */
 149 static list_t   pool_list;      /* doubly-linked list of pools */
 150 static kmutex_t         pool_mutex;             /* protects pool_busy_* */
 151 static kcondvar_t       pool_busy_cv;           /* waiting for "pool_lock" */
 152 static kthread_t        *pool_busy_thread;      /* thread holding "pool_lock" */
 153 static kmutex_t         pool_barrier_lock;      /* synch. with pool_barrier_* */
 154 static kcondvar_t       pool_barrier_cv;        /* synch. with pool_barrier_* */
 155 static int              pool_barrier_count;     /* synch. with pool_barrier_* */
 156 static list_t           pool_event_cb_list;     /* pool event callbacks */
 157 static boolean_t        pool_event_cb_init = B_FALSE;
 158 static kmutex_t         pool_event_cb_lock;
 159 static taskq_t          *pool_event_cb_taskq = NULL;
 160 
 161 void pool_event_dispatch(pool_event_t, poolid_t);
 162 
 163 /*
 164  * Boot-time pool initialization.
 165  */
 166 void
 167 pool_init(void)
 168 {
 169         pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
 170 
 171         /*
 172          * Initialize default pool.
 173          */
 174         pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
 175         pool_default->pool_id = POOL_DEFAULT;
 176         list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
 177         list_insert_head(&pool_list, pool_default);
 178 
 179         /*
 180          * Initialize plugins for resource sets.
 181          */
 182         pool_pset_init();
 183         pool_count = 1;
 184         p0.p_pool = pool_default;
 185         global_zone->zone_pool = pool_default;
 186         pool_default->pool_ref = 1;
 187 }
 188 
 189 /*
 190  * Synchronization routines.
 191  *
 192  * pool_lock is only called from syscall-level routines (processor_bind(),
 193  * pset_*(), and /dev/pool ioctls).  The pool "lock" may be held for long
 194  * periods of time, including across sleeping operations, so we allow its
 195  * acquisition to be interruptible.
 196  *
 197  * The current thread that owns the "lock" is stored in the variable
 198  * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
 199  */
 200 void
 201 pool_lock(void)
 202 {
 203         mutex_enter(&pool_mutex);
 204         ASSERT(!pool_lock_held());
 205         while (pool_busy_thread != NULL)
 206                 cv_wait(&pool_busy_cv, &pool_mutex);
 207         pool_busy_thread = curthread;
 208         mutex_exit(&pool_mutex);
 209 }
 210 
 211 int
 212 pool_lock_intr(void)
 213 {
 214         mutex_enter(&pool_mutex);
 215         ASSERT(!pool_lock_held());
 216         while (pool_busy_thread != NULL) {
 217                 if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
 218                         cv_signal(&pool_busy_cv);
 219                         mutex_exit(&pool_mutex);
 220                         return (1);
 221                 }
 222         }
 223         pool_busy_thread = curthread;
 224         mutex_exit(&pool_mutex);
 225         return (0);
 226 }
 227 
 228 int
 229 pool_lock_held(void)
 230 {
 231         return (pool_busy_thread == curthread);
 232 }
 233 
 234 void
 235 pool_unlock(void)
 236 {
 237         mutex_enter(&pool_mutex);
 238         ASSERT(pool_lock_held());
 239         pool_busy_thread = NULL;
 240         cv_signal(&pool_busy_cv);
 241         mutex_exit(&pool_mutex);
 242 }
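
/*
 * A minimal usage sketch for the interruptible variant (the caller shown
 * here is hypothetical): pool_lock_intr() returns nonzero when the wait
 * was interrupted by a signal, in which case the caller does not hold the
 * "lock" and typically backs out, for example with EINTR.
 *
 *      if (pool_lock_intr() != 0)
 *              return (EINTR);
 *      ... read or modify the pools configuration ...
 *      pool_unlock();
 */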
 243 
 244 /*
 245  * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
 246  * with pool_do_bind().
 247  *
 248  * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
 249  * operations which modify pool or pset associations.  They can be called
 * while the process is multi-threaded.  In the common case, when the
 * current process is not being rebound (the PBWAIT flag is not set), these
 * functions just increment and decrement reference counts.
 253  */
 254 void
 255 pool_barrier_enter(void)
 256 {
 257         proc_t *p = curproc;
 258 
 259         ASSERT(MUTEX_HELD(&p->p_lock));
 260         while (p->p_poolflag & PBWAIT)
 261                 cv_wait(&p->p_poolcv, &p->p_lock);
 262         p->p_poolcnt++;
 263 }
 264 
 265 void
 266 pool_barrier_exit(void)
 267 {
 268         proc_t *p = curproc;
 269 
 270         ASSERT(MUTEX_HELD(&p->p_lock));
 271         ASSERT(p->p_poolcnt > 0);
 272         p->p_poolcnt--;
 273         if (p->p_poolflag & PBWAIT) {
 274                 mutex_enter(&pool_barrier_lock);
 275                 ASSERT(pool_barrier_count > 0);
 276                 pool_barrier_count--;
 277                 if (pool_barrier_count == 0)
 278                         cv_signal(&pool_barrier_cv);
 279                 mutex_exit(&pool_barrier_lock);
 280                 while (p->p_poolflag & PBWAIT)
 281                         cv_wait(&p->p_poolcv, &p->p_lock);
 282         }
 283 }
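
/*
 * Schematically, a caller such as fork(), exec(), exit(), or lwp_create()
 * brackets its binding-sensitive work as follows (illustrative sketch
 * only; the actual callers live in their respective source files):
 *
 *      proc_t *p = curproc;
 *
 *      mutex_enter(&p->p_lock);
 *      pool_barrier_enter();
 *      mutex_exit(&p->p_lock);
 *
 *      ... the work that must not race with pool_do_bind() ...
 *
 *      mutex_enter(&p->p_lock);
 *      pool_barrier_exit();
 *      mutex_exit(&p->p_lock);
 */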
 284 
 285 /*
 286  * Enable pools facility.
 287  */
 288 static int
 289 pool_enable(void)
 290 {
 291         int ret;
 292 
 293         ASSERT(pool_lock_held());
 294         ASSERT(pool_count == 1);
 295 
 296         ret = pool_pset_enable();
 297         if (ret != 0)
 298                 return (ret);
 299         (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
 300         (void) nvlist_add_string(pool_sys_prop, "system.name",
 301             "default");
 302         (void) nvlist_add_string(pool_sys_prop, "system.comment", "");
 303         (void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
 304         (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
 305         (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
 306             "wt-load");
 307 
 308         (void) nvlist_alloc(&pool_default->pool_props,
 309             NV_UNIQUE_NAME, KM_SLEEP);
 310         (void) nvlist_add_string(pool_default->pool_props,
 311             "pool.name", "pool_default");
 312         (void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
 313         (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
 314         (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
 315         (void) nvlist_add_int64(pool_default->pool_props,
 316             "pool.importance", 1);
 317         (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
 318             pool_default->pool_id);
 319 
 320         pool_sys_mod = pool_pool_mod = gethrtime();
 321 
 322         return (ret);
 323 }
 324 
 325 /*
 326  * Disable pools facility.
 327  */
 328 static int
 329 pool_disable(void)
 330 {
 331         int ret;
 332 
 333         ASSERT(pool_lock_held());
 334 
 335         if (pool_count > 1)  /* must destroy all pools first */
 336                 return (EBUSY);
 337 
 338         ret = pool_pset_disable();
 339         if (ret != 0)
 340                 return (ret);
 341         if (pool_sys_prop != NULL) {
 342                 nvlist_free(pool_sys_prop);
 343                 pool_sys_prop = NULL;
 344         }
 345         if (pool_default->pool_props != NULL) {
 346                 nvlist_free(pool_default->pool_props);
 347                 pool_default->pool_props = NULL;
 348         }
 349         return (0);
 350 }
 351 
 352 pool_t *
 353 pool_lookup_pool_by_name(char *name)
 354 {
 355         pool_t *pool = pool_default;
 356         char *p;
 357 
 358         ASSERT(pool_lock_held());
 359         for (pool = list_head(&pool_list); pool;
 360             pool = list_next(&pool_list, pool)) {
 361                 if (nvlist_lookup_string(pool->pool_props,
 362                     "pool.name", &p) == 0 && strcmp(name, p) == 0)
 363                         return (pool);
 364         }
 365         return (NULL);
 366 }
 367 
 368 pool_t *
 369 pool_lookup_pool_by_id(poolid_t poolid)
 370 {
 371         pool_t *pool = pool_default;
 372 
 373         ASSERT(pool_lock_held());
 374         for (pool = list_head(&pool_list); pool;
 375             pool = list_next(&pool_list, pool)) {
 376                 if (pool->pool_id == poolid)
 377                         return (pool);
 378         }
 379         return (NULL);
 380 }
 381 
 382 pool_t *
 383 pool_lookup_pool_by_pset(int id)
 384 {
 385         pool_t *pool = pool_default;
 386         psetid_t psetid = (psetid_t)id;
 387 
 388         ASSERT(pool_lock_held());
 389         for (pool = list_head(&pool_list); pool != NULL;
 390             pool = list_next(&pool_list, pool)) {
 391                 if (pool->pool_pset->pset_id == psetid)
 392                         return (pool);
 393         }
 394         return (NULL);
 395 }
 396 
 397 /*
 398  * Create new pool, associate it with default resource sets, and give
 399  * it a temporary name.
 400  */
 401 static int
 402 pool_pool_create(poolid_t *poolid)
 403 {
 404         pool_t *pool;
 405         char pool_name[40];
 406 
 407         ASSERT(pool_lock_held());
 408 
 409         pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
 410         pool->pool_id = *poolid = id_alloc(pool_ids);
 411         pool->pool_pset = pool_pset_default;
 412         pool_pset_default->pset_npools++;
 413         list_insert_tail(&pool_list, pool);
 414         (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
 415         (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
 416         (void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
 417         pool_pool_mod = gethrtime();
 418         (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
 419             pool_pool_mod);
 420         (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
 421         pool_count++;
 422         return (0);
 423 }
 424 
 425 struct destroy_zone_arg {
 426         pool_t *old;
 427         pool_t *new;
 428 };
 429 
 430 /*
 431  * Update pool pointers for zones that are currently bound to pool "old"
 432  * to be bound to pool "new".
 433  */
 434 static int
 435 pool_destroy_zone_cb(zone_t *zone, void *arg)
 436 {
 437         struct destroy_zone_arg *dza = arg;
 438 
 439         ASSERT(pool_lock_held());
 440         ASSERT(MUTEX_HELD(&cpu_lock));
 441 
 442         if (zone_pool_get(zone) == dza->old)
 443                 zone_pool_set(zone, dza->new);
 444         return (0);
 445 }
 446 
 447 /*
 448  * Destroy specified pool, and rebind all processes in it
 449  * to the default pool.
 450  */
 451 static int
 452 pool_pool_destroy(poolid_t poolid)
 453 {
 454         pool_t *pool;
 455         int ret;
 456 
 457         ASSERT(pool_lock_held());
 458 
 459         if (poolid == POOL_DEFAULT)
 460                 return (EINVAL);
 461         if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
 462                 return (ESRCH);
 463         ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
 464         if (ret == 0) {
 465                 struct destroy_zone_arg dzarg;
 466 
 467                 dzarg.old = pool;
 468                 dzarg.new = pool_default;
 469                 mutex_enter(&cpu_lock);
 470                 ret = zone_walk(pool_destroy_zone_cb, &dzarg);
 471                 mutex_exit(&cpu_lock);
 472                 ASSERT(ret == 0);
 473                 ASSERT(pool->pool_ref == 0);
 474                 (void) nvlist_free(pool->pool_props);
 475                 id_free(pool_ids, pool->pool_id);
 476                 pool->pool_pset->pset_npools--;
 477                 list_remove(&pool_list, pool);
 478                 pool_count--;
 479                 pool_pool_mod = gethrtime();
 480                 kmem_free(pool, sizeof (pool_t));
 481         }
 482         return (ret);
 483 }
 484 
 485 /*
 486  * Create new pool or resource set.
 487  */
 488 int
 489 pool_create(int class, int subclass, id_t *id)
 490 {
 491         int ret;
 492 
 493         ASSERT(pool_lock_held());
 494         if (pool_state == POOL_DISABLED)
 495                 return (ENOTACTIVE);
 496         switch (class) {
 497         case PEC_POOL:
 498                 ret = pool_pool_create((poolid_t *)id);
 499                 break;
 500         case PEC_RES_COMP:
 501                 switch (subclass) {
 502                 case PREC_PSET:
 503                         ret = pool_pset_create((psetid_t *)id);
 504                         break;
 505                 default:
 506                         ret = EINVAL;
 507                 }
 508                 break;
 509         case PEC_RES_AGG:
 510                 ret = ENOTSUP;
 511                 break;
 512         default:
 513                 ret = EINVAL;
 514         }
 515         return (ret);
 516 }
 517 
 518 /*
 519  * Destroy an existing pool or resource set.
 520  */
 521 int
 522 pool_destroy(int class, int subclass, id_t id)
 523 {
 524         int ret;
 525 
 526         ASSERT(pool_lock_held());
 527         if (pool_state == POOL_DISABLED)
 528                 return (ENOTACTIVE);
 529         switch (class) {
 530         case PEC_POOL:
 531                 ret = pool_pool_destroy((poolid_t)id);
 532                 break;
 533         case PEC_RES_COMP:
 534                 switch (subclass) {
 535                 case PREC_PSET:
 536                         ret = pool_pset_destroy((psetid_t)id);
 537                         break;
 538                 default:
 539                         ret = EINVAL;
 540                 }
 541                 break;
 542         case PEC_RES_AGG:
 543                 ret = ENOTSUP;
 544                 break;
 545         default:
 546                 ret = EINVAL;
 547         }
 548         return (ret);
 549 }
 550 
 551 /*
 552  * Enable or disable pools.
 553  */
 554 int
 555 pool_status(int status)
 556 {
 557         int ret = 0;
 558 
 559         ASSERT(pool_lock_held());
 560 
 561         if (pool_state == status)
 562                 return (0);
 563         switch (status) {
 564         case POOL_ENABLED:
 565                 ret = pool_enable();
 566                 if (ret != 0)
 567                         return (ret);
 568                 pool_state = POOL_ENABLED;
 569                 pool_event_dispatch(POOL_E_ENABLE, NULL);
 570                 break;
 571         case POOL_DISABLED:
 572                 ret = pool_disable();
 573                 if (ret != 0)
 574                         return (ret);
 575                 pool_state = POOL_DISABLED;
 576                 pool_event_dispatch(POOL_E_DISABLE, NULL);
 577                 break;
 578         default:
 579                 ret = EINVAL;
 580         }
 581         return (ret);
 582 }
 583 
 584 /*
 585  * Associate pool with resource set.
 586  */
 587 int
 588 pool_assoc(poolid_t poolid, int idtype, id_t id)
 589 {
 590         int ret;
 591 
 592         ASSERT(pool_lock_held());
 593         if (pool_state == POOL_DISABLED)
 594                 return (ENOTACTIVE);
 595         switch (idtype) {
 596         case PREC_PSET:
 597                 ret = pool_pset_assoc(poolid, (psetid_t)id);
 598                 if (ret == 0)
 599                         pool_event_dispatch(POOL_E_CHANGE, poolid);
 600                 break;
 601         default:
 602                 ret = EINVAL;
 603         }
 604         if (ret == 0)
 605                 pool_pool_mod = gethrtime();
 606         return (ret);
 607 }
 608 
 609 /*
 610  * Disassociate resource set from pool.
 611  */
 612 int
 613 pool_dissoc(poolid_t poolid, int idtype)
 614 {
 615         int ret;
 616 
 617         ASSERT(pool_lock_held());
 618         if (pool_state == POOL_DISABLED)
 619                 return (ENOTACTIVE);
 620         switch (idtype) {
 621         case PREC_PSET:
 622                 ret = pool_pset_assoc(poolid, PS_NONE);
 623                 if (ret == 0)
 624                         pool_event_dispatch(POOL_E_CHANGE, poolid);
 625                 break;
 626         default:
 627                 ret = EINVAL;
 628         }
 629         if (ret == 0)
 630                 pool_pool_mod = gethrtime();
 631         return (ret);
 632 }
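
/*
 * Putting the routines above together, a hypothetical caller (error
 * handling elided) would build up a configuration like this, with
 * pool_lock() held and the facility enabled:
 *
 *      id_t poolid, psetid;
 *
 *      (void) pool_create(PEC_POOL, 0, &poolid);
 *      (void) pool_create(PEC_RES_COMP, PREC_PSET, &psetid);
 *      (void) pool_assoc((poolid_t)poolid, PREC_PSET, psetid);
 *
 * In practice such requests arrive through the /dev/pool ioctls rather
 * than from other kernel code.
 */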
 633 
 634 /*
 635  * Transfer specified quantity of resources between resource sets.
 636  */
 637 /*ARGSUSED*/
 638 int
 639 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
 640 {
 641         int ret = EINVAL;
 642 
 643         return (ret);
 644 }
 645 
 646 static poolid_t
 647 pool_lookup_id_by_pset(int id)
 648 {
 649         pool_t *pool = pool_default;
 650         psetid_t psetid = (psetid_t)id;
 651 
 652         ASSERT(pool_lock_held());
 653         for (pool = list_head(&pool_list); pool != NULL;
 654             pool = list_next(&pool_list, pool)) {
 655                 if (pool->pool_pset->pset_id == psetid)
 656                         return (pool->pool_id);
 657         }
 658         return (POOL_INVALID);
 659 }
 660 
 661 /*
 662  * Transfer resources specified by their IDs between resource sets.
 663  */
 664 int
 665 pool_xtransfer(int type, id_t src_pset, id_t dst_pset, uint_t size, id_t *ids)
 666 {
 667         int ret;
 668         poolid_t src_pool, dst_pool;
 669 
 670         ASSERT(pool_lock_held());
 671         if (pool_state == POOL_DISABLED)
 672                 return (ENOTACTIVE);
 673         switch (type) {
 674         case PREC_PSET:
 675                 ret = pool_pset_xtransfer((psetid_t)src_pset,
 676                     (psetid_t)dst_pset, size, ids);
 677                 if (ret == 0) {
                        if ((src_pool = pool_lookup_id_by_pset(src_pset)) !=
 679                             POOL_INVALID)
 680                                 pool_event_dispatch(POOL_E_CHANGE, src_pool);
                        if ((dst_pool = pool_lookup_id_by_pset(dst_pset)) !=
 682                             POOL_INVALID)
 683                                 pool_event_dispatch(POOL_E_CHANGE, dst_pool);
 684                 }
 685                 break;
 686         default:
 687                 ret = EINVAL;
 688         }
 689         return (ret);
 690 }
 691 
 692 /*
 693  * Bind processes to pools.
 694  */
 695 int
 696 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
 697 {
 698         pool_t  *pool;
 699 
 700         ASSERT(pool_lock_held());
 701 
 702         if (pool_state == POOL_DISABLED)
 703                 return (ENOTACTIVE);
 704         if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
 705                 return (ESRCH);
 706 
 707         switch (idtype) {
 708         case P_PID:
 709         case P_TASKID:
 710         case P_PROJID:
 711         case P_ZONEID:
 712                 break;
 713         default:
 714                 return (EINVAL);
 715         }
 716         return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
 717 }
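
/*
 * For example, a hypothetical caller that wants to move everything in a
 * zone into a pool would use:
 *
 *      error = pool_bind(poolid, P_ZONEID, (id_t)zoneid);
 *
 * which goes through the all-or-nothing pool_do_bind() path described
 * later in this file.
 */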
 718 
 719 /*
 * Query the pool binding of the specified process.
 721  */
 722 int
 723 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
 724 {
 725         proc_t *p;
 726 
 727         if (idtype != P_PID)
 728                 return (ENOTSUP);
 729         if (id == P_MYID)
 730                 id = curproc->p_pid;
 731 
 732         ASSERT(pool_lock_held());
 733 
 734         mutex_enter(&pidlock);
 735         if ((p = prfind((pid_t)id)) == NULL) {
 736                 mutex_exit(&pidlock);
 737                 return (ESRCH);
 738         }
 739         mutex_enter(&p->p_lock);
 740         /*
 741          * In local zones, lie about pool bindings of processes from
 742          * the global zone.
 743          */
 744         if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
 745                 pool_t *pool;
 746 
 747                 pool = zone_pool_get(curproc->p_zone);
 748                 *poolid = pool->pool_id;
 749         } else {
 750                 *poolid = p->p_pool->pool_id;
 751         }
 752         mutex_exit(&p->p_lock);
 753         mutex_exit(&pidlock);
 754         return (0);
 755 }
 756 
 757 static ea_object_t *
 758 pool_system_pack(void)
 759 {
 760         ea_object_t *eo_system;
 761         size_t bufsz = 0;
 762         char *buf = NULL;
 763 
 764         ASSERT(pool_lock_held());
 765 
 766         eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
 767         (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
 768             EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
 769         if (INGLOBALZONE(curproc))
 770                 (void) ea_attach_item(eo_system, &pool_pool_mod,
 771                     sizeof (hrtime_t),
 772                     EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
 773         else
 774                 (void) ea_attach_item(eo_system,
 775                     &curproc->p_zone->zone_pool_mod,
 776                     sizeof (hrtime_t),
 777                     EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
 778         (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
 779             EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
 780         (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
 781             EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
 782         (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
 783         (void) ea_attach_item(eo_system, buf, bufsz,
 784             EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
 785         kmem_free(buf, bufsz);
 786         return (eo_system);
 787 }
 788 
 789 /*
 790  * Pack information about pools and attach it to specified exacct group.
 791  */
 792 static int
 793 pool_pool_pack(ea_object_t *eo_system)
 794 {
 795         ea_object_t *eo_pool;
 796         pool_t *pool;
 797         size_t bufsz;
 798         char *buf;
 799         pool_t *myzonepool;
 800 
 801         ASSERT(pool_lock_held());
 802         myzonepool = zone_pool_get(curproc->p_zone);
 803         for (pool = list_head(&pool_list); pool;
 804             pool = list_next(&pool_list, pool)) {
 805                 if (!INGLOBALZONE(curproc) && myzonepool != pool)
 806                         continue;
 807                 bufsz = 0;
 808                 buf = NULL;
 809                 eo_pool = ea_alloc_group(EXT_GROUP |
 810                     EXC_LOCAL | EXD_GROUP_POOL);
 811                 (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
 812                     EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
 813                 (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
 814                     sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
 815                 (void) nvlist_pack(pool->pool_props, &buf, &bufsz,
 816                     NV_ENCODE_NATIVE, 0);
 817                 (void) ea_attach_item(eo_pool, buf, bufsz,
 818                     EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
 819                 kmem_free(buf, bufsz);
 820                 (void) ea_attach_to_group(eo_system, eo_pool);
 821         }
 822         return (0);
 823 }
 824 
 825 /*
 826  * Pack the whole pool configuration in the specified buffer.
 827  */
 828 int
 829 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
 830 {
 831         ea_object_t *eo_system;
 832         size_t ksize;
 833         int ret = 0;
 834 
 835         ASSERT(pool_lock_held());
 836 
 837         eo_system = pool_system_pack();         /* 1. pack system */
 838         (void) pool_pool_pack(eo_system);       /* 2. pack all pools */
 839         (void) pool_pset_pack(eo_system);       /* 3. pack all psets */
 840         ksize = ea_pack_object(eo_system, NULL, 0);
 841         if (kbuf == NULL || kbufsz == 0)
 842                 *asize = ksize;
 843         else if (ksize > kbufsz)
 844                 ret = ENOMEM;
 845         else
 846                 *asize = ea_pack_object(eo_system, kbuf, kbufsz);
 847         ea_free_object(eo_system, EUP_ALLOC);
 848         return (ret);
 849 }
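
/*
 * The interface supports the usual two-pass sizing protocol: call once
 * with a NULL buffer to learn the required size, allocate, then call
 * again to fill it in.  A hypothetical sketch (errors ignored, and with
 * pool_lock() held so the configuration cannot change in between):
 *
 *      size_t sz;
 *      void *buf;
 *
 *      (void) pool_pack_conf(NULL, 0, &sz);
 *      buf = kmem_alloc(sz, KM_SLEEP);
 *      (void) pool_pack_conf(buf, sz, &sz);
 */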
 850 
 851 /*
 * Start/end the commit transaction.  If a commit transaction is currently
 * in progress, then all POOL_QUERY ioctls will return the pools
 * configuration as it was at the beginning of the transaction.
 855  */
 856 int
 857 pool_commit(int state)
 858 {
 859         ea_object_t *eo_system;
 860         int ret = 0;
 861 
 862         ASSERT(pool_lock_held());
 863 
 864         if (pool_state == POOL_DISABLED)
 865                 return (ENOTACTIVE);
 866         switch (state) {
 867         case 1:
 868                 /*
                 * Beginning commit transaction.
 870                  */
 871                 if (pool_buf != NULL)           /* transaction in progress */
 872                         return (EBUSY);
 873                 eo_system = pool_system_pack();         /* 1. pack system */
 874                 (void) pool_pool_pack(eo_system);       /* 2. pack all pools */
 875                 (void) pool_pset_pack(eo_system);       /* 3. pack all psets */
 876                 pool_bufsz = ea_pack_object(eo_system, NULL, 0);
 877                 pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
 878                 pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
 879                 ea_free_object(eo_system, EUP_ALLOC);
 880                 break;
 881         case 0:
 882                 /*
 883                  * Finishing commit transaction.
 884                  */
 885                 if (pool_buf != NULL) {
 886                         kmem_free(pool_buf, pool_bufsz);
 887                         pool_buf = NULL;
 888                         pool_bufsz = 0;
 889                 }
 890                 break;
 891         default:
 892                 ret = EINVAL;
 893         }
 894         return (ret);
 895 }
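
/*
 * In other words, state 1 snapshots the current configuration into
 * pool_buf and state 0 discards that snapshot.  A hypothetical sequence,
 * each call made with pool_lock() held:
 *
 *      (void) pool_commit(1);          begin: snapshot taken
 *      ... apply configuration changes ...
 *      (void) pool_commit(0);          end: snapshot released
 */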
 896 
 897 /*
 * Check if the specified property is special.
 899  */
 900 static pool_property_t *
 901 pool_property_find(char *name, pool_property_t *list)
 902 {
 903         pool_property_t *prop;
 904 
 905         for (prop = list; prop->pp_name != NULL; prop++)
 906                 if (strcmp(prop->pp_name, name) == 0)
 907                         return (prop);
 908         return (NULL);
 909 }
 910 
 911 static pool_property_t pool_prop_sys[] = {
 912         { "system.name",                DATA_TYPE_STRING,       PP_RDWR },
 913         { "system.comment",             DATA_TYPE_STRING,       PP_RDWR },
 914         { "system.version",             DATA_TYPE_UINT64,       PP_READ },
 915         { "system.bind-default",        DATA_TYPE_BYTE,         PP_RDWR },
 916         { "system.allocate-method",     DATA_TYPE_STRING,
 917             PP_RDWR | PP_OPTIONAL },
 918         { "system.poold.log-level",     DATA_TYPE_STRING,
 919             PP_RDWR | PP_OPTIONAL },
 920         { "system.poold.log-location",  DATA_TYPE_STRING,
 921             PP_RDWR | PP_OPTIONAL },
 922         { "system.poold.monitor-interval",      DATA_TYPE_UINT64,
 923             PP_RDWR | PP_OPTIONAL },
 924         { "system.poold.history-file",  DATA_TYPE_STRING,
 925             PP_RDWR | PP_OPTIONAL },
 926         { "system.poold.objectives",    DATA_TYPE_STRING,
 927             PP_RDWR | PP_OPTIONAL },
 928         { NULL,                         0,                      0 }
 929 };
 930 
 931 static pool_property_t pool_prop_pool[] = {
 932         { "pool.sys_id",                DATA_TYPE_UINT64,       PP_READ },
 933         { "pool.name",                  DATA_TYPE_STRING,       PP_RDWR },
 934         { "pool.default",               DATA_TYPE_BYTE,         PP_READ },
 935         { "pool.active",                DATA_TYPE_BYTE,         PP_RDWR },
 936         { "pool.importance",            DATA_TYPE_INT64,        PP_RDWR },
 937         { "pool.comment",               DATA_TYPE_STRING,       PP_RDWR },
 938         { "pool.scheduler",             DATA_TYPE_STRING,
 939             PP_RDWR | PP_OPTIONAL },
 940         { NULL,                         0,                      0 }
 941 };
 942 
 943 /*
 944  * Common routine to put new property on the specified list
 945  */
 946 int
 947 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
 948 {
 949         pool_property_t *prop;
 950 
 951         if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
 952                 /*
 953                  * No read-only properties or properties with bad types
 954                  */
 955                 if (!(prop->pp_perm & PP_WRITE) ||
 956                     prop->pp_type != nvpair_type(pair))
 957                         return (EINVAL);
 958         }
 959         return (nvlist_add_nvpair(nvlist, pair));
 960 }
 961 
 962 /*
 963  * Common routine to remove property from the given list
 964  */
 965 int
 966 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
 967 {
 968         pool_property_t *prop;
 969 
 970         if ((prop = pool_property_find(name, props)) != NULL) {
 971                 if (!(prop->pp_perm & PP_OPTIONAL))
 972                         return (EINVAL);
 973         }
 974         return (nvlist_remove_all(nvlist, name));
 975 }
 976 
 977 static int
 978 pool_system_propput(nvpair_t *pair)
 979 {
 980         int ret;
 981 
 982         ASSERT(pool_lock_held());
 983         ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
 984         if (ret == 0)
 985                 pool_sys_mod = gethrtime();
 986         return (ret);
 987 }
 988 
 989 static int
 990 pool_system_proprm(char *name)
 991 {
 992         int ret;
 993 
 994         ASSERT(pool_lock_held());
 995         ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
 996         if (ret == 0)
 997                 pool_sys_mod = gethrtime();
 998         return (ret);
 999 }
1000 
1001 static int
1002 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
1003 {
1004         pool_t *pool;
1005         int ret;
1006 
1007         ASSERT(pool_lock_held());
1008         if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
1009                 return (ESRCH);
1010         ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
1011         if (ret == 0)
1012                 pool_pool_mod = gethrtime();
1013         return (ret);
1014 }
1015 
1016 static int
1017 pool_pool_proprm(poolid_t poolid, char *name)
1018 {
1019         int ret;
1020         pool_t *pool;
1021 
1022         ASSERT(pool_lock_held());
1023         if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
1024                 return (ESRCH);
1025         ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
1026         if (ret == 0)
1027                 pool_pool_mod = gethrtime();
1028         return (ret);
1029 }
1030 
1031 int
1032 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
1033 {
1034         int ret;
1035 
1036         ASSERT(pool_lock_held());
1037         if (pool_state == POOL_DISABLED)
1038                 return (ENOTACTIVE);
1039         switch (class) {
1040         case PEC_SYSTEM:
1041                 ret = pool_system_propput(pair);
1042                 break;
1043         case PEC_POOL:
1044                 ret = pool_pool_propput((poolid_t)id, pair);
1045                 break;
1046         case PEC_RES_COMP:
1047                 switch (subclass) {
1048                 case PREC_PSET:
1049                         ret = pool_pset_propput((psetid_t)id, pair);
1050                         break;
1051                 default:
1052                         ret = EINVAL;
1053                 }
1054                 break;
1055         case PEC_RES_AGG:
1056                 ret = ENOTSUP;
1057                 break;
1058         case PEC_COMP:
1059                 switch (subclass) {
1060                 case PCEC_CPU:
1061                         ret = pool_cpu_propput((processorid_t)id, pair);
1062                         break;
1063                 default:
1064                         ret = EINVAL;
1065                 }
1066                 break;
1067         default:
1068                 ret = EINVAL;
1069         }
1070         return (ret);
1071 }
1072 
1073 int
1074 pool_proprm(int class, int subclass, id_t id, char *name)
1075 {
1076         int ret;
1077 
1078         ASSERT(pool_lock_held());
1079         if (pool_state == POOL_DISABLED)
1080                 return (ENOTACTIVE);
1081         switch (class) {
1082         case PEC_SYSTEM:
1083                 ret = pool_system_proprm(name);
1084                 break;
1085         case PEC_POOL:
1086                 ret = pool_pool_proprm((poolid_t)id, name);
1087                 break;
1088         case PEC_RES_COMP:
1089                 switch (subclass) {
1090                 case PREC_PSET:
1091                         ret = pool_pset_proprm((psetid_t)id, name);
1092                         break;
1093                 default:
1094                         ret = EINVAL;
1095                 }
1096                 break;
1097         case PEC_RES_AGG:
1098                 ret = ENOTSUP;
1099                 break;
1100         case PEC_COMP:
1101                 switch (subclass) {
1102                 case PCEC_CPU:
1103                         ret = pool_cpu_proprm((processorid_t)id, name);
1104                         break;
1105                 default:
1106                         ret = EINVAL;
1107                 }
1108                 break;
1109         default:
1110                 ret = EINVAL;
1111         }
1112         return (ret);
1113 }
1114 
1115 int
1116 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1117 {
1118         int ret;
1119         nvlist_t *nvl;
1120 
1121         ASSERT(pool_lock_held());
1122         if (pool_state == POOL_DISABLED)
1123                 return (ENOTACTIVE);
1124 
1125         (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1126 
1127         switch (class) {
1128         case PEC_SYSTEM:
1129         case PEC_POOL:
1130                 ret = EINVAL;
1131                 break;
1132         case PEC_RES_COMP:
1133                 switch (subclass) {
1134                 case PREC_PSET:
1135                         ret = pool_pset_propget((psetid_t)id, name, nvl);
1136                         break;
1137                 default:
1138                         ret = EINVAL;
1139                 }
1140                 break;
1141         case PEC_RES_AGG:
1142                 ret = ENOTSUP;
1143                 break;
1144         case PEC_COMP:
1145                 switch (subclass) {
1146                 case PCEC_CPU:
1147                         ret = pool_cpu_propget((processorid_t)id, name, nvl);
1148                         break;
1149                 default:
1150                         ret = EINVAL;
1151                 }
1152                 break;
1153         default:
1154                 ret = EINVAL;
1155         }
1156         if (ret == 0)
1157                 *nvlp = nvl;
1158         else
1159                 nvlist_free(nvl);
1160         return (ret);
1161 }
1162 
1163 /*
1164  * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
1165  * in case of failure in pool_do_bind().
1166  */
1167 static void
1168 pool_bind_wake(proc_t *p)
1169 {
1170         ASSERT(pool_lock_held());
1171 
1172         mutex_enter(&p->p_lock);
1173         ASSERT(p->p_poolflag & PBWAIT);
1174         if (p->p_poolcnt > 0) {
1175                 mutex_enter(&pool_barrier_lock);
1176                 pool_barrier_count -= p->p_poolcnt;
1177                 mutex_exit(&pool_barrier_lock);
1178         }
1179         p->p_poolflag &= ~PBWAIT;
1180         cv_signal(&p->p_poolcv);
1181         mutex_exit(&p->p_lock);
1182 }
1183 
1184 static void
1185 pool_bind_wakeall(proc_t **procs)
1186 {
1187         proc_t *p, **pp;
1188 
1189         ASSERT(pool_lock_held());
1190         for (pp = procs; (p = *pp) != NULL; pp++)
1191                 pool_bind_wake(p);
1192 }
1193 
1194 /*
1195  * Return the scheduling class for this pool, or
 *      POOL_CLASS_UNSET if not set, or
1197  *      POOL_CLASS_INVAL if set to an invalid class ID.
1198  */
1199 id_t
1200 pool_get_class(pool_t *pool)
1201 {
1202         char *name;
1203         id_t cid;
1204 
1205         ASSERT(pool_lock_held());
1206 
1207         if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1208             &name) == 0) {
1209                 if (getcidbyname(name, &cid) == 0)
1210                         return (cid);
1211                 else
1212                         return (POOL_CLASS_INVAL);
1213         }
1214         return (POOL_CLASS_UNSET);
1215 }
1216 
1217 /*
1218  * Move process to the new scheduling class.
1219  */
1220 static void
1221 pool_change_class(proc_t *p, id_t cid)
1222 {
1223         kthread_t *t;
1224         void *cldata;
1225         id_t oldcid;
1226         void **bufs;
1227         void **buf;
1228         int nlwp;
1229         int ret;
1230         int i;
1231 
1232         /*
1233          * Do not move kernel processes (such as zsched).
1234          */
1235         if (p->p_flag & SSYS)
1236                 return;
1237         /*
1238          * This process is in the pool barrier, so it can't possibly be
1239          * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
         * (for a possible agent LWP, which doesn't use the pool barrier) as
1241          * our upper bound.
1242          */
1243         nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
1244 
1245         /*
1246          * Pre-allocate scheduling class specific buffers before
1247          * grabbing p_lock.
1248          */
1249         bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
1250         for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1251                 ret = CL_ALLOC(buf, cid, KM_SLEEP);
1252                 ASSERT(ret == 0);
1253         }
1254 
1255         /*
1256          * Move threads one by one to the new scheduling class.
1257          * This never fails because we have all the right
1258          * privileges here.
1259          */
1260         mutex_enter(&p->p_lock);
1261         ASSERT(p->p_poolflag & PBWAIT);
1262         buf = bufs;
1263         t = p->p_tlist;
1264         ASSERT(t != NULL);
1265         do {
1266                 if (t->t_cid != cid) {
1267                         oldcid = t->t_cid;
1268                         cldata = t->t_cldata;
1269                         ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
1270                         ASSERT(ret == 0);
1271                         CL_EXITCLASS(oldcid, cldata);
1272                         schedctl_set_cidpri(t);
1273                         *buf++ = NULL;
1274                 }
1275         } while ((t = t->t_forw) != p->p_tlist);
1276         mutex_exit(&p->p_lock);
1277         /*
1278          * Free unused scheduling class specific buffers.
1279          */
1280         for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1281                 if (*buf != NULL) {
1282                         CL_FREE(cid, *buf);
1283                         *buf = NULL;
1284                 }
1285         }
1286         kmem_free(bufs, nlwp * sizeof (void *));
1287 }
1288 
1289 void
1290 pool_get_name(pool_t *pool, char **name)
1291 {
1292         ASSERT(pool_lock_held());
1293 
1294         (void) nvlist_lookup_string(pool->pool_props, "pool.name", name);
1295 
1296         ASSERT(strlen(*name) != 0);
1297 }
1298 
1299 
1300 /*
1301  * The meat of the bind operation.  The steps in pool_do_bind are:
1302  *
1303  * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
1304  *    such processes to an array.  For any interesting process that has
1305  *    threads inside the pool barrier set, increment a counter by the
1306  *    count of such threads.  Once PBWAIT is set on a process, that process
1307  *    will not disappear.
1308  *
 * 2) Wait for the counter from step 1 to drop to zero.  Any process which
1310  *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
1311  *    will decrement that counter before going to sleep, and the process
1312  *    calling pool_barrier_exit() which does the final decrement will wake us.
1313  *
1314  * 3) For each interesting process, perform a calculation on it to see if
1315  *    the bind will actually succeed.  This uses the following three
1316  *    resource-set-specific functions:
1317  *
1318  *    - int set_bind_start(procs, pool)
1319  *
1320  *      Determine whether the given array of processes can be bound to the
1321  *      resource set associated with the given pool.  If it can, take and hold
1322  *      any locks necessary to ensure that the operation will succeed, and
1323  *      make any necessary reservations in the target resource set.  If it
1324  *      can't, return failure with no reservations made and no new locks held.
1325  *
1326  *    - void set_bind_abort(procs, pool)
1327  *
1328  *      set_bind_start() has completed successfully, but another resource set's
1329  *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
1330  *      any reservations made and drop any locks acquired by our
1331  *      set_bind_start().
1332  *
1333  *    - void set_bind_finish(void)
1334  *
1335  *      The bind has completed successfully.  The processes have been released,
1336  *      and the reservation acquired in set_bind_start() has been depleted as
1337  *      the processes have finished their bindings.  Drop any locks acquired by
1338  *      set_bind_start().
1339  *
1340  * 4) If we've decided that we can proceed with the bind, iterate through
1341  *    the list of interesting processes, grab the necessary locks (which
1342  *    may differ per resource set), perform the bind, and ASSERT that it
1343  *    succeeds.  Once a process has been rebound, it can be awakened.
1344  *
1345  * The operations from step 4 must be kept in sync with anything which might
1346  * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
1347  * are thus located in the same source files as the associated bind operations.
1348  */
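
/*
 * Schematically, and using the generic set_bind_start/abort/finish names
 * from the comment above (which are descriptive, not actual symbols), the
 * tail of the operation looks like this sketch:
 *
 *      if (set_bind_start(procs, pool) != 0) {
 *              set_bind_abort(procs, pool);    (only for resource sets
 *                                              whose start succeeded)
 *              pool_bind_wakeall(procs);
 *              return (error);
 *      }
 *      for (pp = procs; (p = *pp) != NULL; pp++) {
 *              ... bind p and its threads to the pool's resource sets ...
 *              pool_bind_wake(p);
 *      }
 *      set_bind_finish();
 */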
1349 int
1350 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
1351 {
1352         extern uint_t nproc;
1353         klwp_t *lwp = ttolwp(curthread);
1354         proc_t **pp, **procs;
1355         proc_t *prstart;
1356         int procs_count = 0;
1357         kproject_t *kpj;
1358         procset_t set;
1359         zone_t *zone;
1360         int procs_size;
1361         int rv = 0;
1362         proc_t *p;
1363         id_t cid = -1;
1364 
1365         ASSERT(pool_lock_held());
1366 
1367         if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
1368                 return (EINVAL);
1369 
1370         if (idtype == P_ZONEID) {
1371                 zone = zone_find_by_id(id);
1372                 if (zone == NULL)
1373                         return (ESRCH);
1374                 if (zone_status_get(zone) > ZONE_IS_RUNNING) {
1375                         zone_rele(zone);
1376                         return (EBUSY);
1377                 }
1378         }
1379 
1380         if (idtype == P_PROJID) {
1381                 kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
1382                 if (kpj == NULL)
1383                         return (ESRCH);
1384                 mutex_enter(&kpj->kpj_poolbind);
1385         }
1386 
1387         if (idtype == P_PID) {
1388                 /*
1389                  * Fast-path for a single process case.
1390                  */
1391                 procs_size = 2; /* procs is NULL-terminated */
1392                 procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
1393                 mutex_enter(&pidlock);
1394         } else {
1395                 /*
1396                  * We will need enough slots for proc_t pointers for as many as
1397                  * twice the number of currently running processes (assuming
1398                  * that each one could be in fork() creating a new child).
1399                  */
1400                 for (;;) {
1401                         procs_size = nproc * 2;
1402                         procs = kmem_zalloc(procs_size * sizeof (proc_t *),
1403                             KM_SLEEP);
1404                         mutex_enter(&pidlock);
1405 
1406                         if (nproc * 2 <= procs_size)
1407                                 break;
1408                         /*
1409                          * If nproc has changed, try again.
1410                          */
1411                         mutex_exit(&pidlock);
1412                         kmem_free(procs, procs_size * sizeof (proc_t *));
1413                 }
1414         }
1415 
1416         if (id == P_MYID)
1417                 id = getmyid(idtype);
1418         setprocset(&set, POP_AND, idtype, id, P_ALL, 0);
1419 
1420         /*
1421          * Do a first scan, and select target processes.
1422          */
1423         if (idtype == P_PID)
1424                 prstart = prfind(id);
1425         else
1426                 prstart = practive;
1427         for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
1428                 mutex_enter(&p->p_lock);
1429                 /*
                 * Skip processes that don't match our (id, idtype) set or
                 * are on their way to becoming zombies.  Skip kernel
                 * processes from the global zone.
1433                  */
1434                 if (procinset(p, &set) == 0 ||
1435                     p->p_poolflag & PEXITED ||
1436                     ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
1437                         mutex_exit(&p->p_lock);
1438                         continue;
1439                 }
1440                 if (!INGLOBALZONE(p)) {
1441                         switch (idtype) {
1442                         case P_PID:
1443                         case P_TASKID:
1444                                 /*
1445                                  * Can't bind processes or tasks
1446                                  * in local zones to pools.
1447                                  */
1448                                 mutex_exit(&p->p_lock);
1449                                 mutex_exit(&pidlock);
1450                                 pool_bind_wakeall(procs);
1451                                 rv = EINVAL;
1452                                 goto out;
1453                         case P_PROJID:
1454                                 /*
1455                                  * Only projects in the global
1456                                  * zone can be rebound.
1457                                  */
1458                                 mutex_exit(&p->p_lock);
1459                                 continue;
1460                         case P_POOLID:
1461                                 /*
1462                                  * When rebinding pools, processes can be
1463                                  * in different zones.
1464                                  */
1465                                 break;
1466                         }
1467                 }
1468 
1469                 p->p_poolflag |= PBWAIT;
1470                 /*
1471                  * If some threads in this process are inside the pool
1472                  * barrier, add them to pool_barrier_count, as we have
1473                  * to wait for all of them to exit the barrier.
1474                  */
1475                 if (p->p_poolcnt > 0) {
1476                         mutex_enter(&pool_barrier_lock);
1477                         pool_barrier_count += p->p_poolcnt;
1478                         mutex_exit(&pool_barrier_lock);
1479                 }
1480                 ASSERT(pp < &procs[procs_size]);
1481                 *pp++ = p;
1482                 procs_count++;
1483                 mutex_exit(&p->p_lock);
1484 
1485                 /*
1486                  * We just found our process, so if we're only rebinding a
1487                  * single process then get out of this loop.
1488                  */
1489                 if (idtype == P_PID)
1490                         break;
1491         }
1492         *pp = NULL;     /* cap off the end of the array */
1493         mutex_exit(&pidlock);
1494 
1495         /*
         * Wait for the relevant processes to stop, either before they try to
         * enter the barrier or at the exit from the barrier.  Make sure that
         * we do not get stopped here while we're holding pool_lock.  If we
         * were requested to stop or got a signal, return EAGAIN to let the
         * library know that it needs to retry.
1501          */
1502         mutex_enter(&pool_barrier_lock);
1503         lwp->lwp_nostop++;
1504         while (pool_barrier_count > 0) {
1505                 (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
1506                 if (pool_barrier_count > 0) {
1507                         /*
1508                          * We either got a signal or were requested to
1509                          * stop by /proc.  Bail out with EAGAIN.  If we were
1510                          * requested to stop, we'll stop in post_syscall()
1511                          * on our way back to userland.
1512                          */
1513                         mutex_exit(&pool_barrier_lock);
1514                         pool_bind_wakeall(procs);
1515                         lwp->lwp_nostop--;
1516                         rv = EAGAIN;
1517                         goto out;
1518                 }
1519         }
1520         lwp->lwp_nostop--;
1521         mutex_exit(&pool_barrier_lock);
1522 
1523         if (idtype == P_PID) {
1524                 if ((p = *procs) == NULL)
1525                         goto skip;
1526                 mutex_enter(&p->p_lock);
1527                 /* Drop the process if it is exiting */
1528                 if (p->p_poolflag & PEXITED) {
1529                         mutex_exit(&p->p_lock);
1530                         pool_bind_wake(p);
1531                         procs_count--;
1532                 } else
1533                         mutex_exit(&p->p_lock);
1534                 goto skip;
1535         }
1536 
1537         /*
1538          * Do another run, and drop processes that were inside the barrier
1539          * in exit(); once they have reached pool_barrier_exit() they are of
1540          * no further interest to us.  Pick up child processes that were
1541          * created by fork() but didn't exist during our first scan.
1542          * Their parents are now stopped at pool_barrier_exit() in cfork().
1543          */
1544         mutex_enter(&pidlock);
1545         for (pp = procs; (p = *pp) != NULL; pp++) {
1546                 mutex_enter(&p->p_lock);
1547                 if (p->p_poolflag & PEXITED) {
1548                         ASSERT(p->p_lwpcnt == 0);
1549                         mutex_exit(&p->p_lock);
1550                         pool_bind_wake(p);
1551                         /* flip w/last non-NULL slot */
1552                         *pp = procs[procs_count - 1];
1553                         procs[procs_count - 1] = NULL;
1554                         procs_count--;
1555                         pp--;                   /* try this slot again */
1556                         continue;
1557                 } else
1558                         mutex_exit(&p->p_lock);
1559                 /*
1560                  * Look at the child and check if it should be rebound also.
1561                  * We're holding pidlock, so it is safe to reference p_child.
1562                  */
1563                 if ((p = p->p_child) == NULL)
1564                         continue;
1565 
1566                 mutex_enter(&p->p_lock);
1567 
1568                 /*
1569                  * Skip system processes, and skip non-global-zone children
1570                  * unless we're rebinding by zone or pool id.
1571                  */
1572                 if ((!INGLOBALZONE(p) && idtype != P_ZONEID &&
1573                     idtype != P_POOLID) || p->p_flag & SSYS) {
1574                         mutex_exit(&p->p_lock);
1575                         continue;
1576                 }
1577 
1578                 /*
1579                  * If the child process has already been created by fork(),
1580                  * has not exited, and has not already been added to the
1581                  * list, then add it now.  We will hit this process again
1582                  * (since we stick it at the end of the procs list) but it
1583                  * will be ignored because it will have the PBWAIT flag set.
1584                  */
1585                 if (procinset(p, &set) &&
1586                     !(p->p_poolflag & PEXITED) &&
1587                     !(p->p_poolflag & PBWAIT)) {
1588                         ASSERT(p->p_child == NULL); /* no child of a child */
1589                         procs[procs_count] = p;
1590                         procs[procs_count + 1] = NULL;
1591                         procs_count++;
1592                         p->p_poolflag |= PBWAIT;
1593                 }
1594                 mutex_exit(&p->p_lock);
1595         }
1596         mutex_exit(&pidlock);
1597 skip:
1598         /*
1599          * If there are no processes to rebind, then return ESRCH, unless
1600          * we're associating a pool with a new resource set, destroying it,
1601          * or binding a zone to a pool.
1602          */
1603         if (procs_count == 0) {
1604                 if (idtype == P_POOLID || idtype == P_ZONEID)
1605                         rv = 0;
1606                 else
1607                         rv = ESRCH;
1608                 goto out;
1609         }
1610 
1611 #ifdef DEBUG
1612         /*
1613          * All processes in the array should have PBWAIT set, and none
1614          * should be in the critical section. Thus, although p_poolflag
1615          * and p_poolcnt are protected by p_lock, their ASSERTions below
1616          * should be stable without it. procinset(), however, ASSERTs that
1617          * the p_lock is held upon entry.
1618          */
1619         for (pp = procs; (p = *pp) != NULL; pp++) {
1620                 int in_set;
1621 
1622                 mutex_enter(&p->p_lock);
1623                 in_set = procinset(p, &set);
1624                 mutex_exit(&p->p_lock);
1625 
1626                 ASSERT(in_set);
1627                 ASSERT(p->p_poolflag & PBWAIT);
1628                 ASSERT(p->p_poolcnt == 0);
1629         }
1630 #endif
1631 
1632         /*
1633          * Check whether the processor set rebinding is going to succeed.
1634          */
1635         if ((flags & POOL_BIND_PSET) &&
1636             (rv = pset_bind_start(procs, pool)) != 0) {
1637                 pool_bind_wakeall(procs);
1638                 goto out;
1639         }
1640 
1641         /*
1642          * At this point, all bind operations should succeed.
1643          */
1644         for (pp = procs; (p = *pp) != NULL; pp++) {
1645                 if (flags & POOL_BIND_PSET) {
1646                         psetid_t psetid = pool->pool_pset->pset_id;
1647                         void *zonebuf;
1648                         void *projbuf;
1649 
1650                         /*
1651                          * Pre-allocate the FSS buffers (per-project and
1652                          * per-zone buffers for the new pset) in case this
1653                          * is the first thread from its current project or
1654                          * zone getting bound to this processor set.
1655                          */
1656                         projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
1657                         zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);
1658 
1659                         mutex_enter(&pidlock);
1660                         mutex_enter(&p->p_lock);
1661                         pool_pset_bind(p, psetid, projbuf, zonebuf);
1662                         mutex_exit(&p->p_lock);
1663                         mutex_exit(&pidlock);
1664                         /*
1665                          * Free the buffers pre-allocated above if they
1666                          * weren't actually used.
1667                          */
1668                         fss_freebuf(projbuf, FSS_ALLOC_PROJ);
1669                         fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
1670                 }
1671                 /*
1672                  * Now let's change the scheduling class of this
1673                  * process if our target pool has one defined.
1674                  */
1675                 if (cid != POOL_CLASS_UNSET)
1676                         pool_change_class(p, cid);
1677 
1678                 /*
1679                  * It is safe to reference p_pool here without holding
1680                  * p_lock because it cannot change underneath us.  We're
1681                  * holding pool_lock here, so nobody else can be moving
1682                  * this process between pools.  If process "p" were
1683                  * exiting, we're guaranteed that it would be blocked at
1684                  * pool_barrier_enter() in exit().  Otherwise, it would have
1685                  * been skipped by one of our scans of the practive list
1686                  * as a process with the PEXITED flag set.
1687                  */
1688                 if (p->p_pool != pool) {
1689                         ASSERT(p->p_pool->pool_ref > 0);
1690                         atomic_add_32(&p->p_pool->pool_ref, -1);
1691                         p->p_pool = pool;
1692                         atomic_add_32(&p->p_pool->pool_ref, 1);
1693                 }
1694                 /*
1695                  * Okay, we've tortured this guy enough.
1696                  * Let this poor process go now.
1697                  */
1698                 pool_bind_wake(p);
1699         }
1700         if (flags & POOL_BIND_PSET)
1701                 pset_bind_finish();
1702 
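        /*
         * Common exit path: drop the project or zone hold taken earlier,
         * record the new pool for the zone on success, and free the
         * temporary procs array.
         */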
1703 out:    switch (idtype) {
1704         case P_PROJID:
1705                 ASSERT(kpj != NULL);
1706                 mutex_exit(&kpj->kpj_poolbind);
1707                 project_rele(kpj);
1708                 break;
1709         case P_ZONEID:
1710                 if (rv == 0) {
1711                         mutex_enter(&cpu_lock);
1712                         zone_pool_set(zone, pool);
1713                         mutex_exit(&cpu_lock);
1714                 }
1715                 zone->zone_pool_mod = gethrtime();
1716                 zone_rele(zone);
1717                 break;
1718         }
1719 
1720         kmem_free(procs, procs_size * sizeof (proc_t *));
1721         ASSERT(pool_barrier_count == 0);
1722         return (rv);
1723 }
1724 
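/*
 * Register a callback to be invoked when pool events are dispatched.
 *
 * A minimal, illustrative sketch of a consumer follows; the mysub_* names
 * are hypothetical and not part of this file:
 *
 *	static void
 *	mysub_pool_event(pool_event_t what, poolid_t id, void *arg)
 *	{
 *		... react to the change affecting pool "id" ...
 *	}
 *
 *	static pool_event_cb_t mysub_cb;
 *
 *	mysub_cb.pec_func = mysub_pool_event;
 *	mysub_cb.pec_arg = NULL;
 *	pool_event_cb_register(&mysub_cb);
 *
 * The callback list is protected by pool_event_cb_lock, and callbacks run
 * from taskq context via pool_event_notify(), i.e. without pool_lock()
 * held.
 */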
1725 void
1726 pool_event_cb_register(pool_event_cb_t *cb)
1727 {
1728         ASSERT(!pool_lock_held() || panicstr);
1729         ASSERT(cb->pec_func != NULL);
1730 
1731         mutex_enter(&pool_event_cb_lock);
1732         if (!pool_event_cb_init) {
1733                 list_create(&pool_event_cb_list,  sizeof (pool_event_cb_t),
1734                     offsetof(pool_event_cb_t, pec_list));
1735                 pool_event_cb_init = B_TRUE;
1736         }
1737         list_insert_tail(&pool_event_cb_list, cb);
1738         mutex_exit(&pool_event_cb_lock);
1739 }
1740 
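/*
 * Remove a previously registered pool event callback.  Since
 * pool_event_notify() holds pool_event_cb_lock while it walks the
 * callback list, unregistration blocks until any notification that is
 * currently in progress has completed.
 */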
1741 void
1742 pool_event_cb_unregister(pool_event_cb_t *cb)
1743 {
1744         ASSERT(!pool_lock_held() || panicstr);
1745 
1746         mutex_enter(&pool_event_cb_lock);
1747         list_remove(&pool_event_cb_list, cb);
1748         mutex_exit(&pool_event_cb_lock);
1749 }
1750 
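/*
 * Dispatch record passed from pool_event_dispatch() to pool_event_notify()
 * via the pool event taskq: which event occurred and which pool it applies
 * to.
 */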
1751 typedef struct {
1752         pool_event_t    tqd_what;
1753         poolid_t        tqd_id;
1754 } pool_tqd_t;
1755 
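/*
 * Taskq callback: deliver a single pool event to every registered callback,
 * then free the dispatch record allocated by pool_event_dispatch().  Runs
 * without pool_lock() held.
 */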
1756 void
1757 pool_event_notify(void *arg)
1758 {
1759         pool_tqd_t      *tqd = (pool_tqd_t *)arg;
1760         pool_event_cb_t *cb;
1761 
1762         ASSERT(!pool_lock_held() || panicstr);
1763 
1764         mutex_enter(&pool_event_cb_lock);
1765         for (cb = list_head(&pool_event_cb_list); cb != NULL;
1766             cb = list_next(&pool_event_cb_list, cb)) {
1767                 cb->pec_func(tqd->tqd_what, tqd->tqd_id, cb->pec_arg);
1768         }
1769         mutex_exit(&pool_event_cb_lock);
1770         kmem_free(tqd, sizeof (*tqd));
1771 }
1772 
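/*
 * Announce a pool configuration change to interested subsystems.  Called
 * with pool_lock() held; the callback invocations themselves are deferred
 * to a single-threaded taskq so that they do not run in the context of the
 * thread holding pool_lock().
 */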
1773 void
1774 pool_event_dispatch(pool_event_t what, poolid_t id)
1775 {
1776         pool_tqd_t *tqd = NULL;
1777 
1778         ASSERT(pool_lock_held());
1779 
1780         if (pool_event_cb_taskq == NULL) {
1781                 pool_event_cb_taskq = taskq_create("pool_event_cb_taskq", 1,
1782                     -1, 1, 1, TASKQ_PREPOPULATE);
1783         }
1784 
1785         tqd = kmem_alloc(sizeof (*tqd), KM_SLEEP);
1786         tqd->tqd_what = what;
1787         tqd->tqd_id = id;
1788 
1789         (void) taskq_dispatch(pool_event_cb_taskq, pool_event_notify, tqd,
1790             TQ_SLEEP);
1791 }