Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/os/pool.c
+++ new/usr/src/uts/common/os/pool.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 #include <sys/pool.h>
27 27 #include <sys/pool_impl.h>
28 28 #include <sys/pool_pset.h>
29 29 #include <sys/id_space.h>
30 30 #include <sys/mutex.h>
31 31 #include <sys/nvpair.h>
32 32 #include <sys/cpuvar.h>
33 33 #include <sys/errno.h>
34 34 #include <sys/cmn_err.h>
35 35 #include <sys/systm.h>
36 36 #include <sys/proc.h>
37 37 #include <sys/fss.h>
38 38 #include <sys/class.h>
39 39 #include <sys/exacct.h>
40 40 #include <sys/utsname.h>
41 41 #include <sys/procset.h>
42 42 #include <sys/atomic.h>
43 43 #include <sys/zone.h>
44 44 #include <sys/policy.h>
45 45 #include <sys/schedctl.h>
46 46 #include <sys/taskq.h>
47 47
48 48 /*
49 49 * RESOURCE POOLS
50 50 *
51 51 * The resource pools facility brings together process-bindable resources into
52 52 * a common abstraction called a pool. Processor sets and other entities can
53 53 * be configured, grouped, and labelled such that workload components can be
54 54 * associated with a subset of a system's total resources.
55 55 *
56 56 * When disabled, the pools facility is "invisible". All processes belong
57 57 * to the same pool (pool_default), and processor sets can be managed through
58 58 * the old pset() system call. When enabled, processor sets can only be
59 59 * managed via the pools facility. New pools can be created and associated
60 60 * with processor sets. Processes can be bound to pools which have non-empty
61 61 * resource sets.
62 62 *
63 63 * Locking: pool_lock() protects global pools state and must be called
64 64 * before modifying the configuration, or when taking a snapshot of the
65 65 * configuration. If pool_lock_intr() is used, the operation may be
66 66 * interrupted by a signal or a request.
67 67 *
68 68 * To prevent processes from being rebound between pools while they are
69 69 * in the middle of an operation which affects resource set bindings, such
70 70 * operations must be surrounded by calls to pool_barrier_enter() and
71 71 * pool_barrier_exit(). This mechanism guarantees that such processes will
72 72 * be stopped either at the beginning or at the end of the barrier so that
73 73 * the rebind operation can atomically bind the process and its threads
74 74 * to new resource sets, and then let process run again.
75 75 *
76 76 * Lock ordering with respect to other locks is as follows:
77 77 *
78 78 * pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
79 79 *
80 80 * Most static and global variables defined in this file are protected
81 81 * by calling pool_lock().
82 82 *
83 83 * The operation that binds tasks and projects to pools is atomic. That is,
84 84 * either all processes in a given task or a project will be bound to a
85 85 * new pool, or (in case of an error) they will be all left bound to the
86 86 * old pool. Processes in a given task or a given project can only be bound to
87 87 * different pools if they were rebound individually one by one as single
88 88 * processes. Threads or LWPs of the same process do not have pool bindings,
89 89 * and are bound to the same resource sets associated with the resource pool
90 90 * of that process.
91 91 *
92 92 * The following picture shows one possible pool configuration with three
93 93 * pools and three processor sets. Note that processor set "foo" is not
94 94 * associated with any pools and therefore cannot have any processes
95 95 * bound to it. Two pools (default and foo) are associated with the
96 96 * same processor set (default). Also, note that processes in Task 2
97 97 * are bound to different pools.
98 98 *
99 99 *
100 100 * Processor Sets
101 101 * +---------+
102 102 * +--------------+========================>| default |
103 103 * a| | +---------+
104 104 * s| | ||
105 105 * s| | +---------+
106 106 * o| | | foo |
107 107 * c| | +---------+
108 108 * i| | ||
109 109 * a| | +---------+
110 110 * t| | +------>| bar |
111 111 * e| | | +---------+
112 112 * d| | |
113 113 * | | |
114 114 * +---------+ +---------+ +---------+
115 115 * Pools | default |======| foo |======| bar |
116 116 * +---------+ +---------+ +---------+
117 117 * @ @ @ @ @ @
118 118 * b| | | | | |
119 119 * o| | | | | |
120 120 * u| +-----+ | +-------+ | +---+
121 121 * n| | | | | |
122 122 * ....d|........|......|......|.........|.......|....
123 123 * : | :: | | | :: | | :
124 124 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ :
125 125 * Processes : | p | :: | p | | p | | p | :: | p |...| p | :
126 126 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ :
127 127 * :........::......................::...............:
128 128 * Task 1 Task 2 Task N
129 129 * | | |
130 130 * | | |
131 131 * | +-----------+ | +-----------+
132 132 * +--| Project 1 |--+ | Project N |
133 133 * +-----------+ +-----------+
134 134 *
135 135 * This is just an illustration of relationships between processes, tasks,
136 136 * projects, pools, and processor sets. New types of resource sets will be
137 137 * added in the future.
138 138 */
139 139
140 140 pool_t *pool_default; /* default pool which always exists */
141 141 int pool_count; /* number of pools created on this system */
142 142 int pool_state; /* pools state -- enabled/disabled */
143 143 void *pool_buf; /* pre-commit snapshot of the pools state */
144 144 size_t pool_bufsz; /* size of pool_buf */
145 145 static hrtime_t pool_pool_mod; /* last modification time for pools */
146 146 static hrtime_t pool_sys_mod; /* last modification time for system */
147 147 static nvlist_t *pool_sys_prop; /* system properties */
148 148 static id_space_t *pool_ids; /* pool ID space */
149 149 static list_t pool_list; /* doubly-linked list of pools */
150 150 static kmutex_t pool_mutex; /* protects pool_busy_* */
151 151 static kcondvar_t pool_busy_cv; /* waiting for "pool_lock" */
152 152 static kthread_t *pool_busy_thread; /* thread holding "pool_lock" */
153 153 static kmutex_t pool_barrier_lock; /* synch. with pool_barrier_* */
154 154 static kcondvar_t pool_barrier_cv; /* synch. with pool_barrier_* */
155 155 static int pool_barrier_count; /* synch. with pool_barrier_* */
156 156 static list_t pool_event_cb_list; /* pool event callbacks */
157 157 static boolean_t pool_event_cb_init = B_FALSE;
158 158 static kmutex_t pool_event_cb_lock;
159 159 static taskq_t *pool_event_cb_taskq = NULL;
160 160
161 161 void pool_event_dispatch(pool_event_t, poolid_t);
162 162
163 163 /*
164 164 * Boot-time pool initialization.
165 165 */
void
pool_init(void)
{
	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);

	/*
	 * Initialize default pool.
	 */
	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
	pool_default->pool_id = POOL_DEFAULT;
	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
	list_insert_head(&pool_list, pool_default);

	/*
	 * Initialize plugins for resource sets.
	 */
	pool_pset_init();
	pool_count = 1;
	/* Bind proc0 and the global zone to the default pool. */
	p0.p_pool = pool_default;
	global_zone->zone_pool = pool_default;
	pool_default->pool_ref = 1;
}
188 188
189 189 /*
190 190 * Synchronization routines.
191 191 *
192 192 * pool_lock is only called from syscall-level routines (processor_bind(),
193 193 * pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long
194 194 * periods of time, including across sleeping operations, so we allow its
195 195 * acquisition to be interruptible.
196 196 *
197 197 * The current thread that owns the "lock" is stored in the variable
198 198 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
199 199 */
void
pool_lock(void)
{
	/*
	 * pool_mutex only protects the handoff of the pool "lock";
	 * ownership itself is recorded in pool_busy_thread so it can
	 * be held across sleeping operations.
	 */
	mutex_enter(&pool_mutex);
	ASSERT(!pool_lock_held());
	while (pool_busy_thread != NULL)
		cv_wait(&pool_busy_cv, &pool_mutex);
	pool_busy_thread = curthread;
	mutex_exit(&pool_mutex);
}
210 210
int
pool_lock_intr(void)
{
	mutex_enter(&pool_mutex);
	ASSERT(!pool_lock_held());
	while (pool_busy_thread != NULL) {
		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
			/*
			 * Interrupted by a signal.  Wake another waiter
			 * in case the signal we consumed was meant to
			 * hand off the "lock", then give up.
			 */
			cv_signal(&pool_busy_cv);
			mutex_exit(&pool_mutex);
			return (1);
		}
	}
	pool_busy_thread = curthread;
	mutex_exit(&pool_mutex);
	/* Returns 0 on success, 1 if interrupted. */
	return (0);
}
227 227
228 228 int
229 229 pool_lock_held(void)
230 230 {
231 231 return (pool_busy_thread == curthread);
232 232 }
233 233
void
pool_unlock(void)
{
	mutex_enter(&pool_mutex);
	ASSERT(pool_lock_held());
	pool_busy_thread = NULL;
	/* Hand the pool "lock" off to one of the waiters, if any. */
	cv_signal(&pool_busy_cv);
	mutex_exit(&pool_mutex);
}
243 243
244 244 /*
245 245 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
246 246 * with pool_do_bind().
247 247 *
248 248 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
249 249 * operations which modify pool or pset associations. They can be called
250 250 * while the process is multi-threaded. In the common case, when current
251 251 * process is not being rebound (PBWAIT flag is not set), these functions
252 252 * will be just incrementing and decrementing reference counts.
253 253 */
void
pool_barrier_enter(void)
{
	proc_t *p = curproc;

	ASSERT(MUTEX_HELD(&p->p_lock));
	/* If this process is being rebound (PBWAIT set), wait it out. */
	while (p->p_poolflag & PBWAIT)
		cv_wait(&p->p_poolcv, &p->p_lock);
	/* Count of threads of this process inside the barrier. */
	p->p_poolcnt++;
}
264 264
void
pool_barrier_exit(void)
{
	proc_t *p = curproc;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(p->p_poolcnt > 0);
	p->p_poolcnt--;
	if (p->p_poolflag & PBWAIT) {
		/*
		 * A rebind is waiting for processes to reach the
		 * barrier; account for this one and wake the rebinder
		 * once everyone has arrived.
		 */
		mutex_enter(&pool_barrier_lock);
		ASSERT(pool_barrier_count > 0);
		pool_barrier_count--;
		if (pool_barrier_count == 0)
			cv_signal(&pool_barrier_cv);
		mutex_exit(&pool_barrier_lock);
		/* Stay stopped here until the rebind completes. */
		while (p->p_poolflag & PBWAIT)
			cv_wait(&p->p_poolcv, &p->p_lock);
	}
}
284 284
285 285 /*
286 286 * Enable pools facility.
287 287 */
static int
pool_enable(void)
{
	int ret;

	ASSERT(pool_lock_held());
	ASSERT(pool_count == 1);	/* only pool_default may exist */

	ret = pool_pset_enable();
	if (ret != 0)
		return (ret);
	/* Populate the default system property list. */
	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
	(void) nvlist_add_string(pool_sys_prop, "system.name",
	    "default");
	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
	(void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
	    "wt-load");

	/* Populate the default pool's property list. */
	(void) nvlist_alloc(&pool_default->pool_props,
	    NV_UNIQUE_NAME, KM_SLEEP);
	(void) nvlist_add_string(pool_default->pool_props,
	    "pool.name", "pool_default");
	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
	(void) nvlist_add_int64(pool_default->pool_props,
	    "pool.importance", 1);
	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
	    pool_default->pool_id);

	/* Record when the configuration was last modified. */
	pool_sys_mod = pool_pool_mod = gethrtime();

	return (ret);
}
324 324
325 325 /*
326 326 * Disable pools facility.
327 327 */
328 328 static int
329 329 pool_disable(void)
330 330 {
331 331 int ret;
332 332
333 333 ASSERT(pool_lock_held());
334 334
335 335 if (pool_count > 1) /* must destroy all pools first */
336 336 return (EBUSY);
337 337
338 338 ret = pool_pset_disable();
339 339 if (ret != 0)
340 340 return (ret);
341 341 if (pool_sys_prop != NULL) {
342 342 nvlist_free(pool_sys_prop);
343 343 pool_sys_prop = NULL;
344 344 }
345 345 if (pool_default->pool_props != NULL) {
346 346 nvlist_free(pool_default->pool_props);
347 347 pool_default->pool_props = NULL;
348 348 }
349 349 return (0);
350 350 }
351 351
352 352 pool_t *
353 353 pool_lookup_pool_by_name(char *name)
354 354 {
355 355 pool_t *pool = pool_default;
356 356 char *p;
357 357
358 358 ASSERT(pool_lock_held());
359 359 for (pool = list_head(&pool_list); pool;
360 360 pool = list_next(&pool_list, pool)) {
361 361 if (nvlist_lookup_string(pool->pool_props,
362 362 "pool.name", &p) == 0 && strcmp(name, p) == 0)
363 363 return (pool);
364 364 }
365 365 return (NULL);
366 366 }
367 367
368 368 pool_t *
369 369 pool_lookup_pool_by_id(poolid_t poolid)
370 370 {
371 371 pool_t *pool = pool_default;
372 372
373 373 ASSERT(pool_lock_held());
374 374 for (pool = list_head(&pool_list); pool;
375 375 pool = list_next(&pool_list, pool)) {
376 376 if (pool->pool_id == poolid)
377 377 return (pool);
378 378 }
379 379 return (NULL);
380 380 }
381 381
382 382 pool_t *
383 383 pool_lookup_pool_by_pset(int id)
384 384 {
385 385 pool_t *pool = pool_default;
386 386 psetid_t psetid = (psetid_t)id;
387 387
388 388 ASSERT(pool_lock_held());
389 389 for (pool = list_head(&pool_list); pool != NULL;
390 390 pool = list_next(&pool_list, pool)) {
391 391 if (pool->pool_pset->pset_id == psetid)
392 392 return (pool);
393 393 }
394 394 return (NULL);
395 395 }
396 396
397 397 /*
398 398 * Create new pool, associate it with default resource sets, and give
399 399 * it a temporary name.
400 400 */
401 401 static int
402 402 pool_pool_create(poolid_t *poolid)
403 403 {
404 404 pool_t *pool;
405 405 char pool_name[40];
406 406
407 407 ASSERT(pool_lock_held());
408 408
409 409 pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
410 410 pool->pool_id = *poolid = id_alloc(pool_ids);
411 411 pool->pool_pset = pool_pset_default;
412 412 pool_pset_default->pset_npools++;
413 413 list_insert_tail(&pool_list, pool);
414 414 (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
415 415 (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
416 416 (void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
417 417 pool_pool_mod = gethrtime();
418 418 (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
419 419 pool_pool_mod);
420 420 (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
421 421 pool_count++;
422 422 return (0);
423 423 }
424 424
/* Argument for pool_destroy_zone_cb(): rebind zones from "old" to "new". */
struct destroy_zone_arg {
	pool_t *old;
	pool_t *new;
};
429 429
430 430 /*
431 431 * Update pool pointers for zones that are currently bound to pool "old"
432 432 * to be bound to pool "new".
433 433 */
434 434 static int
435 435 pool_destroy_zone_cb(zone_t *zone, void *arg)
436 436 {
437 437 struct destroy_zone_arg *dza = arg;
438 438
439 439 ASSERT(pool_lock_held());
440 440 ASSERT(MUTEX_HELD(&cpu_lock));
441 441
442 442 if (zone_pool_get(zone) == dza->old)
443 443 zone_pool_set(zone, dza->new);
444 444 return (0);
445 445 }
446 446
447 447 /*
448 448 * Destroy specified pool, and rebind all processes in it
449 449 * to the default pool.
450 450 */
451 451 static int
452 452 pool_pool_destroy(poolid_t poolid)
453 453 {
454 454 pool_t *pool;
455 455 int ret;
456 456
457 457 ASSERT(pool_lock_held());
458 458
459 459 if (poolid == POOL_DEFAULT)
460 460 return (EINVAL);
461 461 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
462 462 return (ESRCH);
463 463 ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
464 464 if (ret == 0) {
465 465 struct destroy_zone_arg dzarg;
466 466
467 467 dzarg.old = pool;
468 468 dzarg.new = pool_default;
469 469 mutex_enter(&cpu_lock);
470 470 ret = zone_walk(pool_destroy_zone_cb, &dzarg);
471 471 mutex_exit(&cpu_lock);
472 472 ASSERT(ret == 0);
473 473 ASSERT(pool->pool_ref == 0);
474 474 (void) nvlist_free(pool->pool_props);
475 475 id_free(pool_ids, pool->pool_id);
476 476 pool->pool_pset->pset_npools--;
477 477 list_remove(&pool_list, pool);
478 478 pool_count--;
479 479 pool_pool_mod = gethrtime();
480 480 kmem_free(pool, sizeof (pool_t));
481 481 }
482 482 return (ret);
483 483 }
484 484
485 485 /*
486 486 * Create new pool or resource set.
487 487 */
488 488 int
489 489 pool_create(int class, int subclass, id_t *id)
490 490 {
491 491 int ret;
492 492
493 493 ASSERT(pool_lock_held());
494 494 if (pool_state == POOL_DISABLED)
495 495 return (ENOTACTIVE);
496 496 switch (class) {
497 497 case PEC_POOL:
498 498 ret = pool_pool_create((poolid_t *)id);
499 499 break;
500 500 case PEC_RES_COMP:
501 501 switch (subclass) {
502 502 case PREC_PSET:
503 503 ret = pool_pset_create((psetid_t *)id);
504 504 break;
505 505 default:
506 506 ret = EINVAL;
507 507 }
508 508 break;
509 509 case PEC_RES_AGG:
510 510 ret = ENOTSUP;
511 511 break;
512 512 default:
513 513 ret = EINVAL;
514 514 }
515 515 return (ret);
516 516 }
517 517
518 518 /*
519 519 * Destroy an existing pool or resource set.
520 520 */
521 521 int
522 522 pool_destroy(int class, int subclass, id_t id)
523 523 {
524 524 int ret;
525 525
526 526 ASSERT(pool_lock_held());
527 527 if (pool_state == POOL_DISABLED)
528 528 return (ENOTACTIVE);
529 529 switch (class) {
530 530 case PEC_POOL:
531 531 ret = pool_pool_destroy((poolid_t)id);
532 532 break;
533 533 case PEC_RES_COMP:
534 534 switch (subclass) {
535 535 case PREC_PSET:
536 536 ret = pool_pset_destroy((psetid_t)id);
537 537 break;
538 538 default:
539 539 ret = EINVAL;
540 540 }
541 541 break;
542 542 case PEC_RES_AGG:
543 543 ret = ENOTSUP;
544 544 break;
545 545 default:
546 546 ret = EINVAL;
547 547 }
548 548 return (ret);
549 549 }
550 550
551 551 /*
552 552 * Enable or disable pools.
553 553 */
554 554 int
555 555 pool_status(int status)
556 556 {
557 557 int ret = 0;
558 558
559 559 ASSERT(pool_lock_held());
560 560
561 561 if (pool_state == status)
562 562 return (0);
563 563 switch (status) {
564 564 case POOL_ENABLED:
565 565 ret = pool_enable();
566 566 if (ret != 0)
567 567 return (ret);
568 568 pool_state = POOL_ENABLED;
569 569 pool_event_dispatch(POOL_E_ENABLE, NULL);
570 570 break;
571 571 case POOL_DISABLED:
572 572 ret = pool_disable();
573 573 if (ret != 0)
574 574 return (ret);
575 575 pool_state = POOL_DISABLED;
576 576 pool_event_dispatch(POOL_E_DISABLE, NULL);
577 577 break;
578 578 default:
579 579 ret = EINVAL;
580 580 }
581 581 return (ret);
582 582 }
583 583
584 584 /*
585 585 * Associate pool with resource set.
586 586 */
587 587 int
588 588 pool_assoc(poolid_t poolid, int idtype, id_t id)
589 589 {
590 590 int ret;
591 591
592 592 ASSERT(pool_lock_held());
593 593 if (pool_state == POOL_DISABLED)
594 594 return (ENOTACTIVE);
595 595 switch (idtype) {
596 596 case PREC_PSET:
597 597 ret = pool_pset_assoc(poolid, (psetid_t)id);
598 598 if (ret == 0)
599 599 pool_event_dispatch(POOL_E_CHANGE, poolid);
600 600 break;
601 601 default:
602 602 ret = EINVAL;
603 603 }
604 604 if (ret == 0)
605 605 pool_pool_mod = gethrtime();
606 606 return (ret);
607 607 }
608 608
609 609 /*
610 610 * Disassociate resource set from pool.
611 611 */
612 612 int
613 613 pool_dissoc(poolid_t poolid, int idtype)
614 614 {
615 615 int ret;
616 616
617 617 ASSERT(pool_lock_held());
618 618 if (pool_state == POOL_DISABLED)
619 619 return (ENOTACTIVE);
620 620 switch (idtype) {
621 621 case PREC_PSET:
622 622 ret = pool_pset_assoc(poolid, PS_NONE);
623 623 if (ret == 0)
624 624 pool_event_dispatch(POOL_E_CHANGE, poolid);
625 625 break;
626 626 default:
627 627 ret = EINVAL;
628 628 }
629 629 if (ret == 0)
630 630 pool_pool_mod = gethrtime();
631 631 return (ret);
632 632 }
633 633
634 634 /*
635 635 * Transfer specified quantity of resources between resource sets.
636 636 */
637 637 /*ARGSUSED*/
638 638 int
639 639 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
640 640 {
641 641 int ret = EINVAL;
642 642
643 643 return (ret);
644 644 }
645 645
646 646 static poolid_t
647 647 pool_lookup_id_by_pset(int id)
648 648 {
649 649 pool_t *pool = pool_default;
650 650 psetid_t psetid = (psetid_t)id;
651 651
652 652 ASSERT(pool_lock_held());
653 653 for (pool = list_head(&pool_list); pool != NULL;
654 654 pool = list_next(&pool_list, pool)) {
655 655 if (pool->pool_pset->pset_id == psetid)
656 656 return (pool->pool_id);
657 657 }
658 658 return (POOL_INVALID);
659 659 }
660 660
661 661 /*
662 662 * Transfer resources specified by their IDs between resource sets.
663 663 */
664 664 int
665 665 pool_xtransfer(int type, id_t src_pset, id_t dst_pset, uint_t size, id_t *ids)
666 666 {
667 667 int ret;
668 668 poolid_t src_pool, dst_pool;
669 669
670 670 ASSERT(pool_lock_held());
671 671 if (pool_state == POOL_DISABLED)
672 672 return (ENOTACTIVE);
673 673 switch (type) {
674 674 case PREC_PSET:
675 675 ret = pool_pset_xtransfer((psetid_t)src_pset,
676 676 (psetid_t)dst_pset, size, ids);
677 677 if (ret == 0) {
678 678 if ((src_pool = pool_lookup_id_by_pset(src_pset)) !=
679 679 POOL_INVALID)
680 680 pool_event_dispatch(POOL_E_CHANGE, src_pool);
681 681 if ((dst_pool = pool_lookup_id_by_pset(dst_pset)) !=
682 682 POOL_INVALID)
683 683 pool_event_dispatch(POOL_E_CHANGE, dst_pool);
684 684 }
685 685 break;
686 686 default:
687 687 ret = EINVAL;
688 688 }
689 689 return (ret);
690 690 }
691 691
692 692 /*
693 693 * Bind processes to pools.
694 694 */
695 695 int
696 696 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
697 697 {
698 698 pool_t *pool;
699 699
700 700 ASSERT(pool_lock_held());
701 701
702 702 if (pool_state == POOL_DISABLED)
703 703 return (ENOTACTIVE);
704 704 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
705 705 return (ESRCH);
706 706
707 707 switch (idtype) {
708 708 case P_PID:
709 709 case P_TASKID:
710 710 case P_PROJID:
711 711 case P_ZONEID:
712 712 break;
713 713 default:
714 714 return (EINVAL);
715 715 }
716 716 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
717 717 }
718 718
719 719 /*
720 720 * Query pool binding of the specified process.
721 721 */
int
pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
{
	proc_t *p;

	if (idtype != P_PID)
		return (ENOTSUP);
	if (id == P_MYID)
		id = curproc->p_pid;

	ASSERT(pool_lock_held());

	/* Lock order: pidlock before p_lock (see header comment). */
	mutex_enter(&pidlock);
	if ((p = prfind((pid_t)id)) == NULL) {
		mutex_exit(&pidlock);
		return (ESRCH);
	}
	mutex_enter(&p->p_lock);
	/*
	 * In local zones, lie about pool bindings of processes from
	 * the global zone.
	 */
	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
		pool_t *pool;

		/* Report the caller's own zone's pool instead. */
		pool = zone_pool_get(curproc->p_zone);
		*poolid = pool->pool_id;
	} else {
		*poolid = p->p_pool->pool_id;
	}
	mutex_exit(&p->p_lock);
	mutex_exit(&pidlock);
	return (0);
}
756 756
static ea_object_t *
pool_system_pack(void)
{
	ea_object_t *eo_system;
	size_t bufsz = 0;
	char *buf = NULL;

	ASSERT(pool_lock_held());

	/* Build the exacct "system" group with all modification stamps. */
	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
	/*
	 * Local zones see their zone's pool modification time rather
	 * than the global one.
	 */
	if (INGLOBALZONE(curproc))
		(void) ea_attach_item(eo_system, &pool_pool_mod,
		    sizeof (hrtime_t),
		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
	else
		(void) ea_attach_item(eo_system,
		    &curproc->p_zone->zone_pool_mod,
		    sizeof (hrtime_t),
		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
	/* nvlist_pack() allocates buf; attach a copy, then free it. */
	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
	(void) ea_attach_item(eo_system, buf, bufsz,
	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
	kmem_free(buf, bufsz);
	return (eo_system);
}
788 788
789 789 /*
790 790 * Pack information about pools and attach it to specified exacct group.
791 791 */
static int
pool_pool_pack(ea_object_t *eo_system)
{
	ea_object_t *eo_pool;
	pool_t *pool;
	size_t bufsz;
	char *buf;
	pool_t *myzonepool;

	ASSERT(pool_lock_held());
	myzonepool = zone_pool_get(curproc->p_zone);
	for (pool = list_head(&pool_list); pool;
	    pool = list_next(&pool_list, pool)) {
		/* Local zones only get to see their own pool. */
		if (!INGLOBALZONE(curproc) && myzonepool != pool)
			continue;
		bufsz = 0;
		buf = NULL;
		eo_pool = ea_alloc_group(EXT_GROUP |
		    EXC_LOCAL | EXD_GROUP_POOL);
		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
		/* nvlist_pack() allocates buf; attach a copy, then free. */
		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
		    NV_ENCODE_NATIVE, 0);
		(void) ea_attach_item(eo_pool, buf, bufsz,
		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
		kmem_free(buf, bufsz);
		(void) ea_attach_to_group(eo_system, eo_pool);
	}
	return (0);
}
824 824
825 825 /*
826 826 * Pack the whole pool configuration in the specified buffer.
827 827 */
828 828 int
829 829 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
830 830 {
831 831 ea_object_t *eo_system;
832 832 size_t ksize;
833 833 int ret = 0;
834 834
835 835 ASSERT(pool_lock_held());
836 836
837 837 eo_system = pool_system_pack(); /* 1. pack system */
838 838 (void) pool_pool_pack(eo_system); /* 2. pack all pools */
839 839 (void) pool_pset_pack(eo_system); /* 3. pack all psets */
840 840 ksize = ea_pack_object(eo_system, NULL, 0);
841 841 if (kbuf == NULL || kbufsz == 0)
842 842 *asize = ksize;
843 843 else if (ksize > kbufsz)
844 844 ret = ENOMEM;
845 845 else
846 846 *asize = ea_pack_object(eo_system, kbuf, kbufsz);
847 847 ea_free_object(eo_system, EUP_ALLOC);
848 848 return (ret);
849 849 }
850 850
851 851 /*
852 852 * Start/end the commit transaction. If commit transaction is currently
853 853 * in progress, then all POOL_QUERY ioctls will return pools configuration
854 854 * at the beginning of transaction.
855 855 */
int
pool_commit(int state)
{
	ea_object_t *eo_system;
	int ret = 0;

	ASSERT(pool_lock_held());

	if (pool_state == POOL_DISABLED)
		return (ENOTACTIVE);
	switch (state) {
	case 1:
		/*
		 * Beginning commit transaction.
		 */
		if (pool_buf != NULL)	/* transaction in progress */
			return (EBUSY);
		/* Snapshot the whole configuration into pool_buf. */
		eo_system = pool_system_pack();		/* 1. pack system */
		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
		ea_free_object(eo_system, EUP_ALLOC);
		break;
	case 0:
		/*
		 * Finishing commit transaction.
		 */
		if (pool_buf != NULL) {
			kmem_free(pool_buf, pool_bufsz);
			pool_buf = NULL;
			pool_bufsz = 0;
		}
		break;
	default:
		ret = EINVAL;
	}
	return (ret);
}
896 896
897 897 /*
898 898 * Check if the specified property is special
899 899 */
900 900 static pool_property_t *
901 901 pool_property_find(char *name, pool_property_t *list)
902 902 {
903 903 pool_property_t *prop;
904 904
905 905 for (prop = list; prop->pp_name != NULL; prop++)
906 906 if (strcmp(prop->pp_name, name) == 0)
907 907 return (prop);
908 908 return (NULL);
909 909 }
910 910
/* Special (known) system properties with their types and permissions. */
static pool_property_t pool_prop_sys[] = {
	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
	{ "system.allocate-method",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.log-level",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.log-location",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.history-file",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.objectives",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ NULL,				0,			0 }
};
930 930
/* Special (known) per-pool properties with their types and permissions. */
static pool_property_t pool_prop_pool[] = {
	{ "pool.sys_id",	DATA_TYPE_UINT64,	PP_READ },
	{ "pool.name",		DATA_TYPE_STRING,	PP_RDWR },
	{ "pool.default",	DATA_TYPE_BYTE,		PP_READ },
	{ "pool.active",	DATA_TYPE_BYTE,		PP_RDWR },
	{ "pool.importance",	DATA_TYPE_INT64,	PP_RDWR },
	{ "pool.comment",	DATA_TYPE_STRING,	PP_RDWR },
	{ "pool.scheduler",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ NULL,			0,			0 }
};
942 942
943 943 /*
944 944 * Common routine to put new property on the specified list
945 945 */
946 946 int
947 947 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
948 948 {
949 949 pool_property_t *prop;
950 950
951 951 if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
952 952 /*
953 953 * No read-only properties or properties with bad types
954 954 */
955 955 if (!(prop->pp_perm & PP_WRITE) ||
956 956 prop->pp_type != nvpair_type(pair))
957 957 return (EINVAL);
958 958 }
959 959 return (nvlist_add_nvpair(nvlist, pair));
960 960 }
961 961
962 962 /*
963 963 * Common routine to remove property from the given list
964 964 */
965 965 int
966 966 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
967 967 {
968 968 pool_property_t *prop;
969 969
970 970 if ((prop = pool_property_find(name, props)) != NULL) {
971 971 if (!(prop->pp_perm & PP_OPTIONAL))
972 972 return (EINVAL);
973 973 }
974 974 return (nvlist_remove_all(nvlist, name));
975 975 }
976 976
977 977 static int
978 978 pool_system_propput(nvpair_t *pair)
979 979 {
980 980 int ret;
981 981
982 982 ASSERT(pool_lock_held());
983 983 ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
984 984 if (ret == 0)
985 985 pool_sys_mod = gethrtime();
986 986 return (ret);
987 987 }
988 988
989 989 static int
990 990 pool_system_proprm(char *name)
991 991 {
992 992 int ret;
993 993
994 994 ASSERT(pool_lock_held());
995 995 ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
996 996 if (ret == 0)
997 997 pool_sys_mod = gethrtime();
998 998 return (ret);
999 999 }
1000 1000
1001 1001 static int
1002 1002 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
1003 1003 {
1004 1004 pool_t *pool;
1005 1005 int ret;
1006 1006
1007 1007 ASSERT(pool_lock_held());
1008 1008 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
1009 1009 return (ESRCH);
1010 1010 ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
1011 1011 if (ret == 0)
1012 1012 pool_pool_mod = gethrtime();
1013 1013 return (ret);
1014 1014 }
1015 1015
1016 1016 static int
1017 1017 pool_pool_proprm(poolid_t poolid, char *name)
1018 1018 {
1019 1019 int ret;
1020 1020 pool_t *pool;
1021 1021
1022 1022 ASSERT(pool_lock_held());
1023 1023 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
1024 1024 return (ESRCH);
1025 1025 ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
1026 1026 if (ret == 0)
1027 1027 pool_pool_mod = gethrtime();
1028 1028 return (ret);
1029 1029 }
1030 1030
1031 1031 int
1032 1032 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
1033 1033 {
1034 1034 int ret;
1035 1035
1036 1036 ASSERT(pool_lock_held());
1037 1037 if (pool_state == POOL_DISABLED)
1038 1038 return (ENOTACTIVE);
1039 1039 switch (class) {
1040 1040 case PEC_SYSTEM:
1041 1041 ret = pool_system_propput(pair);
1042 1042 break;
1043 1043 case PEC_POOL:
1044 1044 ret = pool_pool_propput((poolid_t)id, pair);
1045 1045 break;
1046 1046 case PEC_RES_COMP:
1047 1047 switch (subclass) {
1048 1048 case PREC_PSET:
1049 1049 ret = pool_pset_propput((psetid_t)id, pair);
1050 1050 break;
1051 1051 default:
1052 1052 ret = EINVAL;
1053 1053 }
1054 1054 break;
1055 1055 case PEC_RES_AGG:
1056 1056 ret = ENOTSUP;
1057 1057 break;
1058 1058 case PEC_COMP:
1059 1059 switch (subclass) {
1060 1060 case PCEC_CPU:
1061 1061 ret = pool_cpu_propput((processorid_t)id, pair);
1062 1062 break;
1063 1063 default:
1064 1064 ret = EINVAL;
1065 1065 }
1066 1066 break;
1067 1067 default:
1068 1068 ret = EINVAL;
1069 1069 }
1070 1070 return (ret);
1071 1071 }
1072 1072
1073 1073 int
1074 1074 pool_proprm(int class, int subclass, id_t id, char *name)
1075 1075 {
1076 1076 int ret;
1077 1077
1078 1078 ASSERT(pool_lock_held());
1079 1079 if (pool_state == POOL_DISABLED)
1080 1080 return (ENOTACTIVE);
1081 1081 switch (class) {
1082 1082 case PEC_SYSTEM:
1083 1083 ret = pool_system_proprm(name);
1084 1084 break;
1085 1085 case PEC_POOL:
1086 1086 ret = pool_pool_proprm((poolid_t)id, name);
1087 1087 break;
1088 1088 case PEC_RES_COMP:
1089 1089 switch (subclass) {
1090 1090 case PREC_PSET:
1091 1091 ret = pool_pset_proprm((psetid_t)id, name);
1092 1092 break;
1093 1093 default:
1094 1094 ret = EINVAL;
1095 1095 }
1096 1096 break;
1097 1097 case PEC_RES_AGG:
1098 1098 ret = ENOTSUP;
1099 1099 break;
1100 1100 case PEC_COMP:
1101 1101 switch (subclass) {
1102 1102 case PCEC_CPU:
1103 1103 ret = pool_cpu_proprm((processorid_t)id, name);
1104 1104 break;
1105 1105 default:
1106 1106 ret = EINVAL;
1107 1107 }
1108 1108 break;
1109 1109 default:
1110 1110 ret = EINVAL;
1111 1111 }
1112 1112 return (ret);
1113 1113 }
1114 1114
1115 1115 int
1116 1116 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1117 1117 {
1118 1118 int ret;
1119 1119 nvlist_t *nvl;
1120 1120
1121 1121 ASSERT(pool_lock_held());
1122 1122 if (pool_state == POOL_DISABLED)
1123 1123 return (ENOTACTIVE);
1124 1124
1125 1125 (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1126 1126
1127 1127 switch (class) {
1128 1128 case PEC_SYSTEM:
1129 1129 case PEC_POOL:
1130 1130 ret = EINVAL;
1131 1131 break;
1132 1132 case PEC_RES_COMP:
1133 1133 switch (subclass) {
1134 1134 case PREC_PSET:
1135 1135 ret = pool_pset_propget((psetid_t)id, name, nvl);
1136 1136 break;
1137 1137 default:
1138 1138 ret = EINVAL;
1139 1139 }
1140 1140 break;
1141 1141 case PEC_RES_AGG:
1142 1142 ret = ENOTSUP;
1143 1143 break;
1144 1144 case PEC_COMP:
1145 1145 switch (subclass) {
1146 1146 case PCEC_CPU:
1147 1147 ret = pool_cpu_propget((processorid_t)id, name, nvl);
1148 1148 break;
1149 1149 default:
1150 1150 ret = EINVAL;
1151 1151 }
1152 1152 break;
1153 1153 default:
1154 1154 ret = EINVAL;
1155 1155 }
1156 1156 if (ret == 0)
1157 1157 *nvlp = nvl;
1158 1158 else
1159 1159 nvlist_free(nvl);
1160 1160 return (ret);
1161 1161 }
1162 1162
/*
 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
 * in case of failure in pool_do_bind().
 *
 * pool_bind_wake releases a single process that pool_do_bind() tagged
 * with PBWAIT: any of its threads that were counted into
 * pool_barrier_count (p_poolcnt of them) are subtracted back out so
 * the barrier accounting stays balanced, the flag is cleared, and a
 * waiter on p_poolcv is signalled.  Lock order here is
 * p_lock -> pool_barrier_lock.
 */
static void
pool_bind_wake(proc_t *p)
{
	ASSERT(pool_lock_held());

	mutex_enter(&p->p_lock);
	ASSERT(p->p_poolflag & PBWAIT);
	if (p->p_poolcnt > 0) {
		mutex_enter(&pool_barrier_lock);
		pool_barrier_count -= p->p_poolcnt;
		mutex_exit(&pool_barrier_lock);
	}
	p->p_poolflag &= ~PBWAIT;
	cv_signal(&p->p_poolcv);
	mutex_exit(&p->p_lock);
}
1183 1183
1184 1184 static void
1185 1185 pool_bind_wakeall(proc_t **procs)
1186 1186 {
1187 1187 proc_t *p, **pp;
1188 1188
1189 1189 ASSERT(pool_lock_held());
1190 1190 for (pp = procs; (p = *pp) != NULL; pp++)
1191 1191 pool_bind_wake(p);
1192 1192 }
1193 1193
1194 1194 /*
1195 1195 * Return the scheduling class for this pool, or
1196 1196 * POOL_CLASS_UNSET if not set
1197 1197 * POOL_CLASS_INVAL if set to an invalid class ID.
1198 1198 */
1199 1199 id_t
1200 1200 pool_get_class(pool_t *pool)
1201 1201 {
1202 1202 char *name;
1203 1203 id_t cid;
1204 1204
1205 1205 ASSERT(pool_lock_held());
1206 1206
1207 1207 if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1208 1208 &name) == 0) {
1209 1209 if (getcidbyname(name, &cid) == 0)
1210 1210 return (cid);
1211 1211 else
1212 1212 return (POOL_CLASS_INVAL);
1213 1213 }
1214 1214 return (POOL_CLASS_UNSET);
1215 1215 }
1216 1216
/*
 * Move process to the new scheduling class.
 *
 * One class-specific buffer per possible LWP is pre-allocated with
 * CL_ALLOC() before p_lock is taken; each thread not already in class
 * "cid" is then re-entered into the new class, consuming one buffer.
 * Buffers left over (threads already in the class, or the zombie/agent
 * slack in the nlwp estimate) are released at the end.
 */
static void
pool_change_class(proc_t *p, id_t cid)
{
	kthread_t *t;
	void *cldata;
	id_t oldcid;
	void **bufs;
	void **buf;
	int nlwp;
	int ret;
	int i;

	/*
	 * Do not move kernel processes (such as zsched).
	 */
	if (p->p_flag & SSYS)
		return;
	/*
	 * This process is in the pool barrier, so it can't possibly be
	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
	 * (for possible agent LWP which doesn't use pool barrier) as
	 * our upper bound.
	 */
	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;

	/*
	 * Pre-allocate scheduling class specific buffers before
	 * grabbing p_lock.
	 */
	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
		ret = CL_ALLOC(buf, cid, KM_SLEEP);
		/* ret is only consumed on DEBUG kernels */
		ASSERT(ret == 0);
	}

	/*
	 * Move threads one by one to the new scheduling class.
	 * This never fails because we have all the right
	 * privileges here.
	 */
	mutex_enter(&p->p_lock);
	ASSERT(p->p_poolflag & PBWAIT);
	buf = bufs;
	t = p->p_tlist;
	ASSERT(t != NULL);
	do {
		if (t->t_cid != cid) {
			oldcid = t->t_cid;
			cldata = t->t_cldata;
			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
			ASSERT(ret == 0);
			CL_EXITCLASS(oldcid, cldata);
			schedctl_set_cidpri(t);
			/* buffer ownership passed to the new class */
			*buf++ = NULL;
		}
	} while ((t = t->t_forw) != p->p_tlist);
	mutex_exit(&p->p_lock);
	/*
	 * Free unused scheduling class specific buffers.
	 */
	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
		if (*buf != NULL) {
			CL_FREE(cid, *buf);
			*buf = NULL;
		}
	}
	kmem_free(bufs, nlwp * sizeof (void *));
}
1288 1288
/*
 * Return (through "name") the "pool.name" property of the given pool.
 * The string points into the pool's own nvlist, so it remains valid
 * only while pool_lock is held.  Every pool is expected to carry a
 * non-empty name, hence the lookup result is ignored and the ASSERTs.
 */
void
pool_get_name(pool_t *pool, char **name)
{
	ASSERT(pool_lock_held());

	(void) nvlist_lookup_string(pool->pool_props, "pool.name", name);

	ASSERT(strlen(*name) != 0);
}
1298 1298
1299 1299
1300 1300 /*
1301 1301 * The meat of the bind operation. The steps in pool_do_bind are:
1302 1302 *
1303 1303 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
1304 1304 * such processes to an array. For any interesting process that has
1305 1305 * threads inside the pool barrier set, increment a counter by the
1306 1306 * count of such threads. Once PBWAIT is set on a process, that process
1307 1307 * will not disappear.
1308 1308 *
1309 1309 * 2) Wait for the counter from step 2 to drop to zero. Any process which
1310 1310 * calls pool_barrier_exit() and notices that PBWAIT has been set on it
1311 1311 * will decrement that counter before going to sleep, and the process
1312 1312 * calling pool_barrier_exit() which does the final decrement will wake us.
1313 1313 *
1314 1314 * 3) For each interesting process, perform a calculation on it to see if
1315 1315 * the bind will actually succeed. This uses the following three
1316 1316 * resource-set-specific functions:
1317 1317 *
1318 1318 * - int set_bind_start(procs, pool)
1319 1319 *
1320 1320 * Determine whether the given array of processes can be bound to the
1321 1321 * resource set associated with the given pool. If it can, take and hold
1322 1322 * any locks necessary to ensure that the operation will succeed, and
1323 1323 * make any necessary reservations in the target resource set. If it
1324 1324 * can't, return failure with no reservations made and no new locks held.
1325 1325 *
1326 1326 * - void set_bind_abort(procs, pool)
1327 1327 *
1328 1328 * set_bind_start() has completed successfully, but another resource set's
1329 1329 * set_bind_start() has failed, and we haven't begun the bind yet. Undo
1330 1330 * any reservations made and drop any locks acquired by our
1331 1331 * set_bind_start().
1332 1332 *
1333 1333 * - void set_bind_finish(void)
1334 1334 *
1335 1335 * The bind has completed successfully. The processes have been released,
1336 1336 * and the reservation acquired in set_bind_start() has been depleted as
1337 1337 * the processes have finished their bindings. Drop any locks acquired by
1338 1338 * set_bind_start().
1339 1339 *
1340 1340 * 4) If we've decided that we can proceed with the bind, iterate through
1341 1341 * the list of interesting processes, grab the necessary locks (which
1342 1342 * may differ per resource set), perform the bind, and ASSERT that it
1343 1343 * succeeds. Once a process has been rebound, it can be awakened.
1344 1344 *
1345 1345 * The operations from step 4 must be kept in sync with anything which might
1346 1346 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
1347 1347 * are thus located in the same source files as the associated bind operations.
1348 1348 */
/*
 * Bind the set of processes named by (idtype, id) to "pool".  See the
 * block comment above for the four-step protocol.  Returns 0 on
 * success, or EINVAL/ESRCH/EBUSY/EAGAIN as described inline.
 * Caller holds pool_lock for the duration.
 */
int
pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
{
	extern uint_t nproc;
	klwp_t *lwp = ttolwp(curthread);
	proc_t **pp, **procs;
	proc_t *prstart;
	int procs_count = 0;
	kproject_t *kpj;
	procset_t set;
	zone_t *zone;
	int procs_size;
	int rv = 0;
	proc_t *p;
	id_t cid = -1;

	ASSERT(pool_lock_held());

	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
		return (EINVAL);

	if (idtype == P_ZONEID) {
		zone = zone_find_by_id(id);
		if (zone == NULL)
			return (ESRCH);
		/* Refuse to rebind a zone that is shutting down. */
		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
			zone_rele(zone);
			return (EBUSY);
		}
	}

	if (idtype == P_PROJID) {
		kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
		if (kpj == NULL)
			return (ESRCH);
		/* Serialize pool bindings of this project. */
		mutex_enter(&kpj->kpj_poolbind);
	}

	if (idtype == P_PID) {
		/*
		 * Fast-path for a single process case.
		 */
		procs_size = 2;	/* procs is NULL-terminated */
		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
		mutex_enter(&pidlock);
	} else {
		/*
		 * We will need enough slots for proc_t pointers for as many as
		 * twice the number of currently running processes (assuming
		 * that each one could be in fork() creating a new child).
		 */
		for (;;) {
			procs_size = nproc * 2;
			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
			    KM_SLEEP);
			mutex_enter(&pidlock);

			if (nproc * 2 <= procs_size)
				break;
			/*
			 * If nproc has changed, try again.
			 */
			mutex_exit(&pidlock);
			kmem_free(procs, procs_size * sizeof (proc_t *));
		}
	}

	if (id == P_MYID)
		id = getmyid(idtype);
	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);

	/*
	 * Step 1: do a first scan, and select target processes.
	 */
	if (idtype == P_PID)
		prstart = prfind(id);
	else
		prstart = practive;
	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
		mutex_enter(&p->p_lock);
		/*
		 * Skip processes that don't match our (id, idtype) set or
		 * on the way of becoming zombies. Skip kernel processes
		 * from the global zone.
		 */
		if (procinset(p, &set) == 0 ||
		    p->p_poolflag & PEXITED ||
		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
			mutex_exit(&p->p_lock);
			continue;
		}
		if (!INGLOBALZONE(p)) {
			switch (idtype) {
			case P_PID:
			case P_TASKID:
				/*
				 * Can't bind processes or tasks
				 * in local zones to pools.
				 */
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				pool_bind_wakeall(procs);
				rv = EINVAL;
				goto out;
			case P_PROJID:
				/*
				 * Only projects in the global
				 * zone can be rebound.
				 */
				mutex_exit(&p->p_lock);
				continue;
			case P_POOLID:
				/*
				 * When rebinding pools, processes can be
				 * in different zones.
				 */
				break;
			}
		}

		p->p_poolflag |= PBWAIT;
		/*
		 * If some threads in this process are inside the pool
		 * barrier, add them to pool_barrier_count, as we have
		 * to wait for all of them to exit the barrier.
		 */
		if (p->p_poolcnt > 0) {
			mutex_enter(&pool_barrier_lock);
			pool_barrier_count += p->p_poolcnt;
			mutex_exit(&pool_barrier_lock);
		}
		ASSERT(pp < &procs[procs_size]);
		*pp++ = p;
		procs_count++;
		mutex_exit(&p->p_lock);

		/*
		 * We just found our process, so if we're only rebinding a
		 * single process then get out of this loop.
		 */
		if (idtype == P_PID)
			break;
	}
	*pp = NULL;	/* cap off the end of the array */
	mutex_exit(&pidlock);

	/*
	 * Step 2: wait for relevant processes to stop before they try to
	 * enter the barrier or at the exit from the barrier. Make sure that
	 * we do not get stopped here while we're holding pool_lock. If we
	 * were requested to stop, or got a signal then return EAGAIN to let
	 * the library know that it needs to retry.
	 */
	mutex_enter(&pool_barrier_lock);
	lwp->lwp_nostop++;
	while (pool_barrier_count > 0) {
		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
		if (pool_barrier_count > 0) {
			/*
			 * We either got a signal or were requested to
			 * stop by /proc. Bail out with EAGAIN. If we were
			 * requested to stop, we'll stop in post_syscall()
			 * on our way back to userland.
			 */
			mutex_exit(&pool_barrier_lock);
			pool_bind_wakeall(procs);
			lwp->lwp_nostop--;
			rv = EAGAIN;
			goto out;
		}
	}
	lwp->lwp_nostop--;
	mutex_exit(&pool_barrier_lock);

	if (idtype == P_PID) {
		/* Single-process fast path: no second scan needed. */
		if ((p = *procs) == NULL)
			goto skip;
		mutex_enter(&p->p_lock);
		/* Drop the process if it is exiting */
		if (p->p_poolflag & PEXITED) {
			mutex_exit(&p->p_lock);
			pool_bind_wake(p);
			procs_count--;
		} else
			mutex_exit(&p->p_lock);
		goto skip;
	}

	/*
	 * Do another run, and drop processes that were inside the barrier
	 * in exit(), but when they have dropped to pool_barrier_exit
	 * they have become of no interest to us. Pick up child processes that
	 * were created by fork() but didn't exist during our first scan.
	 * Their parents are now stopped at pool_barrier_exit in cfork().
	 */
	mutex_enter(&pidlock);
	for (pp = procs; (p = *pp) != NULL; pp++) {
		mutex_enter(&p->p_lock);
		if (p->p_poolflag & PEXITED) {
			ASSERT(p->p_lwpcnt == 0);
			mutex_exit(&p->p_lock);
			pool_bind_wake(p);
			/* flip w/last non-NULL slot */
			*pp = procs[procs_count - 1];
			procs[procs_count - 1] = NULL;
			procs_count--;
			pp--;	/* try this slot again */
			continue;
		} else
			mutex_exit(&p->p_lock);
		/*
		 * Look at the child and check if it should be rebound also.
		 * We're holding pidlock, so it is safe to reference p_child.
		 */
		if ((p = p->p_child) == NULL)
			continue;

		mutex_enter(&p->p_lock);

		/*
		 * Skip system processes and make sure that the child is in
		 * the same task/project/pool/zone as the parent.
		 */
		if ((!INGLOBALZONE(p) && idtype != P_ZONEID &&
		    idtype != P_POOLID) || p->p_flag & SSYS) {
			mutex_exit(&p->p_lock);
			continue;
		}

		/*
		 * If the child process has been already created by fork(), has
		 * not exited, and has not been added to the list already,
		 * then add it now. We will hit this process again (since we
		 * stick it at the end of the procs list) but it will ignored
		 * because it will have the PBWAIT flag set.
		 */
		if (procinset(p, &set) &&
		    !(p->p_poolflag & PEXITED) &&
		    !(p->p_poolflag & PBWAIT)) {
			ASSERT(p->p_child == NULL); /* no child of a child */
			procs[procs_count] = p;
			procs[procs_count + 1] = NULL;
			procs_count++;
			p->p_poolflag |= PBWAIT;
		}
		mutex_exit(&p->p_lock);
	}
	mutex_exit(&pidlock);
skip:
	/*
	 * If there's no processes to rebind then return ESRCH, unless
	 * we're associating a pool with new resource set, destroying it,
	 * or binding a zone to a pool.
	 */
	if (procs_count == 0) {
		if (idtype == P_POOLID || idtype == P_ZONEID)
			rv = 0;
		else
			rv = ESRCH;
		goto out;
	}

#ifdef DEBUG
	/*
	 * All processes in the array should have PBWAIT set, and none
	 * should be in the critical section. Thus, although p_poolflag
	 * and p_poolcnt are protected by p_lock, their ASSERTions below
	 * should be stable without it. procinset(), however, ASSERTs that
	 * the p_lock is held upon entry.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		int in_set;

		mutex_enter(&p->p_lock);
		in_set = procinset(p, &set);
		mutex_exit(&p->p_lock);

		ASSERT(in_set);
		ASSERT(p->p_poolflag & PBWAIT);
		ASSERT(p->p_poolcnt == 0);
	}
#endif

	/*
	 * Step 3: do the check if processor set rebinding is going to
	 * succeed or not.
	 */
	if ((flags & POOL_BIND_PSET) &&
	    (rv = pset_bind_start(procs, pool)) != 0) {
		pool_bind_wakeall(procs);
		goto out;
	}

	/*
	 * Step 4: at this point, all bind operations should succeed.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		if (flags & POOL_BIND_PSET) {
			psetid_t psetid = pool->pool_pset->pset_id;
			void *zonebuf;
			void *projbuf;

			/*
			 * Pre-allocate one buffer for FSS (per-project
			 * buffer for a new pset) in case if this is the
			 * first thread from its current project getting
			 * bound to this processor set.
			 */
			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);

			mutex_enter(&pidlock);
			mutex_enter(&p->p_lock);
			pool_pset_bind(p, psetid, projbuf, zonebuf);
			mutex_exit(&p->p_lock);
			mutex_exit(&pidlock);
			/*
			 * Free buffers pre-allocated above if it
			 * wasn't actually used.
			 */
			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
		}
		/*
		 * Now let's change the scheduling class of this
		 * process if our target pool has it defined.
		 */
		if (cid != POOL_CLASS_UNSET)
			pool_change_class(p, cid);

		/*
		 * It is safe to reference p_pool here without holding
		 * p_lock because it cannot change underneath of us.
		 * We're holding pool_lock here, so nobody else can be
		 * moving this process between pools.  If process "p"
		 * would be exiting, we're guaranteed that it would be blocked
		 * at pool_barrier_enter() in exit().  Otherwise, it would've
		 * been skipped by one of our scans of the practive list
		 * as a process with PEXITED flag set.
		 */
		if (p->p_pool != pool) {
			ASSERT(p->p_pool->pool_ref > 0);
			atomic_dec_32(&p->p_pool->pool_ref);
			p->p_pool = pool;
			atomic_inc_32(&p->p_pool->pool_ref);
		}
		/*
		 * Okay, we've tortured this guy enough.
		 * Let this poor process go now.
		 */
		pool_bind_wake(p);
	}
	if (flags & POOL_BIND_PSET)
		pset_bind_finish();

out:	switch (idtype) {
	case P_PROJID:
		ASSERT(kpj != NULL);
		mutex_exit(&kpj->kpj_poolbind);
		project_rele(kpj);
		break;
	case P_ZONEID:
		if (rv == 0) {
			mutex_enter(&cpu_lock);
			zone_pool_set(zone, pool);
			mutex_exit(&cpu_lock);
		}
		zone->zone_pool_mod = gethrtime();
		zone_rele(zone);
		break;
	}

	kmem_free(procs, procs_size * sizeof (proc_t *));
	ASSERT(pool_barrier_count == 0);
	return (rv);
}
1724 1724
/*
 * Register a callback to be invoked on pool events (see
 * pool_event_dispatch()).  The callback list is created lazily on
 * first registration.  The caller must not hold pool_lock (callbacks
 * are delivered from taskq context which may itself take it), except
 * during panic.
 */
void
pool_event_cb_register(pool_event_cb_t *cb)
{
	ASSERT(!pool_lock_held() || panicstr);
	ASSERT(cb->pec_func != NULL);

	mutex_enter(&pool_event_cb_lock);
	if (!pool_event_cb_init) {
		/* First registration: set up the callback list. */
		list_create(&pool_event_cb_list, sizeof (pool_event_cb_t),
		    offsetof(pool_event_cb_t, pec_list));
		pool_event_cb_init = B_TRUE;
	}
	list_insert_tail(&pool_event_cb_list, cb);
	mutex_exit(&pool_event_cb_lock);
}
1740 1740
/*
 * Remove a previously registered pool-event callback.  The caller is
 * responsible for the cb's storage; this only unlinks it from the list.
 */
void
pool_event_cb_unregister(pool_event_cb_t *cb)
{
	ASSERT(!pool_lock_held() || panicstr);

	mutex_enter(&pool_event_cb_lock);
	list_remove(&pool_event_cb_list, cb);
	mutex_exit(&pool_event_cb_lock);
}
1750 1750
/*
 * Argument package passed from pool_event_dispatch() to the taskq
 * callback pool_event_notify(); freed by the latter.
 */
typedef struct {
	pool_event_t	tqd_what;	/* which event occurred */
	poolid_t	tqd_id;		/* pool the event applies to */
} pool_tqd_t;
1755 1755
/*
 * Taskq callback: deliver a pool event to every registered callback,
 * then free the pool_tqd_t allocated by pool_event_dispatch().
 * Callbacks run under pool_event_cb_lock, so they must not attempt to
 * (un)register callbacks themselves.
 */
void
pool_event_notify(void *arg)
{
	pool_tqd_t *tqd = (pool_tqd_t *)arg;
	pool_event_cb_t *cb;

	ASSERT(!pool_lock_held() || panicstr);

	mutex_enter(&pool_event_cb_lock);
	for (cb = list_head(&pool_event_cb_list); cb != NULL;
	    cb = list_next(&pool_event_cb_list, cb)) {
		cb->pec_func(tqd->tqd_what, tqd->tqd_id, cb->pec_arg);
	}
	mutex_exit(&pool_event_cb_lock);
	kmem_free(tqd, sizeof (*tqd));
}
1772 1772
/*
 * Queue asynchronous delivery of a pool event to registered callbacks.
 * The single-threaded taskq is created lazily on first dispatch, which
 * also serializes event delivery in dispatch order.  The pool_tqd_t is
 * freed by pool_event_notify().  Caller holds pool_lock, which is why
 * delivery is pushed to taskq context rather than done inline.
 */
void
pool_event_dispatch(pool_event_t what, poolid_t id)
{
	pool_tqd_t *tqd = NULL;

	ASSERT(pool_lock_held());

	if (pool_event_cb_taskq == NULL) {
		pool_event_cb_taskq = taskq_create("pool_event_cb_taskq", 1,
		    -1, 1, 1, TASKQ_PREPOPULATE);
	}

	tqd = kmem_alloc(sizeof (*tqd), KM_SLEEP);
	tqd->tqd_what = what;
	tqd->tqd_id = id;

	/* KM_SLEEP dispatch cannot fail, so the return value is ignored. */
	(void) taskq_dispatch(pool_event_cb_taskq, pool_event_notify, tqd,
	    KM_SLEEP);
}
↓ open down ↓ |
89 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX