1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2013, Joyent, Inc. All rights reserved. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/sysmacros.h> 30 #include <sys/cred.h> 31 #include <sys/proc.h> 32 #include <sys/strsubr.h> 33 #include <sys/priocntl.h> 34 #include <sys/class.h> 35 #include <sys/disp.h> 36 #include <sys/procset.h> 37 #include <sys/debug.h> 38 #include <sys/kmem.h> 39 #include <sys/errno.h> 40 #include <sys/systm.h> 41 #include <sys/schedctl.h> 42 #include <sys/vmsystm.h> 43 #include <sys/atomic.h> 44 #include <sys/project.h> 45 #include <sys/modctl.h> 46 #include <sys/fss.h> 47 #include <sys/fsspriocntl.h> 48 #include <sys/cpupart.h> 49 #include <sys/zone.h> 50 #include <vm/rm.h> 51 #include <vm/seg_kmem.h> 52 #include <sys/tnf_probe.h> 53 #include <sys/policy.h> 54 #include <sys/sdt.h> 55 #include <sys/cpucaps.h> 56 57 /* 58 * The fair share scheduling class ensures that collections of processes 59 * (zones and projects) each get their configured share of CPU. 
This is in 60 * contrast to the TS class which considers individual processes. 61 * 62 * The FSS cpu-share is set on zones using the zone.cpu-shares rctl and on 63 * projects using the project.cpu-shares rctl. By default the value is 1 64 * and it can range from 0 - 64k. A value of 0 means that processes in the 65 * collection will only get CPU resources when there are no other processes 66 * that need CPU. The cpu-share is used as one of the inputs to calculate a 67 * thread's "user-mode" priority (umdpri) for the scheduler. The umdpri falls 68 * in the range 0-59. FSS calculates other, internal, priorities which are not 69 * visible outside of the FSS class. 70 * 71 * The FSS class should approximate TS behavior when there are excess CPU 72 * resources. When there is a backlog of runnable processes, then the share 73 * is used as input into the runnable process's priority calculation, where 74 * the final umdpri is used by the scheduler to determine when the process runs. 75 * 76 * Projects in a zone compete with each other for CPU time, receiving CPU 77 * allocation within a zone proportional to the project's share; at a higher 78 * level zones compete with each other, receiving allocation in a pset 79 * proportional to the zone's share. 80 * 81 * The FSS priority calculation consists of several parts. 82 * 83 * 1) Once per second the fss_update function runs. The first thing it does is 84 * call fss_decay_usage. This function does three things. 85 * 86 * a) fss_decay_usage first decays the maxfsspri value for the pset. This 87 * value is used in the per-process priority calculation described in step 88 * (2b). The maxfsspri is decayed using the following formula: 89 * 90 * maxfsspri * fss_nice_decay[NZERO]) 91 * maxfsspri = ------------------------------------ 92 * FSS_DECAY_BASE 93 * 94 * 95 * - NZERO is the default process priority (i.e. 
20) 96 * 97 * The fss_nice_decay array is a fixed set of values used to adjust the 98 * decay rate of processes based on their nice value. Entries in this 99 * array are initialized in fss_init using the following formula: 100 * 101 * (FSS_DECAY_MAX - FSS_DECAY_MIN) * i 102 * FSS_DECAY_MIN + ------------------------------------- 103 * FSS_NICE_RANGE - 1 104 * 105 * - FSS_DECAY_MIN is 83 = approximates 65% (83/128) 106 * - FSS_DECAY_MAX is 108 = approximates 85% (108/128) 107 * - FSS_NICE_RANGE is 40 (range is 0 - 39) 108 * 109 * b) The second thing fss_decay_usage does is update each project's "usage" 110 * for the last second and then recalculates the project's "share usage". 111 * 112 * The usage value is the recent CPU usage for all of the threads in the 113 * project. It is decayed and updated this way: 114 * 115 * (usage * FSS_DECAY_USG) 116 * usage = ------------------------- + ticks; 117 * FSS_DECAY_BASE 118 * 119 * - FSS_DECAY_BASE is 128 - used instead of 100 so we can shift vs divide 120 * - FSS_DECAY_USG is 96 - approximates 75% (96/128) 121 * - ticks is updated whenever a process in this project is running 122 * when the scheduler's tick processing fires. This is not a simple 123 * counter, the values are based on the entries in the fss_nice_tick 124 * array (see section 3 below). ticks is then reset to 0 so it can track 125 * the next second's worth of nice-adjusted time for the project. 126 * 127 * c) The third thing fss_decay_usage does is update each project's "share 128 * usage" (shusage). 
This is the normalized usage value for the project and 129 * is calculated this way: 130 * 131 * pset_shares^2 zone_int_shares^2 132 * usage * ------------- * ------------------ 133 * kpj_shares^2 zone_ext_shares^2 134 * 135 * - usage - see (1b) for more details 136 * - pset_shares is the total of all *active* zone shares in the pset (by 137 * default there is only one pset) 138 * - kpj_shares is the individual project's share (project.cpu-shares rctl) 139 * - zone_int_shares is the sum of shares of all active projects within the 140 * zone (the zone-internal total) 141 * - zone_ext_shares is the share value for the zone (zone.cpu-shares rctl) 142 * 143 * The shusage is used in step (2b) to calculate the thread's new internal 144 * priority. A larger shusage value leads to a lower priority. 145 * 146 * 2) The fss_update function then calls fss_update_list to update the priority 147 * of all threads. This does two things. 148 * 149 * a) First the thread's internal priority is decayed using the following 150 * formula: 151 * 152 * fsspri * fss_nice_decay[nice_value]) 153 * fsspri = ------------------------------------ 154 * FSS_DECAY_BASE 155 * 156 * - FSS_DECAY_BASE is 128 as described above 157 * 158 * b) Second, if the thread is runnable (TS_RUN or TS_WAIT) calls fss_newpri 159 * to update the user-mode priority (umdpri) of the runnable thread. 160 * Threads that are running (TS_ONPROC) or waiting for an event (TS_SLEEP) 161 * are not updated at this time. The updated user-mode priority can cause 162 * threads to change their position in the run queue. 163 * 164 * The process's new internal fsspri is calculated using the following 165 * formula. All runnable threads in the project will use the same shusage 166 * and nrunnable values in their calculation. 
167 * 168 * fsspri += shusage * nrunnable * ticks 169 * 170 * - shusage is the project's share usage, calculated in (1c) 171 * - nrunnable is the number of runnable threads in the project 172 * - ticks is the number of ticks this thread ran since the last fss_newpri 173 * invocation. 174 * 175 * Finally the process's new user-mode priority is calculated using the 176 * following formula: 177 * 178 * (fsspri * umdprirange) 179 * umdpri = maxumdpri - ------------------------ 180 * maxfsspri 181 * 182 * - maxumdpri is MINCLSYSPRI - 1 (i.e. 59) 183 * - umdprirange is maxumdpri - 1 (i.e. 58) 184 * - maxfsspri is the largest fsspri seen so far, as we're iterating all 185 * runnable processes 186 * 187 * Thus, a higher internal priority (fsspri) leads to a lower user-mode 188 * priority which means the thread runs less. The fsspri is higher when 189 * the project's normalized share usage is higher, when the project has 190 * more runnable threads, or when the thread has accumulated more run-time. 191 * 192 * This code has various checks to ensure the resulting umdpri is in the 193 * range 1-59. See fss_newpri for more details. 194 * 195 * To reiterate, the above processing is performed once per second to recompute 196 * the runnable thread user-mode priorities. 197 * 198 * 3) The final major component in the priority calculation is the tick 199 * processing which occurs on a thread that is running when the clock 200 * calls fss_tick. 201 * 202 * A thread can run continuously in user-land (compute-bound) for the 203 * fss_quantum (see "dispadmin -c FSS -g" for the configurable properties). 204 * The fss_quantum defaults to 11 (i.e. 11 ticks). 205 * 206 * Once the quantum has been consumed, the thread will call fss_newpri to 207 * recompute its umdpri priority, as described above in (2b). 
Threads that 208 * were TS_ONPROC at the one second interval when runnable thread priorities 209 * were recalculated will have their umdpri priority recalculated when their 210 * quantum expires. 211 * 212 * To ensure that runnable threads within a project see the expected 213 * round-robin behavior, there is a special case in fss_newpri for a thread 214 * that has run for its quanta within the one second update interval. See 215 * the handling for the quanta_up parameter within fss_newpri. 216 * 217 * Also of interest, the fss_tick code increments the project's tick value 218 * using the fss_nice_tick array entry for the thread's nice value. The idea 219 * behind the fss_nice_tick array is that the cost of a tick is lower at 220 * positive nice values (so that it doesn't increase the project's usage 221 * as much as normal) with a 50% drop at the maximum level and a 50% 222 * increase at the minimum level. See (1b). The fss_nice_tick array is 223 * initialized in fss_init using the following formula: 224 * 225 * FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) - i) 226 * -------------------------------------------------- 227 * FSS_NICE_RANGE 228 * 229 * - FSS_TICK_COST is 1000, the tick cost for threads with nice level 0 230 * 231 * FSS Data Structures: 232 * 233 * fsszone 234 * ----- ----- 235 * ----- | | | | 236 * | |-------->| |<------->| |<---->... 237 * | | ----- ----- 238 * | | ^ ^ ^ 239 * | |--- | \ \ 240 * ----- | | \ \ 241 * fsspset | | \ \ 242 * | | \ \ 243 * | ----- ----- ----- 244 * -->| |<--->| |<--->| | 245 * | | | | | | 246 * ----- ----- ----- 247 * fssproj 248 * 249 * That is, fsspsets contain a list of fsszone's that are currently active in 250 * the pset, and a list of fssproj's, corresponding to projects with runnable 251 * threads on the pset. fssproj's in turn point to the fsszone which they 252 * are a member of. 253 * 254 * An fssproj_t is removed when there are no threads in it. 
255 * 256 * An fsszone_t is removed when there are no projects with threads in it. 257 */ 258 259 static pri_t fss_init(id_t, int, classfuncs_t **); 260 261 static struct sclass fss = { 262 "FSS", 263 fss_init, 264 0 265 }; 266 267 extern struct mod_ops mod_schedops; 268 269 /* 270 * Module linkage information for the kernel. 271 */ 272 static struct modlsched modlsched = { 273 &mod_schedops, "fair share scheduling class", &fss 274 }; 275 276 static struct modlinkage modlinkage = { 277 MODREV_1, (void *)&modlsched, NULL 278 }; 279 280 #define FSS_MAXUPRI 60 281 282 /* 283 * The fssproc_t structures are kept in an array of circular doubly linked 284 * lists. A hash on the thread pointer is used to determine which list each 285 * thread should be placed in. Each list has a dummy "head" which is never 286 * removed, so the list is never empty. fss_update traverses these lists to 287 * update the priorities of threads that have been waiting on the run queue. 288 */ 289 #define FSS_LISTS 16 /* number of lists, must be power of 2 */ 290 #define FSS_LIST_HASH(t) (((uintptr_t)(t) >> 9) & (FSS_LISTS - 1)) 291 #define FSS_LIST_NEXT(i) (((i) + 1) & (FSS_LISTS - 1)) 292 293 #define FSS_LIST_INSERT(fssproc) \ 294 { \ 295 int index = FSS_LIST_HASH(fssproc->fss_tp); \ 296 kmutex_t *lockp = &fss_listlock[index]; \ 297 fssproc_t *headp = &fss_listhead[index]; \ 298 mutex_enter(lockp); \ 299 fssproc->fss_next = headp->fss_next; \ 300 fssproc->fss_prev = headp; \ 301 headp->fss_next->fss_prev = fssproc; \ 302 headp->fss_next = fssproc; \ 303 mutex_exit(lockp); \ 304 } 305 306 #define FSS_LIST_DELETE(fssproc) \ 307 { \ 308 int index = FSS_LIST_HASH(fssproc->fss_tp); \ 309 kmutex_t *lockp = &fss_listlock[index]; \ 310 mutex_enter(lockp); \ 311 fssproc->fss_prev->fss_next = fssproc->fss_next; \ 312 fssproc->fss_next->fss_prev = fssproc->fss_prev; \ 313 mutex_exit(lockp); \ 314 } 315 316 #define FSS_TICK_COST 1000 /* tick cost for threads with nice level = 0 */ 317 318 /* 319 * Decay 
rate percentages are based on n/128 rather than n/100 so that 320 * calculations can avoid having to do an integer divide by 100 (divide 321 * by FSS_DECAY_BASE == 128 optimizes to an arithmetic shift). 322 * 323 * FSS_DECAY_MIN = 83/128 ~= 65% 324 * FSS_DECAY_MAX = 108/128 ~= 85% 325 * FSS_DECAY_USG = 96/128 ~= 75% 326 */ 327 #define FSS_DECAY_MIN 83 /* fsspri decay pct for threads w/ nice -20 */ 328 #define FSS_DECAY_MAX 108 /* fsspri decay pct for threads w/ nice +19 */ 329 #define FSS_DECAY_USG 96 /* fssusage decay pct for projects */ 330 #define FSS_DECAY_BASE 128 /* base for decay percentages above */ 331 332 #define FSS_NICE_MIN 0 333 #define FSS_NICE_MAX (2 * NZERO - 1) 334 #define FSS_NICE_RANGE (FSS_NICE_MAX - FSS_NICE_MIN + 1) 335 336 static int fss_nice_tick[FSS_NICE_RANGE]; 337 static int fss_nice_decay[FSS_NICE_RANGE]; 338 339 static pri_t fss_maxupri = FSS_MAXUPRI; /* maximum FSS user priority */ 340 static pri_t fss_maxumdpri; /* maximum user mode fss priority */ 341 static pri_t fss_maxglobpri; /* maximum global priority used by fss class */ 342 static pri_t fss_minglobpri; /* minimum global priority */ 343 344 static fssproc_t fss_listhead[FSS_LISTS]; 345 static kmutex_t fss_listlock[FSS_LISTS]; 346 347 static fsspset_t *fsspsets; 348 static kmutex_t fsspsets_lock; /* protects fsspsets */ 349 350 static id_t fss_cid; 351 352 static int fss_quantum = 11; 353 354 static void fss_newpri(fssproc_t *, boolean_t); 355 static void fss_update(void *); 356 static int fss_update_list(int); 357 static void fss_change_priority(kthread_t *, fssproc_t *); 358 359 static int fss_admin(caddr_t, cred_t *); 360 static int fss_getclinfo(void *); 361 static int fss_parmsin(void *); 362 static int fss_parmsout(void *, pc_vaparms_t *); 363 static int fss_vaparmsin(void *, pc_vaparms_t *); 364 static int fss_vaparmsout(void *, pc_vaparms_t *); 365 static int fss_getclpri(pcpri_t *); 366 static int fss_alloc(void **, int); 367 static void fss_free(void *); 368 369 static 
int fss_enterclass(kthread_t *, id_t, void *, cred_t *, void *); 370 static void fss_exitclass(void *); 371 static int fss_canexit(kthread_t *, cred_t *); 372 static int fss_fork(kthread_t *, kthread_t *, void *); 373 static void fss_forkret(kthread_t *, kthread_t *); 374 static void fss_parmsget(kthread_t *, void *); 375 static int fss_parmsset(kthread_t *, void *, id_t, cred_t *); 376 static void fss_stop(kthread_t *, int, int); 377 static void fss_exit(kthread_t *); 378 static void fss_active(kthread_t *); 379 static void fss_inactive(kthread_t *); 380 static void fss_trapret(kthread_t *); 381 static void fss_preempt(kthread_t *); 382 static void fss_setrun(kthread_t *); 383 static void fss_sleep(kthread_t *); 384 static void fss_tick(kthread_t *); 385 static void fss_wakeup(kthread_t *); 386 static int fss_donice(kthread_t *, cred_t *, int, int *); 387 static int fss_doprio(kthread_t *, cred_t *, int, int *); 388 static pri_t fss_globpri(kthread_t *); 389 static void fss_yield(kthread_t *); 390 static void fss_nullsys(); 391 392 static struct classfuncs fss_classfuncs = { 393 /* class functions */ 394 fss_admin, 395 fss_getclinfo, 396 fss_parmsin, 397 fss_parmsout, 398 fss_vaparmsin, 399 fss_vaparmsout, 400 fss_getclpri, 401 fss_alloc, 402 fss_free, 403 404 /* thread functions */ 405 fss_enterclass, 406 fss_exitclass, 407 fss_canexit, 408 fss_fork, 409 fss_forkret, 410 fss_parmsget, 411 fss_parmsset, 412 fss_stop, 413 fss_exit, 414 fss_active, 415 fss_inactive, 416 fss_trapret, 417 fss_preempt, 418 fss_setrun, 419 fss_sleep, 420 fss_tick, 421 fss_wakeup, 422 fss_donice, 423 fss_globpri, 424 fss_nullsys, /* set_process_group */ 425 fss_yield, 426 fss_doprio, 427 }; 428 429 int 430 _init() 431 { 432 return (mod_install(&modlinkage)); 433 } 434 435 int 436 _fini() 437 { 438 return (EBUSY); 439 } 440 441 int 442 _info(struct modinfo *modinfop) 443 { 444 return (mod_info(&modlinkage, modinfop)); 445 } 446 447 /*ARGSUSED*/ 448 static int 449 
fss_project_walker(kproject_t *kpj, void *buf) 450 { 451 return (0); 452 } 453 454 void * 455 fss_allocbuf(int op, int type) 456 { 457 fssbuf_t *fssbuf; 458 void **fsslist; 459 int cnt; 460 int i; 461 size_t size; 462 463 ASSERT(op == FSS_NPSET_BUF || op == FSS_NPROJ_BUF || op == FSS_ONE_BUF); 464 ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE); 465 ASSERT(MUTEX_HELD(&cpu_lock)); 466 467 fssbuf = kmem_zalloc(sizeof (fssbuf_t), KM_SLEEP); 468 switch (op) { 469 case FSS_NPSET_BUF: 470 cnt = cpupart_list(NULL, 0, CP_NONEMPTY); 471 break; 472 case FSS_NPROJ_BUF: 473 cnt = project_walk_all(ALL_ZONES, fss_project_walker, NULL); 474 break; 475 case FSS_ONE_BUF: 476 cnt = 1; 477 break; 478 } 479 480 switch (type) { 481 case FSS_ALLOC_PROJ: 482 size = sizeof (fssproj_t); 483 break; 484 case FSS_ALLOC_ZONE: 485 size = sizeof (fsszone_t); 486 break; 487 } 488 fsslist = kmem_zalloc(cnt * sizeof (void *), KM_SLEEP); 489 fssbuf->fssb_size = cnt; 490 fssbuf->fssb_list = fsslist; 491 for (i = 0; i < cnt; i++) 492 fsslist[i] = kmem_zalloc(size, KM_SLEEP); 493 return (fssbuf); 494 } 495 496 void 497 fss_freebuf(fssbuf_t *fssbuf, int type) 498 { 499 void **fsslist; 500 int i; 501 size_t size; 502 503 ASSERT(fssbuf != NULL); 504 ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE); 505 fsslist = fssbuf->fssb_list; 506 507 switch (type) { 508 case FSS_ALLOC_PROJ: 509 size = sizeof (fssproj_t); 510 break; 511 case FSS_ALLOC_ZONE: 512 size = sizeof (fsszone_t); 513 break; 514 } 515 516 for (i = 0; i < fssbuf->fssb_size; i++) { 517 if (fsslist[i] != NULL) 518 kmem_free(fsslist[i], size); 519 } 520 kmem_free(fsslist, sizeof (void *) * fssbuf->fssb_size); 521 kmem_free(fssbuf, sizeof (fssbuf_t)); 522 } 523 524 static fsspset_t * 525 fss_find_fsspset(cpupart_t *cpupart) 526 { 527 int i; 528 fsspset_t *fsspset = NULL; 529 int found = 0; 530 531 ASSERT(cpupart != NULL); 532 ASSERT(MUTEX_HELD(&fsspsets_lock)); 533 534 /* 535 * Search for the cpupart pointer in the array of 
fsspsets. 536 */ 537 for (i = 0; i < max_ncpus; i++) { 538 fsspset = &fsspsets[i]; 539 if (fsspset->fssps_cpupart == cpupart) { 540 ASSERT(fsspset->fssps_nproj > 0); 541 found = 1; 542 break; 543 } 544 } 545 if (found == 0) { 546 /* 547 * If we didn't find anything, then use the first 548 * available slot in the fsspsets array. 549 */ 550 for (i = 0; i < max_ncpus; i++) { 551 fsspset = &fsspsets[i]; 552 if (fsspset->fssps_cpupart == NULL) { 553 ASSERT(fsspset->fssps_nproj == 0); 554 found = 1; 555 break; 556 } 557 } 558 fsspset->fssps_cpupart = cpupart; 559 } 560 ASSERT(found == 1); 561 return (fsspset); 562 } 563 564 static void 565 fss_del_fsspset(fsspset_t *fsspset) 566 { 567 ASSERT(MUTEX_HELD(&fsspsets_lock)); 568 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 569 ASSERT(fsspset->fssps_nproj == 0); 570 ASSERT(fsspset->fssps_list == NULL); 571 ASSERT(fsspset->fssps_zones == NULL); 572 fsspset->fssps_cpupart = NULL; 573 fsspset->fssps_maxfsspri = 0; 574 fsspset->fssps_shares = 0; 575 } 576 577 /* 578 * The following routine returns a pointer to the fsszone structure which 579 * belongs to zone "zone" and cpu partition fsspset, if such structure exists. 580 */ 581 static fsszone_t * 582 fss_find_fsszone(fsspset_t *fsspset, zone_t *zone) 583 { 584 fsszone_t *fsszone; 585 586 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 587 588 if (fsspset->fssps_list != NULL) { 589 /* 590 * There are projects/zones active on this cpu partition 591 * already. Try to find our zone among them. 592 */ 593 fsszone = fsspset->fssps_zones; 594 do { 595 if (fsszone->fssz_zone == zone) { 596 return (fsszone); 597 } 598 fsszone = fsszone->fssz_next; 599 } while (fsszone != fsspset->fssps_zones); 600 } 601 return (NULL); 602 } 603 604 /* 605 * The following routine links new fsszone structure into doubly linked list of 606 * zones active on the specified cpu partition. 
607 */ 608 static void 609 fss_insert_fsszone(fsspset_t *fsspset, zone_t *zone, fsszone_t *fsszone) 610 { 611 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 612 613 fsszone->fssz_zone = zone; 614 fsszone->fssz_rshares = zone->zone_shares; 615 616 if (fsspset->fssps_zones == NULL) { 617 /* 618 * This will be the first fsszone for this fsspset 619 */ 620 fsszone->fssz_next = fsszone->fssz_prev = fsszone; 621 fsspset->fssps_zones = fsszone; 622 } else { 623 /* 624 * Insert this fsszone to the doubly linked list. 625 */ 626 fsszone_t *fssz_head = fsspset->fssps_zones; 627 628 fsszone->fssz_next = fssz_head; 629 fsszone->fssz_prev = fssz_head->fssz_prev; 630 fssz_head->fssz_prev->fssz_next = fsszone; 631 fssz_head->fssz_prev = fsszone; 632 fsspset->fssps_zones = fsszone; 633 } 634 } 635 636 /* 637 * The following routine removes a single fsszone structure from the doubly 638 * linked list of zones active on the specified cpu partition. Note that 639 * global fsspsets_lock must be held in case this fsszone structure is the last 640 * on the above mentioned list. Also note that the fsszone structure is not 641 * freed here, it is the responsibility of the caller to call kmem_free for it. 642 */ 643 static void 644 fss_remove_fsszone(fsspset_t *fsspset, fsszone_t *fsszone) 645 { 646 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 647 ASSERT(fsszone->fssz_nproj == 0); 648 ASSERT(fsszone->fssz_shares == 0); 649 ASSERT(fsszone->fssz_runnable == 0); 650 651 if (fsszone->fssz_next != fsszone) { 652 /* 653 * This is not the last zone in the list. 654 */ 655 fsszone->fssz_prev->fssz_next = fsszone->fssz_next; 656 fsszone->fssz_next->fssz_prev = fsszone->fssz_prev; 657 if (fsspset->fssps_zones == fsszone) 658 fsspset->fssps_zones = fsszone->fssz_next; 659 } else { 660 /* 661 * This was the last zone active in this cpu partition. 
662 */ 663 fsspset->fssps_zones = NULL; 664 } 665 } 666 667 /* 668 * The following routine returns a pointer to the fssproj structure 669 * which belongs to project kpj and cpu partition fsspset, if such structure 670 * exists. 671 */ 672 static fssproj_t * 673 fss_find_fssproj(fsspset_t *fsspset, kproject_t *kpj) 674 { 675 fssproj_t *fssproj; 676 677 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 678 679 if (fsspset->fssps_list != NULL) { 680 /* 681 * There are projects running on this cpu partition already. 682 * Try to find our project among them. 683 */ 684 fssproj = fsspset->fssps_list; 685 do { 686 if (fssproj->fssp_proj == kpj) { 687 ASSERT(fssproj->fssp_pset == fsspset); 688 return (fssproj); 689 } 690 fssproj = fssproj->fssp_next; 691 } while (fssproj != fsspset->fssps_list); 692 } 693 return (NULL); 694 } 695 696 /* 697 * The following routine links new fssproj structure into doubly linked list 698 * of projects running on the specified cpu partition. 699 */ 700 static void 701 fss_insert_fssproj(fsspset_t *fsspset, kproject_t *kpj, fsszone_t *fsszone, 702 fssproj_t *fssproj) 703 { 704 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 705 706 fssproj->fssp_pset = fsspset; 707 fssproj->fssp_proj = kpj; 708 fssproj->fssp_shares = kpj->kpj_shares; 709 710 fsspset->fssps_nproj++; 711 712 if (fsspset->fssps_list == NULL) { 713 /* 714 * This will be the first fssproj for this fsspset 715 */ 716 fssproj->fssp_next = fssproj->fssp_prev = fssproj; 717 fsspset->fssps_list = fssproj; 718 } else { 719 /* 720 * Insert this fssproj to the doubly linked list. 
721 */ 722 fssproj_t *fssp_head = fsspset->fssps_list; 723 724 fssproj->fssp_next = fssp_head; 725 fssproj->fssp_prev = fssp_head->fssp_prev; 726 fssp_head->fssp_prev->fssp_next = fssproj; 727 fssp_head->fssp_prev = fssproj; 728 fsspset->fssps_list = fssproj; 729 } 730 fssproj->fssp_fsszone = fsszone; 731 fsszone->fssz_nproj++; 732 ASSERT(fsszone->fssz_nproj != 0); 733 } 734 735 /* 736 * The following routine removes a single fssproj structure from the doubly 737 * linked list of projects running on the specified cpu partition. Note that 738 * global fsspsets_lock must be held in case if this fssproj structure is the 739 * last on the above mentioned list. Also note that the fssproj structure is 740 * not freed here, it is the responsibility of the caller to call kmem_free 741 * for it. 742 */ 743 static void 744 fss_remove_fssproj(fsspset_t *fsspset, fssproj_t *fssproj) 745 { 746 fsszone_t *fsszone; 747 748 ASSERT(MUTEX_HELD(&fsspsets_lock)); 749 ASSERT(MUTEX_HELD(&fsspset->fssps_lock)); 750 ASSERT(fssproj->fssp_runnable == 0); 751 752 fsspset->fssps_nproj--; 753 754 fsszone = fssproj->fssp_fsszone; 755 fsszone->fssz_nproj--; 756 757 if (fssproj->fssp_next != fssproj) { 758 /* 759 * This is not the last part in the list. 760 */ 761 fssproj->fssp_prev->fssp_next = fssproj->fssp_next; 762 fssproj->fssp_next->fssp_prev = fssproj->fssp_prev; 763 if (fsspset->fssps_list == fssproj) 764 fsspset->fssps_list = fssproj->fssp_next; 765 if (fsszone->fssz_nproj == 0) 766 fss_remove_fsszone(fsspset, fsszone); 767 } else { 768 /* 769 * This was the last project part running 770 * at this cpu partition. 
771 */ 772 fsspset->fssps_list = NULL; 773 ASSERT(fsspset->fssps_nproj == 0); 774 ASSERT(fsszone->fssz_nproj == 0); 775 fss_remove_fsszone(fsspset, fsszone); 776 fss_del_fsspset(fsspset); 777 } 778 } 779 780 static void 781 fss_inactive(kthread_t *t) 782 { 783 fssproc_t *fssproc; 784 fssproj_t *fssproj; 785 fsspset_t *fsspset; 786 fsszone_t *fsszone; 787 788 ASSERT(THREAD_LOCK_HELD(t)); 789 fssproc = FSSPROC(t); 790 fssproj = FSSPROC2FSSPROJ(fssproc); 791 if (fssproj == NULL) /* if this thread already exited */ 792 return; 793 fsspset = FSSPROJ2FSSPSET(fssproj); 794 fsszone = fssproj->fssp_fsszone; 795 disp_lock_enter_high(&fsspset->fssps_displock); 796 ASSERT(fssproj->fssp_runnable > 0); 797 if (--fssproj->fssp_runnable == 0) { 798 fsszone->fssz_shares -= fssproj->fssp_shares; 799 if (--fsszone->fssz_runnable == 0) 800 fsspset->fssps_shares -= fsszone->fssz_rshares; 801 } 802 ASSERT(fssproc->fss_runnable == 1); 803 fssproc->fss_runnable = 0; 804 disp_lock_exit_high(&fsspset->fssps_displock); 805 } 806 807 static void 808 fss_active(kthread_t *t) 809 { 810 fssproc_t *fssproc; 811 fssproj_t *fssproj; 812 fsspset_t *fsspset; 813 fsszone_t *fsszone; 814 815 ASSERT(THREAD_LOCK_HELD(t)); 816 fssproc = FSSPROC(t); 817 fssproj = FSSPROC2FSSPROJ(fssproc); 818 if (fssproj == NULL) /* if this thread already exited */ 819 return; 820 fsspset = FSSPROJ2FSSPSET(fssproj); 821 fsszone = fssproj->fssp_fsszone; 822 disp_lock_enter_high(&fsspset->fssps_displock); 823 if (++fssproj->fssp_runnable == 1) { 824 fsszone->fssz_shares += fssproj->fssp_shares; 825 if (++fsszone->fssz_runnable == 1) 826 fsspset->fssps_shares += fsszone->fssz_rshares; 827 } 828 ASSERT(fssproc->fss_runnable == 0); 829 fssproc->fss_runnable = 1; 830 disp_lock_exit_high(&fsspset->fssps_displock); 831 } 832 833 /* 834 * Fair share scheduler initialization. Called by dispinit() at boot time. 835 * We can ignore clparmsz argument since we know that the smallest possible 836 * parameter buffer is big enough for us. 
837 */ 838 /*ARGSUSED*/ 839 static pri_t 840 fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp) 841 { 842 int i; 843 844 ASSERT(MUTEX_HELD(&cpu_lock)); 845 846 fss_cid = cid; 847 fss_maxumdpri = minclsyspri - 1; 848 fss_maxglobpri = minclsyspri; 849 fss_minglobpri = 0; 850 fsspsets = kmem_zalloc(sizeof (fsspset_t) * max_ncpus, KM_SLEEP); 851 852 /* 853 * Initialize the fssproc hash table. 854 */ 855 for (i = 0; i < FSS_LISTS; i++) 856 fss_listhead[i].fss_next = fss_listhead[i].fss_prev = 857 &fss_listhead[i]; 858 859 *clfuncspp = &fss_classfuncs; 860 861 /* 862 * Fill in fss_nice_tick and fss_nice_decay arrays: 863 * The cost of a tick is lower at positive nice values (so that it 864 * will not increase its project's usage as much as normal) with 50% 865 * drop at the maximum level and 50% increase at the minimum level. 866 * The fsspri decay is slower at positive nice values. fsspri values 867 * of processes with negative nice levels must decay faster to receive 868 * time slices more frequently than normal. 869 */ 870 for (i = 0; i < FSS_NICE_RANGE; i++) { 871 fss_nice_tick[i] = (FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) 872 - i)) / FSS_NICE_RANGE; 873 fss_nice_decay[i] = FSS_DECAY_MIN + 874 ((FSS_DECAY_MAX - FSS_DECAY_MIN) * i) / 875 (FSS_NICE_RANGE - 1); 876 } 877 878 return (fss_maxglobpri); 879 } 880 881 /* 882 * Calculate the new fss_umdpri based on the usage, the normalized share usage 883 * and the number of active threads. Reset the tick counter for this thread. 884 * 885 * When calculating the new priority using the standard formula we can hit 886 * a scenario where we don't have good round-robin behavior. This would be 887 * most commonly seen when there is a zone with lots of runnable threads. 
888 * In the bad scenario we will see the following behavior when using the 889 * standard formula and these conditions: 890 * 891 * - there are multiple runnable threads in the zone (project) 892 * - the fssps_maxfsspri is a very large value 893 * - (we also know all of these threads will use the project's 894 * fssp_shusage) 895 * 896 * Under these conditions, a thread with a low fss_fsspri value is chosen 897 * to run and the thread gets a high fss_umdpri. This thread can run for 898 * its full quanta (fss_timeleft) at which time fss_newpri is called to 899 * calculate the thread's new priority. 900 * 901 * In this case, because the newly calculated fsspri value is much smaller 902 * (orders of magnitude) than the fssps_maxfsspri value, if we used the 903 * standard formula the thread will still get a high fss_umdpri value and 904 * will run again for another quanta, even though there are other runnable 905 * threads in the project. 906 * 907 * For a thread that is runnable for a long time, the thread can continue 908 * to run for many quanta (totaling many seconds) before the thread's fsspri 909 * exceeds the fssps_maxfsspri and the thread's fss_umdpri is reset back 910 * down to 1. This behavior also keeps the fssps_maxfsspri at a high value, 911 * so that the next runnable thread might repeat this cycle. 912 * 913 * This leads to the case where we don't have round-robin behavior at quanta 914 * granularity, but instead, runnable threads within the project only run 915 * at several second intervals. 916 * 917 * To prevent this scenario from occurring, when a thread has consumed its 918 * quanta and there are multiple runnable threads in the project, we 919 * immediately cause the thread to hit fssps_maxfsspri so that it gets 920 * reset back to 1 and another runnable thread in the project can run. 
921 */ 922 static void 923 fss_newpri(fssproc_t *fssproc, boolean_t quanta_up) 924 { 925 kthread_t *tp; 926 fssproj_t *fssproj; 927 fsspset_t *fsspset; 928 fsszone_t *fsszone; 929 fsspri_t fsspri, maxfsspri; 930 uint32_t n_runnable; 931 pri_t invpri; 932 uint32_t ticks; 933 934 tp = fssproc->fss_tp; 935 ASSERT(tp != NULL); 936 937 if (tp->t_cid != fss_cid) 938 return; 939 940 ASSERT(THREAD_LOCK_HELD(tp)); 941 942 fssproj = FSSPROC2FSSPROJ(fssproc); 943 fsszone = FSSPROJ2FSSZONE(fssproj); 944 if (fssproj == NULL) 945 /* 946 * No need to change priority of exited threads. 947 */ 948 return; 949 950 fsspset = FSSPROJ2FSSPSET(fssproj); 951 disp_lock_enter_high(&fsspset->fssps_displock); 952 953 ticks = fssproc->fss_ticks; 954 fssproc->fss_ticks = 0; 955 956 if (fssproj->fssp_shares == 0 || fsszone->fssz_rshares == 0) { 957 /* 958 * Special case: threads with no shares. 959 */ 960 fssproc->fss_umdpri = fss_minglobpri; 961 disp_lock_exit_high(&fsspset->fssps_displock); 962 return; 963 } 964 965 maxfsspri = fsspset->fssps_maxfsspri; 966 n_runnable = fssproj->fssp_runnable; 967 968 if (quanta_up && n_runnable > 1) { 969 fsspri = maxfsspri; 970 } else { 971 /* 972 * fsspri += fssp_shusage * nrunnable * ticks 973 * If all three values are non-0, this typically calculates to 974 * a large number (sometimes > 1M, sometimes > 100B) due to 975 * fssp_shusage which can be > 1T. 976 */ 977 fsspri = fssproc->fss_fsspri; 978 fsspri += fssproj->fssp_shusage * n_runnable * ticks; 979 } 980 981 fssproc->fss_fsspri = fsspri; 982 983 /* 984 * fss_maxumdpri is normally 59, since FSS priorities are 0-59. 985 * If the previous calculation resulted in 0 (e.g. was 0 and added 0 986 * because ticks == 0), then instead of 0, we use the largest priority, 987 * which is still small in comparison to the large numbers we typically 988 * see. 
989 */ 990 if (fsspri < fss_maxumdpri) 991 fsspri = fss_maxumdpri; /* so that maxfsspri is != 0 */ 992 993 /* 994 * The general priority formula: 995 * 996 * (fsspri * umdprirange) 997 * pri = maxumdpri - ------------------------ 998 * maxfsspri 999 * 1000 * If this thread's fsspri is greater than the previous largest 1001 * fsspri, then record it as the new high and priority for this 1002 * thread will be one (the lowest priority assigned to a thread 1003 * that has non-zero shares). Because of this check, maxfsspri can 1004 * change as this function is called via the 1005 * fss_update -> fss_update_list -> fss_newpri code path to update 1006 * all runnable threads. See the code in fss_update for how we 1007 * mitigate this issue. 1008 * 1009 * Note that this formula cannot produce out of bounds priority 1010 * values (0-59); if it is changed, additional checks may need to be 1011 * added. 1012 */ 1013 if (fsspri >= maxfsspri) { 1014 fsspset->fssps_maxfsspri = fsspri; 1015 disp_lock_exit_high(&fsspset->fssps_displock); 1016 fssproc->fss_umdpri = 1; 1017 } else { 1018 disp_lock_exit_high(&fsspset->fssps_displock); 1019 invpri = (fsspri * (fss_maxumdpri - 1)) / maxfsspri; 1020 fssproc->fss_umdpri = fss_maxumdpri - invpri; 1021 } 1022 } 1023 1024 /* 1025 * Decays usages of all running projects, resets their tick counters and 1026 * calcluates the projects normalized share usage. Called once per second from 1027 * fss_update(). 1028 */ 1029 static void 1030 fss_decay_usage() 1031 { 1032 uint32_t zone_ext_shares, zone_int_shares; 1033 uint32_t kpj_shares, pset_shares; 1034 fsspset_t *fsspset; 1035 fssproj_t *fssproj; 1036 fsszone_t *fsszone; 1037 fsspri_t maxfsspri; 1038 int psetid; 1039 struct zone *zp; 1040 1041 mutex_enter(&fsspsets_lock); 1042 /* 1043 * Go through all active processor sets and decay usages of projects 1044 * running on them. 
1045 */ 1046 for (psetid = 0; psetid < max_ncpus; psetid++) { 1047 fsspset = &fsspsets[psetid]; 1048 mutex_enter(&fsspset->fssps_lock); 1049 1050 fsspset->fssps_gen++; 1051 1052 if (fsspset->fssps_cpupart == NULL || 1053 (fssproj = fsspset->fssps_list) == NULL) { 1054 mutex_exit(&fsspset->fssps_lock); 1055 continue; 1056 } 1057 1058 /* 1059 * Decay maxfsspri for this cpu partition with the 1060 * fastest possible decay rate. 1061 */ 1062 disp_lock_enter(&fsspset->fssps_displock); 1063 1064 pset_shares = fsspset->fssps_shares; 1065 1066 maxfsspri = (fsspset->fssps_maxfsspri * 1067 fss_nice_decay[NZERO]) / FSS_DECAY_BASE; 1068 if (maxfsspri < fss_maxumdpri) 1069 maxfsspri = fss_maxumdpri; 1070 fsspset->fssps_maxfsspri = maxfsspri; 1071 1072 do { 1073 fsszone = fssproj->fssp_fsszone; 1074 zp = fsszone->fssz_zone; 1075 1076 /* 1077 * Reset zone's FSS stats if they are from a 1078 * previous cycle. 1079 */ 1080 if (fsspset->fssps_gen != zp->zone_fss_gen) { 1081 zp->zone_fss_gen = fsspset->fssps_gen; 1082 zp->zone_run_ticks = 0; 1083 } 1084 1085 /* 1086 * Decay project usage, then add in this cycle's 1087 * nice tick value. 1088 */ 1089 fssproj->fssp_usage = 1090 (fssproj->fssp_usage * FSS_DECAY_USG) / 1091 FSS_DECAY_BASE + 1092 fssproj->fssp_ticks; 1093 1094 fssproj->fssp_ticks = 0; 1095 zp->zone_run_ticks += fssproj->fssp_tick_cnt; 1096 fssproj->fssp_tick_cnt = 0; 1097 1098 /* 1099 * Readjust the project's number of shares if it has 1100 * changed since we checked it last time. 1101 */ 1102 kpj_shares = fssproj->fssp_proj->kpj_shares; 1103 if (fssproj->fssp_shares != kpj_shares) { 1104 if (fssproj->fssp_runnable != 0) { 1105 fsszone->fssz_shares -= 1106 fssproj->fssp_shares; 1107 fsszone->fssz_shares += kpj_shares; 1108 } 1109 fssproj->fssp_shares = kpj_shares; 1110 } 1111 1112 /* 1113 * Readjust the zone's number of shares if it 1114 * has changed since we checked it last time. 
1115 */ 1116 zone_ext_shares = zp->zone_shares; 1117 if (fsszone->fssz_rshares != zone_ext_shares) { 1118 if (fsszone->fssz_runnable != 0) { 1119 fsspset->fssps_shares -= 1120 fsszone->fssz_rshares; 1121 fsspset->fssps_shares += 1122 zone_ext_shares; 1123 pset_shares = fsspset->fssps_shares; 1124 } 1125 fsszone->fssz_rshares = zone_ext_shares; 1126 } 1127 zone_int_shares = fsszone->fssz_shares; 1128 1129 /* 1130 * If anything is runnable in the project, track the 1131 * overall project share percent for monitoring useage. 1132 */ 1133 if (fssproj->fssp_runnable > 0) { 1134 uint32_t zone_shr_pct; 1135 uint32_t int_shr_pct; 1136 1137 /* 1138 * Times 1000 to get tenths of a percent 1139 * 1140 * zone_ext_shares 1141 * zone_shr_pct = --------------- 1142 * pset_shares 1143 * 1144 * kpj_shares 1145 * int_shr_pct = --------------- 1146 * zone_int_shares 1147 */ 1148 if (pset_shares == 0 || zone_int_shares == 0) { 1149 fssproj->fssp_shr_pct = 0; 1150 } else { 1151 zone_shr_pct = 1152 (zone_ext_shares * 1000) / 1153 pset_shares; 1154 int_shr_pct = (kpj_shares * 1000) / 1155 zone_int_shares; 1156 fssproj->fssp_shr_pct = 1157 (zone_shr_pct * int_shr_pct) / 1158 1000; 1159 } 1160 } else { 1161 DTRACE_PROBE1(fss__prj__norun, fssproj_t *, 1162 fssproj); 1163 } 1164 1165 /* 1166 * Calculate fssp_shusage value to be used 1167 * for fsspri increments for the next second. 1168 */ 1169 if (kpj_shares == 0 || zone_ext_shares == 0) { 1170 fssproj->fssp_shusage = 0; 1171 } else if (FSSPROJ2KPROJ(fssproj) == proj0p) { 1172 uint32_t zone_shr_pct; 1173 1174 /* 1175 * Project 0 in the global zone has 50% 1176 * of its zone. See calculation above for 1177 * the zone's share percent. 
1178 */ 1179 if (pset_shares == 0) 1180 zone_shr_pct = 1000; 1181 else 1182 zone_shr_pct = 1183 (zone_ext_shares * 1000) / 1184 pset_shares; 1185 1186 fssproj->fssp_shr_pct = zone_shr_pct / 2; 1187 1188 fssproj->fssp_shusage = (fssproj->fssp_usage * 1189 zone_int_shares * zone_int_shares) / 1190 (zone_ext_shares * zone_ext_shares); 1191 } else { 1192 /* 1193 * Thread's priority is based on its project's 1194 * normalized usage (shusage) value which gets 1195 * calculated this way: 1196 * 1197 * pset_shares^2 zone_int_shares^2 1198 * usage * ------------- * ------------------ 1199 * kpj_shares^2 zone_ext_shares^2 1200 * 1201 * Where zone_int_shares is the sum of shares 1202 * of all active projects within the zone (and 1203 * the pset), and zone_ext_shares is the number 1204 * of zone shares (ie, zone.cpu-shares). 1205 * 1206 * If there is only one zone active on the pset 1207 * the above reduces to: 1208 * 1209 * zone_int_shares^2 1210 * shusage = usage * --------------------- 1211 * kpj_shares^2 1212 * 1213 * If there's only one project active in the 1214 * zone this formula reduces to: 1215 * 1216 * pset_shares^2 1217 * shusage = usage * ---------------------- 1218 * zone_ext_shares^2 1219 * 1220 * shusage is one input to calculating fss_pri 1221 * in fss_newpri(). Larger values tend toward 1222 * lower priorities for processes in the proj. 
1223 */ 1224 fssproj->fssp_shusage = fssproj->fssp_usage * 1225 pset_shares * zone_int_shares; 1226 fssproj->fssp_shusage /= 1227 kpj_shares * zone_ext_shares; 1228 fssproj->fssp_shusage *= 1229 pset_shares * zone_int_shares; 1230 fssproj->fssp_shusage /= 1231 kpj_shares * zone_ext_shares; 1232 } 1233 fssproj = fssproj->fssp_next; 1234 } while (fssproj != fsspset->fssps_list); 1235 1236 disp_lock_exit(&fsspset->fssps_displock); 1237 mutex_exit(&fsspset->fssps_lock); 1238 } 1239 mutex_exit(&fsspsets_lock); 1240 } 1241 1242 static void 1243 fss_change_priority(kthread_t *t, fssproc_t *fssproc) 1244 { 1245 pri_t new_pri; 1246 1247 ASSERT(THREAD_LOCK_HELD(t)); 1248 new_pri = fssproc->fss_umdpri; 1249 ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); 1250 1251 t->t_cpri = fssproc->fss_upri; 1252 fssproc->fss_flags &= ~FSSRESTORE; 1253 if (t == curthread || t->t_state == TS_ONPROC) { 1254 /* 1255 * curthread is always onproc 1256 */ 1257 cpu_t *cp = t->t_disp_queue->disp_cpu; 1258 THREAD_CHANGE_PRI(t, new_pri); 1259 if (t == cp->cpu_dispthread) 1260 cp->cpu_dispatch_pri = DISP_PRIO(t); 1261 if (DISP_MUST_SURRENDER(t)) { 1262 fssproc->fss_flags |= FSSBACKQ; 1263 cpu_surrender(t); 1264 } else { 1265 fssproc->fss_timeleft = fss_quantum; 1266 } 1267 } else { 1268 /* 1269 * When the priority of a thread is changed, it may be 1270 * necessary to adjust its position on a sleep queue or 1271 * dispatch queue. The function thread_change_pri accomplishes 1272 * this. 1273 */ 1274 if (thread_change_pri(t, new_pri, 0)) { 1275 /* 1276 * The thread was on a run queue. 1277 */ 1278 fssproc->fss_timeleft = fss_quantum; 1279 } else { 1280 fssproc->fss_flags |= FSSBACKQ; 1281 } 1282 } 1283 } 1284 1285 /* 1286 * Update priorities of all fair-sharing threads that are currently runnable 1287 * at a user mode priority based on the number of shares and current usage. 1288 * Called once per second via timeout which we reset here. 
1289 * 1290 * There are several lists of fair-sharing threads broken up by a hash on the 1291 * thread pointer. Each list has its own lock. This avoids blocking all 1292 * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs. 1293 * fss_update traverses each list in turn. 1294 * 1295 * Each time we're run (once/second) we may start at the next list and iterate 1296 * through all of the lists. By starting with a different list, we mitigate any 1297 * effects we would see updating the fssps_maxfsspri value in fss_newpri. 1298 */ 1299 static void 1300 fss_update(void *arg) 1301 { 1302 int i; 1303 int new_marker = -1; 1304 static int fss_update_marker; 1305 1306 /* 1307 * Decay and update usages for all projects. 1308 */ 1309 fss_decay_usage(); 1310 1311 /* 1312 * Start with the fss_update_marker list, then do the rest. 1313 */ 1314 i = fss_update_marker; 1315 1316 /* 1317 * Go around all threads, set new priorities and decay 1318 * per-thread CPU usages. 1319 */ 1320 do { 1321 /* 1322 * If this is the first list after the current marker to have 1323 * threads with priority updates, advance the marker to this 1324 * list for the next time fss_update runs. 1325 */ 1326 if (fss_update_list(i) && 1327 new_marker == -1 && i != fss_update_marker) 1328 new_marker = i; 1329 } while ((i = FSS_LIST_NEXT(i)) != fss_update_marker); 1330 1331 /* 1332 * Advance marker for the next fss_update call 1333 */ 1334 if (new_marker != -1) 1335 fss_update_marker = new_marker; 1336 1337 (void) timeout(fss_update, arg, hz); 1338 } 1339 1340 /* 1341 * Updates priority for a list of threads. Returns 1 if the priority of one 1342 * of the threads was actually updated, 0 if none were for various reasons 1343 * (thread is no longer in the FSS class, is not runnable, has the preemption 1344 * control no-preempt bit set, etc.) 
1345 */ 1346 static int 1347 fss_update_list(int i) 1348 { 1349 fssproc_t *fssproc; 1350 fssproj_t *fssproj; 1351 fsspri_t fsspri; 1352 pri_t fss_umdpri; 1353 kthread_t *t; 1354 int updated = 0; 1355 1356 mutex_enter(&fss_listlock[i]); 1357 for (fssproc = fss_listhead[i].fss_next; fssproc != &fss_listhead[i]; 1358 fssproc = fssproc->fss_next) { 1359 t = fssproc->fss_tp; 1360 /* 1361 * Lock the thread and verify the state. 1362 */ 1363 thread_lock(t); 1364 /* 1365 * Skip the thread if it is no longer in the FSS class or 1366 * is running with kernel mode priority. 1367 */ 1368 if (t->t_cid != fss_cid) 1369 goto next; 1370 if ((fssproc->fss_flags & FSSKPRI) != 0) 1371 goto next; 1372 1373 fssproj = FSSPROC2FSSPROJ(fssproc); 1374 if (fssproj == NULL) 1375 goto next; 1376 1377 if (fssproj->fssp_shares != 0) { 1378 /* 1379 * Decay fsspri value. 1380 */ 1381 fsspri = fssproc->fss_fsspri; 1382 fsspri = (fsspri * fss_nice_decay[fssproc->fss_nice]) / 1383 FSS_DECAY_BASE; 1384 fssproc->fss_fsspri = fsspri; 1385 } 1386 1387 if (t->t_schedctl && schedctl_get_nopreempt(t)) 1388 goto next; 1389 if (t->t_state != TS_RUN && t->t_state != TS_WAIT) { 1390 /* 1391 * Make next syscall/trap call fss_trapret 1392 */ 1393 t->t_trapret = 1; 1394 aston(t); 1395 if (t->t_state == TS_ONPROC) 1396 DTRACE_PROBE1(fss__onproc, fssproc_t *, 1397 fssproc); 1398 goto next; 1399 } 1400 fss_newpri(fssproc, B_FALSE); 1401 updated = 1; 1402 1403 fss_umdpri = fssproc->fss_umdpri; 1404 1405 /* 1406 * Only dequeue the thread if it needs to be moved; otherwise 1407 * it should just round-robin here. 
1408 */ 1409 if (t->t_pri != fss_umdpri) 1410 fss_change_priority(t, fssproc); 1411 next: 1412 thread_unlock(t); 1413 } 1414 mutex_exit(&fss_listlock[i]); 1415 return (updated); 1416 } 1417 1418 /*ARGSUSED*/ 1419 static int 1420 fss_admin(caddr_t uaddr, cred_t *reqpcredp) 1421 { 1422 fssadmin_t fssadmin; 1423 1424 if (copyin(uaddr, &fssadmin, sizeof (fssadmin_t))) 1425 return (EFAULT); 1426 1427 switch (fssadmin.fss_cmd) { 1428 case FSS_SETADMIN: 1429 if (secpolicy_dispadm(reqpcredp) != 0) 1430 return (EPERM); 1431 if (fssadmin.fss_quantum <= 0 || fssadmin.fss_quantum >= hz) 1432 return (EINVAL); 1433 fss_quantum = fssadmin.fss_quantum; 1434 break; 1435 case FSS_GETADMIN: 1436 fssadmin.fss_quantum = fss_quantum; 1437 if (copyout(&fssadmin, uaddr, sizeof (fssadmin_t))) 1438 return (EFAULT); 1439 break; 1440 default: 1441 return (EINVAL); 1442 } 1443 return (0); 1444 } 1445 1446 static int 1447 fss_getclinfo(void *infop) 1448 { 1449 fssinfo_t *fssinfo = (fssinfo_t *)infop; 1450 fssinfo->fss_maxupri = fss_maxupri; 1451 return (0); 1452 } 1453 1454 static int 1455 fss_parmsin(void *parmsp) 1456 { 1457 fssparms_t *fssparmsp = (fssparms_t *)parmsp; 1458 1459 /* 1460 * Check validity of parameters. 
1461 */ 1462 if ((fssparmsp->fss_uprilim > fss_maxupri || 1463 fssparmsp->fss_uprilim < -fss_maxupri) && 1464 fssparmsp->fss_uprilim != FSS_NOCHANGE) 1465 return (EINVAL); 1466 1467 if ((fssparmsp->fss_upri > fss_maxupri || 1468 fssparmsp->fss_upri < -fss_maxupri) && 1469 fssparmsp->fss_upri != FSS_NOCHANGE) 1470 return (EINVAL); 1471 1472 return (0); 1473 } 1474 1475 /*ARGSUSED*/ 1476 static int 1477 fss_parmsout(void *parmsp, pc_vaparms_t *vaparmsp) 1478 { 1479 return (0); 1480 } 1481 1482 static int 1483 fss_vaparmsin(void *parmsp, pc_vaparms_t *vaparmsp) 1484 { 1485 fssparms_t *fssparmsp = (fssparms_t *)parmsp; 1486 int priflag = 0; 1487 int limflag = 0; 1488 uint_t cnt; 1489 pc_vaparm_t *vpp = &vaparmsp->pc_parms[0]; 1490 1491 /* 1492 * FSS_NOCHANGE (-32768) is outside of the range of values for 1493 * fss_uprilim and fss_upri. If the structure fssparms_t is changed, 1494 * FSS_NOCHANGE should be replaced by a flag word. 1495 */ 1496 fssparmsp->fss_uprilim = FSS_NOCHANGE; 1497 fssparmsp->fss_upri = FSS_NOCHANGE; 1498 1499 /* 1500 * Get the varargs parameter and check validity of parameters. 1501 */ 1502 if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT) 1503 return (EINVAL); 1504 1505 for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) { 1506 switch (vpp->pc_key) { 1507 case FSS_KY_UPRILIM: 1508 if (limflag++) 1509 return (EINVAL); 1510 fssparmsp->fss_uprilim = (pri_t)vpp->pc_parm; 1511 if (fssparmsp->fss_uprilim > fss_maxupri || 1512 fssparmsp->fss_uprilim < -fss_maxupri) 1513 return (EINVAL); 1514 break; 1515 case FSS_KY_UPRI: 1516 if (priflag++) 1517 return (EINVAL); 1518 fssparmsp->fss_upri = (pri_t)vpp->pc_parm; 1519 if (fssparmsp->fss_upri > fss_maxupri || 1520 fssparmsp->fss_upri < -fss_maxupri) 1521 return (EINVAL); 1522 break; 1523 default: 1524 return (EINVAL); 1525 } 1526 } 1527 1528 if (vaparmsp->pc_vaparmscnt == 0) { 1529 /* 1530 * Use default parameters. 
1531 */ 1532 fssparmsp->fss_upri = fssparmsp->fss_uprilim = 0; 1533 } 1534 1535 return (0); 1536 } 1537 1538 /* 1539 * Copy all selected fair-sharing class parameters to the user. The parameters 1540 * are specified by a key. 1541 */ 1542 static int 1543 fss_vaparmsout(void *parmsp, pc_vaparms_t *vaparmsp) 1544 { 1545 fssparms_t *fssparmsp = (fssparms_t *)parmsp; 1546 int priflag = 0; 1547 int limflag = 0; 1548 uint_t cnt; 1549 pc_vaparm_t *vpp = &vaparmsp->pc_parms[0]; 1550 1551 ASSERT(MUTEX_NOT_HELD(&curproc->p_lock)); 1552 1553 if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT) 1554 return (EINVAL); 1555 1556 for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) { 1557 switch (vpp->pc_key) { 1558 case FSS_KY_UPRILIM: 1559 if (limflag++) 1560 return (EINVAL); 1561 if (copyout(&fssparmsp->fss_uprilim, 1562 (caddr_t)(uintptr_t)vpp->pc_parm, sizeof (pri_t))) 1563 return (EFAULT); 1564 break; 1565 case FSS_KY_UPRI: 1566 if (priflag++) 1567 return (EINVAL); 1568 if (copyout(&fssparmsp->fss_upri, 1569 (caddr_t)(uintptr_t)vpp->pc_parm, sizeof (pri_t))) 1570 return (EFAULT); 1571 break; 1572 default: 1573 return (EINVAL); 1574 } 1575 } 1576 1577 return (0); 1578 } 1579 1580 /* 1581 * Return the user mode scheduling priority range. 
1582 */ 1583 static int 1584 fss_getclpri(pcpri_t *pcprip) 1585 { 1586 pcprip->pc_clpmax = fss_maxupri; 1587 pcprip->pc_clpmin = -fss_maxupri; 1588 return (0); 1589 } 1590 1591 static int 1592 fss_alloc(void **p, int flag) 1593 { 1594 void *bufp; 1595 1596 if ((bufp = kmem_zalloc(sizeof (fssproc_t), flag)) == NULL) { 1597 return (ENOMEM); 1598 } else { 1599 *p = bufp; 1600 return (0); 1601 } 1602 } 1603 1604 static void 1605 fss_free(void *bufp) 1606 { 1607 if (bufp) 1608 kmem_free(bufp, sizeof (fssproc_t)); 1609 } 1610 1611 /* 1612 * Thread functions 1613 */ 1614 static int 1615 fss_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp, 1616 void *bufp) 1617 { 1618 fssparms_t *fssparmsp = (fssparms_t *)parmsp; 1619 fssproc_t *fssproc; 1620 pri_t reqfssuprilim; 1621 pri_t reqfssupri; 1622 static uint32_t fssexists = 0; 1623 fsspset_t *fsspset; 1624 fssproj_t *fssproj; 1625 fsszone_t *fsszone; 1626 kproject_t *kpj; 1627 zone_t *zone; 1628 int fsszone_allocated = 0; 1629 1630 fssproc = (fssproc_t *)bufp; 1631 ASSERT(fssproc != NULL); 1632 1633 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); 1634 1635 /* 1636 * Only root can move threads to FSS class. 1637 */ 1638 if (reqpcredp != NULL && secpolicy_setpriority(reqpcredp) != 0) 1639 return (EPERM); 1640 /* 1641 * Initialize the fssproc structure. 1642 */ 1643 fssproc->fss_umdpri = fss_maxumdpri / 2; 1644 1645 if (fssparmsp == NULL) { 1646 /* 1647 * Use default values. 1648 */ 1649 fssproc->fss_nice = NZERO; 1650 fssproc->fss_uprilim = fssproc->fss_upri = 0; 1651 } else { 1652 /* 1653 * Use supplied values. 
1654 */ 1655 if (fssparmsp->fss_uprilim == FSS_NOCHANGE) { 1656 reqfssuprilim = 0; 1657 } else { 1658 if (fssparmsp->fss_uprilim > 0 && 1659 secpolicy_setpriority(reqpcredp) != 0) 1660 return (EPERM); 1661 reqfssuprilim = fssparmsp->fss_uprilim; 1662 } 1663 if (fssparmsp->fss_upri == FSS_NOCHANGE) { 1664 reqfssupri = reqfssuprilim; 1665 } else { 1666 if (fssparmsp->fss_upri > 0 && 1667 secpolicy_setpriority(reqpcredp) != 0) 1668 return (EPERM); 1669 /* 1670 * Set the user priority to the requested value or 1671 * the upri limit, whichever is lower. 1672 */ 1673 reqfssupri = fssparmsp->fss_upri; 1674 if (reqfssupri > reqfssuprilim) 1675 reqfssupri = reqfssuprilim; 1676 } 1677 fssproc->fss_uprilim = reqfssuprilim; 1678 fssproc->fss_upri = reqfssupri; 1679 fssproc->fss_nice = NZERO - (NZERO * reqfssupri) / fss_maxupri; 1680 if (fssproc->fss_nice > FSS_NICE_MAX) 1681 fssproc->fss_nice = FSS_NICE_MAX; 1682 } 1683 1684 fssproc->fss_timeleft = fss_quantum; 1685 fssproc->fss_tp = t; 1686 cpucaps_sc_init(&fssproc->fss_caps); 1687 1688 /* 1689 * Put a lock on our fsspset structure. 
1690 */ 1691 mutex_enter(&fsspsets_lock); 1692 fsspset = fss_find_fsspset(t->t_cpupart); 1693 mutex_enter(&fsspset->fssps_lock); 1694 mutex_exit(&fsspsets_lock); 1695 1696 zone = ttoproc(t)->p_zone; 1697 if ((fsszone = fss_find_fsszone(fsspset, zone)) == NULL) { 1698 if ((fsszone = kmem_zalloc(sizeof (fsszone_t), KM_NOSLEEP)) 1699 == NULL) { 1700 mutex_exit(&fsspset->fssps_lock); 1701 return (ENOMEM); 1702 } else { 1703 fsszone_allocated = 1; 1704 fss_insert_fsszone(fsspset, zone, fsszone); 1705 } 1706 } 1707 kpj = ttoproj(t); 1708 if ((fssproj = fss_find_fssproj(fsspset, kpj)) == NULL) { 1709 if ((fssproj = kmem_zalloc(sizeof (fssproj_t), KM_NOSLEEP)) 1710 == NULL) { 1711 if (fsszone_allocated) { 1712 fss_remove_fsszone(fsspset, fsszone); 1713 kmem_free(fsszone, sizeof (fsszone_t)); 1714 } 1715 mutex_exit(&fsspset->fssps_lock); 1716 return (ENOMEM); 1717 } else { 1718 fss_insert_fssproj(fsspset, kpj, fsszone, fssproj); 1719 } 1720 } 1721 fssproj->fssp_threads++; 1722 fssproc->fss_proj = fssproj; 1723 1724 /* 1725 * Reset priority. Process goes to a "user mode" priority here 1726 * regardless of whether or not it has slept since entering the kernel. 1727 */ 1728 thread_lock(t); 1729 t->t_clfuncs = &(sclass[cid].cl_funcs->thread); 1730 t->t_cid = cid; 1731 t->t_cldata = (void *)fssproc; 1732 t->t_schedflag |= TS_RUNQMATCH; 1733 fss_change_priority(t, fssproc); 1734 if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || 1735 t->t_state == TS_WAIT) 1736 fss_active(t); 1737 thread_unlock(t); 1738 1739 mutex_exit(&fsspset->fssps_lock); 1740 1741 /* 1742 * Link new structure into fssproc list. 1743 */ 1744 FSS_LIST_INSERT(fssproc); 1745 1746 /* 1747 * If this is the first fair-sharing thread to occur since boot, 1748 * we set up the initial call to fss_update() here. Use an atomic 1749 * compare-and-swap since that's easier and faster than a mutex 1750 * (but check with an ordinary load first since most of the time 1751 * this will already be done). 
1752 */ 1753 if (fssexists == 0 && atomic_cas_32(&fssexists, 0, 1) == 0) 1754 (void) timeout(fss_update, NULL, hz); 1755 1756 return (0); 1757 } 1758 1759 /* 1760 * Remove fssproc_t from the list. 1761 */ 1762 static void 1763 fss_exitclass(void *procp) 1764 { 1765 fssproc_t *fssproc = (fssproc_t *)procp; 1766 fssproj_t *fssproj; 1767 fsspset_t *fsspset; 1768 fsszone_t *fsszone; 1769 kthread_t *t = fssproc->fss_tp; 1770 1771 /* 1772 * We should be either getting this thread off the deathrow or 1773 * this thread has already moved to another scheduling class and 1774 * we're being called with its old cldata buffer pointer. In both 1775 * cases, the content of this buffer can not be changed while we're 1776 * here. 1777 */ 1778 mutex_enter(&fsspsets_lock); 1779 thread_lock(t); 1780 if (t->t_cid != fss_cid) { 1781 /* 1782 * We're being called as a result of the priocntl() system 1783 * call -- someone is trying to move our thread to another 1784 * scheduling class. We can't call fss_inactive() here 1785 * because our thread's t_cldata pointer already points 1786 * to another scheduling class specific data. 
1787 */ 1788 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); 1789 1790 fssproj = FSSPROC2FSSPROJ(fssproc); 1791 fsspset = FSSPROJ2FSSPSET(fssproj); 1792 fsszone = fssproj->fssp_fsszone; 1793 1794 if (fssproc->fss_runnable) { 1795 disp_lock_enter_high(&fsspset->fssps_displock); 1796 if (--fssproj->fssp_runnable == 0) { 1797 fsszone->fssz_shares -= fssproj->fssp_shares; 1798 if (--fsszone->fssz_runnable == 0) 1799 fsspset->fssps_shares -= 1800 fsszone->fssz_rshares; 1801 } 1802 disp_lock_exit_high(&fsspset->fssps_displock); 1803 } 1804 thread_unlock(t); 1805 1806 mutex_enter(&fsspset->fssps_lock); 1807 if (--fssproj->fssp_threads == 0) { 1808 fss_remove_fssproj(fsspset, fssproj); 1809 if (fsszone->fssz_nproj == 0) 1810 kmem_free(fsszone, sizeof (fsszone_t)); 1811 kmem_free(fssproj, sizeof (fssproj_t)); 1812 } 1813 mutex_exit(&fsspset->fssps_lock); 1814 1815 } else { 1816 ASSERT(t->t_state == TS_FREE); 1817 /* 1818 * We're being called from thread_free() when our thread 1819 * is removed from the deathrow. There is nothing we need 1820 * do here since everything should've been done earlier 1821 * in fss_exit(). 1822 */ 1823 thread_unlock(t); 1824 } 1825 mutex_exit(&fsspsets_lock); 1826 1827 FSS_LIST_DELETE(fssproc); 1828 fss_free(fssproc); 1829 } 1830 1831 /*ARGSUSED*/ 1832 static int 1833 fss_canexit(kthread_t *t, cred_t *credp) 1834 { 1835 /* 1836 * A thread is allowed to exit FSS only if we have sufficient 1837 * privileges. 1838 */ 1839 if (credp != NULL && secpolicy_setpriority(credp) != 0) 1840 return (EPERM); 1841 else 1842 return (0); 1843 } 1844 1845 /* 1846 * Initialize fair-share class specific proc structure for a child. 
1847 */ 1848 static int 1849 fss_fork(kthread_t *pt, kthread_t *ct, void *bufp) 1850 { 1851 fssproc_t *pfssproc; /* ptr to parent's fssproc structure */ 1852 fssproc_t *cfssproc; /* ptr to child's fssproc structure */ 1853 fssproj_t *fssproj; 1854 fsspset_t *fsspset; 1855 1856 ASSERT(MUTEX_HELD(&ttoproc(pt)->p_lock)); 1857 ASSERT(ct->t_state == TS_STOPPED); 1858 1859 cfssproc = (fssproc_t *)bufp; 1860 ASSERT(cfssproc != NULL); 1861 bzero(cfssproc, sizeof (fssproc_t)); 1862 1863 thread_lock(pt); 1864 pfssproc = FSSPROC(pt); 1865 fssproj = FSSPROC2FSSPROJ(pfssproc); 1866 fsspset = FSSPROJ2FSSPSET(fssproj); 1867 thread_unlock(pt); 1868 1869 mutex_enter(&fsspset->fssps_lock); 1870 /* 1871 * Initialize child's fssproc structure. 1872 */ 1873 thread_lock(pt); 1874 ASSERT(FSSPROJ(pt) == fssproj); 1875 cfssproc->fss_proj = fssproj; 1876 cfssproc->fss_timeleft = fss_quantum; 1877 cfssproc->fss_umdpri = pfssproc->fss_umdpri; 1878 cfssproc->fss_fsspri = 0; 1879 cfssproc->fss_uprilim = pfssproc->fss_uprilim; 1880 cfssproc->fss_upri = pfssproc->fss_upri; 1881 cfssproc->fss_tp = ct; 1882 cfssproc->fss_nice = pfssproc->fss_nice; 1883 cpucaps_sc_init(&cfssproc->fss_caps); 1884 1885 cfssproc->fss_flags = 1886 pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE); 1887 ct->t_cldata = (void *)cfssproc; 1888 ct->t_schedflag |= TS_RUNQMATCH; 1889 thread_unlock(pt); 1890 1891 fssproj->fssp_threads++; 1892 mutex_exit(&fsspset->fssps_lock); 1893 1894 /* 1895 * Link new structure into fssproc hash table. 1896 */ 1897 FSS_LIST_INSERT(cfssproc); 1898 return (0); 1899 } 1900 1901 /* 1902 * Child is placed at back of dispatcher queue and parent gives up processor 1903 * so that the child runs first after the fork. This allows the child 1904 * immediately execing to break the multiple use of copy on write pages with no 1905 * disk home. The parent will get to steal them back rather than uselessly 1906 * copying them. 
1907 */ 1908 static void 1909 fss_forkret(kthread_t *t, kthread_t *ct) 1910 { 1911 proc_t *pp = ttoproc(t); 1912 proc_t *cp = ttoproc(ct); 1913 fssproc_t *fssproc; 1914 1915 ASSERT(t == curthread); 1916 ASSERT(MUTEX_HELD(&pidlock)); 1917 1918 /* 1919 * Grab the child's p_lock before dropping pidlock to ensure the 1920 * process does not disappear before we set it running. 1921 */ 1922 mutex_enter(&cp->p_lock); 1923 continuelwps(cp); 1924 mutex_exit(&cp->p_lock); 1925 1926 mutex_enter(&pp->p_lock); 1927 mutex_exit(&pidlock); 1928 continuelwps(pp); 1929 1930 thread_lock(t); 1931 1932 fssproc = FSSPROC(t); 1933 fss_newpri(fssproc, B_FALSE); 1934 fssproc->fss_timeleft = fss_quantum; 1935 t->t_pri = fssproc->fss_umdpri; 1936 ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); 1937 fssproc->fss_flags &= ~FSSKPRI; 1938 THREAD_TRANSITION(t); 1939 1940 /* 1941 * We don't want to call fss_setrun(t) here because it may call 1942 * fss_active, which we don't need. 1943 */ 1944 fssproc->fss_flags &= ~FSSBACKQ; 1945 1946 if (t->t_disp_time != ddi_get_lbolt()) 1947 setbackdq(t); 1948 else 1949 setfrontdq(t); 1950 1951 thread_unlock(t); 1952 /* 1953 * Safe to drop p_lock now since it is safe to change 1954 * the scheduling class after this point. 1955 */ 1956 mutex_exit(&pp->p_lock); 1957 1958 swtch(); 1959 } 1960 1961 /* 1962 * Get the fair-sharing parameters of the thread pointed to by fssprocp into 1963 * the buffer pointed by fssparmsp. 
1964 */ 1965 static void 1966 fss_parmsget(kthread_t *t, void *parmsp) 1967 { 1968 fssproc_t *fssproc = FSSPROC(t); 1969 fssparms_t *fssparmsp = (fssparms_t *)parmsp; 1970 1971 fssparmsp->fss_uprilim = fssproc->fss_uprilim; 1972 fssparmsp->fss_upri = fssproc->fss_upri; 1973 } 1974 1975 /*ARGSUSED*/ 1976 static int 1977 fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp) 1978 { 1979 char nice; 1980 pri_t reqfssuprilim; 1981 pri_t reqfssupri; 1982 fssproc_t *fssproc = FSSPROC(t); 1983 fssparms_t *fssparmsp = (fssparms_t *)parmsp; 1984 1985 ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock)); 1986 1987 if (fssparmsp->fss_uprilim == FSS_NOCHANGE) 1988 reqfssuprilim = fssproc->fss_uprilim; 1989 else 1990 reqfssuprilim = fssparmsp->fss_uprilim; 1991 1992 if (fssparmsp->fss_upri == FSS_NOCHANGE) 1993 reqfssupri = fssproc->fss_upri; 1994 else 1995 reqfssupri = fssparmsp->fss_upri; 1996 1997 /* 1998 * Make sure the user priority doesn't exceed the upri limit. 1999 */ 2000 if (reqfssupri > reqfssuprilim) 2001 reqfssupri = reqfssuprilim; 2002 2003 /* 2004 * Basic permissions enforced by generic kernel code for all classes 2005 * require that a thread attempting to change the scheduling parameters 2006 * of a target thread be privileged or have a real or effective UID 2007 * matching that of the target thread. We are not called unless these 2008 * basic permission checks have already passed. The fair-sharing class 2009 * requires in addition that the calling thread be privileged if it 2010 * is attempting to raise the upri limit above its current value. 2011 * This may have been checked previously but if our caller passed us 2012 * a non-NULL credential pointer we assume it hasn't and we check it 2013 * here. 
2014 */ 2015 if ((reqpcredp != NULL) && 2016 (reqfssuprilim > fssproc->fss_uprilim) && 2017 secpolicy_raisepriority(reqpcredp) != 0) 2018 return (EPERM); 2019 2020 /* 2021 * Set fss_nice to the nice value corresponding to the user priority we 2022 * are setting. Note that setting the nice field of the parameter 2023 * struct won't affect upri or nice. 2024 */ 2025 nice = NZERO - (reqfssupri * NZERO) / fss_maxupri; 2026 if (nice > FSS_NICE_MAX) 2027 nice = FSS_NICE_MAX; 2028 2029 thread_lock(t); 2030 2031 fssproc->fss_uprilim = reqfssuprilim; 2032 fssproc->fss_upri = reqfssupri; 2033 fssproc->fss_nice = nice; 2034 fss_newpri(fssproc, B_FALSE); 2035 2036 if ((fssproc->fss_flags & FSSKPRI) != 0) { 2037 thread_unlock(t); 2038 return (0); 2039 } 2040 2041 fss_change_priority(t, fssproc); 2042 thread_unlock(t); 2043 return (0); 2044 2045 } 2046 2047 /* 2048 * The thread is being stopped. 2049 */ 2050 /*ARGSUSED*/ 2051 static void 2052 fss_stop(kthread_t *t, int why, int what) 2053 { 2054 ASSERT(THREAD_LOCK_HELD(t)); 2055 ASSERT(t == curthread); 2056 2057 fss_inactive(t); 2058 } 2059 2060 /* 2061 * The current thread is exiting, do necessary adjustments to its project 2062 */ 2063 static void 2064 fss_exit(kthread_t *t) 2065 { 2066 fsspset_t *fsspset; 2067 fssproj_t *fssproj; 2068 fssproc_t *fssproc; 2069 fsszone_t *fsszone; 2070 int free = 0; 2071 2072 /* 2073 * Thread t here is either a current thread (in which case we hold 2074 * its process' p_lock), or a thread being destroyed by forklwp_fail(), 2075 * in which case we hold pidlock and thread is no longer on the 2076 * thread list. 
2077 */ 2078 ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock) || MUTEX_HELD(&pidlock)); 2079 2080 fssproc = FSSPROC(t); 2081 fssproj = FSSPROC2FSSPROJ(fssproc); 2082 fsspset = FSSPROJ2FSSPSET(fssproj); 2083 fsszone = fssproj->fssp_fsszone; 2084 2085 mutex_enter(&fsspsets_lock); 2086 mutex_enter(&fsspset->fssps_lock); 2087 2088 thread_lock(t); 2089 disp_lock_enter_high(&fsspset->fssps_displock); 2090 if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) { 2091 if (--fssproj->fssp_runnable == 0) { 2092 fsszone->fssz_shares -= fssproj->fssp_shares; 2093 if (--fsszone->fssz_runnable == 0) 2094 fsspset->fssps_shares -= fsszone->fssz_rshares; 2095 } 2096 ASSERT(fssproc->fss_runnable == 1); 2097 fssproc->fss_runnable = 0; 2098 } 2099 if (--fssproj->fssp_threads == 0) { 2100 fss_remove_fssproj(fsspset, fssproj); 2101 free = 1; 2102 } 2103 disp_lock_exit_high(&fsspset->fssps_displock); 2104 fssproc->fss_proj = NULL; /* mark this thread as already exited */ 2105 thread_unlock(t); 2106 2107 if (free) { 2108 if (fsszone->fssz_nproj == 0) 2109 kmem_free(fsszone, sizeof (fsszone_t)); 2110 kmem_free(fssproj, sizeof (fssproj_t)); 2111 } 2112 mutex_exit(&fsspset->fssps_lock); 2113 mutex_exit(&fsspsets_lock); 2114 2115 /* 2116 * A thread could be exiting in between clock ticks, so we need to 2117 * calculate how much CPU time it used since it was charged last time. 2118 * 2119 * CPU caps are not enforced on exiting processes - it is usually 2120 * desirable to exit as soon as possible to free resources. 2121 */ 2122 if (CPUCAPS_ON()) { 2123 thread_lock(t); 2124 fssproc = FSSPROC(t); 2125 (void) cpucaps_charge(t, &fssproc->fss_caps, 2126 CPUCAPS_CHARGE_ONLY); 2127 thread_unlock(t); 2128 } 2129 } 2130 2131 static void 2132 fss_nullsys() 2133 { 2134 } 2135 2136 /* 2137 * If thread is currently at a kernel mode priority (has slept) and is 2138 * returning to the userland we assign it the appropriate user mode priority 2139 * and time quantum here. 
If we're lowering the thread's priority below that 2140 * of other runnable threads then we will set runrun via cpu_surrender() to 2141 * cause preemption. 2142 */ 2143 static void 2144 fss_trapret(kthread_t *t) 2145 { 2146 fssproc_t *fssproc = FSSPROC(t); 2147 cpu_t *cp = CPU; 2148 2149 ASSERT(THREAD_LOCK_HELD(t)); 2150 ASSERT(t == curthread); 2151 ASSERT(cp->cpu_dispthread == t); 2152 ASSERT(t->t_state == TS_ONPROC); 2153 2154 t->t_kpri_req = 0; 2155 if (fssproc->fss_flags & FSSKPRI) { 2156 /* 2157 * If thread has blocked in the kernel 2158 */ 2159 THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); 2160 cp->cpu_dispatch_pri = DISP_PRIO(t); 2161 ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); 2162 fssproc->fss_flags &= ~FSSKPRI; 2163 2164 if (DISP_MUST_SURRENDER(t)) 2165 cpu_surrender(t); 2166 } 2167 } 2168 2169 /* 2170 * Arrange for thread to be placed in appropriate location on dispatcher queue. 2171 * This is called with the current thread in TS_ONPROC and locked. 2172 */ 2173 static void 2174 fss_preempt(kthread_t *t) 2175 { 2176 fssproc_t *fssproc = FSSPROC(t); 2177 klwp_t *lwp; 2178 uint_t flags; 2179 2180 ASSERT(t == curthread); 2181 ASSERT(THREAD_LOCK_HELD(curthread)); 2182 ASSERT(t->t_state == TS_ONPROC); 2183 2184 /* 2185 * If preempted in the kernel, make sure the thread has a kernel 2186 * priority if needed. 2187 */ 2188 lwp = curthread->t_lwp; 2189 if (!(fssproc->fss_flags & FSSKPRI) && lwp != NULL && t->t_kpri_req) { 2190 fssproc->fss_flags |= FSSKPRI; 2191 THREAD_CHANGE_PRI(t, minclsyspri); 2192 ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); 2193 t->t_trapret = 1; /* so that fss_trapret will run */ 2194 aston(t); 2195 } 2196 2197 /* 2198 * This thread may be placed on wait queue by CPU Caps. In this case we 2199 * do not need to do anything until it is removed from the wait queue. 
2200 * Do not enforce CPU caps on threads running at a kernel priority 2201 */ 2202 if (CPUCAPS_ON()) { 2203 (void) cpucaps_charge(t, &fssproc->fss_caps, 2204 CPUCAPS_CHARGE_ENFORCE); 2205 2206 if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t)) 2207 return; 2208 } 2209 2210 /* 2211 * Check to see if we're doing "preemption control" here. If 2212 * we are, and if the user has requested that this thread not 2213 * be preempted, and if preemptions haven't been put off for 2214 * too long, let the preemption happen here but try to make 2215 * sure the thread is rescheduled as soon as possible. We do 2216 * this by putting it on the front of the highest priority run 2217 * queue in the FSS class. If the preemption has been put off 2218 * for too long, clear the "nopreempt" bit and let the thread 2219 * be preempted. 2220 */ 2221 if (t->t_schedctl && schedctl_get_nopreempt(t)) { 2222 if (fssproc->fss_timeleft > -SC_MAX_TICKS) { 2223 DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t); 2224 if (!(fssproc->fss_flags & FSSKPRI)) { 2225 /* 2226 * If not already remembered, remember current 2227 * priority for restoration in fss_yield(). 2228 */ 2229 if (!(fssproc->fss_flags & FSSRESTORE)) { 2230 fssproc->fss_scpri = t->t_pri; 2231 fssproc->fss_flags |= FSSRESTORE; 2232 } 2233 THREAD_CHANGE_PRI(t, fss_maxumdpri); 2234 } 2235 schedctl_set_yield(t, 1); 2236 setfrontdq(t); 2237 return; 2238 } else { 2239 if (fssproc->fss_flags & FSSRESTORE) { 2240 THREAD_CHANGE_PRI(t, fssproc->fss_scpri); 2241 fssproc->fss_flags &= ~FSSRESTORE; 2242 } 2243 schedctl_set_nopreempt(t, 0); 2244 DTRACE_SCHED1(schedctl__preempt, kthread_t *, t); 2245 /* 2246 * Fall through and be preempted below. 
2247 */ 2248 } 2249 } 2250 2251 flags = fssproc->fss_flags & (FSSBACKQ | FSSKPRI); 2252 2253 if (flags == FSSBACKQ) { 2254 fssproc->fss_timeleft = fss_quantum; 2255 fssproc->fss_flags &= ~FSSBACKQ; 2256 setbackdq(t); 2257 } else if (flags == (FSSBACKQ | FSSKPRI)) { 2258 fssproc->fss_flags &= ~FSSBACKQ; 2259 setbackdq(t); 2260 } else { 2261 setfrontdq(t); 2262 } 2263 } 2264 2265 /* 2266 * Called when a thread is waking up and is to be placed on the run queue. 2267 */ 2268 static void 2269 fss_setrun(kthread_t *t) 2270 { 2271 fssproc_t *fssproc = FSSPROC(t); 2272 2273 ASSERT(THREAD_LOCK_HELD(t)); /* t should be in transition */ 2274 2275 if (t->t_state == TS_SLEEP || t->t_state == TS_STOPPED) 2276 fss_active(t); 2277 2278 fssproc->fss_timeleft = fss_quantum; 2279 2280 fssproc->fss_flags &= ~FSSBACKQ; 2281 /* 2282 * If previously were running at the kernel priority then keep that 2283 * priority and the fss_timeleft doesn't matter. 2284 */ 2285 if ((fssproc->fss_flags & FSSKPRI) == 0) 2286 THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); 2287 2288 if (t->t_disp_time != ddi_get_lbolt()) 2289 setbackdq(t); 2290 else 2291 setfrontdq(t); 2292 } 2293 2294 /* 2295 * Prepare thread for sleep. We reset the thread priority so it will run at the 2296 * kernel priority level when it wakes up. 2297 */ 2298 static void 2299 fss_sleep(kthread_t *t) 2300 { 2301 fssproc_t *fssproc = FSSPROC(t); 2302 2303 ASSERT(t == curthread); 2304 ASSERT(THREAD_LOCK_HELD(t)); 2305 2306 ASSERT(t->t_state == TS_ONPROC); 2307 2308 /* 2309 * Account for time spent on CPU before going to sleep. 2310 */ 2311 (void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE); 2312 2313 fss_inactive(t); 2314 2315 /* 2316 * Assign a system priority to the thread and arrange for it to be 2317 * retained when the thread is next placed on the run queue (i.e., 2318 * when it wakes up) instead of being given a new pri. 
Also arrange 2319 * for trapret processing as the thread leaves the system call so it 2320 * will drop back to normal priority range. 2321 */ 2322 if (t->t_kpri_req) { 2323 THREAD_CHANGE_PRI(t, minclsyspri); 2324 fssproc->fss_flags |= FSSKPRI; 2325 t->t_trapret = 1; /* so that fss_trapret will run */ 2326 aston(t); 2327 } else if (fssproc->fss_flags & FSSKPRI) { 2328 /* 2329 * The thread has done a THREAD_KPRI_REQUEST(), slept, then 2330 * done THREAD_KPRI_RELEASE() (so no t_kpri_req is 0 again), 2331 * then slept again all without finishing the current system 2332 * call so trapret won't have cleared FSSKPRI 2333 */ 2334 fssproc->fss_flags &= ~FSSKPRI; 2335 THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); 2336 if (DISP_MUST_SURRENDER(curthread)) 2337 cpu_surrender(t); 2338 } 2339 } 2340 2341 /* 2342 * A tick interrupt has ocurrend on a running thread. Check to see if our 2343 * time slice has expired. 2344 */ 2345 static void 2346 fss_tick(kthread_t *t) 2347 { 2348 fssproc_t *fssproc; 2349 fssproj_t *fssproj; 2350 boolean_t call_cpu_surrender = B_FALSE; 2351 boolean_t cpucaps_enforce = B_FALSE; 2352 2353 ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock)); 2354 2355 /* 2356 * It's safe to access fsspset and fssproj structures because we're 2357 * holding our p_lock here. 2358 */ 2359 thread_lock(t); 2360 fssproc = FSSPROC(t); 2361 fssproj = FSSPROC2FSSPROJ(fssproc); 2362 if (fssproj != NULL) { 2363 fsspset_t *fsspset = FSSPROJ2FSSPSET(fssproj); 2364 disp_lock_enter_high(&fsspset->fssps_displock); 2365 fssproj->fssp_ticks += fss_nice_tick[fssproc->fss_nice]; 2366 fssproj->fssp_tick_cnt++; 2367 fssproc->fss_ticks++; 2368 disp_lock_exit_high(&fsspset->fssps_displock); 2369 } 2370 2371 /* 2372 * Keep track of thread's project CPU usage. Note that projects 2373 * get charged even when threads are running in the kernel. 2374 * Do not surrender CPU if running in the SYS class. 
2375 */ 2376 if (CPUCAPS_ON()) { 2377 cpucaps_enforce = cpucaps_charge(t, 2378 &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) && 2379 !(fssproc->fss_flags & FSSKPRI); 2380 } 2381 2382 /* 2383 * A thread's execution time for threads running in the SYS class 2384 * is not tracked. 2385 */ 2386 if ((fssproc->fss_flags & FSSKPRI) == 0) { 2387 /* 2388 * If thread is not in kernel mode, decrement its fss_timeleft 2389 */ 2390 if (--fssproc->fss_timeleft <= 0) { 2391 pri_t new_pri; 2392 2393 /* 2394 * If we're doing preemption control and trying to 2395 * avoid preempting this thread, just note that the 2396 * thread should yield soon and let it keep running 2397 * (unless it's been a while). 2398 */ 2399 if (t->t_schedctl && schedctl_get_nopreempt(t)) { 2400 if (fssproc->fss_timeleft > -SC_MAX_TICKS) { 2401 DTRACE_SCHED1(schedctl__nopreempt, 2402 kthread_t *, t); 2403 schedctl_set_yield(t, 1); 2404 thread_unlock_nopreempt(t); 2405 return; 2406 } 2407 } 2408 fssproc->fss_flags &= ~FSSRESTORE; 2409 2410 fss_newpri(fssproc, B_TRUE); 2411 new_pri = fssproc->fss_umdpri; 2412 ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); 2413 2414 /* 2415 * When the priority of a thread is changed, it may 2416 * be necessary to adjust its position on a sleep queue 2417 * or dispatch queue. The function thread_change_pri 2418 * accomplishes this. 2419 */ 2420 if (thread_change_pri(t, new_pri, 0)) { 2421 fssproc->fss_timeleft = fss_quantum; 2422 } else { 2423 call_cpu_surrender = B_TRUE; 2424 } 2425 } else if (t->t_state == TS_ONPROC && 2426 t->t_pri < t->t_disp_queue->disp_maxrunpri) { 2427 /* 2428 * If there is a higher-priority thread which is 2429 * waiting for a processor, then thread surrenders 2430 * the processor. 2431 */ 2432 call_cpu_surrender = B_TRUE; 2433 } 2434 } 2435 2436 if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) { 2437 /* 2438 * The thread used more than half of its quantum, so assume that 2439 * it used the whole quantum. 
2440 * 2441 * Update thread's priority just before putting it on the wait 2442 * queue so that it gets charged for the CPU time from its 2443 * quantum even before that quantum expires. 2444 */ 2445 fss_newpri(fssproc, B_FALSE); 2446 if (t->t_pri != fssproc->fss_umdpri) 2447 fss_change_priority(t, fssproc); 2448 2449 /* 2450 * We need to call cpu_surrender for this thread due to cpucaps 2451 * enforcement, but fss_change_priority may have already done 2452 * so. In this case FSSBACKQ is set and there is no need to call 2453 * cpu-surrender again. 2454 */ 2455 if (!(fssproc->fss_flags & FSSBACKQ)) 2456 call_cpu_surrender = B_TRUE; 2457 } 2458 2459 if (call_cpu_surrender) { 2460 fssproc->fss_flags |= FSSBACKQ; 2461 cpu_surrender(t); 2462 } 2463 2464 thread_unlock_nopreempt(t); /* clock thread can't be preempted */ 2465 } 2466 2467 /* 2468 * Processes waking up go to the back of their queue. We don't need to assign 2469 * a time quantum here because thread is still at a kernel mode priority and 2470 * the time slicing is not done for threads running in the kernel after 2471 * sleeping. The proper time quantum will be assigned by fss_trapret before the 2472 * thread returns to user mode. 2473 */ 2474 static void 2475 fss_wakeup(kthread_t *t) 2476 { 2477 fssproc_t *fssproc; 2478 2479 ASSERT(THREAD_LOCK_HELD(t)); 2480 ASSERT(t->t_state == TS_SLEEP); 2481 2482 fss_active(t); 2483 2484 fssproc = FSSPROC(t); 2485 fssproc->fss_flags &= ~FSSBACKQ; 2486 2487 if (fssproc->fss_flags & FSSKPRI) { 2488 /* 2489 * If we already have a kernel priority assigned, then we 2490 * just use it. 2491 */ 2492 setbackdq(t); 2493 } else if (t->t_kpri_req) { 2494 /* 2495 * Give thread a priority boost if we were asked. 2496 */ 2497 fssproc->fss_flags |= FSSKPRI; 2498 THREAD_CHANGE_PRI(t, minclsyspri); 2499 setbackdq(t); 2500 t->t_trapret = 1; /* so that fss_trapret will run */ 2501 aston(t); 2502 } else { 2503 /* 2504 * Otherwise, we recalculate the priority. 
2505 */ 2506 if (t->t_disp_time == ddi_get_lbolt()) { 2507 setfrontdq(t); 2508 } else { 2509 fssproc->fss_timeleft = fss_quantum; 2510 THREAD_CHANGE_PRI(t, fssproc->fss_umdpri); 2511 setbackdq(t); 2512 } 2513 } 2514 } 2515 2516 /* 2517 * fss_donice() is called when a nice(1) command is issued on the thread to 2518 * alter the priority. The nice(1) command exists in Solaris for compatibility. 2519 * Thread priority adjustments should be done via priocntl(1). 2520 */ 2521 static int 2522 fss_donice(kthread_t *t, cred_t *cr, int incr, int *retvalp) 2523 { 2524 int newnice; 2525 fssproc_t *fssproc = FSSPROC(t); 2526 fssparms_t fssparms; 2527 2528 /* 2529 * If there is no change to priority, just return current setting. 2530 */ 2531 if (incr == 0) { 2532 if (retvalp) 2533 *retvalp = fssproc->fss_nice - NZERO; 2534 return (0); 2535 } 2536 2537 if ((incr < 0 || incr > 2 * NZERO) && secpolicy_raisepriority(cr) != 0) 2538 return (EPERM); 2539 2540 /* 2541 * Specifying a nice increment greater than the upper limit of 2542 * FSS_NICE_MAX (== 2 * NZERO - 1) will result in the thread's nice 2543 * value being set to the upper limit. We check for this before 2544 * computing the new value because otherwise we could get overflow 2545 * if a privileged user specified some ridiculous increment. 2546 */ 2547 if (incr > FSS_NICE_MAX) 2548 incr = FSS_NICE_MAX; 2549 2550 newnice = fssproc->fss_nice + incr; 2551 if (newnice > FSS_NICE_MAX) 2552 newnice = FSS_NICE_MAX; 2553 else if (newnice < FSS_NICE_MIN) 2554 newnice = FSS_NICE_MIN; 2555 2556 fssparms.fss_uprilim = fssparms.fss_upri = 2557 -((newnice - NZERO) * fss_maxupri) / NZERO; 2558 2559 /* 2560 * Reset the uprilim and upri values of the thread. 
2561 */ 2562 (void) fss_parmsset(t, (void *)&fssparms, (id_t)0, (cred_t *)NULL); 2563 2564 /* 2565 * Although fss_parmsset already reset fss_nice it may not have been 2566 * set to precisely the value calculated above because fss_parmsset 2567 * determines the nice value from the user priority and we may have 2568 * truncated during the integer conversion from nice value to user 2569 * priority and back. We reset fss_nice to the value we calculated 2570 * above. 2571 */ 2572 fssproc->fss_nice = (char)newnice; 2573 2574 if (retvalp) 2575 *retvalp = newnice - NZERO; 2576 return (0); 2577 } 2578 2579 /* 2580 * Increment the priority of the specified thread by incr and 2581 * return the new value in *retvalp. 2582 */ 2583 static int 2584 fss_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp) 2585 { 2586 int newpri; 2587 fssproc_t *fssproc = FSSPROC(t); 2588 fssparms_t fssparms; 2589 2590 /* 2591 * If there is no change to priority, just return current setting. 2592 */ 2593 if (incr == 0) { 2594 *retvalp = fssproc->fss_upri; 2595 return (0); 2596 } 2597 2598 newpri = fssproc->fss_upri + incr; 2599 if (newpri > fss_maxupri || newpri < -fss_maxupri) 2600 return (EINVAL); 2601 2602 *retvalp = newpri; 2603 fssparms.fss_uprilim = fssparms.fss_upri = newpri; 2604 2605 /* 2606 * Reset the uprilim and upri values of the thread. 2607 */ 2608 return (fss_parmsset(t, &fssparms, (id_t)0, cr)); 2609 } 2610 2611 /* 2612 * Return the global scheduling priority that would be assigned to a thread 2613 * entering the fair-sharing class with the fss_upri. 2614 */ 2615 /*ARGSUSED*/ 2616 static pri_t 2617 fss_globpri(kthread_t *t) 2618 { 2619 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); 2620 2621 return (fss_maxumdpri / 2); 2622 } 2623 2624 /* 2625 * Called from the yield(2) system call when a thread is yielding (surrendering) 2626 * the processor. The kernel thread is placed at the back of a dispatch queue. 
2627 */ 2628 static void 2629 fss_yield(kthread_t *t) 2630 { 2631 fssproc_t *fssproc = FSSPROC(t); 2632 2633 ASSERT(t == curthread); 2634 ASSERT(THREAD_LOCK_HELD(t)); 2635 2636 /* 2637 * Collect CPU usage spent before yielding 2638 */ 2639 (void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE); 2640 2641 /* 2642 * Clear the preemption control "yield" bit since the user is 2643 * doing a yield. 2644 */ 2645 if (t->t_schedctl) 2646 schedctl_set_yield(t, 0); 2647 /* 2648 * If fss_preempt() artifically increased the thread's priority 2649 * to avoid preemption, restore the original priority now. 2650 */ 2651 if (fssproc->fss_flags & FSSRESTORE) { 2652 THREAD_CHANGE_PRI(t, fssproc->fss_scpri); 2653 fssproc->fss_flags &= ~FSSRESTORE; 2654 } 2655 if (fssproc->fss_timeleft < 0) { 2656 /* 2657 * Time slice was artificially extended to avoid preemption, 2658 * so pretend we're preempting it now. 2659 */ 2660 DTRACE_SCHED1(schedctl__yield, int, -fssproc->fss_timeleft); 2661 fssproc->fss_timeleft = fss_quantum; 2662 } 2663 fssproc->fss_flags &= ~FSSBACKQ; 2664 setbackdq(t); 2665 } 2666 2667 void 2668 fss_changeproj(kthread_t *t, void *kp, void *zp, fssbuf_t *projbuf, 2669 fssbuf_t *zonebuf) 2670 { 2671 kproject_t *kpj_new = kp; 2672 zone_t *zone = zp; 2673 fssproj_t *fssproj_old, *fssproj_new; 2674 fsspset_t *fsspset; 2675 kproject_t *kpj_old; 2676 fssproc_t *fssproc; 2677 fsszone_t *fsszone_old, *fsszone_new; 2678 int free = 0; 2679 int id; 2680 2681 ASSERT(MUTEX_HELD(&cpu_lock)); 2682 ASSERT(MUTEX_HELD(&pidlock)); 2683 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); 2684 2685 if (t->t_cid != fss_cid) 2686 return; 2687 2688 fssproc = FSSPROC(t); 2689 mutex_enter(&fsspsets_lock); 2690 fssproj_old = FSSPROC2FSSPROJ(fssproc); 2691 if (fssproj_old == NULL) { 2692 mutex_exit(&fsspsets_lock); 2693 return; 2694 } 2695 2696 fsspset = FSSPROJ2FSSPSET(fssproj_old); 2697 mutex_enter(&fsspset->fssps_lock); 2698 kpj_old = FSSPROJ2KPROJ(fssproj_old); 2699 fsszone_old = 
fssproj_old->fssp_fsszone; 2700 2701 ASSERT(t->t_cpupart == fsspset->fssps_cpupart); 2702 2703 if (kpj_old == kpj_new) { 2704 mutex_exit(&fsspset->fssps_lock); 2705 mutex_exit(&fsspsets_lock); 2706 return; 2707 } 2708 2709 if ((fsszone_new = fss_find_fsszone(fsspset, zone)) == NULL) { 2710 /* 2711 * If the zone for the new project is not currently active on 2712 * the cpu partition we're on, get one of the pre-allocated 2713 * buffers and link it in our per-pset zone list. Such buffers 2714 * should already exist. 2715 */ 2716 for (id = 0; id < zonebuf->fssb_size; id++) { 2717 if ((fsszone_new = zonebuf->fssb_list[id]) != NULL) { 2718 fss_insert_fsszone(fsspset, zone, fsszone_new); 2719 zonebuf->fssb_list[id] = NULL; 2720 break; 2721 } 2722 } 2723 } 2724 ASSERT(fsszone_new != NULL); 2725 if ((fssproj_new = fss_find_fssproj(fsspset, kpj_new)) == NULL) { 2726 /* 2727 * If our new project is not currently running 2728 * on the cpu partition we're on, get one of the 2729 * pre-allocated buffers and link it in our new cpu 2730 * partition doubly linked list. Such buffers should already 2731 * exist. 
2732 */ 2733 for (id = 0; id < projbuf->fssb_size; id++) { 2734 if ((fssproj_new = projbuf->fssb_list[id]) != NULL) { 2735 fss_insert_fssproj(fsspset, kpj_new, 2736 fsszone_new, fssproj_new); 2737 projbuf->fssb_list[id] = NULL; 2738 break; 2739 } 2740 } 2741 } 2742 ASSERT(fssproj_new != NULL); 2743 2744 thread_lock(t); 2745 if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || 2746 t->t_state == TS_WAIT) 2747 fss_inactive(t); 2748 ASSERT(fssproj_old->fssp_threads > 0); 2749 if (--fssproj_old->fssp_threads == 0) { 2750 fss_remove_fssproj(fsspset, fssproj_old); 2751 free = 1; 2752 } 2753 fssproc->fss_proj = fssproj_new; 2754 fssproc->fss_fsspri = 0; 2755 fssproj_new->fssp_threads++; 2756 if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || 2757 t->t_state == TS_WAIT) 2758 fss_active(t); 2759 thread_unlock(t); 2760 if (free) { 2761 if (fsszone_old->fssz_nproj == 0) 2762 kmem_free(fsszone_old, sizeof (fsszone_t)); 2763 kmem_free(fssproj_old, sizeof (fssproj_t)); 2764 } 2765 2766 mutex_exit(&fsspset->fssps_lock); 2767 mutex_exit(&fsspsets_lock); 2768 } 2769 2770 void 2771 fss_changepset(kthread_t *t, void *newcp, fssbuf_t *projbuf, 2772 fssbuf_t *zonebuf) 2773 { 2774 fsspset_t *fsspset_old, *fsspset_new; 2775 fssproj_t *fssproj_old, *fssproj_new; 2776 fsszone_t *fsszone_old, *fsszone_new; 2777 fssproc_t *fssproc; 2778 kproject_t *kpj; 2779 zone_t *zone; 2780 int id; 2781 2782 ASSERT(MUTEX_HELD(&cpu_lock)); 2783 ASSERT(MUTEX_HELD(&pidlock)); 2784 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); 2785 2786 if (t->t_cid != fss_cid) 2787 return; 2788 2789 fssproc = FSSPROC(t); 2790 zone = ttoproc(t)->p_zone; 2791 mutex_enter(&fsspsets_lock); 2792 fssproj_old = FSSPROC2FSSPROJ(fssproc); 2793 if (fssproj_old == NULL) { 2794 mutex_exit(&fsspsets_lock); 2795 return; 2796 } 2797 fsszone_old = fssproj_old->fssp_fsszone; 2798 fsspset_old = FSSPROJ2FSSPSET(fssproj_old); 2799 kpj = FSSPROJ2KPROJ(fssproj_old); 2800 2801 if (fsspset_old->fssps_cpupart == newcp) { 2802 
mutex_exit(&fsspsets_lock); 2803 return; 2804 } 2805 2806 ASSERT(ttoproj(t) == kpj); 2807 2808 fsspset_new = fss_find_fsspset(newcp); 2809 2810 mutex_enter(&fsspset_new->fssps_lock); 2811 if ((fsszone_new = fss_find_fsszone(fsspset_new, zone)) == NULL) { 2812 for (id = 0; id < zonebuf->fssb_size; id++) { 2813 if ((fsszone_new = zonebuf->fssb_list[id]) != NULL) { 2814 fss_insert_fsszone(fsspset_new, zone, 2815 fsszone_new); 2816 zonebuf->fssb_list[id] = NULL; 2817 break; 2818 } 2819 } 2820 } 2821 ASSERT(fsszone_new != NULL); 2822 if ((fssproj_new = fss_find_fssproj(fsspset_new, kpj)) == NULL) { 2823 for (id = 0; id < projbuf->fssb_size; id++) { 2824 if ((fssproj_new = projbuf->fssb_list[id]) != NULL) { 2825 fss_insert_fssproj(fsspset_new, kpj, 2826 fsszone_new, fssproj_new); 2827 projbuf->fssb_list[id] = NULL; 2828 break; 2829 } 2830 } 2831 } 2832 ASSERT(fssproj_new != NULL); 2833 2834 fssproj_new->fssp_threads++; 2835 thread_lock(t); 2836 if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || 2837 t->t_state == TS_WAIT) 2838 fss_inactive(t); 2839 fssproc->fss_proj = fssproj_new; 2840 fssproc->fss_fsspri = 0; 2841 if (t->t_state == TS_RUN || t->t_state == TS_ONPROC || 2842 t->t_state == TS_WAIT) 2843 fss_active(t); 2844 thread_unlock(t); 2845 mutex_exit(&fsspset_new->fssps_lock); 2846 2847 mutex_enter(&fsspset_old->fssps_lock); 2848 if (--fssproj_old->fssp_threads == 0) { 2849 fss_remove_fssproj(fsspset_old, fssproj_old); 2850 if (fsszone_old->fssz_nproj == 0) 2851 kmem_free(fsszone_old, sizeof (fsszone_t)); 2852 kmem_free(fssproj_old, sizeof (fssproj_t)); 2853 } 2854 mutex_exit(&fsspset_old->fssps_lock); 2855 2856 mutex_exit(&fsspsets_lock); 2857 }