6583 remove whole-process swapping
--- old/usr/src/uts/common/disp/disp.c
+++ new/usr/src/uts/common/disp/disp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/param.h>
32 32 #include <sys/sysmacros.h>
33 33 #include <sys/signal.h>
34 34 #include <sys/user.h>
35 35 #include <sys/systm.h>
36 36 #include <sys/sysinfo.h>
37 37 #include <sys/var.h>
38 38 #include <sys/errno.h>
39 39 #include <sys/cmn_err.h>
40 40 #include <sys/debug.h>
41 41 #include <sys/inline.h>
42 42 #include <sys/disp.h>
43 43 #include <sys/class.h>
44 44 #include <sys/bitmap.h>
45 45 #include <sys/kmem.h>
46 46 #include <sys/cpuvar.h>
47 47 #include <sys/vtrace.h>
48 48 #include <sys/tnf.h>
49 49 #include <sys/cpupart.h>
50 50 #include <sys/lgrp.h>
51 51 #include <sys/pg.h>
52 52 #include <sys/cmt.h>
53 53 #include <sys/bitset.h>
54 54 #include <sys/schedctl.h>
55 55 #include <sys/atomic.h>
56 56 #include <sys/dtrace.h>
57 57 #include <sys/sdt.h>
58 58 #include <sys/archsystm.h>
59 59
60 60 #include <vm/as.h>
61 61
62 62 #define BOUND_CPU 0x1
63 63 #define BOUND_PARTITION 0x2
64 64 #define BOUND_INTR 0x4
65 65
66 66 /* Dispatch queue allocation structure and functions */
67 67 struct disp_queue_info {
68 68 disp_t *dp;
69 69 dispq_t *olddispq;
70 70 dispq_t *newdispq;
71 71 ulong_t *olddqactmap;
72 72 ulong_t *newdqactmap;
73 73 int oldnglobpris;
74 74 };
75 75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76 76 disp_t *dp);
77 77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 78 static void disp_dq_free(struct disp_queue_info *dptr);
79 79
80 80 /* platform-specific routine to call when processor is idle */
81 81 static void generic_idle_cpu();
82 82 void (*idle_cpu)() = generic_idle_cpu;
83 83
84 84 /* routines invoked when a CPU enters/exits the idle loop */
85 85 static void idle_enter();
86 86 static void idle_exit();
87 87
88 88 /* platform-specific routine to call when thread is enqueued */
[88 lines elided]
89 89 static void generic_enq_thread(cpu_t *, int);
90 90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 91
92 92 pri_t kpreemptpri; /* priority where kernel preemption applies */
93 93 pri_t upreemptpri = 0; /* priority where normal preemption applies */
94 94 pri_t intr_pri; /* interrupt thread priority base level */
95 95
96 96 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
97 97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
98 98 disp_t cpu0_disp; /* boot CPU's dispatch queue */
99 -disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
100 99 int nswapped; /* total number of swapped threads */
101 -void disp_swapped_enq(kthread_t *tp);
102 100 static void disp_swapped_setrun(kthread_t *tp);
103 101 static void cpu_resched(cpu_t *cp, pri_t tpri);
104 102
105 103 /*
106 104 * If this is set, only interrupt threads will cause kernel preemptions.
107 105 * This is done by changing the value of kpreemptpri. kpreemptpri
108 106 * will either be the max sysclass pri + 1 or the min interrupt pri.
109 107 */
110 108 int only_intr_kpreempt;
111 109
112 110 extern void set_idle_cpu(int cpun);
113 111 extern void unset_idle_cpu(int cpun);
114 112 static void setkpdq(kthread_t *tp, int borf);
115 113 #define SETKP_BACK 0
116 114 #define SETKP_FRONT 1
117 115 /*
118 116 * Parameter that determines how recently a thread must have run
119 117 * on the CPU to be considered loosely-bound to that CPU to reduce
120 118 * cold cache effects. The interval is in hertz.
121 119 */
122 120 #define RECHOOSE_INTERVAL 3
123 121 int rechoose_interval = RECHOOSE_INTERVAL;
124 122
125 123 /*
126 124 * Parameter that determines how long (in nanoseconds) a thread must
127 125 * be sitting on a run queue before it can be stolen by another CPU
128 126 * to reduce migrations. The interval is in nanoseconds.
129 127 *
130 128 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
131 129 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED
132 130 * here, indicating that it is uninitialized.
133 131 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
134 132 *
135 133 */
136 134 #define NOSTEAL_UNINITIALIZED (-1)
137 135 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 136 extern void cmp_set_nosteal_interval(void);
139 137
140 138 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
141 139
142 140 disp_lock_t transition_lock; /* lock on transitioning threads */
143 141 disp_lock_t stop_lock; /* lock on stopped threads */
144 142
145 143 static void cpu_dispqalloc(int numpris);
146 144
147 145 /*
148 146 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149 147 * a thread because it was sitting on its run queue for a very short
150 148 * period of time.
151 149 */
152 150 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
153 151
154 152 static kthread_t *disp_getwork(cpu_t *to);
155 153 static kthread_t *disp_getbest(disp_t *from);
156 154 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
157 155
158 156 void swtch_to(kthread_t *);
159 157
160 158 /*
161 159 * dispatcher and scheduler initialization
162 160 */
163 161
164 162 /*
165 163 * disp_setup - Common code to calculate and allocate dispatcher
166 164 * variables and structures based on the maximum priority.
167 165 */
168 166 static void
169 167 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
170 168 {
171 169 pri_t newnglobpris;
172 170
173 171 ASSERT(MUTEX_HELD(&cpu_lock));
174 172
175 173 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
176 174
177 175 if (newnglobpris > oldnglobpris) {
178 176 /*
179 177 * Allocate new kp queues for each CPU partition.
180 178 */
181 179 cpupart_kpqalloc(newnglobpris);
182 180
183 181 /*
184 182 * Allocate new dispatch queues for each CPU.
185 183 */
186 184 cpu_dispqalloc(newnglobpris);
187 185
188 186 /*
189 187 * compute new interrupt thread base priority
190 188 */
191 189 intr_pri = maxglobpri;
192 190 if (only_intr_kpreempt) {
193 191 kpreemptpri = intr_pri + 1;
194 192 if (kpqpri == KPQPRI)
195 193 kpqpri = kpreemptpri;
196 194 }
197 195 v.v_nglobpris = newnglobpris;
198 196 }
199 197 }
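
A reviewer's note on the arithmetic above: disp_setup() sizes the global priority range to cover every class priority plus headroom for interrupt levels. A minimal sketch of the same computation; the concrete inputs (maxglobpri of 99 and LOCK_LEVEL of 10, typical of a stock TS/SYS setup) are illustrative assumptions, not values taken from this diff:

/*
 * Sketch only, not kernel code: disp_setup()'s queue-sizing math.
 * With maxglobpri == 99 and LOCK_LEVEL == 10 this yields 110, i.e.
 * dispatch queues covering priorities 0..109, with intr_pri starting
 * at maxglobpri so interrupt threads sit above every scheduling class.
 */
static int
nglobpris_for(int maxglobpri, int lock_level)
{
	return (maxglobpri + 1 + lock_level);	/* e.g. 99 + 1 + 10 = 110 */
}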
200 198
201 199 /*
202 200 * dispinit - Called to initialize all loaded classes and the
203 201 * dispatcher framework.
204 202 */
205 203 void
206 204 dispinit(void)
207 205 {
208 206 id_t cid;
209 207 pri_t maxglobpri;
210 208 pri_t cl_maxglobpri;
211 209
212 210 maxglobpri = -1;
213 211
214 212 /*
215 213 * Initialize transition lock, which will always be set.
216 214 */
217 215 DISP_LOCK_INIT(&transition_lock);
218 216 disp_lock_enter_high(&transition_lock);
219 217 DISP_LOCK_INIT(&stop_lock);
220 218
221 219 mutex_enter(&cpu_lock);
222 220 CPU->cpu_disp->disp_maxrunpri = -1;
223 221 CPU->cpu_disp->disp_max_unbound_pri = -1;
224 222
225 223 /*
226 224 * Initialize the default CPU partition.
227 225 */
228 226 cpupart_initialize_default();
229 227 /*
230 228 * Call the class specific initialization functions for
231 229 * all pre-installed schedulers.
232 230 *
233 231 * We pass the size of a class specific parameter
234 232 * buffer to each of the initialization functions
235 233 * to try to catch problems with backward compatibility
236 234 * of class modules.
237 235 *
238 236 * For example a new class module running on an old system
239 237 * which didn't provide sufficiently large parameter buffers
240 238 * would be bad news. Class initialization modules can check for
241 239 * this and take action if they detect a problem.
242 240 */
243 241
244 242 for (cid = 0; cid < nclass; cid++) {
245 243 sclass_t *sc;
246 244
247 245 sc = &sclass[cid];
248 246 if (SCHED_INSTALLED(sc)) {
249 247 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
250 248 &sc->cl_funcs);
251 249 if (cl_maxglobpri > maxglobpri)
252 250 maxglobpri = cl_maxglobpri;
253 251 }
254 252 }
255 253 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
256 254 if (kpqpri == KPQPRI)
257 255 kpqpri = kpreemptpri;
258 256
259 257 ASSERT(maxglobpri >= 0);
260 258 disp_setup(maxglobpri, 0);
261 259
262 260 mutex_exit(&cpu_lock);
263 261
264 262 /*
265 263 * Platform specific sticky scheduler setup.
266 264 */
267 265 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
268 266 cmp_set_nosteal_interval();
269 267
270 268 /*
271 269 * Get the default class ID; this may be later modified via
272 270 * dispadmin(1M). This will load the class (normally TS) and that will
273 271 * call disp_add(), which is why we had to drop cpu_lock first.
274 272 */
275 273 if (getcid(defaultclass, &defaultcid) != 0) {
276 274 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
277 275 defaultclass);
278 276 }
279 277 }
280 278
281 279 /*
282 280 * disp_add - Called with class pointer to initialize the dispatcher
283 281 * for a newly loaded class.
284 282 */
285 283 void
286 284 disp_add(sclass_t *clp)
287 285 {
288 286 pri_t maxglobpri;
289 287 pri_t cl_maxglobpri;
290 288
291 289 mutex_enter(&cpu_lock);
292 290 /*
293 291 * Initialize the scheduler class.
294 292 */
295 293 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
296 294 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
297 295 if (cl_maxglobpri > maxglobpri)
298 296 maxglobpri = cl_maxglobpri;
299 297
300 298 /*
301 299 * Save old queue information. Since we're initializing a
302 300 * new scheduling class which has just been loaded, then
303 301 * the size of the dispq may have changed. We need to handle
304 302 * that here.
305 303 */
306 304 disp_setup(maxglobpri, v.v_nglobpris);
307 305
308 306 mutex_exit(&cpu_lock);
309 307 }
310 308
311 309
312 310 /*
313 311 * For each CPU, allocate new dispatch queues
314 312 * with the stated number of priorities.
315 313 */
316 314 static void
317 315 cpu_dispqalloc(int numpris)
318 316 {
319 317 cpu_t *cpup;
320 318 struct disp_queue_info *disp_mem;
321 319 int i, num;
322 320
323 321 ASSERT(MUTEX_HELD(&cpu_lock));
324 322
325 323 disp_mem = kmem_zalloc(NCPU *
326 324 sizeof (struct disp_queue_info), KM_SLEEP);
327 325
328 326 /*
329 327 * This routine must allocate all of the memory before stopping
330 328 * the cpus because it must not sleep in kmem_alloc while the
331 329 * CPUs are stopped. Locks they hold will not be released until they
332 330 * are restarted.
333 331 */
334 332 i = 0;
335 333 cpup = cpu_list;
336 334 do {
337 335 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
338 336 i++;
339 337 cpup = cpup->cpu_next;
340 338 } while (cpup != cpu_list);
341 339 num = i;
342 340
343 341 pause_cpus(NULL, NULL);
344 342 for (i = 0; i < num; i++)
345 343 disp_dq_assign(&disp_mem[i], numpris);
346 344 start_cpus();
347 345
348 346 /*
349 347 * I must free all of the memory after starting the cpus because
350 348 * I can not risk sleeping in kmem_free while the cpus are stopped.
351 349 */
352 350 for (i = 0; i < num; i++)
353 351 disp_dq_free(&disp_mem[i]);
354 352
355 353 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
356 354 }
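
Worth calling out for review: cpu_dispqalloc() follows a strict three-phase discipline, since nothing may block while CPUs are paused. Condensed into a sketch for a single queue (resize_one_disp() is a hypothetical wrapper; the helpers are the ones defined in this file):

/*
 * Sketch of the pause-safe resize pattern used by cpu_dispqalloc():
 * sleep-capable allocation first, pointer swaps while paused, frees
 * only after the CPUs are running again.
 */
static void
resize_one_disp(struct disp_queue_info *info, int npris, disp_t *dp)
{
	disp_dq_alloc(info, npris, dp);	/* KM_SLEEP is safe here */
	pause_cpus(NULL, NULL);
	disp_dq_assign(info, npris);	/* kcopy + pointer swap only */
	start_cpus();
	disp_dq_free(info);		/* kmem_free may block; CPUs running */
}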
357 355
358 356 static void
359 357 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
360 358 {
361 359 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
362 360 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
363 361 sizeof (long), KM_SLEEP);
364 362 dptr->dp = dp;
365 363 }
366 364
367 365 static void
368 366 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
369 367 {
370 368 disp_t *dp;
371 369
372 370 dp = dptr->dp;
373 371 dptr->olddispq = dp->disp_q;
374 372 dptr->olddqactmap = dp->disp_qactmap;
375 373 dptr->oldnglobpris = dp->disp_npri;
376 374
377 375 ASSERT(dptr->oldnglobpris < numpris);
378 376
379 377 if (dptr->olddispq != NULL) {
380 378 /*
381 379 * Use kcopy because bcopy is platform-specific
382 380 * and could block while we might have paused the cpus.
383 381 */
384 382 (void) kcopy(dptr->olddispq, dptr->newdispq,
385 383 dptr->oldnglobpris * sizeof (dispq_t));
386 384 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
387 385 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
388 386 sizeof (long));
389 387 }
390 388 dp->disp_q = dptr->newdispq;
391 389 dp->disp_qactmap = dptr->newdqactmap;
392 390 dp->disp_q_limit = &dptr->newdispq[numpris];
393 391 dp->disp_npri = numpris;
394 392 }
395 393
396 394 static void
397 395 disp_dq_free(struct disp_queue_info *dptr)
398 396 {
399 397 if (dptr->olddispq != NULL)
400 398 kmem_free(dptr->olddispq,
401 399 dptr->oldnglobpris * sizeof (dispq_t));
402 400 if (dptr->olddqactmap != NULL)
403 401 kmem_free(dptr->olddqactmap,
404 402 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
405 403 }
406 404
407 405 /*
408 406 * For a newly created CPU, initialize the dispatch queue.
409 407 * This is called before the CPU is known through cpu[] or on any lists.
410 408 */
411 409 void
412 410 disp_cpu_init(cpu_t *cp)
413 411 {
414 412 disp_t *dp;
415 413 dispq_t *newdispq;
416 414 ulong_t *newdqactmap;
417 415
418 416 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
419 417
420 418 if (cp == cpu0_disp.disp_cpu)
421 419 dp = &cpu0_disp;
422 420 else
423 421 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
424 422 bzero(dp, sizeof (disp_t));
425 423 cp->cpu_disp = dp;
426 424 dp->disp_cpu = cp;
427 425 dp->disp_maxrunpri = -1;
428 426 dp->disp_max_unbound_pri = -1;
429 427 DISP_LOCK_INIT(&cp->cpu_thread_lock);
430 428 /*
431 429 * Allocate memory for the dispatcher queue headers
432 430 * and the active queue bitmap.
433 431 */
434 432 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
435 433 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
436 434 sizeof (long), KM_SLEEP);
437 435 dp->disp_q = newdispq;
438 436 dp->disp_qactmap = newdqactmap;
439 437 dp->disp_q_limit = &newdispq[v.v_nglobpris];
440 438 dp->disp_npri = v.v_nglobpris;
441 439 }
442 440
443 441 void
444 442 disp_cpu_fini(cpu_t *cp)
445 443 {
446 444 ASSERT(MUTEX_HELD(&cpu_lock));
447 445
448 446 disp_kp_free(cp->cpu_disp);
449 447 if (cp->cpu_disp != &cpu0_disp)
450 448 kmem_free(cp->cpu_disp, sizeof (disp_t));
451 449 }
452 450
453 451 /*
454 452 * Allocate new, larger kpreempt dispatch queue to replace the old one.
455 453 */
456 454 void
457 455 disp_kp_alloc(disp_t *dq, pri_t npri)
458 456 {
459 457 struct disp_queue_info mem_info;
460 458
461 459 if (npri > dq->disp_npri) {
462 460 /*
463 461 * Allocate memory for the new array.
464 462 */
465 463 disp_dq_alloc(&mem_info, npri, dq);
466 464
467 465 /*
468 466 * We need to copy the old structures to the new
469 467 * and free the old.
470 468 */
471 469 disp_dq_assign(&mem_info, npri);
472 470 disp_dq_free(&mem_info);
473 471 }
474 472 }
475 473
476 474 /*
477 475 * Free dispatch queue.
478 476 * Used for the kpreempt queues for a removed CPU partition and
479 477 * for the per-CPU queues of deleted CPUs.
480 478 */
481 479 void
482 480 disp_kp_free(disp_t *dq)
483 481 {
484 482 struct disp_queue_info mem_info;
485 483
486 484 mem_info.olddispq = dq->disp_q;
487 485 mem_info.olddqactmap = dq->disp_qactmap;
488 486 mem_info.oldnglobpris = dq->disp_npri;
489 487 disp_dq_free(&mem_info);
490 488 }
491 489
492 490 /*
493 491 * End dispatcher and scheduler initialization.
494 492 */
495 493
496 494 /*
497 495 * See if there's anything to do other than remain idle.
498 496 * Return non-zero if there is.
499 497 *
500 498 * This function must be called with high spl, or with
501 499 * kernel preemption disabled to prevent the partition's
502 500 * active cpu list from changing while being traversed.
503 501 *
504 502 * This is essentially a simpler version of disp_getwork()
505 503 * to be called by CPUs preparing to "halt".
506 504 */
507 505 int
508 506 disp_anywork(void)
509 507 {
510 508 cpu_t *cp = CPU;
511 509 cpu_t *ocp;
512 510 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
513 511
514 512 if (!(cp->cpu_flags & CPU_OFFLINE)) {
515 513 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
516 514 return (1);
517 515
518 516 for (ocp = cp->cpu_next_part; ocp != cp;
519 517 ocp = ocp->cpu_next_part) {
520 518 ASSERT(CPU_ACTIVE(ocp));
521 519
522 520 /*
523 521 * Something has appeared on the local run queue.
524 522 */
525 523 if (*local_nrunnable > 0)
526 524 return (1);
527 525 /*
528 526 * If we encounter another idle CPU that will
529 527 * soon be trolling around through disp_anywork(),
530 528 * terminate our walk here and let this other CPU
531 529 * patrol the next part of the list.
532 530 */
533 531 if (ocp->cpu_dispatch_pri == -1 &&
534 532 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
535 533 return (0);
536 534 /*
537 535 * Work can be taken from another CPU if:
538 536 * - There is unbound work on the run queue
539 537 * - That work isn't a thread undergoing a
540 538 * context switch on an otherwise empty queue.
541 539 * - The CPU isn't running the idle loop.
542 540 */
543 541 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
544 542 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
545 543 ocp->cpu_disp->disp_nrunnable == 1) &&
546 544 ocp->cpu_dispatch_pri != -1)
547 545 return (1);
548 546 }
549 547 }
550 548 return (0);
551 549 }
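
The steal test in the loop above packs three conditions into one expression; restated as a standalone predicate (a sketch with hypothetical parameter names, not code from this change):

/*
 * Another CPU's work is stealable iff it has unbound runnable work,
 * that work is not just a context-switching thread alone on the queue,
 * and the CPU is not already idle (it would pick the work up itself).
 */
static int
stealable(int max_unbound_pri, int dontsteal, int nrunnable, int disp_pri)
{
	return (max_unbound_pri != -1 &&
	    !(dontsteal && nrunnable == 1) &&
	    disp_pri != -1);
}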
552 550
553 551 /*
554 552 * Called when CPU enters the idle loop
555 553 */
556 554 static void
557 555 idle_enter()
558 556 {
559 557 cpu_t *cp = CPU;
560 558
561 559 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
562 560 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
563 561 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
564 562 }
565 563
566 564 /*
567 565 * Called when CPU exits the idle loop
568 566 */
569 567 static void
570 568 idle_exit()
571 569 {
572 570 cpu_t *cp = CPU;
573 571
574 572 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
575 573 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
576 574 }
577 575
578 576 /*
579 577 * Idle loop.
580 578 */
581 579 void
582 580 idle()
583 581 {
584 582 struct cpu *cp = CPU; /* pointer to this CPU */
585 583 kthread_t *t; /* taken thread */
586 584
587 585 idle_enter();
588 586
589 587 /*
590 588 * Uniprocessor version of idle loop.
591 589 * Do this until notified that we're on an actual multiprocessor.
592 590 */
593 591 while (ncpus == 1) {
594 592 if (cp->cpu_disp->disp_nrunnable == 0) {
595 593 (*idle_cpu)();
596 594 continue;
597 595 }
598 596 idle_exit();
599 597 swtch();
600 598
601 599 idle_enter(); /* returned from swtch */
602 600 }
603 601
604 602 /*
605 603 * Multiprocessor idle loop.
606 604 */
607 605 for (;;) {
608 606 /*
609 607 * If CPU is completely quiesced by p_online(2), just wait
610 608 * here with minimal bus traffic until put online.
611 609 */
612 610 while (cp->cpu_flags & CPU_QUIESCED)
613 611 (*idle_cpu)();
614 612
615 613 if (cp->cpu_disp->disp_nrunnable != 0) {
616 614 idle_exit();
617 615 swtch();
618 616 } else {
619 617 if (cp->cpu_flags & CPU_OFFLINE)
620 618 continue;
621 619 if ((t = disp_getwork(cp)) == NULL) {
622 620 if (cp->cpu_chosen_level != -1) {
623 621 disp_t *dp = cp->cpu_disp;
624 622 disp_t *kpq;
625 623
626 624 disp_lock_enter(&dp->disp_lock);
627 625 /*
628 626 * Set kpq under lock to prevent
629 627 * migration between partitions.
630 628 */
631 629 kpq = &cp->cpu_part->cp_kp_queue;
632 630 if (kpq->disp_maxrunpri == -1)
633 631 cp->cpu_chosen_level = -1;
634 632 disp_lock_exit(&dp->disp_lock);
635 633 }
636 634 (*idle_cpu)();
637 635 continue;
638 636 }
639 637 /*
640 638 * If there was a thread but we couldn't steal
641 639 * it, then keep trying.
642 640 */
643 641 if (t == T_DONTSTEAL)
644 642 continue;
645 643 idle_exit();
646 644 swtch_to(t);
647 645 }
648 646 idle_enter(); /* returned from swtch/swtch_to */
649 647 }
650 648 }
651 649
652 650
653 651 /*
654 652 * Preempt the currently running thread in favor of the highest
655 653 * priority thread. The class of the current thread controls
656 654 * where it goes on the dispatcher queues. If panicking, turn
657 655 * preemption off.
658 656 */
659 657 void
660 658 preempt()
661 659 {
662 660 kthread_t *t = curthread;
663 661 klwp_t *lwp = ttolwp(curthread);
664 662
665 663 if (panicstr)
666 664 return;
667 665
668 666 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
669 667
670 668 thread_lock(t);
671 669
672 670 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
673 671 /*
674 672 * this thread has already been chosen to be run on
675 673 * another CPU. Clear kprunrun on this CPU since we're
676 674 * already headed for swtch().
677 675 */
678 676 CPU->cpu_kprunrun = 0;
679 677 thread_unlock_nopreempt(t);
680 678 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
681 679 } else {
682 680 if (lwp != NULL)
683 681 lwp->lwp_ru.nivcsw++;
684 682 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
685 683 THREAD_TRANSITION(t);
686 684 CL_PREEMPT(t);
687 685 DTRACE_SCHED(preempt);
688 686 thread_unlock_nopreempt(t);
689 687
690 688 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
691 689
692 690 swtch(); /* clears CPU->cpu_runrun via disp() */
693 691 }
694 692 }
695 693
696 694 extern kthread_t *thread_unpin();
697 695
698 696 /*
699 697 * disp() - find the highest priority thread for this processor to run, and
700 698 * set it in TS_ONPROC state so that resume() can be called to run it.
701 699 */
702 700 static kthread_t *
703 701 disp()
704 702 {
705 703 cpu_t *cpup;
706 704 disp_t *dp;
707 705 kthread_t *tp;
708 706 dispq_t *dq;
709 707 int maxrunword;
710 708 pri_t pri;
711 709 disp_t *kpq;
712 710
713 711 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
714 712
715 713 cpup = CPU;
716 714 /*
717 715 * Find the highest priority loaded, runnable thread.
718 716 */
719 717 dp = cpup->cpu_disp;
720 718
721 719 reschedule:
722 720 /*
723 721 * If there is more important work on the global queue with a better
724 722 * priority than the maximum on this CPU, take it now.
725 723 */
726 724 kpq = &cpup->cpu_part->cp_kp_queue;
727 725 while ((pri = kpq->disp_maxrunpri) >= 0 &&
728 726 pri >= dp->disp_maxrunpri &&
729 727 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
730 728 (tp = disp_getbest(kpq)) != NULL) {
731 729 if (disp_ratify(tp, kpq) != NULL) {
732 730 TRACE_1(TR_FAC_DISP, TR_DISP_END,
733 731 "disp_end:tid %p", tp);
734 732 return (tp);
735 733 }
736 734 }
737 735
738 736 disp_lock_enter(&dp->disp_lock);
739 737 pri = dp->disp_maxrunpri;
740 738
741 739 /*
742 740 * If there is nothing to run, look at what's runnable on other queues.
743 741 * Choose the idle thread if the CPU is quiesced.
744 742 * Note that CPUs that have the CPU_OFFLINE flag set can still run
745 743 * interrupt threads, which will be the only threads on the CPU's own
746 744 * queue, but cannot run threads from other queues.
747 745 */
748 746 if (pri == -1) {
749 747 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
750 748 disp_lock_exit(&dp->disp_lock);
751 749 if ((tp = disp_getwork(cpup)) == NULL ||
752 750 tp == T_DONTSTEAL) {
753 751 tp = cpup->cpu_idle_thread;
754 752 (void) splhigh();
755 753 THREAD_ONPROC(tp, cpup);
756 754 cpup->cpu_dispthread = tp;
757 755 cpup->cpu_dispatch_pri = -1;
758 756 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
759 757 cpup->cpu_chosen_level = -1;
760 758 }
761 759 } else {
762 760 disp_lock_exit_high(&dp->disp_lock);
763 761 tp = cpup->cpu_idle_thread;
764 762 THREAD_ONPROC(tp, cpup);
765 763 cpup->cpu_dispthread = tp;
766 764 cpup->cpu_dispatch_pri = -1;
767 765 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
768 766 cpup->cpu_chosen_level = -1;
[657 lines elided]
769 767 }
770 768 TRACE_1(TR_FAC_DISP, TR_DISP_END,
771 769 "disp_end:tid %p", tp);
772 770 return (tp);
773 771 }
774 772
775 773 dq = &dp->disp_q[pri];
776 774 tp = dq->dq_first;
777 775
778 776 ASSERT(tp != NULL);
779 - ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
780 777
781 778 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
782 779
783 780 /*
784 781 * Found it so remove it from queue.
785 782 */
786 783 dp->disp_nrunnable--;
787 784 dq->dq_sruncnt--;
788 785 if ((dq->dq_first = tp->t_link) == NULL) {
789 786 ulong_t *dqactmap = dp->disp_qactmap;
790 787
791 788 ASSERT(dq->dq_sruncnt == 0);
792 789 dq->dq_last = NULL;
793 790
794 791 /*
795 792 * The queue is empty, so the corresponding bit needs to be
796 793 * turned off in dqactmap. If nrunnable != 0, we just took the
797 794 * last runnable thread off the highest queue, so we must
798 795 * recompute disp_maxrunpri.
799 796 */
800 797 maxrunword = pri >> BT_ULSHIFT;
801 798 dqactmap[maxrunword] &= ~BT_BIW(pri);
802 799
803 800 if (dp->disp_nrunnable == 0) {
804 801 dp->disp_max_unbound_pri = -1;
805 802 dp->disp_maxrunpri = -1;
806 803 } else {
807 804 int ipri;
[18 lines elided]
808 805
809 806 ipri = bt_gethighbit(dqactmap, maxrunword);
810 807 dp->disp_maxrunpri = ipri;
811 808 if (ipri < dp->disp_max_unbound_pri)
812 809 dp->disp_max_unbound_pri = ipri;
813 810 }
814 811 } else {
815 812 tp->t_link = NULL;
816 813 }
817 814
818 - /*
819 - * Set TS_DONT_SWAP flag to prevent another processor from swapping
820 - * out this thread before we have a chance to run it.
821 - * While running, it is protected against swapping by t_lock.
822 - */
823 - tp->t_schedflag |= TS_DONT_SWAP;
824 815 cpup->cpu_dispthread = tp; /* protected by spl only */
825 816 cpup->cpu_dispatch_pri = pri;
826 817 ASSERT(pri == DISP_PRIO(tp));
827 818 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
828 819 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
829 820
830 821 ASSERT(tp != NULL);
831 822 TRACE_1(TR_FAC_DISP, TR_DISP_END,
832 823 "disp_end:tid %p", tp);
833 824
834 825 if (disp_ratify(tp, kpq) == NULL)
835 826 goto reschedule;
836 827
837 828 return (tp);
838 829 }
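
For context on the dqactmap bookkeeping in disp() above: when a queue empties, its bit is cleared and the highest remaining set bit becomes the new disp_maxrunpri. A self-contained sketch of that idea in plain C, standing in for the kernel's BT_BIW()/bt_gethighbit() (illustration only, not kernel code):

#include <stdint.h>

#define	WORD_BITS	64

/*
 * Clear priority 'pri' from the active-queue bitmap, then return the
 * highest priority that still has runnable threads, or -1 if none.
 */
static int
recompute_maxrunpri(uint64_t *map, int nwords, int pri)
{
	map[pri / WORD_BITS] &= ~(1ULL << (pri % WORD_BITS));

	for (int w = nwords - 1; w >= 0; w--) {
		if (map[w] == 0)
			continue;
		for (int b = WORD_BITS - 1; b >= 0; b--) {
			if (map[w] & (1ULL << b))
				return (w * WORD_BITS + b);
		}
	}
	return (-1);
}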
839 830
840 831 /*
841 832 * swtch()
842 833 * Find best runnable thread and run it.
843 834 * Called with the current thread already switched to a new state,
844 835 * on a sleep queue, run queue, stopped, and not zombied.
845 836 * May be called at any spl level less than or equal to LOCK_LEVEL.
846 837 * Always drops spl to the base level (spl0()).
847 838 */
848 839 void
849 840 swtch()
850 841 {
851 842 kthread_t *t = curthread;
852 843 kthread_t *next;
853 844 cpu_t *cp;
854 845
855 846 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
856 847
857 848 if (t->t_flag & T_INTR_THREAD)
858 849 cpu_intr_swtch_enter(t);
859 850
860 851 if (t->t_intr != NULL) {
861 852 /*
862 853 * We are an interrupt thread. Setup and return
863 854 * the interrupted thread to be resumed.
864 855 */
865 856 (void) splhigh(); /* block other scheduler action */
866 857 cp = CPU; /* now protected against migration */
867 858 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
868 859 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
869 860 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
870 861 next = thread_unpin();
871 862 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
872 863 resume_from_intr(next);
873 864 } else {
874 865 #ifdef DEBUG
875 866 if (t->t_state == TS_ONPROC &&
876 867 t->t_disp_queue->disp_cpu == CPU &&
877 868 t->t_preempt == 0) {
878 869 thread_lock(t);
879 870 ASSERT(t->t_state != TS_ONPROC ||
880 871 t->t_disp_queue->disp_cpu != CPU ||
881 872 t->t_preempt != 0); /* cannot migrate */
882 873 thread_unlock_nopreempt(t);
883 874 }
884 875 #endif /* DEBUG */
885 876 cp = CPU;
886 877 next = disp(); /* returns with spl high */
887 878 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
888 879
889 880 /* OK to steal anything left on run queue */
890 881 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
891 882
892 883 if (next != t) {
893 884 hrtime_t now;
894 885
895 886 now = gethrtime_unscaled();
896 887 pg_ev_thread_swtch(cp, now, t, next);
897 888
898 889 /*
899 890 * If t was previously in the TS_ONPROC state,
900 891 * setfrontdq and setbackdq won't have set its t_waitrq.
901 892 * Since we now finally know that we're switching away
902 893 * from this thread, set its t_waitrq if it is on a run
903 894 * queue.
904 895 */
905 896 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
906 897 t->t_waitrq = now;
907 898 }
908 899
909 900 /*
910 901 * restore mstate of thread that we are switching to
911 902 */
912 903 restore_mstate(next);
913 904
914 905 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
915 906 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
916 907 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
917 908
918 909 if (dtrace_vtime_active)
919 910 dtrace_vtime_switch(next);
920 911
921 912 resume(next);
922 913 /*
923 914 * The TR_RESUME_END and TR_SWTCH_END trace points
924 915 * appear at the end of resume(), because we may not
925 916 * return here
926 917 */
927 918 } else {
928 919 if (t->t_flag & T_INTR_THREAD)
929 920 cpu_intr_swtch_exit(t);
930 921 /*
931 922 * Threads that enqueue themselves on a run queue defer
932 923 * setting t_waitrq. It is then either set in swtch()
933 924 * when the CPU is actually yielded, or not at all if it
934 925 * is remaining on the CPU.
935 926 * There is however a window between where the thread
936 927 * placed itself on a run queue, and where it selects
937 928 * itself in disp(), where a third party (eg. clock()
938 929 * doing tick processing) may have re-enqueued this
939 930 * thread, setting t_waitrq in the process. We detect
940 931 * this race by noticing that despite switching to
941 932 * ourself, our t_waitrq has been set, and should be
942 933 * cleared.
943 934 */
944 935 if (t->t_waitrq != 0)
945 936 t->t_waitrq = 0;
946 937
947 938 pg_ev_thread_remain(cp, t);
948 939
949 940 DTRACE_SCHED(remain__cpu);
950 941 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
951 942 (void) spl0();
952 943 }
953 944 }
954 945 }
955 946
956 947 /*
957 948 * swtch_from_zombie()
958 949 * Special case of swtch(), which allows checks for TS_ZOMB to be
959 950 * eliminated from normal resume.
960 951 * Find best runnable thread and run it.
961 952 * Called with the current thread zombied.
962 953 * Zombies cannot migrate, so CPU references are safe.
963 954 */
964 955 void
965 956 swtch_from_zombie()
966 957 {
967 958 kthread_t *next;
968 959 cpu_t *cpu = CPU;
969 960
970 961 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
971 962
972 963 ASSERT(curthread->t_state == TS_ZOMB);
973 964
974 965 next = disp(); /* returns with spl high */
975 966 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
976 967 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
977 968 ASSERT(next != curthread);
978 969 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
979 970
980 971 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
981 972
982 973 restore_mstate(next);
983 974
984 975 if (dtrace_vtime_active)
985 976 dtrace_vtime_switch(next);
986 977
987 978 resume_from_zombie(next);
988 979 /*
989 980 * The TR_RESUME_END and TR_SWTCH_END trace points
990 981 * appear at the end of resume(), because we certainly will not
991 982 * return here
992 983 */
993 984 }
994 985
995 986 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
996 987
997 988 /*
998 989 * search_disp_queues()
999 990 * Search the given dispatch queues for thread tp.
1000 991 * Return 1 if tp is found, otherwise return 0.
1001 992 */
1002 993 static int
1003 994 search_disp_queues(disp_t *dp, kthread_t *tp)
1004 995 {
1005 996 dispq_t *dq;
1006 997 dispq_t *eq;
1007 998
1008 999 disp_lock_enter_high(&dp->disp_lock);
1009 1000
1010 1001 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 1002 kthread_t *rp;
1012 1003
1013 1004 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 1005
1015 1006 for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 1007 if (tp == rp) {
1017 1008 disp_lock_exit_high(&dp->disp_lock);
1018 1009 return (1);
1019 1010 }
1020 1011 }
1021 1012 disp_lock_exit_high(&dp->disp_lock);
1022 1013
1023 1014 return (0);
1024 1015 }
1025 1016
1026 1017 /*
1027 1018 * thread_on_queue()
1028 1019 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1029 1020 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030 1021 */
1031 1022 static int
1032 1023 thread_on_queue(kthread_t *tp)
1033 1024 {
1034 1025 cpu_t *cp;
1035 1026 struct cpupart *part;
1036 1027
1037 1028 ASSERT(getpil() >= DISP_LEVEL);
1038 1029
1039 1030 /*
1040 1031 * Search the per-CPU dispatch queues for tp.
1041 1032 */
1042 1033 cp = CPU;
1043 1034 do {
1044 1035 if (search_disp_queues(cp->cpu_disp, tp))
1045 1036 return (1);
1046 1037 } while ((cp = cp->cpu_next_onln) != CPU);
1047 1038
1048 1039 /*
1049 1040 * Search the partition-wide kpreempt queues for tp.
1050 1041 */
1051 1042 part = CPU->cpu_part;
1052 1043 do {
1053 1044 if (search_disp_queues(&part->cp_kp_queue, tp))
1054 1045 return (1);
1055 1046 } while ((part = part->cp_next) != CPU->cpu_part);
1056 1047
1057 1048 return (0);
1058 1049 }
1059 1050
1060 1051 #else
1061 1052
1062 1053 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1063 1054
1064 1055 #endif /* DEBUG */
1065 1056
1066 1057 /*
1067 1058 * like swtch(), but switch to a specified thread taken from another CPU.
1068 1059 * called with spl high..
1069 1060 */
1070 1061 void
1071 1062 swtch_to(kthread_t *next)
1072 1063 {
1073 1064 cpu_t *cp = CPU;
1074 1065 hrtime_t now;
1075 1066
1076 1067 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 1068
1078 1069 /*
1079 1070 * Update context switch statistics.
1080 1071 */
1081 1072 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 1073
1083 1074 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 1075
1085 1076 now = gethrtime_unscaled();
1086 1077 pg_ev_thread_swtch(cp, now, curthread, next);
1087 1078
1088 1079 /* OK to steal anything left on run queue */
1089 1080 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 1081
1091 1082 /* record last execution time */
1092 1083 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 1084
1094 1085 /*
1095 1086 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096 1087 * won't have set its t_waitrq. Since we now finally know that we're
1097 1088 * switching away from this thread, set its t_waitrq if it is on a run
1098 1089 * queue.
1099 1090 */
1100 1091 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 1092 curthread->t_waitrq = now;
1102 1093 }
1103 1094
1104 1095 /* restore next thread to previously running microstate */
1105 1096 restore_mstate(next);
1106 1097
1107 1098 if (dtrace_vtime_active)
1108 1099 dtrace_vtime_switch(next);
1109 1100
1110 1101 resume(next);
1111 1102 /*
1112 1103 * The TR_RESUME_END and TR_SWTCH_END trace points
1113 1104 * appear at the end of resume(), because we may not
1114 1105 * return here
1115 1106 */
1116 1107 }
1117 1108
1118 1109 #define CPU_IDLING(pri) ((pri) == -1)
1119 1110
1120 1111 static void
1121 1112 cpu_resched(cpu_t *cp, pri_t tpri)
1122 1113 {
1123 1114 int call_poke_cpu = 0;
1124 1115 pri_t cpupri = cp->cpu_dispatch_pri;
1125 1116
1126 1117 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127 1118 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 1119 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 1120 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 1121 cp->cpu_runrun = 1;
1131 1122 aston(cp->cpu_dispthread);
1132 1123 if (tpri < kpreemptpri && cp != CPU)
1133 1124 call_poke_cpu = 1;
1134 1125 }
1135 1126 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 1127 cp->cpu_kprunrun = 1;
1137 1128 if (cp != CPU)
1138 1129 call_poke_cpu = 1;
1139 1130 }
1140 1131 }
1141 1132
1142 1133 /*
1143 1134 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144 1135 */
1145 1136 membar_enter();
1146 1137
1147 1138 if (call_poke_cpu)
1148 1139 poke_cpu(cp->cpu_id);
1149 1140 }
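
To summarize cpu_resched() above: cpu_runrun requests a user-level preemption, cpu_kprunrun a kernel-level one, and a cross-CPU poke is only needed for a remote target. A sketch of that decision (hypothetical helper; the real code also issues aston() on the dispatched thread, and the poke applies only when cp != CPU):

/* Sketch, not kernel code: which preemption flags cpu_resched() raises. */
static void
resched_flags(int tpri, int upreemptpri, int kpreemptpri,
    int *runrun, int *kprunrun, int *poke_remote_cpu)
{
	if (tpri >= upreemptpri && *runrun == 0) {
		*runrun = 1;			/* user-level preemption */
		if (tpri < kpreemptpri)
			*poke_remote_cpu = 1;
	}
	if (tpri >= kpreemptpri && *kprunrun == 0) {
		*kprunrun = 1;			/* kernel preemption */
		*poke_remote_cpu = 1;
	}
}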
1150 1141
1151 1142 /*
1152 1143 * setbackdq() keeps runqs balanced such that the difference in length
1153 1144 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154 1145 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155 1146 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156 1147 * try to keep runqs perfectly balanced regardless of the thread priority.
1157 1148 */
1158 1149 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1159 1150 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1160 1151 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
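
The balancing policy described above reduces to a small comparison that setbackdq() applies when choosing between a candidate CPU and its neighbor; as a standalone sketch (hypothetical function, with the constants inlined from the macros above):

/*
 * Move to the neighbor only if our candidate queue is long (beyond the
 * allowed difference, or at all for priorities below RUNQ_MATCH_PRI or
 * when TS_RUNQMATCH is set) and the neighbor's queue is shorter.
 */
static int
prefer_neighbor(int qlen_here, int qlen_there, int tpri, int runqmatch)
{
	int qlen = qlen_here;

	if (tpri >= 16 /* RUNQ_MATCH_PRI */ && !runqmatch)
		qlen -= 2;	/* RUNQ_MAX_DIFF */

	return (qlen > 0 && qlen_there < qlen);
}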
1161 1152
1162 1153 /*
1163 1154 * Macro that evaluates to true if it is likely that the thread has cache
1164 1155 * warmth. This is based on the amount of time that has elapsed since the
1165 1156 * thread last ran. If that amount of time is less than "rechoose_interval"
1166 1157 * ticks, then we decide that the thread has enough cache warmth to warrant
1167 1158 * some affinity for t->t_cpu.
1168 1159 */
1169 1160 #define THREAD_HAS_CACHE_WARMTH(thread) \
1170 1161 ((thread == curthread) || \
1171 1162 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
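
Equivalently: a thread is presumed cache-warm if it is the caller itself or if it last ran within rechoose_interval ticks. A minimal restatement of the macro (sketch, not kernel code):

/* Warm if currently running here, or dispatched within the interval. */
static int
has_cache_warmth(int is_curthread, long now_ticks, long t_disp_time,
    int rechoose_interval)
{
	return (is_curthread ||
	    (now_ticks - t_disp_time) <= rechoose_interval);
}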
1172 1163 /*
1173 1164 * Put the specified thread on the back of the dispatcher
1174 1165 * queue corresponding to its current priority.
1175 1166 *
1176 1167 * Called with the thread in transition, onproc or stopped state
1177 1168 * and locked (transition implies locked) and at high spl.
1178 1169 * Returns with the thread in TS_RUN state and still locked.
1179 1170 */
1180 1171 void
1181 1172 setbackdq(kthread_t *tp)
1182 1173 {
1183 1174 dispq_t *dq;
[350 lines elided]
1184 1175 disp_t *dp;
1185 1176 cpu_t *cp;
1186 1177 pri_t tpri;
1187 1178 int bound;
1188 1179 boolean_t self;
1189 1180
1190 1181 ASSERT(THREAD_LOCK_HELD(tp));
1191 1182 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192 1183 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1193 1184
1194 - /*
1195 - * If thread is "swapped" or on the swap queue don't
1196 - * queue it, but wake sched.
1197 - */
1198 - if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199 - disp_swapped_setrun(tp);
1200 - return;
1201 - }
1202 -
1203 1185 self = (tp == curthread);
1204 1186
1205 1187 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206 1188 bound = 1;
1207 1189 else
1208 1190 bound = 0;
1209 1191
1210 1192 tpri = DISP_PRIO(tp);
1211 1193 if (ncpus == 1)
1212 1194 cp = tp->t_cpu;
1213 1195 else if (!bound) {
1214 1196 if (tpri >= kpqpri) {
1215 1197 setkpdq(tp, SETKP_BACK);
1216 1198 return;
1217 1199 }
1218 1200
1219 1201 /*
1220 1202 * We'll generally let this thread continue to run where
1221 1203 * it last ran...but will consider migration if:
1222 1204 * - The thread probably doesn't have much cache warmth.
1223 1205 * - The CPU where it last ran is the target of an offline
1224 1206 * request.
1225 1207 * - The thread last ran outside its home lgroup.
1226 1208 */
1227 1209 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228 1210 (tp->t_cpu == cpu_inmotion)) {
1229 1211 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230 1212 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231 1213 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232 1214 self ? tp->t_cpu : NULL);
1233 1215 } else {
1234 1216 cp = tp->t_cpu;
1235 1217 }
1236 1218
1237 1219 if (tp->t_cpupart == cp->cpu_part) {
1238 1220 int qlen;
1239 1221
1240 1222 /*
1241 1223 * Perform any CMT load balancing
1242 1224 */
1243 1225 cp = cmt_balance(tp, cp);
1244 1226
1245 1227 /*
1246 1228 * Balance across the run queues
1247 1229 */
1248 1230 qlen = RUNQ_LEN(cp, tpri);
1249 1231 if (tpri >= RUNQ_MATCH_PRI &&
1250 1232 !(tp->t_schedflag & TS_RUNQMATCH))
1251 1233 qlen -= RUNQ_MAX_DIFF;
1252 1234 if (qlen > 0) {
1253 1235 cpu_t *newcp;
1254 1236
1255 1237 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 1238 newcp = cp->cpu_next_part;
1257 1239 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 1240 newcp = cp->cpu_next_part;
1259 1241 }
1260 1242
1261 1243 if (RUNQ_LEN(newcp, tpri) < qlen) {
1262 1244 DTRACE_PROBE3(runq__balance,
1263 1245 kthread_t *, tp,
1264 1246 cpu_t *, cp, cpu_t *, newcp);
1265 1247 cp = newcp;
1266 1248 }
1267 1249 }
1268 1250 } else {
1269 1251 /*
1270 1252 * Migrate to a cpu in the new partition.
1271 1253 */
1272 1254 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273 1255 tp->t_lpl, tp->t_pri, NULL);
1274 1256 }
1275 1257 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 1258 } else {
1277 1259 /*
1278 1260 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 1261 * a short time until weak binding that existed when the
1280 1262 * strong binding was established has dropped) so we must
1281 1263 * favour weak binding over strong.
1282 1264 */
1283 1265 cp = tp->t_weakbound_cpu ?
1284 1266 tp->t_weakbound_cpu : tp->t_bound_cpu;
1285 1267 }
1286 1268 /*
1287 1269 * A thread that is ONPROC may be temporarily placed on the run queue
1288 1270 * but then chosen to run again by disp. If the thread we're placing on
1289 1271 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290 1272 * replacement process is actually scheduled in swtch(). In this
1291 1273 * situation, curthread is the only thread that could be in the ONPROC
1292 1274 * state.
1293 1275 */
1294 1276 if ((!self) && (tp->t_waitrq == 0)) {
1295 1277 hrtime_t curtime;
1296 1278
1297 1279 curtime = gethrtime_unscaled();
1298 1280 (void) cpu_update_pct(tp, curtime);
1299 1281 tp->t_waitrq = curtime;
1300 1282 } else {
1301 1283 (void) cpu_update_pct(tp, gethrtime_unscaled());
1302 1284 }
1303 1285
1304 1286 dp = cp->cpu_disp;
1305 1287 disp_lock_enter_high(&dp->disp_lock);
1306 1288
1307 1289 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308 1290 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309 1291 tpri, cp, tp);
1310 1292
1311 1293 #ifndef NPROBE
1312 1294 /* Kernel probe */
1313 1295 if (tnf_tracing_active)
1314 1296 tnf_thread_queue(tp, cp, tpri);
1315 1297 #endif /* NPROBE */
1316 1298
1317 1299 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 1300
1319 1301 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1320 1302 tp->t_disp_queue = dp;
1321 1303 tp->t_link = NULL;
1322 1304
1323 1305 dq = &dp->disp_q[tpri];
1324 1306 dp->disp_nrunnable++;
1325 1307 if (!bound)
1326 1308 dp->disp_steal = 0;
1327 1309 membar_enter();
1328 1310
1329 1311 if (dq->dq_sruncnt++ != 0) {
1330 1312 ASSERT(dq->dq_first != NULL);
1331 1313 dq->dq_last->t_link = tp;
1332 1314 dq->dq_last = tp;
1333 1315 } else {
1334 1316 ASSERT(dq->dq_first == NULL);
1335 1317 ASSERT(dq->dq_last == NULL);
1336 1318 dq->dq_first = dq->dq_last = tp;
1337 1319 BT_SET(dp->disp_qactmap, tpri);
1338 1320 if (tpri > dp->disp_maxrunpri) {
1339 1321 dp->disp_maxrunpri = tpri;
1340 1322 membar_enter();
1341 1323 cpu_resched(cp, tpri);
1342 1324 }
1343 1325 }
1344 1326
1345 1327 if (!bound && tpri > dp->disp_max_unbound_pri) {
1346 1328 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347 1329 /*
1348 1330 * If there are no other unbound threads on the
1349 1331 * run queue, don't allow other CPUs to steal
1350 1332 * this thread while we are in the middle of a
1351 1333 * context switch. We may just switch to it
1352 1334 * again right away. CPU_DISP_DONTSTEAL is cleared
1353 1335 * in swtch and swtch_to.
1354 1336 */
1355 1337 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356 1338 }
1357 1339 dp->disp_max_unbound_pri = tpri;
1358 1340 }
1359 1341 (*disp_enq_thread)(cp, bound);
1360 1342 }
1361 1343
1362 1344 /*
1363 1345 * Put the specified thread on the front of the dispatcher
1364 1346 * queue corresponding to its current priority.
1365 1347 *
1366 1348 * Called with the thread in transition, onproc or stopped state
1367 1349 * and locked (transition implies locked) and at high spl.
1368 1350 * Returns with the thread in TS_RUN state and still locked.
1369 1351 */
1370 1352 void
1371 1353 setfrontdq(kthread_t *tp)
1372 1354 {
[160 lines elided]
1373 1355 disp_t *dp;
1374 1356 dispq_t *dq;
1375 1357 cpu_t *cp;
1376 1358 pri_t tpri;
1377 1359 int bound;
1378 1360
1379 1361 ASSERT(THREAD_LOCK_HELD(tp));
1380 1362 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381 1363 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1382 1364
1383 - /*
1384 - * If thread is "swapped" or on the swap queue don't
1385 - * queue it, but wake sched.
1386 - */
1387 - if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388 - disp_swapped_setrun(tp);
1389 - return;
1390 - }
1391 -
1392 1365 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393 1366 bound = 1;
1394 1367 else
1395 1368 bound = 0;
1396 1369
1397 1370 tpri = DISP_PRIO(tp);
1398 1371 if (ncpus == 1)
1399 1372 cp = tp->t_cpu;
1400 1373 else if (!bound) {
1401 1374 if (tpri >= kpqpri) {
1402 1375 setkpdq(tp, SETKP_FRONT);
1403 1376 return;
1404 1377 }
1405 1378 cp = tp->t_cpu;
1406 1379 if (tp->t_cpupart == cp->cpu_part) {
1407 1380 /*
1408 1381 * We'll generally let this thread continue to run
1409 1382 * where it last ran, but will consider migration if:
1410 1383 * - The thread last ran outside its home lgroup.
1411 1384 * - The CPU where it last ran is the target of an
1412 1385 * offline request (a thread_nomigrate() on the in
1413 1386 * motion CPU relies on this when forcing a preempt).
1414 1387 * - The thread isn't the highest priority thread where
1415 1388 * it last ran, and it is considered not likely to
1416 1389 * have significant cache warmth.
1417 1390 */
1418 1391 if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419 1392 (cp == cpu_inmotion)) {
1420 1393 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421 1394 (tp == curthread) ? cp : NULL);
1422 1395 } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423 1396 (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424 1397 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425 1398 NULL);
1426 1399 }
1427 1400 } else {
1428 1401 /*
1429 1402 * Migrate to a cpu in the new partition.
1430 1403 */
1431 1404 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432 1405 tp->t_lpl, tp->t_pri, NULL);
1433 1406 }
1434 1407 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 1408 } else {
1436 1409 /*
1437 1410 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 1411 * a short time until weak binding that existed when the
1439 1412 * strong binding was established has dropped) so we must
1440 1413 * favour weak binding over strong.
1441 1414 */
1442 1415 cp = tp->t_weakbound_cpu ?
1443 1416 tp->t_weakbound_cpu : tp->t_bound_cpu;
1444 1417 }
1445 1418
1446 1419 /*
1447 1420 * A thread that is ONPROC may be temporarily placed on the run queue
1448 1421 * but then chosen to run again by disp. If the thread we're placing on
1449 1422 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450 1423 * replacement process is actually scheduled in swtch(). In this
1451 1424 * situation, curthread is the only thread that could be in the ONPROC
1452 1425 * state.
1453 1426 */
1454 1427 if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455 1428 hrtime_t curtime;
1456 1429
1457 1430 curtime = gethrtime_unscaled();
1458 1431 (void) cpu_update_pct(tp, curtime);
1459 1432 tp->t_waitrq = curtime;
1460 1433 } else {
1461 1434 (void) cpu_update_pct(tp, gethrtime_unscaled());
1462 1435 }
1463 1436
1464 1437 dp = cp->cpu_disp;
1465 1438 disp_lock_enter_high(&dp->disp_lock);
1466 1439
1467 1440 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468 1441 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 1442
1470 1443 #ifndef NPROBE
1471 1444 /* Kernel probe */
1472 1445 if (tnf_tracing_active)
1473 1446 tnf_thread_queue(tp, cp, tpri);
1474 1447 #endif /* NPROBE */
1475 1448
1476 1449 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 1450
1478 1451 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1479 1452 tp->t_disp_queue = dp;
1480 1453
1481 1454 dq = &dp->disp_q[tpri];
1482 1455 dp->disp_nrunnable++;
1483 1456 if (!bound)
1484 1457 dp->disp_steal = 0;
1485 1458 membar_enter();
1486 1459
1487 1460 if (dq->dq_sruncnt++ != 0) {
1488 1461 ASSERT(dq->dq_last != NULL);
1489 1462 tp->t_link = dq->dq_first;
1490 1463 dq->dq_first = tp;
1491 1464 } else {
1492 1465 ASSERT(dq->dq_last == NULL);
1493 1466 ASSERT(dq->dq_first == NULL);
1494 1467 tp->t_link = NULL;
1495 1468 dq->dq_first = dq->dq_last = tp;
1496 1469 BT_SET(dp->disp_qactmap, tpri);
1497 1470 if (tpri > dp->disp_maxrunpri) {
1498 1471 dp->disp_maxrunpri = tpri;
1499 1472 membar_enter();
1500 1473 cpu_resched(cp, tpri);
1501 1474 }
1502 1475 }
1503 1476
1504 1477 if (!bound && tpri > dp->disp_max_unbound_pri) {
1505 1478 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506 1479 cp == CPU) {
1507 1480 /*
1508 1481 * If there are no other unbound threads on the
1509 1482 * run queue, don't allow other CPUs to steal
1510 1483 * this thread while we are in the middle of a
1511 1484 * context switch. We may just switch to it
1512 1485 * again right away. CPU_DISP_DONTSTEAL is cleared
1513 1486 * in swtch and swtch_to.
1514 1487 */
1515 1488 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516 1489 }
1517 1490 dp->disp_max_unbound_pri = tpri;
1518 1491 }
1519 1492 (*disp_enq_thread)(cp, bound);
1520 1493 }
1521 1494
1522 1495 /*
1523 1496 * Put a high-priority unbound thread on the kp queue
1524 1497 */
1525 1498 static void
1526 1499 setkpdq(kthread_t *tp, int borf)
1527 1500 {
1528 1501 dispq_t *dq;
1529 1502 disp_t *dp;
1530 1503 cpu_t *cp;
1531 1504 pri_t tpri;
1532 1505
1533 1506 tpri = DISP_PRIO(tp);
1534 1507
1535 1508 dp = &tp->t_cpupart->cp_kp_queue;
1536 1509 disp_lock_enter_high(&dp->disp_lock);
1537 1510
1538 1511 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 1512
1540 1513 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 1514 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 1515 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1543 1516 tp->t_disp_queue = dp;
1544 1517 dp->disp_nrunnable++;
1545 1518 dq = &dp->disp_q[tpri];
1546 1519
1547 1520 if (dq->dq_sruncnt++ != 0) {
1548 1521 if (borf == SETKP_BACK) {
1549 1522 ASSERT(dq->dq_first != NULL);
1550 1523 tp->t_link = NULL;
1551 1524 dq->dq_last->t_link = tp;
1552 1525 dq->dq_last = tp;
1553 1526 } else {
1554 1527 ASSERT(dq->dq_last != NULL);
1555 1528 tp->t_link = dq->dq_first;
1556 1529 dq->dq_first = tp;
1557 1530 }
1558 1531 } else {
1559 1532 if (borf == SETKP_BACK) {
1560 1533 ASSERT(dq->dq_first == NULL);
1561 1534 ASSERT(dq->dq_last == NULL);
1562 1535 dq->dq_first = dq->dq_last = tp;
1563 1536 } else {
1564 1537 ASSERT(dq->dq_last == NULL);
1565 1538 ASSERT(dq->dq_first == NULL);
1566 1539 tp->t_link = NULL;
1567 1540 dq->dq_first = dq->dq_last = tp;
1568 1541 }
1569 1542 BT_SET(dp->disp_qactmap, tpri);
1570 1543 if (tpri > dp->disp_max_unbound_pri)
1571 1544 dp->disp_max_unbound_pri = tpri;
1572 1545 if (tpri > dp->disp_maxrunpri) {
1573 1546 dp->disp_maxrunpri = tpri;
1574 1547 membar_enter();
1575 1548 }
1576 1549 }
1577 1550
1578 1551 cp = tp->t_cpu;
1579 1552 if (tp->t_cpupart != cp->cpu_part) {
1580 1553 /* migrate to a cpu in the new partition */
1581 1554 cp = tp->t_cpupart->cp_cpulist;
1582 1555 }
1583 1556 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584 1557 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 1558 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 1559
1587 1560 #ifndef NPROBE
1588 1561 /* Kernel probe */
1589 1562 if (tnf_tracing_active)
1590 1563 tnf_thread_queue(tp, cp, tpri);
1591 1564 #endif /* NPROBE */
1592 1565
1593 1566 if (cp->cpu_chosen_level < tpri)
1594 1567 cp->cpu_chosen_level = tpri;
1595 1568 cpu_resched(cp, tpri);
1596 1569 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 1570 (*disp_enq_thread)(cp, 0);
1598 1571 }
1599 1572
1600 1573 /*
1601 1574 * Remove a thread from the dispatcher queue if it is on it.
1602 1575 * It is not an error if it is not found but we return whether
1603 1576 * or not it was found in case the caller wants to check.
1604 1577 */
1605 1578 int
1606 1579 dispdeq(kthread_t *tp)
1607 1580 {
1608 1581 disp_t *dp;
1609 1582 dispq_t *dq;
[208 lines elided]
1610 1583 kthread_t *rp;
1611 1584 kthread_t *trp;
1612 1585 kthread_t **ptp;
1613 1586 int tpri;
1614 1587
1615 1588 ASSERT(THREAD_LOCK_HELD(tp));
1616 1589
1617 1590 if (tp->t_state != TS_RUN)
1618 1591 return (0);
1619 1592
1620 - /*
1621 - * The thread is "swapped" or is on the swap queue and
1622 - * hence no longer on the run queue, so return true.
1623 - */
1624 - if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 - return (1);
1626 -
1627 1593 tpri = DISP_PRIO(tp);
1628 1594 dp = tp->t_disp_queue;
1629 1595 ASSERT(tpri < dp->disp_npri);
1630 1596 dq = &dp->disp_q[tpri];
1631 1597 ptp = &dq->dq_first;
1632 1598 rp = *ptp;
1633 1599 trp = NULL;
1634 1600
1635 1601 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 1602
1637 1603 /*
1638 1604 * Search for thread in queue.
1639 1605 * Double links would simplify this at the expense of disp/setrun.
1640 1606 */
1641 1607 while (rp != tp && rp != NULL) {
1642 1608 trp = rp;
1643 1609 ptp = &trp->t_link;
1644 1610 rp = trp->t_link;
1645 1611 }
1646 1612
1647 1613 if (rp == NULL) {
1648 1614 panic("dispdeq: thread not on queue");
1649 1615 }
1650 1616
1651 1617 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 1618
1653 1619 /*
1654 1620 * Found it so remove it from queue.
1655 1621 */
1656 1622 if ((*ptp = rp->t_link) == NULL)
1657 1623 dq->dq_last = trp;
1658 1624
1659 1625 dp->disp_nrunnable--;
1660 1626 if (--dq->dq_sruncnt == 0) {
1661 1627 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 1628 if (dp->disp_nrunnable == 0) {
1663 1629 dp->disp_max_unbound_pri = -1;
1664 1630 dp->disp_maxrunpri = -1;
1665 1631 } else if (tpri == dp->disp_maxrunpri) {
1666 1632 int ipri;
1667 1633
1668 1634 ipri = bt_gethighbit(dp->disp_qactmap,
1669 1635 dp->disp_maxrunpri >> BT_ULSHIFT);
[33 lines elided]
1670 1636 if (ipri < dp->disp_max_unbound_pri)
1671 1637 dp->disp_max_unbound_pri = ipri;
1672 1638 dp->disp_maxrunpri = ipri;
1673 1639 }
1674 1640 }
1675 1641 tp->t_link = NULL;
1676 1642 THREAD_TRANSITION(tp); /* put in intermediate state */
1677 1643 return (1);
1678 1644 }
1679 1645
1680 -
1681 -/*
1682 - * dq_sruninc and dq_srundec are public functions for
1683 - * incrementing/decrementing the sruncnts when a thread on
1684 - * a dispatcher queue is made schedulable/unschedulable by
1685 - * resetting the TS_LOAD flag.
1686 - *
1687 - * The caller MUST have the thread lock and therefore the dispatcher
1688 - * queue lock so that the operation which changes
1689 - * the flag, the operation that checks the status of the thread to
1690 - * determine if it's on a disp queue AND the call to this function
1691 - * are one atomic operation with respect to interrupts.
1692 - */
1693 -
1694 -/*
1695 - * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1696 - */
1697 -void
1698 -dq_sruninc(kthread_t *t)
1699 -{
1700 - ASSERT(t->t_state == TS_RUN);
1701 - ASSERT(t->t_schedflag & TS_LOAD);
1702 -
1703 - THREAD_TRANSITION(t);
1704 - setfrontdq(t);
1705 -}
1706 -
1707 -/*
1708 - * See comment on calling conventions above.
1709 - * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1710 - */
1711 -void
1712 -dq_srundec(kthread_t *t)
1713 -{
1714 - ASSERT(t->t_schedflag & TS_LOAD);
1715 -
1716 - (void) dispdeq(t);
1717 - disp_swapped_enq(t);
1718 -}
1719 -
1720 -/*
1721 - * Change the dispatcher lock of thread to the "swapped_lock"
1722 - * and return with thread lock still held.
1723 - *
1724 - * Called with thread_lock held, in transition state, and at high spl.
1725 - */
1726 -void
1727 -disp_swapped_enq(kthread_t *tp)
1728 -{
1729 - ASSERT(THREAD_LOCK_HELD(tp));
1730 - ASSERT(tp->t_schedflag & TS_LOAD);
1731 -
1732 - switch (tp->t_state) {
1733 - case TS_RUN:
1734 - disp_lock_enter_high(&swapped_lock);
1735 - THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1736 - break;
1737 - case TS_ONPROC:
1738 - disp_lock_enter_high(&swapped_lock);
1739 - THREAD_TRANSITION(tp);
1740 - wake_sched_sec = 1; /* tell clock to wake sched */
1741 - THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1742 - break;
1743 - default:
1744 - panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1745 - }
1746 -}
1747 -
1748 -/*
1749 - * This routine is called by setbackdq/setfrontdq if the thread is
1750 - * not loaded or loaded and on the swap queue.
1751 - *
1752 - * Thread state TS_SLEEP implies that a swapped thread
1753 - * has been woken up and needs to be swapped in by the swapper.
1754 - *
1755 - * Thread state TS_RUN, it implies that the priority of a swapped
1756 - * thread is being increased by scheduling class (e.g. ts_update).
1757 - */
1758 -static void
1759 -disp_swapped_setrun(kthread_t *tp)
1760 -{
1761 - ASSERT(THREAD_LOCK_HELD(tp));
1762 - ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 -
1764 - switch (tp->t_state) {
1765 - case TS_SLEEP:
1766 - disp_lock_enter_high(&swapped_lock);
1767 - /*
1768 - * Wakeup sched immediately (i.e., next tick) if the
1769 - * thread priority is above maxclsyspri.
1770 - */
1771 - if (DISP_PRIO(tp) > maxclsyspri)
1772 - wake_sched = 1;
1773 - else
1774 - wake_sched_sec = 1;
1775 - THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 - break;
1777 - case TS_RUN: /* called from ts_update */
1778 - break;
1779 - default:
1780 - panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781 - }
1782 -}
1783 -
1784 1646 /*
1785 1647 * Make a thread give up its processor. Find the processor on
1786 1648 * which this thread is executing, and have that processor
1787 1649 * preempt.
1788 1650 *
1789 1651 * We allow System Duty Cycle (SDC) threads to be preempted even if
1790 1652 * they are running at kernel priorities. To implement this, we always
1791 1653 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1792 1654 * calls cpu_surrender() very often, we only preempt if there is anyone
1793 1655 * competing with us.
1794 1656 */
1795 1657 void
1796 1658 cpu_surrender(kthread_t *tp)
1797 1659 {
1798 1660 cpu_t *cpup;
1799 1661 int max_pri;
1800 1662 int max_run_pri;
1801 1663 klwp_t *lwp;
1802 1664
1803 1665 ASSERT(THREAD_LOCK_HELD(tp));
1804 1666
1805 1667 if (tp->t_state != TS_ONPROC)
1806 1668 return;
1807 1669 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1808 1670 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 1671 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 1672 if (max_pri < max_run_pri)
1811 1673 max_pri = max_run_pri;
1812 1674
1813 1675 if (tp->t_cid == sysdccid) {
1814 1676 uint_t t_pri = DISP_PRIO(tp);
1815 1677 if (t_pri > max_pri)
1816 1678 return; /* we are not competing w/ anyone */
1817 1679 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 1680 } else {
1819 1681 cpup->cpu_runrun = 1;
1820 1682 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 1683 cpup->cpu_kprunrun = 1;
1822 1684 }
1823 1685 }
1824 1686
1825 1687 /*
 1826 1689 	 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1827 1689 */
1828 1690 membar_enter();
1829 1691
1830 1692 DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 1693
1832 1694 /*
1833 1695 * Make the target thread take an excursion through trap()
1834 1696 * to do preempt() (unless we're already in trap or post_syscall,
1835 1697 * calling cpu_surrender via CL_TRAPRET).
1836 1698 */
1837 1699 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 1700 lwp->lwp_state != LWP_USER) {
1839 1701 aston(tp);
1840 1702 if (cpup != CPU)
1841 1703 poke_cpu(cpup->cpu_id);
1842 1704 }
1843 1705 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 1706 "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 1707 }
1846 1708
1847 1709 /*
1848 1710 * Commit to and ratify a scheduling decision
1849 1711 */
1850 1712 /*ARGSUSED*/
1851 1713 static kthread_t *
1852 1714 disp_ratify(kthread_t *tp, disp_t *kpq)
1853 1715 {
1854 1716 pri_t tpri, maxpri;
1855 1717 pri_t maxkpri;
1856 1718 cpu_t *cpup;
1857 1719
1858 1720 ASSERT(tp != NULL);
1859 1721 /*
1860 1722 * Commit to, then ratify scheduling decision
1861 1723 */
1862 1724 cpup = CPU;
1863 1725 if (cpup->cpu_runrun != 0)
1864 1726 cpup->cpu_runrun = 0;
1865 1727 if (cpup->cpu_kprunrun != 0)
1866 1728 cpup->cpu_kprunrun = 0;
1867 1729 if (cpup->cpu_chosen_level != -1)
1868 1730 cpup->cpu_chosen_level = -1;
1869 1731 membar_enter();
1870 1732 tpri = DISP_PRIO(tp);
1871 1733 maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 1734 maxkpri = kpq->disp_maxrunpri;
1873 1735 if (maxpri < maxkpri)
1874 1736 maxpri = maxkpri;
1875 1737 if (tpri < maxpri) {
1876 1738 /*
1877 1739 * should have done better
1878 1740 * put this one back and indicate to try again
1879 1741 */
1880 1742 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1881 1743 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 1744 thread_lock_high(tp);
1883 1745 THREAD_TRANSITION(tp);
1884 1746 setfrontdq(tp);
1885 1747 thread_unlock_nopreempt(tp);
1886 1748
1887 1749 tp = NULL;
1888 1750 }
1889 1751 return (tp);
1890 1752 }
1891 1753
1892 1754 /*
1893 1755 * See if there is any work on the dispatcher queue for other CPUs.
1894 1756 * If there is, dequeue the best thread and return.
1895 1757 */
1896 1758 static kthread_t *
1897 1759 disp_getwork(cpu_t *cp)
1898 1760 {
1899 1761 cpu_t *ocp; /* other CPU */
1900 1762 cpu_t *ocp_start;
1901 1763 cpu_t *tcp; /* target local CPU */
1902 1764 kthread_t *tp;
1903 1765 kthread_t *retval = NULL;
1904 1766 pri_t maxpri;
1905 1767 disp_t *kpq; /* kp queue for this partition */
1906 1768 lpl_t *lpl, *lpl_leaf;
1907 1769 int leafidx, startidx;
1908 1770 hrtime_t stealtime;
1909 1771 lgrp_id_t local_id;
1910 1772
1911 1773 maxpri = -1;
1912 1774 tcp = NULL;
1913 1775
1914 1776 kpq = &cp->cpu_part->cp_kp_queue;
1915 1777 while (kpq->disp_maxrunpri >= 0) {
1916 1778 /*
1917 1779 * Try to take a thread from the kp_queue.
1918 1780 */
 1919 1781 		tp = disp_getbest(kpq);
1920 1782 if (tp)
1921 1783 return (disp_ratify(tp, kpq));
1922 1784 }
1923 1785
1924 1786 kpreempt_disable(); /* protect the cpu_active list */
1925 1787
1926 1788 /*
1927 1789 * Try to find something to do on another CPU's run queue.
1928 1790 * Loop through all other CPUs looking for the one with the highest
1929 1791 * priority unbound thread.
1930 1792 *
1931 1793 * On NUMA machines, the partition's CPUs are consulted in order of
1932 1794 * distance from the current CPU. This way, the first available
1933 1795 * work found is also the closest, and will suffer the least
1934 1796 * from being migrated.
1935 1797 */
1936 1798 lpl = lpl_leaf = cp->cpu_lpl;
1937 1799 local_id = lpl_leaf->lpl_lgrpid;
1938 1800 leafidx = startidx = 0;
1939 1801
1940 1802 /*
1941 1803 * This loop traverses the lpl hierarchy. Higher level lpls represent
1942 1804 * broader levels of locality
1943 1805 */
1944 1806 do {
1945 1807 /* This loop iterates over the lpl's leaves */
1946 1808 do {
1947 1809 if (lpl_leaf != cp->cpu_lpl)
1948 1810 ocp = lpl_leaf->lpl_cpus;
1949 1811 else
1950 1812 ocp = cp->cpu_next_lpl;
1951 1813
1952 1814 /* This loop iterates over the CPUs in the leaf */
1953 1815 ocp_start = ocp;
1954 1816 do {
1955 1817 pri_t pri;
1956 1818
1957 1819 ASSERT(CPU_ACTIVE(ocp));
1958 1820
1959 1821 /*
1960 1822 * End our stroll around this lpl if:
1961 1823 *
1962 1824 * - Something became runnable on the local
1963 1825 * queue...which also ends our stroll around
1964 1826 * the partition.
1965 1827 *
1966 1828 * - We happen across another idle CPU.
1967 1829 * Since it is patrolling the next portion
1968 1830 * of the lpl's list (assuming it's not
1969 1831 * halted, or busy servicing an interrupt),
1970 1832 * move to the next higher level of locality.
1971 1833 */
1972 1834 if (cp->cpu_disp->disp_nrunnable != 0) {
1973 1835 kpreempt_enable();
1974 1836 return (NULL);
1975 1837 }
1976 1838 if (ocp->cpu_dispatch_pri == -1) {
1977 1839 if (ocp->cpu_disp_flags &
1978 1840 CPU_DISP_HALTED ||
1979 1841 ocp->cpu_intr_actv != 0)
1980 1842 continue;
1981 1843 else
1982 1844 goto next_level;
1983 1845 }
1984 1846
1985 1847 /*
1986 1848 * If there's only one thread and the CPU
1987 1849 * is in the middle of a context switch,
1988 1850 * or it's currently running the idle thread,
1989 1851 * don't steal it.
1990 1852 */
1991 1853 if ((ocp->cpu_disp_flags &
1992 1854 CPU_DISP_DONTSTEAL) &&
1993 1855 ocp->cpu_disp->disp_nrunnable == 1)
1994 1856 continue;
1995 1857
1996 1858 pri = ocp->cpu_disp->disp_max_unbound_pri;
1997 1859 if (pri > maxpri) {
1998 1860 /*
1999 1861 * Don't steal threads that we attempted
2000 1862 * to steal recently until they're ready
2001 1863 * to be stolen again.
2002 1864 */
2003 1865 stealtime = ocp->cpu_disp->disp_steal;
2004 1866 if (stealtime == 0 ||
2005 1867 stealtime - gethrtime() <= 0) {
2006 1868 maxpri = pri;
2007 1869 tcp = ocp;
2008 1870 } else {
2009 1871 /*
2010 1872 * Don't update tcp, just set
2011 1873 * the retval to T_DONTSTEAL, so
2012 1874 * that if no acceptable CPUs
2013 1875 * are found the return value
2014 1876 * will be T_DONTSTEAL rather
2015 1877 * then NULL.
2016 1878 */
2017 1879 retval = T_DONTSTEAL;
2018 1880 }
2019 1881 }
2020 1882 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 1883
2022 1884 /*
2023 1885 * Iterate to the next leaf lpl in the resource set
2024 1886 * at this level of locality. If we hit the end of
2025 1887 * the set, wrap back around to the beginning.
2026 1888 *
 2027 1889 	 * Note: This iteration is NULL terminated for a reason;
2028 1890 * see lpl_topo_bootstrap() in lgrp.c for details.
2029 1891 */
2030 1892 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031 1893 leafidx = 0;
2032 1894 lpl_leaf = lpl->lpl_rset[leafidx];
2033 1895 }
2034 1896 } while (leafidx != startidx);
2035 1897
2036 1898 next_level:
2037 1899 /*
2038 1900 * Expand the search to include farther away CPUs (next
2039 1901 * locality level). The closer CPUs that have already been
2040 1902 * checked will be checked again. In doing so, idle CPUs
 2041 1903 	 * will tend to be more aggressive about stealing from CPUs
2042 1904 * that are closer (since the closer CPUs will be considered
2043 1905 * more often).
2044 1906 * Begin at this level with the CPUs local leaf lpl.
2045 1907 */
2046 1908 if ((lpl = lpl->lpl_parent) != NULL) {
2047 1909 leafidx = startidx = lpl->lpl_id2rset[local_id];
2048 1910 lpl_leaf = lpl->lpl_rset[leafidx];
2049 1911 }
2050 1912 } while (!tcp && lpl);
2051 1913
2052 1914 kpreempt_enable();
2053 1915
2054 1916 /*
2055 1917 * If another queue looks good, and there is still nothing on
2056 1918 * the local queue, try to transfer one or more threads
2057 1919 * from it to our queue.
2058 1920 */
2059 1921 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060 1922 tp = disp_getbest(tcp->cpu_disp);
2061 1923 if (tp == NULL || tp == T_DONTSTEAL)
2062 1924 return (tp);
2063 1925 return (disp_ratify(tp, kpq));
2064 1926 }
2065 1927 return (retval);
2066 1928 }
2067 1929
2068 1930
2069 1931 /*
2070 1932 * disp_fix_unbound_pri()
2071 1933 * Determines the maximum priority of unbound threads on the queue.
2072 1934 * The priority is kept for the queue, but is only increased, never
2073 1935 * reduced unless some CPU is looking for something on that queue.
2074 1936 *
2075 1937 * The priority argument is the known upper limit.
2076 1938 *
2077 1939 * Perhaps this should be kept accurately, but that probably means
2078 1940 * separate bitmaps for bound and unbound threads. Since only idled
2079 1941 * CPUs will have to do this recalculation, it seems better this way.
2080 1942 */
2081 1943 static void
2082 1944 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 1945 {
2084 1946 kthread_t *tp;
2085 1947 dispq_t *dq;
2086 1948 ulong_t *dqactmap = dp->disp_qactmap;
2087 1949 ulong_t mapword;
2088 1950 int wx;
2089 1951
2090 1952 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 1953
2092 1954 ASSERT(pri >= 0); /* checked by caller */
2093 1955
2094 1956 /*
2095 1957 * Start the search at the next lowest priority below the supplied
2096 1958 * priority. This depends on the bitmap implementation.
2097 1959 */
2098 1960 do {
2099 1961 wx = pri >> BT_ULSHIFT; /* index of word in map */
2100 1962
2101 1963 /*
2102 1964 * Form mask for all lower priorities in the word.
2103 1965 */
2104 1966 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2105 1967
2106 1968 /*
2107 1969 * Get next lower active priority.
2108 1970 */
2109 1971 if (mapword != 0) {
2110 1972 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 1973 } else if (wx > 0) {
2112 1974 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 1975 if (pri < 0)
2114 1976 break;
2115 1977 } else {
2116 1978 pri = -1;
2117 1979 break;
2118 1980 }
2119 1981
2120 1982 /*
2121 1983 * Search the queue for unbound, runnable threads.
2122 1984 */
2123 1985 dq = &dp->disp_q[pri];
2124 1986 tp = dq->dq_first;
2125 1987
2126 1988 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 1989 tp = tp->t_link;
2128 1990 }
2129 1991
2130 1992 /*
2131 1993 * If a thread was found, set the priority and return.
2132 1994 */
2133 1995 } while (tp == NULL);
2134 1996
2135 1997 /*
2136 1998 * pri holds the maximum unbound thread priority or -1.
2137 1999 */
2138 2000 if (dp->disp_max_unbound_pri != pri)
2139 2001 dp->disp_max_unbound_pri = pri;
2140 2002 }
2141 2003
2142 2004 /*
2143 2005 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 2144 2006 	 * check if the CPU to which it was previously bound should have
2145 2007 * its disp_max_unbound_pri increased.
2146 2008 */
 2146 2008 	 */
2147 2009 void
2148 2010 disp_adjust_unbound_pri(kthread_t *tp)
2149 2011 {
2150 2012 disp_t *dp;
2151 2013 pri_t tpri;
2152 2014
2153 2015 ASSERT(THREAD_LOCK_HELD(tp));
2154 2016
2155 2017 /*
2156 2018 * Don't do anything if the thread is not bound, or
2157 - * currently not runnable or swapped out.
2019 + * currently not runnable.
2158 2020 */
2159 2021 if (tp->t_bound_cpu == NULL ||
2160 - tp->t_state != TS_RUN ||
2161 - tp->t_schedflag & TS_ON_SWAPQ)
2022 + tp->t_state != TS_RUN)
2162 2023 return;
2163 2024
2164 2025 tpri = DISP_PRIO(tp);
2165 2026 dp = tp->t_bound_cpu->cpu_disp;
2166 2027 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 2028 if (tpri > dp->disp_max_unbound_pri)
2168 2029 dp->disp_max_unbound_pri = tpri;
2169 2030 }
2170 2031
2171 2032 /*
2172 2033 * disp_getbest()
2173 2034 * De-queue the highest priority unbound runnable thread.
2174 2035 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175 2036 * Returns NULL if nothing found.
2176 2037 * Returns T_DONTSTEAL if the thread was not stealable.
2177 2038 * so that the caller will try again later.
2178 2039 *
 2179 2040 	 * Passed a pointer to a dispatch queue that is not associated
 2180 2041 	 * with this CPU.
2181 2042 */
2182 2043 static kthread_t *
2183 2044 disp_getbest(disp_t *dp)
2184 2045 {
2185 2046 kthread_t *tp;
2186 2047 dispq_t *dq;
2187 2048 pri_t pri;
2188 2049 cpu_t *cp, *tcp;
2189 2050 boolean_t allbound;
2190 2051
2191 2052 disp_lock_enter(&dp->disp_lock);
2192 2053
2193 2054 /*
2194 2055 * If there is nothing to run, or the CPU is in the middle of a
2195 2056 * context switch of the only thread, return NULL.
2196 2057 */
2197 2058 tcp = dp->disp_cpu;
2198 2059 cp = CPU;
2199 2060 pri = dp->disp_max_unbound_pri;
2200 2061 if (pri == -1 ||
2201 2062 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202 2063 tcp->cpu_disp->disp_nrunnable == 1)) {
2203 2064 disp_lock_exit_nopreempt(&dp->disp_lock);
2204 2065 return (NULL);
2205 2066 }
2206 2067
2207 2068 dq = &dp->disp_q[pri];
2208 2069
2209 2070
2210 2071 /*
2211 2072 * Assume that all threads are bound on this queue, and change it
2212 2073 * later when we find out that it is not the case.
2213 2074 */
2214 2075 allbound = B_TRUE;
2215 2076 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216 2077 hrtime_t now, nosteal, rqtime;
2217 2078
2218 2079 /*
2219 2080 * Skip over bound threads which could be here even
2220 2081 * though disp_max_unbound_pri indicated this level.
2221 2082 */
2222 2083 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223 2084 continue;
2224 2085
2225 2086 /*
2226 2087 * We've got some unbound threads on this queue, so turn
2227 2088 * the allbound flag off now.
2228 2089 */
2229 2090 allbound = B_FALSE;
2230 2091
2231 2092 /*
2232 2093 * The thread is a candidate for stealing from its run queue. We
2233 2094 * don't want to steal threads that became runnable just a
2234 2095 * moment ago. This improves CPU affinity for threads that get
2235 2096 * preempted for short periods of time and go back on the run
2236 2097 * queue.
2237 2098 *
2238 2099 * We want to let it stay on its run queue if it was only placed
2239 2100 * there recently and it was running on the same CPU before that
2240 2101 * to preserve its cache investment. For the thread to remain on
2241 2102 * its run queue, ALL of the following conditions must be
2242 2103 * satisfied:
2243 2104 *
2244 2105 * - the disp queue should not be the kernel preemption queue
2245 2106 * - delayed idle stealing should not be disabled
2246 2107 * - nosteal_nsec should be non-zero
2247 2108 * - it should run with user priority
2248 2109 * - it should be on the run queue of the CPU where it was
2249 2110 * running before being placed on the run queue
2250 2111 * - it should be the only thread on the run queue (to prevent
2251 2112 * extra scheduling latency for other threads)
2252 2113 * - it should sit on the run queue for less than per-chip
2253 2114 * nosteal interval or global nosteal interval
2254 2115 * - in case of CPUs with shared cache it should sit in a run
2255 2116 * queue of a CPU from a different chip
2256 2117 *
2257 2118 * The checks are arranged so that the ones that are faster are
2258 2119 * placed earlier.
2259 2120 */
2260 2121 if (tcp == NULL ||
2261 2122 pri >= minclsyspri ||
2262 2123 tp->t_cpu != tcp)
2263 2124 break;
2264 2125
2265 2126 /*
 2266 2127 	 * Steal immediately if, due to CMT processor architecture,
 2267 2128 	 * migration between cp and tcp would incur no performance
2268 2129 * penalty.
2269 2130 */
2270 2131 if (pg_cmt_can_migrate(cp, tcp))
2271 2132 break;
2272 2133
2273 2134 nosteal = nosteal_nsec;
2274 2135 if (nosteal == 0)
2275 2136 break;
2276 2137
2277 2138 /*
2278 2139 * Calculate time spent sitting on run queue
2279 2140 */
2280 2141 now = gethrtime_unscaled();
2281 2142 rqtime = now - tp->t_waitrq;
2282 2143 scalehrtime(&rqtime);
2283 2144
2284 2145 /*
2285 2146 * Steal immediately if the time spent on this run queue is more
2286 2147 * than allowed nosteal delay.
2287 2148 *
2288 2149 * Negative rqtime check is needed here to avoid infinite
2289 2150 * stealing delays caused by unlikely but not impossible
2290 2151 * drifts between CPU times on different CPUs.
2291 2152 */
2292 2153 if (rqtime > nosteal || rqtime < 0)
2293 2154 break;
2294 2155
2295 2156 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296 2157 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297 2158 scalehrtime(&now);
2298 2159 /*
2299 2160 * Calculate when this thread becomes stealable
2300 2161 */
2301 2162 now += (nosteal - rqtime);
2302 2163
2303 2164 /*
2304 2165 * Calculate time when some thread becomes stealable
2305 2166 */
2306 2167 if (now < dp->disp_steal)
2307 2168 dp->disp_steal = now;
2308 2169 }
2309 2170
2310 2171 /*
2311 2172 * If there were no unbound threads on this queue, find the queue
2312 2173 * where they are and then return later. The value of
2313 2174 * disp_max_unbound_pri is not always accurate because it isn't
2314 2175 * reduced until another idle CPU looks for work.
2315 2176 */
2316 2177 if (allbound)
2317 2178 disp_fix_unbound_pri(dp, pri);
2318 2179
2319 2180 /*
2320 2181 * If we reached the end of the queue and found no unbound threads
2321 2182 * then return NULL so that other CPUs will be considered. If there
2322 2183 * are unbound threads but they cannot yet be stolen, then
2323 2184 * return T_DONTSTEAL and try again later.
2324 2185 */
2325 2186 if (tp == NULL) {
2326 2187 disp_lock_exit_nopreempt(&dp->disp_lock);
2327 2188 return (allbound ? NULL : T_DONTSTEAL);
2328 2189 }
2329 2190
2330 2191 /*
2331 2192 * Found a runnable, unbound thread, so remove it from queue.
2332 2193 * dispdeq() requires that we have the thread locked, and we do,
2333 2194 * by virtue of holding the dispatch queue lock. dispdeq() will
2334 2195 * put the thread in transition state, thereby dropping the dispq
2335 2196 * lock.
2336 2197 */
2337 2198
2338 2199 #ifdef DEBUG
2339 2200 {
2340 2201 int thread_was_on_queue;
2341 2202
2342 2203 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2343 2204 ASSERT(thread_was_on_queue);
2344 2205 }
2345 2206
2346 2207 #else /* DEBUG */
2347 2208 (void) dispdeq(tp); /* drops disp_lock */
2348 2209 #endif /* DEBUG */
2349 2210
2350 2211 /*
 2351 2212 	 * Reset the disp_queue steal time - we do not know what the smallest
2352 2213 * value across the queue is.
2353 2214 */
2354 2215 dp->disp_steal = 0;
2355 -
2356 - tp->t_schedflag |= TS_DONT_SWAP;
2357 2216
2358 2217 /*
2359 2218 * Setup thread to run on the current CPU.
2360 2219 */
2361 2220 tp->t_disp_queue = cp->cpu_disp;
2362 2221
2363 2222 cp->cpu_dispthread = tp; /* protected by spl only */
2364 2223 cp->cpu_dispatch_pri = pri;
2365 2224
2366 2225 /*
2367 2226 * There can be a memory synchronization race between disp_getbest()
2368 2227 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369 2228 * to preempt the current thread to run the enqueued thread while
2370 2229 * disp_getbest() and disp_ratify() are changing the current thread
2371 2230 * to the stolen thread. This may lead to a situation where
2372 2231 * cpu_resched() tries to preempt the wrong thread and the
2373 2232 * stolen thread continues to run on the CPU which has been tagged
2374 2233 * for preemption.
2375 2234 * Later the clock thread gets enqueued but doesn't get to run on the
 2376 2235 	 * CPU, causing the system to hang.
2377 2236 *
2378 2237 * To avoid this, grabbing and dropping the disp_lock (which does
2379 2238 * a memory barrier) is needed to synchronize the execution of
2380 2239 * cpu_resched() with disp_getbest() and disp_ratify() and
2381 2240 * synchronize the memory read and written by cpu_resched(),
2382 2241 * disp_getbest(), and disp_ratify() with each other.
2383 2242 * (see CR#6482861 for more details).
2384 2243 */
2385 2244 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386 2245 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 2246
2388 2247 ASSERT(pri == DISP_PRIO(tp));
2389 2248
2390 2249 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 2250
2392 2251 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2393 2252
2394 2253 /*
2395 2254 * Return with spl high so that swtch() won't need to raise it.
2396 2255 * The disp_lock was dropped by dispdeq().
2397 2256 */
2398 2257
2399 2258 return (tp);
2400 2259 }
2401 2260
2402 2261 /*
2403 2262 * disp_bound_common() - common routine for higher level functions
2404 2263 * that check for bound threads under certain conditions.
2405 2264 * If 'threadlistsafe' is set then there is no need to acquire
2406 2265 * pidlock to stop the thread list from changing (eg, if
2407 2266 * disp_bound_* is called with cpus paused).
2408 2267 */
2409 2268 static int
2410 2269 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 2270 {
2412 2271 int found = 0;
2413 2272 kthread_t *tp;
2414 2273
2415 2274 ASSERT(flag);
2416 2275
2417 2276 if (!threadlistsafe)
2418 2277 mutex_enter(&pidlock);
2419 2278 tp = curthread; /* faster than allthreads */
2420 2279 do {
2421 2280 if (tp->t_state != TS_FREE) {
2422 2281 /*
2423 2282 * If an interrupt thread is busy, but the
2424 2283 * caller doesn't care (i.e. BOUND_INTR is off),
2425 2284 * then just ignore it and continue through.
2426 2285 */
2427 2286 if ((tp->t_flag & T_INTR_THREAD) &&
2428 2287 !(flag & BOUND_INTR))
2429 2288 continue;
2430 2289
2431 2290 /*
2432 2291 * Skip the idle thread for the CPU
2433 2292 * we're about to set offline.
2434 2293 */
2435 2294 if (tp == cp->cpu_idle_thread)
2436 2295 continue;
2437 2296
2438 2297 /*
2439 2298 * Skip the pause thread for the CPU
2440 2299 * we're about to set offline.
2441 2300 */
2442 2301 if (tp == cp->cpu_pause_thread)
2443 2302 continue;
2444 2303
2445 2304 if ((flag & BOUND_CPU) &&
2446 2305 (tp->t_bound_cpu == cp ||
2447 2306 tp->t_bind_cpu == cp->cpu_id ||
2448 2307 tp->t_weakbound_cpu == cp)) {
2449 2308 found = 1;
2450 2309 break;
2451 2310 }
2452 2311
2453 2312 if ((flag & BOUND_PARTITION) &&
2454 2313 (tp->t_cpupart == cp->cpu_part)) {
2455 2314 found = 1;
2456 2315 break;
2457 2316 }
2458 2317 }
2459 2318 } while ((tp = tp->t_next) != curthread && found == 0);
2460 2319 if (!threadlistsafe)
2461 2320 mutex_exit(&pidlock);
2462 2321 return (found);
2463 2322 }
2464 2323
2465 2324 /*
2466 2325 * disp_bound_threads - return nonzero if threads are bound to the processor.
2467 2326 * Called infrequently. Keep this simple.
2468 2327 * Includes threads that are asleep or stopped but not onproc.
2469 2328 */
2470 2329 int
2471 2330 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 2331 {
2473 2332 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 2333 }
2475 2334
2476 2335 /*
2477 2336 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478 2337 * to the given processor, including interrupt threads.
2479 2338 */
2480 2339 int
2481 2340 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 2341 {
2483 2342 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 2343 }
2485 2344
2486 2345 /*
2487 2346 * disp_bound_partition - return nonzero if threads are bound to the same
2488 2347 * partition as the processor.
2489 2348 * Called infrequently. Keep this simple.
2490 2349 * Includes threads that are asleep or stopped but not onproc.
2491 2350 */
2492 2351 int
2493 2352 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 2353 {
2495 2354 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 2355 }
2497 2356
2498 2357 /*
2499 2358 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500 2359 * threads to other CPUs.
2501 2360 */
2502 2361 void
2503 2362 disp_cpu_inactive(cpu_t *cp)
2504 2363 {
2505 2364 kthread_t *tp;
2506 2365 disp_t *dp = cp->cpu_disp;
2507 2366 dispq_t *dq;
2508 2367 pri_t pri;
2509 2368 int wasonq;
2510 2369
2511 2370 disp_lock_enter(&dp->disp_lock);
2512 2371 while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 2372 dq = &dp->disp_q[pri];
2514 2373 tp = dq->dq_first;
2515 2374
2516 2375 /*
2517 2376 * Skip over bound threads.
2518 2377 */
2519 2378 while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 2379 tp = tp->t_link;
2521 2380 }
2522 2381
2523 2382 if (tp == NULL) {
2524 2383 /* disp_max_unbound_pri must be inaccurate, so fix it */
2525 2384 disp_fix_unbound_pri(dp, pri);
2526 2385 continue;
2527 2386 }
2528 2387
2529 2388 wasonq = dispdeq(tp); /* drops disp_lock */
2530 2389 ASSERT(wasonq);
2531 2390 ASSERT(tp->t_weakbound_cpu == NULL);
2532 2391
2533 2392 setbackdq(tp);
2534 2393 /*
2535 2394 * Called from cpu_offline:
2536 2395 *
2537 2396 * cp has already been removed from the list of active cpus
2538 2397 * and tp->t_cpu has been changed so there is no risk of
2539 2398 * tp ending up back on cp.
2540 2399 *
2541 2400 * Called from cpupart_move_cpu:
2542 2401 *
2543 2402 * The cpu has moved to a new cpupart. Any threads that
 2544 2403 	 * were on its dispatch queues before the move remain
2545 2404 * in the old partition and can't run in the new partition.
2546 2405 */
2547 2406 ASSERT(tp->t_cpu != cp);
2548 2407 thread_unlock(tp);
2549 2408
2550 2409 disp_lock_enter(&dp->disp_lock);
2551 2410 }
2552 2411 disp_lock_exit(&dp->disp_lock);
2553 2412 }
2554 2413
2555 2414 /*
2556 2415 * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557 2416 * The hint passed in is used as a starting point so we don't favor
2558 2417 * CPU 0 or any other CPU. The caller should pass in the most recently
2559 2418 * used CPU for the thread.
2560 2419 *
2561 2420 * The lgroup and priority are used to determine the best CPU to run on
2562 2421 * in a NUMA machine. The lgroup specifies which CPUs are closest while
2563 2422 * the thread priority will indicate whether the thread will actually run
2564 2423 * there. To pick the best CPU, the CPUs inside and outside of the given
2565 2424 * lgroup which are running the lowest priority threads are found. The
2566 2425 * remote CPU is chosen only if the thread will not run locally on a CPU
2567 2426 * within the lgroup, but will run on the remote CPU. If the thread
2568 2427 * cannot immediately run on any CPU, the best local CPU will be chosen.
2569 2428 *
2570 2429 * The lpl specified also identifies the cpu partition from which
2571 2430 * disp_lowpri_cpu should select a CPU.
2572 2431 *
2573 2432 * curcpu is used to indicate that disp_lowpri_cpu is being called on
2574 2433 * behalf of the current thread. (curthread is looking for a new cpu)
2575 2434 * In this case, cpu_dispatch_pri for this thread's cpu should be
2576 2435 * ignored.
2577 2436 *
2578 2437 * If a cpu is the target of an offline request then try to avoid it.
2579 2438 *
2580 2439 * This function must be called at either high SPL, or with preemption
2581 2440 * disabled, so that the "hint" CPU cannot be removed from the online
2582 2441 * CPU list while we are traversing it.
2583 2442 */
2584 2443 cpu_t *
2585 2444 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2586 2445 {
2587 2446 cpu_t *bestcpu;
2588 2447 cpu_t *besthomecpu;
2589 2448 cpu_t *cp, *cpstart;
2590 2449
2591 2450 pri_t bestpri;
2592 2451 pri_t cpupri;
2593 2452
2594 2453 klgrpset_t done;
2595 2454 klgrpset_t cur_set;
2596 2455
2597 2456 lpl_t *lpl_iter, *lpl_leaf;
2598 2457 int i;
2599 2458
2600 2459 /*
2601 2460 * Scan for a CPU currently running the lowest priority thread.
2602 2461 * Cannot get cpu_lock here because it is adaptive.
2603 2462 * We do not require lock on CPU list.
2604 2463 */
2605 2464 ASSERT(hint != NULL);
2606 2465 ASSERT(lpl != NULL);
2607 2466 ASSERT(lpl->lpl_ncpu > 0);
2608 2467
2609 2468 /*
2610 2469 * First examine local CPUs. Note that it's possible the hint CPU
 2611 2470 	 * passed in is remote to the specified home lgroup. If our priority
 2612 2471 	 * isn't high enough for us to run immediately at home,
2613 2472 * then examine CPUs remote to our home lgroup.
2614 2473 * We would like to give preference to CPUs closest to "home".
2615 2474 * If we can't find a CPU where we'll run at a given level
2616 2475 * of locality, we expand our search to include the next level.
2617 2476 */
2618 2477 bestcpu = besthomecpu = NULL;
2619 2478 klgrpset_clear(done);
2620 2479 /* start with lpl we were passed */
2621 2480
2622 2481 lpl_iter = lpl;
2623 2482
2624 2483 do {
2625 2484
2626 2485 bestpri = SHRT_MAX;
2627 2486 klgrpset_clear(cur_set);
2628 2487
2629 2488 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 2489 lpl_leaf = lpl_iter->lpl_rset[i];
2631 2490 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 2491 continue;
2633 2492
2634 2493 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 2494
2636 2495 if (hint->cpu_lpl == lpl_leaf)
2637 2496 cp = cpstart = hint;
2638 2497 else
2639 2498 cp = cpstart = lpl_leaf->lpl_cpus;
2640 2499
2641 2500 do {
2642 2501 if (cp == curcpu)
2643 2502 cpupri = -1;
2644 2503 else if (cp == cpu_inmotion)
2645 2504 cpupri = SHRT_MAX;
2646 2505 else
2647 2506 cpupri = cp->cpu_dispatch_pri;
2648 2507 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649 2508 cpupri = cp->cpu_disp->disp_maxrunpri;
2650 2509 if (cp->cpu_chosen_level > cpupri)
2651 2510 cpupri = cp->cpu_chosen_level;
2652 2511 if (cpupri < bestpri) {
2653 2512 if (CPU_IDLING(cpupri)) {
2654 2513 ASSERT((cp->cpu_flags &
2655 2514 CPU_QUIESCED) == 0);
2656 2515 return (cp);
2657 2516 }
2658 2517 bestcpu = cp;
2659 2518 bestpri = cpupri;
2660 2519 }
2661 2520 } while ((cp = cp->cpu_next_lpl) != cpstart);
2662 2521 }
2663 2522
2664 2523 if (bestcpu && (tpri > bestpri)) {
2665 2524 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666 2525 return (bestcpu);
2667 2526 }
2668 2527 if (besthomecpu == NULL)
2669 2528 besthomecpu = bestcpu;
2670 2529 /*
2671 2530 * Add the lgrps we just considered to the "done" set
2672 2531 */
2673 2532 klgrpset_or(done, cur_set);
2674 2533
2675 2534 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 2535
2677 2536 /*
2678 2537 * The specified priority isn't high enough to run immediately
2679 2538 * anywhere, so just return the best CPU from the home lgroup.
2680 2539 */
2681 2540 ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682 2541 return (besthomecpu);
2683 2542 }
2684 2543
2685 2544 /*
2686 2545 * This routine provides the generic idle cpu function for all processors.
2687 2546 * If a processor has some specific code to execute when idle (say, to stop
2688 2547 * the pipeline and save power) then that routine should be defined in the
2689 2548 * processors specific code (module_xx.c) and the global variable idle_cpu
 2690 2549 	 * processor's specific code (module_xx.c) and the global variable idle_cpu
2691 2550 */
2692 2551 static void
2693 2552 generic_idle_cpu(void)
2694 2553 {
2695 2554 }
2696 2555
2697 2556 /*ARGSUSED*/
2698 2557 static void
2699 2558 generic_enq_thread(cpu_t *cpu, int bound)
2700 2559 {
2701 2560 }