5285 pass in cpu_pause_func via pause_cpus
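The two modified lines in this file update the pause_cpus() call sites for the interface change named in the synopsis: pause_cpus() now takes a pause handler as a second argument, and NULL selects the previous behavior. A minimal sketch of the assumed prototype follows; pause_cpus() itself lives in usr/src/uts/common/os/cpu.c, not in this file, and the exact type of the added parameter is an assumption here since this webrev shows only the call sites.

	/*
	 * Assumed shape of the updated interface (sketch only, not part of
	 * this diff).  The added second argument names the pause handler to
	 * use, per the synopsis; every caller in cpu_pm.c passes NULL, which
	 * is assumed to preserve the default pause behavior.
	 */
	extern void	pause_cpus(cpu_t *off_cp, void *(*func)(void *));

Both hunks below follow the same pattern inside cpupm_set_policy(): call pause_cpus(NULL, NULL) while holding cpu_lock, update cpupm_policy, then start_cpus().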
--- old/usr/src/uts/common/os/cpu_pm.c
+++ new/usr/src/uts/common/os/cpu_pm.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 #include <sys/cpu_pm.h>
27 27 #include <sys/cmn_err.h>
28 28 #include <sys/time.h>
29 29 #include <sys/sdt.h>
30 30
31 31 /*
32 32 * Solaris Event Based CPU Power Manager
33 33 *
34 34 * This file implements platform independent event based CPU power management.
35 35 * When CPUs are configured into the system, the CMT scheduling subsystem will
36 36 * query the platform to determine if the CPU belongs to any power management
37 37 * domains. That is, sets of CPUs that share power management states.
38 38 *
39 39 * Active Power Management domains represent a group of CPUs across which the
40 40 * Operating System can request speed changes (which may in turn result
41 41 * in voltage changes). This allows the operating system to trade off
42 42 * performance for power savings.
43 43 *
44 44 * Idle Power Management domains can enter power savings states when they are
45 45 * unutilized. These states allow the Operating System to trade off power
46 46 * for performance (in the form of latency to transition from the idle state
47 47 * to an active one).
48 48 *
49 49 * For each active and idle power domain the CMT subsystem instantiates, a
50 50 * cpupm_domain_t structure is created. As the dispatcher schedules threads
51 51 * to run on the system's CPUs, it will also track the utilization of the
52 52 * enumerated power domains. Significant changes in utilization will result
53 53 * in the dispatcher sending the power manager events that relate to the
54 54 * utilization of the power domain. The power manager receives the events,
55 55 * and in the context of the policy objectives in force, may decide to request
56 56 * the domain's power/performance state be changed.
57 57 *
58 58 * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
59 59 * manager will request the CPUs in the domain run at their fastest (and most
60 60 * power consuming) state. When the domain becomes idle (utilization at zero),
61 61 * the power manager will request that the CPUs run at a speed that saves the
62 62 * most power.
63 63 *
64 64 * The advantage of this scheme is that the CPU power manager working with the
65 65 * dispatcher can be extremely responsive to changes in utilization, optimizing
66 66 * for performance in the presence of utilization, and power savings in the
67 67 * presence of idleness. Such close collaboration with the dispatcher has other
68 68 * benefits that will play out in the form of more sophisticated power /
69 69 * performance policy in the near future.
70 70 *
71 71 * Avoiding state thrashing in the presence of transient periods of utilization
72 72 * and idleness while still being responsive to non-transient periods is key.
73 73 * The power manager implements a "governor" that is used to throttle
74 74 * state transitions when a significant amount of transient idle or transient
75 75 * work is detected.
76 76 *
77 77 * Kernel background activity (e.g. taskq threads) is by far the most common
78 78 * form of transient utilization. Ungoverned in the face of this utilization,
79 79 * hundreds of state transitions per second would result on an idle system.
80 80 *
81 81 * Transient idleness is common when a thread briefly yields the CPU to
82 82 * wait for an event elsewhere in the system. Where the idle period is short
83 83 * enough, the overhead associated with making the state transition doesn't
84 84 * justify the power savings.
85 85 *
86 86 * The following is the state machine for the governor implemented by
87 87 * cpupm_utilization_event():
88 88 *
89 89 * ----->---tw---->-----
90 90 * / \
91 91 * (I)-<-ti-<- -<-ntw-<(W)
92 92 * | \ / |
93 93 * \ \ / /
94 94 * >-nti/rm->(D)--->-tw->-
95 95 * Key:
96 96 *
97 97 * States
98 98 * - (D): Default (ungoverned)
99 99 * - (W): Transient work governed
100 100 * - (I): Transient idle governed
101 101 * State Transitions
102 102 * - tw: transient work
103 103 * - ti: transient idleness
104 104 * - ntw: non-transient work
105 105 * - nti: non-transient idleness
106 106 * - rm: thread remain event
107 107 */
108 108
109 109 static cpupm_domain_t *cpupm_domains = NULL;
110 110
111 111 /*
112 112 * Uninitialized state of CPU power management is disabled
113 113 */
114 114 cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
115 115
116 116 /*
117 117 * Periods of utilization lasting less than this time interval are characterized
118 118 * as transient. State changes associated with transient work are considered
119 119 * to be mispredicted. That is, it's not worth raising and lowering power
120 120 * states where the utilization lasts for less than this interval.
121 121 */
122 122 hrtime_t cpupm_tw_predict_interval;
123 123
124 124 /*
125 125 * Periods of idleness lasting less than this time interval are characterized
126 126 * as transient. State changes associated with transient idle are considered
127 127 * to be mispredicted. That is, it's not worth lowering and raising power
128 128 * states where the idleness lasts for less than this interval.
129 129 */
130 130 hrtime_t cpupm_ti_predict_interval;
131 131
132 132 /*
133 133 * Number of mispredictions after which future transitions will be governed.
134 134 */
135 135 int cpupm_mispredict_thresh = 4;
136 136
137 137 /*
138 138 * Likewise, the number of mispredicted governed transitions after which the
139 139 * governor will be removed.
140 140 */
141 141 int cpupm_mispredict_gov_thresh = 4;
142 142
143 143 /*
144 144 * The transient work and transient idle prediction intervals are specified
145 145 * here. Tuning them higher will result in the transient work, and transient
146 146 * idle governors being used more aggressively, which limits the frequency of
147 147 * state transitions at the expense of performance and power savings,
148 148 * respectively. The intervals are specified in nanoseconds.
149 149 */
150 150 /*
151 151 * 400 usec
152 152 */
153 153 #define CPUPM_DEFAULT_TI_INTERVAL 400000
154 154 /*
155 155 * 400 usec
156 156 */
157 157 #define CPUPM_DEFAULT_TW_INTERVAL 400000
158 158
159 159 hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
160 160 hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;
161 161
162 162
163 163 static void cpupm_governor_initialize(void);
164 164 static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
165 165
166 166 cpupm_policy_t
167 167 cpupm_get_policy(void)
168 168 {
169 169 return (cpupm_policy);
170 170 }
171 171
172 172 int
173 173 cpupm_set_policy(cpupm_policy_t new_policy)
174 174 {
175 175 static int gov_init = 0;
176 176 int result = 0;
177 177
178 178 mutex_enter(&cpu_lock);
179 179 if (new_policy == cpupm_policy) {
180 180 mutex_exit(&cpu_lock);
181 181 return (result);
182 182 }
183 183
184 184 /*
185 185 * Pausing CPUs causes a high priority thread to be scheduled
186 186 * on all other CPUs (besides the current one). This locks out
187 187 * other CPUs from making CPUPM state transitions.
188 188 */
189 189 switch (new_policy) {
190 190 case CPUPM_POLICY_DISABLED:
191 - pause_cpus(NULL);
191 + pause_cpus(NULL, NULL);
192 192 cpupm_policy = CPUPM_POLICY_DISABLED;
193 193 start_cpus();
194 194
195 195 result = cmt_pad_disable(PGHW_POW_ACTIVE);
196 196
197 197 /*
198 198 * Once PAD has been enabled, it should always be possible
199 199 * to disable it.
200 200 */
201 201 ASSERT(result == 0);
202 202
203 203 /*
204 204 * Bring all the active power domains to the maximum
205 205 * performance state.
206 206 */
207 207 cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
208 208 CPUPM_STATE_MAX_PERF);
209 209
210 210 break;
211 211 case CPUPM_POLICY_ELASTIC:
212 212
213 213 result = cmt_pad_enable(PGHW_POW_ACTIVE);
214 214 if (result < 0) {
215 215 /*
216 216 * Failed to enable PAD across the active power
217 217 * domains, which may well be because none were
218 218 * enumerated.
219 219 */
220 220 break;
221 221 }
222 222
223 223 /*
224 224 * Initialize the governor parameters the first time through.
225 225 */
226 226 if (gov_init == 0) {
227 227 cpupm_governor_initialize();
228 228 gov_init = 1;
229 229 }
230 230
231 - pause_cpus(NULL);
231 + pause_cpus(NULL, NULL);
232 232 cpupm_policy = CPUPM_POLICY_ELASTIC;
233 233 start_cpus();
234 234
235 235 break;
236 236 default:
237 237 cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
238 238 new_policy);
239 239 ASSERT(0);
240 240 break;
241 241 }
242 242 mutex_exit(&cpu_lock);
243 243
244 244 return (result);
245 245 }
246 246
247 247 /*
248 248 * Look for an existing power domain
249 249 */
250 250 static cpupm_domain_t *
251 251 cpupm_domain_find(id_t id, cpupm_dtype_t type)
252 252 {
253 253 ASSERT(MUTEX_HELD(&cpu_lock));
254 254
255 255 cpupm_domain_t *dom;
256 256
257 257 dom = cpupm_domains;
258 258 while (dom != NULL) {
259 259 if (id == dom->cpd_id && type == dom->cpd_type)
260 260 return (dom);
261 261 dom = dom->cpd_next;
262 262 }
263 263 return (NULL);
264 264 }
265 265
266 266 /*
267 267 * Create a new domain
268 268 */
269 269 static cpupm_domain_t *
270 270 cpupm_domain_create(id_t id, cpupm_dtype_t type)
271 271 {
272 272 cpupm_domain_t *dom;
273 273
274 274 ASSERT(MUTEX_HELD(&cpu_lock));
275 275
276 276 dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
277 277 dom->cpd_id = id;
278 278 dom->cpd_type = type;
279 279
280 280 /* Link into the known domain list */
281 281 dom->cpd_next = cpupm_domains;
282 282 cpupm_domains = dom;
283 283
284 284 return (dom);
285 285 }
286 286
287 287 static void
288 288 cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
289 289 {
290 290 /*
291 291 * In the event we're enumerating because the domain's state
292 292 * configuration has changed, toss any existing states.
293 293 */
294 294 if (dom->cpd_nstates > 0) {
295 295 kmem_free(dom->cpd_states,
296 296 sizeof (cpupm_state_t) * dom->cpd_nstates);
297 297 dom->cpd_nstates = 0;
298 298 }
299 299
300 300 /*
301 301 * Query to determine the number of states, allocate storage
302 302 * large enough to hold the state information, and pass it back
303 303 * to the platform driver to complete the enumeration.
304 304 */
305 305 dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
306 306
307 307 if (dom->cpd_nstates == 0)
308 308 return;
309 309
310 310 dom->cpd_states =
311 311 kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
312 312 (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
313 313 }
314 314
315 315 /*
316 316 * Initialize the specified type of power domain on behalf of the CPU
317 317 */
318 318 cpupm_domain_t *
319 319 cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
320 320 {
321 321 cpupm_domain_t *dom;
322 322 id_t did;
323 323
324 324 ASSERT(MUTEX_HELD(&cpu_lock));
325 325
326 326 /*
327 327 * Instantiate the domain if it doesn't already exist
328 328 * and enumerate its power states.
329 329 */
330 330 did = cpupm_domain_id(cp, type);
331 331 dom = cpupm_domain_find(did, type);
332 332 if (dom == NULL) {
333 333 dom = cpupm_domain_create(did, type);
334 334 cpupm_domain_state_enum(cp, dom);
335 335 }
336 336
337 337 /*
338 338 * Named state initialization
339 339 */
340 340 if (type == CPUPM_DTYPE_ACTIVE) {
341 341 /*
342 342 * For active power domains, the highest performance
343 343 * state is defined as the first state returned from
344 344 * the domain enumeration.
345 345 */
346 346 dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
347 347 &dom->cpd_states[0];
348 348 dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
349 349 &dom->cpd_states[dom->cpd_nstates - 1];
350 350
351 351 /*
352 352 * Begin by assuming CPU is running at the max perf state.
353 353 */
354 354 dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
355 355 }
356 356
357 357 return (dom);
358 358 }
359 359
360 360 /*
361 361 * Return the id associated with the given type of domain
362 362 * to which cp belongs
363 363 */
364 364 id_t
365 365 cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
366 366 {
367 367 return (cpupm_plat_domain_id(cp, type));
368 368 }
369 369
370 370 /*
371 371 * Initiate a state change for the specified domain on behalf of cp
372 372 */
373 373 int
374 374 cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
375 375 {
376 376 if (cpupm_plat_change_state(cp, state) < 0)
377 377 return (-1);
378 378
379 379 DTRACE_PROBE2(cpupm__change__state,
380 380 cpupm_domain_t *, dom,
381 381 cpupm_state_t *, state);
382 382
383 383 dom->cpd_state = state;
384 384 return (0);
385 385 }
386 386
387 387 /*
388 388 * Interface into the CPU power manager to indicate a significant change
389 389 * in utilization of the specified active power domain
390 390 */
391 391 void
392 392 cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
393 393 cpupm_util_event_t event)
394 394 {
395 395 cpupm_state_t *new_state = NULL;
396 396 hrtime_t last;
397 397
398 398 if (cpupm_policy == CPUPM_POLICY_DISABLED) {
399 399 return;
400 400 }
401 401
402 402 /*
403 403 * What follows is a simple elastic power state management policy.
404 404 *
405 405 * If the utilization has become non-zero, and the domain was
406 406 * previously at its lowest power state, then transition it
407 407 * to the highest state in the spirit of "race to idle".
408 408 *
409 409 * If the utilization has dropped to zero, then transition the
410 410 * domain to its lowest power state.
411 411 *
412 412 * Statistics are maintained to implement a governor to reduce state
413 413 * transitions resulting from either transient work, or periods of
414 414 * transient idleness on the domain.
415 415 */
416 416 switch (event) {
417 417 case CPUPM_DOM_REMAIN_BUSY:
418 418
419 419 /*
420 420 * We've received an event that the domain is running a thread
421 421 * that's made it to the end of its time slice. If we are at
422 422 * low power, then raise it. If the transient work governor
423 423 * is engaged, then remove it.
424 424 */
425 425 if (dom->cpd_state ==
426 426 dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
427 427 new_state =
428 428 dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
429 429 if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
430 430 dom->cpd_governor = CPUPM_GOV_DISENGAGED;
431 431 dom->cpd_tw = 0;
432 432 }
433 433 }
434 434 break;
435 435
436 436 case CPUPM_DOM_BUSY_FROM_IDLE:
437 437 last = dom->cpd_last_lower;
438 438 dom->cpd_last_raise = now;
439 439
440 440 DTRACE_PROBE3(cpupm__raise__req,
441 441 cpupm_domain_t *, dom,
442 442 hrtime_t, last,
443 443 hrtime_t, now);
444 444
445 445 if (dom->cpd_state ==
446 446 dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
447 447
448 448 /*
449 449 * There's non-zero utilization, and the domain is
450 450 * running in the lower power state. Before we
451 451 * consider raising power, check if the preceding
452 452 * idle period was transient in duration.
453 453 *
454 454 * If the domain is already transient work governed,
455 455 * then we don't bother maintaining transient idle
456 456 * statistics, as the presence of enough transient work
457 457 * can also make the domain frequently transiently idle.
458 458 * In this case, we still want to remain transient work
459 459 * governed.
460 460 */
461 461 if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
462 462 if ((now - last) < cpupm_ti_predict_interval) {
463 463 /*
464 464 * We're raising the domain power and
465 465 * we *just* lowered it. Consider
466 466 * this a mispredicted power state
467 467 * transition due to a transient
468 468 * idle period.
469 469 */
470 470 if (++dom->cpd_ti >=
471 471 cpupm_mispredict_thresh) {
472 472 /*
473 473 * There's enough transient
474 474 * idle transitions to
475 475 * justify governing future
476 476 * lowering requests.
477 477 */
478 478 dom->cpd_governor =
479 479 CPUPM_GOV_TRANS_IDLE;
480 480 dom->cpd_ti = 0;
481 481 DTRACE_PROBE1(
482 482 cpupm__ti__governed,
483 483 cpupm_domain_t *, dom);
484 484 }
485 485 } else {
486 486 /*
487 487 * We correctly predicted the last
488 488 * lowering.
489 489 */
490 490 dom->cpd_ti = 0;
491 491 }
492 492 }
493 493 if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
494 494 /*
495 495 * Raise requests are governed due to
496 496 * transient work.
497 497 */
498 498 DTRACE_PROBE1(cpupm__raise__governed,
499 499 cpupm_domain_t *, dom);
500 500
501 501 return;
502 502 }
503 503 /*
504 504 * Prepare to transition to the higher power state
505 505 */
506 506 new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
507 507
508 508 } else if (dom->cpd_state ==
509 509 dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
510 510
511 511 /*
512 512 * Utilization is non-zero, and we're already running
513 513 * in the higher power state. Take this opportunity to
514 514 * perform some book keeping if the last lowering
515 515 * request was governed.
516 516 */
517 517 if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
518 518
519 519 if ((now - last) >= cpupm_ti_predict_interval) {
520 520 /*
521 521 * The domain is transient idle
522 522 * governed, and we mispredicted
523 523 * governing the last lowering request.
524 524 */
525 525 if (++dom->cpd_ti >=
526 526 cpupm_mispredict_gov_thresh) {
527 527 /*
528 528 * There's enough non-transient
529 529 * idle periods to justify
530 530 * removing the governor.
531 531 */
532 532 dom->cpd_governor =
533 533 CPUPM_GOV_DISENGAGED;
534 534 dom->cpd_ti = 0;
535 535 DTRACE_PROBE1(
536 536 cpupm__ti__ungoverned,
537 537 cpupm_domain_t *, dom);
538 538 }
539 539 } else {
540 540 /*
541 541 * Correctly predicted governing the
542 542 * last lowering request.
543 543 */
544 544 dom->cpd_ti = 0;
545 545 }
546 546 }
547 547 }
548 548 break;
549 549
550 550 case CPUPM_DOM_IDLE_FROM_BUSY:
551 551 last = dom->cpd_last_raise;
552 552 dom->cpd_last_lower = now;
553 553
554 554 DTRACE_PROBE3(cpupm__lower__req,
555 555 cpupm_domain_t *, dom,
556 556 hrtime_t, last,
557 557 hrtime_t, now);
558 558
559 559 if (dom->cpd_state ==
560 560 dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
561 561
562 562 /*
563 563 * The domain is idle, and is running in the highest
564 564 * performance state. Before we consider lowering power,
565 565 * perform some book keeping for the transient work
566 566 * governor.
567 567 */
568 568 if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
569 569 if ((now - last) < cpupm_tw_predict_interval) {
570 570 /*
571 571 * We're lowering the domain power and
572 572 * we *just* raised it. Consider the
573 573 * last raise mispredicted due to
574 574 * transient work.
575 575 */
576 576 if (++dom->cpd_tw >=
577 577 cpupm_mispredict_thresh) {
578 578 /*
579 579 * There's enough transient work
580 580 * transitions to justify
581 581 * governing future raise
582 582 * requests.
583 583 */
584 584 dom->cpd_governor =
585 585 CPUPM_GOV_TRANS_WORK;
586 586 dom->cpd_tw = 0;
587 587 DTRACE_PROBE1(
588 588 cpupm__tw__governed,
589 589 cpupm_domain_t *, dom);
590 590 }
591 591 } else {
592 592 /*
593 593 * We correctly predicted during the
594 594 * last raise.
595 595 */
596 596 dom->cpd_tw = 0;
597 597 }
598 598 }
599 599 if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
600 600 /*
601 601 * Lowering requests are governed due to
602 602 * transient idleness.
603 603 */
604 604 DTRACE_PROBE1(cpupm__lowering__governed,
605 605 cpupm_domain_t *, dom);
606 606
607 607 return;
608 608 }
609 609
610 610 /*
611 611 * Prepare to transition to a lower power state.
612 612 */
613 613 new_state =
614 614 dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
615 615
616 616 } else if (dom->cpd_state ==
617 617 dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
618 618
619 619 /*
620 620 * The domain is idle, and we're already running in
621 621 * the lower power state. Take this opportunity to
622 622 * perform some book keeping if the last raising
623 623 * request was governed.
624 624 */
625 625 if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
626 626 if ((now - last) >= cpupm_tw_predict_interval) {
627 627 /*
628 628 * The domain is transient work
629 629 * governed, and we mispredicted
630 630 * governing the last raising request.
631 631 */
632 632 if (++dom->cpd_tw >=
633 633 cpupm_mispredict_gov_thresh) {
634 634 /*
635 635 * There's enough non-transient
636 636 * work to justify removing
637 637 * the governor.
638 638 */
639 639 dom->cpd_governor =
640 640 CPUPM_GOV_DISENGAGED;
641 641 dom->cpd_tw = 0;
642 642 DTRACE_PROBE1(
643 643 cpupm__tw__ungoverned,
644 644 cpupm_domain_t *, dom);
645 645 }
646 646 } else {
647 647 /*
648 648 * We correctly predicted governing
649 649 * the last raise.
650 650 */
651 651 dom->cpd_tw = 0;
652 652 }
653 653 }
654 654 }
655 655 break;
656 656 }
657 657 /*
658 658 * Change the power state
659 659 * Not much currently done if this doesn't succeed
660 660 */
661 661 if (new_state)
662 662 (void) cpupm_change_state(cp, dom, new_state);
663 663 }
664 664
665 665
666 666 /*
667 667 * Interface called by platforms to dynamically change the
668 668 * MAX performance cpupm state
669 669 */
670 670 void
671 671 cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
672 672 {
673 673 cpupm_domain_t *dom;
674 674 id_t did;
675 675 cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
676 676 boolean_t change_state = B_FALSE;
677 677 cpupm_state_t *new_state = NULL;
678 678
679 679 did = cpupm_domain_id(cp, type);
680 680 if (MUTEX_HELD(&cpu_lock)) {
681 681 dom = cpupm_domain_find(did, type);
682 682 } else {
683 683 mutex_enter(&cpu_lock);
684 684 dom = cpupm_domain_find(did, type);
685 685 mutex_exit(&cpu_lock);
686 686 }
687 687
688 688 /*
689 689 * A lock could be used to avoid changing the power state of the CPU while
690 690 * CPUPM_STATE_MAX_PERF is being changed.
691 691 * Since the occurrence of events that change MAX_PERF is infrequent, it
692 692 * may not be worth the overhead of additional locking. In the worst
693 693 * case, for one cycle the power may not get changed to the required
694 694 * level.
695 695 */
696 696 if (dom != NULL) {
697 697 if (dom->cpd_state ==
698 698 dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
699 699 change_state = B_TRUE;
700 700 }
701 701
702 702 /*
703 703 * If an out of range level is passed, use the lowest supported
704 704 * speed.
705 705 */
706 706 if (max_perf_level >= dom->cpd_nstates &&
707 707 dom->cpd_nstates > 1) {
708 708 max_perf_level = dom->cpd_nstates - 1;
709 709 }
710 710
711 711 dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
712 712 &dom->cpd_states[max_perf_level];
713 713
714 714 /*
715 715 * If the current state is MAX_PERF, change the current state
716 716 * to the new MAX_PERF
717 717 */
718 718 if (change_state) {
719 719 new_state =
720 720 dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
721 721 if (new_state) {
722 722 (void) cpupm_change_state(cp, dom, new_state);
723 723 }
724 724 }
725 725 }
726 726 }
727 727
728 728 /*
729 729 * Initialize the parameters for the transience governor state machine
730 730 */
731 731 static void
732 732 cpupm_governor_initialize(void)
733 733 {
734 734 /*
735 735 * The default prediction intervals are specified in nanoseconds.
736 736 * Convert these to the equivalent in unscaled hrtime, which is the
737 737 * format of the timestamps passed to cpupm_utilization_event()
738 738 */
739 739 cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
740 740 cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
741 741 }
742 742
743 743 /*
744 744 * Initiate a state change in all CPUPM domain instances of the specified type
745 745 */
746 746 static void
747 747 cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
748 748 {
749 749 cpu_t *cp;
750 750 pg_cmt_t *pwr_pg;
751 751 cpupm_domain_t *dom;
752 752 group_t *hwset;
753 753 group_iter_t giter;
754 754 pg_cpu_itr_t cpu_iter;
755 755 pghw_type_t hw;
756 756
757 757 ASSERT(MUTEX_HELD(&cpu_lock));
758 758
759 759 switch (type) {
760 760 case CPUPM_DTYPE_ACTIVE:
761 761 hw = PGHW_POW_ACTIVE;
762 762 break;
763 763 default:
764 764 /*
765 765 * Power domain types other than "active" unsupported.
766 766 */
767 767 ASSERT(type == CPUPM_DTYPE_ACTIVE);
768 768 return;
769 769 }
770 770
771 771 if ((hwset = pghw_set_lookup(hw)) == NULL)
772 772 return;
773 773
774 774 /*
775 775 * Iterate over the power domains
776 776 */
777 777 group_iter_init(&giter);
778 778 while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
779 779
780 780 dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
781 781
782 782 /*
783 783 * Iterate over the CPUs in each domain
784 784 */
785 785 PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
786 786 while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
787 787 (void) cpupm_change_state(cp, dom,
788 788 dom->cpd_named_states[state]);
789 789 }
790 790 }
791 791 }
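For context, a hypothetical out-of-tree caller that needs work performed while the other CPUs are paused might now pass its handler directly to pause_cpus() rather than installing a global hook before pausing. This is a sketch under the assumptions stated above the diff header; the handler and function names are made up, and exactly when the handler runs is not visible in this webrev.

	/*
	 * Hypothetical caller (illustration only, not part of this change).
	 * The handler type matches the assumed pause_cpus() prototype above;
	 * it is presumed to run on behalf of the paused CPUs.
	 */
	static void *
	sample_pause_handler(void *arg)
	{
		/* work to perform while the other CPUs are held paused */
		return (NULL);
	}

	static void
	sample_quiesced_operation(void)
	{
		mutex_enter(&cpu_lock);
		pause_cpus(NULL, sample_pause_handler);

		/* ... operation that requires the other CPUs quiesced ... */

		start_cpus();
		mutex_exit(&cpu_lock);
	}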