5285 pass in cpu_pause_func via pause_cpus
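The two modified lines in this file update the pause_cpus() call sites for the interface change named in the synopsis: pause_cpus() now takes a pause handler as a second argument, and NULL selects the previous behavior. A minimal sketch of the assumed prototype follows; pause_cpus() itself lives in usr/src/uts/common/os/cpu.c, not in this file, and the exact type of the added parameter is an assumption here since this webrev shows only the call sites.

	/*
	 * Assumed shape of the updated interface (sketch only, not part of
	 * this diff).  The added second argument names the pause handler to
	 * use, per the synopsis; every caller in cpu_pm.c passes NULL, which
	 * is assumed to preserve the default pause behavior.
	 */
	extern void	pause_cpus(cpu_t *off_cp, void *(*func)(void *));

Both hunks below follow the same pattern inside cpupm_set_policy(): call pause_cpus(NULL, NULL) while holding cpu_lock, update cpupm_policy, then start_cpus().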
--- old/usr/src/uts/common/os/cpu_pm.c
+++ new/usr/src/uts/common/os/cpu_pm.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 #include <sys/cpu_pm.h>
27 27 #include <sys/cmn_err.h>
28 28 #include <sys/time.h>
29 29 #include <sys/sdt.h>
30 30
31 31 /*
32 32 * Solaris Event Based CPU Power Manager
33 33 *
34 34 * This file implements platform independent event based CPU power management.
35 35 * When CPUs are configured into the system, the CMT scheduling subsystem will
36 36 * query the platform to determine if the CPU belongs to any power management
37 37 * domains. That is, sets of CPUs that share power management states.
38 38 *
39 39 * Active Power Management domains represent a group of CPUs across which the
40 40 * Operating System can request speed changes (which may in turn result
41 41 * in voltage changes). This allows the operating system to trade off
42 42 * performance for power savings.
43 43 *
44 44 * Idle Power Management domains can enter power savings states when they are
45 45 * unutilized. These states allow the Operating System to trade off power
46 46 * for performance (in the form of latency to transition from the idle state
47 47 * to an active one).
48 48 *
49 49 * For each active and idle power domain the CMT subsystem instantiates, a
50 50 * cpupm_domain_t structure is created. As the dispatcher schedules threads
51 51 * to run on the system's CPUs, it will also track the utilization of the
52 52 * enumerated power domains. Significant changes in utilization will result
53 53 * in the dispatcher sending the power manager events that relate to the
54 54 * utilization of the power domain. The power manager receives the events,
55 55 * and in the context of the policy objectives in force, may decide to request
56 56 * the domain's power/performance state be changed.
57 57 *
58 58 * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
59 59 * manager will request the CPUs in the domain run at their fastest (and most
60 60 * power consuming) state. When the domain becomes idle (utilization at zero),
61 61 * the power manager will request that the CPUs run at a speed that saves the
62 62 * most power.
63 63 *
64 64 * The advantage of this scheme is that the CPU power manager working with the
65 65 * dispatcher can be extremely responsive to changes in utilization, optimizing
66 66 * for performance in the presence of utilization, and power savings in the
67 67 * presence of idleness. Such close collaboration with the dispatcher has other
68 68 * benefits that will play out in the form of more sophisticated power /
69 69 * performance policy in the near future.
70 70 *
71 71 * Avoiding state thrashing in the presence of transient periods of utilization
72 72 * and idleness while still being responsive to non-transient periods is key.
73 73 * The power manager implements a "governor" that is used to throttle
74 74 * state transitions when a significant amount of transient idle or transient
75 75 * work is detected.
76 76 *
77 77 * Kernel background activity (e.g. taskq threads) is by far the most common
78 78 * form of transient utilization. Ungoverned in the face of this utilization,
79 79 * hundreds of state transitions per second would result on an idle system.
80 80 *
81 81 * Transient idleness is common when a thread briefly yields the CPU to
82 82 * wait for an event elsewhere in the system. Where the idle period is short
83 83 * enough, the overhead associated with making the state transition doesn't
84 84 * justify the power savings.
85 85 *
86 86 * The following is the state machine for the governor implemented by
87 87 * cpupm_utilization_event():
88 88 *
89 89 * ----->---tw---->-----
90 90 * / \
91 91 * (I)-<-ti-<- -<-ntw-<(W)
92 92 * | \ / |
93 93 * \ \ / /
94 94 * >-nti/rm->(D)--->-tw->-
95 95 * Key:
96 96 *
97 97 * States
98 98 * - (D): Default (ungoverned)
99 99 * - (W): Transient work governed
100 100 * - (I): Transient idle governed
101 101 * State Transitions
102 102 * - tw: transient work
103 103 * - ti: transient idleness
104 104 * - ntw: non-transient work
105 105 * - nti: non-transient idleness
106 106 * - rm: thread remain event
107 107 */
108 108
109 109 static cpupm_domain_t *cpupm_domains = NULL;
110 110
111 111 /*
112 112 * Uninitialized state of CPU power management is disabled
113 113 */
114 114 cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
115 115
116 116 /*
117 117 * Periods of utilization lasting less than this time interval are characterized
118 118 * as transient. State changes associated with transient work are considered
119 119 * to be mispredicted. That is, it's not worth raising and lowering power
120 120 * states where the utilization lasts for less than this interval.
121 121 */
122 122 hrtime_t cpupm_tw_predict_interval;
123 123
124 124 /*
125 125 * Periods of idleness lasting less than this time interval are characterized
126 126 * as transient. State changes associated with transient idle are considered
127 127 * to be mispredicted. That is, it's not worth lowering and raising power
128 128 * states where the idleness lasts for less than this interval.
129 129 */
130 130 hrtime_t cpupm_ti_predict_interval;
131 131
132 132 /*
133 133 * Number of mispredictions after which future transitions will be governed.
134 134 */
135 135 int cpupm_mispredict_thresh = 4;
136 136
137 137 /*
138 138 * Likewise, the number of mispredicted governed transitions after which the
139 139 * governor will be removed.
140 140 */
141 141 int cpupm_mispredict_gov_thresh = 4;
142 142
143 143 /*
144 144 * The transient work and transient idle prediction intervals are specified
145 145 * here. Tuning them higher will result in the transient work, and transient
146 146 * idle governors being used more aggressively, which limits the frequency of
147 147 * state transitions at the expense of performance and power savings,
148 148 * respectively. The intervals are specified in nanoseconds.
149 149 */
150 150 /*
151 151 * 400 usec
152 152 */
153 153 #define CPUPM_DEFAULT_TI_INTERVAL 400000
154 154 /*
155 155 * 400 usec
156 156 */
157 157 #define CPUPM_DEFAULT_TW_INTERVAL 400000
158 158
159 159 hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
160 160 hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;
161 161
162 162
163 163 static void cpupm_governor_initialize(void);
164 164 static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
165 165
166 166 cpupm_policy_t
167 167 cpupm_get_policy(void)
168 168 {
169 169 return (cpupm_policy);
170 170 }
171 171
172 172 int
173 173 cpupm_set_policy(cpupm_policy_t new_policy)
174 174 {
175 175 static int gov_init = 0;
176 176 int result = 0;
177 177
178 178 mutex_enter(&cpu_lock);
179 179 if (new_policy == cpupm_policy) {
180 180 mutex_exit(&cpu_lock);
181 181 return (result);
182 182 }
183 183
184 184 /*
185 185 * Pausing CPUs causes a high priority thread to be scheduled
186 186 * on all other CPUs (besides the current one). This locks out
187 187 * other CPUs from making CPUPM state transitions.
188 188 */
189 189 switch (new_policy) {
190 190 case CPUPM_POLICY_DISABLED:
191 - pause_cpus(NULL);
191 + pause_cpus(NULL, NULL);
192 192 cpupm_policy = CPUPM_POLICY_DISABLED;
193 193 start_cpus();
194 194
195 195 result = cmt_pad_disable(PGHW_POW_ACTIVE);
196 196
197 197 /*
198 198 * Once PAD has been enabled, it should always be possible
199 199 * to disable it.
200 200 */
201 201 ASSERT(result == 0);
202 202
203 203 /*
204 204 * Bring all the active power domains to the maximum
205 205 * performance state.
206 206 */
207 207 cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
208 208 CPUPM_STATE_MAX_PERF);
209 209
210 210 break;
211 211 case CPUPM_POLICY_ELASTIC:
212 212
213 213 result = cmt_pad_enable(PGHW_POW_ACTIVE);
214 214 if (result < 0) {
215 215 /*
216 216 * Failed to enable PAD across the active power
217 217 * domains, which may well be because none were
218 218 * enumerated.
219 219 */
220 220 break;
221 221 }
222 222
223 223 /*
224 224 * Initialize the governor parameters the first time through.
225 225 */
226 226 if (gov_init == 0) {
227 227 cpupm_governor_initialize();
228 228 gov_init = 1;
229 229 }
230 230
231 - pause_cpus(NULL);
231 + pause_cpus(NULL, NULL);
232 232 cpupm_policy = CPUPM_POLICY_ELASTIC;
233 233 start_cpus();
234 234
235 235 break;
236 236 default:
237 237 cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
238 238 new_policy);
239 239 ASSERT(0);
240 240 break;
241 241 }
242 242 mutex_exit(&cpu_lock);
243 243
244 244 return (result);
245 245 }
246 246
247 247 /*
248 248 * Look for an existing power domain
249 249 */
250 250 static cpupm_domain_t *
251 251 cpupm_domain_find(id_t id, cpupm_dtype_t type)
252 252 {
253 253 ASSERT(MUTEX_HELD(&cpu_lock));
254 254
255 255 cpupm_domain_t *dom;
256 256
257 257 dom = cpupm_domains;
258 258 while (dom != NULL) {
259 259 if (id == dom->cpd_id && type == dom->cpd_type)
260 260 return (dom);
261 261 dom = dom->cpd_next;
262 262 }
263 263 return (NULL);
264 264 }
265 265
266 266 /*
267 267 * Create a new domain
268 268 */
269 269 static cpupm_domain_t *
270 270 cpupm_domain_create(id_t id, cpupm_dtype_t type)
271 271 {
272 272 cpupm_domain_t *dom;
273 273
274 274 ASSERT(MUTEX_HELD(&cpu_lock));
275 275
276 276 dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
277 277 dom->cpd_id = id;
278 278 dom->cpd_type = type;
279 279
280 280 /* Link into the known domain list */
281 281 dom->cpd_next = cpupm_domains;
282 282 cpupm_domains = dom;
283 283
284 284 return (dom);
285 285 }
286 286
287 287 static void
288 288 cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
289 289 {
290 290 /*
291 291 * In the event we're enumerating because the domain's state
292 292 * configuration has changed, toss any existing states.
293 293 */
294 294 if (dom->cpd_nstates > 0) {
295 295 kmem_free(dom->cpd_states,
296 296 sizeof (cpupm_state_t) * dom->cpd_nstates);
297 297 dom->cpd_nstates = 0;
298 298 }
299 299
300 300 /*
301 301 * Query to determine the number of states, allocate storage
302 302 * large enough to hold the state information, and pass it back
303 303 * to the platform driver to complete the enumeration.
304 304 */
305 305 dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
306 306
307 307 if (dom->cpd_nstates == 0)
308 308 return;
309 309
310 310 dom->cpd_states =
311 311 kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
312 312 (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
313 313 }
314 314
315 315 /*
316 316 * Initialize the specified type of power domain on behalf of the CPU
317 317 */
318 318 cpupm_domain_t *
319 319 cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
320 320 {
321 321 cpupm_domain_t *dom;
322 322 id_t did;
323 323
324 324 ASSERT(MUTEX_HELD(&cpu_lock));
325 325
326 326 /*
327 327 * Instantiate the domain if it doesn't already exist
328 328 * and enumerate its power states.
329 329 */
330 330 did = cpupm_domain_id(cp, type);
331 331 dom = cpupm_domain_find(did, type);
332 332 if (dom == NULL) {
333 333 dom = cpupm_domain_create(did, type);
334 334 cpupm_domain_state_enum(cp, dom);
335 335 }
336 336
337 337 /*
338 338 * Named state initialization
339 339 */
340 340 if (type == CPUPM_DTYPE_ACTIVE) {
341 341 /*
342 342 * For active power domains, the highest performance
343 343 * state is defined as the first state returned from
344 344 * the domain enumeration.
345 345 */
346 346 dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
347 347 &dom->cpd_states[0];
348 348 dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
349 349 &dom->cpd_states[dom->cpd_nstates - 1];
350 350
351 351 /*
352 352 * Begin by assuming CPU is running at the max perf state.
353 353 */
354 354 dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
355 355 }
356 356
357 357 return (dom);
358 358 }
359 359
360 360 /*
361 361 * Return the id associated with the given type of domain
362 362 * to which cp belongs
363 363 */
364 364 id_t
365 365 cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
366 366 {
367 367 return (cpupm_plat_domain_id(cp, type));
368 368 }
369 369
370 370 /*
371 371 * Initiate a state change for the specified domain on behalf of cp
372 372 */
373 373 int
374 374 cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
375 375 {
376 376 if (cpupm_plat_change_state(cp, state) < 0)
377 377 return (-1);
378 378
379 379 DTRACE_PROBE2(cpupm__change__state,
380 380 cpupm_domain_t *, dom,
381 381 cpupm_state_t *, state);
382 382
383 383 dom->cpd_state = state;
384 384 return (0);
385 385 }
386 386
387 387 /*
388 388 * Interface into the CPU power manager to indicate a significant change
389 389 * in utilization of the specified active power domain
390 390 */
391 391 void
392 392 cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
393 393 cpupm_util_event_t event)
394 394 {
395 395 cpupm_state_t *new_state = NULL;
396 396 hrtime_t last;
397 397
398 398 if (cpupm_policy == CPUPM_POLICY_DISABLED) {
399 399 return;
400 400 }
401 401
402 402 /*
403 403 * What follows is a simple elastic power state management policy.
404 404 *
405 405 * If the utilization has become non-zero, and the domain was
406 406 * previously at its lowest power state, then transition it
407 407 * to the highest state in the spirit of "race to idle".
408 408 *
409 409 * If the utilization has dropped to zero, then transition the
410 410 * domain to its lowest power state.
411 411 *
412 412 * Statistics are maintained to implement a governor to reduce state
413 413 * transitions resulting from either transient work, or periods of
414 414 * transient idleness on the domain.
415 415 */
416 416 switch (event) {
417 417 case CPUPM_DOM_REMAIN_BUSY:
418 418
419 419 /*
420 420 * We've received an event that the domain is running a thread
421 421 * that's made it to the end of its time slice. If we are at
422 422 * low power, then raise it. If the transient work governor
423 423 * is engaged, then remove it.
424 424 */
425 425 if (dom->cpd_state ==
426 426 dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
427 427 new_state =
428 428 dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
429 429 if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
430 430 dom->cpd_governor = CPUPM_GOV_DISENGAGED;
431 431 dom->cpd_tw = 0;
432 432 }
433 433 }
434 434 break;
435 435
436 436 case CPUPM_DOM_BUSY_FROM_IDLE:
437 437 last = dom->cpd_last_lower;
438 438 dom->cpd_last_raise = now;
439 439
440 440 DTRACE_PROBE3(cpupm__raise__req,
441 441 cpupm_domain_t *, dom,
442 442 hrtime_t, last,
443 443 hrtime_t, now);
444 444
445 445 if (dom->cpd_state ==
446 446 dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
447 447
448 448 /*
449 449 * There's non-zero utilization, and the domain is
450 450 * running in the lower power state. Before we
451 451 * consider raising power, check if the preceding
452 452 * idle period was transient in duration.
453 453 *
454 454 * If the domain is already transient work governed,
455 455 * then we don't bother maintaining transient idle
456 456 * statistics, as the presence of enough transient work
457 457 * can also make the domain frequently transiently idle.
458 458 * In this case, we still want to remain transient work
459 459 * governed.
460 460 */
461 461 if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
462 462 if ((now - last) < cpupm_ti_predict_interval) {
463 463 /*
464 464 * We're raising the domain power and
465 465 * we *just* lowered it. Consider
466 466 * this a mispredicted power state
467 467 * transition due to a transient
468 468 * idle period.
469 469 */
470 470 if (++dom->cpd_ti >=
471 471 cpupm_mispredict_thresh) {
472 472 /*
473 473 * There's enough transient
474 474 * idle transitions to
475 475 * justify governing future
476 476 * lowering requests.
477 477 */
478 478 dom->cpd_governor =
479 479 CPUPM_GOV_TRANS_IDLE;
480 480 dom->cpd_ti = 0;
481 481 DTRACE_PROBE1(
482 482 cpupm__ti__governed,
483 483 cpupm_domain_t *, dom);
484 484 }
485 485 } else {
486 486 /*
487 487 * We correctly predicted the last
488 488 * lowering.
489 489 */
490 490 dom->cpd_ti = 0;
491 491 }
492 492 }
493 493 if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
494 494 /*
495 495 * Raise requests are governed due to
496 496 * transient work.
497 497 */
498 498 DTRACE_PROBE1(cpupm__raise__governed,
499 499 cpupm_domain_t *, dom);
500 500
501 501 return;
502 502 }
503 503 /*
504 504 * Prepare to transition to the higher power state
505 505 */
506 506 new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
507 507
508 508 } else if (dom->cpd_state ==
509 509 dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
510 510
511 511 /*
512 512 * Utilization is non-zero, and we're already running
513 513 * in the higher power state. Take this opportunity to
514 514 * perform some book keeping if the last lowering
515 515 * request was governed.
516 516 */
517 517 if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
518 518
519 519 if ((now - last) >= cpupm_ti_predict_interval) {
520 520 /*
521 521 * The domain is transient idle
522 522 * governed, and we mispredicted
523 523 * governing the last lowering request.
524 524 */
525 525 if (++dom->cpd_ti >=
526 526 cpupm_mispredict_gov_thresh) {
527 527 /*
528 528 * There's enough non-transient
529 529 * idle periods to justify
530 530 * removing the governor.
531 531 */
532 532 dom->cpd_governor =
533 533 CPUPM_GOV_DISENGAGED;
534 534 dom->cpd_ti = 0;
535 535 DTRACE_PROBE1(
536 536 cpupm__ti__ungoverned,
537 537 cpupm_domain_t *, dom);
538 538 }
539 539 } else {
540 540 /*
541 541 * Correctly predicted governing the
542 542 * last lowering request.
543 543 */
544 544 dom->cpd_ti = 0;
545 545 }
546 546 }
547 547 }
548 548 break;
549 549
550 550 case CPUPM_DOM_IDLE_FROM_BUSY:
551 551 last = dom->cpd_last_raise;
552 552 dom->cpd_last_lower = now;
553 553
554 554 DTRACE_PROBE3(cpupm__lower__req,
555 555 cpupm_domain_t *, dom,
556 556 hrtime_t, last,
557 557 hrtime_t, now);
558 558
559 559 if (dom->cpd_state ==
560 560 dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
561 561
562 562 /*
563 563 * The domain is idle, and is running in the highest
564 564 * performance state. Before we consider lowering power,
565 565 * perform some book keeping for the transient work
566 566 * governor.
567 567 */
568 568 if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
569 569 if ((now - last) < cpupm_tw_predict_interval) {
570 570 /*
571 571 * We're lowering the domain power and
572 572 * we *just* raised it. Consider the
573 573 * last raise mispredicted due to
574 574 * transient work.
575 575 */
576 576 if (++dom->cpd_tw >=
577 577 cpupm_mispredict_thresh) {
578 578 /*
579 579 * There's enough transient work
580 580 * transitions to justify
581 581 * governing future raise
582 582 * requests.
583 583 */
584 584 dom->cpd_governor =
585 585 CPUPM_GOV_TRANS_WORK;
586 586 dom->cpd_tw = 0;
587 587 DTRACE_PROBE1(
588 588 cpupm__tw__governed,
589 589 cpupm_domain_t *, dom);
590 590 }
591 591 } else {
592 592 /*
593 593 * We correctly predicted during the
594 594 * last raise.
595 595 */
596 596 dom->cpd_tw = 0;
597 597 }
598 598 }
599 599 if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
600 600 /*
601 601 * Lowering requests are governed due to
602 602 * transient idleness.
603 603 */
604 604 DTRACE_PROBE1(cpupm__lowering__governed,
605 605 cpupm_domain_t *, dom);
606 606
607 607 return;
608 608 }
609 609
610 610 /*
611 611 * Prepare to transition to a lower power state.
612 612 */
613 613 new_state =
614 614 dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
615 615
616 616 } else if (dom->cpd_state ==
617 617 dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
618 618
619 619 /*
620 620 * The domain is idle, and we're already running in
621 621 * the lower power state. Take this opportunity to
622 622 * perform some book keeping if the last raising
623 623 * request was governed.
624 624 */
625 625 if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
626 626 if ((now - last) >= cpupm_tw_predict_interval) {
627 627 /*
628 628 * The domain is transient work
629 629 * governed, and we mispredicted
630 630 * governing the last raising request.
631 631 */
632 632 if (++dom->cpd_tw >=
633 633 cpupm_mispredict_gov_thresh) {
634 634 /*
635 635 * There's enough non-transient
636 636 * work to justify removing
637 637 * the governor.
638 638 */
639 639 dom->cpd_governor =
640 640 CPUPM_GOV_DISENGAGED;
641 641 dom->cpd_tw = 0;
642 642 DTRACE_PROBE1(
643 643 cpupm__tw__ungoverned,
644 644 cpupm_domain_t *, dom);
645 645 }
646 646 } else {
647 647 /*
648 648 * We correctly predicted governing
649 649 * the last raise.
650 650 */
651 651 dom->cpd_tw = 0;
652 652 }
653 653 }
654 654 }
655 655 break;
656 656 }
657 657 /*
658 658 * Change the power state
659 659 * Not much currently done if this doesn't succeed
660 660 */
661 661 if (new_state)
662 662 (void) cpupm_change_state(cp, dom, new_state);
663 663 }
664 664
665 665
666 666 /*
667 667 * Interface called by platforms to dynamically change the
668 668 * MAX performance cpupm state
669 669 */
670 670 void
671 671 cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
672 672 {
673 673 cpupm_domain_t *dom;
674 674 id_t did;
675 675 cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
676 676 boolean_t change_state = B_FALSE;
677 677 cpupm_state_t *new_state = NULL;
678 678
679 679 did = cpupm_domain_id(cp, type);
680 680 if (MUTEX_HELD(&cpu_lock)) {
681 681 dom = cpupm_domain_find(did, type);
682 682 } else {
683 683 mutex_enter(&cpu_lock);
684 684 dom = cpupm_domain_find(did, type);
685 685 mutex_exit(&cpu_lock);
686 686 }
687 687
688 688 /*
689 689 * A lock could be used to avoid changing the power state of the CPU while
690 690 * CPUPM_STATE_MAX_PERF is being changed.
691 691 * Since the occurrence of events that change MAX_PERF is infrequent, it
692 692 * may not be worth the overhead of additional locking. In the worst
693 693 * case, for one cycle the power may not get changed to the required
694 694 * level.
695 695 */
696 696 if (dom != NULL) {
697 697 if (dom->cpd_state ==
698 698 dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
699 699 change_state = B_TRUE;
700 700 }
701 701
702 702 /*
703 703 * If an out of range level is passed, use the lowest supported
704 704 * speed.
705 705 */
706 706 if (max_perf_level >= dom->cpd_nstates &&
707 707 dom->cpd_nstates > 1) {
708 708 max_perf_level = dom->cpd_nstates - 1;
709 709 }
710 710
711 711 dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
712 712 &dom->cpd_states[max_perf_level];
713 713
714 714 /*
715 715 * If the current state is MAX_PERF, change the current state
716 716 * to the new MAX_PERF
717 717 */
718 718 if (change_state) {
719 719 new_state =
720 720 dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
721 721 if (new_state) {
722 722 (void) cpupm_change_state(cp, dom, new_state);
723 723 }
724 724 }
725 725 }
726 726 }
727 727
728 728 /*
729 729 * Initialize the parameters for the transience governor state machine
730 730 */
731 731 static void
732 732 cpupm_governor_initialize(void)
733 733 {
734 734 /*
735 735 * The default prediction intervals are specified in nanoseconds.
736 736 * Convert these to the equivalent in unscaled hrtime, which is the
737 737 * format of the timestamps passed to cpupm_utilization_event()
738 738 */
739 739 cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
740 740 cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
741 741 }
742 742
743 743 /*
744 744 * Initiate a state change in all CPUPM domain instances of the specified type
745 745 */
746 746 static void
747 747 cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
748 748 {
749 749 cpu_t *cp;
750 750 pg_cmt_t *pwr_pg;
751 751 cpupm_domain_t *dom;
752 752 group_t *hwset;
753 753 group_iter_t giter;
754 754 pg_cpu_itr_t cpu_iter;
755 755 pghw_type_t hw;
756 756
757 757 ASSERT(MUTEX_HELD(&cpu_lock));
758 758
759 759 switch (type) {
760 760 case CPUPM_DTYPE_ACTIVE:
761 761 hw = PGHW_POW_ACTIVE;
762 762 break;
763 763 default:
764 764 /*
765 765 * Power domain types other than "active" unsupported.
766 766 */
767 767 ASSERT(type == CPUPM_DTYPE_ACTIVE);
768 768 return;
769 769 }
770 770
771 771 if ((hwset = pghw_set_lookup(hw)) == NULL)
772 772 return;
773 773
774 774 /*
775 775 * Iterate over the power domains
776 776 */
777 777 group_iter_init(&giter);
778 778 while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
779 779
780 780 dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
781 781
782 782 /*
783 783 * Iterate over the CPUs in each domain
784 784 */
785 785 PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
786 786 while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
787 787 (void) cpupm_change_state(cp, dom,
788 788 dom->cpd_named_states[state]);
789 789 }
790 790 }
791 791 }
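For context, a hypothetical out-of-tree caller that needs work performed while the other CPUs are paused might now pass its handler directly to pause_cpus() rather than installing a global hook before pausing. This is a sketch under the assumptions stated above the diff header; the handler and function names are made up, and exactly when the handler runs is not visible in this webrev.

	/*
	 * Hypothetical caller (illustration only, not part of this change).
	 * The handler type matches the assumed pause_cpus() prototype above;
	 * it is presumed to run on behalf of the paused CPUs.
	 */
	static void *
	sample_pause_handler(void *arg)
	{
		/* work to perform while the other CPUs are held paused */
		return (NULL);
	}

	static void
	sample_quiesced_operation(void)
	{
		mutex_enter(&cpu_lock);
		pause_cpus(NULL, sample_pause_handler);

		/* ... operation that requires the other CPUs quiesced ... */

		start_cpus();
		mutex_exit(&cpu_lock);
	}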