6583 remove whole-process swapping
--- old/usr/src/uts/common/disp/disp.c
+++ new/usr/src/uts/common/disp/disp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/param.h>
32 32 #include <sys/sysmacros.h>
33 33 #include <sys/signal.h>
34 34 #include <sys/user.h>
35 35 #include <sys/systm.h>
36 36 #include <sys/sysinfo.h>
37 37 #include <sys/var.h>
38 38 #include <sys/errno.h>
39 39 #include <sys/cmn_err.h>
40 40 #include <sys/debug.h>
41 41 #include <sys/inline.h>
42 42 #include <sys/disp.h>
43 43 #include <sys/class.h>
44 44 #include <sys/bitmap.h>
45 45 #include <sys/kmem.h>
46 46 #include <sys/cpuvar.h>
47 47 #include <sys/vtrace.h>
48 48 #include <sys/tnf.h>
49 49 #include <sys/cpupart.h>
50 50 #include <sys/lgrp.h>
51 51 #include <sys/pg.h>
52 52 #include <sys/cmt.h>
53 53 #include <sys/bitset.h>
54 54 #include <sys/schedctl.h>
55 55 #include <sys/atomic.h>
56 56 #include <sys/dtrace.h>
57 57 #include <sys/sdt.h>
58 58 #include <sys/archsystm.h>
59 59
60 60 #include <vm/as.h>
61 61
62 62 #define BOUND_CPU 0x1
63 63 #define BOUND_PARTITION 0x2
64 64 #define BOUND_INTR 0x4
65 65
66 66 /* Dispatch queue allocation structure and functions */
67 67 struct disp_queue_info {
68 68 disp_t *dp;
69 69 dispq_t *olddispq;
70 70 dispq_t *newdispq;
71 71 ulong_t *olddqactmap;
72 72 ulong_t *newdqactmap;
73 73 int oldnglobpris;
74 74 };
75 75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76 76 disp_t *dp);
77 77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 78 static void disp_dq_free(struct disp_queue_info *dptr);
79 79
80 80 /* platform-specific routine to call when processor is idle */
81 81 static void generic_idle_cpu();
82 82 void (*idle_cpu)() = generic_idle_cpu;
83 83
84 84 /* routines invoked when a CPU enters/exits the idle loop */
85 85 static void idle_enter();
86 86 static void idle_exit();
87 87
88 88 /* platform-specific routine to call when thread is enqueued */
[88 lines elided]
89 89 static void generic_enq_thread(cpu_t *, int);
90 90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91 91
92 92 pri_t kpreemptpri; /* priority where kernel preemption applies */
93 93 pri_t upreemptpri = 0; /* priority where normal preemption applies */
94 94 pri_t intr_pri; /* interrupt thread priority base level */
95 95
96 96 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
97 97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
98 98 disp_t cpu0_disp; /* boot CPU's dispatch queue */
99 -disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
100 99 int nswapped; /* total number of swapped threads */
101 -void disp_swapped_enq(kthread_t *tp);
102 100 static void disp_swapped_setrun(kthread_t *tp);
103 101 static void cpu_resched(cpu_t *cp, pri_t tpri);
104 102
105 103 /*
106 104 * If this is set, only interrupt threads will cause kernel preemptions.
107 105 * This is done by changing the value of kpreemptpri. kpreemptpri
108 106 * will either be the max sysclass pri + 1 or the min interrupt pri.
109 107 */
110 108 int only_intr_kpreempt;
111 109
112 110 extern void set_idle_cpu(int cpun);
113 111 extern void unset_idle_cpu(int cpun);
114 112 static void setkpdq(kthread_t *tp, int borf);
115 113 #define SETKP_BACK 0
116 114 #define SETKP_FRONT 1
117 115 /*
118 116 * Parameter that determines how recently a thread must have run
119 117 * on the CPU to be considered loosely-bound to that CPU to reduce
120 118 * cold cache effects. The interval is in hertz.
121 119 */
122 120 #define RECHOOSE_INTERVAL 3
123 121 int rechoose_interval = RECHOOSE_INTERVAL;
124 122
125 123 /*
126 124 * Parameter that determines how long (in nanoseconds) a thread must
127 125 * be sitting on a run queue before it can be stolen by another CPU
128 126 * to reduce migrations. The interval is in nanoseconds.
129 127 *
130 128 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
131 129 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED
132 130 * here, indicating that it is uninitialized.
133 131 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
134 132 *
135 133 */
136 134 #define NOSTEAL_UNINITIALIZED (-1)
137 135 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
138 136 extern void cmp_set_nosteal_interval(void);
139 137
140 138 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
141 139
142 140 disp_lock_t transition_lock; /* lock on transitioning threads */
143 141 disp_lock_t stop_lock; /* lock on stopped threads */
144 142
145 143 static void cpu_dispqalloc(int numpris);
146 144
147 145 /*
148 146 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
149 147 * a thread because it was sitting on its run queue for a very short
150 148 * period of time.
151 149 */
152 150 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
153 151
154 152 static kthread_t *disp_getwork(cpu_t *to);
155 153 static kthread_t *disp_getbest(disp_t *from);
156 154 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
157 155
158 156 void swtch_to(kthread_t *);
159 157
160 158 /*
161 159 * dispatcher and scheduler initialization
162 160 */
163 161
164 162 /*
165 163 * disp_setup - Common code to calculate and allocate dispatcher
166 164 * variables and structures based on the maximum priority.
167 165 */
168 166 static void
169 167 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
170 168 {
171 169 pri_t newnglobpris;
172 170
173 171 ASSERT(MUTEX_HELD(&cpu_lock));
174 172
175 173 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
176 174
177 175 if (newnglobpris > oldnglobpris) {
178 176 /*
179 177 * Allocate new kp queues for each CPU partition.
180 178 */
181 179 cpupart_kpqalloc(newnglobpris);
182 180
183 181 /*
184 182 * Allocate new dispatch queues for each CPU.
185 183 */
186 184 cpu_dispqalloc(newnglobpris);
187 185
188 186 /*
189 187 * compute new interrupt thread base priority
190 188 */
191 189 intr_pri = maxglobpri;
192 190 if (only_intr_kpreempt) {
193 191 kpreemptpri = intr_pri + 1;
194 192 if (kpqpri == KPQPRI)
195 193 kpqpri = kpreemptpri;
196 194 }
197 195 v.v_nglobpris = newnglobpris;
198 196 }
199 197 }
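
A reviewer's note on the arithmetic above: disp_setup() sizes the global priority range to cover every class priority plus headroom for interrupt levels. A minimal sketch of the same computation; the concrete inputs (maxglobpri of 99 and LOCK_LEVEL of 10, typical of a stock TS/SYS setup) are illustrative assumptions, not values taken from this diff:

/*
 * Sketch only, not kernel code: disp_setup()'s queue-sizing math.
 * With maxglobpri == 99 and LOCK_LEVEL == 10 this yields 110, i.e.
 * dispatch queues covering priorities 0..109, with intr_pri starting
 * at maxglobpri so interrupt threads sit above every scheduling class.
 */
static int
nglobpris_for(int maxglobpri, int lock_level)
{
	return (maxglobpri + 1 + lock_level);	/* e.g. 99 + 1 + 10 = 110 */
}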
200 198
201 199 /*
202 200 * dispinit - Called to initialize all loaded classes and the
203 201 * dispatcher framework.
204 202 */
205 203 void
206 204 dispinit(void)
207 205 {
208 206 id_t cid;
209 207 pri_t maxglobpri;
210 208 pri_t cl_maxglobpri;
211 209
212 210 maxglobpri = -1;
213 211
214 212 /*
215 213 * Initialize transition lock, which will always be set.
216 214 */
217 215 DISP_LOCK_INIT(&transition_lock);
218 216 disp_lock_enter_high(&transition_lock);
219 217 DISP_LOCK_INIT(&stop_lock);
220 218
221 219 mutex_enter(&cpu_lock);
222 220 CPU->cpu_disp->disp_maxrunpri = -1;
223 221 CPU->cpu_disp->disp_max_unbound_pri = -1;
224 222
225 223 /*
226 224 * Initialize the default CPU partition.
227 225 */
228 226 cpupart_initialize_default();
229 227 /*
230 228 * Call the class specific initialization functions for
231 229 * all pre-installed schedulers.
232 230 *
233 231 * We pass the size of a class specific parameter
234 232 * buffer to each of the initialization functions
235 233 * to try to catch problems with backward compatibility
236 234 * of class modules.
237 235 *
238 236 * For example a new class module running on an old system
239 237 * which didn't provide sufficiently large parameter buffers
240 238 * would be bad news. Class initialization modules can check for
241 239 * this and take action if they detect a problem.
242 240 */
243 241
244 242 for (cid = 0; cid < nclass; cid++) {
245 243 sclass_t *sc;
246 244
247 245 sc = &sclass[cid];
248 246 if (SCHED_INSTALLED(sc)) {
249 247 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
250 248 &sc->cl_funcs);
251 249 if (cl_maxglobpri > maxglobpri)
252 250 maxglobpri = cl_maxglobpri;
253 251 }
254 252 }
255 253 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
256 254 if (kpqpri == KPQPRI)
257 255 kpqpri = kpreemptpri;
258 256
259 257 ASSERT(maxglobpri >= 0);
260 258 disp_setup(maxglobpri, 0);
261 259
262 260 mutex_exit(&cpu_lock);
263 261
264 262 /*
265 263 * Platform specific sticky scheduler setup.
266 264 */
267 265 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
268 266 cmp_set_nosteal_interval();
269 267
270 268 /*
271 269 * Get the default class ID; this may be later modified via
272 270 * dispadmin(1M). This will load the class (normally TS) and that will
273 271 * call disp_add(), which is why we had to drop cpu_lock first.
274 272 */
275 273 if (getcid(defaultclass, &defaultcid) != 0) {
276 274 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
277 275 defaultclass);
278 276 }
279 277 }
280 278
281 279 /*
282 280 * disp_add - Called with class pointer to initialize the dispatcher
283 281 * for a newly loaded class.
284 282 */
285 283 void
286 284 disp_add(sclass_t *clp)
287 285 {
288 286 pri_t maxglobpri;
289 287 pri_t cl_maxglobpri;
290 288
291 289 mutex_enter(&cpu_lock);
292 290 /*
293 291 * Initialize the scheduler class.
294 292 */
295 293 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
296 294 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
297 295 if (cl_maxglobpri > maxglobpri)
298 296 maxglobpri = cl_maxglobpri;
299 297
300 298 /*
301 299 * Save old queue information. Since we're initializing a
302 300 * new scheduling class which has just been loaded, then
303 301 * the size of the dispq may have changed. We need to handle
304 302 * that here.
305 303 */
306 304 disp_setup(maxglobpri, v.v_nglobpris);
307 305
308 306 mutex_exit(&cpu_lock);
309 307 }
310 308
311 309
312 310 /*
313 311 * For each CPU, allocate new dispatch queues
314 312 * with the stated number of priorities.
315 313 */
316 314 static void
317 315 cpu_dispqalloc(int numpris)
318 316 {
319 317 cpu_t *cpup;
320 318 struct disp_queue_info *disp_mem;
321 319 int i, num;
322 320
323 321 ASSERT(MUTEX_HELD(&cpu_lock));
324 322
325 323 disp_mem = kmem_zalloc(NCPU *
326 324 sizeof (struct disp_queue_info), KM_SLEEP);
327 325
328 326 /*
329 327 * This routine must allocate all of the memory before stopping
330 328 * the cpus because it must not sleep in kmem_alloc while the
331 329 * CPUs are stopped. Locks they hold will not be released until they
332 330 * are restarted.
333 331 */
334 332 i = 0;
335 333 cpup = cpu_list;
336 334 do {
337 335 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
338 336 i++;
339 337 cpup = cpup->cpu_next;
340 338 } while (cpup != cpu_list);
341 339 num = i;
342 340
343 341 pause_cpus(NULL, NULL);
344 342 for (i = 0; i < num; i++)
345 343 disp_dq_assign(&disp_mem[i], numpris);
346 344 start_cpus();
347 345
348 346 /*
349 347 * I must free all of the memory after starting the cpus because
350 348 * I can not risk sleeping in kmem_free while the cpus are stopped.
351 349 */
352 350 for (i = 0; i < num; i++)
353 351 disp_dq_free(&disp_mem[i]);
354 352
355 353 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
356 354 }
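
Worth calling out for review: cpu_dispqalloc() follows a strict three-phase discipline, since nothing may block while CPUs are paused. Condensed into a sketch for a single queue (resize_one_disp() is a hypothetical wrapper; the helpers are the ones defined in this file):

/*
 * Sketch of the pause-safe resize pattern used by cpu_dispqalloc():
 * sleep-capable allocation first, pointer swaps while paused, frees
 * only after the CPUs are running again.
 */
static void
resize_one_disp(struct disp_queue_info *info, int npris, disp_t *dp)
{
	disp_dq_alloc(info, npris, dp);	/* KM_SLEEP is safe here */
	pause_cpus(NULL, NULL);
	disp_dq_assign(info, npris);	/* kcopy + pointer swap only */
	start_cpus();
	disp_dq_free(info);		/* kmem_free may block; CPUs running */
}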
357 355
358 356 static void
359 357 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
360 358 {
361 359 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
362 360 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
363 361 sizeof (long), KM_SLEEP);
364 362 dptr->dp = dp;
365 363 }
366 364
367 365 static void
368 366 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
369 367 {
370 368 disp_t *dp;
371 369
372 370 dp = dptr->dp;
373 371 dptr->olddispq = dp->disp_q;
374 372 dptr->olddqactmap = dp->disp_qactmap;
375 373 dptr->oldnglobpris = dp->disp_npri;
376 374
377 375 ASSERT(dptr->oldnglobpris < numpris);
378 376
379 377 if (dptr->olddispq != NULL) {
380 378 /*
381 379 * Use kcopy because bcopy is platform-specific
382 380 * and could block while we might have paused the cpus.
383 381 */
384 382 (void) kcopy(dptr->olddispq, dptr->newdispq,
385 383 dptr->oldnglobpris * sizeof (dispq_t));
386 384 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
387 385 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
388 386 sizeof (long));
389 387 }
390 388 dp->disp_q = dptr->newdispq;
391 389 dp->disp_qactmap = dptr->newdqactmap;
392 390 dp->disp_q_limit = &dptr->newdispq[numpris];
393 391 dp->disp_npri = numpris;
394 392 }
395 393
396 394 static void
397 395 disp_dq_free(struct disp_queue_info *dptr)
398 396 {
399 397 if (dptr->olddispq != NULL)
400 398 kmem_free(dptr->olddispq,
401 399 dptr->oldnglobpris * sizeof (dispq_t));
402 400 if (dptr->olddqactmap != NULL)
403 401 kmem_free(dptr->olddqactmap,
404 402 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
405 403 }
406 404
407 405 /*
408 406 * For a newly created CPU, initialize the dispatch queue.
409 407 * This is called before the CPU is known through cpu[] or on any lists.
410 408 */
411 409 void
412 410 disp_cpu_init(cpu_t *cp)
413 411 {
414 412 disp_t *dp;
415 413 dispq_t *newdispq;
416 414 ulong_t *newdqactmap;
417 415
418 416 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
419 417
420 418 if (cp == cpu0_disp.disp_cpu)
421 419 dp = &cpu0_disp;
422 420 else
423 421 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
424 422 bzero(dp, sizeof (disp_t));
425 423 cp->cpu_disp = dp;
426 424 dp->disp_cpu = cp;
427 425 dp->disp_maxrunpri = -1;
428 426 dp->disp_max_unbound_pri = -1;
429 427 DISP_LOCK_INIT(&cp->cpu_thread_lock);
430 428 /*
431 429 * Allocate memory for the dispatcher queue headers
432 430 * and the active queue bitmap.
433 431 */
434 432 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
435 433 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
436 434 sizeof (long), KM_SLEEP);
437 435 dp->disp_q = newdispq;
438 436 dp->disp_qactmap = newdqactmap;
439 437 dp->disp_q_limit = &newdispq[v.v_nglobpris];
440 438 dp->disp_npri = v.v_nglobpris;
441 439 }
442 440
443 441 void
444 442 disp_cpu_fini(cpu_t *cp)
445 443 {
446 444 ASSERT(MUTEX_HELD(&cpu_lock));
447 445
448 446 disp_kp_free(cp->cpu_disp);
449 447 if (cp->cpu_disp != &cpu0_disp)
450 448 kmem_free(cp->cpu_disp, sizeof (disp_t));
451 449 }
452 450
453 451 /*
454 452 * Allocate new, larger kpreempt dispatch queue to replace the old one.
455 453 */
456 454 void
457 455 disp_kp_alloc(disp_t *dq, pri_t npri)
458 456 {
459 457 struct disp_queue_info mem_info;
460 458
461 459 if (npri > dq->disp_npri) {
462 460 /*
463 461 * Allocate memory for the new array.
464 462 */
465 463 disp_dq_alloc(&mem_info, npri, dq);
466 464
467 465 /*
468 466 * We need to copy the old structures to the new
469 467 * and free the old.
470 468 */
471 469 disp_dq_assign(&mem_info, npri);
472 470 disp_dq_free(&mem_info);
473 471 }
474 472 }
475 473
476 474 /*
477 475 * Free dispatch queue.
478 476 * Used for the kpreempt queues for a removed CPU partition and
479 477 * for the per-CPU queues of deleted CPUs.
480 478 */
481 479 void
482 480 disp_kp_free(disp_t *dq)
483 481 {
484 482 struct disp_queue_info mem_info;
485 483
486 484 mem_info.olddispq = dq->disp_q;
487 485 mem_info.olddqactmap = dq->disp_qactmap;
488 486 mem_info.oldnglobpris = dq->disp_npri;
489 487 disp_dq_free(&mem_info);
490 488 }
491 489
492 490 /*
493 491 * End dispatcher and scheduler initialization.
494 492 */
495 493
496 494 /*
497 495 * See if there's anything to do other than remain idle.
498 496 * Return non-zero if there is.
499 497 *
500 498 * This function must be called with high spl, or with
501 499 * kernel preemption disabled to prevent the partition's
502 500 * active cpu list from changing while being traversed.
503 501 *
504 502 * This is essentially a simpler version of disp_getwork()
505 503 * to be called by CPUs preparing to "halt".
506 504 */
507 505 int
508 506 disp_anywork(void)
509 507 {
510 508 cpu_t *cp = CPU;
511 509 cpu_t *ocp;
512 510 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
513 511
514 512 if (!(cp->cpu_flags & CPU_OFFLINE)) {
515 513 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
516 514 return (1);
517 515
518 516 for (ocp = cp->cpu_next_part; ocp != cp;
519 517 ocp = ocp->cpu_next_part) {
520 518 ASSERT(CPU_ACTIVE(ocp));
521 519
522 520 /*
523 521 * Something has appeared on the local run queue.
524 522 */
525 523 if (*local_nrunnable > 0)
526 524 return (1);
527 525 /*
528 526 * If we encounter another idle CPU that will
529 527 * soon be trolling around through disp_anywork(),
530 528 * terminate our walk here and let this other CPU
531 529 * patrol the next part of the list.
532 530 */
533 531 if (ocp->cpu_dispatch_pri == -1 &&
534 532 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
535 533 return (0);
536 534 /*
537 535 * Work can be taken from another CPU if:
538 536 * - There is unbound work on the run queue
539 537 * - That work isn't a thread undergoing a
540 538 * context switch on an otherwise empty queue.
541 539 * - The CPU isn't running the idle loop.
542 540 */
543 541 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
544 542 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
545 543 ocp->cpu_disp->disp_nrunnable == 1) &&
546 544 ocp->cpu_dispatch_pri != -1)
547 545 return (1);
548 546 }
549 547 }
550 548 return (0);
551 549 }
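
The steal test in the loop above packs three conditions into one expression; restated as a standalone predicate (a sketch with hypothetical parameter names, not code from this change):

/*
 * Another CPU's work is stealable iff it has unbound runnable work,
 * that work is not just a context-switching thread alone on the queue,
 * and the CPU is not already idle (it would pick the work up itself).
 */
static int
stealable(int max_unbound_pri, int dontsteal, int nrunnable, int disp_pri)
{
	return (max_unbound_pri != -1 &&
	    !(dontsteal && nrunnable == 1) &&
	    disp_pri != -1);
}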
552 550
553 551 /*
554 552 * Called when CPU enters the idle loop
555 553 */
556 554 static void
557 555 idle_enter()
558 556 {
559 557 cpu_t *cp = CPU;
560 558
561 559 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
562 560 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
563 561 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
564 562 }
565 563
566 564 /*
567 565 * Called when CPU exits the idle loop
568 566 */
569 567 static void
570 568 idle_exit()
571 569 {
572 570 cpu_t *cp = CPU;
573 571
574 572 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
575 573 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
576 574 }
577 575
578 576 /*
579 577 * Idle loop.
580 578 */
581 579 void
582 580 idle()
583 581 {
584 582 struct cpu *cp = CPU; /* pointer to this CPU */
585 583 kthread_t *t; /* taken thread */
586 584
587 585 idle_enter();
588 586
589 587 /*
590 588 * Uniprocessor version of idle loop.
591 589 * Do this until notified that we're on an actual multiprocessor.
592 590 */
593 591 while (ncpus == 1) {
594 592 if (cp->cpu_disp->disp_nrunnable == 0) {
595 593 (*idle_cpu)();
596 594 continue;
597 595 }
598 596 idle_exit();
599 597 swtch();
600 598
601 599 idle_enter(); /* returned from swtch */
602 600 }
603 601
604 602 /*
605 603 * Multiprocessor idle loop.
606 604 */
607 605 for (;;) {
608 606 /*
609 607 * If CPU is completely quiesced by p_online(2), just wait
610 608 * here with minimal bus traffic until put online.
611 609 */
612 610 while (cp->cpu_flags & CPU_QUIESCED)
613 611 (*idle_cpu)();
614 612
615 613 if (cp->cpu_disp->disp_nrunnable != 0) {
616 614 idle_exit();
617 615 swtch();
618 616 } else {
619 617 if (cp->cpu_flags & CPU_OFFLINE)
620 618 continue;
621 619 if ((t = disp_getwork(cp)) == NULL) {
622 620 if (cp->cpu_chosen_level != -1) {
623 621 disp_t *dp = cp->cpu_disp;
624 622 disp_t *kpq;
625 623
626 624 disp_lock_enter(&dp->disp_lock);
627 625 /*
628 626 * Set kpq under lock to prevent
629 627 * migration between partitions.
630 628 */
631 629 kpq = &cp->cpu_part->cp_kp_queue;
632 630 if (kpq->disp_maxrunpri == -1)
633 631 cp->cpu_chosen_level = -1;
634 632 disp_lock_exit(&dp->disp_lock);
635 633 }
636 634 (*idle_cpu)();
637 635 continue;
638 636 }
639 637 /*
640 638 * If there was a thread but we couldn't steal
641 639 * it, then keep trying.
642 640 */
643 641 if (t == T_DONTSTEAL)
644 642 continue;
645 643 idle_exit();
646 644 swtch_to(t);
647 645 }
648 646 idle_enter(); /* returned from swtch/swtch_to */
649 647 }
650 648 }
651 649
652 650
653 651 /*
654 652 * Preempt the currently running thread in favor of the highest
655 653 * priority thread. The class of the current thread controls
656 654 * where it goes on the dispatcher queues. If panicking, turn
657 655 * preemption off.
658 656 */
659 657 void
660 658 preempt()
661 659 {
662 660 kthread_t *t = curthread;
663 661 klwp_t *lwp = ttolwp(curthread);
664 662
665 663 if (panicstr)
666 664 return;
667 665
668 666 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
669 667
670 668 thread_lock(t);
671 669
672 670 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
673 671 /*
674 672 * this thread has already been chosen to be run on
675 673 * another CPU. Clear kprunrun on this CPU since we're
676 674 * already headed for swtch().
677 675 */
678 676 CPU->cpu_kprunrun = 0;
679 677 thread_unlock_nopreempt(t);
680 678 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
681 679 } else {
682 680 if (lwp != NULL)
683 681 lwp->lwp_ru.nivcsw++;
684 682 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
685 683 THREAD_TRANSITION(t);
686 684 CL_PREEMPT(t);
687 685 DTRACE_SCHED(preempt);
688 686 thread_unlock_nopreempt(t);
689 687
690 688 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
691 689
692 690 swtch(); /* clears CPU->cpu_runrun via disp() */
693 691 }
694 692 }
695 693
696 694 extern kthread_t *thread_unpin();
697 695
698 696 /*
699 697 * disp() - find the highest priority thread for this processor to run, and
700 698 * set it in TS_ONPROC state so that resume() can be called to run it.
701 699 */
702 700 static kthread_t *
703 701 disp()
704 702 {
705 703 cpu_t *cpup;
706 704 disp_t *dp;
707 705 kthread_t *tp;
708 706 dispq_t *dq;
709 707 int maxrunword;
710 708 pri_t pri;
711 709 disp_t *kpq;
712 710
713 711 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
714 712
715 713 cpup = CPU;
716 714 /*
717 715 * Find the highest priority loaded, runnable thread.
718 716 */
719 717 dp = cpup->cpu_disp;
720 718
721 719 reschedule:
722 720 /*
723 721 * If there is more important work on the global queue with a better
724 722 * priority than the maximum on this CPU, take it now.
725 723 */
726 724 kpq = &cpup->cpu_part->cp_kp_queue;
727 725 while ((pri = kpq->disp_maxrunpri) >= 0 &&
728 726 pri >= dp->disp_maxrunpri &&
729 727 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
730 728 (tp = disp_getbest(kpq)) != NULL) {
731 729 if (disp_ratify(tp, kpq) != NULL) {
732 730 TRACE_1(TR_FAC_DISP, TR_DISP_END,
733 731 "disp_end:tid %p", tp);
734 732 return (tp);
735 733 }
736 734 }
737 735
738 736 disp_lock_enter(&dp->disp_lock);
739 737 pri = dp->disp_maxrunpri;
740 738
741 739 /*
742 740 * If there is nothing to run, look at what's runnable on other queues.
743 741 * Choose the idle thread if the CPU is quiesced.
744 742 * Note that CPUs that have the CPU_OFFLINE flag set can still run
745 743 * interrupt threads, which will be the only threads on the CPU's own
746 744 * queue, but cannot run threads from other queues.
747 745 */
748 746 if (pri == -1) {
749 747 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
750 748 disp_lock_exit(&dp->disp_lock);
751 749 if ((tp = disp_getwork(cpup)) == NULL ||
752 750 tp == T_DONTSTEAL) {
753 751 tp = cpup->cpu_idle_thread;
754 752 (void) splhigh();
755 753 THREAD_ONPROC(tp, cpup);
756 754 cpup->cpu_dispthread = tp;
757 755 cpup->cpu_dispatch_pri = -1;
758 756 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
759 757 cpup->cpu_chosen_level = -1;
760 758 }
761 759 } else {
762 760 disp_lock_exit_high(&dp->disp_lock);
763 761 tp = cpup->cpu_idle_thread;
764 762 THREAD_ONPROC(tp, cpup);
765 763 cpup->cpu_dispthread = tp;
766 764 cpup->cpu_dispatch_pri = -1;
767 765 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
768 766 cpup->cpu_chosen_level = -1;
[657 lines elided]
769 767 }
770 768 TRACE_1(TR_FAC_DISP, TR_DISP_END,
771 769 "disp_end:tid %p", tp);
772 770 return (tp);
773 771 }
774 772
775 773 dq = &dp->disp_q[pri];
776 774 tp = dq->dq_first;
777 775
778 776 ASSERT(tp != NULL);
779 - ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
780 777
781 778 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
782 779
783 780 /*
784 781 * Found it so remove it from queue.
785 782 */
786 783 dp->disp_nrunnable--;
787 784 dq->dq_sruncnt--;
788 785 if ((dq->dq_first = tp->t_link) == NULL) {
789 786 ulong_t *dqactmap = dp->disp_qactmap;
790 787
791 788 ASSERT(dq->dq_sruncnt == 0);
792 789 dq->dq_last = NULL;
793 790
794 791 /*
795 792 * The queue is empty, so the corresponding bit needs to be
796 793 * turned off in dqactmap. If nrunnable != 0, we just took the
797 794 * last runnable thread off the highest queue, so we must
798 795 * recompute disp_maxrunpri.
799 796 */
800 797 maxrunword = pri >> BT_ULSHIFT;
801 798 dqactmap[maxrunword] &= ~BT_BIW(pri);
802 799
803 800 if (dp->disp_nrunnable == 0) {
804 801 dp->disp_max_unbound_pri = -1;
805 802 dp->disp_maxrunpri = -1;
806 803 } else {
807 804 int ipri;
[18 lines elided]
808 805
809 806 ipri = bt_gethighbit(dqactmap, maxrunword);
810 807 dp->disp_maxrunpri = ipri;
811 808 if (ipri < dp->disp_max_unbound_pri)
812 809 dp->disp_max_unbound_pri = ipri;
813 810 }
814 811 } else {
815 812 tp->t_link = NULL;
816 813 }
817 814
818 - /*
819 - * Set TS_DONT_SWAP flag to prevent another processor from swapping
820 - * out this thread before we have a chance to run it.
821 - * While running, it is protected against swapping by t_lock.
822 - */
823 - tp->t_schedflag |= TS_DONT_SWAP;
824 815 cpup->cpu_dispthread = tp; /* protected by spl only */
825 816 cpup->cpu_dispatch_pri = pri;
826 817 ASSERT(pri == DISP_PRIO(tp));
827 818 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
828 819 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
829 820
830 821 ASSERT(tp != NULL);
831 822 TRACE_1(TR_FAC_DISP, TR_DISP_END,
832 823 "disp_end:tid %p", tp);
833 824
834 825 if (disp_ratify(tp, kpq) == NULL)
835 826 goto reschedule;
836 827
837 828 return (tp);
838 829 }
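
For context on the dqactmap bookkeeping in disp() above: when a queue empties, its bit is cleared and the highest remaining set bit becomes the new disp_maxrunpri. A self-contained sketch of that idea in plain C, standing in for the kernel's BT_BIW()/bt_gethighbit() (illustration only, not kernel code):

#include <stdint.h>

#define	WORD_BITS	64

/*
 * Clear priority 'pri' from the active-queue bitmap, then return the
 * highest priority that still has runnable threads, or -1 if none.
 */
static int
recompute_maxrunpri(uint64_t *map, int nwords, int pri)
{
	map[pri / WORD_BITS] &= ~(1ULL << (pri % WORD_BITS));

	for (int w = nwords - 1; w >= 0; w--) {
		if (map[w] == 0)
			continue;
		for (int b = WORD_BITS - 1; b >= 0; b--) {
			if (map[w] & (1ULL << b))
				return (w * WORD_BITS + b);
		}
	}
	return (-1);
}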
839 830
840 831 /*
841 832 * swtch()
842 833 * Find best runnable thread and run it.
843 834 * Called with the current thread already switched to a new state,
844 835 * on a sleep queue, run queue, stopped, and not zombied.
845 836 * May be called at any spl level less than or equal to LOCK_LEVEL.
846 837 * Always drops spl to the base level (spl0()).
847 838 */
848 839 void
849 840 swtch()
850 841 {
851 842 kthread_t *t = curthread;
852 843 kthread_t *next;
853 844 cpu_t *cp;
854 845
855 846 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
856 847
857 848 if (t->t_flag & T_INTR_THREAD)
858 849 cpu_intr_swtch_enter(t);
859 850
860 851 if (t->t_intr != NULL) {
861 852 /*
862 853 * We are an interrupt thread. Setup and return
863 854 * the interrupted thread to be resumed.
864 855 */
865 856 (void) splhigh(); /* block other scheduler action */
866 857 cp = CPU; /* now protected against migration */
867 858 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
868 859 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
869 860 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
870 861 next = thread_unpin();
871 862 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
872 863 resume_from_intr(next);
873 864 } else {
874 865 #ifdef DEBUG
875 866 if (t->t_state == TS_ONPROC &&
876 867 t->t_disp_queue->disp_cpu == CPU &&
877 868 t->t_preempt == 0) {
878 869 thread_lock(t);
879 870 ASSERT(t->t_state != TS_ONPROC ||
880 871 t->t_disp_queue->disp_cpu != CPU ||
881 872 t->t_preempt != 0); /* cannot migrate */
882 873 thread_unlock_nopreempt(t);
883 874 }
884 875 #endif /* DEBUG */
885 876 cp = CPU;
886 877 next = disp(); /* returns with spl high */
887 878 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
888 879
889 880 /* OK to steal anything left on run queue */
890 881 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
891 882
892 883 if (next != t) {
893 884 hrtime_t now;
894 885
895 886 now = gethrtime_unscaled();
896 887 pg_ev_thread_swtch(cp, now, t, next);
897 888
898 889 /*
899 890 * If t was previously in the TS_ONPROC state,
900 891 * setfrontdq and setbackdq won't have set its t_waitrq.
901 892 * Since we now finally know that we're switching away
902 893 * from this thread, set its t_waitrq if it is on a run
903 894 * queue.
904 895 */
905 896 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
906 897 t->t_waitrq = now;
907 898 }
908 899
909 900 /*
910 901 * restore mstate of thread that we are switching to
911 902 */
912 903 restore_mstate(next);
913 904
914 905 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
915 906 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
916 907 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
917 908
918 909 if (dtrace_vtime_active)
919 910 dtrace_vtime_switch(next);
920 911
921 912 resume(next);
922 913 /*
923 914 * The TR_RESUME_END and TR_SWTCH_END trace points
924 915 * appear at the end of resume(), because we may not
925 916 * return here
926 917 */
927 918 } else {
928 919 if (t->t_flag & T_INTR_THREAD)
929 920 cpu_intr_swtch_exit(t);
930 921 /*
931 922 * Threads that enqueue themselves on a run queue defer
932 923 * setting t_waitrq. It is then either set in swtch()
933 924 * when the CPU is actually yielded, or not at all if it
934 925 * is remaining on the CPU.
935 926 * There is however a window between where the thread
936 927 * placed itself on a run queue, and where it selects
937 928 * itself in disp(), where a third party (eg. clock()
938 929 * doing tick processing) may have re-enqueued this
939 930 * thread, setting t_waitrq in the process. We detect
940 931 * this race by noticing that despite switching to
941 932 * ourself, our t_waitrq has been set, and should be
942 933 * cleared.
943 934 */
944 935 if (t->t_waitrq != 0)
945 936 t->t_waitrq = 0;
946 937
947 938 pg_ev_thread_remain(cp, t);
948 939
949 940 DTRACE_SCHED(remain__cpu);
950 941 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
951 942 (void) spl0();
952 943 }
953 944 }
954 945 }
955 946
956 947 /*
957 948 * swtch_from_zombie()
958 949 * Special case of swtch(), which allows checks for TS_ZOMB to be
959 950 * eliminated from normal resume.
960 951 * Find best runnable thread and run it.
961 952 * Called with the current thread zombied.
962 953 * Zombies cannot migrate, so CPU references are safe.
963 954 */
964 955 void
965 956 swtch_from_zombie()
966 957 {
967 958 kthread_t *next;
968 959 cpu_t *cpu = CPU;
969 960
970 961 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
971 962
972 963 ASSERT(curthread->t_state == TS_ZOMB);
973 964
974 965 next = disp(); /* returns with spl high */
975 966 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
976 967 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
977 968 ASSERT(next != curthread);
978 969 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
979 970
980 971 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
981 972
982 973 restore_mstate(next);
983 974
984 975 if (dtrace_vtime_active)
985 976 dtrace_vtime_switch(next);
986 977
987 978 resume_from_zombie(next);
988 979 /*
989 980 * The TR_RESUME_END and TR_SWTCH_END trace points
990 981 * appear at the end of resume(), because we certainly will not
991 982 * return here
992 983 */
993 984 }
994 985
995 986 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
996 987
997 988 /*
998 989 * search_disp_queues()
999 990 * Search the given dispatch queues for thread tp.
1000 991 * Return 1 if tp is found, otherwise return 0.
1001 992 */
1002 993 static int
1003 994 search_disp_queues(disp_t *dp, kthread_t *tp)
1004 995 {
1005 996 dispq_t *dq;
1006 997 dispq_t *eq;
1007 998
1008 999 disp_lock_enter_high(&dp->disp_lock);
1009 1000
1010 1001 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 1002 kthread_t *rp;
1012 1003
1013 1004 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 1005
1015 1006 for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 1007 if (tp == rp) {
1017 1008 disp_lock_exit_high(&dp->disp_lock);
1018 1009 return (1);
1019 1010 }
1020 1011 }
1021 1012 disp_lock_exit_high(&dp->disp_lock);
1022 1013
1023 1014 return (0);
1024 1015 }
1025 1016
1026 1017 /*
1027 1018 * thread_on_queue()
1028 1019 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1029 1020 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030 1021 */
1031 1022 static int
1032 1023 thread_on_queue(kthread_t *tp)
1033 1024 {
1034 1025 cpu_t *cp;
1035 1026 struct cpupart *part;
1036 1027
1037 1028 ASSERT(getpil() >= DISP_LEVEL);
1038 1029
1039 1030 /*
1040 1031 * Search the per-CPU dispatch queues for tp.
1041 1032 */
1042 1033 cp = CPU;
1043 1034 do {
1044 1035 if (search_disp_queues(cp->cpu_disp, tp))
1045 1036 return (1);
1046 1037 } while ((cp = cp->cpu_next_onln) != CPU);
1047 1038
1048 1039 /*
1049 1040 * Search the partition-wide kpreempt queues for tp.
1050 1041 */
1051 1042 part = CPU->cpu_part;
1052 1043 do {
1053 1044 if (search_disp_queues(&part->cp_kp_queue, tp))
1054 1045 return (1);
1055 1046 } while ((part = part->cp_next) != CPU->cpu_part);
1056 1047
1057 1048 return (0);
1058 1049 }
1059 1050
1060 1051 #else
1061 1052
1062 1053 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1063 1054
1064 1055 #endif /* DEBUG */
1065 1056
1066 1057 /*
1067 1058 * like swtch(), but switch to a specified thread taken from another CPU.
1068 1059 * called with spl high..
1069 1060 */
1070 1061 void
1071 1062 swtch_to(kthread_t *next)
1072 1063 {
1073 1064 cpu_t *cp = CPU;
1074 1065 hrtime_t now;
1075 1066
1076 1067 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 1068
1078 1069 /*
1079 1070 * Update context switch statistics.
1080 1071 */
1081 1072 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 1073
1083 1074 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 1075
1085 1076 now = gethrtime_unscaled();
1086 1077 pg_ev_thread_swtch(cp, now, curthread, next);
1087 1078
1088 1079 /* OK to steal anything left on run queue */
1089 1080 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 1081
1091 1082 /* record last execution time */
1092 1083 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 1084
1094 1085 /*
1095 1086 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096 1087 * won't have set its t_waitrq. Since we now finally know that we're
1097 1088 * switching away from this thread, set its t_waitrq if it is on a run
1098 1089 * queue.
1099 1090 */
1100 1091 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 1092 curthread->t_waitrq = now;
1102 1093 }
1103 1094
1104 1095 /* restore next thread to previously running microstate */
1105 1096 restore_mstate(next);
1106 1097
1107 1098 if (dtrace_vtime_active)
1108 1099 dtrace_vtime_switch(next);
1109 1100
1110 1101 resume(next);
1111 1102 /*
1112 1103 * The TR_RESUME_END and TR_SWTCH_END trace points
1113 1104 * appear at the end of resume(), because we may not
1114 1105 * return here
1115 1106 */
1116 1107 }
1117 1108
1118 1109 #define CPU_IDLING(pri) ((pri) == -1)
1119 1110
1120 1111 static void
1121 1112 cpu_resched(cpu_t *cp, pri_t tpri)
1122 1113 {
1123 1114 int call_poke_cpu = 0;
1124 1115 pri_t cpupri = cp->cpu_dispatch_pri;
1125 1116
1126 1117 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127 1118 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 1119 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 1120 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 1121 cp->cpu_runrun = 1;
1131 1122 aston(cp->cpu_dispthread);
1132 1123 if (tpri < kpreemptpri && cp != CPU)
1133 1124 call_poke_cpu = 1;
1134 1125 }
1135 1126 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 1127 cp->cpu_kprunrun = 1;
1137 1128 if (cp != CPU)
1138 1129 call_poke_cpu = 1;
1139 1130 }
1140 1131 }
1141 1132
1142 1133 /*
1143 1134 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144 1135 */
1145 1136 membar_enter();
1146 1137
1147 1138 if (call_poke_cpu)
1148 1139 poke_cpu(cp->cpu_id);
1149 1140 }
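
To summarize cpu_resched() above: cpu_runrun requests a user-level preemption, cpu_kprunrun a kernel-level one, and a cross-CPU poke is only needed for a remote target. A sketch of that decision (hypothetical helper; the real code also issues aston() on the dispatched thread, and the poke applies only when cp != CPU):

/* Sketch, not kernel code: which preemption flags cpu_resched() raises. */
static void
resched_flags(int tpri, int upreemptpri, int kpreemptpri,
    int *runrun, int *kprunrun, int *poke_remote_cpu)
{
	if (tpri >= upreemptpri && *runrun == 0) {
		*runrun = 1;			/* user-level preemption */
		if (tpri < kpreemptpri)
			*poke_remote_cpu = 1;
	}
	if (tpri >= kpreemptpri && *kprunrun == 0) {
		*kprunrun = 1;			/* kernel preemption */
		*poke_remote_cpu = 1;
	}
}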
1150 1141
1151 1142 /*
1152 1143 * setbackdq() keeps runqs balanced such that the difference in length
1153 1144 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154 1145 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155 1146 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156 1147 * try to keep runqs perfectly balanced regardless of the thread priority.
1157 1148 */
1158 1149 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1159 1150 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1160 1151 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
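
The balancing policy described above reduces to a small comparison that setbackdq() applies when choosing between a candidate CPU and its neighbor; as a standalone sketch (hypothetical function, with the constants inlined from the macros above):

/*
 * Move to the neighbor only if our candidate queue is long (beyond the
 * allowed difference, or at all for priorities below RUNQ_MATCH_PRI or
 * when TS_RUNQMATCH is set) and the neighbor's queue is shorter.
 */
static int
prefer_neighbor(int qlen_here, int qlen_there, int tpri, int runqmatch)
{
	int qlen = qlen_here;

	if (tpri >= 16 /* RUNQ_MATCH_PRI */ && !runqmatch)
		qlen -= 2;	/* RUNQ_MAX_DIFF */

	return (qlen > 0 && qlen_there < qlen);
}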
1161 1152
1162 1153 /*
1163 1154 * Macro that evaluates to true if it is likely that the thread has cache
1164 1155 * warmth. This is based on the amount of time that has elapsed since the
1165 1156 * thread last ran. If that amount of time is less than "rechoose_interval"
1166 1157 * ticks, then we decide that the thread has enough cache warmth to warrant
1167 1158 * some affinity for t->t_cpu.
1168 1159 */
1169 1160 #define THREAD_HAS_CACHE_WARMTH(thread) \
1170 1161 ((thread == curthread) || \
1171 1162 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
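
Equivalently: a thread is presumed cache-warm if it is the caller itself or if it last ran within rechoose_interval ticks. A minimal restatement of the macro (sketch, not kernel code):

/* Warm if currently running here, or dispatched within the interval. */
static int
has_cache_warmth(int is_curthread, long now_ticks, long t_disp_time,
    int rechoose_interval)
{
	return (is_curthread ||
	    (now_ticks - t_disp_time) <= rechoose_interval);
}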
1172 1163 /*
1173 1164 * Put the specified thread on the back of the dispatcher
1174 1165 * queue corresponding to its current priority.
1175 1166 *
1176 1167 * Called with the thread in transition, onproc or stopped state
1177 1168 * and locked (transition implies locked) and at high spl.
1178 1169 * Returns with the thread in TS_RUN state and still locked.
1179 1170 */
1180 1171 void
1181 1172 setbackdq(kthread_t *tp)
1182 1173 {
1183 1174 dispq_t *dq;
[350 lines elided]
1184 1175 disp_t *dp;
1185 1176 cpu_t *cp;
1186 1177 pri_t tpri;
1187 1178 int bound;
1188 1179 boolean_t self;
1189 1180
1190 1181 ASSERT(THREAD_LOCK_HELD(tp));
1191 1182 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192 1183 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1193 1184
1194 - /*
1195 - * If thread is "swapped" or on the swap queue don't
1196 - * queue it, but wake sched.
1197 - */
1198 - if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199 - disp_swapped_setrun(tp);
1200 - return;
1201 - }
1202 -
1203 1185 self = (tp == curthread);
1204 1186
1205 1187 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206 1188 bound = 1;
1207 1189 else
1208 1190 bound = 0;
1209 1191
1210 1192 tpri = DISP_PRIO(tp);
1211 1193 if (ncpus == 1)
1212 1194 cp = tp->t_cpu;
1213 1195 else if (!bound) {
1214 1196 if (tpri >= kpqpri) {
1215 1197 setkpdq(tp, SETKP_BACK);
1216 1198 return;
1217 1199 }
1218 1200
1219 1201 /*
1220 1202 * We'll generally let this thread continue to run where
1221 1203 * it last ran...but will consider migration if:
1222 1204 * - The thread probably doesn't have much cache warmth.
1223 1205 * - The CPU where it last ran is the target of an offline
1224 1206 * request.
1225 1207 * - The thread last ran outside its home lgroup.
1226 1208 */
1227 1209 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228 1210 (tp->t_cpu == cpu_inmotion)) {
1229 1211 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230 1212 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231 1213 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232 1214 self ? tp->t_cpu : NULL);
1233 1215 } else {
1234 1216 cp = tp->t_cpu;
1235 1217 }
1236 1218
1237 1219 if (tp->t_cpupart == cp->cpu_part) {
1238 1220 int qlen;
1239 1221
1240 1222 /*
1241 1223 * Perform any CMT load balancing
1242 1224 */
1243 1225 cp = cmt_balance(tp, cp);
1244 1226
1245 1227 /*
1246 1228 * Balance across the run queues
1247 1229 */
1248 1230 qlen = RUNQ_LEN(cp, tpri);
1249 1231 if (tpri >= RUNQ_MATCH_PRI &&
1250 1232 !(tp->t_schedflag & TS_RUNQMATCH))
1251 1233 qlen -= RUNQ_MAX_DIFF;
1252 1234 if (qlen > 0) {
1253 1235 cpu_t *newcp;
1254 1236
1255 1237 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 1238 newcp = cp->cpu_next_part;
1257 1239 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 1240 newcp = cp->cpu_next_part;
1259 1241 }
1260 1242
1261 1243 if (RUNQ_LEN(newcp, tpri) < qlen) {
1262 1244 DTRACE_PROBE3(runq__balance,
1263 1245 kthread_t *, tp,
1264 1246 cpu_t *, cp, cpu_t *, newcp);
1265 1247 cp = newcp;
1266 1248 }
1267 1249 }
1268 1250 } else {
1269 1251 /*
1270 1252 * Migrate to a cpu in the new partition.
1271 1253 */
1272 1254 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273 1255 tp->t_lpl, tp->t_pri, NULL);
1274 1256 }
1275 1257 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 1258 } else {
1277 1259 /*
1278 1260 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 1261 * a short time until weak binding that existed when the
1280 1262 * strong binding was established has dropped) so we must
1281 1263 * favour weak binding over strong.
1282 1264 */
1283 1265 cp = tp->t_weakbound_cpu ?
1284 1266 tp->t_weakbound_cpu : tp->t_bound_cpu;
1285 1267 }
1286 1268 /*
1287 1269 * A thread that is ONPROC may be temporarily placed on the run queue
1288 1270 * but then chosen to run again by disp. If the thread we're placing on
1289 1271 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290 1272 * replacement process is actually scheduled in swtch(). In this
1291 1273 * situation, curthread is the only thread that could be in the ONPROC
1292 1274 * state.
1293 1275 */
1294 1276 if ((!self) && (tp->t_waitrq == 0)) {
1295 1277 hrtime_t curtime;
1296 1278
1297 1279 curtime = gethrtime_unscaled();
1298 1280 (void) cpu_update_pct(tp, curtime);
1299 1281 tp->t_waitrq = curtime;
1300 1282 } else {
1301 1283 (void) cpu_update_pct(tp, gethrtime_unscaled());
1302 1284 }
1303 1285
1304 1286 dp = cp->cpu_disp;
1305 1287 disp_lock_enter_high(&dp->disp_lock);
1306 1288
1307 1289 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308 1290 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309 1291 tpri, cp, tp);
1310 1292
1311 1293 #ifndef NPROBE
1312 1294 /* Kernel probe */
1313 1295 if (tnf_tracing_active)
1314 1296 tnf_thread_queue(tp, cp, tpri);
1315 1297 #endif /* NPROBE */
1316 1298
1317 1299 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 1300
1319 1301 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1320 1302 tp->t_disp_queue = dp;
1321 1303 tp->t_link = NULL;
1322 1304
1323 1305 dq = &dp->disp_q[tpri];
1324 1306 dp->disp_nrunnable++;
1325 1307 if (!bound)
1326 1308 dp->disp_steal = 0;
1327 1309 membar_enter();
1328 1310
1329 1311 if (dq->dq_sruncnt++ != 0) {
1330 1312 ASSERT(dq->dq_first != NULL);
1331 1313 dq->dq_last->t_link = tp;
1332 1314 dq->dq_last = tp;
1333 1315 } else {
1334 1316 ASSERT(dq->dq_first == NULL);
1335 1317 ASSERT(dq->dq_last == NULL);
1336 1318 dq->dq_first = dq->dq_last = tp;
1337 1319 BT_SET(dp->disp_qactmap, tpri);
1338 1320 if (tpri > dp->disp_maxrunpri) {
1339 1321 dp->disp_maxrunpri = tpri;
1340 1322 membar_enter();
1341 1323 cpu_resched(cp, tpri);
1342 1324 }
1343 1325 }
1344 1326
1345 1327 if (!bound && tpri > dp->disp_max_unbound_pri) {
1346 1328 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347 1329 /*
1348 1330 * If there are no other unbound threads on the
1349 1331 * run queue, don't allow other CPUs to steal
1350 1332 * this thread while we are in the middle of a
1351 1333 * context switch. We may just switch to it
1352 1334 * again right away. CPU_DISP_DONTSTEAL is cleared
1353 1335 * in swtch and swtch_to.
1354 1336 */
1355 1337 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356 1338 }
1357 1339 dp->disp_max_unbound_pri = tpri;
1358 1340 }
1359 1341 (*disp_enq_thread)(cp, bound);
1360 1342 }
1361 1343
1362 1344 /*
1363 1345 * Put the specified thread on the front of the dispatcher
1364 1346 * queue corresponding to its current priority.
1365 1347 *
1366 1348 * Called with the thread in transition, onproc or stopped state
1367 1349 * and locked (transition implies locked) and at high spl.
1368 1350 * Returns with the thread in TS_RUN state and still locked.
1369 1351 */
1370 1352 void
1371 1353 setfrontdq(kthread_t *tp)
1372 1354 {
[160 lines elided]
1373 1355 disp_t *dp;
1374 1356 dispq_t *dq;
1375 1357 cpu_t *cp;
1376 1358 pri_t tpri;
1377 1359 int bound;
1378 1360
1379 1361 ASSERT(THREAD_LOCK_HELD(tp));
1380 1362 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381 1363 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1382 1364
1383 - /*
1384 - * If thread is "swapped" or on the swap queue don't
1385 - * queue it, but wake sched.
1386 - */
1387 - if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388 - disp_swapped_setrun(tp);
1389 - return;
1390 - }
1391 -
1392 1365 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393 1366 bound = 1;
1394 1367 else
1395 1368 bound = 0;
1396 1369
1397 1370 tpri = DISP_PRIO(tp);
1398 1371 if (ncpus == 1)
1399 1372 cp = tp->t_cpu;
1400 1373 else if (!bound) {
1401 1374 if (tpri >= kpqpri) {
1402 1375 setkpdq(tp, SETKP_FRONT);
1403 1376 return;
1404 1377 }
1405 1378 cp = tp->t_cpu;
1406 1379 if (tp->t_cpupart == cp->cpu_part) {
1407 1380 /*
1408 1381 * We'll generally let this thread continue to run
1409 1382 * where it last ran, but will consider migration if:
1410 1383 * - The thread last ran outside its home lgroup.
1411 1384 * - The CPU where it last ran is the target of an
1412 1385 * offline request (a thread_nomigrate() on the in
1413 1386 * motion CPU relies on this when forcing a preempt).
1414 1387 * - The thread isn't the highest priority thread where
1415 1388 * it last ran, and it is considered not likely to
1416 1389 * have significant cache warmth.
1417 1390 */
1418 1391 if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419 1392 (cp == cpu_inmotion)) {
1420 1393 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421 1394 (tp == curthread) ? cp : NULL);
1422 1395 } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423 1396 (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424 1397 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425 1398 NULL);
1426 1399 }
1427 1400 } else {
1428 1401 /*
1429 1402 * Migrate to a cpu in the new partition.
1430 1403 */
1431 1404 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432 1405 tp->t_lpl, tp->t_pri, NULL);
1433 1406 }
1434 1407 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 1408 } else {
1436 1409 /*
1437 1410 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 1411 * a short time until weak binding that existed when the
1439 1412 * strong binding was established has dropped) so we must
1440 1413 * favour weak binding over strong.
1441 1414 */
1442 1415 cp = tp->t_weakbound_cpu ?
1443 1416 tp->t_weakbound_cpu : tp->t_bound_cpu;
1444 1417 }
1445 1418
1446 1419 /*
1447 1420 * A thread that is ONPROC may be temporarily placed on the run queue
1448 1421 * but then chosen to run again by disp. If the thread we're placing on
1449 1422 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450 1423 * replacement process is actually scheduled in swtch(). In this
1451 1424 * situation, curthread is the only thread that could be in the ONPROC
1452 1425 * state.
1453 1426 */
1454 1427 if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455 1428 hrtime_t curtime;
1456 1429
1457 1430 curtime = gethrtime_unscaled();
1458 1431 (void) cpu_update_pct(tp, curtime);
1459 1432 tp->t_waitrq = curtime;
1460 1433 } else {
1461 1434 (void) cpu_update_pct(tp, gethrtime_unscaled());
1462 1435 }
1463 1436
1464 1437 dp = cp->cpu_disp;
1465 1438 disp_lock_enter_high(&dp->disp_lock);
1466 1439
1467 1440 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468 1441 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 1442
1470 1443 #ifndef NPROBE
1471 1444 /* Kernel probe */
1472 1445 if (tnf_tracing_active)
1473 1446 tnf_thread_queue(tp, cp, tpri);
1474 1447 #endif /* NPROBE */
1475 1448
1476 1449 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 1450
1478 1451 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1479 1452 tp->t_disp_queue = dp;
1480 1453
1481 1454 dq = &dp->disp_q[tpri];
1482 1455 dp->disp_nrunnable++;
1483 1456 if (!bound)
1484 1457 dp->disp_steal = 0;
1485 1458 membar_enter();
1486 1459
1487 1460 if (dq->dq_sruncnt++ != 0) {
1488 1461 ASSERT(dq->dq_last != NULL);
1489 1462 tp->t_link = dq->dq_first;
1490 1463 dq->dq_first = tp;
1491 1464 } else {
1492 1465 ASSERT(dq->dq_last == NULL);
1493 1466 ASSERT(dq->dq_first == NULL);
1494 1467 tp->t_link = NULL;
1495 1468 dq->dq_first = dq->dq_last = tp;
1496 1469 BT_SET(dp->disp_qactmap, tpri);
1497 1470 if (tpri > dp->disp_maxrunpri) {
1498 1471 dp->disp_maxrunpri = tpri;
1499 1472 membar_enter();
1500 1473 cpu_resched(cp, tpri);
1501 1474 }
1502 1475 }
1503 1476
1504 1477 if (!bound && tpri > dp->disp_max_unbound_pri) {
1505 1478 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506 1479 cp == CPU) {
1507 1480 /*
1508 1481 * If there are no other unbound threads on the
1509 1482 * run queue, don't allow other CPUs to steal
1510 1483 * this thread while we are in the middle of a
1511 1484 * context switch. We may just switch to it
1512 1485 * again right away. CPU_DISP_DONTSTEAL is cleared
1513 1486 * in swtch and swtch_to.
1514 1487 */
1515 1488 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516 1489 }
1517 1490 dp->disp_max_unbound_pri = tpri;
1518 1491 }
1519 1492 (*disp_enq_thread)(cp, bound);
1520 1493 }
1521 1494
1522 1495 /*
1523 1496 * Put a high-priority unbound thread on the kp queue
1524 1497 */
1525 1498 static void
1526 1499 setkpdq(kthread_t *tp, int borf)
1527 1500 {
1528 1501 dispq_t *dq;
1529 1502 disp_t *dp;
1530 1503 cpu_t *cp;
1531 1504 pri_t tpri;
1532 1505
1533 1506 tpri = DISP_PRIO(tp);
1534 1507
1535 1508 dp = &tp->t_cpupart->cp_kp_queue;
1536 1509 disp_lock_enter_high(&dp->disp_lock);
1537 1510
1538 1511 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 1512
1540 1513 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 1514 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 1515 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1543 1516 tp->t_disp_queue = dp;
1544 1517 dp->disp_nrunnable++;
1545 1518 dq = &dp->disp_q[tpri];
1546 1519
1547 1520 if (dq->dq_sruncnt++ != 0) {
1548 1521 if (borf == SETKP_BACK) {
1549 1522 ASSERT(dq->dq_first != NULL);
1550 1523 tp->t_link = NULL;
1551 1524 dq->dq_last->t_link = tp;
1552 1525 dq->dq_last = tp;
1553 1526 } else {
1554 1527 ASSERT(dq->dq_last != NULL);
1555 1528 tp->t_link = dq->dq_first;
1556 1529 dq->dq_first = tp;
1557 1530 }
1558 1531 } else {
1559 1532 if (borf == SETKP_BACK) {
1560 1533 ASSERT(dq->dq_first == NULL);
1561 1534 ASSERT(dq->dq_last == NULL);
1562 1535 dq->dq_first = dq->dq_last = tp;
1563 1536 } else {
1564 1537 ASSERT(dq->dq_last == NULL);
1565 1538 ASSERT(dq->dq_first == NULL);
1566 1539 tp->t_link = NULL;
1567 1540 dq->dq_first = dq->dq_last = tp;
1568 1541 }
1569 1542 BT_SET(dp->disp_qactmap, tpri);
1570 1543 if (tpri > dp->disp_max_unbound_pri)
1571 1544 dp->disp_max_unbound_pri = tpri;
1572 1545 if (tpri > dp->disp_maxrunpri) {
1573 1546 dp->disp_maxrunpri = tpri;
1574 1547 membar_enter();
1575 1548 }
1576 1549 }
1577 1550
1578 1551 cp = tp->t_cpu;
1579 1552 if (tp->t_cpupart != cp->cpu_part) {
1580 1553 /* migrate to a cpu in the new partition */
1581 1554 cp = tp->t_cpupart->cp_cpulist;
1582 1555 }
1583 1556 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584 1557 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 1558 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 1559
1587 1560 #ifndef NPROBE
1588 1561 /* Kernel probe */
1589 1562 if (tnf_tracing_active)
1590 1563 tnf_thread_queue(tp, cp, tpri);
1591 1564 #endif /* NPROBE */
1592 1565
1593 1566 if (cp->cpu_chosen_level < tpri)
1594 1567 cp->cpu_chosen_level = tpri;
1595 1568 cpu_resched(cp, tpri);
1596 1569 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 1570 (*disp_enq_thread)(cp, 0);
1598 1571 }
1599 1572
1600 1573 /*
1601 1574 * Remove a thread from the dispatcher queue if it is on it.
1602 1575 * It is not an error if it is not found but we return whether
1603 1576 * or not it was found in case the caller wants to check.
1604 1577 */
1605 1578 int
1606 1579 dispdeq(kthread_t *tp)
1607 1580 {
1608 1581 disp_t *dp;
1609 1582 dispq_t *dq;
[208 lines elided]
1610 1583 kthread_t *rp;
1611 1584 kthread_t *trp;
1612 1585 kthread_t **ptp;
1613 1586 int tpri;
1614 1587
1615 1588 ASSERT(THREAD_LOCK_HELD(tp));
1616 1589
1617 1590 if (tp->t_state != TS_RUN)
1618 1591 return (0);
1619 1592
1620 - /*
1621 - * The thread is "swapped" or is on the swap queue and
1622 - * hence no longer on the run queue, so return true.
1623 - */
1624 - if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 - return (1);
1626 -
1627 1593 tpri = DISP_PRIO(tp);
1628 1594 dp = tp->t_disp_queue;
1629 1595 ASSERT(tpri < dp->disp_npri);
1630 1596 dq = &dp->disp_q[tpri];
1631 1597 ptp = &dq->dq_first;
1632 1598 rp = *ptp;
1633 1599 trp = NULL;
1634 1600
1635 1601 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 1602
1637 1603 /*
1638 1604 * Search for thread in queue.
1639 1605 * Double links would simplify this at the expense of disp/setrun.
1640 1606 */
1641 1607 while (rp != tp && rp != NULL) {
1642 1608 trp = rp;
1643 1609 ptp = &trp->t_link;
1644 1610 rp = trp->t_link;
1645 1611 }
1646 1612
1647 1613 if (rp == NULL) {
1648 1614 panic("dispdeq: thread not on queue");
1649 1615 }
1650 1616
1651 1617 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 1618
1653 1619 /*
1654 1620 * Found it so remove it from queue.
1655 1621 */
1656 1622 if ((*ptp = rp->t_link) == NULL)
1657 1623 dq->dq_last = trp;
1658 1624
1659 1625 dp->disp_nrunnable--;
1660 1626 if (--dq->dq_sruncnt == 0) {
1661 1627 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 1628 if (dp->disp_nrunnable == 0) {
1663 1629 dp->disp_max_unbound_pri = -1;
1664 1630 dp->disp_maxrunpri = -1;
1665 1631 } else if (tpri == dp->disp_maxrunpri) {
1666 1632 int ipri;
1667 1633
1668 1634 ipri = bt_gethighbit(dp->disp_qactmap,
1669 1635 dp->disp_maxrunpri >> BT_ULSHIFT);
[33 lines elided]
1670 1636 if (ipri < dp->disp_max_unbound_pri)
1671 1637 dp->disp_max_unbound_pri = ipri;
1672 1638 dp->disp_maxrunpri = ipri;
1673 1639 }
1674 1640 }
1675 1641 tp->t_link = NULL;
1676 1642 THREAD_TRANSITION(tp); /* put in intermediate state */
1677 1643 return (1);
1678 1644 }
1679 1645
1680 -
1681 -/*
1682 - * dq_sruninc and dq_srundec are public functions for
1683 - * incrementing/decrementing the sruncnts when a thread on
1684 - * a dispatcher queue is made schedulable/unschedulable by
1685 - * resetting the TS_LOAD flag.
1686 - *
1687 - * The caller MUST have the thread lock and therefore the dispatcher
1688 - * queue lock so that the operation which changes
1689 - * the flag, the operation that checks the status of the thread to
1690 - * determine if it's on a disp queue AND the call to this function
1691 - * are one atomic operation with respect to interrupts.
1692 - */
1693 -
1694 -/*
1695 - * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1696 - */
1697 -void
1698 -dq_sruninc(kthread_t *t)
1699 -{
1700 - ASSERT(t->t_state == TS_RUN);
1701 - ASSERT(t->t_schedflag & TS_LOAD);
1702 -
1703 - THREAD_TRANSITION(t);
1704 - setfrontdq(t);
1705 -}
1706 -
1707 -/*
1708 - * See comment on calling conventions above.
1709 - * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1710 - */
1711 -void
1712 -dq_srundec(kthread_t *t)
1713 -{
1714 - ASSERT(t->t_schedflag & TS_LOAD);
1715 -
1716 - (void) dispdeq(t);
1717 - disp_swapped_enq(t);
1718 -}
1719 -
1720 -/*
1721 - * Change the dispatcher lock of thread to the "swapped_lock"
1722 - * and return with thread lock still held.
1723 - *
1724 - * Called with thread_lock held, in transition state, and at high spl.
1725 - */
1726 -void
1727 -disp_swapped_enq(kthread_t *tp)
1728 -{
1729 - ASSERT(THREAD_LOCK_HELD(tp));
1730 - ASSERT(tp->t_schedflag & TS_LOAD);
1731 -
1732 - switch (tp->t_state) {
1733 - case TS_RUN:
1734 - disp_lock_enter_high(&swapped_lock);
1735 - THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1736 - break;
1737 - case TS_ONPROC:
1738 - disp_lock_enter_high(&swapped_lock);
1739 - THREAD_TRANSITION(tp);
1740 - wake_sched_sec = 1; /* tell clock to wake sched */
1741 - THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1742 - break;
1743 - default:
1744 - panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1745 - }
1746 -}
1747 -
1748 -/*
1749 - * This routine is called by setbackdq/setfrontdq if the thread is
1750 - * not loaded or loaded and on the swap queue.
1751 - *
1752 - * Thread state TS_SLEEP implies that a swapped thread
1753 - * has been woken up and needs to be swapped in by the swapper.
1754 - *
1755 - * Thread state TS_RUN, it implies that the priority of a swapped
1756 - * thread is being increased by scheduling class (e.g. ts_update).
1757 - */
1758 -static void
1759 -disp_swapped_setrun(kthread_t *tp)
1760 -{
1761 - ASSERT(THREAD_LOCK_HELD(tp));
1762 - ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 -
1764 - switch (tp->t_state) {
1765 - case TS_SLEEP:
1766 - disp_lock_enter_high(&swapped_lock);
1767 - /*
1768 - * Wakeup sched immediately (i.e., next tick) if the
1769 - * thread priority is above maxclsyspri.
1770 - */
1771 - if (DISP_PRIO(tp) > maxclsyspri)
1772 - wake_sched = 1;
1773 - else
1774 - wake_sched_sec = 1;
1775 - THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 - break;
1777 - case TS_RUN: /* called from ts_update */
1778 - break;
1779 - default:
1780 - panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781 - }
1782 -}
1783 -
1784 1646 /*
1785 1647 * Make a thread give up its processor. Find the processor on
1786 1648 * which this thread is executing, and have that processor
1787 1649 * preempt.
1788 1650 *
1789 1651 * We allow System Duty Cycle (SDC) threads to be preempted even if
1790 1652 * they are running at kernel priorities. To implement this, we always
1791 1653 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1792 1654 * calls cpu_surrender() very often, we only preempt if there is anyone
1793 1655 * competing with us.
1794 1656 */
1795 1657 void
1796 1658 cpu_surrender(kthread_t *tp)
1797 1659 {
1798 1660 cpu_t *cpup;
1799 1661 int max_pri;
1800 1662 int max_run_pri;
1801 1663 klwp_t *lwp;
1802 1664
1803 1665 ASSERT(THREAD_LOCK_HELD(tp));
1804 1666
1805 1667 if (tp->t_state != TS_ONPROC)
1806 1668 return;
1807 1669 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1808 1670 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 1671 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 1672 if (max_pri < max_run_pri)
1811 1673 max_pri = max_run_pri;
1812 1674
1813 1675 if (tp->t_cid == sysdccid) {
1814 1676 uint_t t_pri = DISP_PRIO(tp);
1815 1677 if (t_pri > max_pri)
1816 1678 return; /* we are not competing w/ anyone */
1817 1679 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 1680 } else {
1819 1681 cpup->cpu_runrun = 1;
1820 1682 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 1683 cpup->cpu_kprunrun = 1;
1822 1684 }
1823 1685 }
1824 1686
1825 1687 /*
 1826 1689 	 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1827 1689 */
1828 1690 membar_enter();
1829 1691
1830 1692 DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 1693
1832 1694 /*
1833 1695 * Make the target thread take an excursion through trap()
1834 1696 * to do preempt() (unless we're already in trap or post_syscall,
1835 1697 * calling cpu_surrender via CL_TRAPRET).
1836 1698 */
1837 1699 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 1700 lwp->lwp_state != LWP_USER) {
1839 1701 aston(tp);
1840 1702 if (cpup != CPU)
1841 1703 poke_cpu(cpup->cpu_id);
1842 1704 }
1843 1705 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 1706 "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 1707 }
1846 1708
1847 1709 /*
1848 1710 * Commit to and ratify a scheduling decision
1849 1711 */
1850 1712 /*ARGSUSED*/
1851 1713 static kthread_t *
1852 1714 disp_ratify(kthread_t *tp, disp_t *kpq)
1853 1715 {
1854 1716 pri_t tpri, maxpri;
1855 1717 pri_t maxkpri;
1856 1718 cpu_t *cpup;
1857 1719
1858 1720 ASSERT(tp != NULL);
1859 1721 /*
1860 1722 * Commit to, then ratify scheduling decision
1861 1723 */
1862 1724 cpup = CPU;
1863 1725 if (cpup->cpu_runrun != 0)
1864 1726 cpup->cpu_runrun = 0;
1865 1727 if (cpup->cpu_kprunrun != 0)
1866 1728 cpup->cpu_kprunrun = 0;
1867 1729 if (cpup->cpu_chosen_level != -1)
1868 1730 cpup->cpu_chosen_level = -1;
1869 1731 membar_enter();
1870 1732 tpri = DISP_PRIO(tp);
1871 1733 maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 1734 maxkpri = kpq->disp_maxrunpri;
1873 1735 if (maxpri < maxkpri)
1874 1736 maxpri = maxkpri;
1875 1737 if (tpri < maxpri) {
1876 1738 /*
1877 1739 * should have done better
1878 1740 * put this one back and indicate to try again
1879 1741 */
1880 1742 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1881 1743 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 1744 thread_lock_high(tp);
1883 1745 THREAD_TRANSITION(tp);
1884 1746 setfrontdq(tp);
1885 1747 thread_unlock_nopreempt(tp);
1886 1748
1887 1749 tp = NULL;
1888 1750 }
1889 1751 return (tp);
1890 1752 }
1891 1753
1892 1754 /*
1893 1755 * See if there is any work on the dispatcher queue for other CPUs.
1894 1756 * If there is, dequeue the best thread and return.
1895 1757 */
1896 1758 static kthread_t *
1897 1759 disp_getwork(cpu_t *cp)
1898 1760 {
1899 1761 cpu_t *ocp; /* other CPU */
1900 1762 cpu_t *ocp_start;
1901 1763 cpu_t *tcp; /* target local CPU */
1902 1764 kthread_t *tp;
1903 1765 kthread_t *retval = NULL;
1904 1766 pri_t maxpri;
1905 1767 disp_t *kpq; /* kp queue for this partition */
1906 1768 lpl_t *lpl, *lpl_leaf;
1907 1769 int leafidx, startidx;
1908 1770 hrtime_t stealtime;
1909 1771 lgrp_id_t local_id;
1910 1772
1911 1773 maxpri = -1;
1912 1774 tcp = NULL;
1913 1775
1914 1776 kpq = &cp->cpu_part->cp_kp_queue;
1915 1777 while (kpq->disp_maxrunpri >= 0) {
1916 1778 /*
1917 1779 * Try to take a thread from the kp_queue.
1918 1780 */
 1919 1781 		tp = disp_getbest(kpq);
1920 1782 if (tp)
1921 1783 return (disp_ratify(tp, kpq));
1922 1784 }
1923 1785
1924 1786 kpreempt_disable(); /* protect the cpu_active list */
1925 1787
1926 1788 /*
1927 1789 * Try to find something to do on another CPU's run queue.
1928 1790 * Loop through all other CPUs looking for the one with the highest
1929 1791 * priority unbound thread.
1930 1792 *
1931 1793 * On NUMA machines, the partition's CPUs are consulted in order of
1932 1794 * distance from the current CPU. This way, the first available
1933 1795 * work found is also the closest, and will suffer the least
1934 1796 * from being migrated.
1935 1797 */
1936 1798 lpl = lpl_leaf = cp->cpu_lpl;
1937 1799 local_id = lpl_leaf->lpl_lgrpid;
1938 1800 leafidx = startidx = 0;
1939 1801
1940 1802 /*
1941 1803 * This loop traverses the lpl hierarchy. Higher level lpls represent
1942 1804 * broader levels of locality
1943 1805 */
1944 1806 do {
1945 1807 /* This loop iterates over the lpl's leaves */
1946 1808 do {
1947 1809 if (lpl_leaf != cp->cpu_lpl)
1948 1810 ocp = lpl_leaf->lpl_cpus;
1949 1811 else
1950 1812 ocp = cp->cpu_next_lpl;
1951 1813
1952 1814 /* This loop iterates over the CPUs in the leaf */
1953 1815 ocp_start = ocp;
1954 1816 do {
1955 1817 pri_t pri;
1956 1818
1957 1819 ASSERT(CPU_ACTIVE(ocp));
1958 1820
1959 1821 /*
1960 1822 * End our stroll around this lpl if:
1961 1823 *
1962 1824 * - Something became runnable on the local
1963 1825 * queue...which also ends our stroll around
1964 1826 * the partition.
1965 1827 *
1966 1828 * - We happen across another idle CPU.
1967 1829 * Since it is patrolling the next portion
1968 1830 * of the lpl's list (assuming it's not
1969 1831 * halted, or busy servicing an interrupt),
1970 1832 * move to the next higher level of locality.
1971 1833 */
1972 1834 if (cp->cpu_disp->disp_nrunnable != 0) {
1973 1835 kpreempt_enable();
1974 1836 return (NULL);
1975 1837 }
1976 1838 if (ocp->cpu_dispatch_pri == -1) {
1977 1839 if (ocp->cpu_disp_flags &
1978 1840 CPU_DISP_HALTED ||
1979 1841 ocp->cpu_intr_actv != 0)
1980 1842 continue;
1981 1843 else
1982 1844 goto next_level;
1983 1845 }
1984 1846
1985 1847 /*
1986 1848 * If there's only one thread and the CPU
1987 1849 * is in the middle of a context switch,
1988 1850 * or it's currently running the idle thread,
1989 1851 * don't steal it.
1990 1852 */
1991 1853 if ((ocp->cpu_disp_flags &
1992 1854 CPU_DISP_DONTSTEAL) &&
1993 1855 ocp->cpu_disp->disp_nrunnable == 1)
1994 1856 continue;
1995 1857
1996 1858 pri = ocp->cpu_disp->disp_max_unbound_pri;
1997 1859 if (pri > maxpri) {
1998 1860 /*
1999 1861 * Don't steal threads that we attempted
2000 1862 * to steal recently until they're ready
2001 1863 * to be stolen again.
2002 1864 */
2003 1865 stealtime = ocp->cpu_disp->disp_steal;
2004 1866 if (stealtime == 0 ||
2005 1867 stealtime - gethrtime() <= 0) {
2006 1868 maxpri = pri;
2007 1869 tcp = ocp;
2008 1870 } else {
2009 1871 /*
2010 1872 * Don't update tcp, just set
2011 1873 * the retval to T_DONTSTEAL, so
2012 1874 * that if no acceptable CPUs
2013 1875 * are found the return value
2014 1876 * will be T_DONTSTEAL rather
2015 1877 * then NULL.
2016 1878 */
2017 1879 retval = T_DONTSTEAL;
2018 1880 }
2019 1881 }
2020 1882 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 1883
2022 1884 /*
2023 1885 * Iterate to the next leaf lpl in the resource set
2024 1886 * at this level of locality. If we hit the end of
2025 1887 * the set, wrap back around to the beginning.
2026 1888 *
 2027 1889 	 * Note: This iteration is NULL terminated for a reason;
2028 1890 * see lpl_topo_bootstrap() in lgrp.c for details.
2029 1891 */
2030 1892 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031 1893 leafidx = 0;
2032 1894 lpl_leaf = lpl->lpl_rset[leafidx];
2033 1895 }
2034 1896 } while (leafidx != startidx);
2035 1897
2036 1898 next_level:
2037 1899 /*
2038 1900 * Expand the search to include farther away CPUs (next
2039 1901 * locality level). The closer CPUs that have already been
2040 1902 * checked will be checked again. In doing so, idle CPUs
 2041 1903 	 * will tend to be more aggressive about stealing from CPUs
2042 1904 * that are closer (since the closer CPUs will be considered
2043 1905 * more often).
2044 1906 * Begin at this level with the CPUs local leaf lpl.
2045 1907 */
2046 1908 if ((lpl = lpl->lpl_parent) != NULL) {
2047 1909 leafidx = startidx = lpl->lpl_id2rset[local_id];
2048 1910 lpl_leaf = lpl->lpl_rset[leafidx];
2049 1911 }
2050 1912 } while (!tcp && lpl);
2051 1913
2052 1914 kpreempt_enable();
2053 1915
2054 1916 /*
2055 1917 * If another queue looks good, and there is still nothing on
2056 1918 * the local queue, try to transfer one or more threads
2057 1919 * from it to our queue.
2058 1920 */
2059 1921 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060 1922 tp = disp_getbest(tcp->cpu_disp);
2061 1923 if (tp == NULL || tp == T_DONTSTEAL)
2062 1924 return (tp);
2063 1925 return (disp_ratify(tp, kpq));
2064 1926 }
2065 1927 return (retval);
2066 1928 }
2067 1929
2068 1930
2069 1931 /*
2070 1932 * disp_fix_unbound_pri()
2071 1933 * Determines the maximum priority of unbound threads on the queue.
2072 1934 * The priority is kept for the queue, but is only increased, never
2073 1935 * reduced unless some CPU is looking for something on that queue.
2074 1936 *
2075 1937 * The priority argument is the known upper limit.
2076 1938 *
2077 1939 * Perhaps this should be kept accurately, but that probably means
2078 1940 * separate bitmaps for bound and unbound threads. Since only idled
2079 1941 * CPUs will have to do this recalculation, it seems better this way.
2080 1942 */
2081 1943 static void
2082 1944 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 1945 {
2084 1946 kthread_t *tp;
2085 1947 dispq_t *dq;
2086 1948 ulong_t *dqactmap = dp->disp_qactmap;
2087 1949 ulong_t mapword;
2088 1950 int wx;
2089 1951
2090 1952 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 1953
2092 1954 ASSERT(pri >= 0); /* checked by caller */
2093 1955
2094 1956 /*
2095 1957 * Start the search at the next lowest priority below the supplied
2096 1958 * priority. This depends on the bitmap implementation.
2097 1959 */
2098 1960 do {
2099 1961 wx = pri >> BT_ULSHIFT; /* index of word in map */
2100 1962
2101 1963 /*
2102 1964 * Form mask for all lower priorities in the word.
2103 1965 */
2104 1966 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2105 1967
2106 1968 /*
2107 1969 * Get next lower active priority.
2108 1970 */
2109 1971 if (mapword != 0) {
2110 1972 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 1973 } else if (wx > 0) {
2112 1974 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 1975 if (pri < 0)
2114 1976 break;
2115 1977 } else {
2116 1978 pri = -1;
2117 1979 break;
2118 1980 }
2119 1981
2120 1982 /*
2121 1983 * Search the queue for unbound, runnable threads.
2122 1984 */
2123 1985 dq = &dp->disp_q[pri];
2124 1986 tp = dq->dq_first;
2125 1987
2126 1988 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 1989 tp = tp->t_link;
2128 1990 }
2129 1991
2130 1992 /*
2131 1993 * If a thread was found, set the priority and return.
2132 1994 */
2133 1995 } while (tp == NULL);
2134 1996
2135 1997 /*
2136 1998 * pri holds the maximum unbound thread priority or -1.
2137 1999 */
2138 2000 if (dp->disp_max_unbound_pri != pri)
2139 2001 dp->disp_max_unbound_pri = pri;
2140 2002 }
2141 2003
2142 2004 /*
2143 2005 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 2144 2006 	 * check if the CPU to which it was previously bound should have
2145 2007 * its disp_max_unbound_pri increased.
2146 2008 */
 2146 2008 	 */
2147 2009 void
2148 2010 disp_adjust_unbound_pri(kthread_t *tp)
2149 2011 {
2150 2012 disp_t *dp;
2151 2013 pri_t tpri;
2152 2014
2153 2015 ASSERT(THREAD_LOCK_HELD(tp));
2154 2016
2155 2017 /*
2156 2018 * Don't do anything if the thread is not bound, or
2157 - * currently not runnable or swapped out.
2019 + * currently not runnable.
2158 2020 */
2159 2021 if (tp->t_bound_cpu == NULL ||
2160 - tp->t_state != TS_RUN ||
2161 - tp->t_schedflag & TS_ON_SWAPQ)
2022 + tp->t_state != TS_RUN)
2162 2023 return;
2163 2024
2164 2025 tpri = DISP_PRIO(tp);
2165 2026 dp = tp->t_bound_cpu->cpu_disp;
2166 2027 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 2028 if (tpri > dp->disp_max_unbound_pri)
2168 2029 dp->disp_max_unbound_pri = tpri;
2169 2030 }
2170 2031
2171 2032 /*
2172 2033 * disp_getbest()
2173 2034 * De-queue the highest priority unbound runnable thread.
2174 2035 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175 2036 * Returns NULL if nothing found.
2176 2037 * Returns T_DONTSTEAL if the thread was not stealable.
2177 2038 * so that the caller will try again later.
2178 2039 *
 2179 2040 	 * Passed a pointer to a dispatch queue that is not associated
 2180 2041 	 * with this CPU.
2181 2042 */
2182 2043 static kthread_t *
2183 2044 disp_getbest(disp_t *dp)
2184 2045 {
2185 2046 kthread_t *tp;
2186 2047 dispq_t *dq;
2187 2048 pri_t pri;
2188 2049 cpu_t *cp, *tcp;
2189 2050 boolean_t allbound;
2190 2051
2191 2052 disp_lock_enter(&dp->disp_lock);
2192 2053
2193 2054 /*
2194 2055 * If there is nothing to run, or the CPU is in the middle of a
2195 2056 * context switch of the only thread, return NULL.
2196 2057 */
2197 2058 tcp = dp->disp_cpu;
2198 2059 cp = CPU;
2199 2060 pri = dp->disp_max_unbound_pri;
2200 2061 if (pri == -1 ||
2201 2062 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202 2063 tcp->cpu_disp->disp_nrunnable == 1)) {
2203 2064 disp_lock_exit_nopreempt(&dp->disp_lock);
2204 2065 return (NULL);
2205 2066 }
2206 2067
2207 2068 dq = &dp->disp_q[pri];
2208 2069
2209 2070
2210 2071 /*
2211 2072 * Assume that all threads are bound on this queue, and change it
2212 2073 * later when we find out that it is not the case.
2213 2074 */
2214 2075 allbound = B_TRUE;
2215 2076 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216 2077 hrtime_t now, nosteal, rqtime;
2217 2078
2218 2079 /*
2219 2080 * Skip over bound threads which could be here even
2220 2081 * though disp_max_unbound_pri indicated this level.
2221 2082 */
2222 2083 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223 2084 continue;
2224 2085
2225 2086 /*
2226 2087 * We've got some unbound threads on this queue, so turn
2227 2088 * the allbound flag off now.
2228 2089 */
2229 2090 allbound = B_FALSE;
2230 2091
2231 2092 /*
2232 2093 * The thread is a candidate for stealing from its run queue. We
2233 2094 * don't want to steal threads that became runnable just a
2234 2095 * moment ago. This improves CPU affinity for threads that get
2235 2096 * preempted for short periods of time and go back on the run
2236 2097 * queue.
2237 2098 *
2238 2099 * We want to let it stay on its run queue if it was only placed
2239 2100 * there recently and it was running on the same CPU before that
2240 2101 * to preserve its cache investment. For the thread to remain on
2241 2102 * its run queue, ALL of the following conditions must be
2242 2103 * satisfied:
2243 2104 *
2244 2105 * - the disp queue should not be the kernel preemption queue
2245 2106 * - delayed idle stealing should not be disabled
2246 2107 * - nosteal_nsec should be non-zero
2247 2108 * - it should run with user priority
2248 2109 * - it should be on the run queue of the CPU where it was
2249 2110 * running before being placed on the run queue
2250 2111 * - it should be the only thread on the run queue (to prevent
2251 2112 * extra scheduling latency for other threads)
2252 2113 * - it should sit on the run queue for less than per-chip
2253 2114 * nosteal interval or global nosteal interval
2254 2115 * - in case of CPUs with shared cache it should sit in a run
2255 2116 * queue of a CPU from a different chip
2256 2117 *
2257 2118 * The checks are arranged so that the ones that are faster are
2258 2119 * placed earlier.
2259 2120 */
2260 2121 if (tcp == NULL ||
2261 2122 pri >= minclsyspri ||
2262 2123 tp->t_cpu != tcp)
2263 2124 break;
2264 2125
2265 2126 /*
 2266 2127 	 * Steal immediately if, due to CMT processor architecture,
 2267 2128 	 * migration between cp and tcp would incur no performance
2268 2129 * penalty.
2269 2130 */
2270 2131 if (pg_cmt_can_migrate(cp, tcp))
2271 2132 break;
2272 2133
2273 2134 nosteal = nosteal_nsec;
2274 2135 if (nosteal == 0)
2275 2136 break;
2276 2137
2277 2138 /*
2278 2139 * Calculate time spent sitting on run queue
2279 2140 */
2280 2141 now = gethrtime_unscaled();
2281 2142 rqtime = now - tp->t_waitrq;
2282 2143 scalehrtime(&rqtime);
2283 2144
2284 2145 /*
2285 2146 * Steal immediately if the time spent on this run queue is more
2286 2147 * than allowed nosteal delay.
2287 2148 *
2288 2149 * Negative rqtime check is needed here to avoid infinite
2289 2150 * stealing delays caused by unlikely but not impossible
2290 2151 * drifts between CPU times on different CPUs.
2291 2152 */
2292 2153 if (rqtime > nosteal || rqtime < 0)
2293 2154 break;
2294 2155
2295 2156 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296 2157 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297 2158 scalehrtime(&now);
2298 2159 /*
2299 2160 * Calculate when this thread becomes stealable
2300 2161 */
2301 2162 now += (nosteal - rqtime);
2302 2163
2303 2164 /*
2304 2165 * Calculate time when some thread becomes stealable
2305 2166 */
2306 2167 if (now < dp->disp_steal)
2307 2168 dp->disp_steal = now;
2308 2169 }
2309 2170
2310 2171 /*
2311 2172 * If there were no unbound threads on this queue, find the queue
2312 2173 * where they are and then return later. The value of
2313 2174 * disp_max_unbound_pri is not always accurate because it isn't
2314 2175 * reduced until another idle CPU looks for work.
2315 2176 */
2316 2177 if (allbound)
2317 2178 disp_fix_unbound_pri(dp, pri);
2318 2179
2319 2180 /*
2320 2181 * If we reached the end of the queue and found no unbound threads
2321 2182 * then return NULL so that other CPUs will be considered. If there
2322 2183 * are unbound threads but they cannot yet be stolen, then
2323 2184 * return T_DONTSTEAL and try again later.
2324 2185 */
2325 2186 if (tp == NULL) {
2326 2187 disp_lock_exit_nopreempt(&dp->disp_lock);
2327 2188 return (allbound ? NULL : T_DONTSTEAL);
2328 2189 }
2329 2190
2330 2191 /*
2331 2192 * Found a runnable, unbound thread, so remove it from queue.
2332 2193 * dispdeq() requires that we have the thread locked, and we do,
2333 2194 * by virtue of holding the dispatch queue lock. dispdeq() will
2334 2195 * put the thread in transition state, thereby dropping the dispq
2335 2196 * lock.
2336 2197 */
2337 2198
2338 2199 #ifdef DEBUG
2339 2200 {
2340 2201 int thread_was_on_queue;
2341 2202
2342 2203 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2343 2204 ASSERT(thread_was_on_queue);
2344 2205 }
2345 2206
2346 2207 #else /* DEBUG */
2347 2208 (void) dispdeq(tp); /* drops disp_lock */
2348 2209 #endif /* DEBUG */
2349 2210
2350 2211 /*
 2351 2212 	 * Reset the disp_queue steal time - we do not know what the smallest
2352 2213 * value across the queue is.
2353 2214 */
2354 2215 dp->disp_steal = 0;
2355 -
2356 - tp->t_schedflag |= TS_DONT_SWAP;
2357 2216
2358 2217 /*
2359 2218 * Setup thread to run on the current CPU.
2360 2219 */
2361 2220 tp->t_disp_queue = cp->cpu_disp;
2362 2221
2363 2222 cp->cpu_dispthread = tp; /* protected by spl only */
2364 2223 cp->cpu_dispatch_pri = pri;
2365 2224
2366 2225 /*
2367 2226 * There can be a memory synchronization race between disp_getbest()
2368 2227 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369 2228 * to preempt the current thread to run the enqueued thread while
2370 2229 * disp_getbest() and disp_ratify() are changing the current thread
2371 2230 * to the stolen thread. This may lead to a situation where
2372 2231 * cpu_resched() tries to preempt the wrong thread and the
2373 2232 * stolen thread continues to run on the CPU which has been tagged
2374 2233 * for preemption.
2375 2234 * Later the clock thread gets enqueued but doesn't get to run on the
 2376 2235 	 * CPU, causing the system to hang.
2377 2236 *
2378 2237 * To avoid this, grabbing and dropping the disp_lock (which does
2379 2238 * a memory barrier) is needed to synchronize the execution of
2380 2239 * cpu_resched() with disp_getbest() and disp_ratify() and
2381 2240 * synchronize the memory read and written by cpu_resched(),
2382 2241 * disp_getbest(), and disp_ratify() with each other.
2383 2242 * (see CR#6482861 for more details).
2384 2243 */
2385 2244 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386 2245 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 2246
2388 2247 ASSERT(pri == DISP_PRIO(tp));
2389 2248
2390 2249 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 2250
2392 2251 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2393 2252
2394 2253 /*
2395 2254 * Return with spl high so that swtch() won't need to raise it.
2396 2255 * The disp_lock was dropped by dispdeq().
2397 2256 */
2398 2257
2399 2258 return (tp);
2400 2259 }
2401 2260
2402 2261 /*
2403 2262 * disp_bound_common() - common routine for higher level functions
2404 2263 * that check for bound threads under certain conditions.
2405 2264 * If 'threadlistsafe' is set then there is no need to acquire
2406 2265 * pidlock to stop the thread list from changing (eg, if
2407 2266 * disp_bound_* is called with cpus paused).
2408 2267 */
2409 2268 static int
2410 2269 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 2270 {
2412 2271 int found = 0;
2413 2272 kthread_t *tp;
2414 2273
2415 2274 ASSERT(flag);
2416 2275
2417 2276 if (!threadlistsafe)
2418 2277 mutex_enter(&pidlock);
2419 2278 tp = curthread; /* faster than allthreads */
2420 2279 do {
2421 2280 if (tp->t_state != TS_FREE) {
2422 2281 /*
2423 2282 * If an interrupt thread is busy, but the
2424 2283 * caller doesn't care (i.e. BOUND_INTR is off),
2425 2284 * then just ignore it and continue through.
2426 2285 */
2427 2286 if ((tp->t_flag & T_INTR_THREAD) &&
2428 2287 !(flag & BOUND_INTR))
2429 2288 continue;
2430 2289
2431 2290 /*
2432 2291 * Skip the idle thread for the CPU
2433 2292 * we're about to set offline.
2434 2293 */
2435 2294 if (tp == cp->cpu_idle_thread)
2436 2295 continue;
2437 2296
2438 2297 /*
2439 2298 * Skip the pause thread for the CPU
2440 2299 * we're about to set offline.
2441 2300 */
2442 2301 if (tp == cp->cpu_pause_thread)
2443 2302 continue;
2444 2303
2445 2304 if ((flag & BOUND_CPU) &&
2446 2305 (tp->t_bound_cpu == cp ||
2447 2306 tp->t_bind_cpu == cp->cpu_id ||
2448 2307 tp->t_weakbound_cpu == cp)) {
2449 2308 found = 1;
2450 2309 break;
2451 2310 }
2452 2311
2453 2312 if ((flag & BOUND_PARTITION) &&
2454 2313 (tp->t_cpupart == cp->cpu_part)) {
2455 2314 found = 1;
2456 2315 break;
2457 2316 }
2458 2317 }
2459 2318 } while ((tp = tp->t_next) != curthread && found == 0);
2460 2319 if (!threadlistsafe)
2461 2320 mutex_exit(&pidlock);
2462 2321 return (found);
2463 2322 }
2464 2323
2465 2324 /*
2466 2325 * disp_bound_threads - return nonzero if threads are bound to the processor.
2467 2326 * Called infrequently. Keep this simple.
2468 2327 * Includes threads that are asleep or stopped but not onproc.
2469 2328 */
2470 2329 int
2471 2330 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 2331 {
2473 2332 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 2333 }
2475 2334
2476 2335 /*
2477 2336 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478 2337 * to the given processor, including interrupt threads.
2479 2338 */
2480 2339 int
2481 2340 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 2341 {
2483 2342 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 2343 }
2485 2344
2486 2345 /*
2487 2346 * disp_bound_partition - return nonzero if threads are bound to the same
2488 2347 * partition as the processor.
2489 2348 * Called infrequently. Keep this simple.
2490 2349 * Includes threads that are asleep or stopped but not onproc.
2491 2350 */
2492 2351 int
2493 2352 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 2353 {
2495 2354 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 2355 }
2497 2356
2498 2357 /*
2499 2358 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500 2359 * threads to other CPUs.
2501 2360 */
2502 2361 void
2503 2362 disp_cpu_inactive(cpu_t *cp)
2504 2363 {
2505 2364 kthread_t *tp;
2506 2365 disp_t *dp = cp->cpu_disp;
2507 2366 dispq_t *dq;
2508 2367 pri_t pri;
2509 2368 int wasonq;
2510 2369
2511 2370 disp_lock_enter(&dp->disp_lock);
2512 2371 while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 2372 dq = &dp->disp_q[pri];
2514 2373 tp = dq->dq_first;
2515 2374
2516 2375 /*
2517 2376 * Skip over bound threads.
2518 2377 */
2519 2378 while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 2379 tp = tp->t_link;
2521 2380 }
2522 2381
2523 2382 if (tp == NULL) {
2524 2383 /* disp_max_unbound_pri must be inaccurate, so fix it */
2525 2384 disp_fix_unbound_pri(dp, pri);
2526 2385 continue;
2527 2386 }
2528 2387
2529 2388 wasonq = dispdeq(tp); /* drops disp_lock */
2530 2389 ASSERT(wasonq);
2531 2390 ASSERT(tp->t_weakbound_cpu == NULL);
2532 2391
2533 2392 setbackdq(tp);
2534 2393 /*
2535 2394 * Called from cpu_offline:
2536 2395 *
2537 2396 * cp has already been removed from the list of active cpus
2538 2397 * and tp->t_cpu has been changed so there is no risk of
2539 2398 * tp ending up back on cp.
2540 2399 *
2541 2400 * Called from cpupart_move_cpu:
2542 2401 *
2543 2402 * The cpu has moved to a new cpupart. Any threads that
 2544 2403 	 * were on its dispatch queues before the move remain
2545 2404 * in the old partition and can't run in the new partition.
2546 2405 */
2547 2406 ASSERT(tp->t_cpu != cp);
2548 2407 thread_unlock(tp);
2549 2408
2550 2409 disp_lock_enter(&dp->disp_lock);
2551 2410 }
2552 2411 disp_lock_exit(&dp->disp_lock);
2553 2412 }
2554 2413
2555 2414 /*
2556 2415 * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557 2416 * The hint passed in is used as a starting point so we don't favor
2558 2417 * CPU 0 or any other CPU. The caller should pass in the most recently
2559 2418 * used CPU for the thread.
2560 2419 *
2561 2420 * The lgroup and priority are used to determine the best CPU to run on
2562 2421 * in a NUMA machine. The lgroup specifies which CPUs are closest while
2563 2422 * the thread priority will indicate whether the thread will actually run
2564 2423 * there. To pick the best CPU, the CPUs inside and outside of the given
2565 2424 * lgroup which are running the lowest priority threads are found. The
2566 2425 * remote CPU is chosen only if the thread will not run locally on a CPU
2567 2426 * within the lgroup, but will run on the remote CPU. If the thread
2568 2427 * cannot immediately run on any CPU, the best local CPU will be chosen.
2569 2428 *
2570 2429 * The lpl specified also identifies the cpu partition from which
2571 2430 * disp_lowpri_cpu should select a CPU.
2572 2431 *
2573 2432 * curcpu is used to indicate that disp_lowpri_cpu is being called on
2574 2433 * behalf of the current thread. (curthread is looking for a new cpu)
2575 2434 * In this case, cpu_dispatch_pri for this thread's cpu should be
2576 2435 * ignored.
2577 2436 *
2578 2437 * If a cpu is the target of an offline request then try to avoid it.
2579 2438 *
2580 2439 * This function must be called at either high SPL, or with preemption
2581 2440 * disabled, so that the "hint" CPU cannot be removed from the online
2582 2441 * CPU list while we are traversing it.
2583 2442 */
2584 2443 cpu_t *
2585 2444 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2586 2445 {
2587 2446 cpu_t *bestcpu;
2588 2447 cpu_t *besthomecpu;
2589 2448 cpu_t *cp, *cpstart;
2590 2449
2591 2450 pri_t bestpri;
2592 2451 pri_t cpupri;
2593 2452
2594 2453 klgrpset_t done;
2595 2454 klgrpset_t cur_set;
2596 2455
2597 2456 lpl_t *lpl_iter, *lpl_leaf;
2598 2457 int i;
2599 2458
2600 2459 /*
2601 2460 * Scan for a CPU currently running the lowest priority thread.
2602 2461 * Cannot get cpu_lock here because it is adaptive.
2603 2462 * We do not require lock on CPU list.
2604 2463 */
2605 2464 ASSERT(hint != NULL);
2606 2465 ASSERT(lpl != NULL);
2607 2466 ASSERT(lpl->lpl_ncpu > 0);
2608 2467
2609 2468 /*
2610 2469 * First examine local CPUs. Note that it's possible the hint CPU
 2611 2470 	 * passed in is remote to the specified home lgroup. If our priority
 2612 2471 	 * isn't high enough for us to run immediately at home,
2613 2472 * then examine CPUs remote to our home lgroup.
2614 2473 * We would like to give preference to CPUs closest to "home".
2615 2474 * If we can't find a CPU where we'll run at a given level
2616 2475 * of locality, we expand our search to include the next level.
2617 2476 */
2618 2477 bestcpu = besthomecpu = NULL;
2619 2478 klgrpset_clear(done);
2620 2479 /* start with lpl we were passed */
2621 2480
2622 2481 lpl_iter = lpl;
2623 2482
2624 2483 do {
2625 2484
2626 2485 bestpri = SHRT_MAX;
2627 2486 klgrpset_clear(cur_set);
2628 2487
2629 2488 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 2489 lpl_leaf = lpl_iter->lpl_rset[i];
2631 2490 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 2491 continue;
2633 2492
2634 2493 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 2494
2636 2495 if (hint->cpu_lpl == lpl_leaf)
2637 2496 cp = cpstart = hint;
2638 2497 else
2639 2498 cp = cpstart = lpl_leaf->lpl_cpus;
2640 2499
2641 2500 do {
2642 2501 if (cp == curcpu)
2643 2502 cpupri = -1;
2644 2503 else if (cp == cpu_inmotion)
2645 2504 cpupri = SHRT_MAX;
2646 2505 else
2647 2506 cpupri = cp->cpu_dispatch_pri;
2648 2507 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649 2508 cpupri = cp->cpu_disp->disp_maxrunpri;
2650 2509 if (cp->cpu_chosen_level > cpupri)
2651 2510 cpupri = cp->cpu_chosen_level;
2652 2511 if (cpupri < bestpri) {
2653 2512 if (CPU_IDLING(cpupri)) {
2654 2513 ASSERT((cp->cpu_flags &
2655 2514 CPU_QUIESCED) == 0);
2656 2515 return (cp);
2657 2516 }
2658 2517 bestcpu = cp;
2659 2518 bestpri = cpupri;
2660 2519 }
2661 2520 } while ((cp = cp->cpu_next_lpl) != cpstart);
2662 2521 }
2663 2522
2664 2523 if (bestcpu && (tpri > bestpri)) {
2665 2524 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666 2525 return (bestcpu);
2667 2526 }
2668 2527 if (besthomecpu == NULL)
2669 2528 besthomecpu = bestcpu;
2670 2529 /*
2671 2530 * Add the lgrps we just considered to the "done" set
2672 2531 */
2673 2532 klgrpset_or(done, cur_set);
2674 2533
2675 2534 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 2535
2677 2536 /*
2678 2537 * The specified priority isn't high enough to run immediately
2679 2538 * anywhere, so just return the best CPU from the home lgroup.
2680 2539 */
2681 2540 ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682 2541 return (besthomecpu);
2683 2542 }
2684 2543
2685 2544 /*
2686 2545 * This routine provides the generic idle cpu function for all processors.
2687 2546 * If a processor has some specific code to execute when idle (say, to stop
2688 2547 * the pipeline and save power) then that routine should be defined in the
2689 2548 * processors specific code (module_xx.c) and the global variable idle_cpu
 2690 2549 	 * processor's specific code (module_xx.c) and the global variable idle_cpu
2691 2550 */
2692 2551 static void
2693 2552 generic_idle_cpu(void)
2694 2553 {
2695 2554 }
2696 2555
2697 2556 /*ARGSUSED*/
2698 2557 static void
2699 2558 generic_enq_thread(cpu_t *cpu, int bound)
2700 2559 {
2701 2560 }