5045 use atomic_{inc,dec}_* instead of atomic_add_*
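The substitution this change applies is mechanical: wherever a counter was bumped with atomic_add_*(&x, 1) or dropped with atomic_add_*(&x, -1), the dedicated atomic_inc_* / atomic_dec_* routine is used instead, so the intent reads directly from the call. The sketch below only illustrates the pattern; the counter and function names are made up for illustration and are not part of this file.

	#include <sys/atomic.h>

	static uint32_t example_counter;	/* hypothetical counter, illustration only */

	/* Old style: general-purpose add with a literal delta of 1. */
	void
	bump_old(void)
	{
		atomic_add_32(&example_counter, 1);
	}

	/* New style: dedicated increment routine; same effect, clearer intent. */
	void
	bump_new(void)
	{
		atomic_inc_32(&example_counter);
	}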
--- old/usr/src/uts/common/vm/vm_page.c
+++ new/usr/src/uts/common/vm/vm_page.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 26 /* All Rights Reserved */
27 27
28 28 /*
29 29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 30 * The Regents of the University of California
31 31 * All Rights Reserved
32 32 *
33 33 * University Acknowledgment- Portions of this document are derived from
34 34 * software developed by the University of California, Berkeley, and its
35 35 * contributors.
36 36 */
37 37
38 38 /*
39 39 * VM - physical page management.
40 40 */
41 41
42 42 #include <sys/types.h>
43 43 #include <sys/t_lock.h>
44 44 #include <sys/param.h>
45 45 #include <sys/systm.h>
46 46 #include <sys/errno.h>
47 47 #include <sys/time.h>
48 48 #include <sys/vnode.h>
49 49 #include <sys/vm.h>
50 50 #include <sys/vtrace.h>
51 51 #include <sys/swap.h>
52 52 #include <sys/cmn_err.h>
53 53 #include <sys/tuneable.h>
54 54 #include <sys/sysmacros.h>
55 55 #include <sys/cpuvar.h>
56 56 #include <sys/callb.h>
57 57 #include <sys/debug.h>
58 58 #include <sys/tnf_probe.h>
59 59 #include <sys/condvar_impl.h>
60 60 #include <sys/mem_config.h>
61 61 #include <sys/mem_cage.h>
62 62 #include <sys/kmem.h>
63 63 #include <sys/atomic.h>
64 64 #include <sys/strlog.h>
65 65 #include <sys/mman.h>
66 66 #include <sys/ontrap.h>
67 67 #include <sys/lgrp.h>
68 68 #include <sys/vfs.h>
69 69
70 70 #include <vm/hat.h>
71 71 #include <vm/anon.h>
72 72 #include <vm/page.h>
73 73 #include <vm/seg.h>
74 74 #include <vm/pvn.h>
75 75 #include <vm/seg_kmem.h>
76 76 #include <vm/vm_dep.h>
77 77 #include <sys/vm_usage.h>
78 78 #include <fs/fs_subr.h>
79 79 #include <sys/ddi.h>
80 80 #include <sys/modctl.h>
81 81
82 82 static int nopageage = 0;
83 83
84 84 static pgcnt_t max_page_get; /* max page_get request size in pages */
85 85 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
86 86
87 87 /*
88 88 * freemem_lock protects all freemem variables:
89 89 * availrmem. Also this lock protects the globals which track the
90 90 * availrmem changes for accurate kernel footprint calculation.
91 91 * See below for an explanation of these
92 92 * globals.
93 93 */
94 94 kmutex_t freemem_lock;
95 95 pgcnt_t availrmem;
96 96 pgcnt_t availrmem_initial;
97 97
98 98 /*
99 99 * These globals track availrmem changes to get a more accurate
100 100 * estimate of the kernel size. Historically pp_kernel is used for
101 101 * kernel size and is based on availrmem. But availrmem is adjusted for
102 102 * locked pages in the system not just for kernel locked pages.
103 103 * These new counters will track the pages locked through segvn and
104 104 * by explicit user locking.
105 105 *
106 106 * pages_locked : How many pages are locked because of user specified
107 107 * locking through mlock or plock.
108 108 *
109 109 * pages_useclaim,pages_claimed : These two variables track the
110 110 * claim adjustments because of the protection changes on a segvn segment.
111 111 *
112 112 * All these globals are protected by the same lock which protects availrmem.
113 113 */
114 114 pgcnt_t pages_locked = 0;
115 115 pgcnt_t pages_useclaim = 0;
116 116 pgcnt_t pages_claimed = 0;
117 117
118 118
119 119 /*
120 120 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
121 121 */
122 122 static kmutex_t new_freemem_lock;
123 123 static uint_t freemem_wait; /* someone waiting for freemem */
124 124 static kcondvar_t freemem_cv;
125 125
126 126 /*
127 127 * The logical page free list is maintained as two lists, the 'free'
128 128 * and the 'cache' lists.
129 129 * The free list contains those pages that should be reused first.
130 130 *
131 131 * The implementation of the lists is machine dependent.
132 132 * page_get_freelist(), page_get_cachelist(),
133 133 * page_list_sub(), and page_list_add()
134 134 * form the interface to the machine dependent implementation.
135 135 *
136 136 * Pages with p_free set are on the cache list.
137 137 * Pages with p_free and p_age set are on the free list.
138 138 *
139 139 * A page may be locked while on either list.
140 140 */
141 141
142 142 /*
143 143 * free list accounting stuff.
144 144 *
145 145 *
146 146 * Spread out the value for the number of pages on the
147 147 * page free and page cache lists. If there is just one
148 148 * value, then it must be under just one lock.
149 149 * The lock contention and cache traffic are a real bother.
150 150 *
151 151 * When we acquire and then drop a single pcf lock
152 152 * we can start in the middle of the array of pcf structures.
153 153 * If we acquire more than one pcf lock at a time, we need to
154 154 * start at the front to avoid deadlocking.
155 155 *
156 156 * pcf_count holds the number of pages in each pool.
157 157 *
158 158 * pcf_block is set when page_create_get_something() has asked the
159 159 * PSM page freelist and page cachelist routines without specifying
160 160 * a color and nothing came back. This is used to block anything
161 161 * else from moving pages from one list to the other while the
162 162 * lists are searched again. If a page is freed while pcf_block is
163 163 * set, then pcf_reserve is incremented. pcgs_unblock() takes care
164 164 * of clearing pcf_block, doing the wakeups, etc.
165 165 */
166 166
167 167 #define MAX_PCF_FANOUT NCPU
168 168 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
169 169 static uint_t pcf_fanout_mask = 0;
170 170
171 171 struct pcf {
172 172 kmutex_t pcf_lock; /* protects the structure */
173 173 uint_t pcf_count; /* page count */
174 174 uint_t pcf_wait; /* number of waiters */
175 175 uint_t pcf_block; /* pcgs flag to page_free() */
176 176 uint_t pcf_reserve; /* pages freed after pcf_block set */
177 177 uint_t pcf_fill[10]; /* to line up on the caches */
178 178 };
179 179
180 180 /*
181 181 * PCF_INDEX hash needs to be dynamic (every so often the hash changes where
182 182 * it will hash the cpu to). This is done to prevent a drain condition
183 183 * from happening. This drain condition will occur when pcf_count decrement
184 184 * occurs on cpu A and the increment of pcf_count always occurs on cpu B. An
185 185 * example of this shows up with device interrupts. The dma buffer is allocated
186 186 * by the cpu requesting the IO thus the pcf_count is decremented based on that.
187 187 * When the memory is returned by the interrupt thread, the pcf_count will be
188 188 * incremented based on the cpu servicing the interrupt.
189 189 */
190 190 static struct pcf pcf[MAX_PCF_FANOUT];
191 191 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
192 192 (randtick() >> 24)) & (pcf_fanout_mask))
193 193
194 194 static int pcf_decrement_bucket(pgcnt_t);
195 195 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
196 196
197 197 kmutex_t pcgs_lock; /* serializes page_create_get_ */
198 198 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
199 199 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
200 200 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
201 201
202 202 #ifdef VM_STATS
203 203
204 204 /*
205 205 * No locks, but so what, they are only statistics.
206 206 */
207 207
208 208 static struct page_tcnt {
209 209 int pc_free_cache; /* free's into cache list */
210 210 int pc_free_dontneed; /* free's with dontneed */
211 211 int pc_free_pageout; /* free's from pageout */
212 212 int pc_free_free; /* free's into free list */
213 213 int pc_free_pages; /* free's into large page free list */
214 214 int pc_destroy_pages; /* large page destroy's */
215 215 int pc_get_cache; /* get's from cache list */
216 216 int pc_get_free; /* get's from free list */
217 217 int pc_reclaim; /* reclaim's */
218 218 int pc_abortfree; /* abort's of free pages */
219 219 int pc_find_hit; /* find's that find page */
220 220 int pc_find_miss; /* find's that don't find page */
221 221 int pc_destroy_free; /* # of free pages destroyed */
222 222 #define PC_HASH_CNT (4*PAGE_HASHAVELEN)
223 223 int pc_find_hashlen[PC_HASH_CNT+1];
224 224 int pc_addclaim_pages;
225 225 int pc_subclaim_pages;
226 226 int pc_free_replacement_page[2];
227 227 int pc_try_demote_pages[6];
228 228 int pc_demote_pages[2];
229 229 } pagecnt;
230 230
231 231 uint_t hashin_count;
232 232 uint_t hashin_not_held;
233 233 uint_t hashin_already;
234 234
235 235 uint_t hashout_count;
236 236 uint_t hashout_not_held;
237 237
238 238 uint_t page_create_count;
239 239 uint_t page_create_not_enough;
240 240 uint_t page_create_not_enough_again;
241 241 uint_t page_create_zero;
242 242 uint_t page_create_hashout;
243 243 uint_t page_create_page_lock_failed;
244 244 uint_t page_create_trylock_failed;
245 245 uint_t page_create_found_one;
246 246 uint_t page_create_hashin_failed;
247 247 uint_t page_create_dropped_phm;
248 248
249 249 uint_t page_create_new;
250 250 uint_t page_create_exists;
251 251 uint_t page_create_putbacks;
252 252 uint_t page_create_overshoot;
253 253
254 254 uint_t page_reclaim_zero;
255 255 uint_t page_reclaim_zero_locked;
256 256
257 257 uint_t page_rename_exists;
258 258 uint_t page_rename_count;
259 259
260 260 uint_t page_lookup_cnt[20];
261 261 uint_t page_lookup_nowait_cnt[10];
262 262 uint_t page_find_cnt;
263 263 uint_t page_exists_cnt;
264 264 uint_t page_exists_forreal_cnt;
265 265 uint_t page_lookup_dev_cnt;
266 266 uint_t get_cachelist_cnt;
267 267 uint_t page_create_cnt[10];
268 268 uint_t alloc_pages[9];
269 269 uint_t page_exphcontg[19];
270 270 uint_t page_create_large_cnt[10];
271 271
272 272 /*
273 273 * Collects statistics.
274 274 */
275 275 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
276 276 uint_t mylen = 0; \
277 277 \
278 278 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
279 279 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
280 280 break; \
281 281 } \
282 282 if ((pp) != NULL) \
283 283 pagecnt.pc_find_hit++; \
284 284 else \
285 285 pagecnt.pc_find_miss++; \
286 286 if (mylen > PC_HASH_CNT) \
287 287 mylen = PC_HASH_CNT; \
288 288 pagecnt.pc_find_hashlen[mylen]++; \
289 289 }
290 290
291 291 #else /* VM_STATS */
292 292
293 293 /*
294 294 * Don't collect statistics
295 295 */
296 296 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
297 297 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
298 298 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
299 299 break; \
300 300 } \
301 301 }
302 302
303 303 #endif /* VM_STATS */
304 304
305 305
306 306
307 307 #ifdef DEBUG
308 308 #define MEMSEG_SEARCH_STATS
309 309 #endif
310 310
311 311 #ifdef MEMSEG_SEARCH_STATS
312 312 struct memseg_stats {
313 313 uint_t nsearch;
314 314 uint_t nlastwon;
315 315 uint_t nhashwon;
316 316 uint_t nnotfound;
317 317 } memseg_stats;
318 318
319 319 #define MEMSEG_STAT_INCR(v) \
320 - atomic_add_32(&memseg_stats.v, 1)
320 + atomic_inc_32(&memseg_stats.v)
321 321 #else
322 322 #define MEMSEG_STAT_INCR(x)
323 323 #endif
324 324
325 325 struct memseg *memsegs; /* list of memory segments */
326 326
327 327 /*
328 328 * /etc/system tunable to control large page allocation heuristic.
329 329 *
330 330 * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
331 331 * for large page allocation requests. If a large page is not readily
332 332 * available on the local freelists, we will go through additional effort
333 333 * to create a large page, potentially moving smaller pages around to coalesce
334 334 * larger pages in the local lgroup.
335 335 * Default value of LPAP_DEFAULT will go to remote freelists if large pages
336 336 * are not readily available in the local lgroup.
337 337 */
338 338 enum lpap {
339 339 LPAP_DEFAULT, /* default large page allocation policy */
340 340 LPAP_LOCAL /* local large page allocation policy */
341 341 };
342 342
343 343 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
344 344
345 345 static void page_init_mem_config(void);
346 346 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
347 347 static void page_do_hashout(page_t *);
348 348 static void page_capture_init();
349 349 int page_capture_take_action(page_t *, uint_t, void *);
350 350
351 351 static void page_demote_vp_pages(page_t *);
352 352
353 353
354 354 void
355 355 pcf_init(void)
356 356
357 357 {
358 358 if (boot_ncpus != -1) {
359 359 pcf_fanout = boot_ncpus;
360 360 } else {
361 361 pcf_fanout = max_ncpus;
362 362 }
363 363 #ifdef sun4v
364 364 /*
365 365 * Force at least 4 buckets if possible for sun4v.
366 366 */
367 367 pcf_fanout = MAX(pcf_fanout, 4);
368 368 #endif /* sun4v */
369 369
370 370 /*
371 371 * Round up to the nearest power of 2.
372 372 */
373 373 pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
374 374 if (!ISP2(pcf_fanout)) {
375 375 pcf_fanout = 1 << highbit(pcf_fanout);
376 376
377 377 if (pcf_fanout > MAX_PCF_FANOUT) {
378 378 pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
379 379 }
380 380 }
381 381 pcf_fanout_mask = pcf_fanout - 1;
382 382 }
383 383
384 384 /*
385 385 * vm subsystem related initialization
386 386 */
387 387 void
388 388 vm_init(void)
389 389 {
390 390 boolean_t callb_vm_cpr(void *, int);
391 391
392 392 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
393 393 page_init_mem_config();
394 394 page_retire_init();
395 395 vm_usage_init();
396 396 page_capture_init();
397 397 }
398 398
399 399 /*
400 400 * This function is called at startup and when memory is added or deleted.
401 401 */
402 402 void
403 403 init_pages_pp_maximum()
404 404 {
405 405 static pgcnt_t p_min;
406 406 static pgcnt_t pages_pp_maximum_startup;
407 407 static pgcnt_t avrmem_delta;
408 408 static int init_done;
409 409 static int user_set; /* true if set in /etc/system */
410 410
411 411 if (init_done == 0) {
412 412
413 413 /* If the user specified a value, save it */
414 414 if (pages_pp_maximum != 0) {
415 415 user_set = 1;
416 416 pages_pp_maximum_startup = pages_pp_maximum;
417 417 }
418 418
419 419 /*
420 420 * Setting of pages_pp_maximum is based first time
421 421 * on the value of availrmem just after the start-up
422 422 * allocations. To preserve this relationship at run
423 423 * time, use a delta from availrmem_initial.
424 424 */
425 425 ASSERT(availrmem_initial >= availrmem);
426 426 avrmem_delta = availrmem_initial - availrmem;
427 427
428 428 /* The allowable floor of pages_pp_maximum */
429 429 p_min = tune.t_minarmem + 100;
430 430
431 431 /* Make sure we don't come through here again. */
432 432 init_done = 1;
433 433 }
434 434 /*
435 435 * Determine pages_pp_maximum, the number of currently available
436 436 * pages (availrmem) that can't be `locked'. If not set by
437 437 * the user, we set it to 4% of the currently available memory
438 438 * plus 4MB.
439 439 * But we also insist that it be greater than tune.t_minarmem;
440 440 * otherwise a process could lock down a lot of memory, get swapped
441 441 * out, and never have enough to get swapped back in.
442 442 */
443 443 if (user_set)
444 444 pages_pp_maximum = pages_pp_maximum_startup;
445 445 else
446 446 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
447 447 + btop(4 * 1024 * 1024);
448 448
449 449 if (pages_pp_maximum <= p_min) {
450 450 pages_pp_maximum = p_min;
451 451 }
452 452 }
453 453
454 454 void
455 455 set_max_page_get(pgcnt_t target_total_pages)
456 456 {
457 457 max_page_get = target_total_pages / 2;
458 458 }
459 459
460 460 static pgcnt_t pending_delete;
461 461
462 462 /*ARGSUSED*/
463 463 static void
464 464 page_mem_config_post_add(
465 465 void *arg,
466 466 pgcnt_t delta_pages)
467 467 {
468 468 set_max_page_get(total_pages - pending_delete);
469 469 init_pages_pp_maximum();
470 470 }
471 471
472 472 /*ARGSUSED*/
473 473 static int
474 474 page_mem_config_pre_del(
475 475 void *arg,
476 476 pgcnt_t delta_pages)
477 477 {
478 478 pgcnt_t nv;
479 479
480 480 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
481 481 set_max_page_get(total_pages - nv);
482 482 return (0);
483 483 }
484 484
485 485 /*ARGSUSED*/
486 486 static void
487 487 page_mem_config_post_del(
488 488 void *arg,
489 489 pgcnt_t delta_pages,
490 490 int cancelled)
491 491 {
492 492 pgcnt_t nv;
493 493
494 494 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
495 495 set_max_page_get(total_pages - nv);
496 496 if (!cancelled)
497 497 init_pages_pp_maximum();
498 498 }
499 499
500 500 static kphysm_setup_vector_t page_mem_config_vec = {
501 501 KPHYSM_SETUP_VECTOR_VERSION,
502 502 page_mem_config_post_add,
503 503 page_mem_config_pre_del,
504 504 page_mem_config_post_del,
505 505 };
506 506
507 507 static void
508 508 page_init_mem_config(void)
509 509 {
510 510 int ret;
511 511
512 512 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
513 513 ASSERT(ret == 0);
514 514 }
515 515
516 516 /*
517 517 * Evenly spread out the PCF counters for large free pages
518 518 */
519 519 static void
520 520 page_free_large_ctr(pgcnt_t npages)
521 521 {
522 522 static struct pcf *p = pcf;
523 523 pgcnt_t lump;
524 524
525 525 freemem += npages;
526 526
527 527 lump = roundup(npages, pcf_fanout) / pcf_fanout;
528 528
529 529 while (npages > 0) {
530 530
531 531 ASSERT(!p->pcf_block);
532 532
533 533 if (lump < npages) {
534 534 p->pcf_count += (uint_t)lump;
535 535 npages -= lump;
536 536 } else {
537 537 p->pcf_count += (uint_t)npages;
538 538 npages = 0;
539 539 }
540 540
541 541 ASSERT(!p->pcf_wait);
542 542
543 543 if (++p > &pcf[pcf_fanout - 1])
544 544 p = pcf;
545 545 }
546 546
547 547 ASSERT(npages == 0);
548 548 }
549 549
550 550 /*
551 551 * Add a physical chunk of memory to the system free lists during startup.
552 552 * Platform specific startup() allocates the memory for the page structs.
553 553 *
554 554 * num - number of page structures
555 555 * base - page number (pfn) to be associated with the first page.
556 556 *
557 557 * Since we are doing this during startup (ie. single threaded), we will
558 558 * use shortcut routines to avoid any locking overhead while putting all
559 559 * these pages on the freelists.
560 560 *
561 561 * NOTE: Any changes performed to page_free(), must also be performed to
562 562 * add_physmem() since this is how we initialize all page_t's at
563 563 * boot time.
564 564 */
565 565 void
566 566 add_physmem(
567 567 page_t *pp,
568 568 pgcnt_t num,
569 569 pfn_t pnum)
570 570 {
571 571 page_t *root = NULL;
572 572 uint_t szc = page_num_pagesizes() - 1;
573 573 pgcnt_t large = page_get_pagecnt(szc);
574 574 pgcnt_t cnt = 0;
575 575
576 576 TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
577 577 "add_physmem:pp %p num %lu", pp, num);
578 578
579 579 /*
580 580 * Arbitrarily limit the max page_get request
581 581 * to 1/2 of the page structs we have.
582 582 */
583 583 total_pages += num;
584 584 set_max_page_get(total_pages);
585 585
586 586 PLCNT_MODIFY_MAX(pnum, (long)num);
587 587
588 588 /*
589 589 * The physical space for the pages array
590 590 * representing ram pages has already been
591 591 * allocated. Here we initialize each lock
592 592 * in the page structure, and put each on
593 593 * the free list
594 594 */
595 595 for (; num; pp++, pnum++, num--) {
596 596
597 597 /*
598 598 * this needs to fill in the page number
599 599 * and do any other arch specific initialization
600 600 */
601 601 add_physmem_cb(pp, pnum);
602 602
603 603 pp->p_lckcnt = 0;
604 604 pp->p_cowcnt = 0;
605 605 pp->p_slckcnt = 0;
606 606
607 607 /*
608 608 * Initialize the page lock as unlocked, since nobody
609 609 * can see or access this page yet.
610 610 */
611 611 pp->p_selock = 0;
612 612
613 613 /*
614 614 * Initialize IO lock
615 615 */
616 616 page_iolock_init(pp);
617 617
618 618 /*
619 619 * initialize other fields in the page_t
620 620 */
621 621 PP_SETFREE(pp);
622 622 page_clr_all_props(pp);
623 623 PP_SETAGED(pp);
624 624 pp->p_offset = (u_offset_t)-1;
625 625 pp->p_next = pp;
626 626 pp->p_prev = pp;
627 627
628 628 /*
629 629 * Simple case: System doesn't support large pages.
630 630 */
631 631 if (szc == 0) {
632 632 pp->p_szc = 0;
633 633 page_free_at_startup(pp);
634 634 continue;
635 635 }
636 636
637 637 /*
638 638 * Handle unaligned pages, we collect them up onto
639 639 * the root page until we have a full large page.
640 640 */
641 641 if (!IS_P2ALIGNED(pnum, large)) {
642 642
643 643 /*
644 644 * If not in a large page,
645 645 * just free as small page.
646 646 */
647 647 if (root == NULL) {
648 648 pp->p_szc = 0;
649 649 page_free_at_startup(pp);
650 650 continue;
651 651 }
652 652
653 653 /*
654 654 * Link a constituent page into the large page.
655 655 */
656 656 pp->p_szc = szc;
657 657 page_list_concat(&root, &pp);
658 658
659 659 /*
660 660 * When large page is fully formed, free it.
661 661 */
662 662 if (++cnt == large) {
663 663 page_free_large_ctr(cnt);
664 664 page_list_add_pages(root, PG_LIST_ISINIT);
665 665 root = NULL;
666 666 cnt = 0;
667 667 }
668 668 continue;
669 669 }
670 670
671 671 /*
672 672 * At this point we have a page number which
673 673 * is aligned. We assert that we aren't already
674 674 * in a different large page.
675 675 */
676 676 ASSERT(IS_P2ALIGNED(pnum, large));
677 677 ASSERT(root == NULL && cnt == 0);
678 678
679 679 /*
680 680 * If insufficient number of pages left to form
681 681 * a large page, just free the small page.
682 682 */
683 683 if (num < large) {
684 684 pp->p_szc = 0;
685 685 page_free_at_startup(pp);
686 686 continue;
687 687 }
688 688
689 689 /*
690 690 * Otherwise start a new large page.
691 691 */
692 692 pp->p_szc = szc;
693 693 cnt++;
694 694 root = pp;
695 695 }
696 696 ASSERT(root == NULL && cnt == 0);
697 697 }
698 698
699 699 /*
700 700 * Find a page representing the specified [vp, offset].
701 701 * If we find the page but it is intransit coming in,
702 702 * it will have an "exclusive" lock and we wait for
703 703 * the i/o to complete. A page found on the free list
704 704 * is always reclaimed and then locked. On success, the page
705 705 * is locked, its data is valid and it isn't on the free
706 706 * list, while a NULL is returned if the page doesn't exist.
707 707 */
708 708 page_t *
709 709 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
710 710 {
711 711 return (page_lookup_create(vp, off, se, NULL, NULL, 0));
712 712 }
713 713
714 714 /*
715 715 * Find a page representing the specified [vp, offset].
716 716 * We either return the one we found or, if passed in,
717 717 * create one with identity of [vp, offset] of the
718 718 * pre-allocated page. If we find existing page but it is
719 719 * intransit coming in, it will have an "exclusive" lock
720 720 * and we wait for the i/o to complete. A page found on
721 721 * the free list is always reclaimed and then locked.
722 722 * On success, the page is locked, its data is valid and
723 723 * it isn't on the free list, while a NULL is returned
724 724 * if the page doesn't exist and newpp is NULL.
725 725 */
726 726 page_t *
727 727 page_lookup_create(
728 728 vnode_t *vp,
729 729 u_offset_t off,
730 730 se_t se,
731 731 page_t *newpp,
732 732 spgcnt_t *nrelocp,
733 733 int flags)
734 734 {
735 735 page_t *pp;
736 736 kmutex_t *phm;
737 737 ulong_t index;
738 738 uint_t hash_locked;
739 739 uint_t es;
740 740
741 741 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
742 742 VM_STAT_ADD(page_lookup_cnt[0]);
743 743 ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
744 744
745 745 /*
746 746 * Acquire the appropriate page hash lock since
747 747 * we have to search the hash list. Pages that
748 748 * hash to this list can't change identity while
749 749 * this lock is held.
750 750 */
751 751 hash_locked = 0;
752 752 index = PAGE_HASH_FUNC(vp, off);
753 753 phm = NULL;
754 754 top:
755 755 PAGE_HASH_SEARCH(index, pp, vp, off);
756 756 if (pp != NULL) {
757 757 VM_STAT_ADD(page_lookup_cnt[1]);
758 758 es = (newpp != NULL) ? 1 : 0;
759 759 es |= flags;
760 760 if (!hash_locked) {
761 761 VM_STAT_ADD(page_lookup_cnt[2]);
762 762 if (!page_try_reclaim_lock(pp, se, es)) {
763 763 /*
764 764 * On a miss, acquire the phm. Then
765 765 * next time, page_lock() will be called,
766 766 * causing a wait if the page is busy.
767 767 * just looping with page_trylock() would
768 768 * get pretty boring.
769 769 */
770 770 VM_STAT_ADD(page_lookup_cnt[3]);
771 771 phm = PAGE_HASH_MUTEX(index);
772 772 mutex_enter(phm);
773 773 hash_locked = 1;
774 774 goto top;
775 775 }
776 776 } else {
777 777 VM_STAT_ADD(page_lookup_cnt[4]);
778 778 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
779 779 VM_STAT_ADD(page_lookup_cnt[5]);
780 780 goto top;
781 781 }
782 782 }
783 783
784 784 /*
785 785 * Since `pp' is locked it can not change identity now.
786 786 * Reconfirm we locked the correct page.
787 787 *
788 788 * Both the p_vnode and p_offset *must* be cast volatile
789 789 * to force a reload of their values: The PAGE_HASH_SEARCH
790 790 * macro will have stuffed p_vnode and p_offset into
791 791 * registers before calling page_trylock(); another thread,
792 792 * actually holding the hash lock, could have changed the
793 793 * page's identity in memory, but our registers would not
794 794 * be changed, fooling the reconfirmation. If the hash
795 795 * lock was held during the search, the casting would
796 796 * not be needed.
797 797 */
798 798 VM_STAT_ADD(page_lookup_cnt[6]);
799 799 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
800 800 ((volatile u_offset_t)(pp->p_offset) != off)) {
801 801 VM_STAT_ADD(page_lookup_cnt[7]);
802 802 if (hash_locked) {
803 803 panic("page_lookup_create: lost page %p",
804 804 (void *)pp);
805 805 /*NOTREACHED*/
806 806 }
807 807 page_unlock(pp);
808 808 phm = PAGE_HASH_MUTEX(index);
809 809 mutex_enter(phm);
810 810 hash_locked = 1;
811 811 goto top;
812 812 }
813 813
814 814 /*
815 815 * If page_trylock() was called, then pp may still be on
816 816 * the cachelist (can't be on the free list, it would not
817 817 * have been found in the search). If it is on the
818 818 * cachelist it must be pulled now. To pull the page from
819 819 * the cachelist, it must be exclusively locked.
820 820 *
821 821 * The other big difference between page_trylock() and
822 822 * page_lock(), is that page_lock() will pull the
823 823 * page from whatever free list (the cache list in this
824 824 * case) the page is on. If page_trylock() was used
825 825 * above, then we have to do the reclaim ourselves.
826 826 */
827 827 if ((!hash_locked) && (PP_ISFREE(pp))) {
828 828 ASSERT(PP_ISAGED(pp) == 0);
829 829 VM_STAT_ADD(page_lookup_cnt[8]);
830 830
831 831 /*
832 832 * page_reclaim will ensure that we
833 833 * have this page exclusively
834 834 */
835 835
836 836 if (!page_reclaim(pp, NULL)) {
837 837 /*
838 838 * Page_reclaim dropped whatever lock
839 839 * we held.
840 840 */
841 841 VM_STAT_ADD(page_lookup_cnt[9]);
842 842 phm = PAGE_HASH_MUTEX(index);
843 843 mutex_enter(phm);
844 844 hash_locked = 1;
845 845 goto top;
846 846 } else if (se == SE_SHARED && newpp == NULL) {
847 847 VM_STAT_ADD(page_lookup_cnt[10]);
848 848 page_downgrade(pp);
849 849 }
850 850 }
851 851
852 852 if (hash_locked) {
853 853 mutex_exit(phm);
854 854 }
855 855
856 856 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
857 857 PAGE_EXCL(pp) && nrelocp != NULL) {
858 858 ASSERT(nrelocp != NULL);
859 859 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
860 860 NULL);
861 861 if (*nrelocp > 0) {
862 862 VM_STAT_COND_ADD(*nrelocp == 1,
863 863 page_lookup_cnt[11]);
864 864 VM_STAT_COND_ADD(*nrelocp > 1,
865 865 page_lookup_cnt[12]);
866 866 pp = newpp;
867 867 se = SE_EXCL;
868 868 } else {
869 869 if (se == SE_SHARED) {
870 870 page_downgrade(pp);
871 871 }
872 872 VM_STAT_ADD(page_lookup_cnt[13]);
873 873 }
874 874 } else if (newpp != NULL && nrelocp != NULL) {
875 875 if (PAGE_EXCL(pp) && se == SE_SHARED) {
876 876 page_downgrade(pp);
877 877 }
878 878 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
879 879 page_lookup_cnt[14]);
880 880 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
881 881 page_lookup_cnt[15]);
882 882 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
883 883 page_lookup_cnt[16]);
884 884 } else if (newpp != NULL && PAGE_EXCL(pp)) {
885 885 se = SE_EXCL;
886 886 }
887 887 } else if (!hash_locked) {
888 888 VM_STAT_ADD(page_lookup_cnt[17]);
889 889 phm = PAGE_HASH_MUTEX(index);
890 890 mutex_enter(phm);
891 891 hash_locked = 1;
892 892 goto top;
893 893 } else if (newpp != NULL) {
894 894 /*
895 895 * If we have a preallocated page then
896 896 * insert it now and basically behave like
897 897 * page_create.
898 898 */
899 899 VM_STAT_ADD(page_lookup_cnt[18]);
900 900 /*
901 901 * Since we hold the page hash mutex and
902 902 * just searched for this page, page_hashin
903 903 * had better not fail. If it does, that
904 904 * means some thread did not follow the
905 905 * page hash mutex rules. Panic now and
906 906 * get it over with. As usual, go down
907 907 * holding all the locks.
908 908 */
909 909 ASSERT(MUTEX_HELD(phm));
910 910 if (!page_hashin(newpp, vp, off, phm)) {
911 911 ASSERT(MUTEX_HELD(phm));
912 912 panic("page_lookup_create: hashin failed %p %p %llx %p",
913 913 (void *)newpp, (void *)vp, off, (void *)phm);
914 914 /*NOTREACHED*/
915 915 }
916 916 ASSERT(MUTEX_HELD(phm));
917 917 mutex_exit(phm);
918 918 phm = NULL;
919 919 page_set_props(newpp, P_REF);
920 920 page_io_lock(newpp);
921 921 pp = newpp;
922 922 se = SE_EXCL;
923 923 } else {
924 924 VM_STAT_ADD(page_lookup_cnt[19]);
925 925 mutex_exit(phm);
926 926 }
927 927
928 928 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
929 929
930 930 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
931 931
932 932 return (pp);
933 933 }
934 934
935 935 /*
936 936 * Search the hash list for the page representing the
937 937 * specified [vp, offset] and return it locked. Skip
938 938 * free pages and pages that cannot be locked as requested.
939 939 * Used while attempting to kluster pages.
940 940 */
941 941 page_t *
942 942 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
943 943 {
944 944 page_t *pp;
945 945 kmutex_t *phm;
946 946 ulong_t index;
947 947 uint_t locked;
948 948
949 949 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
950 950 VM_STAT_ADD(page_lookup_nowait_cnt[0]);
951 951
952 952 index = PAGE_HASH_FUNC(vp, off);
953 953 PAGE_HASH_SEARCH(index, pp, vp, off);
954 954 locked = 0;
955 955 if (pp == NULL) {
956 956 top:
957 957 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
958 958 locked = 1;
959 959 phm = PAGE_HASH_MUTEX(index);
960 960 mutex_enter(phm);
961 961 PAGE_HASH_SEARCH(index, pp, vp, off);
962 962 }
963 963
964 964 if (pp == NULL || PP_ISFREE(pp)) {
965 965 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
966 966 pp = NULL;
967 967 } else {
968 968 if (!page_trylock(pp, se)) {
969 969 VM_STAT_ADD(page_lookup_nowait_cnt[3]);
970 970 pp = NULL;
971 971 } else {
972 972 VM_STAT_ADD(page_lookup_nowait_cnt[4]);
973 973 /*
974 974 * See the comment in page_lookup()
975 975 */
976 976 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
977 977 ((u_offset_t)(pp->p_offset) != off)) {
978 978 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
979 979 if (locked) {
980 980 panic("page_lookup_nowait %p",
981 981 (void *)pp);
982 982 /*NOTREACHED*/
983 983 }
984 984 page_unlock(pp);
985 985 goto top;
986 986 }
987 987 if (PP_ISFREE(pp)) {
988 988 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
989 989 page_unlock(pp);
990 990 pp = NULL;
991 991 }
992 992 }
993 993 }
994 994 if (locked) {
995 995 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
996 996 mutex_exit(phm);
997 997 }
998 998
999 999 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
1000 1000
1001 1001 return (pp);
1002 1002 }
1003 1003
1004 1004 /*
1005 1005 * Search the hash list for a page with the specified [vp, off]
1006 1006 * that is known to exist and is already locked. This routine
1007 1007 * is typically used by segment SOFTUNLOCK routines.
1008 1008 */
1009 1009 page_t *
1010 1010 page_find(vnode_t *vp, u_offset_t off)
1011 1011 {
1012 1012 page_t *pp;
1013 1013 kmutex_t *phm;
1014 1014 ulong_t index;
1015 1015
1016 1016 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1017 1017 VM_STAT_ADD(page_find_cnt);
1018 1018
1019 1019 index = PAGE_HASH_FUNC(vp, off);
1020 1020 phm = PAGE_HASH_MUTEX(index);
1021 1021
1022 1022 mutex_enter(phm);
1023 1023 PAGE_HASH_SEARCH(index, pp, vp, off);
1024 1024 mutex_exit(phm);
1025 1025
1026 1026 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1027 1027 return (pp);
1028 1028 }
1029 1029
1030 1030 /*
1031 1031 * Determine whether a page with the specified [vp, off]
1032 1032 * currently exists in the system. Obviously this should
1033 1033 * only be considered as a hint since nothing prevents the
1034 1034 * page from disappearing or appearing immediately after
1035 1035 * the return from this routine. Subsequently, we don't
1036 1036 * even bother to lock the list.
1037 1037 */
1038 1038 page_t *
1039 1039 page_exists(vnode_t *vp, u_offset_t off)
1040 1040 {
1041 1041 page_t *pp;
1042 1042 ulong_t index;
1043 1043
1044 1044 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1045 1045 VM_STAT_ADD(page_exists_cnt);
1046 1046
1047 1047 index = PAGE_HASH_FUNC(vp, off);
1048 1048 PAGE_HASH_SEARCH(index, pp, vp, off);
1049 1049
1050 1050 return (pp);
1051 1051 }
1052 1052
1053 1053 /*
1054 1054 * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
1055 1055 * page_size(szc)) range. If they exist and ppa is not NULL, fill the ppa array
1056 1056 * with these pages locked SHARED. If necessary reclaim pages from
1057 1057 * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1058 1058 *
1059 1059 * If we fail to lock pages still return 1 if pages exist and contiguous.
1060 1060 * But in this case return value is just a hint. ppa array won't be filled.
1061 1061 * Caller should initialize ppa[0] as NULL to distinguish return value.
1062 1062 *
1063 1063 * Returns 0 if pages don't exist or not physically contiguous.
1064 1064 *
1065 1065 * This routine doesn't work for anonymous(swapfs) pages.
1066 1066 */
1067 1067 int
1068 1068 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1069 1069 {
1070 1070 pgcnt_t pages;
1071 1071 pfn_t pfn;
1072 1072 page_t *rootpp;
1073 1073 pgcnt_t i;
1074 1074 pgcnt_t j;
1075 1075 u_offset_t save_off = off;
1076 1076 ulong_t index;
1077 1077 kmutex_t *phm;
1078 1078 page_t *pp;
1079 1079 uint_t pszc;
1080 1080 int loopcnt = 0;
1081 1081
1082 1082 ASSERT(szc != 0);
1083 1083 ASSERT(vp != NULL);
1084 1084 ASSERT(!IS_SWAPFSVP(vp));
1085 1085 ASSERT(!VN_ISKAS(vp));
1086 1086
1087 1087 again:
1088 1088 if (++loopcnt > 3) {
1089 1089 VM_STAT_ADD(page_exphcontg[0]);
1090 1090 return (0);
1091 1091 }
1092 1092
1093 1093 index = PAGE_HASH_FUNC(vp, off);
1094 1094 phm = PAGE_HASH_MUTEX(index);
1095 1095
1096 1096 mutex_enter(phm);
1097 1097 PAGE_HASH_SEARCH(index, pp, vp, off);
1098 1098 mutex_exit(phm);
1099 1099
1100 1100 VM_STAT_ADD(page_exphcontg[1]);
1101 1101
1102 1102 if (pp == NULL) {
1103 1103 VM_STAT_ADD(page_exphcontg[2]);
1104 1104 return (0);
1105 1105 }
1106 1106
1107 1107 pages = page_get_pagecnt(szc);
1108 1108 rootpp = pp;
1109 1109 pfn = rootpp->p_pagenum;
1110 1110
1111 1111 if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1112 1112 VM_STAT_ADD(page_exphcontg[3]);
1113 1113 if (!page_trylock(pp, SE_SHARED)) {
1114 1114 VM_STAT_ADD(page_exphcontg[4]);
1115 1115 return (1);
1116 1116 }
1117 1117 /*
1118 1118 * Also check whether p_pagenum was modified by DR.
1119 1119 */
1120 1120 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1121 1121 pp->p_offset != off || pp->p_pagenum != pfn) {
1122 1122 VM_STAT_ADD(page_exphcontg[5]);
1123 1123 page_unlock(pp);
1124 1124 off = save_off;
1125 1125 goto again;
1126 1126 }
1127 1127 /*
1128 1128 * Since szc was non-zero and the vnode and offset matched after we
1129 1129 * locked the page, it can't become free on us.
1130 1130 */
1131 1131 ASSERT(!PP_ISFREE(pp));
1132 1132 if (!IS_P2ALIGNED(pfn, pages)) {
1133 1133 page_unlock(pp);
1134 1134 return (0);
1135 1135 }
1136 1136 ppa[0] = pp;
1137 1137 pp++;
1138 1138 off += PAGESIZE;
1139 1139 pfn++;
1140 1140 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1141 1141 if (!page_trylock(pp, SE_SHARED)) {
1142 1142 VM_STAT_ADD(page_exphcontg[6]);
1143 1143 pp--;
1144 1144 while (i-- > 0) {
1145 1145 page_unlock(pp);
1146 1146 pp--;
1147 1147 }
1148 1148 ppa[0] = NULL;
1149 1149 return (1);
1150 1150 }
1151 1151 if (pp->p_szc != pszc) {
1152 1152 VM_STAT_ADD(page_exphcontg[7]);
1153 1153 page_unlock(pp);
1154 1154 pp--;
1155 1155 while (i-- > 0) {
1156 1156 page_unlock(pp);
1157 1157 pp--;
1158 1158 }
1159 1159 ppa[0] = NULL;
1160 1160 off = save_off;
1161 1161 goto again;
1162 1162 }
1163 1163 /*
1164 1164 * The szc is the same as for the previously locked pages
1165 1165 * with the right identity. Since this page had the correct
1166 1166 * szc after we locked it, it can't get freed or destroyed
1167 1167 * and therefore must have the expected identity.
1168 1168 */
1169 1169 ASSERT(!PP_ISFREE(pp));
1170 1170 if (pp->p_vnode != vp ||
1171 1171 pp->p_offset != off) {
1172 1172 panic("page_exists_physcontig: "
1173 1173 "large page identity doesn't match");
1174 1174 }
1175 1175 ppa[i] = pp;
1176 1176 ASSERT(pp->p_pagenum == pfn);
1177 1177 }
1178 1178 VM_STAT_ADD(page_exphcontg[8]);
1179 1179 ppa[pages] = NULL;
1180 1180 return (1);
1181 1181 } else if (pszc >= szc) {
1182 1182 VM_STAT_ADD(page_exphcontg[9]);
1183 1183 if (!IS_P2ALIGNED(pfn, pages)) {
1184 1184 return (0);
1185 1185 }
1186 1186 return (1);
1187 1187 }
1188 1188
1189 1189 if (!IS_P2ALIGNED(pfn, pages)) {
1190 1190 VM_STAT_ADD(page_exphcontg[10]);
1191 1191 return (0);
1192 1192 }
1193 1193
1194 1194 if (page_numtomemseg_nolock(pfn) !=
1195 1195 page_numtomemseg_nolock(pfn + pages - 1)) {
1196 1196 VM_STAT_ADD(page_exphcontg[11]);
1197 1197 return (0);
1198 1198 }
1199 1199
1200 1200 /*
1201 1201 * We loop up to 4 times across pages to promote page size.
1202 1202 * We're extra cautious to promote page size atomically with respect
1203 1203 * to everybody else. But we can probably optimize into 1 loop if
1204 1204 * this becomes an issue.
1205 1205 */
1206 1206
1207 1207 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1208 1208 if (!page_trylock(pp, SE_EXCL)) {
1209 1209 VM_STAT_ADD(page_exphcontg[12]);
1210 1210 break;
1211 1211 }
1212 1212 /*
1213 1213 * Check whether p_pagenum was modified by DR.
1214 1214 */
1215 1215 if (pp->p_pagenum != pfn) {
1216 1216 page_unlock(pp);
1217 1217 break;
1218 1218 }
1219 1219 if (pp->p_vnode != vp ||
1220 1220 pp->p_offset != off) {
1221 1221 VM_STAT_ADD(page_exphcontg[13]);
1222 1222 page_unlock(pp);
1223 1223 break;
1224 1224 }
1225 1225 if (pp->p_szc >= szc) {
1226 1226 ASSERT(i == 0);
1227 1227 page_unlock(pp);
1228 1228 off = save_off;
1229 1229 goto again;
1230 1230 }
1231 1231 }
1232 1232
1233 1233 if (i != pages) {
1234 1234 VM_STAT_ADD(page_exphcontg[14]);
1235 1235 --pp;
1236 1236 while (i-- > 0) {
1237 1237 page_unlock(pp);
1238 1238 --pp;
1239 1239 }
1240 1240 return (0);
1241 1241 }
1242 1242
1243 1243 pp = rootpp;
1244 1244 for (i = 0; i < pages; i++, pp++) {
1245 1245 if (PP_ISFREE(pp)) {
1246 1246 VM_STAT_ADD(page_exphcontg[15]);
1247 1247 ASSERT(!PP_ISAGED(pp));
1248 1248 ASSERT(pp->p_szc == 0);
1249 1249 if (!page_reclaim(pp, NULL)) {
1250 1250 break;
1251 1251 }
1252 1252 } else {
1253 1253 ASSERT(pp->p_szc < szc);
1254 1254 VM_STAT_ADD(page_exphcontg[16]);
1255 1255 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1256 1256 }
1257 1257 }
1258 1258 if (i < pages) {
1259 1259 VM_STAT_ADD(page_exphcontg[17]);
1260 1260 /*
1261 1261 * Drop the rest of the locks and return because this page
1262 1262 * must already have been reallocated anyway.
1263 1263 * must be already reallocated anyway.
1264 1264 */
1265 1265 pp = rootpp;
1266 1266 for (j = 0; j < pages; j++, pp++) {
1267 1267 if (j != i) {
1268 1268 page_unlock(pp);
1269 1269 }
1270 1270 }
1271 1271 return (0);
1272 1272 }
1273 1273
1274 1274 off = save_off;
1275 1275 pp = rootpp;
1276 1276 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1277 1277 ASSERT(PAGE_EXCL(pp));
1278 1278 ASSERT(!PP_ISFREE(pp));
1279 1279 ASSERT(!hat_page_is_mapped(pp));
1280 1280 ASSERT(pp->p_vnode == vp);
1281 1281 ASSERT(pp->p_offset == off);
1282 1282 pp->p_szc = szc;
1283 1283 }
1284 1284 pp = rootpp;
1285 1285 for (i = 0; i < pages; i++, pp++) {
1286 1286 if (ppa == NULL) {
1287 1287 page_unlock(pp);
1288 1288 } else {
1289 1289 ppa[i] = pp;
1290 1290 page_downgrade(ppa[i]);
1291 1291 }
1292 1292 }
1293 1293 if (ppa != NULL) {
1294 1294 ppa[pages] = NULL;
1295 1295 }
1296 1296 VM_STAT_ADD(page_exphcontg[18]);
1297 1297 ASSERT(vp->v_pages != NULL);
1298 1298 return (1);
1299 1299 }
1300 1300
1301 1301 /*
1302 1302 * Determine whether a page with the specified [vp, off]
1303 1303 * currently exists in the system and if so return its
1304 1304 * size code. Obviously this should only be considered as
1305 1305 * a hint since nothing prevents the page from disappearing
1306 1306 * or appearing immediately after the return from this routine.
1307 1307 */
1308 1308 int
1309 1309 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1310 1310 {
1311 1311 page_t *pp;
1312 1312 kmutex_t *phm;
1313 1313 ulong_t index;
1314 1314 int rc = 0;
1315 1315
1316 1316 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1317 1317 ASSERT(szc != NULL);
1318 1318 VM_STAT_ADD(page_exists_forreal_cnt);
1319 1319
1320 1320 index = PAGE_HASH_FUNC(vp, off);
1321 1321 phm = PAGE_HASH_MUTEX(index);
1322 1322
1323 1323 mutex_enter(phm);
1324 1324 PAGE_HASH_SEARCH(index, pp, vp, off);
1325 1325 if (pp != NULL) {
1326 1326 *szc = pp->p_szc;
1327 1327 rc = 1;
1328 1328 }
1329 1329 mutex_exit(phm);
1330 1330 return (rc);
1331 1331 }
1332 1332
1333 1333 /* wakeup threads waiting for pages in page_create_get_something() */
1334 1334 void
1335 1335 wakeup_pcgs(void)
1336 1336 {
1337 1337 if (!CV_HAS_WAITERS(&pcgs_cv))
1338 1338 return;
1339 1339 cv_broadcast(&pcgs_cv);
1340 1340 }
1341 1341
1342 1342 /*
1343 1343 * 'freemem' is used all over the kernel as an indication of how many
1344 1344 * pages are free (either on the cache list or on the free page list)
1345 1345 * in the system. In very few places is a really accurate 'freemem'
1346 1346 * needed. To avoid contention on the lock protecting the
1347 1347 * single freemem, it was spread out into NCPU buckets. Set_freemem
1348 1348 * sets freemem to the total of all NCPU buckets. It is called from
1349 1349 * clock() on each TICK.
1350 1350 */
1351 1351 void
1352 1352 set_freemem()
1353 1353 {
1354 1354 struct pcf *p;
1355 1355 ulong_t t;
1356 1356 uint_t i;
1357 1357
1358 1358 t = 0;
1359 1359 p = pcf;
1360 1360 for (i = 0; i < pcf_fanout; i++) {
1361 1361 t += p->pcf_count;
1362 1362 p++;
1363 1363 }
1364 1364 freemem = t;
1365 1365
1366 1366 /*
1367 1367 * Don't worry about grabbing mutex. It's not that
1368 1368 * critical if we miss a tick or two. This is
1369 1369 * where we wakeup possible delayers in
1370 1370 * page_create_get_something().
1371 1371 */
1372 1372 wakeup_pcgs();
1373 1373 }
1374 1374
1375 1375 ulong_t
1376 1376 get_freemem()
1377 1377 {
1378 1378 struct pcf *p;
1379 1379 ulong_t t;
1380 1380 uint_t i;
1381 1381
1382 1382 t = 0;
1383 1383 p = pcf;
1384 1384 for (i = 0; i < pcf_fanout; i++) {
1385 1385 t += p->pcf_count;
1386 1386 p++;
1387 1387 }
1388 1388 /*
1389 1389 * We just calculated it, might as well set it.
1390 1390 */
1391 1391 freemem = t;
1392 1392 return (t);
1393 1393 }
1394 1394
1395 1395 /*
1396 1396 * Acquire all of the page cache & free (pcf) locks.
1397 1397 */
1398 1398 void
1399 1399 pcf_acquire_all()
1400 1400 {
1401 1401 struct pcf *p;
1402 1402 uint_t i;
1403 1403
1404 1404 p = pcf;
1405 1405 for (i = 0; i < pcf_fanout; i++) {
1406 1406 mutex_enter(&p->pcf_lock);
1407 1407 p++;
1408 1408 }
1409 1409 }
1410 1410
1411 1411 /*
1412 1412 * Release all the pcf_locks.
1413 1413 */
1414 1414 void
1415 1415 pcf_release_all()
1416 1416 {
1417 1417 struct pcf *p;
1418 1418 uint_t i;
1419 1419
1420 1420 p = pcf;
1421 1421 for (i = 0; i < pcf_fanout; i++) {
1422 1422 mutex_exit(&p->pcf_lock);
1423 1423 p++;
1424 1424 }
1425 1425 }
1426 1426
1427 1427 /*
1428 1428 * Inform the VM system that we need some pages freed up.
1429 1429 * Calls must be symmetric, e.g.:
1430 1430 *
1431 1431 * page_needfree(100);
1432 1432 * wait a bit;
1433 1433 * page_needfree(-100);
1434 1434 */
1435 1435 void
1436 1436 page_needfree(spgcnt_t npages)
1437 1437 {
1438 1438 mutex_enter(&new_freemem_lock);
1439 1439 needfree += npages;
1440 1440 mutex_exit(&new_freemem_lock);
1441 1441 }
1442 1442
1443 1443 /*
1444 1444 * Throttle for page_create(): try to prevent freemem from dropping
1445 1445 * below throttlefree. We can't provide a 100% guarantee because
1446 1446 * KM_NOSLEEP allocations, page_reclaim(), and various other things
1447 1447 * nibble away at the freelist. However, we can block all PG_WAIT
1448 1448 * allocations until memory becomes available. The motivation is
1449 1449 * that several things can fall apart when there's no free memory:
1450 1450 *
1451 1451 * (1) If pageout() needs memory to push a page, the system deadlocks.
1452 1452 *
1453 1453 * (2) By (broken) specification, timeout(9F) can neither fail nor
1454 1454 * block, so it has no choice but to panic the system if it
1455 1455 * cannot allocate a callout structure.
1456 1456 *
1457 1457 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1458 1458 * it panics if it cannot allocate a callback structure.
1459 1459 *
1460 1460 * (4) Untold numbers of third-party drivers have not yet been hardened
1461 1461 * against KM_NOSLEEP and/or allocb() failures; they simply assume
1462 1462 * success and panic the system with a data fault on failure.
1463 1463 * (The long-term solution to this particular problem is to ship
1464 1464 * hostile fault-injecting DEBUG kernels with the DDK.)
1465 1465 *
1466 1466 * It is theoretically impossible to guarantee success of non-blocking
1467 1467 * allocations, but in practice, this throttle is very hard to break.
1468 1468 */
1469 1469 static int
1470 1470 page_create_throttle(pgcnt_t npages, int flags)
1471 1471 {
1472 1472 ulong_t fm;
1473 1473 uint_t i;
1474 1474 pgcnt_t tf; /* effective value of throttlefree */
1475 1475
1476 1476 /*
1477 1477 * Normal priority allocations.
1478 1478 */
1479 1479 if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1480 1480 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1481 1481 return (freemem >= npages + throttlefree);
1482 1482 }
1483 1483
1484 1484 /*
1485 1485 * Never deny pages when:
1486 1486 * - it's a thread that cannot block [NOMEMWAIT()]
1487 1487 * - the allocation cannot block and must not fail
1488 1488 * - the allocation cannot block and is pageout dispensated
1489 1489 */
1490 1490 if (NOMEMWAIT() ||
1491 1491 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1492 1492 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1493 1493 return (1);
1494 1494
1495 1495 /*
1496 1496 * If the allocation can't block, we look favorably upon it
1497 1497 * unless we're below pageout_reserve. In that case we fail
1498 1498 * the allocation because we want to make sure there are a few
1499 1499 * pages available for pageout.
1500 1500 */
1501 1501 if ((flags & PG_WAIT) == 0)
1502 1502 return (freemem >= npages + pageout_reserve);
1503 1503
1504 1504 /* Calculate the effective throttlefree value */
1505 1505 tf = throttlefree -
1506 1506 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1507 1507
1508 1508 cv_signal(&proc_pageout->p_cv);
1509 1509
1510 1510 for (;;) {
1511 1511 fm = 0;
1512 1512 pcf_acquire_all();
1513 1513 mutex_enter(&new_freemem_lock);
1514 1514 for (i = 0; i < pcf_fanout; i++) {
1515 1515 fm += pcf[i].pcf_count;
1516 1516 pcf[i].pcf_wait++;
1517 1517 mutex_exit(&pcf[i].pcf_lock);
1518 1518 }
1519 1519 freemem = fm;
1520 1520 if (freemem >= npages + tf) {
1521 1521 mutex_exit(&new_freemem_lock);
1522 1522 break;
1523 1523 }
1524 1524 needfree += npages;
1525 1525 freemem_wait++;
1526 1526 cv_wait(&freemem_cv, &new_freemem_lock);
1527 1527 freemem_wait--;
1528 1528 needfree -= npages;
1529 1529 mutex_exit(&new_freemem_lock);
1530 1530 }
1531 1531 return (1);
1532 1532 }
1533 1533
1534 1534 /*
1535 1535 * page_create_wait() is called to either coalesce pages from the
1536 1536 * different pcf buckets or to wait because there simply are not
1537 1537 * enough pages to satisfy the caller's request.
1538 1538 *
1539 1539 * Sadly, this is called from platform/vm/vm_machdep.c
1540 1540 */
1541 1541 int
1542 1542 page_create_wait(pgcnt_t npages, uint_t flags)
1543 1543 {
1544 1544 pgcnt_t total;
1545 1545 uint_t i;
1546 1546 struct pcf *p;
1547 1547
1548 1548 /*
1549 1549 * Wait until there are enough free pages to satisfy our
1550 1550 * entire request.
1551 1551 * We set needfree += npages before prodding pageout, to make sure
1552 1552 * it does real work when npages > lotsfree > freemem.
1553 1553 */
1554 1554 VM_STAT_ADD(page_create_not_enough);
1555 1555
1556 1556 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1557 1557 checkagain:
1558 1558 if ((flags & PG_NORELOC) &&
1559 1559 kcage_freemem < kcage_throttlefree + npages)
1560 1560 (void) kcage_create_throttle(npages, flags);
1561 1561
1562 1562 if (freemem < npages + throttlefree)
1563 1563 if (!page_create_throttle(npages, flags))
1564 1564 return (0);
1565 1565
1566 1566 if (pcf_decrement_bucket(npages) ||
1567 1567 pcf_decrement_multiple(&total, npages, 0))
1568 1568 return (1);
1569 1569
1570 1570 /*
1571 1571 * All of the pcf locks are held, there are not enough pages
1572 1572 * to satisfy the request (npages < total).
1573 1573 * Be sure to acquire the new_freemem_lock before dropping
1574 1574 * the pcf locks. This prevents dropping wakeups in page_free().
1575 1575 * The order is always pcf_lock then new_freemem_lock.
1576 1576 *
1577 1577 * Since we hold all the pcf locks, it is a good time to set freemem.
1578 1578 *
1579 1579 * If the caller does not want to wait, return now.
1580 1580 * Else turn the pageout daemon loose to find something
1581 1581 * and wait till it does.
1582 1582 *
1583 1583 */
1584 1584 freemem = total;
1585 1585
1586 1586 if ((flags & PG_WAIT) == 0) {
1587 1587 pcf_release_all();
1588 1588
1589 1589 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1590 1590 "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1591 1591 return (0);
1592 1592 }
1593 1593
1594 1594 ASSERT(proc_pageout != NULL);
1595 1595 cv_signal(&proc_pageout->p_cv);
1596 1596
1597 1597 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1598 1598 "page_create_sleep_start: freemem %ld needfree %ld",
1599 1599 freemem, needfree);
1600 1600
1601 1601 /*
1602 1602 * We are going to wait.
1603 1603 * We currently hold all of the pcf_locks,
1604 1604 * get the new_freemem_lock (it protects freemem_wait),
1605 1605 * before dropping the pcf_locks.
1606 1606 */
1607 1607 mutex_enter(&new_freemem_lock);
1608 1608
1609 1609 p = pcf;
1610 1610 for (i = 0; i < pcf_fanout; i++) {
1611 1611 p->pcf_wait++;
1612 1612 mutex_exit(&p->pcf_lock);
1613 1613 p++;
1614 1614 }
1615 1615
1616 1616 needfree += npages;
1617 1617 freemem_wait++;
1618 1618
1619 1619 cv_wait(&freemem_cv, &new_freemem_lock);
1620 1620
1621 1621 freemem_wait--;
1622 1622 needfree -= npages;
1623 1623
1624 1624 mutex_exit(&new_freemem_lock);
1625 1625
1626 1626 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1627 1627 "page_create_sleep_end: freemem %ld needfree %ld",
1628 1628 freemem, needfree);
1629 1629
1630 1630 VM_STAT_ADD(page_create_not_enough_again);
1631 1631 goto checkagain;
1632 1632 }
1633 1633 /*
1634 1634 * A routine to do the opposite of page_create_wait().
1635 1635 */
1636 1636 void
1637 1637 page_create_putback(spgcnt_t npages)
1638 1638 {
1639 1639 struct pcf *p;
1640 1640 pgcnt_t lump;
1641 1641 uint_t *which;
1642 1642
1643 1643 /*
1644 1644 * When a contiguous lump is broken up, we have to
1645 1645 * deal with lots of pages (min 64), so let's spread
1646 1646 * the wealth around.
1647 1647 */
1648 1648 lump = roundup(npages, pcf_fanout) / pcf_fanout;
1649 1649 freemem += npages;
1650 1650
1651 1651 for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1652 1652 which = &p->pcf_count;
1653 1653
1654 1654 mutex_enter(&p->pcf_lock);
1655 1655
1656 1656 if (p->pcf_block) {
1657 1657 which = &p->pcf_reserve;
1658 1658 }
1659 1659
1660 1660 if (lump < npages) {
1661 1661 *which += (uint_t)lump;
1662 1662 npages -= lump;
1663 1663 } else {
1664 1664 *which += (uint_t)npages;
1665 1665 npages = 0;
1666 1666 }
1667 1667
1668 1668 if (p->pcf_wait) {
1669 1669 mutex_enter(&new_freemem_lock);
1670 1670 /*
1671 1671 * Check to see if some other thread
1672 1672 * is actually waiting. Another bucket
1673 1673 * may have woken it up by now. If there
1674 1674 * are no waiters, then set our pcf_wait
1675 1675 * count to zero to avoid coming in here
1676 1676 * next time.
1677 1677 */
1678 1678 if (freemem_wait) {
1679 1679 if (npages > 1) {
1680 1680 cv_broadcast(&freemem_cv);
1681 1681 } else {
1682 1682 cv_signal(&freemem_cv);
1683 1683 }
1684 1684 p->pcf_wait--;
1685 1685 } else {
1686 1686 p->pcf_wait = 0;
1687 1687 }
1688 1688 mutex_exit(&new_freemem_lock);
1689 1689 }
1690 1690 mutex_exit(&p->pcf_lock);
1691 1691 }
1692 1692 ASSERT(npages == 0);
1693 1693 }
1694 1694
1695 1695 /*
1696 1696 * A helper routine for page_create_get_something.
1697 1697 * The indenting got too deep down there.
1698 1698 * Unblock the pcf counters. Any pages freed after
1699 1699 * pcf_block got set are moved to pcf_count and
1700 1700 * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1701 1701 */
1702 1702 static void
1703 1703 pcgs_unblock(void)
1704 1704 {
1705 1705 int i;
1706 1706 struct pcf *p;
1707 1707
1708 1708 /* Update freemem while we're here. */
1709 1709 freemem = 0;
1710 1710 p = pcf;
1711 1711 for (i = 0; i < pcf_fanout; i++) {
1712 1712 mutex_enter(&p->pcf_lock);
1713 1713 ASSERT(p->pcf_count == 0);
1714 1714 p->pcf_count = p->pcf_reserve;
1715 1715 p->pcf_block = 0;
1716 1716 freemem += p->pcf_count;
1717 1717 if (p->pcf_wait) {
1718 1718 mutex_enter(&new_freemem_lock);
1719 1719 if (freemem_wait) {
1720 1720 if (p->pcf_reserve > 1) {
1721 1721 cv_broadcast(&freemem_cv);
1722 1722 p->pcf_wait = 0;
1723 1723 } else {
1724 1724 cv_signal(&freemem_cv);
1725 1725 p->pcf_wait--;
1726 1726 }
1727 1727 } else {
1728 1728 p->pcf_wait = 0;
1729 1729 }
1730 1730 mutex_exit(&new_freemem_lock);
1731 1731 }
1732 1732 p->pcf_reserve = 0;
1733 1733 mutex_exit(&p->pcf_lock);
1734 1734 p++;
1735 1735 }
1736 1736 }
1737 1737
1738 1738 /*
1739 1739 * Called from page_create_va() when both the cache and free lists
1740 1740 * have been checked once.
1741 1741 *
1742 1742 * Either returns a page or panics since the accounting was done
1743 1743 * way before we got here.
1744 1744 *
1745 1745 * We don't come here often, so leave the accounting on permanently.
1746 1746 */
1747 1747
1748 1748 #define MAX_PCGS 100
1749 1749
1750 1750 #ifdef DEBUG
1751 1751 #define PCGS_TRIES 100
1752 1752 #else /* DEBUG */
1753 1753 #define PCGS_TRIES 10
1754 1754 #endif /* DEBUG */
1755 1755
1756 1756 #ifdef VM_STATS
1757 1757 uint_t pcgs_counts[PCGS_TRIES];
1758 1758 uint_t pcgs_too_many;
1759 1759 uint_t pcgs_entered;
1760 1760 uint_t pcgs_entered_noreloc;
1761 1761 uint_t pcgs_locked;
1762 1762 uint_t pcgs_cagelocked;
1763 1763 #endif /* VM_STATS */
1764 1764
1765 1765 static page_t *
1766 1766 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1767 1767 caddr_t vaddr, uint_t flags)
1768 1768 {
1769 1769 uint_t count;
1770 1770 page_t *pp;
1771 1771 uint_t locked, i;
1772 1772 struct pcf *p;
1773 1773 lgrp_t *lgrp;
1774 1774 int cagelocked = 0;
1775 1775
1776 1776 VM_STAT_ADD(pcgs_entered);
1777 1777
1778 1778 /*
1779 1779 * Tap any reserve freelists: if we fail now, we'll die
1780 1780 * since the page(s) we're looking for have already been
1781 1781 * accounted for.
1782 1782 */
1783 1783 flags |= PG_PANIC;
1784 1784
1785 1785 if ((flags & PG_NORELOC) != 0) {
1786 1786 VM_STAT_ADD(pcgs_entered_noreloc);
1787 1787 /*
1788 1788 * Requests for free pages from critical threads
1789 1789 * such as pageout still won't throttle here, but
1790 1790 * we must try again, to give the cageout thread
1791 1791 * another chance to catch up. Since we already
1792 1792 * accounted for the pages, we had better get them
1793 1793 * this time.
1794 1794 *
1795 1795 * N.B. All non-critical threads acquire the pcgs_cagelock
1796 1796 * to serialize access to the freelists. This implements a
1797 1797  * turnstile-type synchronization to avoid starvation of
1798 1798 * critical requests for PG_NORELOC memory by non-critical
1799 1799 * threads: all non-critical threads must acquire a 'ticket'
1800 1800 * before passing through, which entails making sure
1801 1801 * kcage_freemem won't fall below minfree prior to grabbing
1802 1802 * pages from the freelists.
1803 1803 */
1804 1804 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1805 1805 mutex_enter(&pcgs_cagelock);
1806 1806 cagelocked = 1;
1807 1807 VM_STAT_ADD(pcgs_cagelocked);
1808 1808 }
1809 1809 }
1810 1810
1811 1811 /*
1812 1812 * Time to get serious.
1813 1813 * We failed to get a `correctly colored' page from both the
1814 1814 * free and cache lists.
1815 1815 * We escalate in stage.
1816 1816 *
1817 1817  * First try both lists without worrying about color.
1818 1818 *
1819 1819 * Then, grab all page accounting locks (ie. pcf[]) and
1820 1820 * steal any pages that they have and set the pcf_block flag to
1821 1821 * stop deletions from the lists. This will help because
1822 1822 * a page can get added to the free list while we are looking
1823 1823 * at the cache list, then another page could be added to the cache
1824 1824 * list allowing the page on the free list to be removed as we
1825 1825 * move from looking at the cache list to the free list. This
1826 1826 * could happen over and over. We would never find the page
1827 1827 * we have accounted for.
1828 1828 *
1829 1829 * Noreloc pages are a subset of the global (relocatable) page pool.
1830 1830 * They are not tracked separately in the pcf bins, so it is
1831 1831 * impossible to know when doing pcf accounting if the available
1832 1832 * page(s) are noreloc pages or not. When looking for a noreloc page
1833 1833 * it is quite easy to end up here even if the global (relocatable)
1834 1834 * page pool has plenty of free pages but the noreloc pool is empty.
1835 1835 *
1836 1836 * When the noreloc pool is empty (or low), additional noreloc pages
1837 1837 * are created by converting pages from the global page pool. This
1838 1838 * process will stall during pcf accounting if the pcf bins are
1839 1839 * already locked. Such is the case when a noreloc allocation is
1840 1840 * looping here in page_create_get_something waiting for more noreloc
1841 1841 * pages to appear.
1842 1842 *
1843 1843 * Short of adding a new field to the pcf bins to accurately track
1844 1844 * the number of free noreloc pages, we instead do not grab the
1845 1845 * pcgs_lock, do not set the pcf blocks and do not timeout when
1846 1846 * allocating a noreloc page. This allows noreloc allocations to
1847 1847 * loop without blocking global page pool allocations.
1848 1848 *
1849 1849 * NOTE: the behaviour of page_create_get_something has not changed
1850 1850 * for the case of global page pool allocations.
1851 1851 */
1852 1852
1853 1853 flags &= ~PG_MATCH_COLOR;
1854 1854 locked = 0;
1855 1855 #if defined(__i386) || defined(__amd64)
1856 1856 flags = page_create_update_flags_x86(flags);
1857 1857 #endif
1858 1858
1859 1859 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1860 1860
1861 1861 for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1862 1862 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1863 1863 flags, lgrp);
1864 1864 if (pp == NULL) {
1865 1865 pp = page_get_cachelist(vp, off, seg, vaddr,
1866 1866 flags, lgrp);
1867 1867 }
1868 1868 if (pp == NULL) {
1869 1869 /*
1870 1870 * Serialize. Don't fight with other pcgs().
1871 1871 */
1872 1872 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1873 1873 mutex_enter(&pcgs_lock);
1874 1874 VM_STAT_ADD(pcgs_locked);
1875 1875 locked = 1;
1876 1876 p = pcf;
1877 1877 for (i = 0; i < pcf_fanout; i++) {
1878 1878 mutex_enter(&p->pcf_lock);
1879 1879 ASSERT(p->pcf_block == 0);
1880 1880 p->pcf_block = 1;
1881 1881 p->pcf_reserve = p->pcf_count;
1882 1882 p->pcf_count = 0;
1883 1883 mutex_exit(&p->pcf_lock);
1884 1884 p++;
1885 1885 }
1886 1886 freemem = 0;
1887 1887 }
1888 1888
1889 1889 if (count) {
1890 1890 /*
1891 1891 * Since page_free() puts pages on
1892 1892 * a list then accounts for it, we
1893 1893 * just have to wait for page_free()
1894 1894 * to unlock any page it was working
1895 1895 * with. The page_lock()-page_reclaim()
1896 1896 * path falls in the same boat.
1897 1897 *
1898 1898 * We don't need to check on the
1899 1899 * PG_WAIT flag, we have already
1900 1900 * accounted for the page we are
1901 1901 * looking for in page_create_va().
1902 1902 *
1903 1903 * We just wait a moment to let any
1904 1904 * locked pages on the lists free up,
1905 1905 * then continue around and try again.
1906 1906 *
1907 1907 * Will be awakened by set_freemem().
1908 1908 */
1909 1909 mutex_enter(&pcgs_wait_lock);
1910 1910 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1911 1911 mutex_exit(&pcgs_wait_lock);
1912 1912 }
1913 1913 } else {
1914 1914 #ifdef VM_STATS
1915 1915 if (count >= PCGS_TRIES) {
1916 1916 VM_STAT_ADD(pcgs_too_many);
1917 1917 } else {
1918 1918 VM_STAT_ADD(pcgs_counts[count]);
1919 1919 }
1920 1920 #endif
1921 1921 if (locked) {
1922 1922 pcgs_unblock();
1923 1923 mutex_exit(&pcgs_lock);
1924 1924 }
1925 1925 if (cagelocked)
1926 1926 mutex_exit(&pcgs_cagelock);
1927 1927 return (pp);
1928 1928 }
1929 1929 }
1930 1930 /*
1931 1931 * we go down holding the pcf locks.
1932 1932 */
1933 1933 panic("no %spage found %d",
1934 1934 ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1935 1935 /*NOTREACHED*/
1936 1936 }
1937 1937
1938 1938 /*
1939 1939 * Create enough pages for "bytes" worth of data starting at
1940 1940 * "off" in "vp".
1941 1941 *
1942 1942  * Where flags must be one of:
1943 1943 *
1944 1944 * PG_EXCL: Exclusive create (fail if any page already
1945 1945 * exists in the page cache) which does not
1946 1946 * wait for memory to become available.
1947 1947 *
1948 1948 * PG_WAIT: Non-exclusive create which can wait for
1949 1949 * memory to become available.
1950 1950 *
1951 1951 * PG_PHYSCONTIG: Allocate physically contiguous pages.
1952 1952 * (Not Supported)
1953 1953 *
1954 1954 * A doubly linked list of pages is returned to the caller. Each page
1955 1955 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1956 1956 * lock.
1957 1957 *
1958 1958 * Unable to change the parameters to page_create() in a minor release,
1959 1959 * we renamed page_create() to page_create_va(), changed all known calls
1960 1960 * from page_create() to page_create_va(), and created this wrapper.
1961 1961 *
1962 1962 * Upon a major release, we should break compatibility by deleting this
1963 1963 * wrapper, and replacing all the strings "page_create_va", with "page_create".
1964 1964 *
1965 1965 * NOTE: There is a copy of this interface as page_create_io() in
1966 1966 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1967 1967 * there.
1968 1968 */
1969 1969 page_t *
1970 1970 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1971 1971 {
1972 1972 caddr_t random_vaddr;
1973 1973 struct seg kseg;
1974 1974
1975 1975 #ifdef DEBUG
1976 1976 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1977 1977 (void *)caller());
1978 1978 #endif
1979 1979
1980 1980 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1981 1981 (uintptr_t)(off >> PAGESHIFT));
1982 1982 kseg.s_as = &kas;
1983 1983
1984 1984 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1985 1985 }
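
As a hedged illustration of the interface described above (not code from this file), a caller that wants brand-new, zeroed pages for a range of a vnode might look roughly like the sketch below. sample_create_pages() is hypothetical; pagezero() and page_downgrade() are existing VM routines used here only for illustration.

	static int
	sample_create_pages(vnode_t *vp, u_offset_t off, size_t len,
	    struct seg *seg, caddr_t vaddr)
	{
		page_t *plist, *pp;

		plist = page_create_va(vp, off, len, PG_EXCL | PG_WAIT, seg, vaddr);
		if (plist == NULL)
			return (EEXIST);	/* some page in the range already exists */

		/* each page comes back holding both p_selock (EXCL) and the i/o lock */
		while ((pp = plist) != NULL) {
			page_sub(&plist, pp);
			pagezero(pp, 0, PAGESIZE);
			page_io_unlock(pp);
			page_downgrade(pp);	/* keep a shared lock; page_unlock() also works */
		}
		return (0);
	}
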
1986 1986
1987 1987 #ifdef DEBUG
1988 1988 uint32_t pg_alloc_pgs_mtbf = 0;
1989 1989 #endif
1990 1990
1991 1991 /*
1992 1992 * Used for large page support. It will attempt to allocate
1993 1993 * a large page(s) off the freelist.
1994 1994 *
1995 1995  * Returns non-zero on failure.
1996 1996 */
1997 1997 int
1998 1998 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1999 1999 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
2000 2000 {
2001 2001 pgcnt_t npgs, curnpgs, totpgs;
2002 2002 size_t pgsz;
2003 2003 page_t *pplist = NULL, *pp;
2004 2004 int err = 0;
2005 2005 lgrp_t *lgrp;
2006 2006
2007 2007 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
2008 2008 ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
2009 2009
2010 2010 /*
2011 2011 * Check if system heavily prefers local large pages over remote
2012 2012 * on systems with multiple lgroups.
2013 2013 */
2014 2014 if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2015 2015 pgflags = PG_LOCAL;
2016 2016 }
2017 2017
2018 2018 VM_STAT_ADD(alloc_pages[0]);
2019 2019
2020 2020 #ifdef DEBUG
2021 2021 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2022 2022 return (ENOMEM);
2023 2023 }
2024 2024 #endif
2025 2025
2026 2026 /*
2027 2027 	 * Exactly one of basepp and ppa must be NULL,
2028 2028 	 * and exactly one must be non-NULL.
2029 2029 */
2030 2030 ASSERT(basepp != NULL || ppa != NULL);
2031 2031 ASSERT(basepp == NULL || ppa == NULL);
2032 2032
2033 2033 #if defined(__i386) || defined(__amd64)
2034 2034 while (page_chk_freelist(szc) == 0) {
2035 2035 VM_STAT_ADD(alloc_pages[8]);
2036 2036 if (anypgsz == 0 || --szc == 0)
2037 2037 return (ENOMEM);
2038 2038 }
2039 2039 #endif
2040 2040
2041 2041 pgsz = page_get_pagesize(szc);
2042 2042 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2043 2043
2044 2044 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2045 2045
2046 2046 (void) page_create_wait(npgs, PG_WAIT);
2047 2047
2048 2048 while (npgs && szc) {
2049 2049 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2050 2050 if (pgflags == PG_LOCAL) {
2051 2051 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2052 2052 pgflags, lgrp);
2053 2053 if (pp == NULL) {
2054 2054 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2055 2055 0, lgrp);
2056 2056 }
2057 2057 } else {
2058 2058 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2059 2059 0, lgrp);
2060 2060 }
2061 2061 if (pp != NULL) {
2062 2062 VM_STAT_ADD(alloc_pages[1]);
2063 2063 page_list_concat(&pplist, &pp);
2064 2064 ASSERT(npgs >= curnpgs);
2065 2065 npgs -= curnpgs;
2066 2066 } else if (anypgsz) {
2067 2067 VM_STAT_ADD(alloc_pages[2]);
2068 2068 szc--;
2069 2069 pgsz = page_get_pagesize(szc);
2070 2070 curnpgs = pgsz >> PAGESHIFT;
2071 2071 } else {
2072 2072 VM_STAT_ADD(alloc_pages[3]);
2073 2073 ASSERT(npgs == totpgs);
2074 2074 page_create_putback(npgs);
2075 2075 return (ENOMEM);
2076 2076 }
2077 2077 }
2078 2078 if (szc == 0) {
2079 2079 VM_STAT_ADD(alloc_pages[4]);
2080 2080 ASSERT(npgs != 0);
2081 2081 page_create_putback(npgs);
2082 2082 err = ENOMEM;
2083 2083 } else if (basepp != NULL) {
2084 2084 ASSERT(npgs == 0);
2085 2085 ASSERT(ppa == NULL);
2086 2086 *basepp = pplist;
2087 2087 }
2088 2088
2089 2089 npgs = totpgs - npgs;
2090 2090 pp = pplist;
2091 2091
2092 2092 /*
2093 2093 * Clear the free and age bits. Also if we were passed in a ppa then
2094 2094 * fill it in with all the constituent pages from the large page. But
2095 2095 * if we failed to allocate all the pages just free what we got.
2096 2096 */
2097 2097 while (npgs != 0) {
2098 2098 ASSERT(PP_ISFREE(pp));
2099 2099 ASSERT(PP_ISAGED(pp));
2100 2100 if (ppa != NULL || err != 0) {
2101 2101 if (err == 0) {
2102 2102 VM_STAT_ADD(alloc_pages[5]);
2103 2103 PP_CLRFREE(pp);
2104 2104 PP_CLRAGED(pp);
2105 2105 page_sub(&pplist, pp);
2106 2106 *ppa++ = pp;
2107 2107 npgs--;
2108 2108 } else {
2109 2109 VM_STAT_ADD(alloc_pages[6]);
2110 2110 ASSERT(pp->p_szc != 0);
2111 2111 curnpgs = page_get_pagecnt(pp->p_szc);
2112 2112 page_list_break(&pp, &pplist, curnpgs);
2113 2113 page_list_add_pages(pp, 0);
2114 2114 page_create_putback(curnpgs);
2115 2115 ASSERT(npgs >= curnpgs);
2116 2116 npgs -= curnpgs;
2117 2117 }
2118 2118 pp = pplist;
2119 2119 } else {
2120 2120 VM_STAT_ADD(alloc_pages[7]);
2121 2121 PP_CLRFREE(pp);
2122 2122 PP_CLRAGED(pp);
2123 2123 pp = pp->p_next;
2124 2124 npgs--;
2125 2125 }
2126 2126 }
2127 2127 return (err);
2128 2128 }
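
A hedged sketch of a caller, not taken from this file: allocate the constituent pages of one large page of size class szc into a caller-supplied array, failing rather than degrading to a smaller size class. sample_alloc_large() and its ppap out-parameter are hypothetical.

	static int
	sample_alloc_large(struct vnode *vp, struct seg *seg, caddr_t addr,
	    uint_t szc, page_t ***ppap)
	{
		pgcnt_t npgs = page_get_pagesize(szc) >> PAGESHIFT;
		page_t **ppa = kmem_zalloc(npgs * sizeof (page_t *), KM_SLEEP);
		int err;

		/* anypgsz == 0: fail instead of retrying with a smaller szc */
		err = page_alloc_pages(vp, seg, addr, NULL, ppa, szc, 0, 0);
		if (err != 0) {
			kmem_free(ppa, npgs * sizeof (page_t *));
			return (err);
		}
		*ppap = ppa;	/* constituent pages are EXCL locked, free/age bits clear */
		return (0);
	}
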
2129 2129
2130 2130 /*
2131 2131 * Get a single large page off of the freelists, and set it up for use.
2132 2132 * Number of bytes requested must be a supported page size.
2133 2133 *
2134 2134 * Note that this call may fail even if there is sufficient
2135 2135 * memory available or PG_WAIT is set, so the caller must
2136 2136  * be willing to fall back on page_create_va(), block and retry,
2137 2137 * or fail the requester.
2138 2138 */
2139 2139 page_t *
2140 2140 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2141 2141 struct seg *seg, caddr_t vaddr, void *arg)
2142 2142 {
2143 2143 pgcnt_t npages;
2144 2144 page_t *pp;
2145 2145 page_t *rootpp;
2146 2146 lgrp_t *lgrp;
2147 2147 lgrp_id_t *lgrpid = (lgrp_id_t *)arg;
2148 2148
2149 2149 ASSERT(vp != NULL);
2150 2150
2151 2151 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2152 2152 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2153 2153 /* but no others */
2154 2154
2155 2155 ASSERT((flags & PG_EXCL) == PG_EXCL);
2156 2156
2157 2157 npages = btop(bytes);
2158 2158
2159 2159 if (!kcage_on || panicstr) {
2160 2160 /*
2161 2161 * Cage is OFF, or we are single threaded in
2162 2162 * panic, so make everything a RELOC request.
2163 2163 */
2164 2164 flags &= ~PG_NORELOC;
2165 2165 }
2166 2166
2167 2167 /*
2168 2168 * Make sure there's adequate physical memory available.
2169 2169 * Note: PG_WAIT is ignored here.
2170 2170 */
2171 2171 if (freemem <= throttlefree + npages) {
2172 2172 VM_STAT_ADD(page_create_large_cnt[1]);
2173 2173 return (NULL);
2174 2174 }
2175 2175
2176 2176 /*
2177 2177 * If cage is on, dampen draw from cage when available
2178 2178 * cage space is low.
2179 2179 */
2180 2180 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) &&
2181 2181 kcage_freemem < kcage_throttlefree + npages) {
2182 2182
2183 2183 /*
2184 2184 * The cage is on, the caller wants PG_NORELOC
2185 2185 * pages and available cage memory is very low.
2186 2186 * Call kcage_create_throttle() to attempt to
2187 2187 * control demand on the cage.
2188 2188 */
2189 2189 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2190 2190 VM_STAT_ADD(page_create_large_cnt[2]);
2191 2191 return (NULL);
2192 2192 }
2193 2193 }
2194 2194
2195 2195 if (!pcf_decrement_bucket(npages) &&
2196 2196 !pcf_decrement_multiple(NULL, npages, 1)) {
2197 2197 VM_STAT_ADD(page_create_large_cnt[4]);
2198 2198 return (NULL);
2199 2199 }
2200 2200
2201 2201 /*
2202 2202 * This is where this function behaves fundamentally differently
2203 2203 * than page_create_va(); since we're intending to map the page
2204 2204 * with a single TTE, we have to get it as a physically contiguous
2205 2205 * hardware pagesize chunk. If we can't, we fail.
2206 2206 */
2207 2207 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2208 2208 LGRP_EXISTS(lgrp_table[*lgrpid]))
2209 2209 lgrp = lgrp_table[*lgrpid];
2210 2210 else
2211 2211 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2212 2212
2213 2213 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2214 2214 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2215 2215 page_create_putback(npages);
2216 2216 VM_STAT_ADD(page_create_large_cnt[5]);
2217 2217 return (NULL);
2218 2218 }
2219 2219
2220 2220 /*
2221 2221 	 * If we got the page with the wrong mtype, give it back; this is a
2222 2222 	 * workaround for CR 6249718. When CR 6249718 is fixed we will never
2223 2223 	 * get inside the "if" and the workaround becomes just a nop.
2224 2224 */
2225 2225 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2226 2226 page_list_add_pages(rootpp, 0);
2227 2227 page_create_putback(npages);
2228 2228 VM_STAT_ADD(page_create_large_cnt[6]);
2229 2229 return (NULL);
2230 2230 }
2231 2231
2232 2232 /*
2233 2233 * If satisfying this request has left us with too little
2234 2234 * memory, start the wheels turning to get some back. The
2235 2235 * first clause of the test prevents waking up the pageout
2236 2236 * daemon in situations where it would decide that there's
2237 2237 * nothing to do.
2238 2238 */
2239 2239 if (nscan < desscan && freemem < minfree) {
2240 2240 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2241 2241 "pageout_cv_signal:freemem %ld", freemem);
2242 2242 cv_signal(&proc_pageout->p_cv);
2243 2243 }
2244 2244
2245 2245 pp = rootpp;
2246 2246 while (npages--) {
2247 2247 ASSERT(PAGE_EXCL(pp));
2248 2248 ASSERT(pp->p_vnode == NULL);
2249 2249 ASSERT(!hat_page_is_mapped(pp));
2250 2250 PP_CLRFREE(pp);
2251 2251 PP_CLRAGED(pp);
2252 2252 if (!page_hashin(pp, vp, off, NULL))
2253 2253 panic("page_create_large: hashin failed: page %p",
2254 2254 (void *)pp);
2255 2255 page_io_lock(pp);
2256 2256 off += PAGESIZE;
2257 2257 pp = pp->p_next;
2258 2258 }
2259 2259
2260 2260 VM_STAT_ADD(page_create_large_cnt[0]);
2261 2261 return (rootpp);
2262 2262 }
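
Because this call may fail even when memory is available, the caller is expected to have a fallback, as the comment above says. A minimal sketch (not from this file), assuming vp, off, pgsz, seg and vaddr belong to the hypothetical caller:

	page_t *pp;

	pp = page_create_va_large(vp, off, pgsz, PG_EXCL, seg, vaddr, NULL);
	if (pp == NULL) {
		/* large allocation failed: fall back to ordinary PAGESIZE pages */
		pp = page_create_va(vp, off, pgsz, PG_EXCL | PG_WAIT, seg, vaddr);
	}
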
2263 2263
2264 2264 page_t *
2265 2265 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2266 2266 struct seg *seg, caddr_t vaddr)
2267 2267 {
2268 2268 page_t *plist = NULL;
2269 2269 pgcnt_t npages;
2270 2270 pgcnt_t found_on_free = 0;
2271 2271 pgcnt_t pages_req;
2272 2272 page_t *npp = NULL;
2273 2273 struct pcf *p;
2274 2274 lgrp_t *lgrp;
2275 2275
2276 2276 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2277 2277 "page_create_start:vp %p off %llx bytes %lu flags %x",
2278 2278 vp, off, bytes, flags);
2279 2279
2280 2280 ASSERT(bytes != 0 && vp != NULL);
2281 2281
2282 2282 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2283 2283 panic("page_create: invalid flags");
2284 2284 /*NOTREACHED*/
2285 2285 }
2286 2286 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2287 2287 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2288 2288 /* but no others */
2289 2289
2290 2290 pages_req = npages = btopr(bytes);
2291 2291 /*
2292 2292 * Try to see whether request is too large to *ever* be
2293 2293 * satisfied, in order to prevent deadlock. We arbitrarily
2294 2294 * decide to limit maximum size requests to max_page_get.
2295 2295 */
2296 2296 if (npages >= max_page_get) {
2297 2297 if ((flags & PG_WAIT) == 0) {
2298 2298 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2299 2299 "page_create_toobig:vp %p off %llx npages "
2300 2300 "%lu max_page_get %lu",
2301 2301 vp, off, npages, max_page_get);
2302 2302 return (NULL);
2303 2303 } else {
2304 2304 cmn_err(CE_WARN,
2305 2305 "Request for too much kernel memory "
2306 2306 "(%lu bytes), will hang forever", bytes);
2307 2307 for (;;)
2308 2308 delay(1000000000);
2309 2309 }
2310 2310 }
2311 2311
2312 2312 if (!kcage_on || panicstr) {
2313 2313 /*
2314 2314 * Cage is OFF, or we are single threaded in
2315 2315 * panic, so make everything a RELOC request.
2316 2316 */
2317 2317 flags &= ~PG_NORELOC;
2318 2318 }
2319 2319
2320 2320 if (freemem <= throttlefree + npages)
2321 2321 if (!page_create_throttle(npages, flags))
2322 2322 return (NULL);
2323 2323
2324 2324 /*
2325 2325 * If cage is on, dampen draw from cage when available
2326 2326 * cage space is low.
2327 2327 */
2328 2328 if ((flags & PG_NORELOC) &&
2329 2329 kcage_freemem < kcage_throttlefree + npages) {
2330 2330
2331 2331 /*
2332 2332 * The cage is on, the caller wants PG_NORELOC
2333 2333 * pages and available cage memory is very low.
2334 2334 * Call kcage_create_throttle() to attempt to
2335 2335 * control demand on the cage.
2336 2336 */
2337 2337 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2338 2338 return (NULL);
2339 2339 }
2340 2340
2341 2341 VM_STAT_ADD(page_create_cnt[0]);
2342 2342
2343 2343 if (!pcf_decrement_bucket(npages)) {
2344 2344 /*
2345 2345 * Have to look harder. If npages is greater than
2346 2346 * one, then we might have to coalesce the counters.
2347 2347 *
2348 2348 * Go wait. We come back having accounted
2349 2349 * for the memory.
2350 2350 */
2351 2351 VM_STAT_ADD(page_create_cnt[1]);
2352 2352 if (!page_create_wait(npages, flags)) {
2353 2353 VM_STAT_ADD(page_create_cnt[2]);
2354 2354 return (NULL);
2355 2355 }
2356 2356 }
2357 2357
2358 2358 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2359 2359 "page_create_success:vp %p off %llx", vp, off);
2360 2360
2361 2361 /*
2362 2362 * If satisfying this request has left us with too little
2363 2363 * memory, start the wheels turning to get some back. The
2364 2364 * first clause of the test prevents waking up the pageout
2365 2365 * daemon in situations where it would decide that there's
2366 2366 * nothing to do.
2367 2367 */
2368 2368 if (nscan < desscan && freemem < minfree) {
2369 2369 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2370 2370 "pageout_cv_signal:freemem %ld", freemem);
2371 2371 cv_signal(&proc_pageout->p_cv);
2372 2372 }
2373 2373
2374 2374 /*
2375 2375 * Loop around collecting the requested number of pages.
2376 2376 * Most of the time, we have to `create' a new page. With
2377 2377 * this in mind, pull the page off the free list before
2378 2378 * getting the hash lock. This will minimize the hash
2379 2379 * lock hold time, nesting, and the like. If it turns
2380 2380 * out we don't need the page, we put it back at the end.
2381 2381 */
2382 2382 while (npages--) {
2383 2383 page_t *pp;
2384 2384 kmutex_t *phm = NULL;
2385 2385 ulong_t index;
2386 2386
2387 2387 index = PAGE_HASH_FUNC(vp, off);
2388 2388 top:
2389 2389 ASSERT(phm == NULL);
2390 2390 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2391 2391 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2392 2392
2393 2393 if (npp == NULL) {
2394 2394 /*
2395 2395 * Try to get a page from the freelist (ie,
2396 2396 * a page with no [vp, off] tag). If that
2397 2397 * fails, use the cachelist.
2398 2398 *
2399 2399 * During the first attempt at both the free
2400 2400 * and cache lists we try for the correct color.
2401 2401 */
2402 2402 /*
2403 2403 * XXXX-how do we deal with virtual indexed
2404 2404 			 * caches and colors?
2405 2405 */
2406 2406 VM_STAT_ADD(page_create_cnt[4]);
2407 2407 /*
2408 2408 * Get lgroup to allocate next page of shared memory
2409 2409 * from and use it to specify where to allocate
2410 2410 * the physical memory
2411 2411 */
2412 2412 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2413 2413 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2414 2414 flags | PG_MATCH_COLOR, lgrp);
2415 2415 if (npp == NULL) {
2416 2416 npp = page_get_cachelist(vp, off, seg,
2417 2417 vaddr, flags | PG_MATCH_COLOR, lgrp);
2418 2418 if (npp == NULL) {
2419 2419 npp = page_create_get_something(vp,
2420 2420 off, seg, vaddr,
2421 2421 flags & ~PG_MATCH_COLOR);
2422 2422 }
2423 2423
2424 2424 if (PP_ISAGED(npp) == 0) {
2425 2425 /*
2426 2426 * Since this page came from the
2427 2427 * cachelist, we must destroy the
2428 2428 * old vnode association.
2429 2429 */
2430 2430 page_hashout(npp, NULL);
2431 2431 }
2432 2432 }
2433 2433 }
2434 2434
2435 2435 /*
2436 2436 * We own this page!
2437 2437 */
2438 2438 ASSERT(PAGE_EXCL(npp));
2439 2439 ASSERT(npp->p_vnode == NULL);
2440 2440 ASSERT(!hat_page_is_mapped(npp));
2441 2441 PP_CLRFREE(npp);
2442 2442 PP_CLRAGED(npp);
2443 2443
2444 2444 /*
2445 2445 		 * Here we have a page in our hot little mitts and are
2446 2446 * just waiting to stuff it on the appropriate lists.
2447 2447 * Get the mutex and check to see if it really does
2448 2448 * not exist.
2449 2449 */
2450 2450 phm = PAGE_HASH_MUTEX(index);
2451 2451 mutex_enter(phm);
2452 2452 PAGE_HASH_SEARCH(index, pp, vp, off);
2453 2453 if (pp == NULL) {
2454 2454 VM_STAT_ADD(page_create_new);
2455 2455 pp = npp;
2456 2456 npp = NULL;
2457 2457 if (!page_hashin(pp, vp, off, phm)) {
2458 2458 /*
2459 2459 * Since we hold the page hash mutex and
2460 2460 * just searched for this page, page_hashin
2461 2461 * had better not fail. If it does, that
2462 2462 				 * means some thread did not follow the
2463 2463 * page hash mutex rules. Panic now and
2464 2464 * get it over with. As usual, go down
2465 2465 * holding all the locks.
2466 2466 */
2467 2467 ASSERT(MUTEX_HELD(phm));
2468 2468 panic("page_create: "
2469 2469 "hashin failed %p %p %llx %p",
2470 2470 (void *)pp, (void *)vp, off, (void *)phm);
2471 2471 /*NOTREACHED*/
2472 2472 }
2473 2473 ASSERT(MUTEX_HELD(phm));
2474 2474 mutex_exit(phm);
2475 2475 phm = NULL;
2476 2476
2477 2477 /*
2478 2478 * Hat layer locking need not be done to set
2479 2479 * the following bits since the page is not hashed
2480 2480 * and was on the free list (i.e., had no mappings).
2481 2481 *
2482 2482 * Set the reference bit to protect
2483 2483 * against immediate pageout
2484 2484 *
2485 2485 * XXXmh modify freelist code to set reference
2486 2486 * bit so we don't have to do it here.
2487 2487 */
2488 2488 page_set_props(pp, P_REF);
2489 2489 found_on_free++;
2490 2490 } else {
2491 2491 VM_STAT_ADD(page_create_exists);
2492 2492 if (flags & PG_EXCL) {
2493 2493 /*
2494 2494 * Found an existing page, and the caller
2495 2495 * wanted all new pages. Undo all of the work
2496 2496 * we have done.
2497 2497 */
2498 2498 mutex_exit(phm);
2499 2499 phm = NULL;
2500 2500 while (plist != NULL) {
2501 2501 pp = plist;
2502 2502 page_sub(&plist, pp);
2503 2503 page_io_unlock(pp);
2504 2504 /* large pages should not end up here */
2505 2505 ASSERT(pp->p_szc == 0);
2506 2506 /*LINTED: constant in conditional ctx*/
2507 2507 VN_DISPOSE(pp, B_INVAL, 0, kcred);
2508 2508 }
2509 2509 VM_STAT_ADD(page_create_found_one);
2510 2510 goto fail;
2511 2511 }
2512 2512 ASSERT(flags & PG_WAIT);
2513 2513 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2514 2514 /*
2515 2515 * Start all over again if we blocked trying
2516 2516 * to lock the page.
2517 2517 */
2518 2518 mutex_exit(phm);
2519 2519 VM_STAT_ADD(page_create_page_lock_failed);
2520 2520 phm = NULL;
2521 2521 goto top;
2522 2522 }
2523 2523 mutex_exit(phm);
2524 2524 phm = NULL;
2525 2525
2526 2526 if (PP_ISFREE(pp)) {
2527 2527 ASSERT(PP_ISAGED(pp) == 0);
2528 2528 VM_STAT_ADD(pagecnt.pc_get_cache);
2529 2529 page_list_sub(pp, PG_CACHE_LIST);
2530 2530 PP_CLRFREE(pp);
2531 2531 found_on_free++;
2532 2532 }
2533 2533 }
2534 2534
2535 2535 /*
2536 2536 * Got a page! It is locked. Acquire the i/o
2537 2537 * lock since we are going to use the p_next and
2538 2538 * p_prev fields to link the requested pages together.
2539 2539 */
2540 2540 page_io_lock(pp);
2541 2541 page_add(&plist, pp);
2542 2542 plist = plist->p_next;
2543 2543 off += PAGESIZE;
2544 2544 vaddr += PAGESIZE;
2545 2545 }
2546 2546
2547 2547 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2548 2548 fail:
2549 2549 if (npp != NULL) {
2550 2550 /*
2551 2551 * Did not need this page after all.
2552 2552 * Put it back on the free list.
2553 2553 */
2554 2554 VM_STAT_ADD(page_create_putbacks);
2555 2555 PP_SETFREE(npp);
2556 2556 PP_SETAGED(npp);
2557 2557 npp->p_offset = (u_offset_t)-1;
2558 2558 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2559 2559 page_unlock(npp);
2560 2560
2561 2561 }
2562 2562
2563 2563 ASSERT(pages_req >= found_on_free);
2564 2564
2565 2565 {
2566 2566 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2567 2567
2568 2568 if (overshoot) {
2569 2569 VM_STAT_ADD(page_create_overshoot);
2570 2570 p = &pcf[PCF_INDEX()];
2571 2571 mutex_enter(&p->pcf_lock);
2572 2572 if (p->pcf_block) {
2573 2573 p->pcf_reserve += overshoot;
2574 2574 } else {
2575 2575 p->pcf_count += overshoot;
2576 2576 if (p->pcf_wait) {
2577 2577 mutex_enter(&new_freemem_lock);
2578 2578 if (freemem_wait) {
2579 2579 cv_signal(&freemem_cv);
2580 2580 p->pcf_wait--;
2581 2581 } else {
2582 2582 p->pcf_wait = 0;
2583 2583 }
2584 2584 mutex_exit(&new_freemem_lock);
2585 2585 }
2586 2586 }
2587 2587 mutex_exit(&p->pcf_lock);
2588 2588 /* freemem is approximate, so this test OK */
2589 2589 if (!p->pcf_block)
2590 2590 freemem += overshoot;
2591 2591 }
2592 2592 }
2593 2593
2594 2594 return (plist);
2595 2595 }
2596 2596
2597 2597 /*
2598 2598  * One or more constituent pages of this large page have been marked
2599 2599 * toxic. Simply demote the large page to PAGESIZE pages and let
2600 2600 * page_free() handle it. This routine should only be called by
2601 2601  * large page free routines (page_free_pages() and page_destroy_pages()).
2602 2602 * All pages are locked SE_EXCL and have already been marked free.
2603 2603 */
2604 2604 static void
2605 2605 page_free_toxic_pages(page_t *rootpp)
2606 2606 {
2607 2607 page_t *tpp;
2608 2608 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2609 2609 uint_t szc = rootpp->p_szc;
2610 2610
2611 2611 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2612 2612 ASSERT(tpp->p_szc == szc);
2613 2613 ASSERT((PAGE_EXCL(tpp) &&
2614 2614 !page_iolock_assert(tpp)) || panicstr);
2615 2615 tpp->p_szc = 0;
2616 2616 }
2617 2617
2618 2618 while (rootpp != NULL) {
2619 2619 tpp = rootpp;
2620 2620 page_sub(&rootpp, tpp);
2621 2621 ASSERT(PP_ISFREE(tpp));
2622 2622 PP_CLRFREE(tpp);
2623 2623 page_free(tpp, 1);
2624 2624 }
2625 2625 }
2626 2626
2627 2627 /*
2628 2628 * Put page on the "free" list.
2629 2629 * The free list is really two lists maintained by
2630 2630 * the PSM of whatever machine we happen to be on.
2631 2631 */
2632 2632 void
2633 2633 page_free(page_t *pp, int dontneed)
2634 2634 {
2635 2635 struct pcf *p;
2636 2636 uint_t pcf_index;
2637 2637
2638 2638 ASSERT((PAGE_EXCL(pp) &&
2639 2639 !page_iolock_assert(pp)) || panicstr);
2640 2640
2641 2641 if (PP_ISFREE(pp)) {
2642 2642 panic("page_free: page %p is free", (void *)pp);
2643 2643 }
2644 2644
2645 2645 if (pp->p_szc != 0) {
2646 2646 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2647 2647 PP_ISKAS(pp)) {
2648 2648 panic("page_free: anon or kernel "
2649 2649 "or no vnode large page %p", (void *)pp);
2650 2650 }
2651 2651 page_demote_vp_pages(pp);
2652 2652 ASSERT(pp->p_szc == 0);
2653 2653 }
2654 2654
2655 2655 /*
2656 2656 * The page_struct_lock need not be acquired to examine these
2657 2657 * fields since the page has an "exclusive" lock.
2658 2658 */
2659 2659 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2660 2660 pp->p_slckcnt != 0) {
2661 2661 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2662 2662 "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2663 2663 pp->p_cowcnt, pp->p_slckcnt);
2664 2664 /*NOTREACHED*/
2665 2665 }
2666 2666
2667 2667 ASSERT(!hat_page_getshare(pp));
2668 2668
2669 2669 PP_SETFREE(pp);
2670 2670 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2671 2671 !hat_ismod(pp));
2672 2672 page_clr_all_props(pp);
2673 2673 ASSERT(!hat_page_getshare(pp));
2674 2674
2675 2675 /*
2676 2676 * Now we add the page to the head of the free list.
2677 2677 * But if this page is associated with a paged vnode
2678 2678 * then we adjust the head forward so that the page is
2679 2679 * effectively at the end of the list.
2680 2680 */
2681 2681 if (pp->p_vnode == NULL) {
2682 2682 /*
2683 2683 * Page has no identity, put it on the free list.
2684 2684 */
2685 2685 PP_SETAGED(pp);
2686 2686 pp->p_offset = (u_offset_t)-1;
2687 2687 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2688 2688 VM_STAT_ADD(pagecnt.pc_free_free);
2689 2689 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2690 2690 "page_free_free:pp %p", pp);
2691 2691 } else {
2692 2692 PP_CLRAGED(pp);
2693 2693
2694 2694 if (!dontneed || nopageage) {
2695 2695 /* move it to the tail of the list */
2696 2696 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2697 2697
2698 2698 VM_STAT_ADD(pagecnt.pc_free_cache);
2699 2699 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2700 2700 "page_free_cache_tail:pp %p", pp);
2701 2701 } else {
2702 2702 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2703 2703
2704 2704 VM_STAT_ADD(pagecnt.pc_free_dontneed);
2705 2705 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2706 2706 "page_free_cache_head:pp %p", pp);
2707 2707 }
2708 2708 }
2709 2709 page_unlock(pp);
2710 2710
2711 2711 /*
2712 2712 * Now do the `freemem' accounting.
2713 2713 */
2714 2714 pcf_index = PCF_INDEX();
2715 2715 p = &pcf[pcf_index];
2716 2716
2717 2717 mutex_enter(&p->pcf_lock);
2718 2718 if (p->pcf_block) {
2719 2719 p->pcf_reserve += 1;
2720 2720 } else {
2721 2721 p->pcf_count += 1;
2722 2722 if (p->pcf_wait) {
2723 2723 mutex_enter(&new_freemem_lock);
2724 2724 /*
2725 2725 * Check to see if some other thread
2726 2726 * is actually waiting. Another bucket
2727 2727 * may have woken it up by now. If there
2728 2728 * are no waiters, then set our pcf_wait
2729 2729 * count to zero to avoid coming in here
2730 2730 * next time. Also, since only one page
2731 2731 * was put on the free list, just wake
2732 2732 * up one waiter.
2733 2733 */
2734 2734 if (freemem_wait) {
2735 2735 cv_signal(&freemem_cv);
2736 2736 p->pcf_wait--;
2737 2737 } else {
2738 2738 p->pcf_wait = 0;
2739 2739 }
2740 2740 mutex_exit(&new_freemem_lock);
2741 2741 }
2742 2742 }
2743 2743 mutex_exit(&p->pcf_lock);
2744 2744
2745 2745 /* freemem is approximate, so this test OK */
2746 2746 if (!p->pcf_block)
2747 2747 freemem += 1;
2748 2748 }
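
As a hedged usage note, not code from this file: a caller that is finished with a clean, unmapped vnode page and does not expect it to be reused soon can pass dontneed, which sends the page to the head of the cachelist so it is recycled ahead of "warmer" cache pages.

	ASSERT(PAGE_EXCL(pp) && !hat_page_is_mapped(pp));
	page_free(pp, 1);	/* keeps the [vp, off] identity; page_free() drops the lock */
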
2749 2749
2750 2750 /*
2751 2751  * Put page on the "free" list during initial startup.
2752 2752 * This happens during initial single threaded execution.
2753 2753 */
2754 2754 void
2755 2755 page_free_at_startup(page_t *pp)
2756 2756 {
2757 2757 struct pcf *p;
2758 2758 uint_t pcf_index;
2759 2759
2760 2760 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2761 2761 VM_STAT_ADD(pagecnt.pc_free_free);
2762 2762
2763 2763 /*
2764 2764 * Now do the `freemem' accounting.
2765 2765 */
2766 2766 pcf_index = PCF_INDEX();
2767 2767 p = &pcf[pcf_index];
2768 2768
2769 2769 ASSERT(p->pcf_block == 0);
2770 2770 ASSERT(p->pcf_wait == 0);
2771 2771 p->pcf_count += 1;
2772 2772
2773 2773 /* freemem is approximate, so this is OK */
2774 2774 freemem += 1;
2775 2775 }
2776 2776
2777 2777 void
2778 2778 page_free_pages(page_t *pp)
2779 2779 {
2780 2780 page_t *tpp, *rootpp = NULL;
2781 2781 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2782 2782 pgcnt_t i;
2783 2783 uint_t szc = pp->p_szc;
2784 2784
2785 2785 VM_STAT_ADD(pagecnt.pc_free_pages);
2786 2786 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2787 2787 "page_free_free:pp %p", pp);
2788 2788
2789 2789 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2790 2790 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2791 2791 panic("page_free_pages: not root page %p", (void *)pp);
2792 2792 /*NOTREACHED*/
2793 2793 }
2794 2794
2795 2795 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2796 2796 ASSERT((PAGE_EXCL(tpp) &&
2797 2797 !page_iolock_assert(tpp)) || panicstr);
2798 2798 if (PP_ISFREE(tpp)) {
2799 2799 panic("page_free_pages: page %p is free", (void *)tpp);
2800 2800 /*NOTREACHED*/
2801 2801 }
2802 2802 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2803 2803 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2804 2804 panic("page_free_pages %p", (void *)tpp);
2805 2805 /*NOTREACHED*/
2806 2806 }
2807 2807
2808 2808 ASSERT(!hat_page_getshare(tpp));
2809 2809 ASSERT(tpp->p_vnode == NULL);
2810 2810 ASSERT(tpp->p_szc == szc);
2811 2811
2812 2812 PP_SETFREE(tpp);
2813 2813 page_clr_all_props(tpp);
2814 2814 PP_SETAGED(tpp);
2815 2815 tpp->p_offset = (u_offset_t)-1;
2816 2816 ASSERT(tpp->p_next == tpp);
2817 2817 ASSERT(tpp->p_prev == tpp);
2818 2818 page_list_concat(&rootpp, &tpp);
2819 2819 }
2820 2820 ASSERT(rootpp == pp);
2821 2821
2822 2822 page_list_add_pages(rootpp, 0);
2823 2823 page_create_putback(pgcnt);
2824 2824 }
2825 2825
2826 2826 int free_pages = 1;
2827 2827
2828 2828 /*
2829 2829 * This routine attempts to return pages to the cachelist via page_release().
2830 2830 * It does not *have* to be successful in all cases, since the pageout scanner
2831 2831 * will catch any pages it misses. It does need to be fast and not introduce
2832 2832 * too much overhead.
2833 2833 *
2834 2834 * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2835 2835 * don't lock and retry. This is ok, since the page scanner will eventually
2836 2836 * find any page we miss in free_vp_pages().
2837 2837 */
2838 2838 void
2839 2839 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2840 2840 {
2841 2841 page_t *pp;
2842 2842 u_offset_t eoff;
2843 2843 extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2844 2844
2845 2845 eoff = off + len;
2846 2846
2847 2847 if (free_pages == 0)
2848 2848 return;
2849 2849 if (swap_in_range(vp, off, len))
2850 2850 return;
2851 2851
2852 2852 for (; off < eoff; off += PAGESIZE) {
2853 2853
2854 2854 /*
2855 2855 * find the page using a fast, but inexact search. It'll be OK
2856 2856 * if a few pages slip through the cracks here.
2857 2857 */
2858 2858 pp = page_exists(vp, off);
2859 2859
2860 2860 /*
2861 2861 		 * If we didn't find the page (it may not exist), if the page
2862 2862 		 * is free, if it still looks in use (shared), or if we can't
2863 2863 		 * lock it, just give up.
2864 2864 */
2865 2865 if (pp == NULL ||
2866 2866 PP_ISFREE(pp) ||
2867 2867 page_share_cnt(pp) > 0 ||
2868 2868 !page_trylock(pp, SE_EXCL))
2869 2869 continue;
2870 2870
2871 2871 /*
2872 2872 * Once we have locked pp, verify that it's still the
2873 2873 * correct page and not already free
2874 2874 */
2875 2875 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2876 2876 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2877 2877 page_unlock(pp);
2878 2878 continue;
2879 2879 }
2880 2880
2881 2881 /*
2882 2882 * try to release the page...
2883 2883 */
2884 2884 (void) page_release(pp, 1);
2885 2885 }
2886 2886 }
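
A hedged sketch of the intended use, not from this file: after a large sequential read that the caller does not expect to revisit, hint the just-used window back to the cachelist; vp, off and len are the hypothetical caller's.

	/* after completing read-ahead of [off, off + len) that won't be reused soon */
	free_vp_pages(vp, off, len);	/* best effort; missed pages are left to the scanner */
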
2887 2887
2888 2888 /*
2889 2889 * Reclaim the given page from the free list.
2890 2890  * If pp is part of a large page, only the given constituent page is reclaimed
2891 2891 * and the large page it belonged to will be demoted. This can only happen
2892 2892 * if the page is not on the cachelist.
2893 2893 *
2894 2894 * Returns 1 on success or 0 on failure.
2895 2895 *
2896 2896 * The page is unlocked if it can't be reclaimed (when freemem == 0).
2897 2897 * If `lock' is non-null, it will be dropped and re-acquired if
2898 2898 * the routine must wait while freemem is 0.
2899 2899 *
2900 2900 * As it turns out, boot_getpages() does this. It picks a page,
2901 2901 * based on where OBP mapped in some address, gets its pfn, searches
2902 2902 * the memsegs, locks the page, then pulls it off the free list!
2903 2903 */
2904 2904 int
2905 2905 page_reclaim(page_t *pp, kmutex_t *lock)
2906 2906 {
2907 2907 struct pcf *p;
2908 2908 struct cpu *cpup;
2909 2909 int enough;
2910 2910 uint_t i;
2911 2911
2912 2912 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2913 2913 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2914 2914
2915 2915 /*
2916 2916 * If `freemem' is 0, we cannot reclaim this page from the
2917 2917 * freelist, so release every lock we might hold: the page,
2918 2918 * and the `lock' before blocking.
2919 2919 *
2920 2920 * The only way `freemem' can become 0 while there are pages
2921 2921 * marked free (have their p->p_free bit set) is when the
2922 2922 * system is low on memory and doing a page_create(). In
2923 2923 * order to guarantee that once page_create() starts acquiring
2924 2924 * pages it will be able to get all that it needs since `freemem'
2925 2925 * was decreased by the requested amount. So, we need to release
2926 2926 * this page, and let page_create() have it.
2927 2927 *
2928 2928 * Since `freemem' being zero is not supposed to happen, just
2929 2929 * use the usual hash stuff as a starting point. If that bucket
2930 2930 * is empty, then assume the worst, and start at the beginning
2931 2931 * of the pcf array. If we always start at the beginning
2932 2932 * when acquiring more than one pcf lock, there won't be any
2933 2933 * deadlock problems.
2934 2934 */
2935 2935
2936 2936 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2937 2937
2938 2938 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2939 2939 pcf_acquire_all();
2940 2940 goto page_reclaim_nomem;
2941 2941 }
2942 2942
2943 2943 enough = pcf_decrement_bucket(1);
2944 2944
2945 2945 if (!enough) {
2946 2946 VM_STAT_ADD(page_reclaim_zero);
2947 2947 /*
2948 2948 * Check again. Its possible that some other thread
2949 2949 * could have been right behind us, and added one
2950 2950 * to a list somewhere. Acquire each of the pcf locks
2951 2951 * until we find a page.
2952 2952 */
2953 2953 p = pcf;
2954 2954 for (i = 0; i < pcf_fanout; i++) {
2955 2955 mutex_enter(&p->pcf_lock);
2956 2956 if (p->pcf_count >= 1) {
2957 2957 p->pcf_count -= 1;
2958 2958 /*
2959 2959 * freemem is not protected by any lock. Thus,
2960 2960 * we cannot have any assertion containing
2961 2961 * freemem here.
2962 2962 */
2963 2963 freemem -= 1;
2964 2964 enough = 1;
2965 2965 break;
2966 2966 }
2967 2967 p++;
2968 2968 }
2969 2969
2970 2970 if (!enough) {
2971 2971 page_reclaim_nomem:
2972 2972 /*
2973 2973 * We really can't have page `pp'.
2974 2974 * Time for the no-memory dance with
2975 2975 * page_free(). This is just like
2976 2976 * page_create_wait(). Plus the added
2977 2977 * attraction of releasing whatever mutex
2978 2978 			 * we were passed in `lock' when we were called.
2979 2979 * Page_unlock() will wakeup any thread
2980 2980 * waiting around for this page.
2981 2981 */
2982 2982 if (lock) {
2983 2983 VM_STAT_ADD(page_reclaim_zero_locked);
2984 2984 mutex_exit(lock);
2985 2985 }
2986 2986 page_unlock(pp);
2987 2987
2988 2988 /*
2989 2989 * get this before we drop all the pcf locks.
2990 2990 */
2991 2991 mutex_enter(&new_freemem_lock);
2992 2992
2993 2993 p = pcf;
2994 2994 for (i = 0; i < pcf_fanout; i++) {
2995 2995 p->pcf_wait++;
2996 2996 mutex_exit(&p->pcf_lock);
2997 2997 p++;
2998 2998 }
2999 2999
3000 3000 freemem_wait++;
3001 3001 cv_wait(&freemem_cv, &new_freemem_lock);
3002 3002 freemem_wait--;
3003 3003
3004 3004 mutex_exit(&new_freemem_lock);
3005 3005
3006 3006 if (lock) {
3007 3007 mutex_enter(lock);
3008 3008 }
3009 3009 return (0);
3010 3010 }
3011 3011
3012 3012 /*
3013 3013 * The pcf accounting has been done,
3014 3014 * though none of the pcf_wait flags have been set,
3015 3015 * drop the locks and continue on.
3016 3016 */
3017 3017 while (p >= pcf) {
3018 3018 mutex_exit(&p->pcf_lock);
3019 3019 p--;
3020 3020 }
3021 3021 }
3022 3022
3023 3023
3024 3024 VM_STAT_ADD(pagecnt.pc_reclaim);
3025 3025
3026 3026 /*
3027 3027 * page_list_sub will handle the case where pp is a large page.
3028 3028 * It's possible that the page was promoted while on the freelist
3029 3029 */
3030 3030 if (PP_ISAGED(pp)) {
3031 3031 page_list_sub(pp, PG_FREE_LIST);
3032 3032 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3033 3033 "page_reclaim_free:pp %p", pp);
3034 3034 } else {
3035 3035 page_list_sub(pp, PG_CACHE_LIST);
3036 3036 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3037 3037 "page_reclaim_cache:pp %p", pp);
3038 3038 }
3039 3039
3040 3040 /*
3041 3041 * clear the p_free & p_age bits since this page is no longer
3042 3042 * on the free list. Notice that there was a brief time where
3043 3043 * a page is marked as free, but is not on the list.
3044 3044 *
3045 3045 * Set the reference bit to protect against immediate pageout.
3046 3046 */
3047 3047 PP_CLRFREE(pp);
3048 3048 PP_CLRAGED(pp);
3049 3049 page_set_props(pp, P_REF);
3050 3050
3051 3051 CPU_STATS_ENTER_K();
3052 3052 cpup = CPU; /* get cpup now that CPU cannot change */
3053 3053 CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3054 3054 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3055 3055 CPU_STATS_EXIT_K();
3056 3056 ASSERT(pp->p_szc == 0);
3057 3057
3058 3058 return (1);
3059 3059 }
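
A hedged sketch of a caller in the spirit of the boot_getpages() example above, not code from this file: lock a page that happens to be on the free list and pull it off. On failure page_reclaim() has already dropped the page lock, so pp must not be touched again without re-locking it.

	if (page_trylock(pp, SE_EXCL)) {
		if (!PP_ISFREE(pp)) {
			page_unlock(pp);	/* someone else claimed it first */
		} else if (page_reclaim(pp, NULL)) {
			/* pp is off the free list, EXCL locked, with P_REF set */
			page_unlock(pp);	/* ... once the caller is done with it */
		}
		/* else page_reclaim() failed and has already dropped the lock */
	}
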
3060 3060
3061 3061 /*
3062 3062 * Destroy identity of the page and put it back on
3063 3063 * the page free list. Assumes that the caller has
3064 3064 * acquired the "exclusive" lock on the page.
3065 3065 */
3066 3066 void
3067 3067 page_destroy(page_t *pp, int dontfree)
3068 3068 {
3069 3069 ASSERT((PAGE_EXCL(pp) &&
3070 3070 !page_iolock_assert(pp)) || panicstr);
3071 3071 ASSERT(pp->p_slckcnt == 0 || panicstr);
3072 3072
3073 3073 if (pp->p_szc != 0) {
3074 3074 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3075 3075 PP_ISKAS(pp)) {
3076 3076 panic("page_destroy: anon or kernel or no vnode "
3077 3077 "large page %p", (void *)pp);
3078 3078 }
3079 3079 page_demote_vp_pages(pp);
3080 3080 ASSERT(pp->p_szc == 0);
3081 3081 }
3082 3082
3083 3083 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3084 3084
3085 3085 /*
3086 3086 * Unload translations, if any, then hash out the
3087 3087 * page to erase its identity.
3088 3088 */
3089 3089 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3090 3090 page_hashout(pp, NULL);
3091 3091
3092 3092 if (!dontfree) {
3093 3093 /*
3094 3094 * Acquire the "freemem_lock" for availrmem.
3095 3095 * The page_struct_lock need not be acquired for lckcnt
3096 3096 * and cowcnt since the page has an "exclusive" lock.
3097 3097 * We are doing a modified version of page_pp_unlock here.
3098 3098 */
3099 3099 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3100 3100 mutex_enter(&freemem_lock);
3101 3101 if (pp->p_lckcnt != 0) {
3102 3102 availrmem++;
3103 3103 pages_locked--;
3104 3104 pp->p_lckcnt = 0;
3105 3105 }
3106 3106 if (pp->p_cowcnt != 0) {
3107 3107 availrmem += pp->p_cowcnt;
3108 3108 pages_locked -= pp->p_cowcnt;
3109 3109 pp->p_cowcnt = 0;
3110 3110 }
3111 3111 mutex_exit(&freemem_lock);
3112 3112 }
3113 3113 /*
3114 3114 * Put the page on the "free" list.
3115 3115 */
3116 3116 page_free(pp, 0);
3117 3117 }
3118 3118 }
3119 3119
3120 3120 void
3121 3121 page_destroy_pages(page_t *pp)
3122 3122 {
3123 3123
3124 3124 page_t *tpp, *rootpp = NULL;
3125 3125 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3126 3126 pgcnt_t i, pglcks = 0;
3127 3127 uint_t szc = pp->p_szc;
3128 3128
3129 3129 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3130 3130
3131 3131 VM_STAT_ADD(pagecnt.pc_destroy_pages);
3132 3132
3133 3133 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3134 3134
3135 3135 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3136 3136 panic("page_destroy_pages: not root page %p", (void *)pp);
3137 3137 /*NOTREACHED*/
3138 3138 }
3139 3139
3140 3140 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3141 3141 ASSERT((PAGE_EXCL(tpp) &&
3142 3142 !page_iolock_assert(tpp)) || panicstr);
3143 3143 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3144 3144 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3145 3145 page_hashout(tpp, NULL);
3146 3146 ASSERT(tpp->p_offset == (u_offset_t)-1);
3147 3147 if (tpp->p_lckcnt != 0) {
3148 3148 pglcks++;
3149 3149 tpp->p_lckcnt = 0;
3150 3150 } else if (tpp->p_cowcnt != 0) {
3151 3151 pglcks += tpp->p_cowcnt;
3152 3152 tpp->p_cowcnt = 0;
3153 3153 }
3154 3154 ASSERT(!hat_page_getshare(tpp));
3155 3155 ASSERT(tpp->p_vnode == NULL);
3156 3156 ASSERT(tpp->p_szc == szc);
3157 3157
3158 3158 PP_SETFREE(tpp);
3159 3159 page_clr_all_props(tpp);
3160 3160 PP_SETAGED(tpp);
3161 3161 ASSERT(tpp->p_next == tpp);
3162 3162 ASSERT(tpp->p_prev == tpp);
3163 3163 page_list_concat(&rootpp, &tpp);
3164 3164 }
3165 3165
3166 3166 ASSERT(rootpp == pp);
3167 3167 if (pglcks != 0) {
3168 3168 mutex_enter(&freemem_lock);
3169 3169 availrmem += pglcks;
3170 3170 mutex_exit(&freemem_lock);
3171 3171 }
3172 3172
3173 3173 page_list_add_pages(rootpp, 0);
3174 3174 page_create_putback(pgcnt);
3175 3175 }
3176 3176
3177 3177 /*
3178 3178 * Similar to page_destroy(), but destroys pages which are
3179 3179 * locked and known to be on the page free list. Since
3180 3180 * the page is known to be free and locked, no one can access
3181 3181 * it.
3182 3182 *
3183 3183 * Also, the number of free pages does not change.
3184 3184 */
3185 3185 void
3186 3186 page_destroy_free(page_t *pp)
3187 3187 {
3188 3188 ASSERT(PAGE_EXCL(pp));
3189 3189 ASSERT(PP_ISFREE(pp));
3190 3190 ASSERT(pp->p_vnode);
3191 3191 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3192 3192 ASSERT(!hat_page_is_mapped(pp));
3193 3193 ASSERT(PP_ISAGED(pp) == 0);
3194 3194 ASSERT(pp->p_szc == 0);
3195 3195
3196 3196 VM_STAT_ADD(pagecnt.pc_destroy_free);
3197 3197 page_list_sub(pp, PG_CACHE_LIST);
3198 3198
3199 3199 page_hashout(pp, NULL);
3200 3200 ASSERT(pp->p_vnode == NULL);
3201 3201 ASSERT(pp->p_offset == (u_offset_t)-1);
3202 3202 ASSERT(pp->p_hash == NULL);
3203 3203
3204 3204 PP_SETAGED(pp);
3205 3205 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3206 3206 page_unlock(pp);
3207 3207
3208 3208 mutex_enter(&new_freemem_lock);
3209 3209 if (freemem_wait) {
3210 3210 cv_signal(&freemem_cv);
3211 3211 }
3212 3212 mutex_exit(&new_freemem_lock);
3213 3213 }
3214 3214
3215 3215 /*
3216 3216 * Rename the page "opp" to have an identity specified
3217 3217 * by [vp, off]. If a page already exists with this name
3218 3218 * it is locked and destroyed. Note that the page's
3219 3219 * translations are not unloaded during the rename.
3220 3220 *
3221 3221 * This routine is used by the anon layer to "steal" the
3222 3222 * original page and is not unlike destroying a page and
3223 3223 * creating a new page using the same page frame.
3224 3224 *
3225 3225 * XXX -- Could deadlock if caller 1 tries to rename A to B while
3226 3226 * caller 2 tries to rename B to A.
3227 3227 */
3228 3228 void
3229 3229 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3230 3230 {
3231 3231 page_t *pp;
3232 3232 int olckcnt = 0;
3233 3233 int ocowcnt = 0;
3234 3234 kmutex_t *phm;
3235 3235 ulong_t index;
3236 3236
3237 3237 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3238 3238 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3239 3239 ASSERT(PP_ISFREE(opp) == 0);
3240 3240
3241 3241 VM_STAT_ADD(page_rename_count);
3242 3242
3243 3243 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3244 3244 "page rename:pp %p vp %p off %llx", opp, vp, off);
3245 3245
3246 3246 /*
3247 3247 * CacheFS may call page_rename for a large NFS page
3248 3248 * when both CacheFS and NFS mount points are used
3249 3249 * by applications. Demote this large page before
3250 3250 * renaming it, to ensure that there are no "partial"
3251 3251 * large pages left lying around.
3252 3252 */
3253 3253 if (opp->p_szc != 0) {
3254 3254 vnode_t *ovp = opp->p_vnode;
3255 3255 ASSERT(ovp != NULL);
3256 3256 ASSERT(!IS_SWAPFSVP(ovp));
3257 3257 ASSERT(!VN_ISKAS(ovp));
3258 3258 page_demote_vp_pages(opp);
3259 3259 ASSERT(opp->p_szc == 0);
3260 3260 }
3261 3261
3262 3262 page_hashout(opp, NULL);
3263 3263 PP_CLRAGED(opp);
3264 3264
3265 3265 /*
3266 3266 * Acquire the appropriate page hash lock, since
3267 3267 * we're going to rename the page.
3268 3268 */
3269 3269 index = PAGE_HASH_FUNC(vp, off);
3270 3270 phm = PAGE_HASH_MUTEX(index);
3271 3271 mutex_enter(phm);
3272 3272 top:
3273 3273 /*
3274 3274 * Look for an existing page with this name and destroy it if found.
3275 3275 * By holding the page hash lock all the way to the page_hashin()
3276 3276 * call, we are assured that no page can be created with this
3277 3277 * identity. In the case when the phm lock is dropped to undo any
3278 3278 * hat layer mappings, the existing page is held with an "exclusive"
3279 3279 * lock, again preventing another page from being created with
3280 3280 * this identity.
3281 3281 */
3282 3282 PAGE_HASH_SEARCH(index, pp, vp, off);
3283 3283 if (pp != NULL) {
3284 3284 VM_STAT_ADD(page_rename_exists);
3285 3285
3286 3286 /*
3287 3287 * As it turns out, this is one of only two places where
3288 3288 * page_lock() needs to hold the passed in lock in the
3289 3289 * successful case. In all of the others, the lock could
3290 3290 * be dropped as soon as the attempt is made to lock
3291 3291 		 * the page. It is tempting to add yet another argument,
3292 3292 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3293 3293 */
3294 3294 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3295 3295 /*
3296 3296 * Went to sleep because the page could not
3297 3297 * be locked. We were woken up when the page
3298 3298 * was unlocked, or when the page was destroyed.
3299 3299 * In either case, `phm' was dropped while we
3300 3300 * slept. Hence we should not just roar through
3301 3301 * this loop.
3302 3302 */
3303 3303 goto top;
3304 3304 }
3305 3305
3306 3306 /*
3307 3307 * If an existing page is a large page, then demote
3308 3308 * it to ensure that no "partial" large pages are
3309 3309 * "created" after page_rename. An existing page
3310 3310 * can be a CacheFS page, and can't belong to swapfs.
3311 3311 */
3312 3312 if (hat_page_is_mapped(pp)) {
3313 3313 /*
3314 3314 * Unload translations. Since we hold the
3315 3315 * exclusive lock on this page, the page
3316 3316 * can not be changed while we drop phm.
3317 3317 * This is also not a lock protocol violation,
3318 3318 * but rather the proper way to do things.
3319 3319 */
3320 3320 mutex_exit(phm);
3321 3321 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3322 3322 if (pp->p_szc != 0) {
3323 3323 ASSERT(!IS_SWAPFSVP(vp));
3324 3324 ASSERT(!VN_ISKAS(vp));
3325 3325 page_demote_vp_pages(pp);
3326 3326 ASSERT(pp->p_szc == 0);
3327 3327 }
3328 3328 mutex_enter(phm);
3329 3329 } else if (pp->p_szc != 0) {
3330 3330 ASSERT(!IS_SWAPFSVP(vp));
3331 3331 ASSERT(!VN_ISKAS(vp));
3332 3332 mutex_exit(phm);
3333 3333 page_demote_vp_pages(pp);
3334 3334 ASSERT(pp->p_szc == 0);
3335 3335 mutex_enter(phm);
3336 3336 }
3337 3337 page_hashout(pp, phm);
3338 3338 }
3339 3339 /*
3340 3340 * Hash in the page with the new identity.
3341 3341 */
3342 3342 if (!page_hashin(opp, vp, off, phm)) {
3343 3343 /*
3344 3344 * We were holding phm while we searched for [vp, off]
3345 3345 * and only dropped phm if we found and locked a page.
3346 3346 		 * If we can't create this page now, then something
3347 3347 * is really broken.
3348 3348 */
3349 3349 panic("page_rename: Can't hash in page: %p", (void *)pp);
3350 3350 /*NOTREACHED*/
3351 3351 }
3352 3352
3353 3353 ASSERT(MUTEX_HELD(phm));
3354 3354 mutex_exit(phm);
3355 3355
3356 3356 /*
3357 3357 * Now that we have dropped phm, lets get around to finishing up
3358 3358 * with pp.
3359 3359 */
3360 3360 if (pp != NULL) {
3361 3361 ASSERT(!hat_page_is_mapped(pp));
3362 3362 /* for now large pages should not end up here */
3363 3363 ASSERT(pp->p_szc == 0);
3364 3364 /*
3365 3365 * Save the locks for transfer to the new page and then
3366 3366 * clear them so page_free doesn't think they're important.
3367 3367 * The page_struct_lock need not be acquired for lckcnt and
3368 3368 * cowcnt since the page has an "exclusive" lock.
3369 3369 */
3370 3370 olckcnt = pp->p_lckcnt;
3371 3371 ocowcnt = pp->p_cowcnt;
3372 3372 pp->p_lckcnt = pp->p_cowcnt = 0;
3373 3373
3374 3374 /*
3375 3375 * Put the page on the "free" list after we drop
3376 3376 * the lock. The less work under the lock the better.
3377 3377 */
3378 3378 /*LINTED: constant in conditional context*/
3379 3379 VN_DISPOSE(pp, B_FREE, 0, kcred);
3380 3380 }
3381 3381
3382 3382 /*
3383 3383 * Transfer the lock count from the old page (if any).
3384 3384 * The page_struct_lock need not be acquired for lckcnt and
3385 3385 * cowcnt since the page has an "exclusive" lock.
3386 3386 */
3387 3387 opp->p_lckcnt += olckcnt;
3388 3388 opp->p_cowcnt += ocowcnt;
3389 3389 }
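
A hedged sketch of the anon-layer style of use described above, not from this file: move an exclusively held page from its current name to [nvp, noff], keeping its translations loaded; nvp and noff are hypothetical.

	ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
	page_rename(opp, nvp, noff);
	/* opp now answers to [nvp, noff]; any page that already had that name was freed */
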
3390 3390
3391 3391 /*
3392 3392 * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3393 3393 *
3394 3394 * Pages are normally inserted at the start of a vnode's v_pages list.
3395 3395 * If the vnode is VMODSORT and the page is modified, it goes at the end.
3396 3396 * This can happen when a modified page is relocated for DR.
3397 3397 *
3398 3398 * Returns 1 on success and 0 on failure.
3399 3399 */
3400 3400 static int
3401 3401 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3402 3402 {
3403 3403 page_t **listp;
3404 3404 page_t *tp;
3405 3405 ulong_t index;
3406 3406
3407 3407 ASSERT(PAGE_EXCL(pp));
3408 3408 ASSERT(vp != NULL);
3409 3409 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3410 3410
3411 3411 /*
3412 3412 * Be sure to set these up before the page is inserted on the hash
3413 3413 * list. As soon as the page is placed on the list some other
3414 3414 * thread might get confused and wonder how this page could
3415 3415 * possibly hash to this list.
3416 3416 */
3417 3417 pp->p_vnode = vp;
3418 3418 pp->p_offset = offset;
3419 3419
3420 3420 /*
3421 3421 * record if this page is on a swap vnode
3422 3422 */
3423 3423 if ((vp->v_flag & VISSWAP) != 0)
3424 3424 PP_SETSWAP(pp);
3425 3425
3426 3426 index = PAGE_HASH_FUNC(vp, offset);
3427 3427 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3428 3428 listp = &page_hash[index];
3429 3429
3430 3430 /*
3431 3431 * If this page is already hashed in, fail this attempt to add it.
3432 3432 */
3433 3433 for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3434 3434 if (tp->p_vnode == vp && tp->p_offset == offset) {
3435 3435 pp->p_vnode = NULL;
3436 3436 pp->p_offset = (u_offset_t)(-1);
3437 3437 return (0);
3438 3438 }
3439 3439 }
3440 3440 pp->p_hash = *listp;
3441 3441 *listp = pp;
3442 3442
3443 3443 /*
3444 3444 * Add the page to the vnode's list of pages
3445 3445 */
3446 3446 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3447 3447 listp = &vp->v_pages->p_vpprev->p_vpnext;
3448 3448 else
3449 3449 listp = &vp->v_pages;
3450 3450
3451 3451 page_vpadd(listp, pp);
3452 3452
3453 3453 return (1);
3454 3454 }
3455 3455
3456 3456 /*
3457 3457 * Add page `pp' to both the hash and vp chains for [vp, offset].
3458 3458 *
3459 3459 * Returns 1 on success and 0 on failure.
3460 3460 * If hold is passed in, it is not dropped.
3461 3461 */
3462 3462 int
3463 3463 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3464 3464 {
3465 3465 kmutex_t *phm = NULL;
3466 3466 kmutex_t *vphm;
3467 3467 int rc;
3468 3468
3469 3469 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3470 3470 ASSERT(pp->p_fsdata == 0 || panicstr);
3471 3471
3472 3472 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3473 3473 "page_hashin:pp %p vp %p offset %llx",
3474 3474 pp, vp, offset);
3475 3475
3476 3476 VM_STAT_ADD(hashin_count);
3477 3477
3478 3478 if (hold != NULL)
3479 3479 phm = hold;
3480 3480 else {
3481 3481 VM_STAT_ADD(hashin_not_held);
3482 3482 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3483 3483 mutex_enter(phm);
3484 3484 }
3485 3485
3486 3486 vphm = page_vnode_mutex(vp);
3487 3487 mutex_enter(vphm);
3488 3488 rc = page_do_hashin(pp, vp, offset);
3489 3489 mutex_exit(vphm);
3490 3490 if (hold == NULL)
3491 3491 mutex_exit(phm);
3492 3492 if (rc == 0)
3493 3493 VM_STAT_ADD(hashin_already);
3494 3494 return (rc);
3495 3495 }
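
/*
 * Editor's sketch (not part of the webrev): one plausible caller pattern for
 * page_hashin()/page_hashout() above, assuming an EXCL-locked page with no
 * current identity and kernel context; the helper name and error handling
 * are illustrative assumptions only.
 */
static int
example_set_identity(page_t *pp, vnode_t *vp, u_offset_t off)
{
	ASSERT(PAGE_EXCL(pp));

	/*
	 * No hash mutex is held by this caller, so pass NULL and let
	 * page_hashin() take and drop PAGE_HASH_MUTEX() itself.
	 */
	if (!page_hashin(pp, vp, off, NULL))
		return (0);		/* [vp, off] already hashed in */

	/* ... use the page under its new identity ... */

	/* Tear the identity down again; NULL means "take the mutex". */
	page_hashout(pp, NULL);
	return (1);
}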
3496 3496
3497 3497 /*
3498 3498 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3499 3499 * All mutexes must be held
3500 3500 */
3501 3501 static void
3502 3502 page_do_hashout(page_t *pp)
3503 3503 {
3504 3504 page_t **hpp;
3505 3505 page_t *hp;
3506 3506 vnode_t *vp = pp->p_vnode;
3507 3507
3508 3508 ASSERT(vp != NULL);
3509 3509 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3510 3510
3511 3511 /*
3512 3512 * First, take pp off of its hash chain.
3513 3513 */
3514 3514 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3515 3515
3516 3516 for (;;) {
3517 3517 hp = *hpp;
3518 3518 if (hp == pp)
3519 3519 break;
3520 3520 if (hp == NULL) {
3521 3521 panic("page_do_hashout");
3522 3522 /*NOTREACHED*/
3523 3523 }
3524 3524 hpp = &hp->p_hash;
3525 3525 }
3526 3526 *hpp = pp->p_hash;
3527 3527
3528 3528 /*
3529 3529 * Now remove it from its associated vnode.
3530 3530 */
3531 3531 if (vp->v_pages)
3532 3532 page_vpsub(&vp->v_pages, pp);
3533 3533
3534 3534 pp->p_hash = NULL;
3535 3535 page_clr_all_props(pp);
3536 3536 PP_CLRSWAP(pp);
3537 3537 pp->p_vnode = NULL;
3538 3538 pp->p_offset = (u_offset_t)-1;
3539 3539 pp->p_fsdata = 0;
3540 3540 }
3541 3541
3542 3542 /*
3543 3543 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3544 3544 *
3545 3545 * When `phm' is non-NULL it contains the address of the mutex protecting the
3546 3546 * hash list pp is on. It is not dropped.
3547 3547 */
3548 3548 void
3549 3549 page_hashout(page_t *pp, kmutex_t *phm)
3550 3550 {
3551 3551 vnode_t *vp;
3552 3552 ulong_t index;
3553 3553 kmutex_t *nphm;
3554 3554 kmutex_t *vphm;
3555 3555 kmutex_t *sep;
3556 3556
3557 3557 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3558 3558 ASSERT(pp->p_vnode != NULL);
3559 3559 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3560 3560 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3561 3561
3562 3562 vp = pp->p_vnode;
3563 3563
3564 3564 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3565 3565 "page_hashout:pp %p vp %p", pp, vp);
3566 3566
3567 3567 /* Kernel probe */
3568 3568 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3569 3569 tnf_opaque, vnode, vp,
3570 3570 tnf_offset, offset, pp->p_offset);
3571 3571
3572 3572 	/*
3573 3573 	 * Find the page's hash bucket; take its mutex if the caller did not.
3574 3574 	 */
3575 3575 VM_STAT_ADD(hashout_count);
3576 3576 index = PAGE_HASH_FUNC(vp, pp->p_offset);
3577 3577 if (phm == NULL) {
3578 3578 VM_STAT_ADD(hashout_not_held);
3579 3579 nphm = PAGE_HASH_MUTEX(index);
3580 3580 mutex_enter(nphm);
3581 3581 }
3582 3582 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3583 3583
3584 3584
3585 3585 /*
3586 3586 	 * grab the page vnode mutex and remove the page from its lists...
3587 3587 */
3588 3588 vphm = page_vnode_mutex(vp);
3589 3589 mutex_enter(vphm);
3590 3590
3591 3591 page_do_hashout(pp);
3592 3592
3593 3593 mutex_exit(vphm);
3594 3594 if (phm == NULL)
3595 3595 mutex_exit(nphm);
3596 3596
3597 3597 /*
3598 3598 * Wake up processes waiting for this page. The page's
3599 3599 * identity has been changed, and is probably not the
3600 3600 * desired page any longer.
3601 3601 */
3602 3602 sep = page_se_mutex(pp);
3603 3603 mutex_enter(sep);
3604 3604 pp->p_selock &= ~SE_EWANTED;
3605 3605 if (CV_HAS_WAITERS(&pp->p_cv))
3606 3606 cv_broadcast(&pp->p_cv);
3607 3607 mutex_exit(sep);
3608 3608 }
3609 3609
3610 3610 /*
3611 3611 * Add the page to the front of a linked list of pages
3612 3612 * using the p_next & p_prev pointers for the list.
3613 3613 * The caller is responsible for protecting the list pointers.
3614 3614 */
3615 3615 void
3616 3616 page_add(page_t **ppp, page_t *pp)
3617 3617 {
3618 3618 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3619 3619
3620 3620 page_add_common(ppp, pp);
3621 3621 }
3622 3622
3623 3623
3624 3624
3625 3625 /*
3626 3626 * Common code for page_add() and mach_page_add()
3627 3627 */
3628 3628 void
3629 3629 page_add_common(page_t **ppp, page_t *pp)
3630 3630 {
3631 3631 if (*ppp == NULL) {
3632 3632 pp->p_next = pp->p_prev = pp;
3633 3633 } else {
3634 3634 pp->p_next = *ppp;
3635 3635 pp->p_prev = (*ppp)->p_prev;
3636 3636 (*ppp)->p_prev = pp;
3637 3637 pp->p_prev->p_next = pp;
3638 3638 }
3639 3639 *ppp = pp;
3640 3640 }
3641 3641
3642 3642
3643 3643 /*
3644 3644 * Remove this page from a linked list of pages
3645 3645 * using the p_next & p_prev pointers for the list.
3646 3646 *
3647 3647 * The caller is responsible for protecting the list pointers.
3648 3648 */
3649 3649 void
3650 3650 page_sub(page_t **ppp, page_t *pp)
3651 3651 {
3652 3652 ASSERT((PP_ISFREE(pp)) ? 1 :
3653 3653 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3654 3654
3655 3655 if (*ppp == NULL || pp == NULL) {
3656 3656 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3657 3657 (void *)pp, (void *)(*ppp));
3658 3658 /*NOTREACHED*/
3659 3659 }
3660 3660
3661 3661 page_sub_common(ppp, pp);
3662 3662 }
3663 3663
3664 3664
3665 3665 /*
3666 3666 * Common code for page_sub() and mach_page_sub()
3667 3667 */
3668 3668 void
3669 3669 page_sub_common(page_t **ppp, page_t *pp)
3670 3670 {
3671 3671 if (*ppp == pp)
3672 3672 *ppp = pp->p_next; /* go to next page */
3673 3673
3674 3674 if (*ppp == pp)
3675 3675 *ppp = NULL; /* page list is gone */
3676 3676 else {
3677 3677 pp->p_prev->p_next = pp->p_next;
3678 3678 pp->p_next->p_prev = pp->p_prev;
3679 3679 }
3680 3680 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
3681 3681 }
3682 3682
3683 3683
3684 3684 /*
3685 3685  * Break page list oppp into two lists with npages in the first list.
3686 3686 * The tail is returned in nppp.
3687 3687 */
3688 3688 void
3689 3689 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3690 3690 {
3691 3691 page_t *s1pp = *oppp;
3692 3692 page_t *s2pp;
3693 3693 page_t *e1pp, *e2pp;
3694 3694 long n = 0;
3695 3695
3696 3696 if (s1pp == NULL) {
3697 3697 *nppp = NULL;
3698 3698 return;
3699 3699 }
3700 3700 if (npages == 0) {
3701 3701 *nppp = s1pp;
3702 3702 *oppp = NULL;
3703 3703 return;
3704 3704 }
3705 3705 for (n = 0, s2pp = *oppp; n < npages; n++) {
3706 3706 s2pp = s2pp->p_next;
3707 3707 }
3708 3708 /* Fix head and tail of new lists */
3709 3709 e1pp = s2pp->p_prev;
3710 3710 e2pp = s1pp->p_prev;
3711 3711 s1pp->p_prev = e1pp;
3712 3712 e1pp->p_next = s1pp;
3713 3713 s2pp->p_prev = e2pp;
3714 3714 e2pp->p_next = s2pp;
3715 3715
3716 3716 /* second list empty */
3717 3717 if (s2pp == s1pp) {
3718 3718 *oppp = s1pp;
3719 3719 *nppp = NULL;
3720 3720 } else {
3721 3721 *oppp = s1pp;
3722 3722 *nppp = s2pp;
3723 3723 }
3724 3724 }
3725 3725
3726 3726 /*
3727 3727 * Concatenate page list nppp onto the end of list ppp.
3728 3728 */
3729 3729 void
3730 3730 page_list_concat(page_t **ppp, page_t **nppp)
3731 3731 {
3732 3732 page_t *s1pp, *s2pp, *e1pp, *e2pp;
3733 3733
3734 3734 if (*nppp == NULL) {
3735 3735 return;
3736 3736 }
3737 3737 if (*ppp == NULL) {
3738 3738 *ppp = *nppp;
3739 3739 return;
3740 3740 }
3741 3741 s1pp = *ppp;
3742 3742 e1pp = s1pp->p_prev;
3743 3743 s2pp = *nppp;
3744 3744 e2pp = s2pp->p_prev;
3745 3745 s1pp->p_prev = e2pp;
3746 3746 e2pp->p_next = s1pp;
3747 3747 e1pp->p_next = s2pp;
3748 3748 s2pp->p_prev = e1pp;
3749 3749 }
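
/*
 * Editor's sketch (not part of the webrev): splitting the first npages off a
 * circular p_next/p_prev list with page_list_break() and gluing the tail
 * back on with page_list_concat().  Purely illustrative; "plist" is whatever
 * list the caller already owns and protects.
 */
static void
example_split_and_rejoin(page_t **plist, pgcnt_t npages)
{
	page_t *head = *plist;
	page_t *tail = NULL;

	page_list_break(&head, &tail, npages);	/* head keeps npages pages */

	/* ... operate on the pages in "head" ... */

	page_list_concat(&head, &tail);		/* tail rejoined at the end */
	*plist = head;
}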
3750 3750
3751 3751 /*
3752 3752 * return the next page in the page list
3753 3753 */
3754 3754 page_t *
3755 3755 page_list_next(page_t *pp)
3756 3756 {
3757 3757 return (pp->p_next);
3758 3758 }
3759 3759
3760 3760
3761 3761 /*
3762 3762 * Add the page to the front of the linked list of pages
3763 3763 * using p_vpnext/p_vpprev pointers for the list.
3764 3764 *
3765 3765 * The caller is responsible for protecting the lists.
3766 3766 */
3767 3767 void
3768 3768 page_vpadd(page_t **ppp, page_t *pp)
3769 3769 {
3770 3770 if (*ppp == NULL) {
3771 3771 pp->p_vpnext = pp->p_vpprev = pp;
3772 3772 } else {
3773 3773 pp->p_vpnext = *ppp;
3774 3774 pp->p_vpprev = (*ppp)->p_vpprev;
3775 3775 (*ppp)->p_vpprev = pp;
3776 3776 pp->p_vpprev->p_vpnext = pp;
3777 3777 }
3778 3778 *ppp = pp;
3779 3779 }
3780 3780
3781 3781 /*
3782 3782 * Remove this page from the linked list of pages
3783 3783 * using p_vpnext/p_vpprev pointers for the list.
3784 3784 *
3785 3785 * The caller is responsible for protecting the lists.
3786 3786 */
3787 3787 void
3788 3788 page_vpsub(page_t **ppp, page_t *pp)
3789 3789 {
3790 3790 if (*ppp == NULL || pp == NULL) {
3791 3791 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3792 3792 (void *)pp, (void *)(*ppp));
3793 3793 /*NOTREACHED*/
3794 3794 }
3795 3795
3796 3796 if (*ppp == pp)
3797 3797 *ppp = pp->p_vpnext; /* go to next page */
3798 3798
3799 3799 if (*ppp == pp)
3800 3800 *ppp = NULL; /* page list is gone */
3801 3801 else {
3802 3802 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3803 3803 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3804 3804 }
3805 3805 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3806 3806 }
3807 3807
3808 3808 /*
3809 3809 * Lock a physical page into memory "long term". Used to support "lock
3810 3810 * in memory" functions. Accepts the page to be locked, and a cow variable
3811 3811  * to indicate whether the lock will travel to the new page during
3812 3812 * a potential copy-on-write.
3813 3813 */
3814 3814 int
3815 3815 page_pp_lock(
3816 3816 page_t *pp, /* page to be locked */
3817 3817 int cow, /* cow lock */
3818 3818 int kernel) /* must succeed -- ignore checking */
3819 3819 {
3820 3820 int r = 0; /* result -- assume failure */
3821 3821
3822 3822 ASSERT(PAGE_LOCKED(pp));
3823 3823
3824 3824 page_struct_lock(pp);
3825 3825 /*
3826 3826 * Acquire the "freemem_lock" for availrmem.
3827 3827 */
3828 3828 if (cow) {
3829 3829 mutex_enter(&freemem_lock);
3830 3830 if ((availrmem > pages_pp_maximum) &&
3831 3831 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3832 3832 availrmem--;
3833 3833 pages_locked++;
3834 3834 mutex_exit(&freemem_lock);
3835 3835 r = 1;
3836 3836 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3837 3837 cmn_err(CE_WARN,
3838 3838 "COW lock limit reached on pfn 0x%lx",
3839 3839 page_pptonum(pp));
3840 3840 }
3841 3841 } else
3842 3842 mutex_exit(&freemem_lock);
3843 3843 } else {
3844 3844 if (pp->p_lckcnt) {
3845 3845 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3846 3846 r = 1;
3847 3847 if (++pp->p_lckcnt ==
3848 3848 (ushort_t)PAGE_LOCK_MAXIMUM) {
3849 3849 cmn_err(CE_WARN, "Page lock limit "
3850 3850 "reached on pfn 0x%lx",
3851 3851 page_pptonum(pp));
3852 3852 }
3853 3853 }
3854 3854 } else {
3855 3855 if (kernel) {
3856 3856 /* availrmem accounting done by caller */
3857 3857 ++pp->p_lckcnt;
3858 3858 r = 1;
3859 3859 } else {
3860 3860 mutex_enter(&freemem_lock);
3861 3861 if (availrmem > pages_pp_maximum) {
3862 3862 availrmem--;
3863 3863 pages_locked++;
3864 3864 ++pp->p_lckcnt;
3865 3865 r = 1;
3866 3866 }
3867 3867 mutex_exit(&freemem_lock);
3868 3868 }
3869 3869 }
3870 3870 }
3871 3871 page_struct_unlock(pp);
3872 3872 return (r);
3873 3873 }
3874 3874
3875 3875 /*
3876 3876 * Decommit a lock on a physical page frame. Account for cow locks if
3877 3877 * appropriate.
3878 3878 */
3879 3879 void
3880 3880 page_pp_unlock(
3881 3881 page_t *pp, /* page to be unlocked */
3882 3882 int cow, /* expect cow lock */
3883 3883 int kernel) /* this was a kernel lock */
3884 3884 {
3885 3885 ASSERT(PAGE_LOCKED(pp));
3886 3886
3887 3887 page_struct_lock(pp);
3888 3888 /*
3889 3889 * Acquire the "freemem_lock" for availrmem.
3890 3890  * If cowcnt or lckcnt is already 0, do nothing; i.e., we
3891 3891 * could be called to unlock even if nothing is locked. This could
3892 3892 * happen if locked file pages were truncated (removing the lock)
3893 3893 * and the file was grown again and new pages faulted in; the new
3894 3894 * pages are unlocked but the segment still thinks they're locked.
3895 3895 */
3896 3896 if (cow) {
3897 3897 if (pp->p_cowcnt) {
3898 3898 mutex_enter(&freemem_lock);
3899 3899 pp->p_cowcnt--;
3900 3900 availrmem++;
3901 3901 pages_locked--;
3902 3902 mutex_exit(&freemem_lock);
3903 3903 }
3904 3904 } else {
3905 3905 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3906 3906 if (!kernel) {
3907 3907 mutex_enter(&freemem_lock);
3908 3908 availrmem++;
3909 3909 pages_locked--;
3910 3910 mutex_exit(&freemem_lock);
3911 3911 }
3912 3912 }
3913 3913 }
3914 3914 page_struct_unlock(pp);
3915 3915 }
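
/*
 * Editor's sketch (not part of the webrev): page_pp_lock() and
 * page_pp_unlock() are expected to be called with matching "cow" and
 * "kernel" arguments.  This hypothetical wrapper only shows the pairing for
 * a non-COW, non-kernel long-term lock.
 */
static int
example_lock_long_term(page_t *pp)
{
	ASSERT(PAGE_LOCKED(pp));

	if (!page_pp_lock(pp, 0, 0))	/* no cow transfer, honor availrmem */
		return (0);

	/* ... page stays locked in memory while it is in use ... */

	page_pp_unlock(pp, 0, 0);	/* undo with the same cow/kernel args */
	return (1);
}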
3916 3916
3917 3917 /*
3918 3918 * This routine reserves availrmem for npages;
3919 3919 * flags: KM_NOSLEEP or KM_SLEEP
3920 3920 * returns 1 on success or 0 on failure
3921 3921 */
3922 3922 int
3923 3923 page_resv(pgcnt_t npages, uint_t flags)
3924 3924 {
3925 3925 mutex_enter(&freemem_lock);
3926 3926 while (availrmem < tune.t_minarmem + npages) {
3927 3927 if (flags & KM_NOSLEEP) {
3928 3928 mutex_exit(&freemem_lock);
3929 3929 return (0);
3930 3930 }
3931 3931 mutex_exit(&freemem_lock);
3932 3932 page_needfree(npages);
3933 3933 kmem_reap();
3934 3934 delay(hz >> 2);
3935 3935 page_needfree(-(spgcnt_t)npages);
3936 3936 mutex_enter(&freemem_lock);
3937 3937 }
3938 3938 availrmem -= npages;
3939 3939 mutex_exit(&freemem_lock);
3940 3940 return (1);
3941 3941 }
3942 3942
3943 3943 /*
3944 3944 * This routine unreserves availrmem for npages;
3945 3945 */
3946 3946 void
3947 3947 page_unresv(pgcnt_t npages)
3948 3948 {
3949 3949 mutex_enter(&freemem_lock);
3950 3950 availrmem += npages;
3951 3951 mutex_exit(&freemem_lock);
3952 3952 }
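
/*
 * Editor's sketch (not part of the webrev): reserving availrmem up front with
 * page_resv() and releasing it again with page_unresv().  KM_NOSLEEP keeps
 * the caller from blocking; KM_SLEEP would retry until the reservation
 * succeeds.  The error value is an illustrative choice.
 */
static int
example_reserve(pgcnt_t npages)
{
	if (page_resv(npages, KM_NOSLEEP) == 0)
		return (ENOMEM);	/* not enough unreserved memory */

	/* ... allocate and use the pages ... */

	page_unresv(npages);		/* give the reservation back */
	return (0);
}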
3953 3953
3954 3954 /*
3955 3955 * See Statement at the beginning of segvn_lockop() regarding
3956 3956 * the way we handle cowcnts and lckcnts.
3957 3957 *
3958 3958 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3959 3959 * that breaks COW has PROT_WRITE.
3960 3960 *
3961 3961  * Note that we may also break COW in case we are softlocking
3962 3962 * on read access during physio;
3963 3963 * in this softlock case, the vpage may not have PROT_WRITE.
3964 3964 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3965 3965 * if the vpage doesn't have PROT_WRITE.
3966 3966 *
3967 3967 * This routine is never called if we are stealing a page
3968 3968 * in anon_private.
3969 3969 *
3970 3970  * The caller subtracted from availrmem for a read-only mapping;
3971 3971  * if lckcnt is 1, increment availrmem.
3972 3972 */
3973 3973 void
3974 3974 page_pp_useclaim(
3975 3975 page_t *opp, /* original page frame losing lock */
3976 3976 page_t *npp, /* new page frame gaining lock */
3977 3977 uint_t write_perm) /* set if vpage has PROT_WRITE */
3978 3978 {
3979 3979 int payback = 0;
3980 3980 int nidx, oidx;
3981 3981
3982 3982 ASSERT(PAGE_LOCKED(opp));
3983 3983 ASSERT(PAGE_LOCKED(npp));
3984 3984
3985 3985 /*
3986 3986 * Since we have two pages we probably have two locks. We need to take
3987 3987 * them in a defined order to avoid deadlocks. It's also possible they
3988 3988 * both hash to the same lock in which case this is a non-issue.
3989 3989 */
3990 3990 nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3991 3991 oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3992 3992 if (nidx < oidx) {
3993 3993 page_struct_lock(npp);
3994 3994 page_struct_lock(opp);
3995 3995 } else if (oidx < nidx) {
3996 3996 page_struct_lock(opp);
3997 3997 page_struct_lock(npp);
3998 3998 } else { /* The pages hash to the same lock */
3999 3999 page_struct_lock(npp);
4000 4000 }
4001 4001
4002 4002 ASSERT(npp->p_cowcnt == 0);
4003 4003 ASSERT(npp->p_lckcnt == 0);
4004 4004
4005 4005 /* Don't use claim if nothing is locked (see page_pp_unlock above) */
4006 4006 if ((write_perm && opp->p_cowcnt != 0) ||
4007 4007 (!write_perm && opp->p_lckcnt != 0)) {
4008 4008
4009 4009 if (write_perm) {
4010 4010 npp->p_cowcnt++;
4011 4011 ASSERT(opp->p_cowcnt != 0);
4012 4012 opp->p_cowcnt--;
4013 4013 } else {
4014 4014
4015 4015 ASSERT(opp->p_lckcnt != 0);
4016 4016
4017 4017 /*
4018 4018 			 * We didn't need availrmem decremented if p_lckcnt on the
4019 4019 			 * original page is 1. Here, we are unlocking the
4020 4020 			 * read-only copy belonging to the original page and
4021 4021 			 * locking a copy belonging to the new page.
4022 4022 */
4023 4023 if (opp->p_lckcnt == 1)
4024 4024 payback = 1;
4025 4025
4026 4026 npp->p_lckcnt++;
4027 4027 opp->p_lckcnt--;
4028 4028 }
4029 4029 }
4030 4030 if (payback) {
4031 4031 mutex_enter(&freemem_lock);
4032 4032 availrmem++;
4033 4033 pages_useclaim--;
4034 4034 mutex_exit(&freemem_lock);
4035 4035 }
4036 4036
4037 4037 if (nidx < oidx) {
4038 4038 page_struct_unlock(opp);
4039 4039 page_struct_unlock(npp);
4040 4040 } else if (oidx < nidx) {
4041 4041 page_struct_unlock(npp);
4042 4042 page_struct_unlock(opp);
4043 4043 } else { /* The pages hash to the same lock */
4044 4044 page_struct_unlock(npp);
4045 4045 }
4046 4046 }
4047 4047
4048 4048 /*
4049 4049 * Simple claim adjust functions -- used to support changes in
4050 4050 * claims due to changes in access permissions. Used by segvn_setprot().
4051 4051 */
4052 4052 int
4053 4053 page_addclaim(page_t *pp)
4054 4054 {
4055 4055 int r = 0; /* result */
4056 4056
4057 4057 ASSERT(PAGE_LOCKED(pp));
4058 4058
4059 4059 page_struct_lock(pp);
4060 4060 ASSERT(pp->p_lckcnt != 0);
4061 4061
4062 4062 if (pp->p_lckcnt == 1) {
4063 4063 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4064 4064 --pp->p_lckcnt;
4065 4065 r = 1;
4066 4066 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4067 4067 cmn_err(CE_WARN,
4068 4068 "COW lock limit reached on pfn 0x%lx",
4069 4069 page_pptonum(pp));
4070 4070 }
4071 4071 }
4072 4072 } else {
4073 4073 mutex_enter(&freemem_lock);
4074 4074 if ((availrmem > pages_pp_maximum) &&
4075 4075 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4076 4076 --availrmem;
4077 4077 ++pages_claimed;
4078 4078 mutex_exit(&freemem_lock);
4079 4079 --pp->p_lckcnt;
4080 4080 r = 1;
4081 4081 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4082 4082 cmn_err(CE_WARN,
4083 4083 "COW lock limit reached on pfn 0x%lx",
4084 4084 page_pptonum(pp));
4085 4085 }
4086 4086 } else
4087 4087 mutex_exit(&freemem_lock);
4088 4088 }
4089 4089 page_struct_unlock(pp);
4090 4090 return (r);
4091 4091 }
4092 4092
4093 4093 int
4094 4094 page_subclaim(page_t *pp)
4095 4095 {
4096 4096 int r = 0;
4097 4097
4098 4098 ASSERT(PAGE_LOCKED(pp));
4099 4099
4100 4100 page_struct_lock(pp);
4101 4101 ASSERT(pp->p_cowcnt != 0);
4102 4102
4103 4103 if (pp->p_lckcnt) {
4104 4104 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4105 4105 r = 1;
4106 4106 /*
4107 4107 * for availrmem
4108 4108 */
4109 4109 mutex_enter(&freemem_lock);
4110 4110 availrmem++;
4111 4111 pages_claimed--;
4112 4112 mutex_exit(&freemem_lock);
4113 4113
4114 4114 pp->p_cowcnt--;
4115 4115
4116 4116 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4117 4117 cmn_err(CE_WARN,
4118 4118 "Page lock limit reached on pfn 0x%lx",
4119 4119 page_pptonum(pp));
4120 4120 }
4121 4121 }
4122 4122 } else {
4123 4123 r = 1;
4124 4124 pp->p_cowcnt--;
4125 4125 pp->p_lckcnt++;
4126 4126 }
4127 4127 page_struct_unlock(pp);
4128 4128 return (r);
4129 4129 }
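
/*
 * Editor's sketch (not part of the webrev): a setprot-style caller flipping a
 * single page's claim between lckcnt and cowcnt.  This is hypothetical; a
 * real caller must know the page already has the appropriate non-zero count
 * and must check the return values, which are ignored here for brevity.
 */
static void
example_setprot_flip(page_t *pp, int make_writable)
{
	ASSERT(PAGE_LOCKED(pp));

	if (make_writable)
		(void) page_addclaim(pp);	/* lckcnt -> cowcnt */
	else
		(void) page_subclaim(pp);	/* cowcnt -> lckcnt */
}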
4130 4130
4131 4131 /*
4132 4132 * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4133 4133 * page.
4134 4134 */
4135 4135 int
4136 4136 page_addclaim_pages(page_t **ppa)
4137 4137 {
4138 4138 pgcnt_t lckpgs = 0, pg_idx;
4139 4139
4140 4140 VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4141 4141
4142 4142 /*
4143 4143 * Only need to take the page struct lock on the large page root.
4144 4144 */
4145 4145 page_struct_lock(ppa[0]);
4146 4146 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4147 4147
4148 4148 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4149 4149 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4150 4150 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4151 4151 page_struct_unlock(ppa[0]);
4152 4152 return (0);
4153 4153 }
4154 4154 if (ppa[pg_idx]->p_lckcnt > 1)
4155 4155 lckpgs++;
4156 4156 }
4157 4157
4158 4158 if (lckpgs != 0) {
4159 4159 mutex_enter(&freemem_lock);
4160 4160 if (availrmem >= pages_pp_maximum + lckpgs) {
4161 4161 availrmem -= lckpgs;
4162 4162 pages_claimed += lckpgs;
4163 4163 } else {
4164 4164 mutex_exit(&freemem_lock);
4165 4165 page_struct_unlock(ppa[0]);
4166 4166 return (0);
4167 4167 }
4168 4168 mutex_exit(&freemem_lock);
4169 4169 }
4170 4170
4171 4171 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4172 4172 ppa[pg_idx]->p_lckcnt--;
4173 4173 ppa[pg_idx]->p_cowcnt++;
4174 4174 }
4175 4175 page_struct_unlock(ppa[0]);
4176 4176 return (1);
4177 4177 }
4178 4178
4179 4179 /*
4180 4180 * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4181 4181 * page.
4182 4182 */
4183 4183 int
4184 4184 page_subclaim_pages(page_t **ppa)
4185 4185 {
4186 4186 pgcnt_t ulckpgs = 0, pg_idx;
4187 4187
4188 4188 VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4189 4189
4190 4190 /*
4191 4191 * Only need to take the page struct lock on the large page root.
4192 4192 */
4193 4193 page_struct_lock(ppa[0]);
4194 4194 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4195 4195
4196 4196 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4197 4197 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4198 4198 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4199 4199 page_struct_unlock(ppa[0]);
4200 4200 return (0);
4201 4201 }
4202 4202 if (ppa[pg_idx]->p_lckcnt != 0)
4203 4203 ulckpgs++;
4204 4204 }
4205 4205
4206 4206 if (ulckpgs != 0) {
4207 4207 mutex_enter(&freemem_lock);
4208 4208 availrmem += ulckpgs;
4209 4209 pages_claimed -= ulckpgs;
4210 4210 mutex_exit(&freemem_lock);
4211 4211 }
4212 4212
4213 4213 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4214 4214 ppa[pg_idx]->p_cowcnt--;
4215 4215 ppa[pg_idx]->p_lckcnt++;
4216 4216
4217 4217 }
4218 4218 page_struct_unlock(ppa[0]);
4219 4219 return (1);
4220 4220 }
4221 4221
4222 4222 page_t *
4223 4223 page_numtopp(pfn_t pfnum, se_t se)
4224 4224 {
4225 4225 page_t *pp;
4226 4226
4227 4227 retry:
4228 4228 pp = page_numtopp_nolock(pfnum);
4229 4229 if (pp == NULL) {
4230 4230 return ((page_t *)NULL);
4231 4231 }
4232 4232
4233 4233 /*
4234 4234 * Acquire the appropriate lock on the page.
4235 4235 */
4236 4236 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4237 4237 if (page_pptonum(pp) != pfnum)
4238 4238 goto retry;
4239 4239 continue;
4240 4240 }
4241 4241
4242 4242 if (page_pptonum(pp) != pfnum) {
4243 4243 page_unlock(pp);
4244 4244 goto retry;
4245 4245 }
4246 4246
4247 4247 return (pp);
4248 4248 }
4249 4249
4250 4250 page_t *
4251 4251 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4252 4252 {
4253 4253 page_t *pp;
4254 4254
4255 4255 retry:
4256 4256 pp = page_numtopp_nolock(pfnum);
4257 4257 if (pp == NULL) {
4258 4258 return ((page_t *)NULL);
4259 4259 }
4260 4260
4261 4261 /*
4262 4262 * Acquire the appropriate lock on the page.
4263 4263 */
4264 4264 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4265 4265 if (page_pptonum(pp) != pfnum)
4266 4266 goto retry;
4267 4267 continue;
4268 4268 }
4269 4269
4270 4270 if (page_pptonum(pp) != pfnum) {
4271 4271 page_unlock(pp);
4272 4272 goto retry;
4273 4273 }
4274 4274
4275 4275 return (pp);
4276 4276 }
4277 4277
4278 4278 /*
4279 4279  * This routine is like page_numtopp, but it only returns page structs
4280 4280  * for pages that are OK for loading into hardware, and it never blocks.
4281 4281 */
4282 4282 page_t *
4283 4283 page_numtopp_nowait(pfn_t pfnum, se_t se)
4284 4284 {
4285 4285 page_t *pp;
4286 4286
4287 4287 retry:
4288 4288 pp = page_numtopp_nolock(pfnum);
4289 4289 if (pp == NULL) {
4290 4290 return ((page_t *)NULL);
4291 4291 }
4292 4292
4293 4293 /*
4294 4294 * Try to acquire the appropriate lock on the page.
4295 4295 */
4296 4296 if (PP_ISFREE(pp))
4297 4297 pp = NULL;
4298 4298 else {
4299 4299 if (!page_trylock(pp, se))
4300 4300 pp = NULL;
4301 4301 else {
4302 4302 if (page_pptonum(pp) != pfnum) {
4303 4303 page_unlock(pp);
4304 4304 goto retry;
4305 4305 }
4306 4306 if (PP_ISFREE(pp)) {
4307 4307 page_unlock(pp);
4308 4308 pp = NULL;
4309 4309 }
4310 4310 }
4311 4311 }
4312 4312 return (pp);
4313 4313 }
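
/*
 * Editor's sketch (not part of the webrev): the three pfn-to-page lookup
 * flavors above differ only in how hard they try for the page lock.  A
 * hypothetical caller that must not block might do the following.
 */
static int
example_touch_pfn(pfn_t pfn)
{
	page_t *pp;

	pp = page_numtopp_nowait(pfn, SE_EXCL);	/* NULL if busy or free */
	if (pp == NULL)
		return (0);

	/* ... the caller holds the exclusive lock on pp here ... */

	page_unlock(pp);
	return (1);
}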
4314 4314
4315 4315 #define SYNC_PROGRESS_NPAGES 1000
4316 4316
4317 4317 /*
4318 4318 * Returns a count of dirty pages that are in the process
4319 4319 * of being written out. If 'cleanit' is set, try to push the page.
4320 4320 */
4321 4321 pgcnt_t
4322 4322 page_busy(int cleanit)
4323 4323 {
4324 4324 page_t *page0 = page_first();
4325 4325 page_t *pp = page0;
4326 4326 pgcnt_t nppbusy = 0;
4327 4327 int counter = 0;
4328 4328 u_offset_t off;
4329 4329
4330 4330 do {
4331 4331 vnode_t *vp = pp->p_vnode;
4332 4332
4333 4333 /*
4334 4334 * Reset the sync timeout. The page list is very long
4335 4335 * on large memory systems.
4336 4336 */
4337 4337 if (++counter > SYNC_PROGRESS_NPAGES) {
4338 4338 counter = 0;
4339 4339 vfs_syncprogress();
4340 4340 }
4341 4341
4342 4342 /*
4343 4343 * A page is a candidate for syncing if it is:
4344 4344 *
4345 4345 * (a) On neither the freelist nor the cachelist
4346 4346 * (b) Hashed onto a vnode
4347 4347 * (c) Not a kernel page
4348 4348 * (d) Dirty
4349 4349 * (e) Not part of a swapfile
4350 4350 		 * (f) Associated with a real vnode, i.e., one with a non-null
4351 4351 		 *     v_vfsp pointer.
4352 4352 * (g) Backed by a filesystem which doesn't have a
4353 4353 * stubbed-out sync operation
4354 4354 */
4355 4355 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4356 4356 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4357 4357 vfs_can_sync(vp->v_vfsp)) {
4358 4358 nppbusy++;
4359 4359
4360 4360 if (!cleanit)
4361 4361 continue;
4362 4362 if (!page_trylock(pp, SE_EXCL))
4363 4363 continue;
4364 4364
4365 4365 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4366 4366 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4367 4367 !(hat_pagesync(pp,
4368 4368 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4369 4369 page_unlock(pp);
4370 4370 continue;
4371 4371 }
4372 4372 off = pp->p_offset;
4373 4373 VN_HOLD(vp);
4374 4374 page_unlock(pp);
4375 4375 (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4376 4376 B_ASYNC | B_FREE, kcred, NULL);
4377 4377 VN_RELE(vp);
4378 4378 }
4379 4379 } while ((pp = page_next(pp)) != page0);
4380 4380
4381 4381 vfs_syncprogress();
4382 4382 return (nppbusy);
4383 4383 }
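
/*
 * Editor's sketch (not part of the webrev): page_busy(0) only counts the
 * dirty candidate pages, page_busy(1) also tries to push them.  A
 * hypothetical sync-style loop might poll until the count drains or it
 * gives up; the retry count and delay are illustrative.
 */
static void
example_sync_wait(int max_tries)
{
	int i;

	for (i = 0; i < max_tries; i++) {
		if (page_busy(1) == 0)	/* push and recount dirty pages */
			break;
		delay(hz);		/* give the async writes some time */
	}
}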
4384 4384
4385 4385 void page_invalidate_pages(void);
4386 4386
4387 4387 /*
4388 4388 * callback handler to vm sub-system
4389 4389 *
4390 4390  * Callers must ensure there are no recursive entries to this function.
4391 4391 */
4392 4392 /*ARGSUSED*/
4393 4393 boolean_t
4394 4394 callb_vm_cpr(void *arg, int code)
4395 4395 {
4396 4396 if (code == CB_CODE_CPR_CHKPT)
4397 4397 page_invalidate_pages();
4398 4398 return (B_TRUE);
4399 4399 }
4400 4400
4401 4401 /*
4402 4402 * Invalidate all pages of the system.
4403 4403  * It shouldn't be called until all user page activity has stopped.
4404 4404 */
4405 4405 void
4406 4406 page_invalidate_pages()
4407 4407 {
4408 4408 page_t *pp;
4409 4409 page_t *page0;
4410 4410 pgcnt_t nbusypages;
4411 4411 int retry = 0;
4412 4412 const int MAXRETRIES = 4;
4413 4413 top:
4414 4414 /*
4415 4415 * Flush dirty pages and destroy the clean ones.
4416 4416 */
4417 4417 nbusypages = 0;
4418 4418
4419 4419 pp = page0 = page_first();
4420 4420 do {
4421 4421 struct vnode *vp;
4422 4422 u_offset_t offset;
4423 4423 int mod;
4424 4424
4425 4425 /*
4426 4426 		 * skip the page if it has no vnode, or if it is associated
4427 4427 		 * with the kernel vnode or prom-allocated kernel memory.
4428 4428 */
4429 4429 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4430 4430 continue;
4431 4431
4432 4432 /*
4433 4433 		 * skip the page if it has already been freed and invalidated.
4434 4434 */
4435 4435 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4436 4436 continue;
4437 4437
4438 4438 /*
4439 4439 * skip pages that are already locked or can't be "exclusively"
4440 4440 * locked or are already free. After we lock the page, check
4441 4441 * the free and age bits again to be sure it's not destroyed
4442 4442 * yet.
4443 4443 * To achieve max. parallelization, we use page_trylock instead
4444 4444 		 * of page_lock so that we don't block on individual pages
4445 4445 * while we have thousands of other pages to process.
4446 4446 */
4447 4447 if (!page_trylock(pp, SE_EXCL)) {
4448 4448 nbusypages++;
4449 4449 continue;
4450 4450 } else if (PP_ISFREE(pp)) {
4451 4451 if (!PP_ISAGED(pp)) {
4452 4452 page_destroy_free(pp);
4453 4453 } else {
4454 4454 page_unlock(pp);
4455 4455 }
4456 4456 continue;
4457 4457 }
4458 4458 /*
4459 4459 * Is this page involved in some I/O? shared?
4460 4460 *
4461 4461 * The page_struct_lock need not be acquired to
4462 4462 * examine these fields since the page has an
4463 4463 * "exclusive" lock.
4464 4464 */
4465 4465 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4466 4466 page_unlock(pp);
4467 4467 continue;
4468 4468 }
4469 4469
4470 4470 if (vp->v_type == VCHR) {
4471 4471 panic("vp->v_type == VCHR");
4472 4472 /*NOTREACHED*/
4473 4473 }
4474 4474
4475 4475 if (!page_try_demote_pages(pp)) {
4476 4476 page_unlock(pp);
4477 4477 continue;
4478 4478 }
4479 4479
4480 4480 /*
4481 4481 * Check the modified bit. Leave the bits alone in hardware
4482 4482 * (they will be modified if we do the putpage).
4483 4483 */
4484 4484 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4485 4485 & P_MOD);
4486 4486 if (mod) {
4487 4487 offset = pp->p_offset;
4488 4488 /*
4489 4489 * Hold the vnode before releasing the page lock
4490 4490 * to prevent it from being freed and re-used by
4491 4491 * some other thread.
4492 4492 */
4493 4493 VN_HOLD(vp);
4494 4494 page_unlock(pp);
4495 4495 /*
4496 4496 * No error return is checked here. Callers such as
4497 4497 			 * cpr deal with the dirty pages at dump time
4498 4498 * if this putpage fails.
4499 4499 */
4500 4500 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4501 4501 kcred, NULL);
4502 4502 VN_RELE(vp);
4503 4503 } else {
4504 4504 /*LINTED: constant in conditional context*/
4505 4505 VN_DISPOSE(pp, B_INVAL, 0, kcred);
4506 4506 }
4507 4507 } while ((pp = page_next(pp)) != page0);
4508 4508 if (nbusypages && retry++ < MAXRETRIES) {
4509 4509 delay(1);
4510 4510 goto top;
4511 4511 }
4512 4512 }
4513 4513
4514 4514 /*
4515 4515 * Replace the page "old" with the page "new" on the page hash and vnode lists
4516 4516 *
4517 4517  * The replacement must be done in place, i.e., the equivalent sequence:
4518 4518 *
4519 4519 * vp = old->p_vnode;
4520 4520 * off = old->p_offset;
4521 4521 * page_do_hashout(old)
4522 4522 * page_do_hashin(new, vp, off)
4523 4523 *
4524 4524 * doesn't work, since
4525 4525 * 1) if old is the only page on the vnode, the v_pages list has a window
4526 4526 * where it looks empty. This will break file system assumptions.
4527 4527 * and
4528 4528 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4529 4529 */
4530 4530 static void
4531 4531 page_do_relocate_hash(page_t *new, page_t *old)
4532 4532 {
4533 4533 page_t **hash_list;
4534 4534 vnode_t *vp = old->p_vnode;
4535 4535 kmutex_t *sep;
4536 4536
4537 4537 ASSERT(PAGE_EXCL(old));
4538 4538 ASSERT(PAGE_EXCL(new));
4539 4539 ASSERT(vp != NULL);
4540 4540 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4541 4541 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4542 4542
4543 4543 /*
4544 4544 * First find old page on the page hash list
4545 4545 */
4546 4546 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4547 4547
4548 4548 for (;;) {
4549 4549 if (*hash_list == old)
4550 4550 break;
4551 4551 if (*hash_list == NULL) {
4552 4552 			panic("page_do_relocate_hash");
4553 4553 /*NOTREACHED*/
4554 4554 }
4555 4555 hash_list = &(*hash_list)->p_hash;
4556 4556 }
4557 4557
4558 4558 /*
4559 4559 * update new and replace old with new on the page hash list
4560 4560 */
4561 4561 new->p_vnode = old->p_vnode;
4562 4562 new->p_offset = old->p_offset;
4563 4563 new->p_hash = old->p_hash;
4564 4564 *hash_list = new;
4565 4565
4566 4566 if ((new->p_vnode->v_flag & VISSWAP) != 0)
4567 4567 PP_SETSWAP(new);
4568 4568
4569 4569 /*
4570 4570 * replace old with new on the vnode's page list
4571 4571 */
4572 4572 if (old->p_vpnext == old) {
4573 4573 new->p_vpnext = new;
4574 4574 new->p_vpprev = new;
4575 4575 } else {
4576 4576 new->p_vpnext = old->p_vpnext;
4577 4577 new->p_vpprev = old->p_vpprev;
4578 4578 new->p_vpnext->p_vpprev = new;
4579 4579 new->p_vpprev->p_vpnext = new;
4580 4580 }
4581 4581 if (vp->v_pages == old)
4582 4582 vp->v_pages = new;
4583 4583
4584 4584 /*
4585 4585 * clear out the old page
4586 4586 */
4587 4587 old->p_hash = NULL;
4588 4588 old->p_vpnext = NULL;
4589 4589 old->p_vpprev = NULL;
4590 4590 old->p_vnode = NULL;
4591 4591 PP_CLRSWAP(old);
4592 4592 old->p_offset = (u_offset_t)-1;
4593 4593 page_clr_all_props(old);
4594 4594
4595 4595 /*
4596 4596 * Wake up processes waiting for this page. The page's
4597 4597 * identity has been changed, and is probably not the
4598 4598 * desired page any longer.
4599 4599 */
4600 4600 sep = page_se_mutex(old);
4601 4601 mutex_enter(sep);
4602 4602 old->p_selock &= ~SE_EWANTED;
4603 4603 if (CV_HAS_WAITERS(&old->p_cv))
4604 4604 cv_broadcast(&old->p_cv);
4605 4605 mutex_exit(sep);
4606 4606 }
4607 4607
4608 4608 /*
4609 4609 * This function moves the identity of page "pp_old" to page "pp_new".
4610 4610 * Both pages must be locked on entry. "pp_new" is free, has no identity,
4611 4611 * and need not be hashed out from anywhere.
4612 4612 */
4613 4613 void
4614 4614 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4615 4615 {
4616 4616 vnode_t *vp = pp_old->p_vnode;
4617 4617 u_offset_t off = pp_old->p_offset;
4618 4618 kmutex_t *phm, *vphm;
4619 4619
4620 4620 /*
4621 4621 * Rehash two pages
4622 4622 */
4623 4623 ASSERT(PAGE_EXCL(pp_old));
4624 4624 ASSERT(PAGE_EXCL(pp_new));
4625 4625 ASSERT(vp != NULL);
4626 4626 ASSERT(pp_new->p_vnode == NULL);
4627 4627
4628 4628 /*
4629 4629 * hashout then hashin while holding the mutexes
4630 4630 */
4631 4631 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4632 4632 mutex_enter(phm);
4633 4633 vphm = page_vnode_mutex(vp);
4634 4634 mutex_enter(vphm);
4635 4635
4636 4636 page_do_relocate_hash(pp_new, pp_old);
4637 4637
4638 4638 /* The following comment preserved from page_flip(). */
4639 4639 pp_new->p_fsdata = pp_old->p_fsdata;
4640 4640 pp_old->p_fsdata = 0;
4641 4641 mutex_exit(vphm);
4642 4642 mutex_exit(phm);
4643 4643
4644 4644 /*
4645 4645 * The page_struct_lock need not be acquired for lckcnt and
4646 4646 * cowcnt since the page has an "exclusive" lock.
4647 4647 */
4648 4648 ASSERT(pp_new->p_lckcnt == 0);
4649 4649 ASSERT(pp_new->p_cowcnt == 0);
4650 4650 pp_new->p_lckcnt = pp_old->p_lckcnt;
4651 4651 pp_new->p_cowcnt = pp_old->p_cowcnt;
4652 4652 pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4653 4653
4654 4654 }
4655 4655
4656 4656 /*
4657 4657 * Helper routine used to lock all remaining members of a
4658 4658 * large page. The caller is responsible for passing in a locked
4659 4659 * pp. If pp is a large page, then it succeeds in locking all the
4660 4660 * remaining constituent pages or it returns with only the
4661 4661 * original page locked.
4662 4662 *
4663 4663 * Returns 1 on success, 0 on failure.
4664 4664 *
4665 4665 * If success is returned this routine guarantees p_szc for all constituent
4666 4666 * pages of a large page pp belongs to can't change. To achieve this we
4667 4667 * recheck szc of pp after locking all constituent pages and retry if szc
4668 4668 * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4669 4669 * lock on one of constituent pages it can't be running after all constituent
4670 4670 * pages are locked. hat_page_demote() with a lock on a constituent page
4671 4671 * outside of this large page (i.e. pp belonged to a larger large page) is
4672 4672 * already done with all constituent pages of pp since the root's p_szc is
4673 4673 * changed last. Therefore no need to synchronize with hat_page_demote() that
4674 4674 * locked a constituent page outside of pp's current large page.
4675 4675 */
4676 4676 #ifdef DEBUG
4677 4677 uint32_t gpg_trylock_mtbf = 0;
4678 4678 #endif
4679 4679
4680 4680 int
4681 4681 group_page_trylock(page_t *pp, se_t se)
4682 4682 {
4683 4683 page_t *tpp;
4684 4684 pgcnt_t npgs, i, j;
4685 4685 uint_t pszc = pp->p_szc;
4686 4686
4687 4687 #ifdef DEBUG
4688 4688 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4689 4689 return (0);
4690 4690 }
4691 4691 #endif
4692 4692
4693 4693 if (pp != PP_GROUPLEADER(pp, pszc)) {
4694 4694 return (0);
4695 4695 }
4696 4696
4697 4697 retry:
4698 4698 ASSERT(PAGE_LOCKED_SE(pp, se));
4699 4699 ASSERT(!PP_ISFREE(pp));
4700 4700 if (pszc == 0) {
4701 4701 return (1);
4702 4702 }
4703 4703 npgs = page_get_pagecnt(pszc);
4704 4704 tpp = pp + 1;
4705 4705 for (i = 1; i < npgs; i++, tpp++) {
4706 4706 if (!page_trylock(tpp, se)) {
4707 4707 tpp = pp + 1;
4708 4708 for (j = 1; j < i; j++, tpp++) {
4709 4709 page_unlock(tpp);
4710 4710 }
4711 4711 return (0);
4712 4712 }
4713 4713 }
4714 4714 if (pp->p_szc != pszc) {
4715 4715 ASSERT(pp->p_szc < pszc);
4716 4716 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4717 4717 !IS_SWAPFSVP(pp->p_vnode));
4718 4718 tpp = pp + 1;
4719 4719 for (i = 1; i < npgs; i++, tpp++) {
4720 4720 page_unlock(tpp);
4721 4721 }
4722 4722 pszc = pp->p_szc;
4723 4723 goto retry;
4724 4724 }
4725 4725 return (1);
4726 4726 }
4727 4727
4728 4728 void
4729 4729 group_page_unlock(page_t *pp)
4730 4730 {
4731 4731 page_t *tpp;
4732 4732 pgcnt_t npgs, i;
4733 4733
4734 4734 ASSERT(PAGE_LOCKED(pp));
4735 4735 ASSERT(!PP_ISFREE(pp));
4736 4736 ASSERT(pp == PP_PAGEROOT(pp));
4737 4737 npgs = page_get_pagecnt(pp->p_szc);
4738 4738 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4739 4739 page_unlock(tpp);
4740 4740 }
4741 4741 }
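
/*
 * Editor's sketch (not part of the webrev): group_page_trylock() expects the
 * (allocated) root constituent page to already be locked by the caller; on
 * success every other constituent is locked too, and those must be released
 * with group_page_unlock() before the root's own lock is dropped.
 */
static int
example_lock_large_page(page_t *rootpp)
{
	if (!page_trylock(rootpp, SE_EXCL))
		return (0);
	if (!group_page_trylock(rootpp, SE_EXCL)) {
		page_unlock(rootpp);
		return (0);
	}

	/* ... all constituent pages are held SE_EXCL here ... */

	group_page_unlock(rootpp);	/* drops every page but the root */
	page_unlock(rootpp);
	return (1);
}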
4742 4742
4743 4743 /*
4744 4744 * returns
4745 4745 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages
4746 4746 * ERANGE : this is not a base page
4747 4747 * EBUSY : failure to get locks on the page/pages
4748 4748 * ENOMEM : failure to obtain replacement pages
4749 4749 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel
4750 4750 * EIO : An error occurred while trying to copy the page data
4751 4751 *
4752 4752 * Return with all constituent members of target and replacement
4753 4753 * SE_EXCL locked. It is the callers responsibility to drop the
4754 4754  * SE_EXCL locked. It is the caller's responsibility to drop the
4755 4755 */
4756 4756 int
4757 4757 do_page_relocate(
4758 4758 page_t **target,
4759 4759 page_t **replacement,
4760 4760 int grouplock,
4761 4761 spgcnt_t *nrelocp,
4762 4762 lgrp_t *lgrp)
4763 4763 {
4764 4764 page_t *first_repl;
4765 4765 page_t *repl;
4766 4766 page_t *targ;
4767 4767 page_t *pl = NULL;
4768 4768 uint_t ppattr;
4769 4769 pfn_t pfn, repl_pfn;
4770 4770 uint_t szc;
4771 4771 spgcnt_t npgs, i;
4772 4772 int repl_contig = 0;
4773 4773 uint_t flags = 0;
4774 4774 spgcnt_t dofree = 0;
4775 4775
4776 4776 *nrelocp = 0;
4777 4777
4778 4778 #if defined(__sparc)
4779 4779 /*
4780 4780 * We need to wait till OBP has completed
4781 4781 * its boot-time handoff of its resources to the kernel
4782 4782 * before we allow page relocation
4783 4783 */
4784 4784 if (page_relocate_ready == 0) {
4785 4785 return (EAGAIN);
4786 4786 }
4787 4787 #endif
4788 4788
4789 4789 /*
4790 4790 * If this is not a base page,
4791 4791 * just return with 0x0 pages relocated.
4792 4792 */
4793 4793 targ = *target;
4794 4794 ASSERT(PAGE_EXCL(targ));
4795 4795 ASSERT(!PP_ISFREE(targ));
4796 4796 szc = targ->p_szc;
4797 4797 ASSERT(szc < mmu_page_sizes);
4798 4798 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4799 4799 pfn = targ->p_pagenum;
4800 4800 if (pfn != PFN_BASE(pfn, szc)) {
4801 4801 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4802 4802 return (ERANGE);
4803 4803 }
4804 4804
4805 4805 if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4806 4806 repl_pfn = repl->p_pagenum;
4807 4807 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4808 4808 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4809 4809 return (ERANGE);
4810 4810 }
4811 4811 repl_contig = 1;
4812 4812 }
4813 4813
4814 4814 /*
4815 4815 * We must lock all members of this large page or we cannot
4816 4816 * relocate any part of it.
4817 4817 */
4818 4818 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4819 4819 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4820 4820 return (EBUSY);
4821 4821 }
4822 4822
4823 4823 /*
4824 4824 	 * Reread szc; it could have been decreased before
4825 4825 * group_page_trylock() was done.
4826 4826 */
4827 4827 szc = targ->p_szc;
4828 4828 ASSERT(szc < mmu_page_sizes);
4829 4829 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4830 4830 ASSERT(pfn == PFN_BASE(pfn, szc));
4831 4831
4832 4832 npgs = page_get_pagecnt(targ->p_szc);
4833 4833
4834 4834 if (repl == NULL) {
4835 4835 dofree = npgs; /* Size of target page in MMU pages */
4836 4836 if (!page_create_wait(dofree, 0)) {
4837 4837 if (grouplock != 0) {
4838 4838 group_page_unlock(targ);
4839 4839 }
4840 4840 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4841 4841 return (ENOMEM);
4842 4842 }
4843 4843
4844 4844 /*
4845 4845 * seg kmem pages require that the target and replacement
4846 4846 * page be the same pagesize.
4847 4847 */
4848 4848 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4849 4849 repl = page_get_replacement_page(targ, lgrp, flags);
4850 4850 if (repl == NULL) {
4851 4851 if (grouplock != 0) {
4852 4852 group_page_unlock(targ);
4853 4853 }
4854 4854 page_create_putback(dofree);
4855 4855 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4856 4856 return (ENOMEM);
4857 4857 }
4858 4858 }
4859 4859 #ifdef DEBUG
4860 4860 else {
4861 4861 ASSERT(PAGE_LOCKED(repl));
4862 4862 }
4863 4863 #endif /* DEBUG */
4864 4864
4865 4865 #if defined(__sparc)
4866 4866 /*
4867 4867 	 * Let hat_page_relocate() complete the relocation if it's a kernel page
4868 4868 */
4869 4869 if (VN_ISKAS(targ->p_vnode)) {
4870 4870 *replacement = repl;
4871 4871 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4872 4872 if (grouplock != 0) {
4873 4873 group_page_unlock(targ);
4874 4874 }
4875 4875 if (dofree) {
4876 4876 *replacement = NULL;
4877 4877 page_free_replacement_page(repl);
4878 4878 page_create_putback(dofree);
4879 4879 }
4880 4880 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4881 4881 return (EAGAIN);
4882 4882 }
4883 4883 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4884 4884 return (0);
4885 4885 }
4886 4886 #else
4887 4887 #if defined(lint)
4888 4888 dofree = dofree;
4889 4889 #endif
4890 4890 #endif
4891 4891
4892 4892 first_repl = repl;
4893 4893
4894 4894 for (i = 0; i < npgs; i++) {
4895 4895 ASSERT(PAGE_EXCL(targ));
4896 4896 ASSERT(targ->p_slckcnt == 0);
4897 4897 ASSERT(repl->p_slckcnt == 0);
4898 4898
4899 4899 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4900 4900
4901 4901 ASSERT(hat_page_getshare(targ) == 0);
4902 4902 ASSERT(!PP_ISFREE(targ));
4903 4903 ASSERT(targ->p_pagenum == (pfn + i));
4904 4904 ASSERT(repl_contig == 0 ||
4905 4905 repl->p_pagenum == (repl_pfn + i));
4906 4906
4907 4907 /*
4908 4908 * Copy the page contents and attributes then
4909 4909 * relocate the page in the page hash.
4910 4910 */
4911 4911 if (ppcopy(targ, repl) == 0) {
4912 4912 targ = *target;
4913 4913 repl = first_repl;
4914 4914 VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4915 4915 if (grouplock != 0) {
4916 4916 group_page_unlock(targ);
4917 4917 }
4918 4918 if (dofree) {
4919 4919 *replacement = NULL;
4920 4920 page_free_replacement_page(repl);
4921 4921 page_create_putback(dofree);
4922 4922 }
4923 4923 return (EIO);
4924 4924 }
4925 4925
4926 4926 targ++;
4927 4927 if (repl_contig != 0) {
4928 4928 repl++;
4929 4929 } else {
4930 4930 repl = repl->p_next;
4931 4931 }
4932 4932 }
4933 4933
4934 4934 repl = first_repl;
4935 4935 targ = *target;
4936 4936
4937 4937 for (i = 0; i < npgs; i++) {
4938 4938 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4939 4939 page_clr_all_props(repl);
4940 4940 page_set_props(repl, ppattr);
4941 4941 page_relocate_hash(repl, targ);
4942 4942
4943 4943 ASSERT(hat_page_getshare(targ) == 0);
4944 4944 ASSERT(hat_page_getshare(repl) == 0);
4945 4945 /*
4946 4946 		 * Now clear the props on targ; after the
4947 4947 		 * page_relocate_hash(), they no longer
4948 4948 		 * have any meaning.
4949 4949 */
4950 4950 page_clr_all_props(targ);
4951 4951 ASSERT(targ->p_next == targ);
4952 4952 ASSERT(targ->p_prev == targ);
4953 4953 page_list_concat(&pl, &targ);
4954 4954
4955 4955 targ++;
4956 4956 if (repl_contig != 0) {
4957 4957 repl++;
4958 4958 } else {
4959 4959 repl = repl->p_next;
4960 4960 }
4961 4961 }
4962 4962 /* assert that we have come full circle with repl */
4963 4963 ASSERT(repl_contig == 1 || first_repl == repl);
4964 4964
4965 4965 *target = pl;
4966 4966 if (*replacement == NULL) {
4967 4967 ASSERT(first_repl == repl);
4968 4968 *replacement = repl;
4969 4969 }
4970 4970 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4971 4971 *nrelocp = npgs;
4972 4972 return (0);
4973 4973 }
4974 4974 /*
4975 4975 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4976 4976 */
4977 4977 int
4978 4978 page_relocate(
4979 4979 page_t **target,
4980 4980 page_t **replacement,
4981 4981 int grouplock,
4982 4982 int freetarget,
4983 4983 spgcnt_t *nrelocp,
4984 4984 lgrp_t *lgrp)
4985 4985 {
4986 4986 spgcnt_t ret;
4987 4987
4988 4988 /* do_page_relocate returns 0 on success or errno value */
4989 4989 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4990 4990
4991 4991 if (ret != 0 || freetarget == 0) {
4992 4992 return (ret);
4993 4993 }
4994 4994 if (*nrelocp == 1) {
4995 4995 ASSERT(*target != NULL);
4996 4996 page_free(*target, 1);
4997 4997 } else {
4998 4998 page_t *tpp = *target;
4999 4999 uint_t szc = tpp->p_szc;
5000 5000 pgcnt_t npgs = page_get_pagecnt(szc);
5001 5001 ASSERT(npgs > 1);
5002 5002 ASSERT(szc != 0);
5003 5003 do {
5004 5004 ASSERT(PAGE_EXCL(tpp));
5005 5005 ASSERT(!hat_page_is_mapped(tpp));
5006 5006 ASSERT(tpp->p_szc == szc);
5007 5007 PP_SETFREE(tpp);
5008 5008 PP_SETAGED(tpp);
5009 5009 npgs--;
5010 5010 } while ((tpp = tpp->p_next) != *target);
5011 5011 ASSERT(npgs == 0);
5012 5012 page_list_add_pages(*target, 0);
5013 5013 npgs = page_get_pagecnt(szc);
5014 5014 page_create_putback(npgs);
5015 5015 }
5016 5016 return (ret);
5017 5017 }
5018 5018
5019 5019 /*
5020 5020  * It is up to the caller to deal with pcf accounting.
5021 5021 */
5022 5022 void
5023 5023 page_free_replacement_page(page_t *pplist)
5024 5024 {
5025 5025 page_t *pp;
5026 5026
5027 5027 while (pplist != NULL) {
5028 5028 /*
5029 5029 		 * pplist is a linked list.
5030 5030 */
5031 5031 pp = pplist;
5032 5032 if (pp->p_szc == 0) {
5033 5033 page_sub(&pplist, pp);
5034 5034 page_clr_all_props(pp);
5035 5035 PP_SETFREE(pp);
5036 5036 PP_SETAGED(pp);
5037 5037 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5038 5038 page_unlock(pp);
5039 5039 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5040 5040 } else {
5041 5041 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5042 5042 page_t *tpp;
5043 5043 page_list_break(&pp, &pplist, curnpgs);
5044 5044 tpp = pp;
5045 5045 do {
5046 5046 ASSERT(PAGE_EXCL(tpp));
5047 5047 ASSERT(!hat_page_is_mapped(tpp));
5048 5048 page_clr_all_props(tpp);
5049 5049 PP_SETFREE(tpp);
5050 5050 PP_SETAGED(tpp);
5051 5051 } while ((tpp = tpp->p_next) != pp);
5052 5052 page_list_add_pages(pp, 0);
5053 5053 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5054 5054 }
5055 5055 }
5056 5056 }
5057 5057
5058 5058 /*
5059 5059 * Relocate target to non-relocatable replacement page.
5060 5060 */
5061 5061 int
5062 5062 page_relocate_cage(page_t **target, page_t **replacement)
5063 5063 {
5064 5064 page_t *tpp, *rpp;
5065 5065 spgcnt_t pgcnt, npgs;
5066 5066 int result;
5067 5067
5068 5068 tpp = *target;
5069 5069
5070 5070 ASSERT(PAGE_EXCL(tpp));
5071 5071 ASSERT(tpp->p_szc == 0);
5072 5072
5073 5073 pgcnt = btop(page_get_pagesize(tpp->p_szc));
5074 5074
5075 5075 do {
5076 5076 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5077 5077 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5078 5078 if (rpp == NULL) {
5079 5079 page_create_putback(pgcnt);
5080 5080 kcage_cageout_wakeup();
5081 5081 }
5082 5082 } while (rpp == NULL);
5083 5083
5084 5084 ASSERT(PP_ISNORELOC(rpp));
5085 5085
5086 5086 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5087 5087
5088 5088 if (result == 0) {
5089 5089 *replacement = rpp;
5090 5090 if (pgcnt != npgs)
5091 5091 panic("page_relocate_cage: partial relocation");
5092 5092 }
5093 5093
5094 5094 return (result);
5095 5095 }
5096 5096
5097 5097 /*
5098 5098 * Release the page lock on a page, place on cachelist
5099 5099 * tail if no longer mapped. Caller can let us know if
5100 5100 * the page is known to be clean.
5101 5101 */
5102 5102 int
5103 5103 page_release(page_t *pp, int checkmod)
5104 5104 {
5105 5105 int status;
5106 5106
5107 5107 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5108 5108 (pp->p_vnode != NULL));
5109 5109
5110 5110 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5111 5111 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5112 5112 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5113 5113 !hat_page_is_mapped(pp)) {
5114 5114
5115 5115 /*
5116 5116 * If page is modified, unlock it
5117 5117 *
5118 5118 * (p_nrm & P_MOD) bit has the latest stuff because:
5119 5119 * (1) We found that this page doesn't have any mappings
5120 5120 * _after_ holding SE_EXCL and
5121 5121 * (2) We didn't drop SE_EXCL lock after the check in (1)
5122 5122 */
5123 5123 if (checkmod && hat_ismod(pp)) {
5124 5124 page_unlock(pp);
5125 5125 status = PGREL_MOD;
5126 5126 } else {
5127 5127 /*LINTED: constant in conditional context*/
5128 5128 VN_DISPOSE(pp, B_FREE, 0, kcred);
5129 5129 status = PGREL_CLEAN;
5130 5130 }
5131 5131 } else {
5132 5132 page_unlock(pp);
5133 5133 status = PGREL_NOTREL;
5134 5134 }
5135 5135 return (status);
5136 5136 }
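
/*
 * Editor's sketch (not part of the webrev): handling the three page_release()
 * status codes.  The caller passes checkmod = 1 when it cares whether the
 * page was still dirty; the comments describe what each status implies.
 */
static void
example_release(page_t *pp)
{
	switch (page_release(pp, 1)) {
	case PGREL_CLEAN:
		/* page was clean and was freed onto the cachelist */
		break;
	case PGREL_MOD:
		/* page was still dirty; it was unlocked but not freed */
		break;
	case PGREL_NOTREL:
		/* page was mapped, locked, or a swap page; just unlocked */
		break;
	}
}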
5137 5137
5138 5138 /*
5139 5139 * Given a constituent page, try to demote the large page on the freelist.
5140 5140 *
5141 5141 * Returns nonzero if the page could be demoted successfully. Returns with
5142 5142 * the constituent page still locked.
5143 5143 */
5144 5144 int
5145 5145 page_try_demote_free_pages(page_t *pp)
5146 5146 {
5147 5147 page_t *rootpp = pp;
5148 5148 pfn_t pfn = page_pptonum(pp);
5149 5149 spgcnt_t npgs;
5150 5150 uint_t szc = pp->p_szc;
5151 5151
5152 5152 ASSERT(PP_ISFREE(pp));
5153 5153 ASSERT(PAGE_EXCL(pp));
5154 5154
5155 5155 /*
5156 5156 * Adjust rootpp and lock it, if `pp' is not the base
5157 5157 * constituent page.
5158 5158 */
5159 5159 npgs = page_get_pagecnt(pp->p_szc);
5160 5160 if (npgs == 1) {
5161 5161 return (0);
5162 5162 }
5163 5163
5164 5164 if (!IS_P2ALIGNED(pfn, npgs)) {
5165 5165 pfn = P2ALIGN(pfn, npgs);
5166 5166 rootpp = page_numtopp_nolock(pfn);
5167 5167 }
5168 5168
5169 5169 if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5170 5170 return (0);
5171 5171 }
5172 5172
5173 5173 if (rootpp->p_szc != szc) {
5174 5174 if (pp != rootpp)
5175 5175 page_unlock(rootpp);
5176 5176 return (0);
5177 5177 }
5178 5178
5179 5179 page_demote_free_pages(rootpp);
5180 5180
5181 5181 if (pp != rootpp)
5182 5182 page_unlock(rootpp);
5183 5183
5184 5184 ASSERT(PP_ISFREE(pp));
5185 5185 ASSERT(PAGE_EXCL(pp));
5186 5186 return (1);
5187 5187 }
5188 5188
5189 5189 /*
5190 5190 * Given a constituent page, try to demote the large page.
5191 5191 *
5192 5192 * Returns nonzero if the page could be demoted successfully. Returns with
5193 5193 * the constituent page still locked.
5194 5194 */
5195 5195 int
5196 5196 page_try_demote_pages(page_t *pp)
5197 5197 {
5198 5198 page_t *tpp, *rootpp = pp;
5199 5199 pfn_t pfn = page_pptonum(pp);
5200 5200 spgcnt_t i, npgs;
5201 5201 uint_t szc = pp->p_szc;
5202 5202 vnode_t *vp = pp->p_vnode;
5203 5203
5204 5204 ASSERT(PAGE_EXCL(pp));
5205 5205
5206 5206 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5207 5207
5208 5208 if (pp->p_szc == 0) {
5209 5209 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5210 5210 return (1);
5211 5211 }
5212 5212
5213 5213 if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5214 5214 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5215 5215 page_demote_vp_pages(pp);
5216 5216 ASSERT(pp->p_szc == 0);
5217 5217 return (1);
5218 5218 }
5219 5219
5220 5220 /*
5221 5221 	 * Adjust rootpp if the page passed in is not the base
5222 5222 	 * constituent page.
5223 5223 */
5224 5224 npgs = page_get_pagecnt(pp->p_szc);
5225 5225 ASSERT(npgs > 1);
5226 5226 if (!IS_P2ALIGNED(pfn, npgs)) {
5227 5227 pfn = P2ALIGN(pfn, npgs);
5228 5228 rootpp = page_numtopp_nolock(pfn);
5229 5229 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5230 5230 ASSERT(rootpp->p_vnode != NULL);
5231 5231 ASSERT(rootpp->p_szc == szc);
5232 5232 }
5233 5233
5234 5234 /*
5235 5235 * We can't demote kernel pages since we can't hat_unload()
5236 5236 * the mappings.
5237 5237 */
5238 5238 if (VN_ISKAS(rootpp->p_vnode))
5239 5239 return (0);
5240 5240
5241 5241 /*
5242 5242 * Attempt to lock all constituent pages except the page passed
5243 5243 * in since it's already locked.
5244 5244 */
5245 5245 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5246 5246 ASSERT(!PP_ISFREE(tpp));
5247 5247 ASSERT(tpp->p_vnode != NULL);
5248 5248
5249 5249 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5250 5250 break;
5251 5251 ASSERT(tpp->p_szc == rootpp->p_szc);
5252 5252 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5253 5253 }
5254 5254
5255 5255 /*
5256 5256 * If we failed to lock them all then unlock what we have
5257 5257 * locked so far and bail.
5258 5258 */
5259 5259 if (i < npgs) {
5260 5260 tpp = rootpp;
5261 5261 while (i-- > 0) {
5262 5262 if (tpp != pp)
5263 5263 page_unlock(tpp);
5264 5264 tpp++;
5265 5265 }
5266 5266 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5267 5267 return (0);
5268 5268 }
5269 5269
5270 5270 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5271 5271 ASSERT(PAGE_EXCL(tpp));
5272 5272 ASSERT(tpp->p_slckcnt == 0);
5273 5273 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5274 5274 tpp->p_szc = 0;
5275 5275 }
5276 5276
5277 5277 /*
5278 5278 * Unlock all pages except the page passed in.
5279 5279 */
5280 5280 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5281 5281 ASSERT(!hat_page_is_mapped(tpp));
5282 5282 if (tpp != pp)
5283 5283 page_unlock(tpp);
5284 5284 }
5285 5285
5286 5286 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5287 5287 return (1);
5288 5288 }
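
/*
 * Editor's sketch (not part of the webrev): a caller holding one EXCL-locked
 * constituent page can attempt a demotion and carry on with the still-locked
 * page either way; the helper name is hypothetical.
 */
static void
example_demote(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));

	if (pp->p_szc != 0 && !page_try_demote_pages(pp)) {
		/* demotion failed (e.g. kernel page or busy constituents) */
		return;
	}
	/* on the success path pp->p_szc is now 0 and pp is still locked */
}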
5289 5289
5290 5290 /*
5291 5291 * Called by page_free() and page_destroy() to demote the page size code
5292 5292 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
5293 5293 * p_szc on free list, neither can we just clear p_szc of a single page_t
5294 5294 * within a large page since it will break other code that relies on p_szc
5295 5295 * being the same for all page_t's of a large page). Anonymous pages should
5296 5296 * never end up here because anon_map_getpages() cannot deal with p_szc
5297 5297 * changes after a single constituent page is locked. While anonymous or
5298 5298 * kernel large pages are demoted or freed an entire large page at a time,
5299 5299 * with all constituent pages locked EXCL, for file system pages we
5300 5300 * have to be able to demote a large page (i.e. decrease all constituent pages
5301 5301 * p_szc) with just an EXCL lock on one of the constituent pages. The reason
5302 5302 * we can easily demote anonymous pages an entire large page at a
5303 5303 * time is that those operations originate at the address space level and concern
5304 5304 * the entire large page region with actual demotion only done when pages are
5305 5305 * not shared with any other processes (therefore we can always get EXCL lock
5306 5306 * on all anonymous constituent pages after clearing segment page
5307 5307 * cache). However file system pages can be truncated or invalidated at a
5308 5308 * PAGESIZE level from the file system side and end up in page_free() or
5309 5309 * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
5310 5310 * and therefore pageout should be able to demote a large page by EXCL locking
5311 5311 * any constituent page that is not under SOFTLOCK). In those cases we cannot
5312 5312 * rely on being able to lock EXCL all constituent pages.
5313 5313 *
5314 5314 * To prevent szc changes on file system pages one has to lock all constituent
5315 5315 * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5316 5316 * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5317 5317 * prevent szc changes is hat layer that uses its own page level mlist
5318 5318 * locks. hat assumes that szc doesn't change after mlist lock for a page is
5319 5319 * taken. Therefore we need to change szc under hat level locks if we only
5320 5320 * have an EXCL lock on a single constituent page and hat still references any
5321 5321 * of constituent pages. (Note we can't "ignore" hat layer by simply
5322 5322 * hat_pageunload() all constituent pages without having EXCL locks on all of
5323 5323 * constituent pages). We use hat_page_demote() call to safely demote szc of
5324 5324 * all constituent pages under hat locks when we only have an EXCL lock on one
5325 5325 * of constituent pages.
5326 5326 *
5327 5327 * This routine calls page_szc_lock() before calling hat_page_demote() to
5328 5328 * allow segvn in one special case not to lock all constituent pages SHARED
5329 5329 * before calling hat_memload_array() that relies on p_szc not changing even
5330 5330 * before hat level mlist lock is taken. In that case segvn uses
5331 5331 * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5332 5332 *
5333 5333 * Anonymous or kernel page demotion still has to lock all pages exclusively
5334 5334 * and do hat_pageunload() on all constituent pages before demoting the page
5335 5335 * therefore there's no need for anonymous or kernel page demotion to use
5336 5336 * hat_page_demote() mechanism.
5337 5337 *
5338 5338 * hat_page_demote() removes all large mappings that map pp and then decreases
5339 5339 * p_szc starting from the last constituent page of the large page. Working
5340 5340 * from the tail of a large page in decreasing pfn order allows one looking at
5341 5341 * the root page to know that hat_page_demote() is done for the root's szc area.
5342 5342 * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5343 5343 * pages within szc 1 area to prevent szc changes because hat_page_demote()
5344 5344 * that started on this page when it had szc > 1 is done for this szc 1 area.
5345 5345 *
5346 5346 * We are guaranteed that all constituent pages of pp's large page belong to
5347 5347 * the same vnode with the consecutive offsets increasing in the direction of
5348 5348 * the pfn i.e. the identity of constituent pages can't change until their
5349 5349 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5350 5350 * large mappings to pp even though we don't lock any constituent page except
5351 5351 * pp (i.e. we won't unload e.g. kernel locked page).
5352 5352 */
5353 5353 static void
5354 5354 page_demote_vp_pages(page_t *pp)
5355 5355 {
5356 5356 kmutex_t *mtx;
5357 5357
5358 5358 ASSERT(PAGE_EXCL(pp));
5359 5359 ASSERT(!PP_ISFREE(pp));
5360 5360 ASSERT(pp->p_vnode != NULL);
5361 5361 ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5362 5362 ASSERT(!PP_ISKAS(pp));
5363 5363
5364 5364 VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5365 5365
5366 5366 mtx = page_szc_lock(pp);
5367 5367 if (mtx != NULL) {
5368 5368 hat_page_demote(pp);
5369 5369 mutex_exit(mtx);
5370 5370 }
5371 5371 ASSERT(pp->p_szc == 0);
5372 5372 }
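/*
 * Illustrative sketch, not part of this file: the comment above describes
 * two ways a caller can keep p_szc stable for a file system page without
 * holding EXCL locks on every constituent page -- lock all constituent
 * pages at least SHARED, or briefly take page_szc_lock() as segvn does
 * before hat_memload_array().  The hypothetical helper below shows the
 * second form, mirroring the usage in page_demote_vp_pages().
 */
static void
example_szc_stable_region(page_t *pp)
{
	kmutex_t *szcmtx;

	szcmtx = page_szc_lock(pp);
	if (szcmtx != NULL) {
		/*
		 * hat_page_demote() cannot change the p_szc of pp's
		 * large page while szcmtx is held.
		 */
		mutex_exit(szcmtx);
	}
}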
5373 5373
5374 5374 /*
5375 5375 * Mark any existing pages for migration in the given range
5376 5376 */
5377 5377 void
5378 5378 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5379 5379 struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5380 5380 u_offset_t vnoff, int rflag)
5381 5381 {
5382 5382 struct anon *ap;
5383 5383 vnode_t *curvp;
5384 5384 lgrp_t *from;
5385 5385 pgcnt_t nlocked;
5386 5386 u_offset_t off;
5387 5387 pfn_t pfn;
5388 5388 size_t pgsz;
5389 5389 size_t segpgsz;
5390 5390 pgcnt_t pages;
5391 5391 uint_t pszc;
5392 5392 page_t *pp0, *pp;
5393 5393 caddr_t va;
5394 5394 ulong_t an_idx;
5395 5395 anon_sync_obj_t cookie;
5396 5396
5397 5397 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5398 5398
5399 5399 /*
5400 5400 * Don't do anything if we don't need to do lgroup optimizations
5401 5401 * on this system
5402 5402 */
5403 5403 if (!lgrp_optimizations())
5404 5404 return;
5405 5405
5406 5406 /*
5407 5407 * Align address and length to (potentially large) page boundary
5408 5408 */
5409 5409 segpgsz = page_get_pagesize(seg->s_szc);
5410 5410 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5411 5411 if (rflag)
5412 5412 len = P2ROUNDUP(len, segpgsz);
5413 5413
5414 5414 /*
5415 5415 * Do one (large) page at a time
5416 5416 */
5417 5417 va = addr;
5418 5418 while (va < addr + len) {
5419 5419 /*
5420 5420 * Lookup (root) page for vnode and offset corresponding to
5421 5421 * this virtual address
5422 5422 * Try anonmap first since there may be copy-on-write
5423 5423 * pages, but initialize vnode pointer and offset using
5424 5424 * vnode arguments just in case there isn't an amp.
5425 5425 */
5426 5426 curvp = vp;
5427 5427 off = vnoff + va - seg->s_base;
5428 5428 if (amp) {
5429 5429 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5430 5430 an_idx = anon_index + seg_page(seg, va);
5431 5431 anon_array_enter(amp, an_idx, &cookie);
5432 5432 ap = anon_get_ptr(amp->ahp, an_idx);
5433 5433 if (ap)
5434 5434 swap_xlate(ap, &curvp, &off);
5435 5435 anon_array_exit(&cookie);
5436 5436 ANON_LOCK_EXIT(&amp->a_rwlock);
5437 5437 }
5438 5438
5439 5439 pp = NULL;
5440 5440 if (curvp)
5441 5441 pp = page_lookup(curvp, off, SE_SHARED);
5442 5442
5443 5443 /*
5444 5444 * If there isn't a page at this virtual address,
5445 5445 * skip to next page
5446 5446 */
5447 5447 if (pp == NULL) {
5448 5448 va += PAGESIZE;
5449 5449 continue;
5450 5450 }
5451 5451
5452 5452 /*
5453 5453 * Figure out which lgroup this page is in for kstats
5454 5454 */
5455 5455 pfn = page_pptonum(pp);
5456 5456 from = lgrp_pfn_to_lgrp(pfn);
5457 5457
5458 5458 /*
5459 5459 * Get page size, and round up and skip to next page boundary
5460 5460 * if unaligned address
5461 5461 */
5462 5462 pszc = pp->p_szc;
5463 5463 pgsz = page_get_pagesize(pszc);
5464 5464 pages = btop(pgsz);
5465 5465 if (!IS_P2ALIGNED(va, pgsz) ||
5466 5466 !IS_P2ALIGNED(pfn, pages) ||
5467 5467 pgsz > segpgsz) {
5468 5468 pgsz = MIN(pgsz, segpgsz);
5469 5469 page_unlock(pp);
5470 5470 pages = btop(P2END((uintptr_t)va, pgsz) -
5471 5471 (uintptr_t)va);
5472 5472 va = (caddr_t)P2END((uintptr_t)va, pgsz);
5473 5473 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5474 5474 continue;
5475 5475 }
5476 5476
5477 5477 /*
5478 5478 * Upgrade to exclusive lock on page
5479 5479 */
5480 5480 if (!page_tryupgrade(pp)) {
5481 5481 page_unlock(pp);
5482 5482 va += pgsz;
5483 5483 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5484 5484 btop(pgsz));
5485 5485 continue;
5486 5486 }
5487 5487
5488 5488 pp0 = pp++;
5489 5489 nlocked = 1;
5490 5490
5491 5491 /*
5492 5492 * Lock constituent pages if this is a large page
5493 5493 */
5494 5494 if (pages > 1) {
5495 5495 /*
5496 5496 * Lock all constituents except root page, since it
5497 5497 * should be locked already.
5498 5498 */
5499 5499 for (; nlocked < pages; nlocked++) {
5500 5500 if (!page_trylock(pp, SE_EXCL)) {
5501 5501 break;
5502 5502 }
5503 5503 if (PP_ISFREE(pp) ||
5504 5504 pp->p_szc != pszc) {
5505 5505 /*
5506 5506 * hat_page_demote() raced in with us.
5507 5507 */
5508 5508 ASSERT(!IS_SWAPFSVP(curvp));
5509 5509 page_unlock(pp);
5510 5510 break;
5511 5511 }
5512 5512 pp++;
5513 5513 }
5514 5514 }
5515 5515
5516 5516 /*
5517 5517 * If all constituent pages couldn't be locked,
5518 5518 * unlock pages locked so far and skip to next page.
5519 5519 */
5520 5520 if (nlocked < pages) {
5521 5521 while (pp0 < pp) {
5522 5522 page_unlock(pp0++);
5523 5523 }
5524 5524 va += pgsz;
5525 5525 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5526 5526 btop(pgsz));
5527 5527 continue;
5528 5528 }
5529 5529
5530 5530 /*
5531 5531 * hat_page_demote() can no longer happen
5532 5532 * since the last cons page had the right p_szc after
5533 5533 * all cons pages were locked. All cons pages
5534 5534 * should now have the same p_szc.
5535 5535 */
5536 5536
5537 5537 /*
5538 5538 * All constituent pages locked successfully, so mark
5539 5539 * large page for migration and unload the mappings of
5540 5540 * constituent pages, so a fault will occur on any part of the
5541 5541 * large page
5542 5542 */
5543 5543 PP_SETMIGRATE(pp0);
5544 5544 while (pp0 < pp) {
5545 5545 (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5546 5546 ASSERT(hat_page_getshare(pp0) == 0);
5547 5547 page_unlock(pp0++);
5548 5548 }
5549 5549 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5550 5550
5551 5551 va += pgsz;
5552 5552 }
5553 5553 }
5554 5554
5555 5555 /*
5556 5556 * Migrate any pages that have been marked for migration in the given range
5557 5557 */
5558 5558 void
5559 5559 page_migrate(
5560 5560 struct seg *seg,
5561 5561 caddr_t addr,
5562 5562 page_t **ppa,
5563 5563 pgcnt_t npages)
5564 5564 {
5565 5565 lgrp_t *from;
5566 5566 lgrp_t *to;
5567 5567 page_t *newpp;
5568 5568 page_t *pp;
5569 5569 pfn_t pfn;
5570 5570 size_t pgsz;
5571 5571 spgcnt_t page_cnt;
5572 5572 spgcnt_t i;
5573 5573 uint_t pszc;
5574 5574
5575 5575 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5576 5576
5577 5577 while (npages > 0) {
5578 5578 pp = *ppa;
5579 5579 pszc = pp->p_szc;
5580 5580 pgsz = page_get_pagesize(pszc);
5581 5581 page_cnt = btop(pgsz);
5582 5582
5583 5583 /*
5584 5584 * Check to see whether this page is marked for migration
5585 5585 *
5586 5586 * Assume that root page of large page is marked for
5587 5587 * migration and none of the other constituent pages
5588 5588 * are marked. This really simplifies clearing the
5589 5589 * migrate bit by not having to clear it from each
5590 5590 * constituent page.
5591 5591 *
5592 5592 * note we don't want to relocate an entire large page if
5593 5593 * someone is only using one subpage.
5594 5594 */
5595 5595 if (npages < page_cnt)
5596 5596 break;
5597 5597
5598 5598 /*
5599 5599 * Is it marked for migration?
5600 5600 */
5601 5601 if (!PP_ISMIGRATE(pp))
5602 5602 goto next;
5603 5603
5604 5604 /*
5605 5605 * Determine lgroups that page is being migrated between
5606 5606 */
5607 5607 pfn = page_pptonum(pp);
5608 5608 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5609 5609 break;
5610 5610 }
5611 5611 from = lgrp_pfn_to_lgrp(pfn);
5612 5612 to = lgrp_mem_choose(seg, addr, pgsz);
5613 5613
5614 5614 /*
5615 5615 * Need to get exclusive locks to migrate
5616 5616 */
5617 5617 for (i = 0; i < page_cnt; i++) {
5618 5618 ASSERT(PAGE_LOCKED(ppa[i]));
5619 5619 if (page_pptonum(ppa[i]) != pfn + i ||
5620 5620 ppa[i]->p_szc != pszc) {
5621 5621 break;
5622 5622 }
5623 5623 if (!page_tryupgrade(ppa[i])) {
5624 5624 lgrp_stat_add(from->lgrp_id,
5625 5625 LGRP_PM_FAIL_LOCK_PGS,
5626 5626 page_cnt);
5627 5627 break;
5628 5628 }
5629 5629
5630 5630 /*
5631 5631 * Check to see whether we are trying to migrate
5632 5632 * page to lgroup where it is allocated already.
5633 5633 * If so, clear the migrate bit and skip to next
5634 5634 * page.
5635 5635 */
5636 5636 if (i == 0 && to == from) {
5637 5637 PP_CLRMIGRATE(ppa[0]);
5638 5638 page_downgrade(ppa[0]);
5639 5639 goto next;
5640 5640 }
5641 5641 }
5642 5642
5643 5643 /*
5644 5644 * If all constituent pages couldn't be locked,
5645 5645 * unlock pages locked so far and skip to next page.
5646 5646 */
5647 5647 if (i != page_cnt) {
5648 5648 while (--i != -1) {
5649 5649 page_downgrade(ppa[i]);
5650 5650 }
5651 5651 goto next;
5652 5652 }
5653 5653
5654 5654 (void) page_create_wait(page_cnt, PG_WAIT);
5655 5655 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5656 5656 if (newpp == NULL) {
5657 5657 page_create_putback(page_cnt);
5658 5658 for (i = 0; i < page_cnt; i++) {
5659 5659 page_downgrade(ppa[i]);
5660 5660 }
5661 5661 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5662 5662 page_cnt);
5663 5663 goto next;
5664 5664 }
5665 5665 ASSERT(newpp->p_szc == pszc);
5666 5666 /*
5667 5667 * Clear migrate bit and relocate page
5668 5668 */
5669 5669 PP_CLRMIGRATE(pp);
5670 5670 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5671 5671 panic("page_migrate: page_relocate failed");
5672 5672 }
5673 5673 ASSERT(page_cnt * PAGESIZE == pgsz);
5674 5674
5675 5675 /*
5676 5676 * Keep stats for number of pages migrated from and to
5677 5677 * each lgroup
5678 5678 */
5679 5679 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5680 5680 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5681 5681 /*
5682 5682 * update the page_t array we were passed in and
5683 5683 * unlink constituent pages of a large page.
5684 5684 */
5685 5685 for (i = 0; i < page_cnt; ++i, ++pp) {
5686 5686 ASSERT(PAGE_EXCL(newpp));
5687 5687 ASSERT(newpp->p_szc == pszc);
5688 5688 ppa[i] = newpp;
5689 5689 pp = newpp;
5690 5690 page_sub(&newpp, pp);
5691 5691 page_downgrade(pp);
5692 5692 }
5693 5693 ASSERT(newpp == NULL);
5694 5694 next:
5695 5695 addr += pgsz;
5696 5696 ppa += page_cnt;
5697 5697 npages -= page_cnt;
5698 5698 }
5699 5699 }
5700 5700
5701 5701 #define MAX_CNT 60 /* max num of iterations */
5702 5702 /*
5703 5703 * Reclaim/reserve availrmem for npages.
5704 5704 * If there is not enough memory, start reaping the seg and kmem caches.
5705 5705 * Start pageout scanner (via page_needfree()).
5706 5706 * Exit after ~MAX_CNT seconds regardless of how much memory has been released.
5707 5707 * Note: There is no guarantee that any availrmem will be freed as
5708 5708 * this memory typically is locked (kernel heap) or reserved for swap.
5709 5709 * Also, due to memory fragmentation, the kmem allocator may not be able
5710 5710 * to free any memory (a single user-allocated buffer can prevent a
5711 5711 * slab or a page from being freed).
5712 5712 */
5713 5713 int
5714 5714 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5715 5715 {
5716 5716 int i = 0;
5717 5717 int ret = 0;
5718 5718 pgcnt_t deficit;
5719 5719 pgcnt_t old_availrmem;
5720 5720
5721 5721 mutex_enter(&freemem_lock);
5722 5722 old_availrmem = availrmem - 1;
5723 5723 while ((availrmem < tune.t_minarmem + npages + epages) &&
5724 5724 (old_availrmem < availrmem) && (i++ < MAX_CNT)) {
5725 5725 old_availrmem = availrmem;
5726 5726 deficit = tune.t_minarmem + npages + epages - availrmem;
5727 5727 mutex_exit(&freemem_lock);
5728 5728 page_needfree(deficit);
5729 5729 kmem_reap();
5730 5730 delay(hz);
5731 5731 page_needfree(-(spgcnt_t)deficit);
5732 5732 mutex_enter(&freemem_lock);
5733 5733 }
5734 5734
5735 5735 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5736 5736 availrmem -= npages;
5737 5737 ret = 1;
5738 5738 }
5739 5739
5740 5740 mutex_exit(&freemem_lock);
5741 5741
5742 5742 return (ret);
5743 5743 }
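/*
 * Illustrative sketch, not part of this file: a hypothetical caller that
 * reserves availrmem for npages with page_reclaim_mem() and adjust set.
 * On success availrmem has been decremented by npages, so the caller is
 * assumed to give the reservation back itself when it is finished, using
 * the same freemem_lock/availrmem pattern used elsewhere in this file.
 */
static int
example_reserve_pages(pgcnt_t npages)
{
	if (!page_reclaim_mem(npages, 0, 1))
		return (ENOMEM);		/* could not reserve availrmem */

	/* ... caller makes use of the reserved memory here ... */

	mutex_enter(&freemem_lock);
	availrmem += npages;			/* return the reservation */
	mutex_exit(&freemem_lock);
	return (0);
}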
5744 5744
5745 5745 /*
5746 5746 * Search the memory segments to locate the desired page. Within a
5747 5747 * segment, pages increase linearly with one page structure per
5748 5748 * physical page frame (size PAGESIZE). The search begins
5749 5749 * with the segment that was accessed last, to take advantage of locality.
5750 5750 * If the hint misses, we start from the beginning of the sorted memseg list
5751 5751 */
5752 5752
5753 5753
5754 5754 /*
5755 5755 * Some data structures for pfn to pp lookup.
5756 5756 */
5757 5757 ulong_t mhash_per_slot;
5758 5758 struct memseg *memseg_hash[N_MEM_SLOTS];
5759 5759
5760 5760 page_t *
5761 5761 page_numtopp_nolock(pfn_t pfnum)
5762 5762 {
5763 5763 struct memseg *seg;
5764 5764 page_t *pp;
5765 5765 vm_cpu_data_t *vc;
5766 5766
5767 5767 /*
5768 5768 * We need to disable kernel preemption while referencing the
5769 5769 * cpu_vm_data field in order to prevent us from being switched to
5770 5770 * another cpu and trying to reference it after it has been freed.
5771 5771 * This will keep us on cpu and prevent it from being removed while
5772 5772 * we are still on it.
5773 5773 *
5774 5774 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5775 5775 * which is being reused by DR, which will flush those references
5776 5776 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5777 5777 */
5778 5778 kpreempt_disable();
5779 5779 vc = CPU->cpu_vm_data;
5780 5780 ASSERT(vc != NULL);
5781 5781
5782 5782 MEMSEG_STAT_INCR(nsearch);
5783 5783
5784 5784 /* Try last winner first */
5785 5785 if (((seg = vc->vc_pnum_memseg) != NULL) &&
5786 5786 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5787 5787 MEMSEG_STAT_INCR(nlastwon);
5788 5788 pp = seg->pages + (pfnum - seg->pages_base);
5789 5789 if (pp->p_pagenum == pfnum) {
5790 5790 kpreempt_enable();
5791 5791 return ((page_t *)pp);
5792 5792 }
5793 5793 }
5794 5794
5795 5795 /* Else Try hash */
5796 5796 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5797 5797 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5798 5798 MEMSEG_STAT_INCR(nhashwon);
5799 5799 vc->vc_pnum_memseg = seg;
5800 5800 pp = seg->pages + (pfnum - seg->pages_base);
5801 5801 if (pp->p_pagenum == pfnum) {
5802 5802 kpreempt_enable();
5803 5803 return ((page_t *)pp);
5804 5804 }
5805 5805 }
5806 5806
5807 5807 /* Else Brute force */
5808 5808 for (seg = memsegs; seg != NULL; seg = seg->next) {
5809 5809 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5810 5810 vc->vc_pnum_memseg = seg;
5811 5811 pp = seg->pages + (pfnum - seg->pages_base);
5812 5812 if (pp->p_pagenum == pfnum) {
5813 5813 kpreempt_enable();
5814 5814 return ((page_t *)pp);
5815 5815 }
5816 5816 }
5817 5817 }
5818 5818 vc->vc_pnum_memseg = NULL;
5819 5819 kpreempt_enable();
5820 5820 MEMSEG_STAT_INCR(nnotfound);
5821 5821 return ((page_t *)NULL);
5822 5822
5823 5823 }
5824 5824
5825 5825 struct memseg *
5826 5826 page_numtomemseg_nolock(pfn_t pfnum)
5827 5827 {
5828 5828 struct memseg *seg;
5829 5829 page_t *pp;
5830 5830
5831 5831 /*
5832 5832 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5833 5833 * which is being reused by DR, which will flush those references
5834 5834 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5835 5835 */
5836 5836 kpreempt_disable();
5837 5837 /* Try hash */
5838 5838 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5839 5839 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5840 5840 pp = seg->pages + (pfnum - seg->pages_base);
5841 5841 if (pp->p_pagenum == pfnum) {
5842 5842 kpreempt_enable();
5843 5843 return (seg);
5844 5844 }
5845 5845 }
5846 5846
5847 5847 /* Else Brute force */
5848 5848 for (seg = memsegs; seg != NULL; seg = seg->next) {
5849 5849 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5850 5850 pp = seg->pages + (pfnum - seg->pages_base);
5851 5851 if (pp->p_pagenum == pfnum) {
5852 5852 kpreempt_enable();
5853 5853 return (seg);
5854 5854 }
5855 5855 }
5856 5856 }
5857 5857 kpreempt_enable();
5858 5858 return ((struct memseg *)NULL);
5859 5859 }
5860 5860
5861 5861 /*
5862 5862 * Given a page and a count return the page struct that is
5863 5863 * n structs away from the current one in the global page
5864 5864 * list.
5865 5865 *
5866 5866 * This function wraps to the first page upon
5867 5867 * reaching the end of the memseg list.
5868 5868 */
5869 5869 page_t *
5870 5870 page_nextn(page_t *pp, ulong_t n)
5871 5871 {
5872 5872 struct memseg *seg;
5873 5873 page_t *ppn;
5874 5874 vm_cpu_data_t *vc;
5875 5875
5876 5876 /*
5877 5877 * We need to disable kernel preemption while referencing the
5878 5878 * cpu_vm_data field in order to prevent us from being switched to
5879 5879 * another cpu and trying to reference it after it has been freed.
5880 5880 * This will keep us on cpu and prevent it from being removed while
5881 5881 * we are still on it.
5882 5882 *
5883 5883 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5884 5884 * which is being reused by DR, which will flush those references
5885 5885 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5886 5886 */
5887 5887 kpreempt_disable();
5888 5888 vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5889 5889
5890 5890 ASSERT(vc != NULL);
5891 5891
5892 5892 if (((seg = vc->vc_pnext_memseg) == NULL) ||
5893 5893 (seg->pages_base == seg->pages_end) ||
5894 5894 !(pp >= seg->pages && pp < seg->epages)) {
5895 5895
5896 5896 for (seg = memsegs; seg; seg = seg->next) {
5897 5897 if (pp >= seg->pages && pp < seg->epages)
5898 5898 break;
5899 5899 }
5900 5900
5901 5901 if (seg == NULL) {
5902 5902 /* Memory delete got in, return something valid. */
5903 5903 /* TODO: fix me. */
5904 5904 seg = memsegs;
5905 5905 pp = seg->pages;
5906 5906 }
5907 5907 }
5908 5908
5909 5909 /* check for wraparound - possible if n is large */
5910 5910 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5911 5911 n -= seg->epages - pp;
5912 5912 seg = seg->next;
5913 5913 if (seg == NULL)
5914 5914 seg = memsegs;
5915 5915 pp = seg->pages;
5916 5916 }
5917 5917 vc->vc_pnext_memseg = seg;
5918 5918 kpreempt_enable();
5919 5919 return (ppn);
5920 5920 }
5921 5921
5922 5922 /*
5923 5923 * Initialize for a loop using page_next_scan_large().
5924 5924 */
5925 5925 page_t *
5926 5926 page_next_scan_init(void **cookie)
5927 5927 {
5928 5928 ASSERT(cookie != NULL);
5929 5929 *cookie = (void *)memsegs;
5930 5930 return ((page_t *)memsegs->pages);
5931 5931 }
5932 5932
5933 5933 /*
5934 5934 * Return the next page in a scan of page_t's, assuming we want
5935 5935 * to skip over sub-pages within larger page sizes.
5936 5936 *
5937 5937 * The cookie is used to keep track of the current memseg.
5938 5938 */
5939 5939 page_t *
5940 5940 page_next_scan_large(
5941 5941 page_t *pp,
5942 5942 ulong_t *n,
5943 5943 void **cookie)
5944 5944 {
5945 5945 struct memseg *seg = (struct memseg *)*cookie;
5946 5946 page_t *new_pp;
5947 5947 ulong_t cnt;
5948 5948 pfn_t pfn;
5949 5949
5950 5950
5951 5951 /*
5952 5952 * get the count of page_t's to skip based on the page size
5953 5953 */
5954 5954 ASSERT(pp != NULL);
5955 5955 if (pp->p_szc == 0) {
5956 5956 cnt = 1;
5957 5957 } else {
5958 5958 pfn = page_pptonum(pp);
5959 5959 cnt = page_get_pagecnt(pp->p_szc);
5960 5960 cnt -= pfn & (cnt - 1);
5961 5961 }
5962 5962 *n += cnt;
5963 5963 new_pp = pp + cnt;
5964 5964
5965 5965 /*
5966 5966 * Catch if we went past the end of the current memory segment. If so,
5967 5967 * just move to the next segment with pages.
5968 5968 */
5969 5969 if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5970 5970 do {
5971 5971 seg = seg->next;
5972 5972 if (seg == NULL)
5973 5973 seg = memsegs;
5974 5974 } while (seg->pages_base == seg->pages_end);
5975 5975 new_pp = seg->pages;
5976 5976 *cookie = (void *)seg;
5977 5977 }
5978 5978
5979 5979 return (new_pp);
5980 5980 }
5981 5981
5982 5982
5983 5983 /*
5984 5984 * Returns next page in list. Note: this function wraps
5985 5985 * to the first page in the list upon reaching the end
5986 5986 * of the list. Callers should be aware of this fact.
5987 5987 */
5988 5988
5989 5989 /* We should change this to be a #define */
5990 5990
5991 5991 page_t *
5992 5992 page_next(page_t *pp)
5993 5993 {
5994 5994 return (page_nextn(pp, 1));
5995 5995 }
5996 5996
5997 5997 page_t *
5998 5998 page_first()
5999 5999 {
6000 6000 return ((page_t *)memsegs->pages);
6001 6001 }
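/*
 * Illustrative sketch, not part of this file: because page_next() (via
 * page_nextn()) wraps to the first page rather than returning NULL, a
 * walk over every page_t terminates by comparing against the starting
 * page, as in this hypothetical helper.
 */
static void
example_walk_all_pages(void)
{
	page_t *first = page_first();
	page_t *pp = first;

	do {
		/* ... examine *pp here ... */
		pp = page_next(pp);
	} while (pp != first);
}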
6002 6002
6003 6003
6004 6004 /*
6005 6005 * This routine is called at boot with the initial memory configuration
6006 6006 * and when memory is added or removed.
6007 6007 */
6008 6008 void
6009 6009 build_pfn_hash()
6010 6010 {
6011 6011 pfn_t cur;
6012 6012 pgcnt_t index;
6013 6013 struct memseg *pseg;
6014 6014 int i;
6015 6015
6016 6016 /*
6017 6017 * Clear memseg_hash array.
6018 6018 * Since memory add/delete is designed to operate concurrently
6019 6019 * with normal operation, the hash rebuild must be able to run
6020 6020 * concurrently with page_numtopp_nolock(). To support this
6021 6021 * functionality, assignments to memseg_hash array members must
6022 6022 * be done atomically.
6023 6023 *
6024 6024 * NOTE: bzero() does not currently guarantee this for kernel
6025 6025 * threads, and cannot be used here.
6026 6026 */
6027 6027 for (i = 0; i < N_MEM_SLOTS; i++)
6028 6028 memseg_hash[i] = NULL;
6029 6029
6030 6030 hat_kpm_mseghash_clear(N_MEM_SLOTS);
6031 6031
6032 6032 /*
6033 6033 * Physmax is the last valid pfn.
6034 6034 */
6035 6035 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6036 6036 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6037 6037 index = MEMSEG_PFN_HASH(pseg->pages_base);
6038 6038 cur = pseg->pages_base;
6039 6039 do {
6040 6040 if (index >= N_MEM_SLOTS)
6041 6041 index = MEMSEG_PFN_HASH(cur);
6042 6042
6043 6043 if (memseg_hash[index] == NULL ||
6044 6044 memseg_hash[index]->pages_base > pseg->pages_base) {
6045 6045 memseg_hash[index] = pseg;
6046 6046 hat_kpm_mseghash_update(index, pseg);
6047 6047 }
6048 6048 cur += mhash_per_slot;
6049 6049 index++;
6050 6050 } while (cur < pseg->pages_end);
6051 6051 }
6052 6052 }
6053 6053
6054 6054 /*
6055 6055 * Return the pagenum for the pp
6056 6056 */
6057 6057 pfn_t
6058 6058 page_pptonum(page_t *pp)
6059 6059 {
6060 6060 return (pp->p_pagenum);
6061 6061 }
6062 6062
6063 6063 /*
6064 6064 * interface to the referenced and modified etc bits
6065 6065 * in the PSM part of the page struct
6066 6066 * when no locking is desired.
6067 6067 */
6068 6068 void
6069 6069 page_set_props(page_t *pp, uint_t flags)
6070 6070 {
6071 6071 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6072 6072 pp->p_nrm |= (uchar_t)flags;
6073 6073 }
6074 6074
6075 6075 void
6076 6076 page_clr_all_props(page_t *pp)
6077 6077 {
6078 6078 pp->p_nrm = 0;
6079 6079 }
6080 6080
6081 6081 /*
6082 6082 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6083 6083 */
6084 6084 int
6085 6085 page_clear_lck_cow(page_t *pp, int adjust)
6086 6086 {
6087 6087 int f_amount;
6088 6088
6089 6089 ASSERT(PAGE_EXCL(pp));
6090 6090
6091 6091 /*
6092 6092 * The page_struct_lock need not be acquired here since
6093 6093 * we require the caller hold the page exclusively locked.
6094 6094 */
6095 6095 f_amount = 0;
6096 6096 if (pp->p_lckcnt) {
6097 6097 f_amount = 1;
6098 6098 pp->p_lckcnt = 0;
6099 6099 }
6100 6100 if (pp->p_cowcnt) {
6101 6101 f_amount += pp->p_cowcnt;
6102 6102 pp->p_cowcnt = 0;
6103 6103 }
6104 6104
6105 6105 if (adjust && f_amount) {
6106 6106 mutex_enter(&freemem_lock);
6107 6107 availrmem += f_amount;
6108 6108 mutex_exit(&freemem_lock);
6109 6109 }
6110 6110
6111 6111 return (f_amount);
6112 6112 }
6113 6113
6114 6114 /*
6115 6115 * The following function is called from free_vp_pages()
6116 6116 * for an inexact estimate of a newly freed page...
6117 6117 */
6118 6118 ulong_t
6119 6119 page_share_cnt(page_t *pp)
6120 6120 {
6121 6121 return (hat_page_getshare(pp));
6122 6122 }
6123 6123
6124 6124 int
6125 6125 page_isshared(page_t *pp)
6126 6126 {
6127 6127 return (hat_page_checkshare(pp, 1));
6128 6128 }
6129 6129
6130 6130 int
6131 6131 page_isfree(page_t *pp)
6132 6132 {
6133 6133 return (PP_ISFREE(pp));
6134 6134 }
6135 6135
6136 6136 int
6137 6137 page_isref(page_t *pp)
6138 6138 {
6139 6139 return (hat_page_getattr(pp, P_REF));
6140 6140 }
6141 6141
6142 6142 int
6143 6143 page_ismod(page_t *pp)
6144 6144 {
6145 6145 return (hat_page_getattr(pp, P_MOD));
6146 6146 }
6147 6147
6148 6148 /*
6149 6149 * The following code all currently relates to the page capture logic:
6150 6150 *
6151 6151 * This logic is used for cases where there is a desire to claim a certain
6152 6152 * physical page in the system for the caller. As it may not be possible
6153 6153 * to capture the page immediately, the p_toxic bits are used in the page
6154 6154 * structure to indicate that someone wants to capture this page. When the
6155 6155 * page gets unlocked, the toxic flag will be noted and an attempt to capture
6156 6156 * the page will be made. If it is successful, the original caller's callback
6157 6157 * will be called with the page to do with it what they please.
6158 6158 *
6159 6159 * There is also an async thread which wakes up occasionally to attempt to
6160 6160 * capture pages which have the capture bit set. All of the pages which
6161 6161 * need to be captured asynchronously have been inserted into the
6162 6162 * page_capture_hash and thus this thread walks that hash list. Items in the
6163 6163 * hash have an expiration time so this thread handles that as well by removing
6164 6164 * the item from the hash if it has expired.
6165 6165 *
6166 6166 * Some important things to note are:
6167 6167 * - if the PR_CAPTURE bit is set on a page, then the page is in the
6168 6168 * page_capture_hash. The page_capture_hash_head.pchh_mutex is needed
6169 6169 * to set and clear this bit, and entries can only be added to or removed
6170 6170 * from the hash while that lock is held.
6171 6171 * - the PR_CAPTURE bit can only be set and cleared while holding the
6172 6172 * page_capture_hash_head.pchh_mutex
6173 6173 * - the t_flag field of the thread struct is used with the T_CAPTURING
6174 6174 * flag to prevent recursion while dealing with large pages.
6175 6175 * - pages which need to be retired never expire on the page_capture_hash.
6176 6176 */
6177 6177
6178 6178 static void page_capture_thread(void);
6179 6179 static kthread_t *pc_thread_id;
6180 6180 kcondvar_t pc_cv;
6181 6181 static kmutex_t pc_thread_mutex;
6182 6182 static clock_t pc_thread_shortwait;
6183 6183 static clock_t pc_thread_longwait;
6184 6184 static int pc_thread_retry;
6185 6185
6186 6186 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6187 6187
6188 6188 /* Note that this is a circular linked list */
6189 6189 typedef struct page_capture_hash_bucket {
6190 6190 page_t *pp;
6191 6191 uchar_t szc;
6192 6192 uchar_t pri;
6193 6193 uint_t flags;
6194 6194 clock_t expires; /* lbolt at which this request expires. */
6195 6195 void *datap; /* Cached data passed in for callback */
6196 6196 struct page_capture_hash_bucket *next;
6197 6197 struct page_capture_hash_bucket *prev;
6198 6198 } page_capture_hash_bucket_t;
6199 6199
6200 6200 #define PC_PRI_HI 0 /* capture now */
6201 6201 #define PC_PRI_LO 1 /* capture later */
6202 6202 #define PC_NUM_PRI 2
6203 6203
6204 6204 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6205 6205
6206 6206
6207 6207 /*
6208 6208 * Each hash bucket will have its own mutex and two lists, which are:
6209 6209 * active (0): represents requests which have not been processed by
6210 6210 * the page_capture async thread yet.
6211 6211 * walked (1): represents requests which have been processed by the
6212 6212 * page_capture async thread within its given walk of this bucket.
6213 6213 *
6214 6214 * These are all needed so that we can synchronize all async page_capture
6215 6215 * events. When the async thread moves to a new bucket, it will append the
6216 6216 * walked list to the active list and walk each item one at a time, moving it
6217 6217 * from the active list to the walked list. Thus if there is an async request
6218 6218 * outstanding for a given page, it will always be in one of the two lists.
6219 6219 * New requests will always be added to the active list.
6220 6220 * If we were not able to capture a page before the request expired, we'd free
6221 6221 * up the request structure which would indicate to page_capture that there is
6222 6222 * no longer a need for the given page, and clear the PR_CAPTURE flag if
6223 6223 * possible.
6224 6224 */
6225 6225 typedef struct page_capture_hash_head {
6226 6226 kmutex_t pchh_mutex;
6227 6227 uint_t num_pages[PC_NUM_PRI];
6228 6228 page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6229 6229 } page_capture_hash_head_t;
6230 6230
6231 6231 #ifdef DEBUG
6232 6232 #define NUM_PAGE_CAPTURE_BUCKETS 4
6233 6233 #else
6234 6234 #define NUM_PAGE_CAPTURE_BUCKETS 64
6235 6235 #endif
6236 6236
6237 6237 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6238 6238
6239 6239 /* for now use a very simple hash based upon the size of a page struct */
6240 6240 #define PAGE_CAPTURE_HASH(pp) \
6241 6241 ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
6242 6242
6243 6243 extern pgcnt_t swapfs_minfree;
6244 6244
6245 6245 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6246 6246
6247 6247 /*
6248 6248 * a callback function is required for page capture requests.
6249 6249 * A callback function is required for page capture requests.
6250 6250 void
6251 6251 page_capture_register_callback(uint_t index, clock_t duration,
6252 6252 int (*cb_func)(page_t *, void *, uint_t))
6253 6253 {
6254 6254 ASSERT(pc_cb[index].cb_active == 0);
6255 6255 ASSERT(cb_func != NULL);
6256 6256 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6257 6257 pc_cb[index].duration = duration;
6258 6258 pc_cb[index].cb_func = cb_func;
6259 6259 pc_cb[index].cb_active = 1;
6260 6260 rw_exit(&pc_cb[index].cb_rwlock);
6261 6261 }
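/*
 * Illustrative sketch, not part of this file: a hypothetical capture
 * consumer registers its callback for its reserved callback index
 * (duration is in lbolt ticks, or -1 for no expiration) and then asks
 * for individual pages by passing its index bit in the flags.
 * MY_CB_INDEX and the duration are made-up placeholders; the callback
 * receives the page EXCL locked and cleaned and is responsible for it
 * from then on (the common failure-path action is just page_free()).
 */
static int
example_capture_cb(page_t *pp, void *datap, uint_t flags)
{
	/* consume the captured page; this sketch simply frees it */
	page_free(pp, 1);
	return (0);
}

static void
example_capture_setup(page_t *pp)
{
	page_capture_register_callback(MY_CB_INDEX, 60 * hz,
	    example_capture_cb);
	(void) page_trycapture(pp, 0, (1 << MY_CB_INDEX), NULL);
}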
6262 6262
6263 6263 void
6264 6264 page_capture_unregister_callback(uint_t index)
6265 6265 {
6266 6266 int i, j;
6267 6267 struct page_capture_hash_bucket *bp1;
6268 6268 struct page_capture_hash_bucket *bp2;
6269 6269 struct page_capture_hash_bucket *head = NULL;
6270 6270 uint_t flags = (1 << index);
6271 6271
6272 6272 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6273 6273 ASSERT(pc_cb[index].cb_active == 1);
6274 6274 pc_cb[index].duration = 0; /* Paranoia */
6275 6275 pc_cb[index].cb_func = NULL; /* Paranoia */
6276 6276 pc_cb[index].cb_active = 0;
6277 6277 rw_exit(&pc_cb[index].cb_rwlock);
6278 6278
6279 6279 /*
6280 6280 * Just move all the entries to a private list which we can walk
6281 6281 * through without the need to hold any locks.
6282 6282 * No more requests can get added to the hash lists for this consumer
6283 6283 * as the cb_active field for the callback has been cleared.
6284 6284 */
6285 6285 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6286 6286 mutex_enter(&page_capture_hash[i].pchh_mutex);
6287 6287 for (j = 0; j < 2; j++) {
6288 6288 bp1 = page_capture_hash[i].lists[j].next;
6289 6289 /* walk through all but first (sentinel) element */
6290 6290 while (bp1 != &page_capture_hash[i].lists[j]) {
6291 6291 bp2 = bp1;
6292 6292 if (bp2->flags & flags) {
6293 6293 bp1 = bp2->next;
6294 6294 bp1->prev = bp2->prev;
6295 6295 bp2->prev->next = bp1;
6296 6296 bp2->next = head;
6297 6297 head = bp2;
6298 6298 /*
6299 6299 * Clear the PR_CAPTURE bit as we
6300 6300 * hold appropriate locks here.
6301 6301 */
6302 6302 page_clrtoxic(head->pp, PR_CAPTURE);
6303 6303 page_capture_hash[i].
6304 6304 num_pages[bp2->pri]--;
6305 6305 continue;
6306 6306 }
6307 6307 bp1 = bp1->next;
6308 6308 }
6309 6309 }
6310 6310 mutex_exit(&page_capture_hash[i].pchh_mutex);
6311 6311 }
6312 6312
6313 6313 while (head != NULL) {
6314 6314 bp1 = head;
6315 6315 head = head->next;
6316 6316 kmem_free(bp1, sizeof (*bp1));
6317 6317 }
6318 6318 }
6319 6319
6320 6320
6321 6321 /*
6322 6322 * Find pp in the active list and move it to the walked list if it
6323 6323 * exists.
6324 6324 * Note that most often pp should be at the front of the active list
6325 6325 * as it is currently used and thus there is no other sort of optimization
6326 6326 * being done here as this is a linked list data structure.
6327 6327 * Returns 1 on successful move or 0 if page could not be found.
6328 6328 */
6329 6329 static int
6330 6330 page_capture_move_to_walked(page_t *pp)
6331 6331 {
6332 6332 page_capture_hash_bucket_t *bp;
6333 6333 int index;
6334 6334
6335 6335 index = PAGE_CAPTURE_HASH(pp);
6336 6336
6337 6337 mutex_enter(&page_capture_hash[index].pchh_mutex);
6338 6338 bp = page_capture_hash[index].lists[0].next;
6339 6339 while (bp != &page_capture_hash[index].lists[0]) {
6340 6340 if (bp->pp == pp) {
6341 6341 /* Remove from old list */
6342 6342 bp->next->prev = bp->prev;
6343 6343 bp->prev->next = bp->next;
6344 6344
6345 6345 /* Add to new list */
6346 6346 bp->next = page_capture_hash[index].lists[1].next;
6347 6347 bp->prev = &page_capture_hash[index].lists[1];
6348 6348 page_capture_hash[index].lists[1].next = bp;
6349 6349 bp->next->prev = bp;
6350 6350
6351 6351 /*
6352 6352 * There is a small probability of a page on a free
6353 6353 * list being retired while being allocated
6354 6354 * and before P_RAF is set on it. The page may
6355 6355 * end up marked as a high priority request instead
6356 6356 * of a low priority request.
6357 6357 * If a P_RAF page is not marked as a low priority request,
6358 6358 * change it to a low priority request.
6359 6359 */
6360 6360 page_capture_hash[index].num_pages[bp->pri]--;
6361 6361 bp->pri = PAGE_CAPTURE_PRIO(pp);
6362 6362 page_capture_hash[index].num_pages[bp->pri]++;
6363 6363 mutex_exit(&page_capture_hash[index].pchh_mutex);
6364 6364 return (1);
6365 6365 }
6366 6366 bp = bp->next;
6367 6367 }
6368 6368 mutex_exit(&page_capture_hash[index].pchh_mutex);
6369 6369 return (0);
6370 6370 }
6371 6371
6372 6372 /*
6373 6373 * Add a new entry to the page capture hash. The only case where a new
6374 6374 * entry is not added is when the page capture consumer is no longer registered.
6375 6375 * In this case, we'll silently not add the page to the hash. We know that
6376 6376 * page retire will always be registered for the case where we are currently
6377 6377 * unretiring a page and thus there are no conflicts.
6378 6378 */
6379 6379 static void
6380 6380 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6381 6381 {
6382 6382 page_capture_hash_bucket_t *bp1;
6383 6383 page_capture_hash_bucket_t *bp2;
6384 6384 int index;
6385 6385 int cb_index;
6386 6386 int i;
6387 6387 uchar_t pri;
6388 6388 #ifdef DEBUG
6389 6389 page_capture_hash_bucket_t *tp1;
6390 6390 int l;
6391 6391 #endif
6392 6392
6393 6393 ASSERT(!(flags & CAPTURE_ASYNC));
6394 6394
6395 6395 bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6396 6396
6397 6397 bp1->pp = pp;
6398 6398 bp1->szc = szc;
6399 6399 bp1->flags = flags;
6400 6400 bp1->datap = datap;
6401 6401
6402 6402 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6403 6403 if ((flags >> cb_index) & 1) {
6404 6404 break;
6405 6405 }
6406 6406 }
6407 6407
6408 6408 ASSERT(cb_index != PC_NUM_CALLBACKS);
6409 6409
6410 6410 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6411 6411 if (pc_cb[cb_index].cb_active) {
6412 6412 if (pc_cb[cb_index].duration == -1) {
6413 6413 bp1->expires = (clock_t)-1;
6414 6414 } else {
6415 6415 bp1->expires = ddi_get_lbolt() +
6416 6416 pc_cb[cb_index].duration;
6417 6417 }
6418 6418 } else {
6419 6419 /* There's no callback registered so don't add to the hash */
6420 6420 rw_exit(&pc_cb[cb_index].cb_rwlock);
6421 6421 kmem_free(bp1, sizeof (*bp1));
6422 6422 return;
6423 6423 }
6424 6424
6425 6425 index = PAGE_CAPTURE_HASH(pp);
6426 6426
6427 6427 /*
6428 6428 * Only allow capture flag to be modified under this mutex.
6429 6429 * Prevents multiple entries for same page getting added.
6430 6430 */
6431 6431 mutex_enter(&page_capture_hash[index].pchh_mutex);
6432 6432
6433 6433 /*
6434 6434 * if not already on the hash, set capture bit and add to the hash
6435 6435 */
6436 6436 if (!(pp->p_toxic & PR_CAPTURE)) {
6437 6437 #ifdef DEBUG
6438 6438 /* Check for duplicate entries */
6439 6439 for (l = 0; l < 2; l++) {
6440 6440 tp1 = page_capture_hash[index].lists[l].next;
6441 6441 while (tp1 != &page_capture_hash[index].lists[l]) {
6442 6442 if (tp1->pp == pp) {
6443 6443 panic("page pp 0x%p already on hash "
6444 6444 "at 0x%p\n",
6445 6445 (void *)pp, (void *)tp1);
6446 6446 }
6447 6447 tp1 = tp1->next;
6448 6448 }
6449 6449 }
6450 6450
6451 6451 #endif
6452 6452 page_settoxic(pp, PR_CAPTURE);
6453 6453 pri = PAGE_CAPTURE_PRIO(pp);
6454 6454 bp1->pri = pri;
6455 6455 bp1->next = page_capture_hash[index].lists[0].next;
6456 6456 bp1->prev = &page_capture_hash[index].lists[0];
6457 6457 bp1->next->prev = bp1;
6458 6458 page_capture_hash[index].lists[0].next = bp1;
6459 6459 page_capture_hash[index].num_pages[pri]++;
6460 6460 if (flags & CAPTURE_RETIRE) {
6461 6461 page_retire_incr_pend_count(datap);
6462 6462 }
6463 6463 mutex_exit(&page_capture_hash[index].pchh_mutex);
6464 6464 rw_exit(&pc_cb[cb_index].cb_rwlock);
6465 6465 cv_signal(&pc_cv);
6466 6466 return;
6467 6467 }
6468 6468
6469 6469 /*
6470 6470 * A page retire request will replace any other request.
6471 6471 * A second physmem request which is for a different process than
6472 6472 * the currently registered one will be dropped as there is
6473 6473 * no way to hold the private data for both calls.
6474 6474 * In the future, once there are more callers, this will have to
6475 6475 * be worked out better as there needs to be private storage for
6476 6476 * at least each type of caller (maybe have datap be an array of
6477 6477 * void *'s so that we can index based upon the caller's index).
6478 6478 */
6479 6479
6480 6480 /* walk hash list to update expire time */
6481 6481 for (i = 0; i < 2; i++) {
6482 6482 bp2 = page_capture_hash[index].lists[i].next;
6483 6483 while (bp2 != &page_capture_hash[index].lists[i]) {
6484 6484 if (bp2->pp == pp) {
6485 6485 if (flags & CAPTURE_RETIRE) {
6486 6486 if (!(bp2->flags & CAPTURE_RETIRE)) {
6487 6487 page_retire_incr_pend_count(
6488 6488 datap);
6489 6489 bp2->flags = flags;
6490 6490 bp2->expires = bp1->expires;
6491 6491 bp2->datap = datap;
6492 6492 }
6493 6493 } else {
6494 6494 ASSERT(flags & CAPTURE_PHYSMEM);
6495 6495 if (!(bp2->flags & CAPTURE_RETIRE) &&
6496 6496 (datap == bp2->datap)) {
6497 6497 bp2->expires = bp1->expires;
6498 6498 }
6499 6499 }
6500 6500 mutex_exit(&page_capture_hash[index].
6501 6501 pchh_mutex);
6502 6502 rw_exit(&pc_cb[cb_index].cb_rwlock);
6503 6503 kmem_free(bp1, sizeof (*bp1));
6504 6504 return;
6505 6505 }
6506 6506 bp2 = bp2->next;
6507 6507 }
6508 6508 }
6509 6509
6510 6510 /*
6511 6511 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6512 6512 * and thus it either has to be set or not set and can't change
6513 6513 * while holding the mutex above.
6514 6514 */
6515 6515 panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6516 6516 (void *)pp);
6517 6517 }
6518 6518
6519 6519 /*
6520 6520 * We have a page in our hands, let's try to make it ours by turning
6521 6521 * it into a clean page like it had just come off the freelists.
6522 6522 *
6523 6523 * Returns 0 on success, with the page still EXCL locked.
6524 6524 * On failure, the page will be unlocked and EAGAIN is returned.
6525 6525 */
6526 6526 static int
6527 6527 page_capture_clean_page(page_t *pp)
6528 6528 {
6529 6529 page_t *newpp;
6530 6530 int skip_unlock = 0;
6531 6531 spgcnt_t count;
6532 6532 page_t *tpp;
6533 6533 int ret = 0;
6534 6534 int extra;
6535 6535
6536 6536 ASSERT(PAGE_EXCL(pp));
6537 6537 ASSERT(!PP_RETIRED(pp));
6538 6538 ASSERT(curthread->t_flag & T_CAPTURING);
6539 6539
6540 6540 if (PP_ISFREE(pp)) {
6541 6541 if (!page_reclaim(pp, NULL)) {
6542 6542 skip_unlock = 1;
6543 6543 ret = EAGAIN;
6544 6544 goto cleanup;
6545 6545 }
6546 6546 ASSERT(pp->p_szc == 0);
6547 6547 if (pp->p_vnode != NULL) {
6548 6548 /*
6549 6549 * Since this page came from the
6550 6550 * cachelist, we must destroy the
6551 6551 * old vnode association.
6552 6552 */
6553 6553 page_hashout(pp, NULL);
6554 6554 }
6555 6555 goto cleanup;
6556 6556 }
6557 6557
6558 6558 /*
6559 6559 * If we know page_relocate will fail, skip it
6560 6560 * It could still fail due to a UE on another page but we
6561 6561 * can't do anything about that.
6562 6562 */
6563 6563 if (pp->p_toxic & PR_UE) {
6564 6564 goto skip_relocate;
6565 6565 }
6566 6566
6567 6567 /*
6568 6568 * It's possible that pages cannot have a vnode, as fsflush comes
6569 6569 * through and cleans up these pages. It's ugly but that's how it is.
6570 6570 */
6571 6571 if (pp->p_vnode == NULL) {
6572 6572 goto skip_relocate;
6573 6573 }
6574 6574
6575 6575 /*
6576 6576 * Page was not free, so let's try to relocate it.
6577 6577 * page_relocate only works with root pages, so if this is not a root
6578 6578 * page, we need to demote it to try and relocate it.
6579 6579 * page, we need to demote it to try to relocate it.
6580 6580 */
6581 6581 newpp = NULL;
6582 6582 if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6583 6583 if (page_try_demote_pages(pp) == 0) {
6584 6584 ret = EAGAIN;
6585 6585 goto cleanup;
6586 6586 }
6587 6587 }
6588 6588 ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6589 6589 if (ret == 0) {
6590 6590 page_t *npp;
6591 6591 /* unlock the new page(s) */
6592 6592 while (count-- > 0) {
6593 6593 ASSERT(newpp != NULL);
6594 6594 npp = newpp;
6595 6595 page_sub(&newpp, npp);
6596 6596 page_unlock(npp);
6597 6597 }
6598 6598 ASSERT(newpp == NULL);
6599 6599 /*
6600 6600 * Check to see if the page we have is too large.
6601 6601 * If so, demote it, freeing up the extra pages.
6602 6602 */
6603 6603 if (pp->p_szc > 0) {
6604 6604 /* For now demote extra pages to szc == 0 */
6605 6605 extra = page_get_pagecnt(pp->p_szc) - 1;
6606 6606 while (extra > 0) {
6607 6607 tpp = pp->p_next;
6608 6608 page_sub(&pp, tpp);
6609 6609 tpp->p_szc = 0;
6610 6610 page_free(tpp, 1);
6611 6611 extra--;
6612 6612 }
6613 6613 /* Make sure to set our page to szc 0 as well */
6614 6614 ASSERT(pp->p_next == pp && pp->p_prev == pp);
6615 6615 pp->p_szc = 0;
6616 6616 }
6617 6617 goto cleanup;
6618 6618 } else if (ret == EIO) {
6619 6619 ret = EAGAIN;
6620 6620 goto cleanup;
6621 6621 } else {
6622 6622 /*
6623 6623 * Need to reset return type as we failed to relocate the page
6624 6624 * but that does not mean that some of the next steps will not
6625 6625 * work.
6626 6626 */
6627 6627 ret = 0;
6628 6628 }
6629 6629
6630 6630 skip_relocate:
6631 6631
6632 6632 if (pp->p_szc > 0) {
6633 6633 if (page_try_demote_pages(pp) == 0) {
6634 6634 ret = EAGAIN;
6635 6635 goto cleanup;
6636 6636 }
6637 6637 }
6638 6638
6639 6639 ASSERT(pp->p_szc == 0);
6640 6640
6641 6641 if (hat_ismod(pp)) {
6642 6642 ret = EAGAIN;
6643 6643 goto cleanup;
6644 6644 }
6645 6645 if (PP_ISKAS(pp)) {
6646 6646 ret = EAGAIN;
6647 6647 goto cleanup;
6648 6648 }
6649 6649 if (pp->p_lckcnt || pp->p_cowcnt) {
6650 6650 ret = EAGAIN;
6651 6651 goto cleanup;
6652 6652 }
6653 6653
6654 6654 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6655 6655 ASSERT(!hat_page_is_mapped(pp));
6656 6656
6657 6657 if (hat_ismod(pp)) {
6658 6658 /*
6659 6659 * This is a semi-odd case as the page is now modified but not
6660 6660 * mapped as we just unloaded the mappings above.
6661 6661 */
6662 6662 ret = EAGAIN;
6663 6663 goto cleanup;
6664 6664 }
6665 6665 if (pp->p_vnode != NULL) {
6666 6666 page_hashout(pp, NULL);
6667 6667 }
6668 6668
6669 6669 /*
6670 6670 * At this point, the page should be in a clean state and
6671 6671 * we can do whatever we want with it.
6672 6672 */
6673 6673
6674 6674 cleanup:
6675 6675 if (ret != 0) {
6676 6676 if (!skip_unlock) {
6677 6677 page_unlock(pp);
6678 6678 }
6679 6679 } else {
6680 6680 ASSERT(pp->p_szc == 0);
6681 6681 ASSERT(PAGE_EXCL(pp));
6682 6682
6683 6683 pp->p_next = pp;
6684 6684 pp->p_prev = pp;
6685 6685 }
6686 6686 return (ret);
6687 6687 }
6688 6688
6689 6689 /*
6690 6690 * Various callers of page_trycapture() can have different restrictions upon
6691 6691 * what memory they have access to.
6692 6692 * Returns 0 on success, with the following error codes on failure:
6693 6693 * EPERM - The requested page is long term locked, and thus repeated
6694 6694 * requests to capture this page will likely fail.
6695 6695 * ENOMEM - There was not enough free memory in the system to safely
6696 6696 * map the requested page.
6697 6697 * ENOENT - The requested page was inside the kernel cage, and the
6698 6698 * PHYSMEM_CAGE flag was not set.
6699 6699 */
6700 6700 int
6701 6701 page_capture_pre_checks(page_t *pp, uint_t flags)
6702 6702 {
6703 6703 ASSERT(pp != NULL);
6704 6704
6705 6705 #if defined(__sparc)
6706 6706 if (pp->p_vnode == &promvp) {
6707 6707 return (EPERM);
6708 6708 }
6709 6709
6710 6710 if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6711 6711 (flags & CAPTURE_PHYSMEM)) {
6712 6712 return (ENOENT);
6713 6713 }
6714 6714
6715 6715 if (PP_ISNORELOCKERNEL(pp)) {
6716 6716 return (EPERM);
6717 6717 }
6718 6718 #else
6719 6719 if (PP_ISKAS(pp)) {
6720 6720 return (EPERM);
6721 6721 }
6722 6722 #endif /* __sparc */
6723 6723
6724 6724 /* only physmem currently has the restrictions checked below */
6725 6725 if (!(flags & CAPTURE_PHYSMEM)) {
6726 6726 return (0);
6727 6727 }
6728 6728
6729 6729 if (availrmem < swapfs_minfree) {
6730 6730 /*
6731 6731 * We won't try to capture this page as we are
6732 6732 * running low on memory.
6733 6733 */
6734 6734 return (ENOMEM);
6735 6735 }
6736 6736 return (0);
6737 6737 }
6738 6738
6739 6739 /*
6740 6740 * Once we have a page in our mitts, go ahead and complete the capture
6741 6741 * operation.
6742 6742 * Returns 1 on failure where page is no longer needed
6743 6743 * Returns 0 on success
6744 6744 * Returns -1 if there was a transient failure.
6745 6745 * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6746 6746 */
6747 6747 int
6748 6748 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6749 6749 {
6750 6750 int cb_index;
6751 6751 int ret = 0;
6752 6752 page_capture_hash_bucket_t *bp1;
6753 6753 page_capture_hash_bucket_t *bp2;
6754 6754 int index;
6755 6755 int found = 0;
6756 6756 int i;
6757 6757
6758 6758 ASSERT(PAGE_EXCL(pp));
6759 6759 ASSERT(curthread->t_flag & T_CAPTURING);
6760 6760
6761 6761 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6762 6762 if ((flags >> cb_index) & 1) {
6763 6763 break;
6764 6764 }
6765 6765 }
6766 6766 ASSERT(cb_index < PC_NUM_CALLBACKS);
6767 6767
6768 6768 /*
6769 6769 * Remove the entry from the page_capture hash, but don't free it yet
6770 6770 * as we may need to put it back.
6771 6771 * Since we own the page at this point in time, we should find it
6772 6772 * in the hash if this is an ASYNC call. If we don't it's likely
6773 6773 * that the page_capture_async() thread decided that this request
6774 6774 * had expired, in which case we just continue on.
6775 6775 */
6776 6776 if (flags & CAPTURE_ASYNC) {
6777 6777
6778 6778 index = PAGE_CAPTURE_HASH(pp);
6779 6779
6780 6780 mutex_enter(&page_capture_hash[index].pchh_mutex);
6781 6781 for (i = 0; i < 2 && !found; i++) {
6782 6782 bp1 = page_capture_hash[index].lists[i].next;
6783 6783 while (bp1 != &page_capture_hash[index].lists[i]) {
6784 6784 if (bp1->pp == pp) {
6785 6785 bp1->next->prev = bp1->prev;
6786 6786 bp1->prev->next = bp1->next;
6787 6787 page_capture_hash[index].
6788 6788 num_pages[bp1->pri]--;
6789 6789 page_clrtoxic(pp, PR_CAPTURE);
6790 6790 found = 1;
6791 6791 break;
6792 6792 }
6793 6793 bp1 = bp1->next;
6794 6794 }
6795 6795 }
6796 6796 mutex_exit(&page_capture_hash[index].pchh_mutex);
6797 6797 }
6798 6798
6799 6799 /* Synchronize with the unregister func. */
6800 6800 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6801 6801 if (!pc_cb[cb_index].cb_active) {
6802 6802 page_free(pp, 1);
6803 6803 rw_exit(&pc_cb[cb_index].cb_rwlock);
6804 6804 if (found) {
6805 6805 kmem_free(bp1, sizeof (*bp1));
6806 6806 }
6807 6807 return (1);
6808 6808 }
6809 6809
6810 6810 /*
6811 6811 * We need to remove the entry from the page capture hash and turn off
6812 6812 * the PR_CAPTURE bit before calling the callback. We'll need to cache
6813 6813 * the entry here, and then based upon the return value, clean up
6814 6814 * appropriately or re-add it to the hash, making sure that someone else
6815 6815 * hasn't already done so.
6816 6816 * It should be rare for the callback to fail and thus it's ok for
6817 6817 * the failure path to be a bit complicated as the success path is
6818 6818 * cleaner and the locking rules are easier to follow.
6819 6819 */
6820 6820
6821 6821 ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6822 6822
6823 6823 rw_exit(&pc_cb[cb_index].cb_rwlock);
6824 6824
6825 6825 /*
6826 6826 * If this was an ASYNC request, we need to cleanup the hash if the
6827 6827 * callback was successful or if the request was no longer valid.
6828 6828 * For non-ASYNC requests, we return failure to map and the caller
6829 6829 * will take care of adding the request to the hash.
6830 6830 * Note also that the callback itself is responsible for the page
6831 6831 * at this point in time in terms of locking ... The most common
6832 6832 * case for the failure path should just be a page_free.
6833 6833 */
6834 6834 if (ret >= 0) {
6835 6835 if (found) {
6836 6836 if (bp1->flags & CAPTURE_RETIRE) {
6837 6837 page_retire_decr_pend_count(datap);
6838 6838 }
6839 6839 kmem_free(bp1, sizeof (*bp1));
6840 6840 }
6841 6841 return (ret);
6842 6842 }
6843 6843 if (!found) {
6844 6844 return (ret);
6845 6845 }
6846 6846
6847 6847 ASSERT(flags & CAPTURE_ASYNC);
6848 6848
6849 6849 /*
6850 6850 * Check for expiration time first as we can just free it up if it's
6851 6851 * expired.
6852 6852 */
6853 6853 if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6854 6854 kmem_free(bp1, sizeof (*bp1));
6855 6855 return (ret);
6856 6856 }
6857 6857
6858 6858 /*
6859 6859 * The callback failed and there used to be an entry in the hash for
6860 6860 * this page, so we need to add it back to the hash.
6861 6861 */
6862 6862 mutex_enter(&page_capture_hash[index].pchh_mutex);
6863 6863 if (!(pp->p_toxic & PR_CAPTURE)) {
6864 6864 /* just add bp1 back to head of walked list */
6865 6865 page_settoxic(pp, PR_CAPTURE);
6866 6866 bp1->next = page_capture_hash[index].lists[1].next;
6867 6867 bp1->prev = &page_capture_hash[index].lists[1];
6868 6868 bp1->next->prev = bp1;
6869 6869 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6870 6870 page_capture_hash[index].lists[1].next = bp1;
6871 6871 page_capture_hash[index].num_pages[bp1->pri]++;
6872 6872 mutex_exit(&page_capture_hash[index].pchh_mutex);
6873 6873 return (ret);
6874 6874 }
6875 6875
6876 6876 /*
6877 6877 * Otherwise there was a new capture request added to list
6878 6878 * Need to make sure that our original data is represented if
6879 6879 * appropriate.
6880 6880 */
6881 6881 for (i = 0; i < 2; i++) {
6882 6882 bp2 = page_capture_hash[index].lists[i].next;
6883 6883 while (bp2 != &page_capture_hash[index].lists[i]) {
6884 6884 if (bp2->pp == pp) {
6885 6885 if (bp1->flags & CAPTURE_RETIRE) {
6886 6886 if (!(bp2->flags & CAPTURE_RETIRE)) {
6887 6887 bp2->szc = bp1->szc;
6888 6888 bp2->flags = bp1->flags;
6889 6889 bp2->expires = bp1->expires;
6890 6890 bp2->datap = bp1->datap;
6891 6891 }
6892 6892 } else {
6893 6893 ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6894 6894 if (!(bp2->flags & CAPTURE_RETIRE)) {
6895 6895 bp2->szc = bp1->szc;
6896 6896 bp2->flags = bp1->flags;
6897 6897 bp2->expires = bp1->expires;
6898 6898 bp2->datap = bp1->datap;
6899 6899 }
6900 6900 }
6901 6901 page_capture_hash[index].num_pages[bp2->pri]--;
6902 6902 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6903 6903 page_capture_hash[index].num_pages[bp2->pri]++;
6904 6904 mutex_exit(&page_capture_hash[index].
6905 6905 pchh_mutex);
6906 6906 kmem_free(bp1, sizeof (*bp1));
6907 6907 return (ret);
6908 6908 }
6909 6909 bp2 = bp2->next;
6910 6910 }
6911 6911 }
6912 6912 panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6913 6913 /*NOTREACHED*/
6914 6914 }
6915 6915
6916 6916 /*
6917 6917 * Try to capture the given page for the caller specified in the flags
6918 6918 * parameter. The page will either be captured and handed over to the
6919 6919 * appropriate callback, or will be queued up in the page capture hash
6920 6920 * to be captured asynchronously.
6921 6921 * If the current request is due to an async capture, the page must be
6922 6922 * exclusively locked before calling this function.
6923 6923 * Currently szc must be 0 but in the future this should be expandable to
6924 6924 * other page sizes.
6925 6925 * Returns 0 on success, with the following error codes on failure:
6926 6926 * EPERM - The requested page is long term locked, and thus repeated
6927 6927 * requests to capture this page will likely fail.
6928 6928 * ENOMEM - There was not enough free memory in the system to safely
6929 6929 * map the requested page.
6930 6930 * ENOENT - The requested page was inside the kernel cage, and the
6931 6931 * CAPTURE_GET_CAGE flag was not set.
6932 6932 * EAGAIN - The requested page could not be captured at this point in
6933 6933 * time but future requests will likely work.
6934 6934 * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6935 6935 * was not set.
6936 6936 */
6937 6937 int
6938 6938 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6939 6939 {
6940 6940 int ret;
6941 6941 int cb_index;
6942 6942
6943 6943 if (flags & CAPTURE_ASYNC) {
6944 6944 ASSERT(PAGE_EXCL(pp));
6945 6945 goto async;
6946 6946 }
6947 6947
6948 6948 /* Make sure there's enough availrmem ... */
6949 6949 ret = page_capture_pre_checks(pp, flags);
6950 6950 if (ret != 0) {
6951 6951 return (ret);
6952 6952 }
6953 6953
6954 6954 if (!page_trylock(pp, SE_EXCL)) {
6955 6955 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6956 6956 if ((flags >> cb_index) & 1) {
6957 6957 break;
6958 6958 }
6959 6959 }
6960 6960 ASSERT(cb_index < PC_NUM_CALLBACKS);
6961 6961 ret = EAGAIN;
6962 6962 /* Special case for retired pages */
6963 6963 if (PP_RETIRED(pp)) {
6964 6964 if (flags & CAPTURE_GET_RETIRED) {
6965 6965 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6966 6966 /*
6967 6967 * Need to set capture bit and add to
6968 6968 * hash so that the page will be
6969 6969 * retired when freed.
6970 6970 */
6971 6971 page_capture_add_hash(pp, szc,
6972 6972 CAPTURE_RETIRE, NULL);
6973 6973 ret = 0;
6974 6974 goto own_page;
6975 6975 }
6976 6976 } else {
6977 6977 return (EBUSY);
6978 6978 }
6979 6979 }
6980 6980 page_capture_add_hash(pp, szc, flags, datap);
6981 6981 return (ret);
6982 6982 }
6983 6983
6984 6984 async:
6985 6985 ASSERT(PAGE_EXCL(pp));
6986 6986
6987 6987 /* For physmem async requests, check that availrmem is sane */
6988 6988 if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6989 6989 (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6990 6990 (availrmem < swapfs_minfree)) {
6991 6991 page_unlock(pp);
6992 6992 return (ENOMEM);
6993 6993 }
6994 6994
6995 6995 ret = page_capture_clean_page(pp);
6996 6996
6997 6997 if (ret != 0) {
6998 6998 /* We failed to get the page, so let's add it to the hash */
6999 6999 if (!(flags & CAPTURE_ASYNC)) {
7000 7000 page_capture_add_hash(pp, szc, flags, datap);
7001 7001 }
7002 7002 return (ret);
7003 7003 }
7004 7004
7005 7005 own_page:
7006 7006 ASSERT(PAGE_EXCL(pp));
7007 7007 ASSERT(pp->p_szc == 0);
7008 7008
7009 7009 /* Call the callback */
7010 7010 ret = page_capture_take_action(pp, flags, datap);
7011 7011
7012 7012 if (ret == 0) {
7013 7013 return (0);
7014 7014 }
7015 7015
7016 7016 /*
7017 7017 * Note that in the failure cases from page_capture_take_action, the
7018 7018 * EXCL lock will have already been dropped.
7019 7019 */
7020 7020 if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7021 7021 page_capture_add_hash(pp, szc, flags, datap);
7022 7022 }
7023 7023 return (EAGAIN);
7024 7024 }
7025 7025
7026 7026 int
7027 7027 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7028 7028 {
7029 7029 int ret;
7030 7030
7031 7031 curthread->t_flag |= T_CAPTURING;
7032 7032 ret = page_itrycapture(pp, szc, flags, datap);
7033 7033 curthread->t_flag &= ~T_CAPTURING; /* clear the flag; we know we set it above */
7034 7034 return (ret);
7035 7035 }
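
For review context, here is a minimal sketch of how a synchronous caller might drive this interface. page_trycapture() and the CAPTURE_RETIRE flag come from this file; the wrapper function below is hypothetical and is not part of this change.

/*
 * Illustration only (not part of this webrev): ask to retire a single
 * small page.  Per the contract documented above, EAGAIN typically means
 * the request has been queued in the capture hash and will be retried
 * asynchronously, so a caller may treat it as "in progress".
 */
static int
example_retire_page(page_t *pp)
{
	int err;

	/* szc must currently be 0; retire requests pass a NULL datap */
	err = page_trycapture(pp, 0, CAPTURE_RETIRE, NULL);
	if (err == 0 || err == EAGAIN)
		return (0);
	return (err);	/* EPERM, ENOMEM, ENOENT or EBUSY */
}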
7036 7036
7037 7037 /*
7038 7038 * When unlocking a page which has the PR_CAPTURE bit set, this routine
7039 7039 * gets called to try and capture the page.
7040 7040 */
7041 7041 void
7042 7042 page_unlock_capture(page_t *pp)
7043 7043 {
7044 7044 page_capture_hash_bucket_t *bp;
7045 7045 int index;
7046 7046 int i;
7047 7047 uint_t szc;
7048 7048 uint_t flags = 0;
7049 7049 void *datap;
7050 7050 kmutex_t *mp;
7051 7051 extern vnode_t retired_pages;
7052 7052
7053 7053 /*
7054 7054 * We need to protect against a possible deadlock here where we own
7055 7055 * the vnode page hash mutex and would try to acquire it again: there
7056 7056 * are places in the code where we unlock a page while holding the
7057 7057 * mutex, which can lead to the page being captured and eventually
7058 7058 * ending up here. As we may be hashing out the old page and hashing
7059 7059 * into the retire vnode, we need to make sure we own neither mutex.
7060 7060 * Other callbacks that do hash operations also need to make sure,
7061 7061 * before they hashin to a vnode, that they do not currently own that
7062 7062 * vnode's vphm mutex; otherwise there will be a panic.
7063 7063 */
7064 7064 if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7065 7065 page_unlock_nocapture(pp);
7066 7066 return;
7067 7067 }
7068 7068 if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7069 7069 page_unlock_nocapture(pp);
7070 7070 return;
7071 7071 }
7072 7072
7073 7073 index = PAGE_CAPTURE_HASH(pp);
7074 7074
7075 7075 mp = &page_capture_hash[index].pchh_mutex;
7076 7076 mutex_enter(mp);
7077 7077 for (i = 0; i < 2; i++) {
7078 7078 bp = page_capture_hash[index].lists[i].next;
7079 7079 while (bp != &page_capture_hash[index].lists[i]) {
7080 7080 if (bp->pp == pp) {
7081 7081 szc = bp->szc;
7082 7082 flags = bp->flags | CAPTURE_ASYNC;
7083 7083 datap = bp->datap;
7084 7084 mutex_exit(mp);
7085 7085 (void) page_trycapture(pp, szc, flags, datap);
7086 7086 return;
7087 7087 }
7088 7088 bp = bp->next;
7089 7089 }
7090 7090 }
7091 7091
7092 7092 /* Failed to find page in hash so clear flags and unlock it. */
7093 7093 page_clrtoxic(pp, PR_CAPTURE);
7094 7094 page_unlock(pp);
7095 7095
7096 7096 mutex_exit(mp);
7097 7097 }
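
To make the callback requirement above concrete, here is a tiny guard a capture callback could apply before hashing a page into a vnode. The helper name is invented for illustration; only mutex_owned() and page_vnode_mutex(), both used above, are assumed.

/*
 * Illustration only: per the comment in page_unlock_capture(), a
 * callback must not hashin to a vnode whose page hash mutex it
 * already owns, or the hashin path will panic.
 */
static int
example_safe_to_hashin(vnode_t *vp)
{
	return (!mutex_owned(page_vnode_mutex(vp)));
}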
7098 7098
7099 7099 void
7100 7100 page_capture_init()
7101 7101 {
7102 7102 int i;
7103 7103 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7104 7104 page_capture_hash[i].lists[0].next =
7105 7105 &page_capture_hash[i].lists[0];
7106 7106 page_capture_hash[i].lists[0].prev =
7107 7107 &page_capture_hash[i].lists[0];
7108 7108 page_capture_hash[i].lists[1].next =
7109 7109 &page_capture_hash[i].lists[1];
7110 7110 page_capture_hash[i].lists[1].prev =
7111 7111 &page_capture_hash[i].lists[1];
7112 7112 }
7113 7113
7114 7114 pc_thread_shortwait = 23 * hz;
7115 7115 pc_thread_longwait = 1201 * hz;
7116 7116 pc_thread_retry = 3;
7117 7117 mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7118 7118 cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7119 7119 pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7120 7120 TS_RUN, minclsyspri);
7121 7121 }
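
As a side note for reviewers, the empty-list invariant established here is "each sentinel points at itself", and every insertion elsewhere in the file is the standard four-pointer head insert. A standalone sketch of that pattern follows; the helper name is hypothetical.

/*
 * Illustration only: head insertion into one of the circular,
 * sentinel-based capture hash lists initialized above (compare the
 * re-insertion done on the callback failure path earlier in the file).
 */
static void
example_hash_insert_head(page_capture_hash_bucket_t *sentinel,
    page_capture_hash_bucket_t *bp)
{
	bp->next = sentinel->next;
	bp->prev = sentinel;
	bp->next->prev = bp;
	sentinel->next = bp;
}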
7122 7122
7123 7123 /*
7124 7124 * It is necessary to scrub any failing pages prior to reboot in order to
7125 7125 * prevent a latent error trap from occurring on the next boot.
7126 7126 */
7127 7127 void
7128 7128 page_retire_mdboot()
7129 7129 {
7130 7130 page_t *pp;
7131 7131 int i, j;
7132 7132 page_capture_hash_bucket_t *bp;
7133 7133 uchar_t pri;
7134 7134
7135 7135 /* walk lists looking for pages to scrub */
7136 7136 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7137 7137 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7138 7138 if (page_capture_hash[i].num_pages[pri] != 0) {
7139 7139 break;
7140 7140 }
7141 7141 }
7142 7142 if (pri == PC_NUM_PRI)
7143 7143 continue;
7144 7144
7145 7145 mutex_enter(&page_capture_hash[i].pchh_mutex);
7146 7146
7147 7147 for (j = 0; j < 2; j++) {
7148 7148 bp = page_capture_hash[i].lists[j].next;
7149 7149 while (bp != &page_capture_hash[i].lists[j]) {
7150 7150 pp = bp->pp;
7151 7151 if (PP_TOXIC(pp)) {
7152 7152 if (page_trylock(pp, SE_EXCL)) {
7153 7153 PP_CLRFREE(pp);
7154 7154 pagescrub(pp, 0, PAGESIZE);
7155 7155 page_unlock(pp);
7156 7156 }
7157 7157 }
7158 7158 bp = bp->next;
7159 7159 }
7160 7160 }
7161 7161 mutex_exit(&page_capture_hash[i].pchh_mutex);
7162 7162 }
7163 7163 }
7164 7164
7165 7165 /*
7166 7166 * Walk the page_capture_hash trying to capture pages and also cleanup old
7167 7167 * entries which have expired.
7168 7168 */
7169 7169 void
7170 7170 page_capture_async()
7171 7171 {
7172 7172 page_t *pp;
7173 7173 int i;
7174 7174 int ret;
7175 7175 page_capture_hash_bucket_t *bp1, *bp2;
7176 7176 uint_t szc;
7177 7177 uint_t flags;
7178 7178 void *datap;
7179 7179 uchar_t pri;
7180 7180
7181 7181 /* If there are outstanding pages to be captured, get to work */
7182 7182 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7183 7183 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7184 7184 if (page_capture_hash[i].num_pages[pri] != 0)
7185 7185 break;
7186 7186 }
7187 7187 if (pri == PC_NUM_PRI)
7188 7188 continue;
7189 7189
7190 7190 /* Append list 1 to list 0 and then walk through list 0 */
7191 7191 mutex_enter(&page_capture_hash[i].pchh_mutex);
7192 7192 bp1 = &page_capture_hash[i].lists[1];
7193 7193 bp2 = bp1->next;
7194 7194 if (bp1 != bp2) {
7195 7195 bp1->prev->next = page_capture_hash[i].lists[0].next;
7196 7196 bp2->prev = &page_capture_hash[i].lists[0];
7197 7197 page_capture_hash[i].lists[0].next->prev = bp1->prev;
7198 7198 page_capture_hash[i].lists[0].next = bp2;
7199 7199 bp1->next = bp1;
7200 7200 bp1->prev = bp1;
7201 7201 }
7202 7202
7203 7203 /* list[1] will be empty now */
7204 7204
7205 7205 bp1 = page_capture_hash[i].lists[0].next;
7206 7206 while (bp1 != &page_capture_hash[i].lists[0]) {
7207 7207 /* Check expiration time */
7208 7208 if ((ddi_get_lbolt() > bp1->expires &&
7209 7209 bp1->expires != -1) ||
7210 7210 page_deleted(bp1->pp)) {
7211 7211 page_capture_hash[i].lists[0].next = bp1->next;
7212 7212 bp1->next->prev =
7213 7213 &page_capture_hash[i].lists[0];
7214 7214 page_capture_hash[i].num_pages[bp1->pri]--;
7215 7215
7216 7216 /*
7217 7217 * We can safely remove the PR_CAPTURE bit
7218 7218 * without holding the EXCL lock on the page
7219 7219 * as the PR_CAPTURE bit requires that the
7220 7220 * page_capture_hash[].pchh_mutex be held
7221 7221 * to modify it.
7222 7222 */
7223 7223 page_clrtoxic(bp1->pp, PR_CAPTURE);
7224 7224 mutex_exit(&page_capture_hash[i].pchh_mutex);
7225 7225 kmem_free(bp1, sizeof (*bp1));
7226 7226 mutex_enter(&page_capture_hash[i].pchh_mutex);
7227 7227 bp1 = page_capture_hash[i].lists[0].next;
7228 7228 continue;
7229 7229 }
7230 7230 pp = bp1->pp;
7231 7231 szc = bp1->szc;
7232 7232 flags = bp1->flags;
7233 7233 datap = bp1->datap;
7234 7234 mutex_exit(&page_capture_hash[i].pchh_mutex);
7235 7235 if (page_trylock(pp, SE_EXCL)) {
7236 7236 ret = page_trycapture(pp, szc,
7237 7237 flags | CAPTURE_ASYNC, datap);
7238 7238 } else {
7239 7239 ret = 1; /* move to walked hash */
7240 7240 }
7241 7241
7242 7242 if (ret != 0) {
7243 7243 /* Move to walked hash */
7244 7244 (void) page_capture_move_to_walked(pp);
7245 7245 }
7246 7246 mutex_enter(&page_capture_hash[i].pchh_mutex);
7247 7247 bp1 = page_capture_hash[i].lists[0].next;
7248 7248 }
7249 7249
7250 7250 mutex_exit(&page_capture_hash[i].pchh_mutex);
7251 7251 }
7252 7252 }
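
The list splice at the top of the walk above is easy to misread, so here is the same pointer surgery as a standalone sketch. The helper name is hypothetical; the bucket type and list layout are the ones used in this file.

/*
 * Illustration only: append every element of the circular, sentinel-
 * based list "src" to the front of "dst" and leave "src" empty, as
 * page_capture_async() does with lists[1] and lists[0].
 */
static void
example_splice_lists(page_capture_hash_bucket_t *dst,
    page_capture_hash_bucket_t *src)
{
	page_capture_hash_bucket_t *first = src->next;

	if (first == src)
		return;				/* src is already empty */

	src->prev->next = dst->next;		/* last src element -> old dst head */
	first->prev = dst;
	dst->next->prev = src->prev;
	dst->next = first;

	src->next = src;			/* reset src to empty */
	src->prev = src;
}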
7253 7253
7254 7254 /*
7255 7255 * This function is called by the page_capture_thread, and is needed
7256 7256 * in order to initiate aio cleanup, so that pages used in aio
7257 7257 * will be unlocked and subsequently retired by page_capture_thread.
7258 7258 */
7259 7259 static int
7260 7260 do_aio_cleanup(void)
7261 7261 {
7262 7262 proc_t *procp;
7263 7263 int (*aio_cleanup_dr_delete_memory)(proc_t *);
7264 7264 int cleaned = 0;
7265 7265
7266 7266 if (modload("sys", "kaio") == -1) {
7267 7267 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7268 7268 return (0);
7269 7269 }
7270 7270 /*
7271 7271 * We use the aio_cleanup_dr_delete_memory function to
7272 7272 * initiate the actual clean up; this function will wake
7273 7273 * up the per-process aio_cleanup_thread.
7274 7274 */
7275 7275 aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7276 7276 modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7277 7277 if (aio_cleanup_dr_delete_memory == NULL) {
7278 7278 cmn_err(CE_WARN,
7279 7279 "aio_cleanup_dr_delete_memory not found in kaio");
7280 7280 return (0);
7281 7281 }
7282 7282 mutex_enter(&pidlock);
7283 7283 for (procp = practive; (procp != NULL); procp = procp->p_next) {
7284 7284 mutex_enter(&procp->p_lock);
7285 7285 if (procp->p_aio != NULL) {
7286 7286 /* cleanup proc's outstanding kaio */
7287 7287 cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7288 7288 }
7289 7289 mutex_exit(&procp->p_lock);
7290 7290 }
7291 7291 mutex_exit(&pidlock);
7292 7292 return (cleaned);
7293 7293 }
7294 7294
7295 7295 /*
7296 7296 * helper function for page_capture_thread
7297 7297 */
7298 7298 static void
7299 7299 page_capture_handle_outstanding(void)
7300 7300 {
7301 7301 int ntry;
7302 7302
7303 7303 /* Reap kernel memory before attempting to capture pages */
7304 7304 kmem_reap();
7305 7305
7306 7306 if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
7307 7307 hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
7308 7308 /*
7309 7309 * Note: Purging is only done on platforms that support
7310 7310 * ISM hat_pageunload() - mainly SPARC. On x86/x64
7311 7311 * platforms ISM pages are SE_SHARED locked until destroyed.
7312 7312 */
7313 7313
7314 7314 /* disable and purge seg_pcache */
7315 7315 (void) seg_p_disable();
7316 7316 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7317 7317 if (!page_retire_pend_count())
7318 7318 break;
7319 7319 if (do_aio_cleanup()) {
7320 7320 /*
7321 7321 * allow the apps cleanup threads
7322 7322 * to run
7323 7323 */
7324 7324 delay(pc_thread_shortwait);
7325 7325 }
7326 7326 page_capture_async();
7327 7327 }
7328 7328 /* reenable seg_pcache */
7329 7329 seg_p_enable();
7330 7330
7331 7331 /* completed what can be done; return */
7332 7332 return;
7333 7333 }
7334 7334
7335 7335 /*
7336 7336 * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
7337 7337 * and then attempt to capture.
7338 7338 */
7339 7339 seg_preap();
7340 7340 page_capture_async();
7341 7341 }
7342 7342
7343 7343 /*
7344 7344 * The page_capture_thread loops forever, looking to see if there are
7345 7345 * pages still waiting to be captured.
7346 7346 */
7347 7347 static void
7348 7348 page_capture_thread(void)
7349 7349 {
7350 7350 callb_cpr_t c;
7351 7351 int i;
7352 7352 int high_pri_pages;
7353 7353 int low_pri_pages;
7354 7354 clock_t timeout;
7355 7355
7356 7356 CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7357 7357
7358 7358 mutex_enter(&pc_thread_mutex);
7359 7359 for (;;) {
7360 7360 high_pri_pages = 0;
7361 7361 low_pri_pages = 0;
7362 7362 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7363 7363 high_pri_pages +=
7364 7364 page_capture_hash[i].num_pages[PC_PRI_HI];
7365 7365 low_pri_pages +=
7366 7366 page_capture_hash[i].num_pages[PC_PRI_LO];
7367 7367 }
7368 7368
7369 7369 timeout = pc_thread_longwait;
7370 7370 if (high_pri_pages != 0) {
7371 7371 timeout = pc_thread_shortwait;
7372 7372 page_capture_handle_outstanding();
7373 7373 } else if (low_pri_pages != 0) {
7374 7374 page_capture_async();
7375 7375 }
7376 7376 CALLB_CPR_SAFE_BEGIN(&c);
7377 7377 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
7378 7378 timeout, TR_CLOCK_TICK);
7379 7379 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7380 7380 }
7381 7381 /*NOTREACHED*/
7382 7382 }
7383 7383 /*
7384 7384 * Attempt to locate a bucket that has enough pages to satisfy the request.
7385 7385 * The initial check is done without the lock to avoid unneeded contention.
7386 7386 * The function returns 1 if enough pages were found, else 0 if it could not
7387 7387 * find enough pages in a bucket.
7388 7388 */
7389 7389 static int
7390 7390 pcf_decrement_bucket(pgcnt_t npages)
7391 7391 {
7392 7392 struct pcf *p;
7393 7393 struct pcf *q;
7394 7394 int i;
7395 7395
7396 7396 p = &pcf[PCF_INDEX()];
7397 7397 q = &pcf[pcf_fanout];
7398 7398 for (i = 0; i < pcf_fanout; i++) {
7399 7399 if (p->pcf_count > npages) {
7400 7400 /*
7401 7401 * a good one to try.
7402 7402 */
7403 7403 mutex_enter(&p->pcf_lock);
7404 7404 if (p->pcf_count > npages) {
7405 7405 p->pcf_count -= (uint_t)npages;
7406 7406 /*
7407 7407 * freemem is not protected by any lock.
7408 7408 * Thus, we cannot have any assertion
7409 7409 * containing freemem here.
7410 7410 */
7411 7411 freemem -= npages;
7412 7412 mutex_exit(&p->pcf_lock);
7413 7413 return (1);
7414 7414 }
7415 7415 mutex_exit(&p->pcf_lock);
7416 7416 }
7417 7417 p++;
7418 7418 if (p >= q) {
7419 7419 p = pcf;
7420 7420 }
7421 7421 }
7422 7422 return (0);
7423 7423 }
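
The unlocked pre-check followed by a re-check under the lock is the idiom the comment above calls out. Isolated below as a sketch for a single bucket; the helper name is hypothetical, while struct pcf and its pcf_count/pcf_lock fields are the ones used in this file.

/*
 * Illustration only: optimistic check without the lock, then verify
 * and commit under the lock, as pcf_decrement_bucket() does per bucket.
 */
static int
example_try_take(struct pcf *p, pgcnt_t npages)
{
	if (p->pcf_count <= npages)		/* cheap, racy pre-check */
		return (0);

	mutex_enter(&p->pcf_lock);
	if (p->pcf_count > npages) {		/* re-check under the lock */
		p->pcf_count -= (uint_t)npages;
		mutex_exit(&p->pcf_lock);
		return (1);
	}
	mutex_exit(&p->pcf_lock);
	return (0);
}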
7424 7424
7425 7425 /*
7426 7426 * Arguments:
7427 7427 * pcftotal_ret: If the value is not NULL and we have walked all the
7428 7428 * buckets but did not find enough pages then it will
7429 7429 * be set to the total number of pages in all the pcf
7430 7430 * buckets.
7431 7431 * npages: Is the number of pages we have been requested to
7432 7432 * find.
7433 7433 * unlock: If set to 0 we will leave the buckets locked if the
7434 7434 * requested number of pages is not found.
7435 7435 *
7436 7436 * Go and try to satisfy the page request from any number of buckets.
7437 7437 * This can be a very expensive operation as we have to lock the buckets
7438 7438 * we are checking (and keep them locked), starting at bucket 0.
7439 7439 *
7440 7440 * The function returns 1 if enough pages were found, else 0 if it could not
7441 7441 * find enough pages in the buckets.
7442 7442 *
7443 7443 */
7444 7444 static int
7445 7445 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7446 7446 {
7447 7447 struct pcf *p;
7448 7448 pgcnt_t pcftotal;
7449 7449 int i;
7450 7450
7451 7451 p = pcf;
7452 7452 /* try to collect pages from several pcf bins */
7453 7453 for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7454 7454 mutex_enter(&p->pcf_lock);
7455 7455 pcftotal += p->pcf_count;
7456 7456 if (pcftotal >= npages) {
7457 7457 /*
7458 7458 * Wow! There are enough pages lying around
7459 7459 * to satisfy the request. Do the accounting,
7460 7460 * drop the locks we acquired, and go back.
7461 7461 *
7462 7462 * freemem is not protected by any lock. So,
7463 7463 * we cannot have any assertion containing
7464 7464 * freemem.
7465 7465 */
7466 7466 freemem -= npages;
7467 7467 while (p >= pcf) {
7468 7468 if (p->pcf_count <= npages) {
7469 7469 npages -= p->pcf_count;
7470 7470 p->pcf_count = 0;
7471 7471 } else {
7472 7472 p->pcf_count -= (uint_t)npages;
7473 7473 npages = 0;
7474 7474 }
7475 7475 mutex_exit(&p->pcf_lock);
7476 7476 p--;
7477 7477 }
7478 7478 ASSERT(npages == 0);
7479 7479 return (1);
7480 7480 }
7481 7481 p++;
7482 7482 }
7483 7483 if (unlock) {
7484 7484 /* failed to collect pages - release the locks */
7485 7485 while (--p >= pcf) {
7486 7486 mutex_exit(&p->pcf_lock);
7487 7487 }
7488 7488 }
7489 7489 if (pcftotal_ret != NULL)
7490 7490 *pcftotal_ret = pcftotal;
7491 7491 return (0);
7492 7492 }
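
For context, a sketch of how a hypothetical caller might combine the two helpers above: try the cheap single-bucket path first, then fall back to accumulating across all buckets, with unlock set so a failure leaves no bucket locks held. The wrapper name is invented for illustration.

/*
 * Illustration only: reserve npages from the pcf counters using the
 * fast path first and the multi-bucket path as a fallback.
 */
static int
example_reserve_pages(pgcnt_t npages)
{
	pgcnt_t pcftotal;

	if (pcf_decrement_bucket(npages))
		return (1);		/* a single bucket had enough */

	/* unlock == 1: drop all bucket locks if we come up short */
	if (pcf_decrement_multiple(&pcftotal, npages, 1))
		return (1);

	return (0);			/* not enough free pages right now */
}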
7162 lines elided