patch as-lock-macro-simplification
--- old/usr/src/uts/common/vm/vm_page.c
+++ new/usr/src/uts/common/vm/vm_page.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
24 24 */
25 25
26 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29 /*
30 30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 31 * The Regents of the University of California
32 32 * All Rights Reserved
33 33 *
34 34 * University Acknowledgment- Portions of this document are derived from
35 35 * software developed by the University of California, Berkeley, and its
36 36 * contributors.
37 37 */
38 38
39 39 /*
40 40 * VM - physical page management.
41 41 */
42 42
43 43 #include <sys/types.h>
44 44 #include <sys/t_lock.h>
45 45 #include <sys/param.h>
46 46 #include <sys/systm.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/time.h>
49 49 #include <sys/vnode.h>
50 50 #include <sys/vm.h>
51 51 #include <sys/vtrace.h>
52 52 #include <sys/swap.h>
53 53 #include <sys/cmn_err.h>
54 54 #include <sys/tuneable.h>
55 55 #include <sys/sysmacros.h>
56 56 #include <sys/cpuvar.h>
57 57 #include <sys/callb.h>
58 58 #include <sys/debug.h>
59 59 #include <sys/tnf_probe.h>
60 60 #include <sys/condvar_impl.h>
61 61 #include <sys/mem_config.h>
62 62 #include <sys/mem_cage.h>
63 63 #include <sys/kmem.h>
64 64 #include <sys/atomic.h>
65 65 #include <sys/strlog.h>
66 66 #include <sys/mman.h>
67 67 #include <sys/ontrap.h>
68 68 #include <sys/lgrp.h>
69 69 #include <sys/vfs.h>
70 70
71 71 #include <vm/hat.h>
72 72 #include <vm/anon.h>
73 73 #include <vm/page.h>
74 74 #include <vm/seg.h>
75 75 #include <vm/pvn.h>
76 76 #include <vm/seg_kmem.h>
77 77 #include <vm/vm_dep.h>
78 78 #include <sys/vm_usage.h>
79 79 #include <fs/fs_subr.h>
80 80 #include <sys/ddi.h>
81 81 #include <sys/modctl.h>
82 82
83 83 static pgcnt_t max_page_get; /* max page_get request size in pages */
84 84 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
85 85
86 86 /*
87 87 * freemem_lock protects all freemem variables:
88 88 * availrmem. This lock also protects the globals that track the
89 89 * availrmem changes for accurate kernel footprint calculation.
90 90 * See below for an explanation of these
91 91 * globals.
92 92 */
93 93 kmutex_t freemem_lock;
94 94 pgcnt_t availrmem;
95 95 pgcnt_t availrmem_initial;
96 96
97 97 /*
98 98 * These globals track availrmem changes to get a more accurate
99 99 * estimate of the kernel size. Historically pp_kernel is used for
100 100 * kernel size and is based on availrmem. But availrmem is adjusted for
101 101 * locked pages in the system not just for kernel locked pages.
102 102 * These new counters will track the pages locked through segvn and
103 103 * by explicit user locking.
104 104 *
105 105 * pages_locked : How many pages are locked because of user specified
106 106 * locking through mlock or plock.
107 107 *
108 108 * pages_useclaim,pages_claimed : These two variables track the
109 109 * claim adjustments because of the protection changes on a segvn segment.
110 110 *
111 111 * All these globals are protected by the same lock which protects availrmem.
112 112 */
113 113 pgcnt_t pages_locked = 0;
114 114 pgcnt_t pages_useclaim = 0;
115 115 pgcnt_t pages_claimed = 0;
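/*
 * Illustrative sketch (hypothetical helper, not a routine in this file):
 * per the comment above, a caller that locks user pages updates availrmem
 * and pages_locked together under freemem_lock, roughly as follows.
 */
#if 0
static void
example_account_user_lock(pgcnt_t npages)
{
	mutex_enter(&freemem_lock);
	availrmem -= npages;		/* locked pages stop being reclaimable */
	pages_locked += npages;		/* user locking via mlock()/plock() */
	mutex_exit(&freemem_lock);
}
#endif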
116 116
117 117
118 118 /*
119 119 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
120 120 */
121 121 static kmutex_t new_freemem_lock;
122 122 static uint_t freemem_wait; /* someone waiting for freemem */
123 123 static kcondvar_t freemem_cv;
124 124
125 125 /*
126 126 * The logical page free list is maintained as two lists, the 'free'
127 127 * and the 'cache' lists.
128 128 * The free list contains those pages that should be reused first.
129 129 *
130 130 * The implementation of the lists is machine dependent.
131 131 * page_get_freelist(), page_get_cachelist(),
132 132 * page_list_sub(), and page_list_add()
133 133 * form the interface to the machine dependent implementation.
134 134 *
135 135 * Pages with p_free set are on the cache list.
136 136 * Pages with p_free and p_age set are on the free list,
137 137 * Pages with p_free and p_age set are on the free list.
138 138 * A page may be locked while on either list.
139 139 */
140 140
141 141 /*
142 142 * free list accounting stuff.
143 143 *
144 144 *
145 145 * Spread out the value for the number of pages on the
146 146 * page free and page cache lists. If there is just one
147 147 * value, then it must be under just one lock.
148 148 * The lock contention and cache traffic are a real bother.
149 149 *
150 150 * When we acquire and then drop a single pcf lock
151 151 * we can start in the middle of the array of pcf structures.
152 152 * If we acquire more than one pcf lock at a time, we need to
153 153 * start at the front to avoid deadlocking.
154 154 *
155 155 * pcf_count holds the number of pages in each pool.
156 156 *
157 157 * pcf_block is set when page_create_get_something() has asked the
158 158 * PSM page freelist and page cachelist routines without specifying
159 159 * a color and nothing came back. This is used to block anything
160 160 * else from moving pages from one list to the other while the
161 161 * lists are searched again. If a page is freed while pcf_block is
162 162 * set, then pcf_reserve is incremented. pcgs_unblock() takes care
163 163 * of clearing pcf_block, doing the wakeups, etc.
164 164 */
165 165
166 166 #define MAX_PCF_FANOUT NCPU
167 167 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
168 168 static uint_t pcf_fanout_mask = 0;
169 169
170 170 struct pcf {
171 171 kmutex_t pcf_lock; /* protects the structure */
172 172 uint_t pcf_count; /* page count */
173 173 uint_t pcf_wait; /* number of waiters */
174 174 uint_t pcf_block; /* pcgs flag to page_free() */
175 175 uint_t pcf_reserve; /* pages freed after pcf_block set */
176 176 uint_t pcf_fill[10]; /* to line up on the caches */
177 177 };
178 178
179 179 /*
180 180 * PCF_INDEX hash needs to be dynamic (every so often the hash changes where
181 181 * it will hash the cpu to). This is done to prevent a drain condition
182 182 * from happening. This drain condition will occur when pcf_count decrement
183 183 * occurs on cpu A and the increment of pcf_count always occurs on cpu B. An
184 184 * example of this shows up with device interrupts. The dma buffer is allocated
185 185 * by the cpu requesting the IO thus the pcf_count is decremented based on that.
186 186 * When the memory is returned by the interrupt thread, the pcf_count will be
187 187 * incremented based on the cpu servicing the interrupt.
188 188 */
189 189 static struct pcf pcf[MAX_PCF_FANOUT];
190 190 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
191 191 (randtick() >> 24)) & (pcf_fanout_mask))
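/*
 * Illustrative sketch (assumption, not a routine defined here): a
 * single-bucket caller picks its starting pcf bucket with PCF_INDEX()
 * and adjusts pcf_count only under that bucket's lock, roughly:
 */
#if 0
	struct pcf *p = &pcf[PCF_INDEX()];

	mutex_enter(&p->pcf_lock);
	p->pcf_count++;			/* hand one page back to this bucket */
	mutex_exit(&p->pcf_lock);
#endif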
192 192
193 193 static int pcf_decrement_bucket(pgcnt_t);
194 194 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
195 195
196 196 kmutex_t pcgs_lock; /* serializes page_create_get_ */
197 197 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
198 198 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
199 199 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
200 200
201 201 #ifdef VM_STATS
202 202
203 203 /*
204 204 * No locks, but so what, they are only statistics.
205 205 */
206 206
207 207 static struct page_tcnt {
208 208 int pc_free_cache; /* free's into cache list */
209 209 int pc_free_dontneed; /* free's with dontneed */
210 210 int pc_free_pageout; /* free's from pageout */
211 211 int pc_free_free; /* free's into free list */
212 212 int pc_free_pages; /* free's into large page free list */
213 213 int pc_destroy_pages; /* large page destroy's */
214 214 int pc_get_cache; /* get's from cache list */
215 215 int pc_get_free; /* get's from free list */
216 216 int pc_reclaim; /* reclaim's */
217 217 int pc_abortfree; /* abort's of free pages */
218 218 int pc_find_hit; /* find's that find page */
219 219 int pc_find_miss; /* find's that don't find page */
220 220 int pc_destroy_free; /* # of free pages destroyed */
221 221 #define PC_HASH_CNT (4*PAGE_HASHAVELEN)
222 222 int pc_find_hashlen[PC_HASH_CNT+1];
223 223 int pc_addclaim_pages;
224 224 int pc_subclaim_pages;
225 225 int pc_free_replacement_page[2];
226 226 int pc_try_demote_pages[6];
227 227 int pc_demote_pages[2];
228 228 } pagecnt;
229 229
230 230 uint_t hashin_count;
231 231 uint_t hashin_not_held;
232 232 uint_t hashin_already;
233 233
234 234 uint_t hashout_count;
235 235 uint_t hashout_not_held;
236 236
237 237 uint_t page_create_count;
238 238 uint_t page_create_not_enough;
239 239 uint_t page_create_not_enough_again;
240 240 uint_t page_create_zero;
241 241 uint_t page_create_hashout;
242 242 uint_t page_create_page_lock_failed;
243 243 uint_t page_create_trylock_failed;
244 244 uint_t page_create_found_one;
245 245 uint_t page_create_hashin_failed;
246 246 uint_t page_create_dropped_phm;
247 247
248 248 uint_t page_create_new;
249 249 uint_t page_create_exists;
250 250 uint_t page_create_putbacks;
251 251 uint_t page_create_overshoot;
252 252
253 253 uint_t page_reclaim_zero;
254 254 uint_t page_reclaim_zero_locked;
255 255
256 256 uint_t page_rename_exists;
257 257 uint_t page_rename_count;
258 258
259 259 uint_t page_lookup_cnt[20];
260 260 uint_t page_lookup_nowait_cnt[10];
261 261 uint_t page_find_cnt;
262 262 uint_t page_exists_cnt;
263 263 uint_t page_exists_forreal_cnt;
264 264 uint_t page_lookup_dev_cnt;
265 265 uint_t get_cachelist_cnt;
266 266 uint_t page_create_cnt[10];
267 267 uint_t alloc_pages[9];
268 268 uint_t page_exphcontg[19];
269 269 uint_t page_create_large_cnt[10];
270 270
271 271 #endif
272 272
273 273 static inline page_t *
274 274 page_hash_search(ulong_t index, vnode_t *vnode, u_offset_t off)
275 275 {
276 276 uint_t mylen = 0;
277 277 page_t *page;
278 278
279 279 for (page = page_hash[index]; page; page = page->p_hash, mylen++)
280 280 if (page->p_vnode == vnode && page->p_offset == off)
281 281 break;
282 282
283 283 #ifdef VM_STATS
284 284 if (page != NULL)
285 285 pagecnt.pc_find_hit++;
286 286 else
287 287 pagecnt.pc_find_miss++;
288 288
289 289 pagecnt.pc_find_hashlen[MIN(mylen, PC_HASH_CNT)]++;
290 290 #endif
291 291
292 292 return (page);
293 293 }
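/*
 * Illustrative sketch: callers normally search under the hash bucket
 * mutex so the identity of pages on the chain cannot change during the
 * walk (page_find() below does exactly this):
 */
#if 0
	ulong_t index = PAGE_HASH_FUNC(vp, off);
	kmutex_t *phm = PAGE_HASH_MUTEX(index);

	mutex_enter(phm);
	pp = page_hash_search(index, vp, off);
	mutex_exit(phm);
#endif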
294 294
295 295
296 296 #ifdef DEBUG
297 297 #define MEMSEG_SEARCH_STATS
298 298 #endif
299 299
300 300 #ifdef MEMSEG_SEARCH_STATS
301 301 struct memseg_stats {
302 302 uint_t nsearch;
303 303 uint_t nlastwon;
304 304 uint_t nhashwon;
305 305 uint_t nnotfound;
306 306 } memseg_stats;
307 307
308 308 #define MEMSEG_STAT_INCR(v) \
309 309 atomic_inc_32(&memseg_stats.v)
310 310 #else
311 311 #define MEMSEG_STAT_INCR(x)
312 312 #endif
313 313
314 314 struct memseg *memsegs; /* list of memory segments */
315 315
316 316 /*
317 317 * /etc/system tunable to control large page allocation heuristic.
318 318 *
319 319 * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
320 320 * for large page allocation requests. If a large page is not readily
321 321 * available on the local freelists, we will go through additional effort
322 322 * to create a large page, potentially moving smaller pages around to coalesce
323 323 * larger pages in the local lgroup.
324 324 * Default value of LPAP_DEFAULT will go to remote freelists if large pages
325 325 * are not readily available in the local lgroup.
326 326 */
327 327 enum lpap {
328 328 LPAP_DEFAULT, /* default large page allocation policy */
329 329 LPAP_LOCAL /* local large page allocation policy */
330 330 };
331 331
332 332 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
333 333
334 334 static void page_init_mem_config(void);
335 335 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
336 336 static void page_do_hashout(page_t *);
337 337 static void page_capture_init();
338 338 int page_capture_take_action(page_t *, uint_t, void *);
339 339
340 340 static void page_demote_vp_pages(page_t *);
341 341
342 342
343 343 void
344 344 pcf_init(void)
345 345
346 346 {
347 347 if (boot_ncpus != -1) {
348 348 pcf_fanout = boot_ncpus;
349 349 } else {
350 350 pcf_fanout = max_ncpus;
351 351 }
352 352 #ifdef sun4v
353 353 /*
354 354 * Force at least 4 buckets if possible for sun4v.
355 355 */
356 356 pcf_fanout = MAX(pcf_fanout, 4);
357 357 #endif /* sun4v */
358 358
359 359 /*
360 360 * Round up to the nearest power of 2.
361 361 */
362 362 pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
363 363 if (!ISP2(pcf_fanout)) {
364 364 pcf_fanout = 1 << highbit(pcf_fanout);
365 365
366 366 if (pcf_fanout > MAX_PCF_FANOUT) {
367 367 pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
368 368 }
369 369 }
370 370 pcf_fanout_mask = pcf_fanout - 1;
371 371 }
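/*
 * Worked example (assuming highbit() reports the 1-based position of the
 * most significant set bit): with boot_ncpus == 6 the fanout is not a
 * power of two, so it is rounded up to 1 << highbit(6) == 8, giving
 * pcf_fanout_mask == 7.
 */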
372 372
373 373 /*
374 374 * vm subsystem related initialization
375 375 */
376 376 void
377 377 vm_init(void)
378 378 {
379 379 boolean_t callb_vm_cpr(void *, int);
380 380
381 381 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
382 382 page_init_mem_config();
383 383 page_retire_init();
384 384 vm_usage_init();
385 385 page_capture_init();
386 386 }
387 387
388 388 /*
389 389 * This function is called at startup and when memory is added or deleted.
390 390 */
391 391 void
392 392 init_pages_pp_maximum()
393 393 {
394 394 static pgcnt_t p_min;
395 395 static pgcnt_t pages_pp_maximum_startup;
396 396 static pgcnt_t avrmem_delta;
397 397 static int init_done;
398 398 static int user_set; /* true if set in /etc/system */
399 399
400 400 if (init_done == 0) {
401 401
402 402 /* If the user specified a value, save it */
403 403 if (pages_pp_maximum != 0) {
404 404 user_set = 1;
405 405 pages_pp_maximum_startup = pages_pp_maximum;
406 406 }
407 407
408 408 /*
409 409 * The initial setting of pages_pp_maximum is based
410 410 * on the value of availrmem just after the start-up
411 411 * allocations. To preserve this relationship at run
412 412 * time, use a delta from availrmem_initial.
413 413 */
414 414 ASSERT(availrmem_initial >= availrmem);
415 415 avrmem_delta = availrmem_initial - availrmem;
416 416
417 417 /* The allowable floor of pages_pp_maximum */
418 418 p_min = tune.t_minarmem + 100;
419 419
420 420 /* Make sure we don't come through here again. */
421 421 init_done = 1;
422 422 }
423 423 /*
424 424 * Determine pages_pp_maximum, the number of currently available
425 425 * pages (availrmem) that can't be `locked'. If not set by
426 426 * the user, we set it to 4% of the currently available memory
427 427 * plus 4MB.
428 428 * But we also insist that it be greater than tune.t_minarmem;
429 429 * otherwise a process could lock down a lot of memory, get swapped
430 430 * out, and never have enough to get swapped back in.
431 431 */
432 432 if (user_set)
433 433 pages_pp_maximum = pages_pp_maximum_startup;
434 434 else
435 435 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
436 436 + btop(4 * 1024 * 1024);
437 437
438 438 if (pages_pp_maximum <= p_min) {
439 439 pages_pp_maximum = p_min;
440 440 }
441 441 }
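/*
 * Worked example (illustration only; assumes 4 KB pages and ignores
 * avrmem_delta): with availrmem_initial of 2,097,152 pages (8 GB) and no
 * /etc/system override, pages_pp_maximum = 2,097,152 / 25 + btop(4 MB) =
 * 83,886 + 1,024 = 84,910 pages, i.e. roughly 332 MB that may never be
 * locked down.
 */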
442 442
443 443 void
444 444 set_max_page_get(pgcnt_t target_total_pages)
445 445 {
446 446 max_page_get = target_total_pages / 2;
447 447 }
448 448
449 449 static pgcnt_t pending_delete;
450 450
451 451 /*ARGSUSED*/
452 452 static void
453 453 page_mem_config_post_add(
454 454 void *arg,
455 455 pgcnt_t delta_pages)
456 456 {
457 457 set_max_page_get(total_pages - pending_delete);
458 458 init_pages_pp_maximum();
459 459 }
460 460
461 461 /*ARGSUSED*/
462 462 static int
463 463 page_mem_config_pre_del(
464 464 void *arg,
465 465 pgcnt_t delta_pages)
466 466 {
467 467 pgcnt_t nv;
468 468
469 469 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
470 470 set_max_page_get(total_pages - nv);
471 471 return (0);
472 472 }
473 473
474 474 /*ARGSUSED*/
475 475 static void
476 476 page_mem_config_post_del(
477 477 void *arg,
478 478 pgcnt_t delta_pages,
479 479 int cancelled)
480 480 {
481 481 pgcnt_t nv;
482 482
483 483 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
484 484 set_max_page_get(total_pages - nv);
485 485 if (!cancelled)
486 486 init_pages_pp_maximum();
487 487 }
488 488
489 489 static kphysm_setup_vector_t page_mem_config_vec = {
490 490 KPHYSM_SETUP_VECTOR_VERSION,
491 491 page_mem_config_post_add,
492 492 page_mem_config_pre_del,
493 493 page_mem_config_post_del,
494 494 };
495 495
496 496 static void
497 497 page_init_mem_config(void)
498 498 {
499 499 int ret;
500 500
501 501 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
502 502 ASSERT(ret == 0);
503 503 }
504 504
505 505 /*
506 506 * Evenly spread out the PCF counters for large free pages
507 507 */
508 508 static void
509 509 page_free_large_ctr(pgcnt_t npages)
510 510 {
511 511 static struct pcf *p = pcf;
512 512 pgcnt_t lump;
513 513
514 514 freemem += npages;
515 515
516 516 lump = roundup(npages, pcf_fanout) / pcf_fanout;
517 517
518 518 while (npages > 0) {
519 519
520 520 ASSERT(!p->pcf_block);
521 521
522 522 if (lump < npages) {
523 523 p->pcf_count += (uint_t)lump;
524 524 npages -= lump;
525 525 } else {
526 526 p->pcf_count += (uint_t)npages;
527 527 npages = 0;
528 528 }
529 529
530 530 ASSERT(!p->pcf_wait);
531 531
532 532 if (++p > &pcf[pcf_fanout - 1])
533 533 p = pcf;
534 534 }
535 535
536 536 ASSERT(npages == 0);
537 537 }
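/*
 * Worked example: with pcf_fanout == 4 and npages == 10,
 * lump = roundup(10, 4) / 4 == 3, so successive buckets receive
 * 3, 3, 3 and finally 1 page.
 */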
538 538
539 539 /*
540 540 * Add a physical chunk of memory to the system free lists during startup.
541 541 * Platform specific startup() allocates the memory for the page structs.
542 542 *
543 543 * num - number of page structures
544 544 * base - page number (pfn) to be associated with the first page.
545 545 *
546 546 * Since we are doing this during startup (ie. single threaded), we will
547 547 * use shortcut routines to avoid any locking overhead while putting all
548 548 * these pages on the freelists.
549 549 *
550 550 * NOTE: Any changes performed to page_free(), must also be performed to
551 551 * add_physmem() since this is how we initialize all page_t's at
552 552 * boot time.
553 553 */
554 554 void
555 555 add_physmem(
556 556 page_t *pp,
557 557 pgcnt_t num,
558 558 pfn_t pnum)
559 559 {
560 560 page_t *root = NULL;
561 561 uint_t szc = page_num_pagesizes() - 1;
562 562 pgcnt_t large = page_get_pagecnt(szc);
563 563 pgcnt_t cnt = 0;
564 564
565 565 TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
566 566 "add_physmem:pp %p num %lu", pp, num);
567 567
568 568 /*
569 569 * Arbitrarily limit the max page_get request
570 570 * to 1/2 of the page structs we have.
571 571 */
572 572 total_pages += num;
573 573 set_max_page_get(total_pages);
574 574
575 575 PLCNT_MODIFY_MAX(pnum, (long)num);
576 576
577 577 /*
578 578 * The physical space for the pages array
579 579 * representing ram pages has already been
580 580 * allocated. Here we initialize each lock
581 581 * in the page structure, and put each on
582 582 * the free list
583 583 */
584 584 for (; num; pp++, pnum++, num--) {
585 585
586 586 /*
587 587 * this needs to fill in the page number
588 588 * and do any other arch specific initialization
589 589 */
590 590 add_physmem_cb(pp, pnum);
591 591
592 592 pp->p_lckcnt = 0;
593 593 pp->p_cowcnt = 0;
594 594 pp->p_slckcnt = 0;
595 595
596 596 /*
597 597 * Initialize the page lock as unlocked, since nobody
598 598 * can see or access this page yet.
599 599 */
600 600 pp->p_selock = 0;
601 601
602 602 /*
603 603 * Initialize IO lock
604 604 */
605 605 page_iolock_init(pp);
606 606
607 607 /*
608 608 * initialize other fields in the page_t
609 609 */
610 610 PP_SETFREE(pp);
611 611 page_clr_all_props(pp);
612 612 PP_SETAGED(pp);
613 613 pp->p_offset = (u_offset_t)-1;
614 614 pp->p_next = pp;
615 615 pp->p_prev = pp;
616 616
617 617 /*
618 618 * Simple case: System doesn't support large pages.
619 619 */
620 620 if (szc == 0) {
621 621 pp->p_szc = 0;
622 622 page_free_at_startup(pp);
623 623 continue;
624 624 }
625 625
626 626 /*
627 627 * Handle unaligned pages, we collect them up onto
628 628 * the root page until we have a full large page.
629 629 */
630 630 if (!IS_P2ALIGNED(pnum, large)) {
631 631
632 632 /*
633 633 * If not in a large page,
634 634 * just free as small page.
635 635 */
636 636 if (root == NULL) {
637 637 pp->p_szc = 0;
638 638 page_free_at_startup(pp);
639 639 continue;
640 640 }
641 641
642 642 /*
643 643 * Link a constituent page into the large page.
644 644 */
645 645 pp->p_szc = szc;
646 646 page_list_concat(&root, &pp);
647 647
648 648 /*
649 649 * When large page is fully formed, free it.
650 650 */
651 651 if (++cnt == large) {
652 652 page_free_large_ctr(cnt);
653 653 page_list_add_pages(root, PG_LIST_ISINIT);
654 654 root = NULL;
655 655 cnt = 0;
656 656 }
657 657 continue;
658 658 }
659 659
660 660 /*
661 661 * At this point we have a page number which
662 662 * is aligned. We assert that we aren't already
663 663 * in a different large page.
664 664 */
665 665 ASSERT(IS_P2ALIGNED(pnum, large));
666 666 ASSERT(root == NULL && cnt == 0);
667 667
668 668 /*
669 669 * If insufficient number of pages left to form
670 670 * a large page, just free the small page.
671 671 */
672 672 if (num < large) {
673 673 pp->p_szc = 0;
674 674 page_free_at_startup(pp);
675 675 continue;
676 676 }
677 677
678 678 /*
679 679 * Otherwise start a new large page.
680 680 */
681 681 pp->p_szc = szc;
682 682 cnt++;
683 683 root = pp;
684 684 }
685 685 ASSERT(root == NULL && cnt == 0);
686 686 }
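/*
 * Illustration (assumes an 8 KB base page and a 4 MB maximum page size,
 * so large == 512): pfns before the first 512-page-aligned boundary are
 * freed individually as small pages; at an aligned pfn with at least 512
 * pages remaining a new root is started, the following 511 constituents
 * are concatenated onto it, and the completed large page is freed via
 * page_list_add_pages(); any short tail is again freed as small pages.
 */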
687 687
688 688 /*
689 689 * Find a page representing the specified [vp, offset].
690 690 * If we find the page but it is intransit coming in,
691 691 * it will have an "exclusive" lock and we wait for
692 692 * the i/o to complete. A page found on the free list
693 693 * is always reclaimed and then locked. On success, the page
694 694 * is locked, its data is valid and it isn't on the free
695 695 * list, while a NULL is returned if the page doesn't exist.
696 696 */
697 697 page_t *
698 698 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
699 699 {
700 700 return (page_lookup_create(vp, off, se, NULL, NULL, 0));
701 701 }
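/*
 * Typical use, sketched (error handling elided): the returned page is
 * locked, valid and off the free list, so the caller drops the lock when
 * done with it.
 */
#if 0
	page_t *pp = page_lookup(vp, off, SE_SHARED);

	if (pp != NULL) {
		/* use the page's contents */
		page_unlock(pp);
	}
#endif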
702 702
703 703 /*
704 704 * Find a page representing the specified [vp, offset].
705 705 * We either return the one we found or, if passed in,
706 706 * create one with identity of [vp, offset] of the
707 707 * pre-allocated page. If we find existing page but it is
708 708 * intransit coming in, it will have an "exclusive" lock
709 709 * and we wait for the i/o to complete. A page found on
710 710 * the free list is always reclaimed and then locked.
711 711 * On success, the page is locked, its data is valid and
712 712 * it isn't on the free list, while a NULL is returned
713 713 * if the page doesn't exist and newpp is NULL;
714 714 */
715 715 page_t *
716 716 page_lookup_create(
717 717 vnode_t *vp,
718 718 u_offset_t off,
719 719 se_t se,
720 720 page_t *newpp,
721 721 spgcnt_t *nrelocp,
722 722 int flags)
723 723 {
724 724 page_t *pp;
725 725 kmutex_t *phm;
726 726 ulong_t index;
727 727 uint_t hash_locked;
728 728 uint_t es;
729 729
730 730 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
731 731 VM_STAT_ADD(page_lookup_cnt[0]);
732 732 ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
733 733
734 734 /*
735 735 * Acquire the appropriate page hash lock since
736 736 * we have to search the hash list. Pages that
737 737 * hash to this list can't change identity while
738 738 * this lock is held.
739 739 */
740 740 hash_locked = 0;
741 741 index = PAGE_HASH_FUNC(vp, off);
742 742 phm = NULL;
743 743 top:
744 744 pp = page_hash_search(index, vp, off);
745 745 if (pp != NULL) {
746 746 VM_STAT_ADD(page_lookup_cnt[1]);
747 747 es = (newpp != NULL) ? 1 : 0;
748 748 es |= flags;
749 749 if (!hash_locked) {
750 750 VM_STAT_ADD(page_lookup_cnt[2]);
751 751 if (!page_try_reclaim_lock(pp, se, es)) {
752 752 /*
753 753 * On a miss, acquire the phm. Then
754 754 * next time, page_lock() will be called,
755 755 * causing a wait if the page is busy.
756 756 * Just looping with page_trylock() would
757 757 * get pretty boring.
758 758 */
759 759 VM_STAT_ADD(page_lookup_cnt[3]);
760 760 phm = PAGE_HASH_MUTEX(index);
761 761 mutex_enter(phm);
762 762 hash_locked = 1;
763 763 goto top;
764 764 }
765 765 } else {
766 766 VM_STAT_ADD(page_lookup_cnt[4]);
767 767 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
768 768 VM_STAT_ADD(page_lookup_cnt[5]);
769 769 goto top;
770 770 }
771 771 }
772 772
773 773 /*
774 774 * Since `pp' is locked it can not change identity now.
775 775 * Reconfirm we locked the correct page.
776 776 *
777 777 * Both the p_vnode and p_offset *must* be cast volatile
778 778 * to force a reload of their values: The page_hash_search
779 779 * function will have stuffed p_vnode and p_offset into
780 780 * registers before calling page_trylock(); another thread,
781 781 * actually holding the hash lock, could have changed the
782 782 * page's identity in memory, but our registers would not
783 783 * be changed, fooling the reconfirmation. If the hash
784 784 * lock was held during the search, the casting would
785 785 * not be needed.
786 786 */
787 787 VM_STAT_ADD(page_lookup_cnt[6]);
788 788 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
789 789 ((volatile u_offset_t)(pp->p_offset) != off)) {
790 790 VM_STAT_ADD(page_lookup_cnt[7]);
791 791 if (hash_locked) {
792 792 panic("page_lookup_create: lost page %p",
793 793 (void *)pp);
794 794 /*NOTREACHED*/
795 795 }
796 796 page_unlock(pp);
797 797 phm = PAGE_HASH_MUTEX(index);
798 798 mutex_enter(phm);
799 799 hash_locked = 1;
800 800 goto top;
801 801 }
802 802
803 803 /*
804 804 * If page_trylock() was called, then pp may still be on
805 805 * the cachelist (can't be on the free list, it would not
806 806 * have been found in the search). If it is on the
807 807 * cachelist it must be pulled now. To pull the page from
808 808 * the cachelist, it must be exclusively locked.
809 809 *
810 810 * The other big difference between page_trylock() and
811 811 * page_lock(), is that page_lock() will pull the
812 812 * page from whatever free list (the cache list in this
813 813 * case) the page is on. If page_trylock() was used
814 814 * above, then we have to do the reclaim ourselves.
815 815 */
816 816 if ((!hash_locked) && (PP_ISFREE(pp))) {
817 817 ASSERT(PP_ISAGED(pp) == 0);
818 818 VM_STAT_ADD(page_lookup_cnt[8]);
819 819
820 820 /*
821 821 * page_reclaim will ensure that we
822 822 * have this page exclusively
823 823 */
824 824
825 825 if (!page_reclaim(pp, NULL)) {
826 826 /*
827 827 * Page_reclaim dropped whatever lock
828 828 * we held.
829 829 */
830 830 VM_STAT_ADD(page_lookup_cnt[9]);
831 831 phm = PAGE_HASH_MUTEX(index);
832 832 mutex_enter(phm);
833 833 hash_locked = 1;
834 834 goto top;
835 835 } else if (se == SE_SHARED && newpp == NULL) {
836 836 VM_STAT_ADD(page_lookup_cnt[10]);
837 837 page_downgrade(pp);
838 838 }
839 839 }
840 840
841 841 if (hash_locked) {
842 842 mutex_exit(phm);
843 843 }
844 844
845 845 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
846 846 PAGE_EXCL(pp) && nrelocp != NULL) {
847 847 ASSERT(nrelocp != NULL);
848 848 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
849 849 NULL);
850 850 if (*nrelocp > 0) {
851 851 VM_STAT_COND_ADD(*nrelocp == 1,
852 852 page_lookup_cnt[11]);
853 853 VM_STAT_COND_ADD(*nrelocp > 1,
854 854 page_lookup_cnt[12]);
855 855 pp = newpp;
856 856 se = SE_EXCL;
857 857 } else {
858 858 if (se == SE_SHARED) {
859 859 page_downgrade(pp);
860 860 }
861 861 VM_STAT_ADD(page_lookup_cnt[13]);
862 862 }
863 863 } else if (newpp != NULL && nrelocp != NULL) {
864 864 if (PAGE_EXCL(pp) && se == SE_SHARED) {
865 865 page_downgrade(pp);
866 866 }
867 867 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
868 868 page_lookup_cnt[14]);
869 869 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
870 870 page_lookup_cnt[15]);
871 871 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
872 872 page_lookup_cnt[16]);
873 873 } else if (newpp != NULL && PAGE_EXCL(pp)) {
874 874 se = SE_EXCL;
875 875 }
876 876 } else if (!hash_locked) {
877 877 VM_STAT_ADD(page_lookup_cnt[17]);
878 878 phm = PAGE_HASH_MUTEX(index);
879 879 mutex_enter(phm);
880 880 hash_locked = 1;
881 881 goto top;
882 882 } else if (newpp != NULL) {
883 883 /*
884 884 * If we have a preallocated page then
885 885 * insert it now and basically behave like
886 886 * page_create.
887 887 */
888 888 VM_STAT_ADD(page_lookup_cnt[18]);
889 889 /*
890 890 * Since we hold the page hash mutex and
891 891 * just searched for this page, page_hashin
892 892 * had better not fail. If it does, that
893 893 * means some thread did not follow the
894 894 * page hash mutex rules. Panic now and
895 895 * get it over with. As usual, go down
896 896 * holding all the locks.
897 897 */
898 898 ASSERT(MUTEX_HELD(phm));
899 899 if (!page_hashin(newpp, vp, off, phm)) {
900 900 ASSERT(MUTEX_HELD(phm));
901 901 panic("page_lookup_create: hashin failed %p %p %llx %p",
902 902 (void *)newpp, (void *)vp, off, (void *)phm);
903 903 /*NOTREACHED*/
904 904 }
905 905 ASSERT(MUTEX_HELD(phm));
906 906 mutex_exit(phm);
907 907 phm = NULL;
908 908 page_set_props(newpp, P_REF);
909 909 page_io_lock(newpp);
910 910 pp = newpp;
911 911 se = SE_EXCL;
912 912 } else {
913 913 VM_STAT_ADD(page_lookup_cnt[19]);
914 914 mutex_exit(phm);
915 915 }
916 916
917 917 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
918 918
919 919 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
920 920
921 921 return (pp);
922 922 }
923 923
924 924 /*
925 925 * Search the hash list for the page representing the
926 926 * specified [vp, offset] and return it locked. Skip
927 927 * free pages and pages that cannot be locked as requested.
928 928 * Used while attempting to kluster pages.
929 929 */
930 930 page_t *
931 931 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
932 932 {
933 933 page_t *pp;
934 934 kmutex_t *phm;
935 935 ulong_t index;
936 936 uint_t locked;
937 937
938 938 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
939 939 VM_STAT_ADD(page_lookup_nowait_cnt[0]);
940 940
941 941 index = PAGE_HASH_FUNC(vp, off);
942 942 pp = page_hash_search(index, vp, off);
943 943 locked = 0;
944 944 if (pp == NULL) {
945 945 top:
946 946 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
947 947 locked = 1;
948 948 phm = PAGE_HASH_MUTEX(index);
949 949 mutex_enter(phm);
950 950 pp = page_hash_search(index, vp, off);
951 951 }
952 952
953 953 if (pp == NULL || PP_ISFREE(pp)) {
954 954 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
955 955 pp = NULL;
956 956 } else {
957 957 if (!page_trylock(pp, se)) {
958 958 VM_STAT_ADD(page_lookup_nowait_cnt[3]);
959 959 pp = NULL;
960 960 } else {
961 961 VM_STAT_ADD(page_lookup_nowait_cnt[4]);
962 962 /*
963 963 * See the comment in page_lookup()
964 964 */
965 965 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
966 966 ((u_offset_t)(pp->p_offset) != off)) {
967 967 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
968 968 if (locked) {
969 969 panic("page_lookup_nowait %p",
970 970 (void *)pp);
971 971 /*NOTREACHED*/
972 972 }
973 973 page_unlock(pp);
974 974 goto top;
975 975 }
976 976 if (PP_ISFREE(pp)) {
977 977 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
978 978 page_unlock(pp);
979 979 pp = NULL;
980 980 }
981 981 }
982 982 }
983 983 if (locked) {
984 984 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
985 985 mutex_exit(phm);
986 986 }
987 987
988 988 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
989 989
990 990 return (pp);
991 991 }
992 992
993 993 /*
994 994 * Search the hash list for a page with the specified [vp, off]
995 995 * that is known to exist and is already locked. This routine
996 996 * is typically used by segment SOFTUNLOCK routines.
997 997 */
998 998 page_t *
999 999 page_find(vnode_t *vp, u_offset_t off)
1000 1000 {
1001 1001 page_t *pp;
1002 1002 kmutex_t *phm;
1003 1003 ulong_t index;
1004 1004
1005 1005 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1006 1006 VM_STAT_ADD(page_find_cnt);
1007 1007
1008 1008 index = PAGE_HASH_FUNC(vp, off);
1009 1009 phm = PAGE_HASH_MUTEX(index);
1010 1010
1011 1011 mutex_enter(phm);
1012 1012 pp = page_hash_search(index, vp, off);
1013 1013 mutex_exit(phm);
1014 1014
1015 1015 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1016 1016 return (pp);
1017 1017 }
1018 1018
1019 1019 /*
1020 1020 * Determine whether a page with the specified [vp, off]
1021 1021 * currently exists in the system. Obviously this should
1022 1022 * only be considered as a hint since nothing prevents the
1023 1023 * page from disappearing or appearing immediately after
1024 1024 * the return from this routine. Subsequently, we don't
1025 1025 * even bother to lock the list.
1026 1026 */
1027 1027 page_t *
1028 1028 page_exists(vnode_t *vp, u_offset_t off)
1029 1029 {
1030 1030 ulong_t index;
1031 1031
1032 1032 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1033 1033 VM_STAT_ADD(page_exists_cnt);
1034 1034
1035 1035 index = PAGE_HASH_FUNC(vp, off);
1036 1036
1037 1037 return (page_hash_search(index, vp, off));
1038 1038 }
1039 1039
1040 1040 /*
1041 1041 * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
1042 1042 * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array
1043 1043 * with these pages locked SHARED. If necessary reclaim pages from
1044 1044 * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1045 1045 *
1046 1046 * If we fail to lock pages still return 1 if pages exist and contiguous.
1047 1047 * But in this case return value is just a hint. ppa array won't be filled.
1048 1048 * Caller should initialize ppa[0] as NULL to distinguish return value.
1049 1049 *
1050 1050 * Returns 0 if pages don't exist or not physically contiguous.
1051 1051 *
1052 1052 * This routine doesn't work for anonymous(swapfs) pages.
1053 1053 */
1054 1054 int
1055 1055 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1056 1056 {
1057 1057 pgcnt_t pages;
1058 1058 pfn_t pfn;
1059 1059 page_t *rootpp;
1060 1060 pgcnt_t i;
1061 1061 pgcnt_t j;
1062 1062 u_offset_t save_off = off;
1063 1063 ulong_t index;
1064 1064 kmutex_t *phm;
1065 1065 page_t *pp;
1066 1066 uint_t pszc;
1067 1067 int loopcnt = 0;
1068 1068
1069 1069 ASSERT(szc != 0);
1070 1070 ASSERT(vp != NULL);
1071 1071 ASSERT(!IS_SWAPFSVP(vp));
1072 1072 ASSERT(!VN_ISKAS(vp));
1073 1073
1074 1074 again:
1075 1075 if (++loopcnt > 3) {
1076 1076 VM_STAT_ADD(page_exphcontg[0]);
1077 1077 return (0);
1078 1078 }
1079 1079
1080 1080 index = PAGE_HASH_FUNC(vp, off);
1081 1081 phm = PAGE_HASH_MUTEX(index);
1082 1082
1083 1083 mutex_enter(phm);
1084 1084 pp = page_hash_search(index, vp, off);
1085 1085 mutex_exit(phm);
1086 1086
1087 1087 VM_STAT_ADD(page_exphcontg[1]);
1088 1088
1089 1089 if (pp == NULL) {
1090 1090 VM_STAT_ADD(page_exphcontg[2]);
1091 1091 return (0);
1092 1092 }
1093 1093
1094 1094 pages = page_get_pagecnt(szc);
1095 1095 rootpp = pp;
1096 1096 pfn = rootpp->p_pagenum;
1097 1097
1098 1098 if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1099 1099 VM_STAT_ADD(page_exphcontg[3]);
1100 1100 if (!page_trylock(pp, SE_SHARED)) {
1101 1101 VM_STAT_ADD(page_exphcontg[4]);
1102 1102 return (1);
1103 1103 }
1104 1104 /*
1105 1105 * Also check whether p_pagenum was modified by DR.
1106 1106 */
1107 1107 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1108 1108 pp->p_offset != off || pp->p_pagenum != pfn) {
1109 1109 VM_STAT_ADD(page_exphcontg[5]);
1110 1110 page_unlock(pp);
1111 1111 off = save_off;
1112 1112 goto again;
1113 1113 }
1114 1114 /*
1115 1115 * Since szc was non-zero and the vnode and offset matched
1116 1116 * after we locked the page, it can't become free on us.
1117 1117 */
1118 1118 ASSERT(!PP_ISFREE(pp));
1119 1119 if (!IS_P2ALIGNED(pfn, pages)) {
1120 1120 page_unlock(pp);
1121 1121 return (0);
1122 1122 }
1123 1123 ppa[0] = pp;
1124 1124 pp++;
1125 1125 off += PAGESIZE;
1126 1126 pfn++;
1127 1127 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1128 1128 if (!page_trylock(pp, SE_SHARED)) {
1129 1129 VM_STAT_ADD(page_exphcontg[6]);
1130 1130 pp--;
1131 1131 while (i-- > 0) {
1132 1132 page_unlock(pp);
1133 1133 pp--;
1134 1134 }
1135 1135 ppa[0] = NULL;
1136 1136 return (1);
1137 1137 }
1138 1138 if (pp->p_szc != pszc) {
1139 1139 VM_STAT_ADD(page_exphcontg[7]);
1140 1140 page_unlock(pp);
1141 1141 pp--;
1142 1142 while (i-- > 0) {
1143 1143 page_unlock(pp);
1144 1144 pp--;
1145 1145 }
1146 1146 ppa[0] = NULL;
1147 1147 off = save_off;
1148 1148 goto again;
1149 1149 }
1150 1150 /*
1151 1151 * The szc is the same as for the previously locked pages
1152 1152 * with the right identity. Since this page had the correct
1153 1153 * szc after we locked it, it can't get freed or destroyed
1154 1154 * and therefore must have the expected identity.
1155 1155 */
1156 1156 ASSERT(!PP_ISFREE(pp));
1157 1157 if (pp->p_vnode != vp ||
1158 1158 pp->p_offset != off) {
1159 1159 panic("page_exists_physcontig: "
1160 1160 "large page identity doesn't match");
1161 1161 }
1162 1162 ppa[i] = pp;
1163 1163 ASSERT(pp->p_pagenum == pfn);
1164 1164 }
1165 1165 VM_STAT_ADD(page_exphcontg[8]);
1166 1166 ppa[pages] = NULL;
1167 1167 return (1);
1168 1168 } else if (pszc >= szc) {
1169 1169 VM_STAT_ADD(page_exphcontg[9]);
1170 1170 if (!IS_P2ALIGNED(pfn, pages)) {
1171 1171 return (0);
1172 1172 }
1173 1173 return (1);
1174 1174 }
1175 1175
1176 1176 if (!IS_P2ALIGNED(pfn, pages)) {
1177 1177 VM_STAT_ADD(page_exphcontg[10]);
1178 1178 return (0);
1179 1179 }
1180 1180
1181 1181 if (page_numtomemseg_nolock(pfn) !=
1182 1182 page_numtomemseg_nolock(pfn + pages - 1)) {
1183 1183 VM_STAT_ADD(page_exphcontg[11]);
1184 1184 return (0);
1185 1185 }
1186 1186
1187 1187 /*
1188 1188 * We loop up 4 times across pages to promote page size.
1189 1189 * We're extra cautious to promote page size atomically with respect
1190 1190 * to everybody else. But we can probably optimize into 1 loop if
1191 1191 * this becomes an issue.
1192 1192 */
1193 1193
1194 1194 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1195 1195 if (!page_trylock(pp, SE_EXCL)) {
1196 1196 VM_STAT_ADD(page_exphcontg[12]);
1197 1197 break;
1198 1198 }
1199 1199 /*
1200 1200 * Check whether p_pagenum was modified by DR.
1201 1201 */
1202 1202 if (pp->p_pagenum != pfn) {
1203 1203 page_unlock(pp);
1204 1204 break;
1205 1205 }
1206 1206 if (pp->p_vnode != vp ||
1207 1207 pp->p_offset != off) {
1208 1208 VM_STAT_ADD(page_exphcontg[13]);
1209 1209 page_unlock(pp);
1210 1210 break;
1211 1211 }
1212 1212 if (pp->p_szc >= szc) {
1213 1213 ASSERT(i == 0);
1214 1214 page_unlock(pp);
1215 1215 off = save_off;
1216 1216 goto again;
1217 1217 }
1218 1218 }
1219 1219
1220 1220 if (i != pages) {
1221 1221 VM_STAT_ADD(page_exphcontg[14]);
1222 1222 --pp;
1223 1223 while (i-- > 0) {
1224 1224 page_unlock(pp);
1225 1225 --pp;
1226 1226 }
1227 1227 return (0);
1228 1228 }
1229 1229
1230 1230 pp = rootpp;
1231 1231 for (i = 0; i < pages; i++, pp++) {
1232 1232 if (PP_ISFREE(pp)) {
1233 1233 VM_STAT_ADD(page_exphcontg[15]);
1234 1234 ASSERT(!PP_ISAGED(pp));
1235 1235 ASSERT(pp->p_szc == 0);
1236 1236 if (!page_reclaim(pp, NULL)) {
1237 1237 break;
1238 1238 }
1239 1239 } else {
1240 1240 ASSERT(pp->p_szc < szc);
1241 1241 VM_STAT_ADD(page_exphcontg[16]);
1242 1242 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1243 1243 }
1244 1244 }
1245 1245 if (i < pages) {
1246 1246 VM_STAT_ADD(page_exphcontg[17]);
1247 1247 /*
1248 1248 * page_reclaim failed because we were out of memory.
1249 1249 * Drop the rest of the locks and return because this page
1250 1250 * must already be reallocated anyway.
1251 1251 */
1252 1252 pp = rootpp;
1253 1253 for (j = 0; j < pages; j++, pp++) {
1254 1254 if (j != i) {
1255 1255 page_unlock(pp);
1256 1256 }
1257 1257 }
1258 1258 return (0);
1259 1259 }
1260 1260
1261 1261 off = save_off;
1262 1262 pp = rootpp;
1263 1263 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1264 1264 ASSERT(PAGE_EXCL(pp));
1265 1265 ASSERT(!PP_ISFREE(pp));
1266 1266 ASSERT(!hat_page_is_mapped(pp));
1267 1267 ASSERT(pp->p_vnode == vp);
1268 1268 ASSERT(pp->p_offset == off);
1269 1269 pp->p_szc = szc;
1270 1270 }
1271 1271 pp = rootpp;
1272 1272 for (i = 0; i < pages; i++, pp++) {
1273 1273 if (ppa == NULL) {
1274 1274 page_unlock(pp);
1275 1275 } else {
1276 1276 ppa[i] = pp;
1277 1277 page_downgrade(ppa[i]);
1278 1278 }
1279 1279 }
1280 1280 if (ppa != NULL) {
1281 1281 ppa[pages] = NULL;
1282 1282 }
1283 1283 VM_STAT_ADD(page_exphcontg[18]);
1284 1284 ASSERT(vp->v_pages != NULL);
1285 1285 return (1);
1286 1286 }
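/*
 * Caller sketch (hypothetical; ppa sized to page_get_pagecnt(szc) + 1
 * entries, allocation elided): initialize ppa[0] to NULL so the two
 * "exists" outcomes can be told apart, as the comment above describes.
 */
#if 0
	ppa[0] = NULL;
	if (page_exists_physcontig(vp, off, szc, ppa)) {
		if (ppa[0] != NULL) {
			/* contiguous pages found and locked SE_SHARED */
		} else {
			/* pages exist but could not be locked; hint only */
		}
	}
#endif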
1287 1287
1288 1288 /*
1289 1289 * Determine whether a page with the specified [vp, off]
1290 1290 * currently exists in the system and if so return its
1291 1291 * size code. Obviously this should only be considered as
1292 1292 * a hint since nothing prevents the page from disappearing
1293 1293 * or appearing immediately after the return from this routine.
1294 1294 */
1295 1295 int
1296 1296 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1297 1297 {
1298 1298 page_t *pp;
1299 1299 kmutex_t *phm;
1300 1300 ulong_t index;
1301 1301 int rc = 0;
1302 1302
1303 1303 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1304 1304 ASSERT(szc != NULL);
1305 1305 VM_STAT_ADD(page_exists_forreal_cnt);
1306 1306
1307 1307 index = PAGE_HASH_FUNC(vp, off);
1308 1308 phm = PAGE_HASH_MUTEX(index);
1309 1309
1310 1310 mutex_enter(phm);
1311 1311 pp = page_hash_search(index, vp, off);
1312 1312 if (pp != NULL) {
1313 1313 *szc = pp->p_szc;
1314 1314 rc = 1;
1315 1315 }
1316 1316 mutex_exit(phm);
1317 1317 return (rc);
1318 1318 }
1319 1319
1320 1320 /* wakeup threads waiting for pages in page_create_get_something() */
1321 1321 void
1322 1322 wakeup_pcgs(void)
1323 1323 {
1324 1324 if (!CV_HAS_WAITERS(&pcgs_cv))
1325 1325 return;
1326 1326 cv_broadcast(&pcgs_cv);
1327 1327 }
1328 1328
1329 1329 /*
1330 1330 * 'freemem' is used all over the kernel as an indication of how many
1331 1331 * pages are free (either on the cache list or on the free page list)
1332 1332 * in the system. In very few places is a really accurate 'freemem'
1333 1333 * needed. To avoid contention of the lock protecting the
1334 1334 * single freemem, it was spread out into NCPU buckets. Set_freemem
1335 1335 * sets freemem to the total of all NCPU buckets. It is called from
1336 1336 * clock() on each TICK.
1337 1337 */
1338 1338 void
1339 1339 set_freemem()
1340 1340 {
1341 1341 struct pcf *p;
1342 1342 ulong_t t;
1343 1343 uint_t i;
1344 1344
1345 1345 t = 0;
1346 1346 p = pcf;
1347 1347 for (i = 0; i < pcf_fanout; i++) {
1348 1348 t += p->pcf_count;
1349 1349 p++;
1350 1350 }
1351 1351 freemem = t;
1352 1352
1353 1353 /*
1354 1354 * Don't worry about grabbing mutex. It's not that
1355 1355 * critical if we miss a tick or two. This is
1356 1356 * where we wakeup possible delayers in
1357 1357 * page_create_get_something().
1358 1358 */
1359 1359 wakeup_pcgs();
1360 1360 }
1361 1361
1362 1362 ulong_t
1363 1363 get_freemem()
1364 1364 {
1365 1365 struct pcf *p;
1366 1366 ulong_t t;
1367 1367 uint_t i;
1368 1368
1369 1369 t = 0;
1370 1370 p = pcf;
1371 1371 for (i = 0; i < pcf_fanout; i++) {
1372 1372 t += p->pcf_count;
1373 1373 p++;
1374 1374 }
1375 1375 /*
1376 1376 * We just calculated it, might as well set it.
1377 1377 */
1378 1378 freemem = t;
1379 1379 return (t);
1380 1380 }
1381 1381
1382 1382 /*
1383 1383 * Acquire all of the page cache & free (pcf) locks.
1384 1384 */
1385 1385 void
1386 1386 pcf_acquire_all()
1387 1387 {
1388 1388 struct pcf *p;
1389 1389 uint_t i;
1390 1390
1391 1391 p = pcf;
1392 1392 for (i = 0; i < pcf_fanout; i++) {
1393 1393 mutex_enter(&p->pcf_lock);
1394 1394 p++;
1395 1395 }
1396 1396 }
1397 1397
1398 1398 /*
1399 1399 * Release all the pcf_locks.
1400 1400 */
1401 1401 void
1402 1402 pcf_release_all()
1403 1403 {
1404 1404 struct pcf *p;
1405 1405 uint_t i;
1406 1406
1407 1407 p = pcf;
1408 1408 for (i = 0; i < pcf_fanout; i++) {
1409 1409 mutex_exit(&p->pcf_lock);
1410 1410 p++;
1411 1411 }
1412 1412 }
1413 1413
1414 1414 /*
1415 1415 * Inform the VM system that we need some pages freed up.
1416 1416 * Calls must be symmetric, e.g.:
1417 1417 *
1418 1418 * page_needfree(100);
1419 1419 * wait a bit;
1420 1420 * page_needfree(-100);
1421 1421 */
1422 1422 void
1423 1423 page_needfree(spgcnt_t npages)
1424 1424 {
1425 1425 mutex_enter(&new_freemem_lock);
1426 1426 needfree += npages;
1427 1427 mutex_exit(&new_freemem_lock);
1428 1428 }
1429 1429
1430 1430 /*
1431 1431 * Throttle for page_create(): try to prevent freemem from dropping
1432 1432 * below throttlefree. We can't provide a 100% guarantee because
1433 1433 * KM_NOSLEEP allocations, page_reclaim(), and various other things
1434 1434 * nibble away at the freelist. However, we can block all PG_WAIT
1435 1435 * allocations until memory becomes available. The motivation is
1436 1436 * that several things can fall apart when there's no free memory:
1437 1437 *
1438 1438 * (1) If pageout() needs memory to push a page, the system deadlocks.
1439 1439 *
1440 1440 * (2) By (broken) specification, timeout(9F) can neither fail nor
1441 1441 * block, so it has no choice but to panic the system if it
1442 1442 * cannot allocate a callout structure.
1443 1443 *
1444 1444 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1445 1445 * it panics if it cannot allocate a callback structure.
1446 1446 *
1447 1447 * (4) Untold numbers of third-party drivers have not yet been hardened
1448 1448 * against KM_NOSLEEP and/or allocb() failures; they simply assume
1449 1449 * success and panic the system with a data fault on failure.
1450 1450 * (The long-term solution to this particular problem is to ship
1451 1451 * hostile fault-injecting DEBUG kernels with the DDK.)
1452 1452 *
1453 1453 * It is theoretically impossible to guarantee success of non-blocking
1454 1454 * allocations, but in practice, this throttle is very hard to break.
1455 1455 */
1456 1456 static int
1457 1457 page_create_throttle(pgcnt_t npages, int flags)
1458 1458 {
1459 1459 ulong_t fm;
1460 1460 uint_t i;
1461 1461 pgcnt_t tf; /* effective value of throttlefree */
1462 1462
1463 1463 /*
1464 1464 * Normal priority allocations.
1465 1465 */
1466 1466 if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1467 1467 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1468 1468 return (freemem >= npages + throttlefree);
1469 1469 }
1470 1470
1471 1471 /*
1472 1472 * Never deny pages when:
1473 1473 * - it's a thread that cannot block [NOMEMWAIT()]
1474 1474 * - the allocation cannot block and must not fail
1475 1475 * - the allocation cannot block and is pageout dispensated
1476 1476 */
1477 1477 if (NOMEMWAIT() ||
1478 1478 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1479 1479 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1480 1480 return (1);
1481 1481
1482 1482 /*
1483 1483 * If the allocation can't block, we look favorably upon it
1484 1484 * unless we're below pageout_reserve. In that case we fail
1485 1485 * the allocation because we want to make sure there are a few
1486 1486 * pages available for pageout.
1487 1487 */
1488 1488 if ((flags & PG_WAIT) == 0)
1489 1489 return (freemem >= npages + pageout_reserve);
1490 1490
1491 1491 /* Calculate the effective throttlefree value */
1492 1492 tf = throttlefree -
1493 1493 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1494 1494
1495 1495 cv_signal(&proc_pageout->p_cv);
1496 1496
1497 1497 for (;;) {
1498 1498 fm = 0;
1499 1499 pcf_acquire_all();
1500 1500 mutex_enter(&new_freemem_lock);
1501 1501 for (i = 0; i < pcf_fanout; i++) {
1502 1502 fm += pcf[i].pcf_count;
1503 1503 pcf[i].pcf_wait++;
1504 1504 mutex_exit(&pcf[i].pcf_lock);
1505 1505 }
1506 1506 freemem = fm;
1507 1507 if (freemem >= npages + tf) {
1508 1508 mutex_exit(&new_freemem_lock);
1509 1509 break;
1510 1510 }
1511 1511 needfree += npages;
1512 1512 freemem_wait++;
1513 1513 cv_wait(&freemem_cv, &new_freemem_lock);
1514 1514 freemem_wait--;
1515 1515 needfree -= npages;
1516 1516 mutex_exit(&new_freemem_lock);
1517 1517 }
1518 1518 return (1);
1519 1519 }
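/*
 * For illustration: a plain PG_WAIT request sleeps until
 * freemem >= npages + throttlefree, while PG_WAIT | PG_PUSHPAGE uses
 * tf = throttlefree - pageout_reserve and may therefore dip into the
 * pageout reserve instead of deadlocking the pageout path.
 */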
1520 1520
1521 1521 /*
1522 1522 * page_create_wait() is called to either coalesce pages from the
1523 1523 * different pcf buckets or to wait because there simply are not
1524 1524 * enough pages to satisfy the caller's request.
1525 1525 *
1526 1526 * Sadly, this is called from platform/vm/vm_machdep.c
1527 1527 */
1528 1528 int
1529 1529 page_create_wait(pgcnt_t npages, uint_t flags)
1530 1530 {
1531 1531 pgcnt_t total;
1532 1532 uint_t i;
1533 1533 struct pcf *p;
1534 1534
1535 1535 /*
1536 1536 * Wait until there are enough free pages to satisfy our
1537 1537 * entire request.
1538 1538 * We set needfree += npages before prodding pageout, to make sure
1539 1539 * it does real work when npages > lotsfree > freemem.
1540 1540 */
1541 1541 VM_STAT_ADD(page_create_not_enough);
1542 1542
1543 1543 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1544 1544 checkagain:
1545 1545 if ((flags & PG_NORELOC) &&
1546 1546 kcage_freemem < kcage_throttlefree + npages)
1547 1547 (void) kcage_create_throttle(npages, flags);
1548 1548
1549 1549 if (freemem < npages + throttlefree)
1550 1550 if (!page_create_throttle(npages, flags))
1551 1551 return (0);
1552 1552
1553 1553 if (pcf_decrement_bucket(npages) ||
1554 1554 pcf_decrement_multiple(&total, npages, 0))
1555 1555 return (1);
1556 1556
1557 1557 /*
1558 1558 * All of the pcf locks are held, there are not enough pages
1559 1559 * to satisfy the request (npages < total).
1560 1560 * Be sure to acquire the new_freemem_lock before dropping
1561 1561 * the pcf locks. This prevents dropping wakeups in page_free().
1562 1562 * The order is always pcf_lock then new_freemem_lock.
1563 1563 *
1564 1564 * Since we hold all the pcf locks, it is a good time to set freemem.
1565 1565 *
1566 1566 * If the caller does not want to wait, return now.
1567 1567 * Else turn the pageout daemon loose to find something
1568 1568 * and wait till it does.
1569 1569 *
1570 1570 */
1571 1571 freemem = total;
1572 1572
1573 1573 if ((flags & PG_WAIT) == 0) {
1574 1574 pcf_release_all();
1575 1575
1576 1576 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1577 1577 "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1578 1578 return (0);
1579 1579 }
1580 1580
1581 1581 ASSERT(proc_pageout != NULL);
1582 1582 cv_signal(&proc_pageout->p_cv);
1583 1583
1584 1584 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1585 1585 "page_create_sleep_start: freemem %ld needfree %ld",
1586 1586 freemem, needfree);
1587 1587
1588 1588 /*
1589 1589 * We are going to wait.
1590 1590 * We currently hold all of the pcf_locks,
1591 1591 * get the new_freemem_lock (it protects freemem_wait),
1592 1592 * before dropping the pcf_locks.
1593 1593 */
1594 1594 mutex_enter(&new_freemem_lock);
1595 1595
1596 1596 p = pcf;
1597 1597 for (i = 0; i < pcf_fanout; i++) {
1598 1598 p->pcf_wait++;
1599 1599 mutex_exit(&p->pcf_lock);
1600 1600 p++;
1601 1601 }
1602 1602
1603 1603 needfree += npages;
1604 1604 freemem_wait++;
1605 1605
1606 1606 cv_wait(&freemem_cv, &new_freemem_lock);
1607 1607
1608 1608 freemem_wait--;
1609 1609 needfree -= npages;
1610 1610
1611 1611 mutex_exit(&new_freemem_lock);
1612 1612
1613 1613 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1614 1614 "page_create_sleep_end: freemem %ld needfree %ld",
1615 1615 freemem, needfree);
1616 1616
1617 1617 VM_STAT_ADD(page_create_not_enough_again);
1618 1618 goto checkagain;
1619 1619 }
1620 1620 /*
1621 1621 * A routine to do the opposite of page_create_wait().
1622 1622 */
1623 1623 void
1624 1624 page_create_putback(spgcnt_t npages)
1625 1625 {
1626 1626 struct pcf *p;
1627 1627 pgcnt_t lump;
1628 1628 uint_t *which;
1629 1629
1630 1630 /*
1631 1631 * When a contiguous lump is broken up, we have to
1632 1632 * deal with lots of pages (min 64), so let's spread
1633 1633 * the wealth around.
1634 1634 */
1635 1635 lump = roundup(npages, pcf_fanout) / pcf_fanout;
1636 1636 freemem += npages;
1637 1637
1638 1638 for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1639 1639 which = &p->pcf_count;
1640 1640
1641 1641 mutex_enter(&p->pcf_lock);
1642 1642
1643 1643 if (p->pcf_block) {
1644 1644 which = &p->pcf_reserve;
1645 1645 }
1646 1646
1647 1647 if (lump < npages) {
1648 1648 *which += (uint_t)lump;
1649 1649 npages -= lump;
1650 1650 } else {
1651 1651 *which += (uint_t)npages;
1652 1652 npages = 0;
1653 1653 }
1654 1654
1655 1655 if (p->pcf_wait) {
1656 1656 mutex_enter(&new_freemem_lock);
1657 1657 /*
1658 1658 * Check to see if some other thread
1659 1659 * is actually waiting. Another bucket
1660 1660 * may have woken it up by now. If there
1661 1661 * are no waiters, then set our pcf_wait
1662 1662 * count to zero to avoid coming in here
1663 1663 * next time.
1664 1664 */
1665 1665 if (freemem_wait) {
1666 1666 if (npages > 1) {
1667 1667 cv_broadcast(&freemem_cv);
1668 1668 } else {
1669 1669 cv_signal(&freemem_cv);
1670 1670 }
1671 1671 p->pcf_wait--;
1672 1672 } else {
1673 1673 p->pcf_wait = 0;
1674 1674 }
1675 1675 mutex_exit(&new_freemem_lock);
1676 1676 }
1677 1677 mutex_exit(&p->pcf_lock);
1678 1678 }
1679 1679 ASSERT(npages == 0);
1680 1680 }
1681 1681
1682 1682 /*
1683 1683 * A helper routine for page_create_get_something.
1684 1684 * The indenting got too deep down there.
1685 1685 * Unblock the pcf counters. Any pages freed after
1686 1686 * pcf_block got set are moved to pcf_count and
1687 1687 * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1688 1688 */
1689 1689 static void
1690 1690 pcgs_unblock(void)
1691 1691 {
1692 1692 int i;
1693 1693 struct pcf *p;
1694 1694
1695 1695 /* Update freemem while we're here. */
1696 1696 freemem = 0;
1697 1697 p = pcf;
1698 1698 for (i = 0; i < pcf_fanout; i++) {
1699 1699 mutex_enter(&p->pcf_lock);
1700 1700 ASSERT(p->pcf_count == 0);
1701 1701 p->pcf_count = p->pcf_reserve;
1702 1702 p->pcf_block = 0;
1703 1703 freemem += p->pcf_count;
1704 1704 if (p->pcf_wait) {
1705 1705 mutex_enter(&new_freemem_lock);
1706 1706 if (freemem_wait) {
1707 1707 if (p->pcf_reserve > 1) {
1708 1708 cv_broadcast(&freemem_cv);
1709 1709 p->pcf_wait = 0;
1710 1710 } else {
1711 1711 cv_signal(&freemem_cv);
1712 1712 p->pcf_wait--;
1713 1713 }
1714 1714 } else {
1715 1715 p->pcf_wait = 0;
1716 1716 }
1717 1717 mutex_exit(&new_freemem_lock);
1718 1718 }
1719 1719 p->pcf_reserve = 0;
1720 1720 mutex_exit(&p->pcf_lock);
1721 1721 p++;
1722 1722 }
1723 1723 }
1724 1724
1725 1725 /*
1726 1726 * Called from page_create_va() when both the cache and free lists
1727 1727 * have been checked once.
1728 1728 *
1729 1729 * Either returns a page or panics since the accounting was done
1730 1730 * way before we got here.
1731 1731 *
1732 1732 * We don't come here often, so leave the accounting on permanently.
1733 1733 */
1734 1734
1735 1735 #define MAX_PCGS 100
1736 1736
1737 1737 #ifdef DEBUG
1738 1738 #define PCGS_TRIES 100
1739 1739 #else /* DEBUG */
1740 1740 #define PCGS_TRIES 10
1741 1741 #endif /* DEBUG */
1742 1742
1743 1743 #ifdef VM_STATS
1744 1744 uint_t pcgs_counts[PCGS_TRIES];
1745 1745 uint_t pcgs_too_many;
1746 1746 uint_t pcgs_entered;
1747 1747 uint_t pcgs_entered_noreloc;
1748 1748 uint_t pcgs_locked;
1749 1749 uint_t pcgs_cagelocked;
1750 1750 #endif /* VM_STATS */
1751 1751
1752 1752 static page_t *
1753 1753 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1754 1754 caddr_t vaddr, uint_t flags)
1755 1755 {
1756 1756 uint_t count;
1757 1757 page_t *pp;
1758 1758 uint_t locked, i;
1759 1759 struct pcf *p;
1760 1760 lgrp_t *lgrp;
1761 1761 int cagelocked = 0;
1762 1762
1763 1763 VM_STAT_ADD(pcgs_entered);
1764 1764
1765 1765 /*
1766 1766 * Tap any reserve freelists: if we fail now, we'll die
1767 1767 * since the page(s) we're looking for have already been
1768 1768 * accounted for.
1769 1769 */
1770 1770 flags |= PG_PANIC;
1771 1771
1772 1772 if ((flags & PG_NORELOC) != 0) {
1773 1773 VM_STAT_ADD(pcgs_entered_noreloc);
1774 1774 /*
1775 1775 * Requests for free pages from critical threads
1776 1776 * such as pageout still won't throttle here, but
1777 1777 * we must try again, to give the cageout thread
1778 1778 * another chance to catch up. Since we already
1779 1779 * accounted for the pages, we had better get them
1780 1780 * this time.
1781 1781 *
1782 1782 * N.B. All non-critical threads acquire the pcgs_cagelock
1783 1783 * to serialize access to the freelists. This implements a
1784 1784 		 * turnstile-type synchronization to avoid starvation of
1785 1785 * critical requests for PG_NORELOC memory by non-critical
1786 1786 * threads: all non-critical threads must acquire a 'ticket'
1787 1787 * before passing through, which entails making sure
1788 1788 * kcage_freemem won't fall below minfree prior to grabbing
1789 1789 * pages from the freelists.
1790 1790 */
1791 1791 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1792 1792 mutex_enter(&pcgs_cagelock);
1793 1793 cagelocked = 1;
1794 1794 VM_STAT_ADD(pcgs_cagelocked);
1795 1795 }
1796 1796 }
1797 1797
1798 1798 /*
1799 1799 * Time to get serious.
1800 1800 * We failed to get a `correctly colored' page from both the
1801 1801 * free and cache lists.
1802 1802 	 * We escalate in stages.
1803 1803 *
1804 1804 	 * First try both lists without worrying about color.
1805 1805 *
1806 1806 * Then, grab all page accounting locks (ie. pcf[]) and
1807 1807 * steal any pages that they have and set the pcf_block flag to
1808 1808 * stop deletions from the lists. This will help because
1809 1809 * a page can get added to the free list while we are looking
1810 1810 * at the cache list, then another page could be added to the cache
1811 1811 * list allowing the page on the free list to be removed as we
1812 1812 * move from looking at the cache list to the free list. This
1813 1813 * could happen over and over. We would never find the page
1814 1814 * we have accounted for.
1815 1815 *
1816 1816 * Noreloc pages are a subset of the global (relocatable) page pool.
1817 1817 * They are not tracked separately in the pcf bins, so it is
1818 1818 * impossible to know when doing pcf accounting if the available
1819 1819 * page(s) are noreloc pages or not. When looking for a noreloc page
1820 1820 * it is quite easy to end up here even if the global (relocatable)
1821 1821 * page pool has plenty of free pages but the noreloc pool is empty.
1822 1822 *
1823 1823 * When the noreloc pool is empty (or low), additional noreloc pages
1824 1824 * are created by converting pages from the global page pool. This
1825 1825 * process will stall during pcf accounting if the pcf bins are
1826 1826 * already locked. Such is the case when a noreloc allocation is
1827 1827 * looping here in page_create_get_something waiting for more noreloc
1828 1828 * pages to appear.
1829 1829 *
1830 1830 * Short of adding a new field to the pcf bins to accurately track
1831 1831 * the number of free noreloc pages, we instead do not grab the
1832 1832 * pcgs_lock, do not set the pcf blocks and do not timeout when
1833 1833 * allocating a noreloc page. This allows noreloc allocations to
1834 1834 * loop without blocking global page pool allocations.
1835 1835 *
1836 1836 * NOTE: the behaviour of page_create_get_something has not changed
1837 1837 * for the case of global page pool allocations.
1838 1838 */
1839 1839
1840 1840 flags &= ~PG_MATCH_COLOR;
1841 1841 locked = 0;
1842 1842 #if defined(__i386) || defined(__amd64)
1843 1843 flags = page_create_update_flags_x86(flags);
1844 1844 #endif
1845 1845
1846 1846 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1847 1847
1848 1848 for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1849 1849 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1850 1850 flags, lgrp);
1851 1851 if (pp == NULL) {
1852 1852 pp = page_get_cachelist(vp, off, seg, vaddr,
1853 1853 flags, lgrp);
1854 1854 }
1855 1855 if (pp == NULL) {
1856 1856 /*
1857 1857 * Serialize. Don't fight with other pcgs().
1858 1858 */
1859 1859 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1860 1860 mutex_enter(&pcgs_lock);
1861 1861 VM_STAT_ADD(pcgs_locked);
1862 1862 locked = 1;
1863 1863 p = pcf;
1864 1864 for (i = 0; i < pcf_fanout; i++) {
1865 1865 mutex_enter(&p->pcf_lock);
1866 1866 ASSERT(p->pcf_block == 0);
1867 1867 p->pcf_block = 1;
1868 1868 p->pcf_reserve = p->pcf_count;
1869 1869 p->pcf_count = 0;
1870 1870 mutex_exit(&p->pcf_lock);
1871 1871 p++;
1872 1872 }
1873 1873 freemem = 0;
1874 1874 }
1875 1875
1876 1876 if (count) {
1877 1877 /*
1878 1878 * Since page_free() puts pages on
1879 1879 * a list then accounts for it, we
1880 1880 * just have to wait for page_free()
1881 1881 * to unlock any page it was working
1882 1882 * with. The page_lock()-page_reclaim()
1883 1883 * path falls in the same boat.
1884 1884 *
1885 1885 * We don't need to check on the
1886 1886 * PG_WAIT flag, we have already
1887 1887 * accounted for the page we are
1888 1888 * looking for in page_create_va().
1889 1889 *
1890 1890 * We just wait a moment to let any
1891 1891 * locked pages on the lists free up,
1892 1892 * then continue around and try again.
1893 1893 *
1894 1894 * Will be awakened by set_freemem().
1895 1895 */
1896 1896 mutex_enter(&pcgs_wait_lock);
1897 1897 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1898 1898 mutex_exit(&pcgs_wait_lock);
1899 1899 }
1900 1900 } else {
1901 1901 #ifdef VM_STATS
1902 1902 if (count >= PCGS_TRIES) {
1903 1903 VM_STAT_ADD(pcgs_too_many);
1904 1904 } else {
1905 1905 VM_STAT_ADD(pcgs_counts[count]);
1906 1906 }
1907 1907 #endif
1908 1908 if (locked) {
1909 1909 pcgs_unblock();
1910 1910 mutex_exit(&pcgs_lock);
1911 1911 }
1912 1912 if (cagelocked)
1913 1913 mutex_exit(&pcgs_cagelock);
1914 1914 return (pp);
1915 1915 }
1916 1916 }
1917 1917 /*
1918 1918 * we go down holding the pcf locks.
1919 1919 */
1920 1920 panic("no %spage found %d",
1921 1921 ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1922 1922 /*NOTREACHED*/
1923 1923 }
1924 1924
1925 1925 /*
1926 1926 * Create enough pages for "bytes" worth of data starting at
1927 1927 * "off" in "vp".
1928 1928 *
1929 1929 * Where flag must be one of:
1930 1930 *
1931 1931 * PG_EXCL: Exclusive create (fail if any page already
1932 1932 * exists in the page cache) which does not
1933 1933 * wait for memory to become available.
1934 1934 *
1935 1935 * PG_WAIT: Non-exclusive create which can wait for
1936 1936 * memory to become available.
1937 1937 *
1938 1938 * PG_PHYSCONTIG: Allocate physically contiguous pages.
1939 1939 * (Not Supported)
1940 1940 *
1941 1941 * A doubly linked list of pages is returned to the caller. Each page
1942 1942 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1943 1943 * lock.
1944 1944 *
1945 1945 * Unable to change the parameters to page_create() in a minor release,
1946 1946 * we renamed page_create() to page_create_va(), changed all known calls
1947 1947 * from page_create() to page_create_va(), and created this wrapper.
1948 1948 *
1949 1949 * Upon a major release, we should break compatibility by deleting this
1950 1950 * wrapper, and replacing all the strings "page_create_va", with "page_create".
1951 1951 *
1952 1952 * NOTE: There is a copy of this interface as page_create_io() in
1953 1953 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1954 1954 * there.
1955 1955 */
1956 1956 page_t *
1957 1957 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1958 1958 {
1959 1959 caddr_t random_vaddr;
1960 1960 struct seg kseg;
1961 1961
1962 1962 #ifdef DEBUG
1963 1963 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1964 1964 (void *)caller());
1965 1965 #endif
1966 1966
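	/*
	 * Derive a pseudo-random virtual address from the vnode pointer and
	 * offset so that page_create_va() can still make reasonable color
	 * and lgroup choices for this legacy caller.
	 */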
1967 1967 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1968 1968 (uintptr_t)(off >> PAGESHIFT));
1969 1969 kseg.s_as = &kas;
1970 1970
1971 1971 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1972 1972 }
1973 1973
1974 1974 #ifdef DEBUG
1975 1975 uint32_t pg_alloc_pgs_mtbf = 0;
1976 1976 #endif
1977 1977
1978 1978 /*
1979 1979 * Used for large page support. It will attempt to allocate
1980 1980 * a large page(s) off the freelist.
1981 1981 *
1982 1982 * Returns non zero on failure.
1983 1983 */
1984 1984 int
1985 1985 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1986 1986 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
1987 1987 {
1988 1988 pgcnt_t npgs, curnpgs, totpgs;
1989 1989 size_t pgsz;
1990 1990 page_t *pplist = NULL, *pp;
1991 1991 int err = 0;
1992 1992 lgrp_t *lgrp;
1993 1993
1994 1994 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
1995 1995 ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
1996 1996
1997 1997 /*
1998 1998 * Check if system heavily prefers local large pages over remote
1999 1999 * on systems with multiple lgroups.
2000 2000 */
2001 2001 if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2002 2002 pgflags = PG_LOCAL;
2003 2003 }
2004 2004
2005 2005 VM_STAT_ADD(alloc_pages[0]);
2006 2006
2007 2007 #ifdef DEBUG
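	/*
	 * Optionally inject allocation failures (roughly one call in
	 * pg_alloc_pgs_mtbf) so the large page error paths get exercised.
	 */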
2008 2008 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2009 2009 return (ENOMEM);
2010 2010 }
2011 2011 #endif
2012 2012
2013 2013 /*
2014 2014 	 * Exactly one of basepp and ppa must be non-NULL:
2015 2015 	 * one must be NULL, but not both.
2016 2016 */
2017 2017 ASSERT(basepp != NULL || ppa != NULL);
2018 2018 ASSERT(basepp == NULL || ppa == NULL);
2019 2019
2020 2020 #if defined(__i386) || defined(__amd64)
2021 2021 while (page_chk_freelist(szc) == 0) {
2022 2022 VM_STAT_ADD(alloc_pages[8]);
2023 2023 if (anypgsz == 0 || --szc == 0)
2024 2024 return (ENOMEM);
2025 2025 }
2026 2026 #endif
2027 2027
2028 2028 pgsz = page_get_pagesize(szc);
2029 2029 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2030 2030
2031 2031 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2032 2032
2033 2033 (void) page_create_wait(npgs, PG_WAIT);
2034 2034
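	/*
	 * Allocate the large page one chunk at a time, dropping to the next
	 * smaller page size whenever the current size cannot be satisfied
	 * and anypgsz allows it.
	 */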
2035 2035 while (npgs && szc) {
2036 2036 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2037 2037 if (pgflags == PG_LOCAL) {
2038 2038 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2039 2039 pgflags, lgrp);
2040 2040 if (pp == NULL) {
2041 2041 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2042 2042 0, lgrp);
2043 2043 }
2044 2044 } else {
2045 2045 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2046 2046 0, lgrp);
2047 2047 }
2048 2048 if (pp != NULL) {
2049 2049 VM_STAT_ADD(alloc_pages[1]);
2050 2050 page_list_concat(&pplist, &pp);
2051 2051 ASSERT(npgs >= curnpgs);
2052 2052 npgs -= curnpgs;
2053 2053 } else if (anypgsz) {
2054 2054 VM_STAT_ADD(alloc_pages[2]);
2055 2055 szc--;
2056 2056 pgsz = page_get_pagesize(szc);
2057 2057 curnpgs = pgsz >> PAGESHIFT;
2058 2058 } else {
2059 2059 VM_STAT_ADD(alloc_pages[3]);
2060 2060 ASSERT(npgs == totpgs);
2061 2061 page_create_putback(npgs);
2062 2062 return (ENOMEM);
2063 2063 }
2064 2064 }
2065 2065 if (szc == 0) {
2066 2066 VM_STAT_ADD(alloc_pages[4]);
2067 2067 ASSERT(npgs != 0);
2068 2068 page_create_putback(npgs);
2069 2069 err = ENOMEM;
2070 2070 } else if (basepp != NULL) {
2071 2071 ASSERT(npgs == 0);
2072 2072 ASSERT(ppa == NULL);
2073 2073 *basepp = pplist;
2074 2074 }
2075 2075
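	/* npgs now counts the pages actually taken off the freelists. */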
2076 2076 npgs = totpgs - npgs;
2077 2077 pp = pplist;
2078 2078
2079 2079 /*
2080 2080 * Clear the free and age bits. Also if we were passed in a ppa then
2081 2081 * fill it in with all the constituent pages from the large page. But
2082 2082 * if we failed to allocate all the pages just free what we got.
2083 2083 */
2084 2084 while (npgs != 0) {
2085 2085 ASSERT(PP_ISFREE(pp));
2086 2086 ASSERT(PP_ISAGED(pp));
2087 2087 if (ppa != NULL || err != 0) {
2088 2088 if (err == 0) {
2089 2089 VM_STAT_ADD(alloc_pages[5]);
2090 2090 PP_CLRFREE(pp);
2091 2091 PP_CLRAGED(pp);
2092 2092 page_sub(&pplist, pp);
2093 2093 *ppa++ = pp;
2094 2094 npgs--;
2095 2095 } else {
2096 2096 VM_STAT_ADD(alloc_pages[6]);
2097 2097 ASSERT(pp->p_szc != 0);
2098 2098 curnpgs = page_get_pagecnt(pp->p_szc);
2099 2099 page_list_break(&pp, &pplist, curnpgs);
2100 2100 page_list_add_pages(pp, 0);
2101 2101 page_create_putback(curnpgs);
2102 2102 ASSERT(npgs >= curnpgs);
2103 2103 npgs -= curnpgs;
2104 2104 }
2105 2105 pp = pplist;
2106 2106 } else {
2107 2107 VM_STAT_ADD(alloc_pages[7]);
2108 2108 PP_CLRFREE(pp);
2109 2109 PP_CLRAGED(pp);
2110 2110 pp = pp->p_next;
2111 2111 npgs--;
2112 2112 }
2113 2113 }
2114 2114 return (err);
2115 2115 }
2116 2116
2117 2117 /*
2118 2118 * Get a single large page off of the freelists, and set it up for use.
2119 2119 * Number of bytes requested must be a supported page size.
2120 2120 *
2121 2121 * Note that this call may fail even if there is sufficient
2122 2122 * memory available or PG_WAIT is set, so the caller must
2123 2123  * be willing to fall back on page_create_va(), block and retry,
2124 2124 * or fail the requester.
2125 2125 */
2126 2126 page_t *
2127 2127 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2128 2128 struct seg *seg, caddr_t vaddr, void *arg)
2129 2129 {
2130 2130 pgcnt_t npages;
2131 2131 page_t *pp;
2132 2132 page_t *rootpp;
2133 2133 lgrp_t *lgrp;
2134 2134 lgrp_id_t *lgrpid = (lgrp_id_t *)arg;
2135 2135
2136 2136 ASSERT(vp != NULL);
2137 2137
2138 2138 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2139 2139 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2140 2140 /* but no others */
2141 2141
2142 2142 ASSERT((flags & PG_EXCL) == PG_EXCL);
2143 2143
2144 2144 npages = btop(bytes);
2145 2145
2146 2146 if (!kcage_on || panicstr) {
2147 2147 /*
2148 2148 * Cage is OFF, or we are single threaded in
2149 2149 * panic, so make everything a RELOC request.
2150 2150 */
2151 2151 flags &= ~PG_NORELOC;
2152 2152 }
2153 2153
2154 2154 /*
2155 2155 * Make sure there's adequate physical memory available.
2156 2156 * Note: PG_WAIT is ignored here.
2157 2157 */
2158 2158 if (freemem <= throttlefree + npages) {
2159 2159 VM_STAT_ADD(page_create_large_cnt[1]);
2160 2160 return (NULL);
2161 2161 }
2162 2162
2163 2163 /*
2164 2164 * If cage is on, dampen draw from cage when available
2165 2165 * cage space is low.
2166 2166 */
2167 2167 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) &&
2168 2168 kcage_freemem < kcage_throttlefree + npages) {
2169 2169
2170 2170 /*
2171 2171 * The cage is on, the caller wants PG_NORELOC
2172 2172 * pages and available cage memory is very low.
2173 2173 * Call kcage_create_throttle() to attempt to
2174 2174 * control demand on the cage.
2175 2175 */
2176 2176 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2177 2177 VM_STAT_ADD(page_create_large_cnt[2]);
2178 2178 return (NULL);
2179 2179 }
2180 2180 }
2181 2181
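	/*
	 * Account for the pages in the pcf counters: try a single bucket
	 * first, then fall back to taking from multiple buckets.
	 */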
2182 2182 if (!pcf_decrement_bucket(npages) &&
2183 2183 !pcf_decrement_multiple(NULL, npages, 1)) {
2184 2184 VM_STAT_ADD(page_create_large_cnt[4]);
2185 2185 return (NULL);
2186 2186 }
2187 2187
2188 2188 /*
2189 2189 * This is where this function behaves fundamentally differently
2190 2190 * than page_create_va(); since we're intending to map the page
2191 2191 * with a single TTE, we have to get it as a physically contiguous
2192 2192 * hardware pagesize chunk. If we can't, we fail.
2193 2193 */
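	/*
	 * Honor the caller's lgroup hint when it names a valid lgroup;
	 * otherwise let the lgroup framework choose one.
	 */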
2194 2194 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2195 2195 LGRP_EXISTS(lgrp_table[*lgrpid]))
2196 2196 lgrp = lgrp_table[*lgrpid];
2197 2197 else
2198 2198 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2199 2199
2200 2200 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2201 2201 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2202 2202 page_create_putback(npages);
2203 2203 VM_STAT_ADD(page_create_large_cnt[5]);
2204 2204 return (NULL);
2205 2205 }
2206 2206
2207 2207 /*
2208 2208 	 * If we got the page with the wrong mtype, give it back; this is a
2209 2209 	 * workaround for CR 6249718. Once CR 6249718 is fixed we will never
2210 2210 	 * get inside the "if" and the workaround becomes just a nop.
2211 2211 */
2212 2212 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2213 2213 page_list_add_pages(rootpp, 0);
2214 2214 page_create_putback(npages);
2215 2215 VM_STAT_ADD(page_create_large_cnt[6]);
2216 2216 return (NULL);
2217 2217 }
2218 2218
2219 2219 /*
2220 2220 * If satisfying this request has left us with too little
2221 2221 * memory, start the wheels turning to get some back. The
2222 2222 * first clause of the test prevents waking up the pageout
2223 2223 * daemon in situations where it would decide that there's
2224 2224 * nothing to do.
2225 2225 */
2226 2226 if (nscan < desscan && freemem < minfree) {
2227 2227 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2228 2228 "pageout_cv_signal:freemem %ld", freemem);
2229 2229 cv_signal(&proc_pageout->p_cv);
2230 2230 }
2231 2231
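	/*
	 * Set up each constituent page: clear the free/aged bits, hash it
	 * in under [vp, off], and take its i/o lock.
	 */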
2232 2232 pp = rootpp;
2233 2233 while (npages--) {
2234 2234 ASSERT(PAGE_EXCL(pp));
2235 2235 ASSERT(pp->p_vnode == NULL);
2236 2236 ASSERT(!hat_page_is_mapped(pp));
2237 2237 PP_CLRFREE(pp);
2238 2238 PP_CLRAGED(pp);
2239 2239 if (!page_hashin(pp, vp, off, NULL))
2240 2240 panic("page_create_large: hashin failed: page %p",
2241 2241 (void *)pp);
2242 2242 page_io_lock(pp);
2243 2243 off += PAGESIZE;
2244 2244 pp = pp->p_next;
2245 2245 }
2246 2246
2247 2247 VM_STAT_ADD(page_create_large_cnt[0]);
2248 2248 return (rootpp);
2249 2249 }
2250 2250
2251 2251 page_t *
2252 2252 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2253 2253 struct seg *seg, caddr_t vaddr)
2254 2254 {
2255 2255 page_t *plist = NULL;
2256 2256 pgcnt_t npages;
2257 2257 pgcnt_t found_on_free = 0;
2258 2258 pgcnt_t pages_req;
2259 2259 page_t *npp = NULL;
2260 2260 struct pcf *p;
2261 2261 lgrp_t *lgrp;
2262 2262
2263 2263 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2264 2264 "page_create_start:vp %p off %llx bytes %lu flags %x",
2265 2265 vp, off, bytes, flags);
2266 2266
2267 2267 ASSERT(bytes != 0 && vp != NULL);
2268 2268
2269 2269 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2270 2270 panic("page_create: invalid flags");
2271 2271 /*NOTREACHED*/
2272 2272 }
2273 2273 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2274 2274 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2275 2275 /* but no others */
2276 2276
2277 2277 pages_req = npages = btopr(bytes);
2278 2278 /*
2279 2279 * Try to see whether request is too large to *ever* be
2280 2280 * satisfied, in order to prevent deadlock. We arbitrarily
2281 2281 * decide to limit maximum size requests to max_page_get.
2282 2282 */
2283 2283 if (npages >= max_page_get) {
2284 2284 if ((flags & PG_WAIT) == 0) {
2285 2285 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2286 2286 "page_create_toobig:vp %p off %llx npages "
2287 2287 "%lu max_page_get %lu",
2288 2288 vp, off, npages, max_page_get);
2289 2289 return (NULL);
2290 2290 } else {
2291 2291 cmn_err(CE_WARN,
2292 2292 "Request for too much kernel memory "
2293 2293 "(%lu bytes), will hang forever", bytes);
2294 2294 for (;;)
2295 2295 delay(1000000000);
2296 2296 }
2297 2297 }
2298 2298
2299 2299 if (!kcage_on || panicstr) {
2300 2300 /*
2301 2301 * Cage is OFF, or we are single threaded in
2302 2302 * panic, so make everything a RELOC request.
2303 2303 */
2304 2304 flags &= ~PG_NORELOC;
2305 2305 }
2306 2306
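	/*
	 * Throttle the request if free memory is at or below the throttle
	 * threshold.
	 */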
2307 2307 if (freemem <= throttlefree + npages)
2308 2308 if (!page_create_throttle(npages, flags))
2309 2309 return (NULL);
2310 2310
2311 2311 /*
2312 2312 * If cage is on, dampen draw from cage when available
2313 2313 * cage space is low.
2314 2314 */
2315 2315 if ((flags & PG_NORELOC) &&
2316 2316 kcage_freemem < kcage_throttlefree + npages) {
2317 2317
2318 2318 /*
2319 2319 * The cage is on, the caller wants PG_NORELOC
2320 2320 * pages and available cage memory is very low.
2321 2321 * Call kcage_create_throttle() to attempt to
2322 2322 * control demand on the cage.
2323 2323 */
2324 2324 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2325 2325 return (NULL);
2326 2326 }
2327 2327
2328 2328 VM_STAT_ADD(page_create_cnt[0]);
2329 2329
2330 2330 if (!pcf_decrement_bucket(npages)) {
2331 2331 /*
2332 2332 * Have to look harder. If npages is greater than
2333 2333 * one, then we might have to coalesce the counters.
2334 2334 *
2335 2335 * Go wait. We come back having accounted
2336 2336 * for the memory.
2337 2337 */
2338 2338 VM_STAT_ADD(page_create_cnt[1]);
2339 2339 if (!page_create_wait(npages, flags)) {
2340 2340 VM_STAT_ADD(page_create_cnt[2]);
2341 2341 return (NULL);
2342 2342 }
2343 2343 }
2344 2344
2345 2345 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2346 2346 "page_create_success:vp %p off %llx", vp, off);
2347 2347
2348 2348 /*
2349 2349 * If satisfying this request has left us with too little
2350 2350 * memory, start the wheels turning to get some back. The
2351 2351 * first clause of the test prevents waking up the pageout
2352 2352 * daemon in situations where it would decide that there's
2353 2353 * nothing to do.
2354 2354 */
2355 2355 if (nscan < desscan && freemem < minfree) {
2356 2356 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2357 2357 "pageout_cv_signal:freemem %ld", freemem);
2358 2358 cv_signal(&proc_pageout->p_cv);
2359 2359 }
2360 2360
2361 2361 /*
2362 2362 * Loop around collecting the requested number of pages.
2363 2363 * Most of the time, we have to `create' a new page. With
2364 2364 * this in mind, pull the page off the free list before
2365 2365 * getting the hash lock. This will minimize the hash
2366 2366 * lock hold time, nesting, and the like. If it turns
2367 2367 * out we don't need the page, we put it back at the end.
2368 2368 */
2369 2369 while (npages--) {
2370 2370 page_t *pp;
2371 2371 kmutex_t *phm = NULL;
2372 2372 ulong_t index;
2373 2373
2374 2374 index = PAGE_HASH_FUNC(vp, off);
2375 2375 top:
2376 2376 ASSERT(phm == NULL);
2377 2377 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2378 2378 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2379 2379
2380 2380 if (npp == NULL) {
2381 2381 /*
2382 2382 * Try to get a page from the freelist (ie,
2383 2383 * a page with no [vp, off] tag). If that
2384 2384 * fails, use the cachelist.
2385 2385 *
2386 2386 * During the first attempt at both the free
2387 2387 * and cache lists we try for the correct color.
2388 2388 */
2389 2389 /*
2390 2390 		 * XXXX - how do we deal with virtually indexed
2391 2391 		 * caches and colors?
2392 2392 */
2393 2393 VM_STAT_ADD(page_create_cnt[4]);
2394 2394 /*
2395 2395 * Get lgroup to allocate next page of shared memory
2396 2396 * from and use it to specify where to allocate
2397 2397 * the physical memory
2398 2398 */
2399 2399 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2400 2400 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2401 2401 flags | PG_MATCH_COLOR, lgrp);
2402 2402 if (npp == NULL) {
2403 2403 npp = page_get_cachelist(vp, off, seg,
2404 2404 vaddr, flags | PG_MATCH_COLOR, lgrp);
2405 2405 if (npp == NULL) {
2406 2406 npp = page_create_get_something(vp,
2407 2407 off, seg, vaddr,
2408 2408 flags & ~PG_MATCH_COLOR);
2409 2409 }
2410 2410
2411 2411 if (PP_ISAGED(npp) == 0) {
2412 2412 /*
2413 2413 * Since this page came from the
2414 2414 * cachelist, we must destroy the
2415 2415 * old vnode association.
2416 2416 */
2417 2417 page_hashout(npp, NULL);
2418 2418 }
2419 2419 }
2420 2420 }
2421 2421
2422 2422 /*
2423 2423 * We own this page!
2424 2424 */
2425 2425 ASSERT(PAGE_EXCL(npp));
2426 2426 ASSERT(npp->p_vnode == NULL);
2427 2427 ASSERT(!hat_page_is_mapped(npp));
2428 2428 PP_CLRFREE(npp);
2429 2429 PP_CLRAGED(npp);
2430 2430
2431 2431 /*
2432 2432 		 * Here we have a page in our hot little mitts and are
2433 2433 * just waiting to stuff it on the appropriate lists.
2434 2434 * Get the mutex and check to see if it really does
2435 2435 * not exist.
2436 2436 */
2437 2437 phm = PAGE_HASH_MUTEX(index);
2438 2438 mutex_enter(phm);
2439 2439 pp = page_hash_search(index, vp, off);
2440 2440 if (pp == NULL) {
2441 2441 VM_STAT_ADD(page_create_new);
2442 2442 pp = npp;
2443 2443 npp = NULL;
2444 2444 if (!page_hashin(pp, vp, off, phm)) {
2445 2445 /*
2446 2446 * Since we hold the page hash mutex and
2447 2447 * just searched for this page, page_hashin
2448 2448 * had better not fail. If it does, that
2449 2449 			 * means some thread did not follow the
2450 2450 * page hash mutex rules. Panic now and
2451 2451 * get it over with. As usual, go down
2452 2452 * holding all the locks.
2453 2453 */
2454 2454 ASSERT(MUTEX_HELD(phm));
2455 2455 panic("page_create: "
2456 2456 "hashin failed %p %p %llx %p",
2457 2457 (void *)pp, (void *)vp, off, (void *)phm);
2458 2458 /*NOTREACHED*/
2459 2459 }
2460 2460 ASSERT(MUTEX_HELD(phm));
2461 2461 mutex_exit(phm);
2462 2462 phm = NULL;
2463 2463
2464 2464 /*
2465 2465 * Hat layer locking need not be done to set
2466 2466 * the following bits since the page is not hashed
2467 2467 * and was on the free list (i.e., had no mappings).
2468 2468 *
2469 2469 * Set the reference bit to protect
2470 2470 * against immediate pageout
2471 2471 *
2472 2472 * XXXmh modify freelist code to set reference
2473 2473 * bit so we don't have to do it here.
2474 2474 */
2475 2475 page_set_props(pp, P_REF);
2476 2476 found_on_free++;
2477 2477 } else {
2478 2478 VM_STAT_ADD(page_create_exists);
2479 2479 if (flags & PG_EXCL) {
2480 2480 /*
2481 2481 * Found an existing page, and the caller
2482 2482 * wanted all new pages. Undo all of the work
2483 2483 * we have done.
2484 2484 */
2485 2485 mutex_exit(phm);
2486 2486 phm = NULL;
2487 2487 while (plist != NULL) {
2488 2488 pp = plist;
2489 2489 page_sub(&plist, pp);
2490 2490 page_io_unlock(pp);
2491 2491 /* large pages should not end up here */
2492 2492 ASSERT(pp->p_szc == 0);
2493 2493 /*LINTED: constant in conditional ctx*/
2494 2494 VN_DISPOSE(pp, B_INVAL, 0, kcred);
2495 2495 }
2496 2496 VM_STAT_ADD(page_create_found_one);
2497 2497 goto fail;
2498 2498 }
2499 2499 ASSERT(flags & PG_WAIT);
2500 2500 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2501 2501 /*
2502 2502 * Start all over again if we blocked trying
2503 2503 * to lock the page.
2504 2504 */
2505 2505 mutex_exit(phm);
2506 2506 VM_STAT_ADD(page_create_page_lock_failed);
2507 2507 phm = NULL;
2508 2508 goto top;
2509 2509 }
2510 2510 mutex_exit(phm);
2511 2511 phm = NULL;
2512 2512
2513 2513 if (PP_ISFREE(pp)) {
2514 2514 ASSERT(PP_ISAGED(pp) == 0);
2515 2515 VM_STAT_ADD(pagecnt.pc_get_cache);
2516 2516 page_list_sub(pp, PG_CACHE_LIST);
2517 2517 PP_CLRFREE(pp);
2518 2518 found_on_free++;
2519 2519 }
2520 2520 }
2521 2521
2522 2522 /*
2523 2523 * Got a page! It is locked. Acquire the i/o
2524 2524 * lock since we are going to use the p_next and
2525 2525 * p_prev fields to link the requested pages together.
2526 2526 */
2527 2527 page_io_lock(pp);
2528 2528 page_add(&plist, pp);
2529 2529 plist = plist->p_next;
2530 2530 off += PAGESIZE;
2531 2531 vaddr += PAGESIZE;
2532 2532 }
2533 2533
2534 2534 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2535 2535 fail:
2536 2536 if (npp != NULL) {
2537 2537 /*
2538 2538 * Did not need this page after all.
2539 2539 * Put it back on the free list.
2540 2540 */
2541 2541 VM_STAT_ADD(page_create_putbacks);
2542 2542 PP_SETFREE(npp);
2543 2543 PP_SETAGED(npp);
2544 2544 npp->p_offset = (u_offset_t)-1;
2545 2545 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2546 2546 page_unlock(npp);
2547 2547
2548 2548 }
2549 2549
2550 2550 ASSERT(pages_req >= found_on_free);
2551 2551
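	/*
	 * Return any pcf accounting we did not use (pages requested but not
	 * taken from the free or cache lists) to a pcf bucket.
	 */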
2552 2552 {
2553 2553 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2554 2554
2555 2555 if (overshoot) {
2556 2556 VM_STAT_ADD(page_create_overshoot);
2557 2557 p = &pcf[PCF_INDEX()];
2558 2558 mutex_enter(&p->pcf_lock);
2559 2559 if (p->pcf_block) {
2560 2560 p->pcf_reserve += overshoot;
2561 2561 } else {
2562 2562 p->pcf_count += overshoot;
2563 2563 if (p->pcf_wait) {
2564 2564 mutex_enter(&new_freemem_lock);
2565 2565 if (freemem_wait) {
2566 2566 cv_signal(&freemem_cv);
2567 2567 p->pcf_wait--;
2568 2568 } else {
2569 2569 p->pcf_wait = 0;
2570 2570 }
2571 2571 mutex_exit(&new_freemem_lock);
2572 2572 }
2573 2573 }
2574 2574 mutex_exit(&p->pcf_lock);
2575 2575 /* freemem is approximate, so this test OK */
2576 2576 if (!p->pcf_block)
2577 2577 freemem += overshoot;
2578 2578 }
2579 2579 }
2580 2580
2581 2581 return (plist);
2582 2582 }
2583 2583
2584 2584 /*
2585 2585  * One or more constituent pages of this large page have been marked
2586 2586 * toxic. Simply demote the large page to PAGESIZE pages and let
2587 2587 * page_free() handle it. This routine should only be called by
2588 2588  * large page free routines (page_free_pages() and page_destroy_pages()).
2589 2589 * All pages are locked SE_EXCL and have already been marked free.
2590 2590 */
2591 2591 static void
2592 2592 page_free_toxic_pages(page_t *rootpp)
2593 2593 {
2594 2594 page_t *tpp;
2595 2595 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2596 2596 uint_t szc = rootpp->p_szc;
2597 2597
2598 2598 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2599 2599 ASSERT(tpp->p_szc == szc);
2600 2600 ASSERT((PAGE_EXCL(tpp) &&
2601 2601 !page_iolock_assert(tpp)) || panicstr);
2602 2602 tpp->p_szc = 0;
2603 2603 }
2604 2604
2605 2605 while (rootpp != NULL) {
2606 2606 tpp = rootpp;
2607 2607 page_sub(&rootpp, tpp);
2608 2608 ASSERT(PP_ISFREE(tpp));
2609 2609 PP_CLRFREE(tpp);
2610 2610 page_free(tpp, 1);
2611 2611 }
2612 2612 }
2613 2613
2614 2614 /*
2615 2615 * Put page on the "free" list.
2616 2616 * The free list is really two lists maintained by
2617 2617 * the PSM of whatever machine we happen to be on.
2618 2618 */
2619 2619 void
2620 2620 page_free(page_t *pp, int dontneed)
2621 2621 {
2622 2622 struct pcf *p;
2623 2623 uint_t pcf_index;
2624 2624
2625 2625 ASSERT((PAGE_EXCL(pp) &&
2626 2626 !page_iolock_assert(pp)) || panicstr);
2627 2627
2628 2628 if (PP_ISFREE(pp)) {
2629 2629 panic("page_free: page %p is free", (void *)pp);
2630 2630 }
2631 2631
2632 2632 if (pp->p_szc != 0) {
2633 2633 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2634 2634 PP_ISKAS(pp)) {
2635 2635 panic("page_free: anon or kernel "
2636 2636 "or no vnode large page %p", (void *)pp);
2637 2637 }
2638 2638 page_demote_vp_pages(pp);
2639 2639 ASSERT(pp->p_szc == 0);
2640 2640 }
2641 2641
2642 2642 /*
2643 2643 * The page_struct_lock need not be acquired to examine these
2644 2644 * fields since the page has an "exclusive" lock.
2645 2645 */
2646 2646 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2647 2647 pp->p_slckcnt != 0) {
2648 2648 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2649 2649 "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2650 2650 pp->p_cowcnt, pp->p_slckcnt);
2651 2651 /*NOTREACHED*/
2652 2652 }
2653 2653
2654 2654 ASSERT(!hat_page_getshare(pp));
2655 2655
2656 2656 PP_SETFREE(pp);
2657 2657 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2658 2658 !hat_ismod(pp));
2659 2659 page_clr_all_props(pp);
2660 2660 ASSERT(!hat_page_getshare(pp));
2661 2661
2662 2662 /*
2663 2663 * Now we add the page to the head of the free list.
2664 2664 * But if this page is associated with a paged vnode
2665 2665 * then we adjust the head forward so that the page is
2666 2666 * effectively at the end of the list.
2667 2667 */
2668 2668 if (pp->p_vnode == NULL) {
2669 2669 /*
2670 2670 * Page has no identity, put it on the free list.
2671 2671 */
2672 2672 PP_SETAGED(pp);
2673 2673 pp->p_offset = (u_offset_t)-1;
2674 2674 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2675 2675 VM_STAT_ADD(pagecnt.pc_free_free);
2676 2676 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2677 2677 "page_free_free:pp %p", pp);
2678 2678 } else {
2679 2679 PP_CLRAGED(pp);
2680 2680
2681 2681 if (!dontneed) {
2682 2682 /* move it to the tail of the list */
2683 2683 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2684 2684
2685 2685 VM_STAT_ADD(pagecnt.pc_free_cache);
2686 2686 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2687 2687 "page_free_cache_tail:pp %p", pp);
2688 2688 } else {
2689 2689 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2690 2690
2691 2691 VM_STAT_ADD(pagecnt.pc_free_dontneed);
2692 2692 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2693 2693 "page_free_cache_head:pp %p", pp);
2694 2694 }
2695 2695 }
2696 2696 page_unlock(pp);
2697 2697
2698 2698 /*
2699 2699 * Now do the `freemem' accounting.
2700 2700 */
2701 2701 pcf_index = PCF_INDEX();
2702 2702 p = &pcf[pcf_index];
2703 2703
2704 2704 mutex_enter(&p->pcf_lock);
2705 2705 if (p->pcf_block) {
2706 2706 p->pcf_reserve += 1;
2707 2707 } else {
2708 2708 p->pcf_count += 1;
2709 2709 if (p->pcf_wait) {
2710 2710 mutex_enter(&new_freemem_lock);
2711 2711 /*
2712 2712 * Check to see if some other thread
2713 2713 * is actually waiting. Another bucket
2714 2714 * may have woken it up by now. If there
2715 2715 * are no waiters, then set our pcf_wait
2716 2716 * count to zero to avoid coming in here
2717 2717 * next time. Also, since only one page
2718 2718 * was put on the free list, just wake
2719 2719 * up one waiter.
2720 2720 */
2721 2721 if (freemem_wait) {
2722 2722 cv_signal(&freemem_cv);
2723 2723 p->pcf_wait--;
2724 2724 } else {
2725 2725 p->pcf_wait = 0;
2726 2726 }
2727 2727 mutex_exit(&new_freemem_lock);
2728 2728 }
2729 2729 }
2730 2730 mutex_exit(&p->pcf_lock);
2731 2731
2732 2732 /* freemem is approximate, so this test OK */
2733 2733 if (!p->pcf_block)
2734 2734 freemem += 1;
2735 2735 }
2736 2736
2737 2737 /*
2738 2738  * Put page on the "free" list during initial startup.
2739 2739 * This happens during initial single threaded execution.
2740 2740 */
2741 2741 void
2742 2742 page_free_at_startup(page_t *pp)
2743 2743 {
2744 2744 struct pcf *p;
2745 2745 uint_t pcf_index;
2746 2746
2747 2747 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2748 2748 VM_STAT_ADD(pagecnt.pc_free_free);
2749 2749
2750 2750 /*
2751 2751 * Now do the `freemem' accounting.
2752 2752 */
2753 2753 pcf_index = PCF_INDEX();
2754 2754 p = &pcf[pcf_index];
2755 2755
2756 2756 ASSERT(p->pcf_block == 0);
2757 2757 ASSERT(p->pcf_wait == 0);
2758 2758 p->pcf_count += 1;
2759 2759
2760 2760 /* freemem is approximate, so this is OK */
2761 2761 freemem += 1;
2762 2762 }
2763 2763
2764 2764 void
2765 2765 page_free_pages(page_t *pp)
2766 2766 {
2767 2767 page_t *tpp, *rootpp = NULL;
2768 2768 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2769 2769 pgcnt_t i;
2770 2770 uint_t szc = pp->p_szc;
2771 2771
2772 2772 VM_STAT_ADD(pagecnt.pc_free_pages);
2773 2773 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2774 2774 "page_free_free:pp %p", pp);
2775 2775
2776 2776 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2777 2777 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2778 2778 panic("page_free_pages: not root page %p", (void *)pp);
2779 2779 /*NOTREACHED*/
2780 2780 }
2781 2781
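	/*
	 * Mark each constituent page free and aged, then link them back
	 * into a single list rooted at rootpp.
	 */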
2782 2782 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2783 2783 ASSERT((PAGE_EXCL(tpp) &&
2784 2784 !page_iolock_assert(tpp)) || panicstr);
2785 2785 if (PP_ISFREE(tpp)) {
2786 2786 panic("page_free_pages: page %p is free", (void *)tpp);
2787 2787 /*NOTREACHED*/
2788 2788 }
2789 2789 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2790 2790 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2791 2791 panic("page_free_pages %p", (void *)tpp);
2792 2792 /*NOTREACHED*/
2793 2793 }
2794 2794
2795 2795 ASSERT(!hat_page_getshare(tpp));
2796 2796 ASSERT(tpp->p_vnode == NULL);
2797 2797 ASSERT(tpp->p_szc == szc);
2798 2798
2799 2799 PP_SETFREE(tpp);
2800 2800 page_clr_all_props(tpp);
2801 2801 PP_SETAGED(tpp);
2802 2802 tpp->p_offset = (u_offset_t)-1;
2803 2803 ASSERT(tpp->p_next == tpp);
2804 2804 ASSERT(tpp->p_prev == tpp);
2805 2805 page_list_concat(&rootpp, &tpp);
2806 2806 }
2807 2807 ASSERT(rootpp == pp);
2808 2808
2809 2809 page_list_add_pages(rootpp, 0);
2810 2810 page_create_putback(pgcnt);
2811 2811 }
2812 2812
2813 2813 int free_pages = 1;
2814 2814
2815 2815 /*
2816 2816 * This routine attempts to return pages to the cachelist via page_release().
2817 2817 * It does not *have* to be successful in all cases, since the pageout scanner
2818 2818 * will catch any pages it misses. It does need to be fast and not introduce
2819 2819 * too much overhead.
2820 2820 *
2821 2821 * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2822 2822 * don't lock and retry. This is ok, since the page scanner will eventually
2823 2823 * find any page we miss in free_vp_pages().
2824 2824 */
2825 2825 void
2826 2826 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2827 2827 {
2828 2828 page_t *pp;
2829 2829 u_offset_t eoff;
2830 2830 extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2831 2831
2832 2832 eoff = off + len;
2833 2833
2834 2834 if (free_pages == 0)
2835 2835 return;
2836 2836 if (swap_in_range(vp, off, len))
2837 2837 return;
2838 2838
2839 2839 for (; off < eoff; off += PAGESIZE) {
2840 2840
2841 2841 /*
2842 2842 * find the page using a fast, but inexact search. It'll be OK
2843 2843 * if a few pages slip through the cracks here.
2844 2844 */
2845 2845 pp = page_exists(vp, off);
2846 2846
2847 2847 /*
2848 2848 		 * If we didn't find the page (it may not exist), if it is free,
2849 2849 		 * if it still looks in use (shared), or if we can't lock it,
2850 2850 * just give up.
2851 2851 */
2852 2852 if (pp == NULL ||
2853 2853 PP_ISFREE(pp) ||
2854 2854 page_share_cnt(pp) > 0 ||
2855 2855 !page_trylock(pp, SE_EXCL))
2856 2856 continue;
2857 2857
2858 2858 /*
2859 2859 * Once we have locked pp, verify that it's still the
2860 2860 * correct page and not already free
2861 2861 */
2862 2862 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2863 2863 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2864 2864 page_unlock(pp);
2865 2865 continue;
2866 2866 }
2867 2867
2868 2868 /*
2869 2869 * try to release the page...
2870 2870 */
2871 2871 (void) page_release(pp, 1);
2872 2872 }
2873 2873 }
2874 2874
2875 2875 /*
2876 2876 * Reclaim the given page from the free list.
2877 2877  * If pp is part of a large page, only the given constituent page is reclaimed
2878 2878 * and the large page it belonged to will be demoted. This can only happen
2879 2879 * if the page is not on the cachelist.
2880 2880 *
2881 2881 * Returns 1 on success or 0 on failure.
2882 2882 *
2883 2883 * The page is unlocked if it can't be reclaimed (when freemem == 0).
2884 2884 * If `lock' is non-null, it will be dropped and re-acquired if
2885 2885 * the routine must wait while freemem is 0.
2886 2886 *
2887 2887 * As it turns out, boot_getpages() does this. It picks a page,
2888 2888 * based on where OBP mapped in some address, gets its pfn, searches
2889 2889 * the memsegs, locks the page, then pulls it off the free list!
2890 2890 */
2891 2891 int
2892 2892 page_reclaim(page_t *pp, kmutex_t *lock)
2893 2893 {
2894 2894 struct pcf *p;
2895 2895 struct cpu *cpup;
2896 2896 int enough;
2897 2897 uint_t i;
2898 2898
2899 2899 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2900 2900 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2901 2901
2902 2902 /*
2903 2903 * If `freemem' is 0, we cannot reclaim this page from the
2904 2904 * freelist, so release every lock we might hold: the page,
2905 2905 * and the `lock' before blocking.
2906 2906 *
2907 2907 * The only way `freemem' can become 0 while there are pages
2908 2908 * marked free (have their p->p_free bit set) is when the
2909 2909 * system is low on memory and doing a page_create(). In
2910 2910 * order to guarantee that once page_create() starts acquiring
2911 2911 * pages it will be able to get all that it needs since `freemem'
2912 2912 * was decreased by the requested amount. So, we need to release
2913 2913 * this page, and let page_create() have it.
2914 2914 *
2915 2915 * Since `freemem' being zero is not supposed to happen, just
2916 2916 * use the usual hash stuff as a starting point. If that bucket
2917 2917 * is empty, then assume the worst, and start at the beginning
2918 2918 * of the pcf array. If we always start at the beginning
2919 2919 * when acquiring more than one pcf lock, there won't be any
2920 2920 * deadlock problems.
2921 2921 */
2922 2922
2923 2923 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2924 2924
2925 2925 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2926 2926 pcf_acquire_all();
2927 2927 goto page_reclaim_nomem;
2928 2928 }
2929 2929
2930 2930 enough = pcf_decrement_bucket(1);
2931 2931
2932 2932 if (!enough) {
2933 2933 VM_STAT_ADD(page_reclaim_zero);
2934 2934 /*
2935 2935 		 * Check again. It's possible that some other thread
2936 2936 * could have been right behind us, and added one
2937 2937 * to a list somewhere. Acquire each of the pcf locks
2938 2938 * until we find a page.
2939 2939 */
2940 2940 p = pcf;
2941 2941 for (i = 0; i < pcf_fanout; i++) {
2942 2942 mutex_enter(&p->pcf_lock);
2943 2943 if (p->pcf_count >= 1) {
2944 2944 p->pcf_count -= 1;
2945 2945 /*
2946 2946 * freemem is not protected by any lock. Thus,
2947 2947 * we cannot have any assertion containing
2948 2948 * freemem here.
2949 2949 */
2950 2950 freemem -= 1;
2951 2951 enough = 1;
2952 2952 break;
2953 2953 }
2954 2954 p++;
2955 2955 }
2956 2956
2957 2957 if (!enough) {
2958 2958 page_reclaim_nomem:
2959 2959 /*
2960 2960 * We really can't have page `pp'.
2961 2961 * Time for the no-memory dance with
2962 2962 * page_free(). This is just like
2963 2963 * page_create_wait(). Plus the added
2964 2964 * attraction of releasing whatever mutex
2965 2965 			 * we were called with in `lock'.
2966 2966 * Page_unlock() will wakeup any thread
2967 2967 * waiting around for this page.
2968 2968 */
2969 2969 if (lock) {
2970 2970 VM_STAT_ADD(page_reclaim_zero_locked);
2971 2971 mutex_exit(lock);
2972 2972 }
2973 2973 page_unlock(pp);
2974 2974
2975 2975 /*
2976 2976 * get this before we drop all the pcf locks.
2977 2977 */
2978 2978 mutex_enter(&new_freemem_lock);
2979 2979
2980 2980 p = pcf;
2981 2981 for (i = 0; i < pcf_fanout; i++) {
2982 2982 p->pcf_wait++;
2983 2983 mutex_exit(&p->pcf_lock);
2984 2984 p++;
2985 2985 }
2986 2986
2987 2987 freemem_wait++;
2988 2988 cv_wait(&freemem_cv, &new_freemem_lock);
2989 2989 freemem_wait--;
2990 2990
2991 2991 mutex_exit(&new_freemem_lock);
2992 2992
2993 2993 if (lock) {
2994 2994 mutex_enter(lock);
2995 2995 }
2996 2996 return (0);
2997 2997 }
2998 2998
2999 2999 /*
3000 3000 * The pcf accounting has been done,
3001 3001 * though none of the pcf_wait flags have been set,
3002 3002 * drop the locks and continue on.
3003 3003 */
3004 3004 while (p >= pcf) {
3005 3005 mutex_exit(&p->pcf_lock);
3006 3006 p--;
3007 3007 }
3008 3008 }
3009 3009
3010 3010
3011 3011 VM_STAT_ADD(pagecnt.pc_reclaim);
3012 3012
3013 3013 /*
3014 3014 * page_list_sub will handle the case where pp is a large page.
3015 3015 * It's possible that the page was promoted while on the freelist
3016 3016 */
3017 3017 if (PP_ISAGED(pp)) {
3018 3018 page_list_sub(pp, PG_FREE_LIST);
3019 3019 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3020 3020 "page_reclaim_free:pp %p", pp);
3021 3021 } else {
3022 3022 page_list_sub(pp, PG_CACHE_LIST);
3023 3023 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3024 3024 "page_reclaim_cache:pp %p", pp);
3025 3025 }
3026 3026
3027 3027 /*
3028 3028 * clear the p_free & p_age bits since this page is no longer
3029 3029 	 * on the free list. Notice that there is a brief window during
3030 3030 	 * which a page is marked as free but is not on the list.
3031 3031 *
3032 3032 * Set the reference bit to protect against immediate pageout.
3033 3033 */
3034 3034 PP_CLRFREE(pp);
3035 3035 PP_CLRAGED(pp);
3036 3036 page_set_props(pp, P_REF);
3037 3037
3038 3038 CPU_STATS_ENTER_K();
3039 3039 cpup = CPU; /* get cpup now that CPU cannot change */
3040 3040 CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3041 3041 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3042 3042 CPU_STATS_EXIT_K();
3043 3043 ASSERT(pp->p_szc == 0);
3044 3044
3045 3045 return (1);
3046 3046 }
3047 3047
3048 3048 /*
3049 3049 * Destroy identity of the page and put it back on
3050 3050 * the page free list. Assumes that the caller has
3051 3051 * acquired the "exclusive" lock on the page.
3052 3052 */
3053 3053 void
3054 3054 page_destroy(page_t *pp, int dontfree)
3055 3055 {
3056 3056 ASSERT((PAGE_EXCL(pp) &&
3057 3057 !page_iolock_assert(pp)) || panicstr);
3058 3058 ASSERT(pp->p_slckcnt == 0 || panicstr);
3059 3059
3060 3060 if (pp->p_szc != 0) {
3061 3061 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3062 3062 PP_ISKAS(pp)) {
3063 3063 panic("page_destroy: anon or kernel or no vnode "
3064 3064 "large page %p", (void *)pp);
3065 3065 }
3066 3066 page_demote_vp_pages(pp);
3067 3067 ASSERT(pp->p_szc == 0);
3068 3068 }
3069 3069
3070 3070 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3071 3071
3072 3072 /*
3073 3073 * Unload translations, if any, then hash out the
3074 3074 * page to erase its identity.
3075 3075 */
3076 3076 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3077 3077 page_hashout(pp, NULL);
3078 3078
3079 3079 if (!dontfree) {
3080 3080 /*
3081 3081 * Acquire the "freemem_lock" for availrmem.
3082 3082 * The page_struct_lock need not be acquired for lckcnt
3083 3083 * and cowcnt since the page has an "exclusive" lock.
3084 3084 * We are doing a modified version of page_pp_unlock here.
3085 3085 */
3086 3086 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3087 3087 mutex_enter(&freemem_lock);
3088 3088 if (pp->p_lckcnt != 0) {
3089 3089 availrmem++;
3090 3090 pages_locked--;
3091 3091 pp->p_lckcnt = 0;
3092 3092 }
3093 3093 if (pp->p_cowcnt != 0) {
3094 3094 availrmem += pp->p_cowcnt;
3095 3095 pages_locked -= pp->p_cowcnt;
3096 3096 pp->p_cowcnt = 0;
3097 3097 }
3098 3098 mutex_exit(&freemem_lock);
3099 3099 }
3100 3100 /*
3101 3101 * Put the page on the "free" list.
3102 3102 */
3103 3103 page_free(pp, 0);
3104 3104 }
3105 3105 }
3106 3106
3107 3107 void
3108 3108 page_destroy_pages(page_t *pp)
3109 3109 {
3110 3110
3111 3111 page_t *tpp, *rootpp = NULL;
3112 3112 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3113 3113 pgcnt_t i, pglcks = 0;
3114 3114 uint_t szc = pp->p_szc;
3115 3115
3116 3116 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3117 3117
3118 3118 VM_STAT_ADD(pagecnt.pc_destroy_pages);
3119 3119
3120 3120 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3121 3121
3122 3122 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3123 3123 panic("page_destroy_pages: not root page %p", (void *)pp);
3124 3124 /*NOTREACHED*/
3125 3125 }
3126 3126
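	/*
	 * Unload translations and hash out each constituent page, remembering
	 * any lock counts so availrmem can be adjusted below.
	 */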
3127 3127 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3128 3128 ASSERT((PAGE_EXCL(tpp) &&
3129 3129 !page_iolock_assert(tpp)) || panicstr);
3130 3130 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3131 3131 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3132 3132 page_hashout(tpp, NULL);
3133 3133 ASSERT(tpp->p_offset == (u_offset_t)-1);
3134 3134 if (tpp->p_lckcnt != 0) {
3135 3135 pglcks++;
3136 3136 tpp->p_lckcnt = 0;
3137 3137 } else if (tpp->p_cowcnt != 0) {
3138 3138 pglcks += tpp->p_cowcnt;
3139 3139 tpp->p_cowcnt = 0;
3140 3140 }
3141 3141 ASSERT(!hat_page_getshare(tpp));
3142 3142 ASSERT(tpp->p_vnode == NULL);
3143 3143 ASSERT(tpp->p_szc == szc);
3144 3144
3145 3145 PP_SETFREE(tpp);
3146 3146 page_clr_all_props(tpp);
3147 3147 PP_SETAGED(tpp);
3148 3148 ASSERT(tpp->p_next == tpp);
3149 3149 ASSERT(tpp->p_prev == tpp);
3150 3150 page_list_concat(&rootpp, &tpp);
3151 3151 }
3152 3152
3153 3153 ASSERT(rootpp == pp);
3154 3154 if (pglcks != 0) {
3155 3155 mutex_enter(&freemem_lock);
3156 3156 availrmem += pglcks;
3157 3157 mutex_exit(&freemem_lock);
3158 3158 }
3159 3159
3160 3160 page_list_add_pages(rootpp, 0);
3161 3161 page_create_putback(pgcnt);
3162 3162 }
3163 3163
3164 3164 /*
3165 3165 * Similar to page_destroy(), but destroys pages which are
3166 3166 * locked and known to be on the page free list. Since
3167 3167 * the page is known to be free and locked, no one can access
3168 3168 * it.
3169 3169 *
3170 3170 * Also, the number of free pages does not change.
3171 3171 */
3172 3172 void
3173 3173 page_destroy_free(page_t *pp)
3174 3174 {
3175 3175 ASSERT(PAGE_EXCL(pp));
3176 3176 ASSERT(PP_ISFREE(pp));
3177 3177 ASSERT(pp->p_vnode);
3178 3178 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3179 3179 ASSERT(!hat_page_is_mapped(pp));
3180 3180 ASSERT(PP_ISAGED(pp) == 0);
3181 3181 ASSERT(pp->p_szc == 0);
3182 3182
3183 3183 VM_STAT_ADD(pagecnt.pc_destroy_free);
3184 3184 page_list_sub(pp, PG_CACHE_LIST);
3185 3185
3186 3186 page_hashout(pp, NULL);
3187 3187 ASSERT(pp->p_vnode == NULL);
3188 3188 ASSERT(pp->p_offset == (u_offset_t)-1);
3189 3189 ASSERT(pp->p_hash == NULL);
3190 3190
3191 3191 PP_SETAGED(pp);
3192 3192 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3193 3193 page_unlock(pp);
3194 3194
3195 3195 mutex_enter(&new_freemem_lock);
3196 3196 if (freemem_wait) {
3197 3197 cv_signal(&freemem_cv);
3198 3198 }
3199 3199 mutex_exit(&new_freemem_lock);
3200 3200 }
3201 3201
3202 3202 /*
3203 3203 * Rename the page "opp" to have an identity specified
3204 3204 * by [vp, off]. If a page already exists with this name
3205 3205 * it is locked and destroyed. Note that the page's
3206 3206 * translations are not unloaded during the rename.
3207 3207 *
3208 3208 * This routine is used by the anon layer to "steal" the
3209 3209 * original page and is not unlike destroying a page and
3210 3210 * creating a new page using the same page frame.
3211 3211 *
3212 3212 * XXX -- Could deadlock if caller 1 tries to rename A to B while
3213 3213 * caller 2 tries to rename B to A.
3214 3214 */
3215 3215 void
3216 3216 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3217 3217 {
3218 3218 page_t *pp;
3219 3219 int olckcnt = 0;
3220 3220 int ocowcnt = 0;
3221 3221 kmutex_t *phm;
3222 3222 ulong_t index;
3223 3223
3224 3224 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3225 3225 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3226 3226 ASSERT(PP_ISFREE(opp) == 0);
3227 3227
3228 3228 VM_STAT_ADD(page_rename_count);
3229 3229
3230 3230 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3231 3231 "page rename:pp %p vp %p off %llx", opp, vp, off);
3232 3232
3233 3233 /*
3234 3234 * CacheFS may call page_rename for a large NFS page
3235 3235 * when both CacheFS and NFS mount points are used
3236 3236 * by applications. Demote this large page before
3237 3237 * renaming it, to ensure that there are no "partial"
3238 3238 * large pages left lying around.
3239 3239 */
3240 3240 if (opp->p_szc != 0) {
3241 3241 vnode_t *ovp = opp->p_vnode;
3242 3242 ASSERT(ovp != NULL);
3243 3243 ASSERT(!IS_SWAPFSVP(ovp));
3244 3244 ASSERT(!VN_ISKAS(ovp));
3245 3245 page_demote_vp_pages(opp);
3246 3246 ASSERT(opp->p_szc == 0);
3247 3247 }
3248 3248
3249 3249 page_hashout(opp, NULL);
3250 3250 PP_CLRAGED(opp);
3251 3251
3252 3252 /*
3253 3253 * Acquire the appropriate page hash lock, since
3254 3254 * we're going to rename the page.
3255 3255 */
3256 3256 index = PAGE_HASH_FUNC(vp, off);
3257 3257 phm = PAGE_HASH_MUTEX(index);
3258 3258 mutex_enter(phm);
3259 3259 top:
3260 3260 /*
3261 3261 * Look for an existing page with this name and destroy it if found.
3262 3262 * By holding the page hash lock all the way to the page_hashin()
3263 3263 * call, we are assured that no page can be created with this
3264 3264 * identity. In the case when the phm lock is dropped to undo any
3265 3265 * hat layer mappings, the existing page is held with an "exclusive"
3266 3266 * lock, again preventing another page from being created with
3267 3267 * this identity.
3268 3268 */
3269 3269 pp = page_hash_search(index, vp, off);
3270 3270 if (pp != NULL) {
3271 3271 VM_STAT_ADD(page_rename_exists);
3272 3272
3273 3273 /*
3274 3274 * As it turns out, this is one of only two places where
3275 3275 * page_lock() needs to hold the passed in lock in the
3276 3276 * successful case. In all of the others, the lock could
3277 3277 * be dropped as soon as the attempt is made to lock
3278 3278 		 * the page. It is tempting to add yet another argument,
3279 3279 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3280 3280 */
3281 3281 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3282 3282 /*
3283 3283 * Went to sleep because the page could not
3284 3284 * be locked. We were woken up when the page
3285 3285 * was unlocked, or when the page was destroyed.
3286 3286 * In either case, `phm' was dropped while we
3287 3287 * slept. Hence we should not just roar through
3288 3288 * this loop.
3289 3289 */
3290 3290 goto top;
3291 3291 }
3292 3292
3293 3293 /*
3294 3294 * If an existing page is a large page, then demote
3295 3295 * it to ensure that no "partial" large pages are
3296 3296 * "created" after page_rename. An existing page
3297 3297 * can be a CacheFS page, and can't belong to swapfs.
3298 3298 */
3299 3299 if (hat_page_is_mapped(pp)) {
3300 3300 /*
3301 3301 * Unload translations. Since we hold the
3302 3302 * exclusive lock on this page, the page
3303 3303 * can not be changed while we drop phm.
3304 3304 * This is also not a lock protocol violation,
3305 3305 * but rather the proper way to do things.
3306 3306 */
3307 3307 mutex_exit(phm);
3308 3308 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3309 3309 if (pp->p_szc != 0) {
3310 3310 ASSERT(!IS_SWAPFSVP(vp));
3311 3311 ASSERT(!VN_ISKAS(vp));
3312 3312 page_demote_vp_pages(pp);
3313 3313 ASSERT(pp->p_szc == 0);
3314 3314 }
3315 3315 mutex_enter(phm);
3316 3316 } else if (pp->p_szc != 0) {
3317 3317 ASSERT(!IS_SWAPFSVP(vp));
3318 3318 ASSERT(!VN_ISKAS(vp));
3319 3319 mutex_exit(phm);
3320 3320 page_demote_vp_pages(pp);
3321 3321 ASSERT(pp->p_szc == 0);
3322 3322 mutex_enter(phm);
3323 3323 }
3324 3324 page_hashout(pp, phm);
3325 3325 }
3326 3326 /*
3327 3327 * Hash in the page with the new identity.
3328 3328 */
3329 3329 if (!page_hashin(opp, vp, off, phm)) {
3330 3330 /*
3331 3331 * We were holding phm while we searched for [vp, off]
3332 3332 * and only dropped phm if we found and locked a page.
3333 3333 		 * If we can't create this page now, then something
3334 3334 * is really broken.
3335 3335 */
3336 3336 panic("page_rename: Can't hash in page: %p", (void *)pp);
3337 3337 /*NOTREACHED*/
3338 3338 }
3339 3339
3340 3340 ASSERT(MUTEX_HELD(phm));
3341 3341 mutex_exit(phm);
3342 3342
3343 3343 /*
3344 3344 * Now that we have dropped phm, lets get around to finishing up
3345 3345 * with pp.
3346 3346 */
3347 3347 if (pp != NULL) {
3348 3348 ASSERT(!hat_page_is_mapped(pp));
3349 3349 /* for now large pages should not end up here */
3350 3350 ASSERT(pp->p_szc == 0);
3351 3351 /*
3352 3352 * Save the locks for transfer to the new page and then
3353 3353 * clear them so page_free doesn't think they're important.
3354 3354 * The page_struct_lock need not be acquired for lckcnt and
3355 3355 * cowcnt since the page has an "exclusive" lock.
3356 3356 */
3357 3357 olckcnt = pp->p_lckcnt;
3358 3358 ocowcnt = pp->p_cowcnt;
3359 3359 pp->p_lckcnt = pp->p_cowcnt = 0;
3360 3360
3361 3361 /*
3362 3362 * Put the page on the "free" list after we drop
3363 3363 * the lock. The less work under the lock the better.
3364 3364 */
3365 3365 /*LINTED: constant in conditional context*/
3366 3366 VN_DISPOSE(pp, B_FREE, 0, kcred);
3367 3367 }
3368 3368
3369 3369 /*
3370 3370 * Transfer the lock count from the old page (if any).
3371 3371 * The page_struct_lock need not be acquired for lckcnt and
3372 3372 * cowcnt since the page has an "exclusive" lock.
3373 3373 */
3374 3374 opp->p_lckcnt += olckcnt;
3375 3375 opp->p_cowcnt += ocowcnt;
3376 3376 }
3377 3377
3378 3378 /*
3379 3379 * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3380 3380 *
3381 3381 * Pages are normally inserted at the start of a vnode's v_pages list.
3382 3382 * If the vnode is VMODSORT and the page is modified, it goes at the end.
3383 3383 * This can happen when a modified page is relocated for DR.
3384 3384 *
3385 3385 * Returns 1 on success and 0 on failure.
3386 3386 */
3387 3387 static int
3388 3388 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3389 3389 {
3390 3390 page_t **listp;
3391 3391 page_t *tp;
3392 3392 ulong_t index;
3393 3393
3394 3394 ASSERT(PAGE_EXCL(pp));
3395 3395 ASSERT(vp != NULL);
3396 3396 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3397 3397
3398 3398 /*
3399 3399 * Be sure to set these up before the page is inserted on the hash
3400 3400 * list. As soon as the page is placed on the list some other
3401 3401 * thread might get confused and wonder how this page could
3402 3402 * possibly hash to this list.
3403 3403 */
3404 3404 pp->p_vnode = vp;
3405 3405 pp->p_offset = offset;
3406 3406
3407 3407 /*
3408 3408 * record if this page is on a swap vnode
3409 3409 */
3410 3410 if ((vp->v_flag & VISSWAP) != 0)
3411 3411 PP_SETSWAP(pp);
3412 3412
3413 3413 index = PAGE_HASH_FUNC(vp, offset);
3414 3414 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3415 3415 listp = &page_hash[index];
3416 3416
3417 3417 /*
3418 3418 * If this page is already hashed in, fail this attempt to add it.
3419 3419 */
3420 3420 for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3421 3421 if (tp->p_vnode == vp && tp->p_offset == offset) {
3422 3422 pp->p_vnode = NULL;
3423 3423 pp->p_offset = (u_offset_t)(-1);
3424 3424 return (0);
3425 3425 }
3426 3426 }
3427 3427 pp->p_hash = *listp;
3428 3428 *listp = pp;
3429 3429
3430 3430 /*
3431 3431 * Add the page to the vnode's list of pages
3432 3432 */
3433 3433 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3434 3434 listp = &vp->v_pages->p_vpprev->p_vpnext;
3435 3435 else
3436 3436 listp = &vp->v_pages;
3437 3437
3438 3438 page_vpadd(listp, pp);
3439 3439
3440 3440 return (1);
3441 3441 }
3442 3442
3443 3443 /*
3444 3444 * Add page `pp' to both the hash and vp chains for [vp, offset].
3445 3445 *
3446 3446 * Returns 1 on success and 0 on failure.
3447 3447 * If hold is passed in, it is not dropped.
3448 3448 */
3449 3449 int
3450 3450 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3451 3451 {
3452 3452 kmutex_t *phm = NULL;
3453 3453 kmutex_t *vphm;
3454 3454 int rc;
3455 3455
3456 3456 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3457 3457 ASSERT(pp->p_fsdata == 0 || panicstr);
3458 3458
3459 3459 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3460 3460 "page_hashin:pp %p vp %p offset %llx",
3461 3461 pp, vp, offset);
3462 3462
3463 3463 VM_STAT_ADD(hashin_count);
3464 3464
3465 3465 if (hold != NULL)
3466 3466 phm = hold;
3467 3467 else {
3468 3468 VM_STAT_ADD(hashin_not_held);
3469 3469 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3470 3470 mutex_enter(phm);
3471 3471 }
3472 3472
3473 3473 vphm = page_vnode_mutex(vp);
3474 3474 mutex_enter(vphm);
3475 3475 rc = page_do_hashin(pp, vp, offset);
3476 3476 mutex_exit(vphm);
3477 3477 if (hold == NULL)
3478 3478 mutex_exit(phm);
3479 3479 if (rc == 0)
3480 3480 VM_STAT_ADD(hashin_already);
3481 3481 return (rc);
3482 3482 }
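
/*
 * A minimal sketch, not part of this patch, of the "hold" convention used by
 * page_hashin()/page_hashout(): a caller that has already taken the hash
 * bucket mutex (e.g. to search the chain for [vp, off]) passes it in so that
 * page_hashin() neither re-acquires nor drops it. The helper name and the
 * assumption that pp is a freshly created page held SE_EXCL are hypothetical.
 */
static int
hashin_with_held_bucket(page_t *pp, vnode_t *vp, u_offset_t off)
{
	kmutex_t *phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
	int rc;

	mutex_enter(phm);			/* caller owns the bucket mutex */
	/* ... search the hash chain for [vp, off] here ... */
	rc = page_hashin(pp, vp, off, phm);	/* phm is not dropped */
	mutex_exit(phm);			/* so the caller must drop it */
	return (rc);
}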
3483 3483
3484 3484 /*
3485 3485 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3486 3486 * All mutexes must be held
3487 3487 */
3488 3488 static void
3489 3489 page_do_hashout(page_t *pp)
3490 3490 {
3491 3491 page_t **hpp;
3492 3492 page_t *hp;
3493 3493 vnode_t *vp = pp->p_vnode;
3494 3494
3495 3495 ASSERT(vp != NULL);
3496 3496 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3497 3497
3498 3498 /*
3499 3499 * First, take pp off of its hash chain.
3500 3500 */
3501 3501 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3502 3502
3503 3503 for (;;) {
3504 3504 hp = *hpp;
3505 3505 if (hp == pp)
3506 3506 break;
3507 3507 if (hp == NULL) {
3508 3508 panic("page_do_hashout");
3509 3509 /*NOTREACHED*/
3510 3510 }
3511 3511 hpp = &hp->p_hash;
3512 3512 }
3513 3513 *hpp = pp->p_hash;
3514 3514
3515 3515 /*
3516 3516 * Now remove it from its associated vnode.
3517 3517 */
3518 3518 if (vp->v_pages)
3519 3519 page_vpsub(&vp->v_pages, pp);
3520 3520
3521 3521 pp->p_hash = NULL;
3522 3522 page_clr_all_props(pp);
3523 3523 PP_CLRSWAP(pp);
3524 3524 pp->p_vnode = NULL;
3525 3525 pp->p_offset = (u_offset_t)-1;
3526 3526 pp->p_fsdata = 0;
3527 3527 }
3528 3528
3529 3529 /*
3530 3530 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3531 3531 *
3532 3532 * When `phm' is non-NULL it contains the address of the mutex protecting the
3533 3533 * hash list pp is on. It is not dropped.
3534 3534 */
3535 3535 void
3536 3536 page_hashout(page_t *pp, kmutex_t *phm)
3537 3537 {
3538 3538 vnode_t *vp;
3539 3539 ulong_t index;
3540 3540 kmutex_t *nphm;
3541 3541 kmutex_t *vphm;
3542 3542 kmutex_t *sep;
3543 3543
3544 3544 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3545 3545 ASSERT(pp->p_vnode != NULL);
3546 3546 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3547 3547 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3548 3548
3549 3549 vp = pp->p_vnode;
3550 3550
3551 3551 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3552 3552 "page_hashout:pp %p vp %p", pp, vp);
3553 3553
3554 3554 /* Kernel probe */
3555 3555 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3556 3556 tnf_opaque, vnode, vp,
3557 3557 tnf_offset, offset, pp->p_offset);
3558 3558
3559 3559 /*
3560 3560 	 * Find the hash bucket and take its mutex if the caller did not.
3561 3561 */
3562 3562 VM_STAT_ADD(hashout_count);
3563 3563 index = PAGE_HASH_FUNC(vp, pp->p_offset);
3564 3564 if (phm == NULL) {
3565 3565 VM_STAT_ADD(hashout_not_held);
3566 3566 nphm = PAGE_HASH_MUTEX(index);
3567 3567 mutex_enter(nphm);
3568 3568 }
3569 3569 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3570 3570
3571 3571
3572 3572 /*
3573 3573 * grab page vnode mutex and remove it...
3574 3574 */
3575 3575 vphm = page_vnode_mutex(vp);
3576 3576 mutex_enter(vphm);
3577 3577
3578 3578 page_do_hashout(pp);
3579 3579
3580 3580 mutex_exit(vphm);
3581 3581 if (phm == NULL)
3582 3582 mutex_exit(nphm);
3583 3583
3584 3584 /*
3585 3585 * Wake up processes waiting for this page. The page's
3586 3586 * identity has been changed, and is probably not the
3587 3587 * desired page any longer.
3588 3588 */
3589 3589 sep = page_se_mutex(pp);
3590 3590 mutex_enter(sep);
3591 3591 pp->p_selock &= ~SE_EWANTED;
3592 3592 if (CV_HAS_WAITERS(&pp->p_cv))
3593 3593 cv_broadcast(&pp->p_cv);
3594 3594 mutex_exit(sep);
3595 3595 }
3596 3596
3597 3597 /*
3598 3598 * Add the page to the front of a linked list of pages
3599 3599 * using the p_next & p_prev pointers for the list.
3600 3600 * The caller is responsible for protecting the list pointers.
3601 3601 */
3602 3602 void
3603 3603 page_add(page_t **ppp, page_t *pp)
3604 3604 {
3605 3605 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3606 3606
3607 3607 page_add_common(ppp, pp);
3608 3608 }
3609 3609
3610 3610
3611 3611
3612 3612 /*
3613 3613 * Common code for page_add() and mach_page_add()
3614 3614 */
3615 3615 void
3616 3616 page_add_common(page_t **ppp, page_t *pp)
3617 3617 {
3618 3618 if (*ppp == NULL) {
3619 3619 pp->p_next = pp->p_prev = pp;
3620 3620 } else {
3621 3621 pp->p_next = *ppp;
3622 3622 pp->p_prev = (*ppp)->p_prev;
3623 3623 (*ppp)->p_prev = pp;
3624 3624 pp->p_prev->p_next = pp;
3625 3625 }
3626 3626 *ppp = pp;
3627 3627 }
3628 3628
3629 3629
3630 3630 /*
3631 3631 * Remove this page from a linked list of pages
3632 3632 * using the p_next & p_prev pointers for the list.
3633 3633 *
3634 3634 * The caller is responsible for protecting the list pointers.
3635 3635 */
3636 3636 void
3637 3637 page_sub(page_t **ppp, page_t *pp)
3638 3638 {
3639 3639 ASSERT((PP_ISFREE(pp)) ? 1 :
3640 3640 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3641 3641
3642 3642 if (*ppp == NULL || pp == NULL) {
3643 3643 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3644 3644 (void *)pp, (void *)(*ppp));
3645 3645 /*NOTREACHED*/
3646 3646 }
3647 3647
3648 3648 page_sub_common(ppp, pp);
3649 3649 }
3650 3650
3651 3651
3652 3652 /*
3653 3653 * Common code for page_sub() and mach_page_sub()
3654 3654 */
3655 3655 void
3656 3656 page_sub_common(page_t **ppp, page_t *pp)
3657 3657 {
3658 3658 if (*ppp == pp)
3659 3659 *ppp = pp->p_next; /* go to next page */
3660 3660
3661 3661 if (*ppp == pp)
3662 3662 *ppp = NULL; /* page list is gone */
3663 3663 else {
3664 3664 pp->p_prev->p_next = pp->p_next;
3665 3665 pp->p_next->p_prev = pp->p_prev;
3666 3666 }
3667 3667 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
3668 3668 }
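
/*
 * A self-contained, user-space sketch (the "node" type and helpers are
 * hypothetical, not kernel code) of the circular doubly-linked list
 * discipline that page_add_common() and page_sub_common() implement: an
 * empty list is NULL, a single element points at itself, and a removed
 * element is left as a list of one.
 */
#include <stdio.h>

struct node {
	struct node *next;
	struct node *prev;
	int id;
};

static void
list_add(struct node **head, struct node *n)
{
	if (*head == NULL) {
		n->next = n->prev = n;
	} else {
		n->next = *head;
		n->prev = (*head)->prev;
		(*head)->prev = n;
		n->prev->next = n;
	}
	*head = n;
}

static void
list_sub(struct node **head, struct node *n)
{
	if (*head == n)
		*head = n->next;	/* advance the head past n */
	if (*head == n)
		*head = NULL;		/* n was the only element */
	else {
		n->prev->next = n->next;
		n->next->prev = n->prev;
	}
	n->prev = n->next = n;		/* leave n as a list of one */
}

int
main(void)
{
	struct node a = { NULL, NULL, 1 };
	struct node b = { NULL, NULL, 2 };
	struct node *head = NULL;

	list_add(&head, &a);
	list_add(&head, &b);		/* head is b; list is b <-> a */
	list_sub(&head, &a);
	printf("head is node %d\n", head->id);	/* prints "head is node 2" */
	return (0);
}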
3669 3669
3670 3670
3671 3671 /*
3672 3672  * Break page list oppp into two lists with npages in the first list.
3673 3673 * The tail is returned in nppp.
3674 3674 */
3675 3675 void
3676 3676 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3677 3677 {
3678 3678 page_t *s1pp = *oppp;
3679 3679 page_t *s2pp;
3680 3680 page_t *e1pp, *e2pp;
3681 3681 long n = 0;
3682 3682
3683 3683 if (s1pp == NULL) {
3684 3684 *nppp = NULL;
3685 3685 return;
3686 3686 }
3687 3687 if (npages == 0) {
3688 3688 *nppp = s1pp;
3689 3689 *oppp = NULL;
3690 3690 return;
3691 3691 }
3692 3692 for (n = 0, s2pp = *oppp; n < npages; n++) {
3693 3693 s2pp = s2pp->p_next;
3694 3694 }
3695 3695 /* Fix head and tail of new lists */
3696 3696 e1pp = s2pp->p_prev;
3697 3697 e2pp = s1pp->p_prev;
3698 3698 s1pp->p_prev = e1pp;
3699 3699 e1pp->p_next = s1pp;
3700 3700 s2pp->p_prev = e2pp;
3701 3701 e2pp->p_next = s2pp;
3702 3702
3703 3703 /* second list empty */
3704 3704 if (s2pp == s1pp) {
3705 3705 *oppp = s1pp;
3706 3706 *nppp = NULL;
3707 3707 } else {
3708 3708 *oppp = s1pp;
3709 3709 *nppp = s2pp;
3710 3710 }
3711 3711 }
3712 3712
3713 3713 /*
3714 3714 * Concatenate page list nppp onto the end of list ppp.
3715 3715 */
3716 3716 void
3717 3717 page_list_concat(page_t **ppp, page_t **nppp)
3718 3718 {
3719 3719 page_t *s1pp, *s2pp, *e1pp, *e2pp;
3720 3720
3721 3721 if (*nppp == NULL) {
3722 3722 return;
3723 3723 }
3724 3724 if (*ppp == NULL) {
3725 3725 *ppp = *nppp;
3726 3726 return;
3727 3727 }
3728 3728 s1pp = *ppp;
3729 3729 e1pp = s1pp->p_prev;
3730 3730 s2pp = *nppp;
3731 3731 e2pp = s2pp->p_prev;
3732 3732 s1pp->p_prev = e2pp;
3733 3733 e2pp->p_next = s1pp;
3734 3734 e1pp->p_next = s2pp;
3735 3735 s2pp->p_prev = e1pp;
3736 3736 }
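
/*
 * A kernel-context sketch (the helper and the assumption that the caller owns
 * a private list of at least npgs pages are hypothetical) showing that
 * page_list_break() and page_list_concat() are inverses: split the first npgs
 * pages off a list, then concatenate the tail back to restore it.
 */
static void
split_and_rejoin(page_t **pplist, pgcnt_t npgs)
{
	page_t *tail = NULL;

	page_list_break(pplist, &tail, npgs);	/* *pplist keeps npgs pages */
	/* ... operate on the first npgs pages ... */
	page_list_concat(pplist, &tail);	/* reattach the remainder */
}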
3737 3737
3738 3738 /*
3739 3739 * return the next page in the page list
3740 3740 */
3741 3741 page_t *
3742 3742 page_list_next(page_t *pp)
3743 3743 {
3744 3744 return (pp->p_next);
3745 3745 }
3746 3746
3747 3747
3748 3748 /*
3749 3749 * Add the page to the front of the linked list of pages
3750 3750 * using p_vpnext/p_vpprev pointers for the list.
3751 3751 *
3752 3752 * The caller is responsible for protecting the lists.
3753 3753 */
3754 3754 void
3755 3755 page_vpadd(page_t **ppp, page_t *pp)
3756 3756 {
3757 3757 if (*ppp == NULL) {
3758 3758 pp->p_vpnext = pp->p_vpprev = pp;
3759 3759 } else {
3760 3760 pp->p_vpnext = *ppp;
3761 3761 pp->p_vpprev = (*ppp)->p_vpprev;
3762 3762 (*ppp)->p_vpprev = pp;
3763 3763 pp->p_vpprev->p_vpnext = pp;
3764 3764 }
3765 3765 *ppp = pp;
3766 3766 }
3767 3767
3768 3768 /*
3769 3769 * Remove this page from the linked list of pages
3770 3770 * using p_vpnext/p_vpprev pointers for the list.
3771 3771 *
3772 3772 * The caller is responsible for protecting the lists.
3773 3773 */
3774 3774 void
3775 3775 page_vpsub(page_t **ppp, page_t *pp)
3776 3776 {
3777 3777 if (*ppp == NULL || pp == NULL) {
3778 3778 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3779 3779 (void *)pp, (void *)(*ppp));
3780 3780 /*NOTREACHED*/
3781 3781 }
3782 3782
3783 3783 if (*ppp == pp)
3784 3784 *ppp = pp->p_vpnext; /* go to next page */
3785 3785
3786 3786 if (*ppp == pp)
3787 3787 *ppp = NULL; /* page list is gone */
3788 3788 else {
3789 3789 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3790 3790 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3791 3791 }
3792 3792 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3793 3793 }
3794 3794
3795 3795 /*
3796 3796 * Lock a physical page into memory "long term". Used to support "lock
3797 3797 * in memory" functions. Accepts the page to be locked, and a cow variable
3798 3798  * to indicate whether the lock will travel to the new page during
3799 3799 * a potential copy-on-write.
3800 3800 */
3801 3801 int
3802 3802 page_pp_lock(
3803 3803 page_t *pp, /* page to be locked */
3804 3804 int cow, /* cow lock */
3805 3805 int kernel) /* must succeed -- ignore checking */
3806 3806 {
3807 3807 int r = 0; /* result -- assume failure */
3808 3808
3809 3809 ASSERT(PAGE_LOCKED(pp));
3810 3810
3811 3811 page_struct_lock(pp);
3812 3812 /*
3813 3813 * Acquire the "freemem_lock" for availrmem.
3814 3814 */
3815 3815 if (cow) {
3816 3816 mutex_enter(&freemem_lock);
3817 3817 if ((availrmem > pages_pp_maximum) &&
3818 3818 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3819 3819 availrmem--;
3820 3820 pages_locked++;
3821 3821 mutex_exit(&freemem_lock);
3822 3822 r = 1;
3823 3823 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3824 3824 cmn_err(CE_WARN,
3825 3825 "COW lock limit reached on pfn 0x%lx",
3826 3826 page_pptonum(pp));
3827 3827 }
3828 3828 } else
3829 3829 mutex_exit(&freemem_lock);
3830 3830 } else {
3831 3831 if (pp->p_lckcnt) {
3832 3832 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3833 3833 r = 1;
3834 3834 if (++pp->p_lckcnt ==
3835 3835 (ushort_t)PAGE_LOCK_MAXIMUM) {
3836 3836 cmn_err(CE_WARN, "Page lock limit "
3837 3837 "reached on pfn 0x%lx",
3838 3838 page_pptonum(pp));
3839 3839 }
3840 3840 }
3841 3841 } else {
3842 3842 if (kernel) {
3843 3843 /* availrmem accounting done by caller */
3844 3844 ++pp->p_lckcnt;
3845 3845 r = 1;
3846 3846 } else {
3847 3847 mutex_enter(&freemem_lock);
3848 3848 if (availrmem > pages_pp_maximum) {
3849 3849 availrmem--;
3850 3850 pages_locked++;
3851 3851 ++pp->p_lckcnt;
3852 3852 r = 1;
3853 3853 }
3854 3854 mutex_exit(&freemem_lock);
3855 3855 }
3856 3856 }
3857 3857 }
3858 3858 page_struct_unlock(pp);
3859 3859 return (r);
3860 3860 }
3861 3861
3862 3862 /*
3863 3863 * Decommit a lock on a physical page frame. Account for cow locks if
3864 3864 * appropriate.
3865 3865 */
3866 3866 void
3867 3867 page_pp_unlock(
3868 3868 page_t *pp, /* page to be unlocked */
3869 3869 int cow, /* expect cow lock */
3870 3870 int kernel) /* this was a kernel lock */
3871 3871 {
3872 3872 ASSERT(PAGE_LOCKED(pp));
3873 3873
3874 3874 page_struct_lock(pp);
3875 3875 /*
3876 3876 * Acquire the "freemem_lock" for availrmem.
3877 3877 	 * If cowcnt or lckcnt is already 0, do nothing; i.e., we
3878 3878 * could be called to unlock even if nothing is locked. This could
3879 3879 * happen if locked file pages were truncated (removing the lock)
3880 3880 * and the file was grown again and new pages faulted in; the new
3881 3881 * pages are unlocked but the segment still thinks they're locked.
3882 3882 */
3883 3883 if (cow) {
3884 3884 if (pp->p_cowcnt) {
3885 3885 mutex_enter(&freemem_lock);
3886 3886 pp->p_cowcnt--;
3887 3887 availrmem++;
3888 3888 pages_locked--;
3889 3889 mutex_exit(&freemem_lock);
3890 3890 }
3891 3891 } else {
3892 3892 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3893 3893 if (!kernel) {
3894 3894 mutex_enter(&freemem_lock);
3895 3895 availrmem++;
3896 3896 pages_locked--;
3897 3897 mutex_exit(&freemem_lock);
3898 3898 }
3899 3899 }
3900 3900 }
3901 3901 page_struct_unlock(pp);
3902 3902 }
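
/*
 * A minimal sketch (hypothetical caller; assumes pp is already held SE_SHARED
 * or SE_EXCL) of the pairing expected by page_pp_lock()/page_pp_unlock(): the
 * same cow and kernel arguments must be used on both sides so that the
 * availrmem accounting balances.
 */
static int
lock_user_page_long_term(page_t *pp)
{
	if (page_pp_lock(pp, 0, 0) == 0)	/* non-cow, non-kernel lock */
		return (EAGAIN);		/* could not account for it */
	/* ... the page now stays in memory ... */
	page_pp_unlock(pp, 0, 0);		/* must mirror the lock call */
	return (0);
}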
3903 3903
3904 3904 /*
3905 3905 * This routine reserves availrmem for npages;
3906 3906 * flags: KM_NOSLEEP or KM_SLEEP
3907 3907 * returns 1 on success or 0 on failure
3908 3908 */
3909 3909 int
3910 3910 page_resv(pgcnt_t npages, uint_t flags)
3911 3911 {
3912 3912 mutex_enter(&freemem_lock);
3913 3913 while (availrmem < tune.t_minarmem + npages) {
3914 3914 if (flags & KM_NOSLEEP) {
3915 3915 mutex_exit(&freemem_lock);
3916 3916 return (0);
3917 3917 }
3918 3918 mutex_exit(&freemem_lock);
3919 3919 page_needfree(npages);
3920 3920 kmem_reap();
3921 3921 delay(hz >> 2);
3922 3922 page_needfree(-(spgcnt_t)npages);
3923 3923 mutex_enter(&freemem_lock);
3924 3924 }
3925 3925 availrmem -= npages;
3926 3926 mutex_exit(&freemem_lock);
3927 3927 return (1);
3928 3928 }
3929 3929
3930 3930 /*
3931 3931 * This routine unreserves availrmem for npages;
3932 3932 */
3933 3933 void
3934 3934 page_unresv(pgcnt_t npages)
3935 3935 {
3936 3936 mutex_enter(&freemem_lock);
3937 3937 availrmem += npages;
3938 3938 mutex_exit(&freemem_lock);
3939 3939 }
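
/*
 * A minimal sketch (hypothetical caller) of the reserve/unreserve pairing:
 * a successful page_resv() charges availrmem and must eventually be paid
 * back with page_unresv() for the same page count.
 */
static int
with_reserved_pages(pgcnt_t npages)
{
	if (!page_resv(npages, KM_NOSLEEP))
		return (ENOMEM);	/* availrmem could not cover npages */
	/* ... consume the reservation ... */
	page_unresv(npages);
	return (0);
}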
3940 3940
3941 3941 /*
3942 3942 * See Statement at the beginning of segvn_lockop() regarding
3943 3943 * the way we handle cowcnts and lckcnts.
3944 3944 *
3945 3945 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3946 3946 * that breaks COW has PROT_WRITE.
3947 3947 *
3948 3948 * Note that, we may also break COW in case we are softlocking
3949 3949 * on read access during physio;
3950 3950 * in this softlock case, the vpage may not have PROT_WRITE.
3951 3951 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3952 3952 * if the vpage doesn't have PROT_WRITE.
3953 3953 *
3954 3954 * This routine is never called if we are stealing a page
3955 3955 * in anon_private.
3956 3956 *
3957 3957  * The caller subtracted from availrmem for a read-only mapping;
3958 3958  * if lckcnt is 1, increment availrmem.
3959 3959 */
3960 3960 void
3961 3961 page_pp_useclaim(
3962 3962 page_t *opp, /* original page frame losing lock */
3963 3963 page_t *npp, /* new page frame gaining lock */
3964 3964 uint_t write_perm) /* set if vpage has PROT_WRITE */
3965 3965 {
3966 3966 int payback = 0;
3967 3967 int nidx, oidx;
3968 3968
3969 3969 ASSERT(PAGE_LOCKED(opp));
3970 3970 ASSERT(PAGE_LOCKED(npp));
3971 3971
3972 3972 /*
3973 3973 * Since we have two pages we probably have two locks. We need to take
3974 3974 * them in a defined order to avoid deadlocks. It's also possible they
3975 3975 * both hash to the same lock in which case this is a non-issue.
3976 3976 */
3977 3977 nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3978 3978 oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3979 3979 if (nidx < oidx) {
3980 3980 page_struct_lock(npp);
3981 3981 page_struct_lock(opp);
3982 3982 } else if (oidx < nidx) {
3983 3983 page_struct_lock(opp);
3984 3984 page_struct_lock(npp);
3985 3985 } else { /* The pages hash to the same lock */
3986 3986 page_struct_lock(npp);
3987 3987 }
3988 3988
3989 3989 ASSERT(npp->p_cowcnt == 0);
3990 3990 ASSERT(npp->p_lckcnt == 0);
3991 3991
3992 3992 /* Don't use claim if nothing is locked (see page_pp_unlock above) */
3993 3993 if ((write_perm && opp->p_cowcnt != 0) ||
3994 3994 (!write_perm && opp->p_lckcnt != 0)) {
3995 3995
3996 3996 if (write_perm) {
3997 3997 npp->p_cowcnt++;
3998 3998 ASSERT(opp->p_cowcnt != 0);
3999 3999 opp->p_cowcnt--;
4000 4000 } else {
4001 4001
4002 4002 ASSERT(opp->p_lckcnt != 0);
4003 4003
4004 4004 /*
4005 4005 * We didn't need availrmem decremented if p_lckcnt on
4006 4006 			 * the original page is 1. Here, we are unlocking the
4007 4007 			 * read-only copy belonging to the original page and
4008 4008 			 * are locking a copy belonging to the new page.
4009 4009 */
4010 4010 if (opp->p_lckcnt == 1)
4011 4011 payback = 1;
4012 4012
4013 4013 npp->p_lckcnt++;
4014 4014 opp->p_lckcnt--;
4015 4015 }
4016 4016 }
4017 4017 if (payback) {
4018 4018 mutex_enter(&freemem_lock);
4019 4019 availrmem++;
4020 4020 pages_useclaim--;
4021 4021 mutex_exit(&freemem_lock);
4022 4022 }
4023 4023
4024 4024 if (nidx < oidx) {
4025 4025 page_struct_unlock(opp);
4026 4026 page_struct_unlock(npp);
4027 4027 } else if (oidx < nidx) {
4028 4028 page_struct_unlock(npp);
4029 4029 page_struct_unlock(opp);
4030 4030 } else { /* The pages hash to the same lock */
4031 4031 page_struct_unlock(npp);
4032 4032 }
4033 4033 }
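
/*
 * A sketch of the ordering rule page_pp_useclaim() follows when it must hold
 * two hashed page_struct locks at once: take the lower-indexed mutex first,
 * take only one mutex when both pages hash to the same index, and release in
 * the reverse order. The lock array, its initialization (mutex_init()
 * elsewhere) and the precomputed hash indexes are hypothetical.
 */
static kmutex_t ex_locks[64];		/* hypothetical hashed lock array */

static void
ex_lock_pair(uint_t a, uint_t b)	/* a, b: precomputed hash indexes */
{
	if (a == b) {
		mutex_enter(&ex_locks[a]);	/* one lock covers both */
	} else if (a < b) {
		mutex_enter(&ex_locks[a]);
		mutex_enter(&ex_locks[b]);
	} else {
		mutex_enter(&ex_locks[b]);
		mutex_enter(&ex_locks[a]);
	}
}

static void
ex_unlock_pair(uint_t a, uint_t b)
{
	if (a == b) {
		mutex_exit(&ex_locks[a]);
	} else {
		mutex_exit(&ex_locks[a > b ? a : b]);	/* higher index first */
		mutex_exit(&ex_locks[a < b ? a : b]);	/* then the lower one */
	}
}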
4034 4034
4035 4035 /*
4036 4036 * Simple claim adjust functions -- used to support changes in
4037 4037 * claims due to changes in access permissions. Used by segvn_setprot().
4038 4038 */
4039 4039 int
4040 4040 page_addclaim(page_t *pp)
4041 4041 {
4042 4042 int r = 0; /* result */
4043 4043
4044 4044 ASSERT(PAGE_LOCKED(pp));
4045 4045
4046 4046 page_struct_lock(pp);
4047 4047 ASSERT(pp->p_lckcnt != 0);
4048 4048
4049 4049 if (pp->p_lckcnt == 1) {
4050 4050 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4051 4051 --pp->p_lckcnt;
4052 4052 r = 1;
4053 4053 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4054 4054 cmn_err(CE_WARN,
4055 4055 "COW lock limit reached on pfn 0x%lx",
4056 4056 page_pptonum(pp));
4057 4057 }
4058 4058 }
4059 4059 } else {
4060 4060 mutex_enter(&freemem_lock);
4061 4061 if ((availrmem > pages_pp_maximum) &&
4062 4062 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4063 4063 --availrmem;
4064 4064 ++pages_claimed;
4065 4065 mutex_exit(&freemem_lock);
4066 4066 --pp->p_lckcnt;
4067 4067 r = 1;
4068 4068 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4069 4069 cmn_err(CE_WARN,
4070 4070 "COW lock limit reached on pfn 0x%lx",
4071 4071 page_pptonum(pp));
4072 4072 }
4073 4073 } else
4074 4074 mutex_exit(&freemem_lock);
4075 4075 }
4076 4076 page_struct_unlock(pp);
4077 4077 return (r);
4078 4078 }
4079 4079
4080 4080 int
4081 4081 page_subclaim(page_t *pp)
4082 4082 {
4083 4083 int r = 0;
4084 4084
4085 4085 ASSERT(PAGE_LOCKED(pp));
4086 4086
4087 4087 page_struct_lock(pp);
4088 4088 ASSERT(pp->p_cowcnt != 0);
4089 4089
4090 4090 if (pp->p_lckcnt) {
4091 4091 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4092 4092 r = 1;
4093 4093 /*
4094 4094 * for availrmem
4095 4095 */
4096 4096 mutex_enter(&freemem_lock);
4097 4097 availrmem++;
4098 4098 pages_claimed--;
4099 4099 mutex_exit(&freemem_lock);
4100 4100
4101 4101 pp->p_cowcnt--;
4102 4102
4103 4103 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4104 4104 cmn_err(CE_WARN,
4105 4105 "Page lock limit reached on pfn 0x%lx",
4106 4106 page_pptonum(pp));
4107 4107 }
4108 4108 }
4109 4109 } else {
4110 4110 r = 1;
4111 4111 pp->p_cowcnt--;
4112 4112 pp->p_lckcnt++;
4113 4113 }
4114 4114 page_struct_unlock(pp);
4115 4115 return (r);
4116 4116 }
4117 4117
4118 4118 /*
4119 4119 * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4120 4120 * page.
4121 4121 */
4122 4122 int
4123 4123 page_addclaim_pages(page_t **ppa)
4124 4124 {
4125 4125 pgcnt_t lckpgs = 0, pg_idx;
4126 4126
4127 4127 VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4128 4128
4129 4129 /*
4130 4130 * Only need to take the page struct lock on the large page root.
4131 4131 */
4132 4132 page_struct_lock(ppa[0]);
4133 4133 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4134 4134
4135 4135 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4136 4136 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4137 4137 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4138 4138 page_struct_unlock(ppa[0]);
4139 4139 return (0);
4140 4140 }
4141 4141 if (ppa[pg_idx]->p_lckcnt > 1)
4142 4142 lckpgs++;
4143 4143 }
4144 4144
4145 4145 if (lckpgs != 0) {
4146 4146 mutex_enter(&freemem_lock);
4147 4147 if (availrmem >= pages_pp_maximum + lckpgs) {
4148 4148 availrmem -= lckpgs;
4149 4149 pages_claimed += lckpgs;
4150 4150 } else {
4151 4151 mutex_exit(&freemem_lock);
4152 4152 page_struct_unlock(ppa[0]);
4153 4153 return (0);
4154 4154 }
4155 4155 mutex_exit(&freemem_lock);
4156 4156 }
4157 4157
4158 4158 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4159 4159 ppa[pg_idx]->p_lckcnt--;
4160 4160 ppa[pg_idx]->p_cowcnt++;
4161 4161 }
4162 4162 page_struct_unlock(ppa[0]);
4163 4163 return (1);
4164 4164 }
4165 4165
4166 4166 /*
4167 4167 * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4168 4168 * page.
4169 4169 */
4170 4170 int
4171 4171 page_subclaim_pages(page_t **ppa)
4172 4172 {
4173 4173 pgcnt_t ulckpgs = 0, pg_idx;
4174 4174
4175 4175 VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4176 4176
4177 4177 /*
4178 4178 * Only need to take the page struct lock on the large page root.
4179 4179 */
4180 4180 page_struct_lock(ppa[0]);
4181 4181 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4182 4182
4183 4183 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4184 4184 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4185 4185 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4186 4186 page_struct_unlock(ppa[0]);
4187 4187 return (0);
4188 4188 }
4189 4189 if (ppa[pg_idx]->p_lckcnt != 0)
4190 4190 ulckpgs++;
4191 4191 }
4192 4192
4193 4193 if (ulckpgs != 0) {
4194 4194 mutex_enter(&freemem_lock);
4195 4195 availrmem += ulckpgs;
4196 4196 pages_claimed -= ulckpgs;
4197 4197 mutex_exit(&freemem_lock);
4198 4198 }
4199 4199
4200 4200 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4201 4201 ppa[pg_idx]->p_cowcnt--;
4202 4202 ppa[pg_idx]->p_lckcnt++;
4203 4203
4204 4204 }
4205 4205 page_struct_unlock(ppa[0]);
4206 4206 return (1);
4207 4207 }
4208 4208
4209 4209 page_t *
4210 4210 page_numtopp(pfn_t pfnum, se_t se)
4211 4211 {
4212 4212 page_t *pp;
4213 4213
4214 4214 retry:
4215 4215 pp = page_numtopp_nolock(pfnum);
4216 4216 if (pp == NULL) {
4217 4217 return ((page_t *)NULL);
4218 4218 }
4219 4219
4220 4220 /*
4221 4221 * Acquire the appropriate lock on the page.
4222 4222 */
4223 4223 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4224 4224 if (page_pptonum(pp) != pfnum)
4225 4225 goto retry;
4226 4226 continue;
4227 4227 }
4228 4228
4229 4229 if (page_pptonum(pp) != pfnum) {
4230 4230 page_unlock(pp);
4231 4231 goto retry;
4232 4232 }
4233 4233
4234 4234 return (pp);
4235 4235 }
4236 4236
4237 4237 page_t *
4238 4238 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4239 4239 {
4240 4240 page_t *pp;
4241 4241
4242 4242 retry:
4243 4243 pp = page_numtopp_nolock(pfnum);
4244 4244 if (pp == NULL) {
4245 4245 return ((page_t *)NULL);
4246 4246 }
4247 4247
4248 4248 /*
4249 4249 * Acquire the appropriate lock on the page.
4250 4250 */
4251 4251 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4252 4252 if (page_pptonum(pp) != pfnum)
4253 4253 goto retry;
4254 4254 continue;
4255 4255 }
4256 4256
4257 4257 if (page_pptonum(pp) != pfnum) {
4258 4258 page_unlock(pp);
4259 4259 goto retry;
4260 4260 }
4261 4261
4262 4262 return (pp);
4263 4263 }
4264 4264
4265 4265 /*
4266 4266 * This routine is like page_numtopp, but will only return page structs
4267 4267 * for pages which are ok for loading into hardware using the page struct.
4268 4268 */
4269 4269 page_t *
4270 4270 page_numtopp_nowait(pfn_t pfnum, se_t se)
4271 4271 {
4272 4272 page_t *pp;
4273 4273
4274 4274 retry:
4275 4275 pp = page_numtopp_nolock(pfnum);
4276 4276 if (pp == NULL) {
4277 4277 return ((page_t *)NULL);
4278 4278 }
4279 4279
4280 4280 /*
4281 4281 * Try to acquire the appropriate lock on the page.
4282 4282 */
4283 4283 if (PP_ISFREE(pp))
4284 4284 pp = NULL;
4285 4285 else {
4286 4286 if (!page_trylock(pp, se))
4287 4287 pp = NULL;
4288 4288 else {
4289 4289 if (page_pptonum(pp) != pfnum) {
4290 4290 page_unlock(pp);
4291 4291 goto retry;
4292 4292 }
4293 4293 if (PP_ISFREE(pp)) {
4294 4294 page_unlock(pp);
4295 4295 pp = NULL;
4296 4296 }
4297 4297 }
4298 4298 }
4299 4299 return (pp);
4300 4300 }
4301 4301
4302 4302 #define SYNC_PROGRESS_NPAGES 1000
4303 4303
4304 4304 /*
4305 4305 * Returns a count of dirty pages that are in the process
4306 4306 * of being written out. If 'cleanit' is set, try to push the page.
4307 4307 */
4308 4308 pgcnt_t
4309 4309 page_busy(int cleanit)
4310 4310 {
4311 4311 page_t *page0 = page_first();
4312 4312 page_t *pp = page0;
4313 4313 pgcnt_t nppbusy = 0;
4314 4314 int counter = 0;
4315 4315 u_offset_t off;
4316 4316
4317 4317 do {
4318 4318 vnode_t *vp = pp->p_vnode;
4319 4319
4320 4320 /*
4321 4321 * Reset the sync timeout. The page list is very long
4322 4322 * on large memory systems.
4323 4323 */
4324 4324 if (++counter > SYNC_PROGRESS_NPAGES) {
4325 4325 counter = 0;
4326 4326 vfs_syncprogress();
4327 4327 }
4328 4328
4329 4329 /*
4330 4330 * A page is a candidate for syncing if it is:
4331 4331 *
4332 4332 * (a) On neither the freelist nor the cachelist
4333 4333 * (b) Hashed onto a vnode
4334 4334 * (c) Not a kernel page
4335 4335 * (d) Dirty
4336 4336 * (e) Not part of a swapfile
4337 4337 	 * (f) A page which belongs to a real vnode; e.g., has a non-null
4338 4338 * v_vfsp pointer.
4339 4339 * (g) Backed by a filesystem which doesn't have a
4340 4340 * stubbed-out sync operation
4341 4341 */
4342 4342 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4343 4343 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4344 4344 vfs_can_sync(vp->v_vfsp)) {
4345 4345 nppbusy++;
4346 4346
4347 4347 if (!cleanit)
4348 4348 continue;
4349 4349 if (!page_trylock(pp, SE_EXCL))
4350 4350 continue;
4351 4351
4352 4352 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4353 4353 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4354 4354 !(hat_pagesync(pp,
4355 4355 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4356 4356 page_unlock(pp);
4357 4357 continue;
4358 4358 }
4359 4359 off = pp->p_offset;
4360 4360 VN_HOLD(vp);
4361 4361 page_unlock(pp);
4362 4362 (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4363 4363 B_ASYNC | B_FREE, kcred, NULL);
4364 4364 VN_RELE(vp);
4365 4365 }
4366 4366 } while ((pp = page_next(pp)) != page0);
4367 4367
4368 4368 vfs_syncprogress();
4369 4369 return (nppbusy);
4370 4370 }
4371 4371
4372 4372 void page_invalidate_pages(void);
4373 4373
4374 4374 /*
4375 4375 * callback handler to vm sub-system
4376 4376 *
4377 4377  * Callers must ensure there are no recursive entries to this function.
4378 4378 */
4379 4379 /*ARGSUSED*/
4380 4380 boolean_t
4381 4381 callb_vm_cpr(void *arg, int code)
4382 4382 {
4383 4383 if (code == CB_CODE_CPR_CHKPT)
4384 4384 page_invalidate_pages();
4385 4385 return (B_TRUE);
4386 4386 }
4387 4387
4388 4388 /*
4389 4389 * Invalidate all pages of the system.
4390 4390  * It shouldn't be called until all user page activity has stopped.
4391 4391 */
4392 4392 void
4393 4393 page_invalidate_pages()
4394 4394 {
4395 4395 page_t *pp;
4396 4396 page_t *page0;
4397 4397 pgcnt_t nbusypages;
4398 4398 int retry = 0;
4399 4399 const int MAXRETRIES = 4;
4400 4400 top:
4401 4401 /*
4402 4402 * Flush dirty pages and destroy the clean ones.
4403 4403 */
4404 4404 nbusypages = 0;
4405 4405
4406 4406 pp = page0 = page_first();
4407 4407 do {
4408 4408 struct vnode *vp;
4409 4409 u_offset_t offset;
4410 4410 int mod;
4411 4411
4412 4412 /*
4413 4413 		 * skip the page if it has no vnode, or if it is associated
4414 4414 		 * with the kernel vnode or PROM-allocated kernel memory.
4415 4415 */
4416 4416 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4417 4417 continue;
4418 4418
4419 4419 /*
4420 4420 		 * skip the page if it has already been freed and invalidated.
4421 4421 */
4422 4422 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4423 4423 continue;
4424 4424
4425 4425 /*
4426 4426 * skip pages that are already locked or can't be "exclusively"
4427 4427 * locked or are already free. After we lock the page, check
4428 4428 * the free and age bits again to be sure it's not destroyed
4429 4429 * yet.
4430 4430 * To achieve max. parallelization, we use page_trylock instead
4431 4431 		 * of page_lock so that we don't block on individual pages
4432 4432 * while we have thousands of other pages to process.
4433 4433 */
4434 4434 if (!page_trylock(pp, SE_EXCL)) {
4435 4435 nbusypages++;
4436 4436 continue;
4437 4437 } else if (PP_ISFREE(pp)) {
4438 4438 if (!PP_ISAGED(pp)) {
4439 4439 page_destroy_free(pp);
4440 4440 } else {
4441 4441 page_unlock(pp);
4442 4442 }
4443 4443 continue;
4444 4444 }
4445 4445 /*
4446 4446 * Is this page involved in some I/O? shared?
4447 4447 *
4448 4448 * The page_struct_lock need not be acquired to
4449 4449 * examine these fields since the page has an
4450 4450 * "exclusive" lock.
4451 4451 */
4452 4452 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4453 4453 page_unlock(pp);
4454 4454 continue;
4455 4455 }
4456 4456
4457 4457 if (vp->v_type == VCHR) {
4458 4458 panic("vp->v_type == VCHR");
4459 4459 /*NOTREACHED*/
4460 4460 }
4461 4461
4462 4462 if (!page_try_demote_pages(pp)) {
4463 4463 page_unlock(pp);
4464 4464 continue;
4465 4465 }
4466 4466
4467 4467 /*
4468 4468 * Check the modified bit. Leave the bits alone in hardware
4469 4469 * (they will be modified if we do the putpage).
4470 4470 */
4471 4471 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4472 4472 & P_MOD);
4473 4473 if (mod) {
4474 4474 offset = pp->p_offset;
4475 4475 /*
4476 4476 * Hold the vnode before releasing the page lock
4477 4477 * to prevent it from being freed and re-used by
4478 4478 * some other thread.
4479 4479 */
4480 4480 VN_HOLD(vp);
4481 4481 page_unlock(pp);
4482 4482 /*
4483 4483 * No error return is checked here. Callers such as
4484 4484 			 * cpr deal with the dirty pages at dump time
4485 4485 * if this putpage fails.
4486 4486 */
4487 4487 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4488 4488 kcred, NULL);
4489 4489 VN_RELE(vp);
4490 4490 } else {
4491 4491 /*LINTED: constant in conditional context*/
4492 4492 VN_DISPOSE(pp, B_INVAL, 0, kcred);
4493 4493 }
4494 4494 } while ((pp = page_next(pp)) != page0);
4495 4495 if (nbusypages && retry++ < MAXRETRIES) {
4496 4496 delay(1);
4497 4497 goto top;
4498 4498 }
4499 4499 }
4500 4500
4501 4501 /*
4502 4502 * Replace the page "old" with the page "new" on the page hash and vnode lists
4503 4503 *
4504 4504  * The replacement must be done in place, i.e., the equivalent sequence:
4505 4505 *
4506 4506 * vp = old->p_vnode;
4507 4507 * off = old->p_offset;
4508 4508 * page_do_hashout(old)
4509 4509 * page_do_hashin(new, vp, off)
4510 4510 *
4511 4511 * doesn't work, since
4512 4512 * 1) if old is the only page on the vnode, the v_pages list has a window
4513 4513 * where it looks empty. This will break file system assumptions.
4514 4514 * and
4515 4515 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4516 4516 */
4517 4517 static void
4518 4518 page_do_relocate_hash(page_t *new, page_t *old)
4519 4519 {
4520 4520 page_t **hash_list;
4521 4521 vnode_t *vp = old->p_vnode;
4522 4522 kmutex_t *sep;
4523 4523
4524 4524 ASSERT(PAGE_EXCL(old));
4525 4525 ASSERT(PAGE_EXCL(new));
4526 4526 ASSERT(vp != NULL);
4527 4527 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4528 4528 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4529 4529
4530 4530 /*
4531 4531 * First find old page on the page hash list
4532 4532 */
4533 4533 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4534 4534
4535 4535 for (;;) {
4536 4536 if (*hash_list == old)
4537 4537 break;
4538 4538 if (*hash_list == NULL) {
4539 4539 panic("page_do_hashout");
4540 4540 /*NOTREACHED*/
4541 4541 }
4542 4542 hash_list = &(*hash_list)->p_hash;
4543 4543 }
4544 4544
4545 4545 /*
4546 4546 * update new and replace old with new on the page hash list
4547 4547 */
4548 4548 new->p_vnode = old->p_vnode;
4549 4549 new->p_offset = old->p_offset;
4550 4550 new->p_hash = old->p_hash;
4551 4551 *hash_list = new;
4552 4552
4553 4553 if ((new->p_vnode->v_flag & VISSWAP) != 0)
4554 4554 PP_SETSWAP(new);
4555 4555
4556 4556 /*
4557 4557 * replace old with new on the vnode's page list
4558 4558 */
4559 4559 if (old->p_vpnext == old) {
4560 4560 new->p_vpnext = new;
4561 4561 new->p_vpprev = new;
4562 4562 } else {
4563 4563 new->p_vpnext = old->p_vpnext;
4564 4564 new->p_vpprev = old->p_vpprev;
4565 4565 new->p_vpnext->p_vpprev = new;
4566 4566 new->p_vpprev->p_vpnext = new;
4567 4567 }
4568 4568 if (vp->v_pages == old)
4569 4569 vp->v_pages = new;
4570 4570
4571 4571 /*
4572 4572 * clear out the old page
4573 4573 */
4574 4574 old->p_hash = NULL;
4575 4575 old->p_vpnext = NULL;
4576 4576 old->p_vpprev = NULL;
4577 4577 old->p_vnode = NULL;
4578 4578 PP_CLRSWAP(old);
4579 4579 old->p_offset = (u_offset_t)-1;
4580 4580 page_clr_all_props(old);
4581 4581
4582 4582 /*
4583 4583 * Wake up processes waiting for this page. The page's
4584 4584 * identity has been changed, and is probably not the
4585 4585 * desired page any longer.
4586 4586 */
4587 4587 sep = page_se_mutex(old);
4588 4588 mutex_enter(sep);
4589 4589 old->p_selock &= ~SE_EWANTED;
4590 4590 if (CV_HAS_WAITERS(&old->p_cv))
4591 4591 cv_broadcast(&old->p_cv);
4592 4592 mutex_exit(sep);
4593 4593 }
4594 4594
4595 4595 /*
4596 4596 * This function moves the identity of page "pp_old" to page "pp_new".
4597 4597 * Both pages must be locked on entry. "pp_new" is free, has no identity,
4598 4598 * and need not be hashed out from anywhere.
4599 4599 */
4600 4600 void
4601 4601 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4602 4602 {
4603 4603 vnode_t *vp = pp_old->p_vnode;
4604 4604 u_offset_t off = pp_old->p_offset;
4605 4605 kmutex_t *phm, *vphm;
4606 4606
4607 4607 /*
4608 4608 * Rehash two pages
4609 4609 */
4610 4610 ASSERT(PAGE_EXCL(pp_old));
4611 4611 ASSERT(PAGE_EXCL(pp_new));
4612 4612 ASSERT(vp != NULL);
4613 4613 ASSERT(pp_new->p_vnode == NULL);
4614 4614
4615 4615 /*
4616 4616 * hashout then hashin while holding the mutexes
4617 4617 */
4618 4618 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4619 4619 mutex_enter(phm);
4620 4620 vphm = page_vnode_mutex(vp);
4621 4621 mutex_enter(vphm);
4622 4622
4623 4623 page_do_relocate_hash(pp_new, pp_old);
4624 4624
4625 4625 /* The following comment preserved from page_flip(). */
4626 4626 pp_new->p_fsdata = pp_old->p_fsdata;
4627 4627 pp_old->p_fsdata = 0;
4628 4628 mutex_exit(vphm);
4629 4629 mutex_exit(phm);
4630 4630
4631 4631 /*
4632 4632 * The page_struct_lock need not be acquired for lckcnt and
4633 4633 * cowcnt since the page has an "exclusive" lock.
4634 4634 */
4635 4635 ASSERT(pp_new->p_lckcnt == 0);
4636 4636 ASSERT(pp_new->p_cowcnt == 0);
4637 4637 pp_new->p_lckcnt = pp_old->p_lckcnt;
4638 4638 pp_new->p_cowcnt = pp_old->p_cowcnt;
4639 4639 pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4640 4640
4641 4641 }
4642 4642
4643 4643 /*
4644 4644 * Helper routine used to lock all remaining members of a
4645 4645 * large page. The caller is responsible for passing in a locked
4646 4646 * pp. If pp is a large page, then it succeeds in locking all the
4647 4647 * remaining constituent pages or it returns with only the
4648 4648 * original page locked.
4649 4649 *
4650 4650 * Returns 1 on success, 0 on failure.
4651 4651 *
4652 4652 * If success is returned this routine guarantees p_szc for all constituent
4653 4653 * pages of a large page pp belongs to can't change. To achieve this we
4654 4654 * recheck szc of pp after locking all constituent pages and retry if szc
4655 4655 * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4656 4656 * lock on one of constituent pages it can't be running after all constituent
4657 4657 * pages are locked. hat_page_demote() with a lock on a constituent page
4658 4658 * outside of this large page (i.e. pp belonged to a larger large page) is
4659 4659 * already done with all constituent pages of pp since the root's p_szc is
4660 4660 * changed last. Therefore no need to synchronize with hat_page_demote() that
4661 4661 * locked a constituent page outside of pp's current large page.
4662 4662 */
4663 4663 #ifdef DEBUG
4664 4664 uint32_t gpg_trylock_mtbf = 0;
4665 4665 #endif
4666 4666
4667 4667 int
4668 4668 group_page_trylock(page_t *pp, se_t se)
4669 4669 {
4670 4670 page_t *tpp;
4671 4671 pgcnt_t npgs, i, j;
4672 4672 uint_t pszc = pp->p_szc;
4673 4673
4674 4674 #ifdef DEBUG
4675 4675 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4676 4676 return (0);
4677 4677 }
4678 4678 #endif
4679 4679
4680 4680 if (pp != PP_GROUPLEADER(pp, pszc)) {
4681 4681 return (0);
4682 4682 }
4683 4683
4684 4684 retry:
4685 4685 ASSERT(PAGE_LOCKED_SE(pp, se));
4686 4686 ASSERT(!PP_ISFREE(pp));
4687 4687 if (pszc == 0) {
4688 4688 return (1);
4689 4689 }
4690 4690 npgs = page_get_pagecnt(pszc);
4691 4691 tpp = pp + 1;
4692 4692 for (i = 1; i < npgs; i++, tpp++) {
4693 4693 if (!page_trylock(tpp, se)) {
4694 4694 tpp = pp + 1;
4695 4695 for (j = 1; j < i; j++, tpp++) {
4696 4696 page_unlock(tpp);
4697 4697 }
4698 4698 return (0);
4699 4699 }
4700 4700 }
4701 4701 if (pp->p_szc != pszc) {
4702 4702 ASSERT(pp->p_szc < pszc);
4703 4703 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4704 4704 !IS_SWAPFSVP(pp->p_vnode));
4705 4705 tpp = pp + 1;
4706 4706 for (i = 1; i < npgs; i++, tpp++) {
4707 4707 page_unlock(tpp);
4708 4708 }
4709 4709 pszc = pp->p_szc;
4710 4710 goto retry;
4711 4711 }
4712 4712 return (1);
4713 4713 }
4714 4714
4715 4715 void
4716 4716 group_page_unlock(page_t *pp)
4717 4717 {
4718 4718 page_t *tpp;
4719 4719 pgcnt_t npgs, i;
4720 4720
4721 4721 ASSERT(PAGE_LOCKED(pp));
4722 4722 ASSERT(!PP_ISFREE(pp));
4723 4723 ASSERT(pp == PP_PAGEROOT(pp));
4724 4724 npgs = page_get_pagecnt(pp->p_szc);
4725 4725 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4726 4726 page_unlock(tpp);
4727 4727 }
4728 4728 }
4729 4729
4730 4730 /*
4731 4731 * returns
4732 4732 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages
4733 4733 * ERANGE : this is not a base page
4734 4734 * EBUSY : failure to get locks on the page/pages
4735 4735 * ENOMEM : failure to obtain replacement pages
4736 4736 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel
4737 4737 * EIO : An error occurred while trying to copy the page data
4738 4738 *
4739 4739 * Return with all constituent members of target and replacement
4740 4740 * SE_EXCL locked. It is the callers responsibility to drop the
4741 4741  * SE_EXCL locked. It is the caller's responsibility to drop the
4742 4742 */
4743 4743 int
4744 4744 do_page_relocate(
4745 4745 page_t **target,
4746 4746 page_t **replacement,
4747 4747 int grouplock,
4748 4748 spgcnt_t *nrelocp,
4749 4749 lgrp_t *lgrp)
4750 4750 {
4751 4751 page_t *first_repl;
4752 4752 page_t *repl;
4753 4753 page_t *targ;
4754 4754 page_t *pl = NULL;
4755 4755 uint_t ppattr;
4756 4756 pfn_t pfn, repl_pfn;
4757 4757 uint_t szc;
4758 4758 spgcnt_t npgs, i;
4759 4759 int repl_contig = 0;
4760 4760 uint_t flags = 0;
4761 4761 spgcnt_t dofree = 0;
4762 4762
4763 4763 *nrelocp = 0;
4764 4764
4765 4765 #if defined(__sparc)
4766 4766 /*
4767 4767 * We need to wait till OBP has completed
4768 4768 * its boot-time handoff of its resources to the kernel
4769 4769 * before we allow page relocation
4770 4770 */
4771 4771 if (page_relocate_ready == 0) {
4772 4772 return (EAGAIN);
4773 4773 }
4774 4774 #endif
4775 4775
4776 4776 /*
4777 4777 * If this is not a base page,
4778 4778 	 * just return with 0 pages relocated.
4779 4779 */
4780 4780 targ = *target;
4781 4781 ASSERT(PAGE_EXCL(targ));
4782 4782 ASSERT(!PP_ISFREE(targ));
4783 4783 szc = targ->p_szc;
4784 4784 ASSERT(szc < mmu_page_sizes);
4785 4785 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4786 4786 pfn = targ->p_pagenum;
4787 4787 if (pfn != PFN_BASE(pfn, szc)) {
4788 4788 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4789 4789 return (ERANGE);
4790 4790 }
4791 4791
4792 4792 if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4793 4793 repl_pfn = repl->p_pagenum;
4794 4794 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4795 4795 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4796 4796 return (ERANGE);
4797 4797 }
4798 4798 repl_contig = 1;
4799 4799 }
4800 4800
4801 4801 /*
4802 4802 * We must lock all members of this large page or we cannot
4803 4803 * relocate any part of it.
4804 4804 */
4805 4805 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4806 4806 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4807 4807 return (EBUSY);
4808 4808 }
4809 4809
4810 4810 /*
4811 4811 	 * Reread szc; it could have been decreased before
4812 4812 * group_page_trylock() was done.
4813 4813 */
4814 4814 szc = targ->p_szc;
4815 4815 ASSERT(szc < mmu_page_sizes);
4816 4816 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4817 4817 ASSERT(pfn == PFN_BASE(pfn, szc));
4818 4818
4819 4819 npgs = page_get_pagecnt(targ->p_szc);
4820 4820
4821 4821 if (repl == NULL) {
4822 4822 dofree = npgs; /* Size of target page in MMU pages */
4823 4823 if (!page_create_wait(dofree, 0)) {
4824 4824 if (grouplock != 0) {
4825 4825 group_page_unlock(targ);
4826 4826 }
4827 4827 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4828 4828 return (ENOMEM);
4829 4829 }
4830 4830
4831 4831 /*
4832 4832 * seg kmem pages require that the target and replacement
4833 4833 * page be the same pagesize.
4834 4834 */
4835 4835 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4836 4836 repl = page_get_replacement_page(targ, lgrp, flags);
4837 4837 if (repl == NULL) {
4838 4838 if (grouplock != 0) {
4839 4839 group_page_unlock(targ);
4840 4840 }
4841 4841 page_create_putback(dofree);
4842 4842 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4843 4843 return (ENOMEM);
4844 4844 }
4845 4845 }
4846 4846 #ifdef DEBUG
4847 4847 else {
4848 4848 ASSERT(PAGE_LOCKED(repl));
4849 4849 }
4850 4850 #endif /* DEBUG */
4851 4851
4852 4852 #if defined(__sparc)
4853 4853 /*
4854 4854 	 * Let hat_page_relocate() complete the relocation if it's a kernel page
4855 4855 */
4856 4856 if (VN_ISKAS(targ->p_vnode)) {
4857 4857 *replacement = repl;
4858 4858 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4859 4859 if (grouplock != 0) {
4860 4860 group_page_unlock(targ);
4861 4861 }
4862 4862 if (dofree) {
4863 4863 *replacement = NULL;
4864 4864 page_free_replacement_page(repl);
4865 4865 page_create_putback(dofree);
4866 4866 }
4867 4867 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4868 4868 return (EAGAIN);
4869 4869 }
4870 4870 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4871 4871 return (0);
4872 4872 }
4873 4873 #else
4874 4874 #if defined(lint)
4875 4875 dofree = dofree;
4876 4876 #endif
4877 4877 #endif
4878 4878
4879 4879 first_repl = repl;
4880 4880
4881 4881 for (i = 0; i < npgs; i++) {
4882 4882 ASSERT(PAGE_EXCL(targ));
4883 4883 ASSERT(targ->p_slckcnt == 0);
4884 4884 ASSERT(repl->p_slckcnt == 0);
4885 4885
4886 4886 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4887 4887
4888 4888 ASSERT(hat_page_getshare(targ) == 0);
4889 4889 ASSERT(!PP_ISFREE(targ));
4890 4890 ASSERT(targ->p_pagenum == (pfn + i));
4891 4891 ASSERT(repl_contig == 0 ||
4892 4892 repl->p_pagenum == (repl_pfn + i));
4893 4893
4894 4894 /*
4895 4895 * Copy the page contents and attributes then
4896 4896 * relocate the page in the page hash.
4897 4897 */
4898 4898 if (ppcopy(targ, repl) == 0) {
4899 4899 targ = *target;
4900 4900 repl = first_repl;
4901 4901 VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4902 4902 if (grouplock != 0) {
4903 4903 group_page_unlock(targ);
4904 4904 }
4905 4905 if (dofree) {
4906 4906 *replacement = NULL;
4907 4907 page_free_replacement_page(repl);
4908 4908 page_create_putback(dofree);
4909 4909 }
4910 4910 return (EIO);
4911 4911 }
4912 4912
4913 4913 targ++;
4914 4914 if (repl_contig != 0) {
4915 4915 repl++;
4916 4916 } else {
4917 4917 repl = repl->p_next;
4918 4918 }
4919 4919 }
4920 4920
4921 4921 repl = first_repl;
4922 4922 targ = *target;
4923 4923
4924 4924 for (i = 0; i < npgs; i++) {
4925 4925 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4926 4926 page_clr_all_props(repl);
4927 4927 page_set_props(repl, ppattr);
4928 4928 page_relocate_hash(repl, targ);
4929 4929
4930 4930 ASSERT(hat_page_getshare(targ) == 0);
4931 4931 ASSERT(hat_page_getshare(repl) == 0);
4932 4932 /*
4933 4933 * Now clear the props on targ, after the
4934 4934 * page_relocate_hash(), they no longer
4935 4935 * have any meaning.
4936 4936 */
4937 4937 page_clr_all_props(targ);
4938 4938 ASSERT(targ->p_next == targ);
4939 4939 ASSERT(targ->p_prev == targ);
4940 4940 page_list_concat(&pl, &targ);
4941 4941
4942 4942 targ++;
4943 4943 if (repl_contig != 0) {
4944 4944 repl++;
4945 4945 } else {
4946 4946 repl = repl->p_next;
4947 4947 }
4948 4948 }
4949 4949 /* assert that we have come full circle with repl */
4950 4950 ASSERT(repl_contig == 1 || first_repl == repl);
4951 4951
4952 4952 *target = pl;
4953 4953 if (*replacement == NULL) {
4954 4954 ASSERT(first_repl == repl);
4955 4955 *replacement = repl;
4956 4956 }
4957 4957 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4958 4958 *nrelocp = npgs;
4959 4959 return (0);
4960 4960 }
4961 4961 /*
4962 4962 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4963 4963 */
4964 4964 int
4965 4965 page_relocate(
4966 4966 page_t **target,
4967 4967 page_t **replacement,
4968 4968 int grouplock,
4969 4969 int freetarget,
4970 4970 spgcnt_t *nrelocp,
4971 4971 lgrp_t *lgrp)
4972 4972 {
4973 4973 spgcnt_t ret;
4974 4974
4975 4975 /* do_page_relocate returns 0 on success or errno value */
4976 4976 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4977 4977
4978 4978 if (ret != 0 || freetarget == 0) {
4979 4979 return (ret);
4980 4980 }
4981 4981 if (*nrelocp == 1) {
4982 4982 ASSERT(*target != NULL);
4983 4983 page_free(*target, 1);
4984 4984 } else {
4985 4985 page_t *tpp = *target;
4986 4986 uint_t szc = tpp->p_szc;
4987 4987 pgcnt_t npgs = page_get_pagecnt(szc);
4988 4988 ASSERT(npgs > 1);
4989 4989 ASSERT(szc != 0);
4990 4990 do {
4991 4991 ASSERT(PAGE_EXCL(tpp));
4992 4992 ASSERT(!hat_page_is_mapped(tpp));
4993 4993 ASSERT(tpp->p_szc == szc);
4994 4994 PP_SETFREE(tpp);
4995 4995 PP_SETAGED(tpp);
4996 4996 npgs--;
4997 4997 } while ((tpp = tpp->p_next) != *target);
4998 4998 ASSERT(npgs == 0);
4999 4999 page_list_add_pages(*target, 0);
5000 5000 npgs = page_get_pagecnt(szc);
5001 5001 page_create_putback(npgs);
5002 5002 }
5003 5003 return (ret);
5004 5004 }
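
/*
 * A minimal sketch (hypothetical caller; assumes targ is a PAGESIZE page held
 * SE_EXCL) of how the return codes documented above do_page_relocate() might
 * be handled. It is not part of this patch.
 */
static int
relocate_one(page_t *targ, lgrp_t *lgrp)
{
	page_t *repl = NULL;	/* let do_page_relocate pick a replacement */
	spgcnt_t nreloc;
	int err;

	err = page_relocate(&targ, &repl, 0, 1, &nreloc, lgrp);
	if (err == 0) {
		/*
		 * The target was freed (freetarget != 0); the replacement
		 * comes back SE_EXCL locked and the caller must drop it.
		 */
		ASSERT(nreloc == 1);
		page_unlock(repl);
	}
	/* EBUSY, ENOMEM, EAGAIN, ERANGE and EIO are left to the caller. */
	return (err);
}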
5005 5005
5006 5006 /*
5007 5007 * it is up to the caller to deal with pcf accounting.
5008 5008 */
5009 5009 void
5010 5010 page_free_replacement_page(page_t *pplist)
5011 5011 {
5012 5012 page_t *pp;
5013 5013
5014 5014 while (pplist != NULL) {
5015 5015 /*
5016 5016 		 * pplist is a linked list of replacement pages.
5017 5017 */
5018 5018 pp = pplist;
5019 5019 if (pp->p_szc == 0) {
5020 5020 page_sub(&pplist, pp);
5021 5021 page_clr_all_props(pp);
5022 5022 PP_SETFREE(pp);
5023 5023 PP_SETAGED(pp);
5024 5024 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5025 5025 page_unlock(pp);
5026 5026 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5027 5027 } else {
5028 5028 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5029 5029 page_t *tpp;
5030 5030 page_list_break(&pp, &pplist, curnpgs);
5031 5031 tpp = pp;
5032 5032 do {
5033 5033 ASSERT(PAGE_EXCL(tpp));
5034 5034 ASSERT(!hat_page_is_mapped(tpp));
5035 5035 page_clr_all_props(tpp);
5036 5036 PP_SETFREE(tpp);
5037 5037 PP_SETAGED(tpp);
5038 5038 } while ((tpp = tpp->p_next) != pp);
5039 5039 page_list_add_pages(pp, 0);
5040 5040 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5041 5041 }
5042 5042 }
5043 5043 }
5044 5044
5045 5045 /*
5046 5046 * Relocate target to non-relocatable replacement page.
5047 5047 */
5048 5048 int
5049 5049 page_relocate_cage(page_t **target, page_t **replacement)
5050 5050 {
5051 5051 page_t *tpp, *rpp;
5052 5052 spgcnt_t pgcnt, npgs;
5053 5053 int result;
5054 5054
5055 5055 tpp = *target;
5056 5056
5057 5057 ASSERT(PAGE_EXCL(tpp));
5058 5058 ASSERT(tpp->p_szc == 0);
5059 5059
5060 5060 pgcnt = btop(page_get_pagesize(tpp->p_szc));
5061 5061
5062 5062 do {
5063 5063 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5064 5064 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5065 5065 if (rpp == NULL) {
5066 5066 page_create_putback(pgcnt);
5067 5067 kcage_cageout_wakeup();
5068 5068 }
5069 5069 } while (rpp == NULL);
5070 5070
5071 5071 ASSERT(PP_ISNORELOC(rpp));
5072 5072
5073 5073 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5074 5074
5075 5075 if (result == 0) {
5076 5076 *replacement = rpp;
5077 5077 if (pgcnt != npgs)
5078 5078 panic("page_relocate_cage: partial relocation");
5079 5079 }
5080 5080
5081 5081 return (result);
5082 5082 }
5083 5083
5084 5084 /*
5085 5085 * Release the page lock on a page, place on cachelist
5086 5086 * tail if no longer mapped. Caller can let us know if
5087 5087 * the page is known to be clean.
5088 5088 */
5089 5089 int
5090 5090 page_release(page_t *pp, int checkmod)
5091 5091 {
5092 5092 int status;
5093 5093
5094 5094 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5095 5095 (pp->p_vnode != NULL));
5096 5096
5097 5097 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5098 5098 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5099 5099 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5100 5100 !hat_page_is_mapped(pp)) {
5101 5101
5102 5102 /*
5103 5103 * If page is modified, unlock it
5104 5104 *
5105 5105 		 * The (p_nrm & P_MOD) bit is up to date because:
5106 5106 * (1) We found that this page doesn't have any mappings
5107 5107 * _after_ holding SE_EXCL and
5108 5108 * (2) We didn't drop SE_EXCL lock after the check in (1)
5109 5109 */
5110 5110 if (checkmod && hat_ismod(pp)) {
5111 5111 page_unlock(pp);
5112 5112 status = PGREL_MOD;
5113 5113 } else {
5114 5114 /*LINTED: constant in conditional context*/
5115 5115 VN_DISPOSE(pp, B_FREE, 0, kcred);
5116 5116 status = PGREL_CLEAN;
5117 5117 }
5118 5118 } else {
5119 5119 page_unlock(pp);
5120 5120 status = PGREL_NOTREL;
5121 5121 }
5122 5122 return (status);
5123 5123 }
5124 5124
5125 5125 /*
5126 5126 * Given a constituent page, try to demote the large page on the freelist.
5127 5127 *
5128 5128 * Returns nonzero if the page could be demoted successfully. Returns with
5129 5129 * the constituent page still locked.
5130 5130 */
5131 5131 int
5132 5132 page_try_demote_free_pages(page_t *pp)
5133 5133 {
5134 5134 page_t *rootpp = pp;
5135 5135 pfn_t pfn = page_pptonum(pp);
5136 5136 spgcnt_t npgs;
5137 5137 uint_t szc = pp->p_szc;
5138 5138
5139 5139 ASSERT(PP_ISFREE(pp));
5140 5140 ASSERT(PAGE_EXCL(pp));
5141 5141
5142 5142 /*
5143 5143 * Adjust rootpp and lock it, if `pp' is not the base
5144 5144 * constituent page.
5145 5145 */
5146 5146 npgs = page_get_pagecnt(pp->p_szc);
5147 5147 if (npgs == 1) {
5148 5148 return (0);
5149 5149 }
5150 5150
5151 5151 if (!IS_P2ALIGNED(pfn, npgs)) {
5152 5152 pfn = P2ALIGN(pfn, npgs);
5153 5153 rootpp = page_numtopp_nolock(pfn);
5154 5154 }
5155 5155
5156 5156 if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5157 5157 return (0);
5158 5158 }
5159 5159
5160 5160 if (rootpp->p_szc != szc) {
5161 5161 if (pp != rootpp)
5162 5162 page_unlock(rootpp);
5163 5163 return (0);
5164 5164 }
5165 5165
5166 5166 page_demote_free_pages(rootpp);
5167 5167
5168 5168 if (pp != rootpp)
5169 5169 page_unlock(rootpp);
5170 5170
5171 5171 ASSERT(PP_ISFREE(pp));
5172 5172 ASSERT(PAGE_EXCL(pp));
5173 5173 return (1);
5174 5174 }
5175 5175
5176 5176 /*
5177 5177 * Given a constituent page, try to demote the large page.
5178 5178 *
5179 5179 * Returns nonzero if the page could be demoted successfully. Returns with
5180 5180 * the constituent page still locked.
5181 5181 */
5182 5182 int
5183 5183 page_try_demote_pages(page_t *pp)
5184 5184 {
5185 5185 page_t *tpp, *rootpp = pp;
5186 5186 pfn_t pfn = page_pptonum(pp);
5187 5187 spgcnt_t i, npgs;
5188 5188 uint_t szc = pp->p_szc;
5189 5189 vnode_t *vp = pp->p_vnode;
5190 5190
5191 5191 ASSERT(PAGE_EXCL(pp));
5192 5192
5193 5193 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5194 5194
5195 5195 if (pp->p_szc == 0) {
5196 5196 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5197 5197 return (1);
5198 5198 }
5199 5199
5200 5200 if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5201 5201 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5202 5202 page_demote_vp_pages(pp);
5203 5203 ASSERT(pp->p_szc == 0);
5204 5204 return (1);
5205 5205 }
5206 5206
5207 5207 /*
5208 5208 * Adjust rootpp if passed in is not the base
5209 5209 * constituent page.
5210 5210 */
5211 5211 npgs = page_get_pagecnt(pp->p_szc);
5212 5212 ASSERT(npgs > 1);
5213 5213 if (!IS_P2ALIGNED(pfn, npgs)) {
5214 5214 pfn = P2ALIGN(pfn, npgs);
5215 5215 rootpp = page_numtopp_nolock(pfn);
5216 5216 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5217 5217 ASSERT(rootpp->p_vnode != NULL);
5218 5218 ASSERT(rootpp->p_szc == szc);
5219 5219 }
5220 5220
5221 5221 /*
5222 5222 * We can't demote kernel pages since we can't hat_unload()
5223 5223 * the mappings.
5224 5224 */
5225 5225 if (VN_ISKAS(rootpp->p_vnode))
5226 5226 return (0);
5227 5227
5228 5228 /*
5229 5229 * Attempt to lock all constituent pages except the page passed
5230 5230 * in since it's already locked.
5231 5231 */
5232 5232 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5233 5233 ASSERT(!PP_ISFREE(tpp));
5234 5234 ASSERT(tpp->p_vnode != NULL);
5235 5235
5236 5236 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5237 5237 break;
5238 5238 ASSERT(tpp->p_szc == rootpp->p_szc);
5239 5239 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5240 5240 }
5241 5241
5242 5242 /*
5243 5243 * If we failed to lock them all then unlock what we have
5244 5244 * locked so far and bail.
5245 5245 */
5246 5246 if (i < npgs) {
5247 5247 tpp = rootpp;
5248 5248 while (i-- > 0) {
5249 5249 if (tpp != pp)
5250 5250 page_unlock(tpp);
5251 5251 tpp++;
5252 5252 }
5253 5253 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5254 5254 return (0);
5255 5255 }
5256 5256
5257 5257 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5258 5258 ASSERT(PAGE_EXCL(tpp));
5259 5259 ASSERT(tpp->p_slckcnt == 0);
5260 5260 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5261 5261 tpp->p_szc = 0;
5262 5262 }
5263 5263
5264 5264 /*
5265 5265 * Unlock all pages except the page passed in.
5266 5266 */
5267 5267 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5268 5268 ASSERT(!hat_page_is_mapped(tpp));
5269 5269 if (tpp != pp)
5270 5270 page_unlock(tpp);
5271 5271 }
5272 5272
5273 5273 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5274 5274 return (1);
5275 5275 }
5276 5276
5277 5277 /*
5278 5278 * Called by page_free() and page_destroy() to demote the page size code
5279 5279 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
5280 5280 * p_szc on free list, neither can we just clear p_szc of a single page_t
5281 5281 * within a large page since it will break other code that relies on p_szc
5282 5282 * being the same for all page_t's of a large page). Anonymous pages should
5283 5283 * never end up here because anon_map_getpages() cannot deal with p_szc
5284 5284 * changes after a single constituent page is locked. While anonymous or
5285 5285  * kernel large pages are demoted or freed the entire large page at a time,
5286 5286  * with all constituent pages locked EXCL, for file system pages we
5287 5287  * have to be able to demote a large page (i.e. decrease all constituent pages'
5288 5288  * p_szc) with just an EXCL lock on one of the constituent pages. The reason
5289 5289 * we can easily deal with anonymous page demotion the entire large page at a
5290 5290  * time is that those operations originate at the address space level and concern
5291 5291 * the entire large page region with actual demotion only done when pages are
5292 5292 * not shared with any other processes (therefore we can always get EXCL lock
5293 5293 * on all anonymous constituent pages after clearing segment page
5294 5294 * cache). However file system pages can be truncated or invalidated at a
5295 5295 * PAGESIZE level from the file system side and end up in page_free() or
5296 5296 * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
5297 5297 * and therefore pageout should be able to demote a large page by EXCL locking
5298 5298 * any constituent page that is not under SOFTLOCK). In those cases we cannot
5299 5299 * rely on being able to lock EXCL all constituent pages.
5300 5300 *
5301 5301 * To prevent szc changes on file system pages one has to lock all constituent
5302 5302 * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5303 5303 * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5304 5304 * prevent szc changes is hat layer that uses its own page level mlist
5305 5305 * locks. hat assumes that szc doesn't change after mlist lock for a page is
5306 5306 * taken. Therefore we need to change szc under hat level locks if we only
5307 5307 * have an EXCL lock on a single constituent page and hat still references any
5308 5308 * of constituent pages. (Note we can't "ignore" hat layer by simply
5309 5309 * hat_pageunload() all constituent pages without having EXCL locks on all of
5310 5310 * constituent pages). We use hat_page_demote() call to safely demote szc of
5311 5311 * all constituent pages under hat locks when we only have an EXCL lock on one
5312 5312 * of constituent pages.
5313 5313 *
5314 5314 * This routine calls page_szc_lock() before calling hat_page_demote() to
5315 5315 * allow segvn in one special case not to lock all constituent pages SHARED
5316 5316 * before calling hat_memload_array() that relies on p_szc not changing even
5317 5317 * before hat level mlist lock is taken. In that case segvn uses
5318 5318 * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5319 5319 *
5320 5320 * Anonymous or kernel page demotion still has to lock all pages exclusively
5321 5321 * and do hat_pageunload() on all constituent pages before demoting the page
5322 5322 * therefore there's no need for anonymous or kernel page demotion to use
5323 5323 * hat_page_demote() mechanism.
5324 5324 *
5325 5325 * hat_page_demote() removes all large mappings that map pp and then decreases
5326 5326  * p_szc starting from the last constituent page of the large page. Working
5327 5327  * from the tail of a large page in decreasing pfn order allows one looking at
5328 5328 * the root page to know that hat_page_demote() is done for root's szc area.
5329 5329 * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5330 5330 * pages within szc 1 area to prevent szc changes because hat_page_demote()
5331 5331 * that started on this page when it had szc > 1 is done for this szc 1 area.
5332 5332 *
5333 5333 * We are guaranteed that all constituent pages of pp's large page belong to
5334 5334 * the same vnode with the consecutive offsets increasing in the direction of
5335 5335 * the pfn i.e. the identity of constituent pages can't change until their
5336 5336 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5337 5337 * large mappings to pp even though we don't lock any constituent page except
5338 5338 * pp (i.e. we won't unload e.g. kernel locked page).
5339 5339 */
5340 5340 static void
5341 5341 page_demote_vp_pages(page_t *pp)
5342 5342 {
5343 5343 kmutex_t *mtx;
5344 5344
5345 5345 ASSERT(PAGE_EXCL(pp));
5346 5346 ASSERT(!PP_ISFREE(pp));
5347 5347 ASSERT(pp->p_vnode != NULL);
5348 5348 ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5349 5349 ASSERT(!PP_ISKAS(pp));
5350 5350
5351 5351 VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5352 5352
5353 5353 mtx = page_szc_lock(pp);
5354 5354 if (mtx != NULL) {
5355 5355 hat_page_demote(pp);
5356 5356 mutex_exit(mtx);
5357 5357 }
5358 5358 ASSERT(pp->p_szc == 0);
5359 5359 }
5360 5360
5361 5361 /*
5362 5362 * Mark any existing pages for migration in the given range
5363 5363 */
5364 5364 void
5365 5365 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5366 5366 struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5367 5367 u_offset_t vnoff, int rflag)
5368 5368 {
5369 5369 struct anon *ap;
5370 5370 vnode_t *curvp;
5371 5371 lgrp_t *from;
5372 5372 pgcnt_t nlocked;
5373 5373 u_offset_t off;
5374 5374 pfn_t pfn;
5375 5375 size_t pgsz;
5376 5376 size_t segpgsz;
5377 5377 pgcnt_t pages;
5378 5378 uint_t pszc;
5379 5379 page_t *pp0, *pp;
5380 5380 caddr_t va;
5381 5381 ulong_t an_idx;
5382 5382 anon_sync_obj_t cookie;
5383 5383
5384 - ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5384 + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5385 5385
5386 5386 /*
5387 5387 * Don't do anything if don't need to do lgroup optimizations
5388 5388 * on this system
5389 5389 */
5390 5390 if (!lgrp_optimizations())
5391 5391 return;
5392 5392
5393 5393 /*
5394 5394 * Align address and length to (potentially large) page boundary
5395 5395 */
5396 5396 segpgsz = page_get_pagesize(seg->s_szc);
5397 5397 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5398 5398 if (rflag)
5399 5399 len = P2ROUNDUP(len, segpgsz);
5400 5400
5401 5401 /*
5402 5402 * Do one (large) page at a time
5403 5403 */
5404 5404 va = addr;
5405 5405 while (va < addr + len) {
5406 5406 /*
5407 5407 * Lookup (root) page for vnode and offset corresponding to
5408 5408 * this virtual address
5409 5409 * Try anonmap first since there may be copy-on-write
5410 5410 * pages, but initialize vnode pointer and offset using
5411 5411 * vnode arguments just in case there isn't an amp.
5412 5412 */
5413 5413 curvp = vp;
5414 5414 off = vnoff + va - seg->s_base;
5415 5415 if (amp) {
5416 5416 ANON_LOCK_ENTER(&->a_rwlock, RW_READER);
5417 5417 an_idx = anon_index + seg_page(seg, va);
5418 5418 anon_array_enter(amp, an_idx, &cookie);
5419 5419 ap = anon_get_ptr(amp->ahp, an_idx);
5420 5420 if (ap)
5421 5421 swap_xlate(ap, &curvp, &off);
5422 5422 anon_array_exit(&cookie);
5423 5423 ANON_LOCK_EXIT(&->a_rwlock);
5424 5424 }
5425 5425
5426 5426 pp = NULL;
5427 5427 if (curvp)
5428 5428 pp = page_lookup(curvp, off, SE_SHARED);
5429 5429
5430 5430 /*
5431 5431 * If there isn't a page at this virtual address,
5432 5432 * skip to next page
5433 5433 */
5434 5434 if (pp == NULL) {
5435 5435 va += PAGESIZE;
5436 5436 continue;
5437 5437 }
5438 5438
5439 5439 /*
5440 5440 * Figure out which lgroup this page is in for kstats
5441 5441 */
5442 5442 pfn = page_pptonum(pp);
5443 5443 from = lgrp_pfn_to_lgrp(pfn);
5444 5444
5445 5445 /*
5446 5446 * Get page size, and round up and skip to next page boundary
5447 5447 * if unaligned address
5448 5448 */
5449 5449 pszc = pp->p_szc;
5450 5450 pgsz = page_get_pagesize(pszc);
5451 5451 pages = btop(pgsz);
5452 5452 if (!IS_P2ALIGNED(va, pgsz) ||
5453 5453 !IS_P2ALIGNED(pfn, pages) ||
5454 5454 pgsz > segpgsz) {
5455 5455 pgsz = MIN(pgsz, segpgsz);
5456 5456 page_unlock(pp);
5457 5457 pages = btop(P2END((uintptr_t)va, pgsz) -
5458 5458 (uintptr_t)va);
5459 5459 va = (caddr_t)P2END((uintptr_t)va, pgsz);
5460 5460 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5461 5461 continue;
5462 5462 }
5463 5463
5464 5464 /*
5465 5465 * Upgrade to exclusive lock on page
5466 5466 */
5467 5467 if (!page_tryupgrade(pp)) {
5468 5468 page_unlock(pp);
5469 5469 va += pgsz;
5470 5470 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5471 5471 btop(pgsz));
5472 5472 continue;
5473 5473 }
5474 5474
5475 5475 pp0 = pp++;
5476 5476 nlocked = 1;
5477 5477
5478 5478 /*
5479 5479 * Lock constituent pages if this is large page
5480 5480 */
5481 5481 if (pages > 1) {
5482 5482 /*
5483 5483 * Lock all constituents except root page, since it
5484 5484 * should be locked already.
5485 5485 */
5486 5486 for (; nlocked < pages; nlocked++) {
5487 5487 if (!page_trylock(pp, SE_EXCL)) {
5488 5488 break;
5489 5489 }
5490 5490 if (PP_ISFREE(pp) ||
5491 5491 pp->p_szc != pszc) {
5492 5492 /*
5493 5493 * hat_page_demote() raced in with us.
5494 5494 */
5495 5495 ASSERT(!IS_SWAPFSVP(curvp));
5496 5496 page_unlock(pp);
5497 5497 break;
5498 5498 }
5499 5499 pp++;
5500 5500 }
5501 5501 }
5502 5502
5503 5503 /*
5504 5504 * If all constituent pages couldn't be locked,
5505 5505 * unlock pages locked so far and skip to next page.
5506 5506 */
5507 5507 if (nlocked < pages) {
5508 5508 while (pp0 < pp) {
5509 5509 page_unlock(pp0++);
5510 5510 }
5511 5511 va += pgsz;
5512 5512 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5513 5513 btop(pgsz));
5514 5514 continue;
5515 5515 }
5516 5516
5517 5517 /*
5518 5518 * hat_page_demote() can no longer happen
5519 5519 * since last cons page had the right p_szc after
5520 5520 * all cons pages were locked. all cons pages
5521 5521 * should now have the same p_szc.
5522 5522 */
5523 5523
5524 5524 /*
5525 5525 * All constituent pages locked successfully, so mark
5526 5526 * large page for migration and unload the mappings of
5527 5527 * constituent pages, so a fault will occur on any part of the
5528 5528 * large page
5529 5529 */
5530 5530 PP_SETMIGRATE(pp0);
5531 5531 while (pp0 < pp) {
5532 5532 (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5533 5533 ASSERT(hat_page_getshare(pp0) == 0);
5534 5534 page_unlock(pp0++);
5535 5535 }
5536 5536 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5537 5537
5538 5538 va += pgsz;
5539 5539 }
5540 5540 }
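The changed ASSERTs in page_mark_migrate() above and page_migrate() below are the substance of this patch: AS_LOCK_HELD() now takes only the address space pointer, with the a_lock rwlock implied. For orientation, a minimal sketch of how a caller would presumably use the simplified macro family; the ENTER/EXIT forms do not appear in this file's hunks and are assumed from the rest of the patch:

        struct as *as = seg->s_as;

        AS_LOCK_ENTER(as, RW_READER);   /* was AS_LOCK_ENTER(as, &as->a_lock, RW_READER) */
        ASSERT(AS_LOCK_HELD(as));       /* was AS_LOCK_HELD(as, &as->a_lock) */
        /* ... operate on the address space ... */
        AS_LOCK_EXIT(as);               /* was AS_LOCK_EXIT(as, &as->a_lock) */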
5541 5541
5542 5542 /*
5543 5543 * Migrate any pages that have been marked for migration in the given range
5544 5544 */
5545 5545 void
5546 5546 page_migrate(
5547 5547 struct seg *seg,
5548 5548 caddr_t addr,
5549 5549 page_t **ppa,
5550 5550 pgcnt_t npages)
5551 5551 {
5552 5552 lgrp_t *from;
5553 5553 lgrp_t *to;
5554 5554 page_t *newpp;
5555 5555 page_t *pp;
5556 5556 pfn_t pfn;
5557 5557 size_t pgsz;
5558 5558 spgcnt_t page_cnt;
5559 5559 spgcnt_t i;
5560 5560 uint_t pszc;
5561 5561
5562 - ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5562 + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5563 5563
5564 5564 while (npages > 0) {
5565 5565 pp = *ppa;
5566 5566 pszc = pp->p_szc;
5567 5567 pgsz = page_get_pagesize(pszc);
5568 5568 page_cnt = btop(pgsz);
5569 5569
5570 5570 /*
5571 5571 * Check to see whether this page is marked for migration
5572 5572 *
5573 5573 * Assume that root page of large page is marked for
5574 5574 * migration and none of the other constituent pages
5575 5575 * are marked. This really simplifies clearing the
5576 5576 * migrate bit by not having to clear it from each
5577 5577 * constituent page.
5578 5578 *
5579 5579 * note we don't want to relocate an entire large page if
5580 5580 * someone is only using one subpage.
5581 5581 */
5582 5582 if (npages < page_cnt)
5583 5583 break;
5584 5584
5585 5585 /*
5586 5586 * Is it marked for migration?
5587 5587 */
5588 5588 if (!PP_ISMIGRATE(pp))
5589 5589 goto next;
5590 5590
5591 5591 /*
5592 5592 * Determine lgroups that page is being migrated between
5593 5593 */
5594 5594 pfn = page_pptonum(pp);
5595 5595 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5596 5596 break;
5597 5597 }
5598 5598 from = lgrp_pfn_to_lgrp(pfn);
5599 5599 to = lgrp_mem_choose(seg, addr, pgsz);
5600 5600
5601 5601 /*
5602 5602 * Need to get exclusive lock's to migrate
5603 5603 */
5604 5604 for (i = 0; i < page_cnt; i++) {
5605 5605 ASSERT(PAGE_LOCKED(ppa[i]));
5606 5606 if (page_pptonum(ppa[i]) != pfn + i ||
5607 5607 ppa[i]->p_szc != pszc) {
5608 5608 break;
5609 5609 }
5610 5610 if (!page_tryupgrade(ppa[i])) {
5611 5611 lgrp_stat_add(from->lgrp_id,
5612 5612 LGRP_PM_FAIL_LOCK_PGS,
5613 5613 page_cnt);
5614 5614 break;
5615 5615 }
5616 5616
5617 5617 /*
5618 5618 * Check to see whether we are trying to migrate
5619 5619 * page to lgroup where it is allocated already.
5620 5620 * If so, clear the migrate bit and skip to next
5621 5621 * page.
5622 5622 */
5623 5623 if (i == 0 && to == from) {
5624 5624 PP_CLRMIGRATE(ppa[0]);
5625 5625 page_downgrade(ppa[0]);
5626 5626 goto next;
5627 5627 }
5628 5628 }
5629 5629
5630 5630 /*
5631 5631 * If all constituent pages couldn't be locked,
5632 5632 * unlock pages locked so far and skip to next page.
5633 5633 */
5634 5634 if (i != page_cnt) {
5635 5635 while (--i != -1) {
5636 5636 page_downgrade(ppa[i]);
5637 5637 }
5638 5638 goto next;
5639 5639 }
5640 5640
5641 5641 (void) page_create_wait(page_cnt, PG_WAIT);
5642 5642 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5643 5643 if (newpp == NULL) {
5644 5644 page_create_putback(page_cnt);
5645 5645 for (i = 0; i < page_cnt; i++) {
5646 5646 page_downgrade(ppa[i]);
5647 5647 }
5648 5648 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5649 5649 page_cnt);
5650 5650 goto next;
5651 5651 }
5652 5652 ASSERT(newpp->p_szc == pszc);
5653 5653 /*
5654 5654 * Clear migrate bit and relocate page
5655 5655 */
5656 5656 PP_CLRMIGRATE(pp);
5657 5657 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5658 5658 panic("page_migrate: page_relocate failed");
5659 5659 }
5660 5660 ASSERT(page_cnt * PAGESIZE == pgsz);
5661 5661
5662 5662 /*
5663 5663 * Keep stats for number of pages migrated from and to
5664 5664 * each lgroup
5665 5665 */
5666 5666 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5667 5667 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5668 5668 /*
5669 5669 * update the page_t array we were passed in and
5670 5670 * unlink constituent pages of a large page.
5671 5671 */
5672 5672 for (i = 0; i < page_cnt; ++i, ++pp) {
5673 5673 ASSERT(PAGE_EXCL(newpp));
5674 5674 ASSERT(newpp->p_szc == pszc);
5675 5675 ppa[i] = newpp;
5676 5676 pp = newpp;
5677 5677 page_sub(&newpp, pp);
5678 5678 page_downgrade(pp);
5679 5679 }
5680 5680 ASSERT(newpp == NULL);
5681 5681 next:
5682 5682 addr += pgsz;
5683 5683 ppa += page_cnt;
5684 5684 npages -= page_cnt;
5685 5685 }
5686 5686 }
5687 5687
5688 5688 #define MAX_CNT 60 /* max num of iterations */
5689 5689 /*
5690 5690 * Reclaim/reserve availrmem for npages.
5691 5691 * If there is not enough memory start reaping seg, kmem caches.
5692 5692 * Start pageout scanner (via page_needfree()).
5693 5693 * Exit after ~ MAX_CNT s regardless of how much memory has been released.
5694 5694 * Note: There is no guarantee that any availrmem will be freed as
5695 5695 * this memory typically is locked (kernel heap) or reserved for swap.
5696 5696 * Also due to memory fragmentation kmem allocator may not be able
5697 5697 * to free any memory (single user allocated buffer will prevent
5698 5698 * freeing slab or a page).
5699 5699 */
5700 5700 int
5701 5701 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5702 5702 {
5703 5703 int i = 0;
5704 5704 int ret = 0;
5705 5705 pgcnt_t deficit;
5706 5706 pgcnt_t old_availrmem;
5707 5707
5708 5708 mutex_enter(&freemem_lock);
5709 5709 old_availrmem = availrmem - 1;
5710 5710 while ((availrmem < tune.t_minarmem + npages + epages) &&
5711 5711 (old_availrmem < availrmem) && (i++ < MAX_CNT)) {
5712 5712 old_availrmem = availrmem;
5713 5713 deficit = tune.t_minarmem + npages + epages - availrmem;
5714 5714 mutex_exit(&freemem_lock);
5715 5715 page_needfree(deficit);
5716 5716 kmem_reap();
5717 5717 delay(hz);
5718 5718 page_needfree(-(spgcnt_t)deficit);
5719 5719 mutex_enter(&freemem_lock);
5720 5720 }
5721 5721
5722 5722 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5723 5723 availrmem -= npages;
5724 5724 ret = 1;
5725 5725 }
5726 5726
5727 5727 mutex_exit(&freemem_lock);
5728 5728
5729 5729 return (ret);
5730 5730 }
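Since each iteration of the loop above sleeps for delay(hz), the routine gives up after roughly MAX_CNT (60) seconds. A hypothetical caller of page_reclaim_mem(), sketching how a reservation made with adjust != 0 would presumably be returned once it is no longer needed (the release is not shown in this file and is an assumption):

        if (!page_reclaim_mem(npages, epages, 1))
                return (ENOMEM);        /* could not reserve availrmem */

        /* ... use the npages reservation ... */

        mutex_enter(&freemem_lock);
        availrmem += npages;            /* undo the adjustment made above */
        mutex_exit(&freemem_lock);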
5731 5731
5732 5732 /*
5733 5733 * Search the memory segments to locate the desired page. Within a
5734 5734 * segment, pages increase linearly with one page structure per
5735 5735 * physical page frame (size PAGESIZE). The search begins
5736 5736 * with the segment that was accessed last, to take advantage of locality.
5737 5737 * If the hint misses, we start from the beginning of the sorted memseg list
5738 5738 */
5739 5739
5740 5740
5741 5741 /*
5742 5742 * Some data structures for pfn to pp lookup.
5743 5743 */
5744 5744 ulong_t mhash_per_slot;
5745 5745 struct memseg *memseg_hash[N_MEM_SLOTS];
5746 5746
5747 5747 page_t *
5748 5748 page_numtopp_nolock(pfn_t pfnum)
5749 5749 {
5750 5750 struct memseg *seg;
5751 5751 page_t *pp;
5752 5752 vm_cpu_data_t *vc;
5753 5753
5754 5754 /*
5755 5755 * We need to disable kernel preemption while referencing the
5756 5756 * cpu_vm_data field in order to prevent us from being switched to
5757 5757 * another cpu and trying to reference it after it has been freed.
5758 5758 * This will keep us on cpu and prevent it from being removed while
5759 5759 * we are still on it.
5760 5760 *
5761 5761 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5762 5762 	 * which is being reused by DR, which will flush those references
5763 5763 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5764 5764 */
5765 5765 kpreempt_disable();
5766 5766 vc = CPU->cpu_vm_data;
5767 5767 ASSERT(vc != NULL);
5768 5768
5769 5769 MEMSEG_STAT_INCR(nsearch);
5770 5770
5771 5771 /* Try last winner first */
5772 5772 if (((seg = vc->vc_pnum_memseg) != NULL) &&
5773 5773 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5774 5774 MEMSEG_STAT_INCR(nlastwon);
5775 5775 pp = seg->pages + (pfnum - seg->pages_base);
5776 5776 if (pp->p_pagenum == pfnum) {
5777 5777 kpreempt_enable();
5778 5778 return ((page_t *)pp);
5779 5779 }
5780 5780 }
5781 5781
5782 5782 /* Else Try hash */
5783 5783 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5784 5784 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5785 5785 MEMSEG_STAT_INCR(nhashwon);
5786 5786 vc->vc_pnum_memseg = seg;
5787 5787 pp = seg->pages + (pfnum - seg->pages_base);
5788 5788 if (pp->p_pagenum == pfnum) {
5789 5789 kpreempt_enable();
5790 5790 return ((page_t *)pp);
5791 5791 }
5792 5792 }
5793 5793
5794 5794 /* Else Brute force */
5795 5795 for (seg = memsegs; seg != NULL; seg = seg->next) {
5796 5796 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5797 5797 vc->vc_pnum_memseg = seg;
5798 5798 pp = seg->pages + (pfnum - seg->pages_base);
5799 5799 if (pp->p_pagenum == pfnum) {
5800 5800 kpreempt_enable();
5801 5801 return ((page_t *)pp);
5802 5802 }
5803 5803 }
5804 5804 }
5805 5805 vc->vc_pnum_memseg = NULL;
5806 5806 kpreempt_enable();
5807 5807 MEMSEG_STAT_INCR(nnotfound);
5808 5808 return ((page_t *)NULL);
5809 5809
5810 5810 }
5811 5811
5812 5812 struct memseg *
5813 5813 page_numtomemseg_nolock(pfn_t pfnum)
5814 5814 {
5815 5815 struct memseg *seg;
5816 5816 page_t *pp;
5817 5817
5818 5818 /*
5819 5819 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5820 5820 	 * which is being reused by DR, which will flush those references
5821 5821 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5822 5822 */
5823 5823 kpreempt_disable();
5824 5824 /* Try hash */
5825 5825 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5826 5826 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5827 5827 pp = seg->pages + (pfnum - seg->pages_base);
5828 5828 if (pp->p_pagenum == pfnum) {
5829 5829 kpreempt_enable();
5830 5830 return (seg);
5831 5831 }
5832 5832 }
5833 5833
5834 5834 /* Else Brute force */
5835 5835 for (seg = memsegs; seg != NULL; seg = seg->next) {
5836 5836 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5837 5837 pp = seg->pages + (pfnum - seg->pages_base);
5838 5838 if (pp->p_pagenum == pfnum) {
5839 5839 kpreempt_enable();
5840 5840 return (seg);
5841 5841 }
5842 5842 }
5843 5843 }
5844 5844 kpreempt_enable();
5845 5845 return ((struct memseg *)NULL);
5846 5846 }
5847 5847
5848 5848 /*
5849 5849 * Given a page and a count return the page struct that is
5850 5850 * n structs away from the current one in the global page
5851 5851 * list.
5852 5852 *
5853 5853 * This function wraps to the first page upon
5854 5854 * reaching the end of the memseg list.
5855 5855 */
5856 5856 page_t *
5857 5857 page_nextn(page_t *pp, ulong_t n)
5858 5858 {
5859 5859 struct memseg *seg;
5860 5860 page_t *ppn;
5861 5861 vm_cpu_data_t *vc;
5862 5862
5863 5863 /*
5864 5864 * We need to disable kernel preemption while referencing the
5865 5865 * cpu_vm_data field in order to prevent us from being switched to
5866 5866 * another cpu and trying to reference it after it has been freed.
5867 5867 * This will keep us on cpu and prevent it from being removed while
5868 5868 * we are still on it.
5869 5869 *
5870 5870 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5871 5871 	 * which is being reused by DR, which will flush those references
5872 5872 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5873 5873 */
5874 5874 kpreempt_disable();
5875 5875 vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5876 5876
5877 5877 ASSERT(vc != NULL);
5878 5878
5879 5879 if (((seg = vc->vc_pnext_memseg) == NULL) ||
5880 5880 (seg->pages_base == seg->pages_end) ||
5881 5881 !(pp >= seg->pages && pp < seg->epages)) {
5882 5882
5883 5883 for (seg = memsegs; seg; seg = seg->next) {
5884 5884 if (pp >= seg->pages && pp < seg->epages)
5885 5885 break;
5886 5886 }
5887 5887
5888 5888 if (seg == NULL) {
5889 5889 /* Memory delete got in, return something valid. */
5890 5890 /* TODO: fix me. */
5891 5891 seg = memsegs;
5892 5892 pp = seg->pages;
5893 5893 }
5894 5894 }
5895 5895
5896 5896 /* check for wraparound - possible if n is large */
5897 5897 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5898 5898 n -= seg->epages - pp;
5899 5899 seg = seg->next;
5900 5900 if (seg == NULL)
5901 5901 seg = memsegs;
5902 5902 pp = seg->pages;
5903 5903 }
5904 5904 vc->vc_pnext_memseg = seg;
5905 5905 kpreempt_enable();
5906 5906 return (ppn);
5907 5907 }
5908 5908
5909 5909 /*
5910 5910 * Initialize for a loop using page_next_scan_large().
5911 5911 */
5912 5912 page_t *
5913 5913 page_next_scan_init(void **cookie)
5914 5914 {
5915 5915 ASSERT(cookie != NULL);
5916 5916 *cookie = (void *)memsegs;
5917 5917 return ((page_t *)memsegs->pages);
5918 5918 }
5919 5919
5920 5920 /*
5921 5921 * Return the next page in a scan of page_t's, assuming we want
5922 5922 * to skip over sub-pages within larger page sizes.
5923 5923 *
5924 5924 * The cookie is used to keep track of the current memseg.
5925 5925 */
5926 5926 page_t *
5927 5927 page_next_scan_large(
5928 5928 page_t *pp,
5929 5929 ulong_t *n,
5930 5930 void **cookie)
5931 5931 {
5932 5932 struct memseg *seg = (struct memseg *)*cookie;
5933 5933 page_t *new_pp;
5934 5934 ulong_t cnt;
5935 5935 pfn_t pfn;
5936 5936
5937 5937
5938 5938 /*
5939 5939 * get the count of page_t's to skip based on the page size
5940 5940 */
5941 5941 ASSERT(pp != NULL);
5942 5942 if (pp->p_szc == 0) {
5943 5943 cnt = 1;
5944 5944 } else {
5945 5945 pfn = page_pptonum(pp);
5946 5946 cnt = page_get_pagecnt(pp->p_szc);
5947 5947 cnt -= pfn & (cnt - 1);
5948 5948 }
5949 5949 *n += cnt;
5950 5950 new_pp = pp + cnt;
5951 5951
5952 5952 /*
5953 5953 * Catch if we went past the end of the current memory segment. If so,
5954 5954 * just move to the next segment with pages.
5955 5955 */
5956 5956 if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5957 5957 do {
5958 5958 seg = seg->next;
5959 5959 if (seg == NULL)
5960 5960 seg = memsegs;
5961 5961 } while (seg->pages_base == seg->pages_end);
5962 5962 new_pp = seg->pages;
5963 5963 *cookie = (void *)seg;
5964 5964 }
5965 5965
5966 5966 return (new_pp);
5967 5967 }
5968 5968
5969 5969
5970 5970 /*
5971 5971 * Returns next page in list. Note: this function wraps
5972 5972 * to the first page in the list upon reaching the end
5973 5973 * of the list. Callers should be aware of this fact.
5974 5974 */
5975 5975
5976 5976 /* We should change this to be a #define */
5977 5977
5978 5978 page_t *
5979 5979 page_next(page_t *pp)
5980 5980 {
5981 5981 return (page_nextn(pp, 1));
5982 5982 }
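The comment above suggests turning page_next() into a #define; a possible form of that macro (an illustration, not part of this patch) would simply forward to page_nextn():

        #define	page_next(pp)	page_nextn((pp), 1)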
5983 5983
5984 5984 page_t *
5985 5985 page_first()
5986 5986 {
5987 5987 return ((page_t *)memsegs->pages);
5988 5988 }
5989 5989
5990 5990
5991 5991 /*
5992 5992 * This routine is called at boot with the initial memory configuration
5993 5993 * and when memory is added or removed.
5994 5994 */
5995 5995 void
5996 5996 build_pfn_hash()
5997 5997 {
5998 5998 pfn_t cur;
5999 5999 pgcnt_t index;
6000 6000 struct memseg *pseg;
6001 6001 int i;
6002 6002
6003 6003 /*
6004 6004 * Clear memseg_hash array.
6005 6005 * Since memory add/delete is designed to operate concurrently
6006 6006 * with normal operation, the hash rebuild must be able to run
6007 6007 * concurrently with page_numtopp_nolock(). To support this
6008 6008 * functionality, assignments to memseg_hash array members must
6009 6009 * be done atomically.
6010 6010 *
6011 6011 * NOTE: bzero() does not currently guarantee this for kernel
6012 6012 * threads, and cannot be used here.
6013 6013 */
6014 6014 for (i = 0; i < N_MEM_SLOTS; i++)
6015 6015 memseg_hash[i] = NULL;
6016 6016
6017 6017 hat_kpm_mseghash_clear(N_MEM_SLOTS);
6018 6018
6019 6019 /*
6020 6020 * Physmax is the last valid pfn.
6021 6021 */
6022 6022 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6023 6023 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6024 6024 index = MEMSEG_PFN_HASH(pseg->pages_base);
6025 6025 cur = pseg->pages_base;
6026 6026 do {
6027 6027 if (index >= N_MEM_SLOTS)
6028 6028 index = MEMSEG_PFN_HASH(cur);
6029 6029
6030 6030 if (memseg_hash[index] == NULL ||
6031 6031 memseg_hash[index]->pages_base > pseg->pages_base) {
6032 6032 memseg_hash[index] = pseg;
6033 6033 hat_kpm_mseghash_update(index, pseg);
6034 6034 }
6035 6035 cur += mhash_per_slot;
6036 6036 index++;
6037 6037 } while (cur < pseg->pages_end);
6038 6038 }
6039 6039 }
6040 6040
6041 6041 /*
6042 6042 * Return the pagenum for the pp
6043 6043 */
6044 6044 pfn_t
6045 6045 page_pptonum(page_t *pp)
6046 6046 {
6047 6047 return (pp->p_pagenum);
6048 6048 }
6049 6049
6050 6050 /*
6051 6051 * interface to the referenced and modified etc bits
6052 6052 * in the PSM part of the page struct
6053 6053 * when no locking is desired.
6054 6054 */
6055 6055 void
6056 6056 page_set_props(page_t *pp, uint_t flags)
6057 6057 {
6058 6058 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6059 6059 pp->p_nrm |= (uchar_t)flags;
6060 6060 }
6061 6061
6062 6062 void
6063 6063 page_clr_all_props(page_t *pp)
6064 6064 {
6065 6065 pp->p_nrm = 0;
6066 6066 }
6067 6067
6068 6068 /*
6069 6069 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6070 6070 */
6071 6071 int
6072 6072 page_clear_lck_cow(page_t *pp, int adjust)
6073 6073 {
6074 6074 int f_amount;
6075 6075
6076 6076 ASSERT(PAGE_EXCL(pp));
6077 6077
6078 6078 /*
6079 6079 * The page_struct_lock need not be acquired here since
6080 6080 * we require the caller hold the page exclusively locked.
6081 6081 */
6082 6082 f_amount = 0;
6083 6083 if (pp->p_lckcnt) {
6084 6084 f_amount = 1;
6085 6085 pp->p_lckcnt = 0;
6086 6086 }
6087 6087 if (pp->p_cowcnt) {
6088 6088 f_amount += pp->p_cowcnt;
6089 6089 pp->p_cowcnt = 0;
6090 6090 }
6091 6091
6092 6092 if (adjust && f_amount) {
6093 6093 mutex_enter(&freemem_lock);
6094 6094 availrmem += f_amount;
6095 6095 mutex_exit(&freemem_lock);
6096 6096 }
6097 6097
6098 6098 return (f_amount);
6099 6099 }
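When called with adjust == 0, page_clear_lck_cow() leaves the availrmem accounting to the caller. A minimal sketch of that pattern (hypothetical caller), mirroring the adjustment the function performs itself when adjust is set:

        int f;

        f = page_clear_lck_cow(pp, 0);  /* pages to give back to availrmem */
        if (f != 0) {
                mutex_enter(&freemem_lock);
                availrmem += f;
                mutex_exit(&freemem_lock);
        }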
6100 6100
6101 6101 /*
6102 6102  * The following function is called from free_vp_pages()
6103 6103 * for an inexact estimate of a newly free'd page...
6104 6104 */
6105 6105 ulong_t
6106 6106 page_share_cnt(page_t *pp)
6107 6107 {
6108 6108 return (hat_page_getshare(pp));
6109 6109 }
6110 6110
6111 6111 int
6112 6112 page_isshared(page_t *pp)
6113 6113 {
6114 6114 return (hat_page_checkshare(pp, 1));
6115 6115 }
6116 6116
6117 6117 int
6118 6118 page_isfree(page_t *pp)
6119 6119 {
6120 6120 return (PP_ISFREE(pp));
6121 6121 }
6122 6122
6123 6123 int
6124 6124 page_isref(page_t *pp)
6125 6125 {
6126 6126 return (hat_page_getattr(pp, P_REF));
6127 6127 }
6128 6128
6129 6129 int
6130 6130 page_ismod(page_t *pp)
6131 6131 {
6132 6132 return (hat_page_getattr(pp, P_MOD));
6133 6133 }
6134 6134
6135 6135 /*
6136 6136 * The following code all currently relates to the page capture logic:
6137 6137 *
6138 6138 * This logic is used for cases where there is a desire to claim a certain
6139 6139 * physical page in the system for the caller. As it may not be possible
6140 6140 * to capture the page immediately, the p_toxic bits are used in the page
6141 6141 * structure to indicate that someone wants to capture this page. When the
6142 6142 * page gets unlocked, the toxic flag will be noted and an attempt to capture
6143 6143  * the page will be made. If it is successful, the original caller's callback
6144 6144 * will be called with the page to do with it what they please.
6145 6145 *
6146 6146  * There is also an async thread which wakes up occasionally to attempt to
6147 6147  * capture pages which have the capture bit set. All of the pages which
6148 6148 * need to be captured asynchronously have been inserted into the
6149 6149 * page_capture_hash and thus this thread walks that hash list. Items in the
6150 6150 * hash have an expiration time so this thread handles that as well by removing
6151 6151 * the item from the hash if it has expired.
6152 6152 *
6153 6153 * Some important things to note are:
6154 6154 * - if the PR_CAPTURE bit is set on a page, then the page is in the
6155 6155 * page_capture_hash. The page_capture_hash_head.pchh_mutex is needed
6156 6156 * to set and clear this bit, and while the lock is held is the only time
6157 6157 * you can add or remove an entry from the hash.
6158 6158 * - the PR_CAPTURE bit can only be set and cleared while holding the
6159 6159 * page_capture_hash_head.pchh_mutex
6160 6160 * - the t_flag field of the thread struct is used with the T_CAPTURING
6161 6161 * flag to prevent recursion while dealing with large pages.
6162 6162 * - pages which need to be retired never expire on the page_capture_hash.
6163 6163 */
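As a rough sketch of the flow described above, a consumer might request a page as shown below; CAPTURE_PHYSMEM and datap merely stand in for whatever flag and private data the consumer actually uses, and the comments describe the common outcomes rather than every case:

        ret = page_trycapture(pp, 0, CAPTURE_PHYSMEM, datap);
        if (ret == 0) {
                /* the registered callback has already been handed pp */
        } else if (ret == EAGAIN) {
                /* typically PR_CAPTURE was set and the async thread retries */
        }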
6164 6164
6165 6165 static void page_capture_thread(void);
6166 6166 static kthread_t *pc_thread_id;
6167 6167 kcondvar_t pc_cv;
6168 6168 static kmutex_t pc_thread_mutex;
6169 6169 static clock_t pc_thread_shortwait;
6170 6170 static clock_t pc_thread_longwait;
6171 6171 static int pc_thread_retry;
6172 6172
6173 6173 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6174 6174
6175 6175 /* Note that this is a circular linked list */
6176 6176 typedef struct page_capture_hash_bucket {
6177 6177 page_t *pp;
6178 6178 uchar_t szc;
6179 6179 uchar_t pri;
6180 6180 uint_t flags;
6181 6181 clock_t expires; /* lbolt at which this request expires. */
6182 6182 void *datap; /* Cached data passed in for callback */
6183 6183 struct page_capture_hash_bucket *next;
6184 6184 struct page_capture_hash_bucket *prev;
6185 6185 } page_capture_hash_bucket_t;
6186 6186
6187 6187 #define PC_PRI_HI 0 /* capture now */
6188 6188 #define PC_PRI_LO 1 /* capture later */
6189 6189 #define PC_NUM_PRI 2
6190 6190
6191 6191 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6192 6192
6193 6193
6194 6194 /*
6195 6195  * Each hash bucket will have its own mutex and two lists which are:
6196 6196 * active (0): represents requests which have not been processed by
6197 6197 * the page_capture async thread yet.
6198 6198 * walked (1): represents requests which have been processed by the
6199 6199  * page_capture async thread within its given walk of this bucket.
6200 6200 *
6201 6201 * These are all needed so that we can synchronize all async page_capture
6202 6202 * events. When the async thread moves to a new bucket, it will append the
6203 6203 * walked list to the active list and walk each item one at a time, moving it
6204 6204 * from the active list to the walked list. Thus if there is an async request
6205 6205 * outstanding for a given page, it will always be in one of the two lists.
6206 6206 * New requests will always be added to the active list.
6207 6207 * If we were not able to capture a page before the request expired, we'd free
6208 6208 * up the request structure which would indicate to page_capture that there is
6209 6209 * no longer a need for the given page, and clear the PR_CAPTURE flag if
6210 6210 * possible.
6211 6211 */
6212 6212 typedef struct page_capture_hash_head {
6213 6213 kmutex_t pchh_mutex;
6214 6214 uint_t num_pages[PC_NUM_PRI];
6215 6215 page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6216 6216 } page_capture_hash_head_t;
6217 6217
6218 6218 #ifdef DEBUG
6219 6219 #define NUM_PAGE_CAPTURE_BUCKETS 4
6220 6220 #else
6221 6221 #define NUM_PAGE_CAPTURE_BUCKETS 64
6222 6222 #endif
6223 6223
6224 6224 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6225 6225
6226 6226 /* for now use a very simple hash based upon the size of a page struct */
6227 6227 #define PAGE_CAPTURE_HASH(pp) \
6228 6228 ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
6229 6229
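As a worked example of this hash (assuming the non-DEBUG value NUM_PAGE_CAPTURE_BUCKETS == 64): a page_t whose address ends in 0xd200 maps to ((0xd200 >> 7) & 63) == (0x1a4 & 63) == bucket 36. The shift by 7 divides the address by 128, presumably close to sizeof (page_t), so neighbouring page structures spread across different buckets.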
6230 6230 extern pgcnt_t swapfs_minfree;
6231 6231
6232 6232 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6233 6233
6234 6234 /*
6235 6235 * a callback function is required for page capture requests.
6236 6236 */
6237 6237 void
6238 6238 page_capture_register_callback(uint_t index, clock_t duration,
6239 6239 int (*cb_func)(page_t *, void *, uint_t))
6240 6240 {
6241 6241 ASSERT(pc_cb[index].cb_active == 0);
6242 6242 ASSERT(cb_func != NULL);
6243 6243 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6244 6244 pc_cb[index].duration = duration;
6245 6245 pc_cb[index].cb_func = cb_func;
6246 6246 pc_cb[index].cb_active = 1;
6247 6247 rw_exit(&pc_cb[index].cb_rwlock);
6248 6248 }
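A minimal sketch of a consumer registering a capture callback; MY_CB_INDEX, my_stash_page() and the 60 second duration are hypothetical, only the callback signature and the tick-based duration argument come from the code above:

        /* hypothetical consumer callback */
        static int
        my_capture_cb(page_t *pp, void *datap, uint_t flags)
        {
                /* pp arrives SE_EXCL locked and cleaned up; the consumer keeps it */
                my_stash_page(datap, pp);
                return (0);
        }

        static void
        my_consumer_init(void)
        {
                page_capture_register_callback(MY_CB_INDEX, 60 * hz, my_capture_cb);
        }

        static void
        my_consumer_fini(void)
        {
                page_capture_unregister_callback(MY_CB_INDEX);
        }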
6249 6249
6250 6250 void
6251 6251 page_capture_unregister_callback(uint_t index)
6252 6252 {
6253 6253 int i, j;
6254 6254 struct page_capture_hash_bucket *bp1;
6255 6255 struct page_capture_hash_bucket *bp2;
6256 6256 struct page_capture_hash_bucket *head = NULL;
6257 6257 uint_t flags = (1 << index);
6258 6258
6259 6259 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6260 6260 ASSERT(pc_cb[index].cb_active == 1);
6261 6261 pc_cb[index].duration = 0; /* Paranoia */
6262 6262 pc_cb[index].cb_func = NULL; /* Paranoia */
6263 6263 pc_cb[index].cb_active = 0;
6264 6264 rw_exit(&pc_cb[index].cb_rwlock);
6265 6265
6266 6266 /*
6267 6267 * Just move all the entries to a private list which we can walk
6268 6268 * through without the need to hold any locks.
6269 6269 * No more requests can get added to the hash lists for this consumer
6270 6270 * as the cb_active field for the callback has been cleared.
6271 6271 */
6272 6272 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6273 6273 mutex_enter(&page_capture_hash[i].pchh_mutex);
6274 6274 for (j = 0; j < 2; j++) {
6275 6275 bp1 = page_capture_hash[i].lists[j].next;
6276 6276 /* walk through all but first (sentinel) element */
6277 6277 while (bp1 != &page_capture_hash[i].lists[j]) {
6278 6278 bp2 = bp1;
6279 6279 if (bp2->flags & flags) {
6280 6280 bp1 = bp2->next;
6281 6281 bp1->prev = bp2->prev;
6282 6282 bp2->prev->next = bp1;
6283 6283 bp2->next = head;
6284 6284 head = bp2;
6285 6285 /*
6286 6286 * Clear the PR_CAPTURE bit as we
6287 6287 * hold appropriate locks here.
6288 6288 */
6289 6289 page_clrtoxic(head->pp, PR_CAPTURE);
6290 6290 page_capture_hash[i].
6291 6291 num_pages[bp2->pri]--;
6292 6292 continue;
6293 6293 }
6294 6294 bp1 = bp1->next;
6295 6295 }
6296 6296 }
6297 6297 mutex_exit(&page_capture_hash[i].pchh_mutex);
6298 6298 }
6299 6299
6300 6300 while (head != NULL) {
6301 6301 bp1 = head;
6302 6302 head = head->next;
6303 6303 kmem_free(bp1, sizeof (*bp1));
6304 6304 }
6305 6305 }
6306 6306
6307 6307
6308 6308 /*
6309 6309 * Find pp in the active list and move it to the walked list if it
6310 6310 * exists.
6311 6311 * Note that most often pp should be at the front of the active list
6312 6312 * as it is currently used and thus there is no other sort of optimization
6313 6313 * being done here as this is a linked list data structure.
6314 6314 * Returns 1 on successful move or 0 if page could not be found.
6315 6315 */
6316 6316 static int
6317 6317 page_capture_move_to_walked(page_t *pp)
6318 6318 {
6319 6319 page_capture_hash_bucket_t *bp;
6320 6320 int index;
6321 6321
6322 6322 index = PAGE_CAPTURE_HASH(pp);
6323 6323
6324 6324 mutex_enter(&page_capture_hash[index].pchh_mutex);
6325 6325 bp = page_capture_hash[index].lists[0].next;
6326 6326 while (bp != &page_capture_hash[index].lists[0]) {
6327 6327 if (bp->pp == pp) {
6328 6328 /* Remove from old list */
6329 6329 bp->next->prev = bp->prev;
6330 6330 bp->prev->next = bp->next;
6331 6331
6332 6332 /* Add to new list */
6333 6333 bp->next = page_capture_hash[index].lists[1].next;
6334 6334 bp->prev = &page_capture_hash[index].lists[1];
6335 6335 page_capture_hash[index].lists[1].next = bp;
6336 6336 bp->next->prev = bp;
6337 6337
6338 6338 /*
6339 6339 			 * There is a small probability of a page on a free
6340 6340 			 * list being retired while it is being allocated,
6341 6341 			 * before P_RAF is set on it. The page may
6342 6342 			 * end up marked as a high priority request instead
6343 6343 			 * of a low priority request.
6344 6344 			 * If a P_RAF page is not marked as a low priority request,
6345 6345 			 * change it to a low priority request.
6346 6346 */
6347 6347 page_capture_hash[index].num_pages[bp->pri]--;
6348 6348 bp->pri = PAGE_CAPTURE_PRIO(pp);
6349 6349 page_capture_hash[index].num_pages[bp->pri]++;
6350 6350 mutex_exit(&page_capture_hash[index].pchh_mutex);
6351 6351 return (1);
6352 6352 }
6353 6353 bp = bp->next;
6354 6354 }
6355 6355 mutex_exit(&page_capture_hash[index].pchh_mutex);
6356 6356 return (0);
6357 6357 }
6358 6358
6359 6359 /*
6360 6360 * Add a new entry to the page capture hash. The only case where a new
6361 6361 * entry is not added is when the page capture consumer is no longer registered.
6362 6362 * In this case, we'll silently not add the page to the hash. We know that
6363 6363 * page retire will always be registered for the case where we are currently
6364 6364 * unretiring a page and thus there are no conflicts.
6365 6365 */
6366 6366 static void
6367 6367 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6368 6368 {
6369 6369 page_capture_hash_bucket_t *bp1;
6370 6370 page_capture_hash_bucket_t *bp2;
6371 6371 int index;
6372 6372 int cb_index;
6373 6373 int i;
6374 6374 uchar_t pri;
6375 6375 #ifdef DEBUG
6376 6376 page_capture_hash_bucket_t *tp1;
6377 6377 int l;
6378 6378 #endif
6379 6379
6380 6380 ASSERT(!(flags & CAPTURE_ASYNC));
6381 6381
6382 6382 bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6383 6383
6384 6384 bp1->pp = pp;
6385 6385 bp1->szc = szc;
6386 6386 bp1->flags = flags;
6387 6387 bp1->datap = datap;
6388 6388
6389 6389 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6390 6390 if ((flags >> cb_index) & 1) {
6391 6391 break;
6392 6392 }
6393 6393 }
6394 6394
6395 6395 ASSERT(cb_index != PC_NUM_CALLBACKS);
6396 6396
6397 6397 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6398 6398 if (pc_cb[cb_index].cb_active) {
6399 6399 if (pc_cb[cb_index].duration == -1) {
6400 6400 bp1->expires = (clock_t)-1;
6401 6401 } else {
6402 6402 bp1->expires = ddi_get_lbolt() +
6403 6403 pc_cb[cb_index].duration;
6404 6404 }
6405 6405 } else {
6406 6406 /* There's no callback registered so don't add to the hash */
6407 6407 rw_exit(&pc_cb[cb_index].cb_rwlock);
6408 6408 kmem_free(bp1, sizeof (*bp1));
6409 6409 return;
6410 6410 }
6411 6411
6412 6412 index = PAGE_CAPTURE_HASH(pp);
6413 6413
6414 6414 /*
6415 6415 * Only allow capture flag to be modified under this mutex.
6416 6416 * Prevents multiple entries for same page getting added.
6417 6417 */
6418 6418 mutex_enter(&page_capture_hash[index].pchh_mutex);
6419 6419
6420 6420 /*
6421 6421 * if not already on the hash, set capture bit and add to the hash
6422 6422 */
6423 6423 if (!(pp->p_toxic & PR_CAPTURE)) {
6424 6424 #ifdef DEBUG
6425 6425 /* Check for duplicate entries */
6426 6426 for (l = 0; l < 2; l++) {
6427 6427 tp1 = page_capture_hash[index].lists[l].next;
6428 6428 while (tp1 != &page_capture_hash[index].lists[l]) {
6429 6429 if (tp1->pp == pp) {
6430 6430 panic("page pp 0x%p already on hash "
6431 6431 "at 0x%p\n",
6432 6432 (void *)pp, (void *)tp1);
6433 6433 }
6434 6434 tp1 = tp1->next;
6435 6435 }
6436 6436 }
6437 6437
6438 6438 #endif
6439 6439 page_settoxic(pp, PR_CAPTURE);
6440 6440 pri = PAGE_CAPTURE_PRIO(pp);
6441 6441 bp1->pri = pri;
6442 6442 bp1->next = page_capture_hash[index].lists[0].next;
6443 6443 bp1->prev = &page_capture_hash[index].lists[0];
6444 6444 bp1->next->prev = bp1;
6445 6445 page_capture_hash[index].lists[0].next = bp1;
6446 6446 page_capture_hash[index].num_pages[pri]++;
6447 6447 if (flags & CAPTURE_RETIRE) {
6448 6448 page_retire_incr_pend_count(datap);
6449 6449 }
6450 6450 mutex_exit(&page_capture_hash[index].pchh_mutex);
6451 6451 rw_exit(&pc_cb[cb_index].cb_rwlock);
6452 6452 cv_signal(&pc_cv);
6453 6453 return;
6454 6454 }
6455 6455
6456 6456 /*
6457 6457 * A page retire request will replace any other request.
6458 6458 * A second physmem request which is for a different process than
6459 6459 * the currently registered one will be dropped as there is
6460 6460 * no way to hold the private data for both calls.
6461 6461 * In the future, once there are more callers, this will have to
6462 6462 * be worked out better as there needs to be private storage for
6463 6463 * at least each type of caller (maybe have datap be an array of
6464 6464 * *void's so that we can index based upon callers index).
6465 6465 */
6466 6466
6467 6467 /* walk hash list to update expire time */
6468 6468 for (i = 0; i < 2; i++) {
6469 6469 bp2 = page_capture_hash[index].lists[i].next;
6470 6470 while (bp2 != &page_capture_hash[index].lists[i]) {
6471 6471 if (bp2->pp == pp) {
6472 6472 if (flags & CAPTURE_RETIRE) {
6473 6473 if (!(bp2->flags & CAPTURE_RETIRE)) {
6474 6474 page_retire_incr_pend_count(
6475 6475 datap);
6476 6476 bp2->flags = flags;
6477 6477 bp2->expires = bp1->expires;
6478 6478 bp2->datap = datap;
6479 6479 }
6480 6480 } else {
6481 6481 ASSERT(flags & CAPTURE_PHYSMEM);
6482 6482 if (!(bp2->flags & CAPTURE_RETIRE) &&
6483 6483 (datap == bp2->datap)) {
6484 6484 bp2->expires = bp1->expires;
6485 6485 }
6486 6486 }
6487 6487 mutex_exit(&page_capture_hash[index].
6488 6488 pchh_mutex);
6489 6489 rw_exit(&pc_cb[cb_index].cb_rwlock);
6490 6490 kmem_free(bp1, sizeof (*bp1));
6491 6491 return;
6492 6492 }
6493 6493 bp2 = bp2->next;
6494 6494 }
6495 6495 }
6496 6496
6497 6497 /*
6498 6498 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6499 6499 * and thus it either has to be set or not set and can't change
6500 6500 * while holding the mutex above.
6501 6501 */
6502 6502 panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6503 6503 (void *)pp);
6504 6504 }
6505 6505
6506 6506 /*
6507 6507 * We have a page in our hands, lets try and make it ours by turning
6508 6508 * it into a clean page like it had just come off the freelists.
6509 6509 *
6510 6510 * Returns 0 on success, with the page still EXCL locked.
6511 6511 * On failure, the page will be unlocked, and returns EAGAIN
6512 6512 */
6513 6513 static int
6514 6514 page_capture_clean_page(page_t *pp)
6515 6515 {
6516 6516 page_t *newpp;
6517 6517 int skip_unlock = 0;
6518 6518 spgcnt_t count;
6519 6519 page_t *tpp;
6520 6520 int ret = 0;
6521 6521 int extra;
6522 6522
6523 6523 ASSERT(PAGE_EXCL(pp));
6524 6524 ASSERT(!PP_RETIRED(pp));
6525 6525 ASSERT(curthread->t_flag & T_CAPTURING);
6526 6526
6527 6527 if (PP_ISFREE(pp)) {
6528 6528 if (!page_reclaim(pp, NULL)) {
6529 6529 skip_unlock = 1;
6530 6530 ret = EAGAIN;
6531 6531 goto cleanup;
6532 6532 }
6533 6533 ASSERT(pp->p_szc == 0);
6534 6534 if (pp->p_vnode != NULL) {
6535 6535 /*
6536 6536 * Since this page came from the
6537 6537 * cachelist, we must destroy the
6538 6538 * old vnode association.
6539 6539 */
6540 6540 page_hashout(pp, NULL);
6541 6541 }
6542 6542 goto cleanup;
6543 6543 }
6544 6544
6545 6545 /*
6546 6546 * If we know page_relocate will fail, skip it
6547 6547 * It could still fail due to a UE on another page but we
6548 6548 * can't do anything about that.
6549 6549 */
6550 6550 if (pp->p_toxic & PR_UE) {
6551 6551 goto skip_relocate;
6552 6552 }
6553 6553
6554 6554 /*
6555 6555 * It's possible that pages can not have a vnode as fsflush comes
6556 6556 * through and cleans up these pages. It's ugly but that's how it is.
6557 6557 */
6558 6558 if (pp->p_vnode == NULL) {
6559 6559 goto skip_relocate;
6560 6560 }
6561 6561
6562 6562 /*
6563 6563 * Page was not free, so lets try to relocate it.
6564 6564 * page_relocate only works with root pages, so if this is not a root
6565 6565 * page, we need to demote it to try and relocate it.
6566 6566 * Unfortunately this is the best we can do right now.
6567 6567 */
6568 6568 newpp = NULL;
6569 6569 if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6570 6570 if (page_try_demote_pages(pp) == 0) {
6571 6571 ret = EAGAIN;
6572 6572 goto cleanup;
6573 6573 }
6574 6574 }
6575 6575 ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6576 6576 if (ret == 0) {
6577 6577 page_t *npp;
6578 6578 /* unlock the new page(s) */
6579 6579 while (count-- > 0) {
6580 6580 ASSERT(newpp != NULL);
6581 6581 npp = newpp;
6582 6582 page_sub(&newpp, npp);
6583 6583 page_unlock(npp);
6584 6584 }
6585 6585 ASSERT(newpp == NULL);
6586 6586 /*
6587 6587 * Check to see if the page we have is too large.
6588 6588 * If so, demote it freeing up the extra pages.
6589 6589 */
6590 6590 if (pp->p_szc > 0) {
6591 6591 /* For now demote extra pages to szc == 0 */
6592 6592 extra = page_get_pagecnt(pp->p_szc) - 1;
6593 6593 while (extra > 0) {
6594 6594 tpp = pp->p_next;
6595 6595 page_sub(&pp, tpp);
6596 6596 tpp->p_szc = 0;
6597 6597 page_free(tpp, 1);
6598 6598 extra--;
6599 6599 }
6600 6600 /* Make sure to set our page to szc 0 as well */
6601 6601 ASSERT(pp->p_next == pp && pp->p_prev == pp);
6602 6602 pp->p_szc = 0;
6603 6603 }
6604 6604 goto cleanup;
6605 6605 } else if (ret == EIO) {
6606 6606 ret = EAGAIN;
6607 6607 goto cleanup;
6608 6608 } else {
6609 6609 /*
6610 6610 * Need to reset return type as we failed to relocate the page
6611 6611 * but that does not mean that some of the next steps will not
6612 6612 * work.
6613 6613 */
6614 6614 ret = 0;
6615 6615 }
6616 6616
6617 6617 skip_relocate:
6618 6618
6619 6619 if (pp->p_szc > 0) {
6620 6620 if (page_try_demote_pages(pp) == 0) {
6621 6621 ret = EAGAIN;
6622 6622 goto cleanup;
6623 6623 }
6624 6624 }
6625 6625
6626 6626 ASSERT(pp->p_szc == 0);
6627 6627
6628 6628 if (hat_ismod(pp)) {
6629 6629 ret = EAGAIN;
6630 6630 goto cleanup;
6631 6631 }
6632 6632 if (PP_ISKAS(pp)) {
6633 6633 ret = EAGAIN;
6634 6634 goto cleanup;
6635 6635 }
6636 6636 if (pp->p_lckcnt || pp->p_cowcnt) {
6637 6637 ret = EAGAIN;
6638 6638 goto cleanup;
6639 6639 }
6640 6640
6641 6641 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6642 6642 ASSERT(!hat_page_is_mapped(pp));
6643 6643
6644 6644 if (hat_ismod(pp)) {
6645 6645 /*
6646 6646 * This is a semi-odd case as the page is now modified but not
6647 6647 * mapped as we just unloaded the mappings above.
6648 6648 */
6649 6649 ret = EAGAIN;
6650 6650 goto cleanup;
6651 6651 }
6652 6652 if (pp->p_vnode != NULL) {
6653 6653 page_hashout(pp, NULL);
6654 6654 }
6655 6655
6656 6656 /*
6657 6657 * At this point, the page should be in a clean state and
6658 6658 * we can do whatever we want with it.
6659 6659 */
6660 6660
6661 6661 cleanup:
6662 6662 if (ret != 0) {
6663 6663 if (!skip_unlock) {
6664 6664 page_unlock(pp);
6665 6665 }
6666 6666 } else {
6667 6667 ASSERT(pp->p_szc == 0);
6668 6668 ASSERT(PAGE_EXCL(pp));
6669 6669
6670 6670 pp->p_next = pp;
6671 6671 pp->p_prev = pp;
6672 6672 }
6673 6673 return (ret);
6674 6674 }
6675 6675
6676 6676 /*
6677 6677 * Various callers of page_trycapture() can have different restrictions upon
6678 6678 * what memory they have access to.
6679 6679 * Returns 0 on success, with the following error codes on failure:
6680 6680 * EPERM - The requested page is long term locked, and thus repeated
6681 6681 * requests to capture this page will likely fail.
6682 6682 * ENOMEM - There was not enough free memory in the system to safely
6683 6683 * map the requested page.
6684 6684 * ENOENT - The requested page was inside the kernel cage, and the
6685 6685 * PHYSMEM_CAGE flag was not set.
6686 6686 */
6687 6687 int
6688 6688 page_capture_pre_checks(page_t *pp, uint_t flags)
6689 6689 {
6690 6690 ASSERT(pp != NULL);
6691 6691
6692 6692 #if defined(__sparc)
6693 6693 if (pp->p_vnode == &promvp) {
6694 6694 return (EPERM);
6695 6695 }
6696 6696
6697 6697 if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6698 6698 (flags & CAPTURE_PHYSMEM)) {
6699 6699 return (ENOENT);
6700 6700 }
6701 6701
6702 6702 if (PP_ISNORELOCKERNEL(pp)) {
6703 6703 return (EPERM);
6704 6704 }
6705 6705 #else
6706 6706 if (PP_ISKAS(pp)) {
6707 6707 return (EPERM);
6708 6708 }
6709 6709 #endif /* __sparc */
6710 6710
6711 6711 /* only physmem currently has the restrictions checked below */
6712 6712 if (!(flags & CAPTURE_PHYSMEM)) {
6713 6713 return (0);
6714 6714 }
6715 6715
6716 6716 if (availrmem < swapfs_minfree) {
6717 6717 /*
6718 6718 * We won't try to capture this page as we are
6719 6719 * running low on memory.
6720 6720 */
6721 6721 return (ENOMEM);
6722 6722 }
6723 6723 return (0);
6724 6724 }
6725 6725
6726 6726 /*
6727 6727  * Once we have a page in our mitts, go ahead and complete the capture
6728 6728 * operation.
6729 6729 * Returns 1 on failure where page is no longer needed
6730 6730 * Returns 0 on success
6731 6731 * Returns -1 if there was a transient failure.
6732 6732 * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6733 6733 */
6734 6734 int
6735 6735 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6736 6736 {
6737 6737 int cb_index;
6738 6738 int ret = 0;
6739 6739 page_capture_hash_bucket_t *bp1;
6740 6740 page_capture_hash_bucket_t *bp2;
6741 6741 int index;
6742 6742 int found = 0;
6743 6743 int i;
6744 6744
6745 6745 ASSERT(PAGE_EXCL(pp));
6746 6746 ASSERT(curthread->t_flag & T_CAPTURING);
6747 6747
6748 6748 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6749 6749 if ((flags >> cb_index) & 1) {
6750 6750 break;
6751 6751 }
6752 6752 }
6753 6753 ASSERT(cb_index < PC_NUM_CALLBACKS);
6754 6754
6755 6755 /*
6756 6756 * Remove the entry from the page_capture hash, but don't free it yet
6757 6757 * as we may need to put it back.
6758 6758 * Since we own the page at this point in time, we should find it
6759 6759 * in the hash if this is an ASYNC call. If we don't it's likely
6760 6760 * that the page_capture_async() thread decided that this request
6761 6761 * had expired, in which case we just continue on.
6762 6762 */
6763 6763 if (flags & CAPTURE_ASYNC) {
6764 6764
6765 6765 index = PAGE_CAPTURE_HASH(pp);
6766 6766
6767 6767 mutex_enter(&page_capture_hash[index].pchh_mutex);
6768 6768 for (i = 0; i < 2 && !found; i++) {
6769 6769 bp1 = page_capture_hash[index].lists[i].next;
6770 6770 while (bp1 != &page_capture_hash[index].lists[i]) {
6771 6771 if (bp1->pp == pp) {
6772 6772 bp1->next->prev = bp1->prev;
6773 6773 bp1->prev->next = bp1->next;
6774 6774 page_capture_hash[index].
6775 6775 num_pages[bp1->pri]--;
6776 6776 page_clrtoxic(pp, PR_CAPTURE);
6777 6777 found = 1;
6778 6778 break;
6779 6779 }
6780 6780 bp1 = bp1->next;
6781 6781 }
6782 6782 }
6783 6783 mutex_exit(&page_capture_hash[index].pchh_mutex);
6784 6784 }
6785 6785
6786 6786 /* Synchronize with the unregister func. */
6787 6787 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6788 6788 if (!pc_cb[cb_index].cb_active) {
6789 6789 page_free(pp, 1);
6790 6790 rw_exit(&pc_cb[cb_index].cb_rwlock);
6791 6791 if (found) {
6792 6792 kmem_free(bp1, sizeof (*bp1));
6793 6793 }
6794 6794 return (1);
6795 6795 }
6796 6796
6797 6797 /*
6798 6798 * We need to remove the entry from the page capture hash and turn off
6799 6799 * the PR_CAPTURE bit before calling the callback. We'll need to cache
6800 6800 * the entry here, and then based upon the return value, cleanup
6801 6801 * appropriately or re-add it to the hash, making sure that someone else
6802 6802 * hasn't already done so.
6803 6803 * It should be rare for the callback to fail and thus it's ok for
6804 6804 * the failure path to be a bit complicated as the success path is
6805 6805 * cleaner and the locking rules are easier to follow.
6806 6806 */
6807 6807
6808 6808 ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6809 6809
6810 6810 rw_exit(&pc_cb[cb_index].cb_rwlock);
6811 6811
6812 6812 /*
6813 6813 * If this was an ASYNC request, we need to cleanup the hash if the
6814 6814 * callback was successful or if the request was no longer valid.
6815 6815 * For non-ASYNC requests, we return failure to map and the caller
6816 6816 * will take care of adding the request to the hash.
6817 6817 * Note also that the callback itself is responsible for the page
6818 6818 * at this point in time in terms of locking ... The most common
6819 6819 * case for the failure path should just be a page_free.
6820 6820 */
6821 6821 if (ret >= 0) {
6822 6822 if (found) {
6823 6823 if (bp1->flags & CAPTURE_RETIRE) {
6824 6824 page_retire_decr_pend_count(datap);
6825 6825 }
6826 6826 kmem_free(bp1, sizeof (*bp1));
6827 6827 }
6828 6828 return (ret);
6829 6829 }
6830 6830 if (!found) {
6831 6831 return (ret);
6832 6832 }
6833 6833
6834 6834 ASSERT(flags & CAPTURE_ASYNC);
6835 6835
6836 6836 /*
6837 6837 * Check for expiration time first as we can just free it up if it's
6838 6838 * expired.
6839 6839 */
6840 6840 if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6841 6841 kmem_free(bp1, sizeof (*bp1));
6842 6842 return (ret);
6843 6843 }
6844 6844
6845 6845 /*
6846 6846 * The callback failed and there used to be an entry in the hash for
6847 6847 * this page, so we need to add it back to the hash.
6848 6848 */
6849 6849 mutex_enter(&page_capture_hash[index].pchh_mutex);
6850 6850 if (!(pp->p_toxic & PR_CAPTURE)) {
6851 6851 /* just add bp1 back to head of walked list */
6852 6852 page_settoxic(pp, PR_CAPTURE);
6853 6853 bp1->next = page_capture_hash[index].lists[1].next;
6854 6854 bp1->prev = &page_capture_hash[index].lists[1];
6855 6855 bp1->next->prev = bp1;
6856 6856 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6857 6857 page_capture_hash[index].lists[1].next = bp1;
6858 6858 page_capture_hash[index].num_pages[bp1->pri]++;
6859 6859 mutex_exit(&page_capture_hash[index].pchh_mutex);
6860 6860 return (ret);
6861 6861 }
6862 6862
6863 6863 /*
6864 6864 * Otherwise there was a new capture request added to list
6865 6865 * Need to make sure that our original data is represented if
6866 6866 * appropriate.
6867 6867 */
6868 6868 for (i = 0; i < 2; i++) {
6869 6869 bp2 = page_capture_hash[index].lists[i].next;
6870 6870 while (bp2 != &page_capture_hash[index].lists[i]) {
6871 6871 if (bp2->pp == pp) {
6872 6872 if (bp1->flags & CAPTURE_RETIRE) {
6873 6873 if (!(bp2->flags & CAPTURE_RETIRE)) {
6874 6874 bp2->szc = bp1->szc;
6875 6875 bp2->flags = bp1->flags;
6876 6876 bp2->expires = bp1->expires;
6877 6877 bp2->datap = bp1->datap;
6878 6878 }
6879 6879 } else {
6880 6880 ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6881 6881 if (!(bp2->flags & CAPTURE_RETIRE)) {
6882 6882 bp2->szc = bp1->szc;
6883 6883 bp2->flags = bp1->flags;
6884 6884 bp2->expires = bp1->expires;
6885 6885 bp2->datap = bp1->datap;
6886 6886 }
6887 6887 }
6888 6888 page_capture_hash[index].num_pages[bp2->pri]--;
6889 6889 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6890 6890 page_capture_hash[index].num_pages[bp2->pri]++;
6891 6891 mutex_exit(&page_capture_hash[index].
6892 6892 pchh_mutex);
6893 6893 kmem_free(bp1, sizeof (*bp1));
6894 6894 return (ret);
6895 6895 }
6896 6896 bp2 = bp2->next;
6897 6897 }
6898 6898 }
6899 6899 panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6900 6900 /*NOTREACHED*/
6901 6901 }
6902 6902
6903 6903 /*
6904 6904 * Try to capture the given page for the caller specified in the flags
6905 6905 * parameter. The page will either be captured and handed over to the
6906 6906 * appropriate callback, or will be queued up in the page capture hash
6907 6907 * to be captured asynchronously.
6908 6908 * If the current request is due to an async capture, the page must be
6909 6909 * exclusively locked before calling this function.
6910 6910 * Currently szc must be 0 but in the future this should be expandable to
6911 6911 * other page sizes.
6912 6912 * Returns 0 on success, with the following error codes on failure:
6913 6913 * EPERM - The requested page is long term locked, and thus repeated
6914 6914 * requests to capture this page will likely fail.
6915 6915 * ENOMEM - There was not enough free memory in the system to safely
6916 6916 * map the requested page.
6917 6917 * ENOENT - The requested page was inside the kernel cage, and the
6918 6918 * CAPTURE_GET_CAGE flag was not set.
6919 6919  * EAGAIN - The requested page could not be captured at this point in
6920 6920 * time but future requests will likely work.
6921 6921 * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6922 6922 * was not set.
6923 6923 */
6924 6924 int
6925 6925 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6926 6926 {
6927 6927 int ret;
6928 6928 int cb_index;
6929 6929
6930 6930 if (flags & CAPTURE_ASYNC) {
6931 6931 ASSERT(PAGE_EXCL(pp));
6932 6932 goto async;
6933 6933 }
6934 6934
6935 6935 /* Make sure there's enough availrmem ... */
6936 6936 ret = page_capture_pre_checks(pp, flags);
6937 6937 if (ret != 0) {
6938 6938 return (ret);
6939 6939 }
6940 6940
6941 6941 if (!page_trylock(pp, SE_EXCL)) {
6942 6942 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6943 6943 if ((flags >> cb_index) & 1) {
6944 6944 break;
6945 6945 }
6946 6946 }
6947 6947 ASSERT(cb_index < PC_NUM_CALLBACKS);
6948 6948 ret = EAGAIN;
6949 6949 /* Special case for retired pages */
6950 6950 if (PP_RETIRED(pp)) {
6951 6951 if (flags & CAPTURE_GET_RETIRED) {
6952 6952 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6953 6953 /*
6954 6954 * Need to set capture bit and add to
6955 6955 * hash so that the page will be
6956 6956 * retired when freed.
6957 6957 */
6958 6958 page_capture_add_hash(pp, szc,
6959 6959 CAPTURE_RETIRE, NULL);
6960 6960 ret = 0;
6961 6961 goto own_page;
6962 6962 }
6963 6963 } else {
6964 6964 return (EBUSY);
6965 6965 }
6966 6966 }
6967 6967 page_capture_add_hash(pp, szc, flags, datap);
6968 6968 return (ret);
6969 6969 }
6970 6970
6971 6971 async:
6972 6972 ASSERT(PAGE_EXCL(pp));
6973 6973
6974 6974 	/* For physmem async requests, make sure that availrmem is sane */
6975 6975 if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6976 6976 (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6977 6977 (availrmem < swapfs_minfree)) {
6978 6978 page_unlock(pp);
6979 6979 return (ENOMEM);
6980 6980 }
6981 6981
6982 6982 ret = page_capture_clean_page(pp);
6983 6983
6984 6984 if (ret != 0) {
6985 6985 		/* We failed to get the page, so let's add it to the hash */
6986 6986 if (!(flags & CAPTURE_ASYNC)) {
6987 6987 page_capture_add_hash(pp, szc, flags, datap);
6988 6988 }
6989 6989 return (ret);
6990 6990 }
6991 6991
6992 6992 own_page:
6993 6993 ASSERT(PAGE_EXCL(pp));
6994 6994 ASSERT(pp->p_szc == 0);
6995 6995
6996 6996 /* Call the callback */
6997 6997 ret = page_capture_take_action(pp, flags, datap);
6998 6998
6999 6999 if (ret == 0) {
7000 7000 return (0);
7001 7001 }
7002 7002
7003 7003 /*
7004 7004 * Note that in the failure cases from page_capture_take_action, the
7005 7005 * EXCL lock will have already been dropped.
7006 7006 */
7007 7007 if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7008 7008 page_capture_add_hash(pp, szc, flags, datap);
7009 7009 }
7010 7010 return (EAGAIN);
7011 7011 }
7012 7012
7013 7013 int
7014 7014 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7015 7015 {
7016 7016 int ret;
7017 7017
7018 7018 curthread->t_flag |= T_CAPTURING;
7019 7019 ret = page_itrycapture(pp, szc, flags, datap);
7020 7020 	curthread->t_flag &= ~T_CAPTURING; /* xor works as we know it's set */
7021 7021 return (ret);
7022 7022 }
7023 7023
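For illustration, the flags and return codes documented above are easiest to see from the caller's side. The following is a minimal sketch of a hypothetical synchronous caller; the function name and its policy are assumptions for illustration only and are not part of this patch.

/*
 * Hypothetical caller sketch: request capture of a single page for a
 * physmem consumer and dispatch on the documented return values.
 */
static int
example_grab_page(page_t *pp, void *datap)
{
	int ret;

	/* szc must currently be 0; see the block comment above */
	ret = page_trycapture(pp, 0, CAPTURE_PHYSMEM, datap);
	switch (ret) {
	case 0:		/* captured; the callback has taken the page */
		break;
	case EAGAIN:	/* typically queued in the hash for async capture */
		break;
	case EPERM:	/* long term locked; retrying is unlikely to help */
	case ENOMEM:	/* not enough availrmem to map the page safely */
	case ENOENT:	/* in the kernel cage and CAPTURE_GET_CAGE not set */
	case EBUSY:	/* retired and CAPTURE_GET_RETIRED not set */
	default:
		break;
	}
	return (ret);
}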
7024 7024 /*
7025 7025 * When unlocking a page which has the PR_CAPTURE bit set, this routine
7026 7026 * gets called to try and capture the page.
7027 7027 */
7028 7028 void
7029 7029 page_unlock_capture(page_t *pp)
7030 7030 {
7031 7031 page_capture_hash_bucket_t *bp;
7032 7032 int index;
7033 7033 int i;
7034 7034 uint_t szc;
7035 7035 uint_t flags = 0;
7036 7036 void *datap;
7037 7037 kmutex_t *mp;
7038 7038 extern vnode_t retired_pages;
7039 7039
7040 7040 /*
7041 7041 * We need to protect against a possible deadlock here where we own
7042 7042 * the vnode page hash mutex and want to acquire it again as there
7043 7043 	 * are locations in the code where we unlock a page while holding
7044 7044 	 * the mutex, which can lead to the page being captured and eventually
7045 7045 * end up here. As we may be hashing out the old page and hashing into
7046 7046 * the retire vnode, we need to make sure we don't own them.
7047 7047 	 * Other callbacks that do hash operations also need to make sure
7048 7048 	 * they do not currently own the vphm mutex before they hash in to
7049 7049 	 * a vnode; otherwise there will be a panic.
7050 7050 */
7051 7051 if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7052 7052 page_unlock_nocapture(pp);
7053 7053 return;
7054 7054 }
7055 7055 if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7056 7056 page_unlock_nocapture(pp);
7057 7057 return;
7058 7058 }
7059 7059
7060 7060 index = PAGE_CAPTURE_HASH(pp);
7061 7061
7062 7062 mp = &page_capture_hash[index].pchh_mutex;
7063 7063 mutex_enter(mp);
7064 7064 for (i = 0; i < 2; i++) {
7065 7065 bp = page_capture_hash[index].lists[i].next;
7066 7066 while (bp != &page_capture_hash[index].lists[i]) {
7067 7067 if (bp->pp == pp) {
7068 7068 szc = bp->szc;
7069 7069 flags = bp->flags | CAPTURE_ASYNC;
7070 7070 datap = bp->datap;
7071 7071 mutex_exit(mp);
7072 7072 (void) page_trycapture(pp, szc, flags, datap);
7073 7073 return;
7074 7074 }
7075 7075 bp = bp->next;
7076 7076 }
7077 7077 }
7078 7078
7079 7079 /* Failed to find page in hash so clear flags and unlock it. */
7080 7080 page_clrtoxic(pp, PR_CAPTURE);
7081 7081 page_unlock(pp);
7082 7082
7083 7083 mutex_exit(mp);
7084 7084 }
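The rule stated in the comment inside page_unlock_capture() above also applies to the capture callbacks themselves. Below is a minimal sketch of the check a hypothetical callback could perform before hashing a page in to a vnode; the helper name is an assumption for illustration and not part of this patch.

/*
 * Hypothetical helper: a capture callback that is about to hash a page
 * in to a vnode must not already own that vnode's page hash mutex.
 */
static void
example_pre_hashin_check(vnode_t *vp)
{
	ASSERT(!mutex_owned(page_vnode_mutex(vp)));
}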
7085 7085
7086 7086 void
7087 7087 page_capture_init()
7088 7088 {
7089 7089 int i;
7090 7090 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7091 7091 page_capture_hash[i].lists[0].next =
7092 7092 &page_capture_hash[i].lists[0];
7093 7093 page_capture_hash[i].lists[0].prev =
7094 7094 &page_capture_hash[i].lists[0];
7095 7095 page_capture_hash[i].lists[1].next =
7096 7096 &page_capture_hash[i].lists[1];
7097 7097 page_capture_hash[i].lists[1].prev =
7098 7098 &page_capture_hash[i].lists[1];
7099 7099 }
7100 7100
7101 7101 pc_thread_shortwait = 23 * hz;
7102 7102 pc_thread_longwait = 1201 * hz;
7103 7103 pc_thread_retry = 3;
7104 7104 mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7105 7105 cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7106 7106 pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7107 7107 TS_RUN, minclsyspri);
7108 7108 }
7109 7109
7110 7110 /*
7111 7111 * It is necessary to scrub any failing pages prior to reboot in order to
7112 7112 * prevent a latent error trap from occurring on the next boot.
7113 7113 */
7114 7114 void
7115 7115 page_retire_mdboot()
7116 7116 {
7117 7117 page_t *pp;
7118 7118 int i, j;
7119 7119 page_capture_hash_bucket_t *bp;
7120 7120 uchar_t pri;
7121 7121
7122 7122 /* walk lists looking for pages to scrub */
7123 7123 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7124 7124 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7125 7125 if (page_capture_hash[i].num_pages[pri] != 0) {
7126 7126 break;
7127 7127 }
7128 7128 }
7129 7129 if (pri == PC_NUM_PRI)
7130 7130 continue;
7131 7131
7132 7132 mutex_enter(&page_capture_hash[i].pchh_mutex);
7133 7133
7134 7134 for (j = 0; j < 2; j++) {
7135 7135 bp = page_capture_hash[i].lists[j].next;
7136 7136 while (bp != &page_capture_hash[i].lists[j]) {
7137 7137 pp = bp->pp;
7138 7138 if (PP_TOXIC(pp)) {
7139 7139 if (page_trylock(pp, SE_EXCL)) {
7140 7140 PP_CLRFREE(pp);
7141 7141 pagescrub(pp, 0, PAGESIZE);
7142 7142 page_unlock(pp);
7143 7143 }
7144 7144 }
7145 7145 bp = bp->next;
7146 7146 }
7147 7147 }
7148 7148 mutex_exit(&page_capture_hash[i].pchh_mutex);
7149 7149 }
7150 7150 }
7151 7151
7152 7152 /*
7153 7153 * Walk the page_capture_hash trying to capture pages and also cleanup old
7154 7154 * entries which have expired.
7155 7155 */
7156 7156 void
7157 7157 page_capture_async()
7158 7158 {
7159 7159 page_t *pp;
7160 7160 int i;
7161 7161 int ret;
7162 7162 page_capture_hash_bucket_t *bp1, *bp2;
7163 7163 uint_t szc;
7164 7164 uint_t flags;
7165 7165 void *datap;
7166 7166 uchar_t pri;
7167 7167
7168 7168 /* If there are outstanding pages to be captured, get to work */
7169 7169 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7170 7170 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7171 7171 if (page_capture_hash[i].num_pages[pri] != 0)
7172 7172 break;
7173 7173 }
7174 7174 if (pri == PC_NUM_PRI)
7175 7175 continue;
7176 7176
7177 7177 /* Append list 1 to list 0 and then walk through list 0 */
7178 7178 mutex_enter(&page_capture_hash[i].pchh_mutex);
7179 7179 bp1 = &page_capture_hash[i].lists[1];
7180 7180 bp2 = bp1->next;
7181 7181 if (bp1 != bp2) {
7182 7182 bp1->prev->next = page_capture_hash[i].lists[0].next;
7183 7183 bp2->prev = &page_capture_hash[i].lists[0];
7184 7184 page_capture_hash[i].lists[0].next->prev = bp1->prev;
7185 7185 page_capture_hash[i].lists[0].next = bp2;
7186 7186 bp1->next = bp1;
7187 7187 bp1->prev = bp1;
7188 7188 }
7189 7189
7190 7190 /* list[1] will be empty now */
7191 7191
7192 7192 bp1 = page_capture_hash[i].lists[0].next;
7193 7193 while (bp1 != &page_capture_hash[i].lists[0]) {
7194 7194 /* Check expiration time */
7195 7195 if ((ddi_get_lbolt() > bp1->expires &&
7196 7196 bp1->expires != -1) ||
7197 7197 page_deleted(bp1->pp)) {
7198 7198 page_capture_hash[i].lists[0].next = bp1->next;
7199 7199 bp1->next->prev =
7200 7200 &page_capture_hash[i].lists[0];
7201 7201 page_capture_hash[i].num_pages[bp1->pri]--;
7202 7202
7203 7203 /*
7204 7204 * We can safely remove the PR_CAPTURE bit
7205 7205 * without holding the EXCL lock on the page
7206 7206 				 * as the PR_CAPTURE bit requires that the
7207 7207 * page_capture_hash[].pchh_mutex be held
7208 7208 * to modify it.
7209 7209 */
7210 7210 page_clrtoxic(bp1->pp, PR_CAPTURE);
7211 7211 mutex_exit(&page_capture_hash[i].pchh_mutex);
7212 7212 kmem_free(bp1, sizeof (*bp1));
7213 7213 mutex_enter(&page_capture_hash[i].pchh_mutex);
7214 7214 bp1 = page_capture_hash[i].lists[0].next;
7215 7215 continue;
7216 7216 }
7217 7217 pp = bp1->pp;
7218 7218 szc = bp1->szc;
7219 7219 flags = bp1->flags;
7220 7220 datap = bp1->datap;
7221 7221 mutex_exit(&page_capture_hash[i].pchh_mutex);
7222 7222 if (page_trylock(pp, SE_EXCL)) {
7223 7223 ret = page_trycapture(pp, szc,
7224 7224 flags | CAPTURE_ASYNC, datap);
7225 7225 } else {
7226 7226 ret = 1; /* move to walked hash */
7227 7227 }
7228 7228
7229 7229 if (ret != 0) {
7230 7230 /* Move to walked hash */
7231 7231 (void) page_capture_move_to_walked(pp);
7232 7232 }
7233 7233 mutex_enter(&page_capture_hash[i].pchh_mutex);
7234 7234 bp1 = page_capture_hash[i].lists[0].next;
7235 7235 }
7236 7236
7237 7237 mutex_exit(&page_capture_hash[i].pchh_mutex);
7238 7238 }
7239 7239 }
7240 7240
7241 7241 /*
7242 7242  * This function is called by the page_capture_thread, and is needed
7243 7243  * in order to initiate aio cleanup, so that pages used in aio
7244 7244 * will be unlocked and subsequently retired by page_capture_thread.
7245 7245 */
7246 7246 static int
7247 7247 do_aio_cleanup(void)
7248 7248 {
7249 7249 proc_t *procp;
7250 7250 int (*aio_cleanup_dr_delete_memory)(proc_t *);
7251 7251 int cleaned = 0;
7252 7252
7253 7253 if (modload("sys", "kaio") == -1) {
7254 7254 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7255 7255 return (0);
7256 7256 }
7257 7257 /*
7258 7258 * We use the aio_cleanup_dr_delete_memory function to
7259 7259 * initiate the actual clean up; this function will wake
7260 7260 * up the per-process aio_cleanup_thread.
7261 7261 */
7262 7262 aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7263 7263 modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7264 7264 if (aio_cleanup_dr_delete_memory == NULL) {
7265 7265 cmn_err(CE_WARN,
7266 7266 "aio_cleanup_dr_delete_memory not found in kaio");
7267 7267 return (0);
7268 7268 }
7269 7269 mutex_enter(&pidlock);
7270 7270 for (procp = practive; (procp != NULL); procp = procp->p_next) {
7271 7271 mutex_enter(&procp->p_lock);
7272 7272 if (procp->p_aio != NULL) {
7273 7273 /* cleanup proc's outstanding kaio */
7274 7274 cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7275 7275 }
7276 7276 mutex_exit(&procp->p_lock);
7277 7277 }
7278 7278 mutex_exit(&pidlock);
7279 7279 return (cleaned);
7280 7280 }
7281 7281
7282 7282 /*
7283 7283 * helper function for page_capture_thread
7284 7284 */
7285 7285 static void
7286 7286 page_capture_handle_outstanding(void)
7287 7287 {
7288 7288 int ntry;
7289 7289
7290 7290 	/* Reap pages before attempting to capture pages */
7291 7291 kmem_reap();
7292 7292
7293 7293 if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
7294 7294 hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
7295 7295 /*
7296 7296 * Note: Purging only for platforms that support
7297 7297 * ISM hat_pageunload() - mainly SPARC. On x86/x64
7298 7298 		 * platforms ISM pages are SE_SHARED locked until destroyed.
7299 7299 */
7300 7300
7301 7301 /* disable and purge seg_pcache */
7302 7302 (void) seg_p_disable();
7303 7303 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7304 7304 if (!page_retire_pend_count())
7305 7305 break;
7306 7306 if (do_aio_cleanup()) {
7307 7307 /*
7308 7308 * allow the apps cleanup threads
7309 7309 * to run
7310 7310 */
7311 7311 delay(pc_thread_shortwait);
7312 7312 }
7313 7313 page_capture_async();
7314 7314 }
7315 7315 /* reenable seg_pcache */
7316 7316 seg_p_enable();
7317 7317
7318 7318 /* completed what can be done. break out */
7319 7319 return;
7320 7320 }
7321 7321
7322 7322 /*
7323 7323 * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
7324 7324 * and then attempt to capture.
7325 7325 */
7326 7326 seg_preap();
7327 7327 page_capture_async();
7328 7328 }
7329 7329
7330 7330 /*
7331 7331 * The page_capture_thread loops forever, looking to see if there are
7332 7332 * pages still waiting to be captured.
7333 7333 */
7334 7334 static void
7335 7335 page_capture_thread(void)
7336 7336 {
7337 7337 callb_cpr_t c;
7338 7338 int i;
7339 7339 int high_pri_pages;
7340 7340 int low_pri_pages;
7341 7341 clock_t timeout;
7342 7342
7343 7343 CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7344 7344
7345 7345 mutex_enter(&pc_thread_mutex);
7346 7346 for (;;) {
7347 7347 high_pri_pages = 0;
7348 7348 low_pri_pages = 0;
7349 7349 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7350 7350 high_pri_pages +=
7351 7351 page_capture_hash[i].num_pages[PC_PRI_HI];
7352 7352 low_pri_pages +=
7353 7353 page_capture_hash[i].num_pages[PC_PRI_LO];
7354 7354 }
7355 7355
7356 7356 timeout = pc_thread_longwait;
7357 7357 if (high_pri_pages != 0) {
7358 7358 timeout = pc_thread_shortwait;
7359 7359 page_capture_handle_outstanding();
7360 7360 } else if (low_pri_pages != 0) {
7361 7361 page_capture_async();
7362 7362 }
7363 7363 CALLB_CPR_SAFE_BEGIN(&c);
7364 7364 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
7365 7365 timeout, TR_CLOCK_TICK);
7366 7366 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7367 7367 }
7368 7368 /*NOTREACHED*/
7369 7369 }
7370 7370 /*
7371 7371 * Attempt to locate a bucket that has enough pages to satisfy the request.
7372 7372 * The initial check is done without the lock to avoid unneeded contention.
7373 7373 * The function returns 1 if enough pages were found, else 0 if it could not
7374 7374 * find enough pages in a bucket.
7375 7375 */
7376 7376 static int
7377 7377 pcf_decrement_bucket(pgcnt_t npages)
7378 7378 {
7379 7379 struct pcf *p;
7380 7380 struct pcf *q;
7381 7381 int i;
7382 7382
7383 7383 p = &pcf[PCF_INDEX()];
7384 7384 q = &pcf[pcf_fanout];
7385 7385 for (i = 0; i < pcf_fanout; i++) {
7386 7386 if (p->pcf_count > npages) {
7387 7387 /*
7388 7388 * a good one to try.
7389 7389 */
7390 7390 mutex_enter(&p->pcf_lock);
7391 7391 if (p->pcf_count > npages) {
7392 7392 p->pcf_count -= (uint_t)npages;
7393 7393 /*
7394 7394 * freemem is not protected by any lock.
7395 7395 * Thus, we cannot have any assertion
7396 7396 * containing freemem here.
7397 7397 */
7398 7398 freemem -= npages;
7399 7399 mutex_exit(&p->pcf_lock);
7400 7400 return (1);
7401 7401 }
7402 7402 mutex_exit(&p->pcf_lock);
7403 7403 }
7404 7404 p++;
7405 7405 if (p >= q) {
7406 7406 p = pcf;
7407 7407 }
7408 7408 }
7409 7409 return (0);
7410 7410 }
7411 7411
7412 7412 /*
7413 7413 * Arguments:
7414 7414 * pcftotal_ret: If the value is not NULL and we have walked all the
7415 7415 * buckets but did not find enough pages then it will
7416 7416 * be set to the total number of pages in all the pcf
7417 7417 * buckets.
7418 7418 * npages: Is the number of pages we have been requested to
7419 7419 * find.
7420 7420 * unlock: If set to 0 we will leave the buckets locked if the
7421 7421  *			requested number of pages is not found.
7422 7422 *
7423 7423 * Go and try to satisfy the page request from any number of buckets.
7424 7424 * This can be a very expensive operation as we have to lock the buckets
7425 7425 * we are checking (and keep them locked), starting at bucket 0.
7426 7426 *
7427 7427 * The function returns 1 if enough pages were found, else 0 if it could not
7428 7428 * find enough pages in the buckets.
7429 7429 *
7430 7430 */
7431 7431 static int
7432 7432 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7433 7433 {
7434 7434 struct pcf *p;
7435 7435 pgcnt_t pcftotal;
7436 7436 int i;
7437 7437
7438 7438 p = pcf;
7439 7439 /* try to collect pages from several pcf bins */
7440 7440 for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7441 7441 mutex_enter(&p->pcf_lock);
7442 7442 pcftotal += p->pcf_count;
7443 7443 if (pcftotal >= npages) {
7444 7444 /*
7445 7445 			 * Wow! There are enough pages lying around
7446 7446 * to satisfy the request. Do the accounting,
7447 7447 * drop the locks we acquired, and go back.
7448 7448 *
7449 7449 * freemem is not protected by any lock. So,
7450 7450 * we cannot have any assertion containing
7451 7451 * freemem.
7452 7452 */
7453 7453 freemem -= npages;
7454 7454 while (p >= pcf) {
7455 7455 if (p->pcf_count <= npages) {
7456 7456 npages -= p->pcf_count;
7457 7457 p->pcf_count = 0;
7458 7458 } else {
7459 7459 p->pcf_count -= (uint_t)npages;
7460 7460 npages = 0;
7461 7461 }
7462 7462 mutex_exit(&p->pcf_lock);
7463 7463 p--;
7464 7464 }
7465 7465 ASSERT(npages == 0);
7466 7466 return (1);
7467 7467 }
7468 7468 p++;
7469 7469 }
7470 7470 if (unlock) {
7471 7471 /* failed to collect pages - release the locks */
7472 7472 while (--p >= pcf) {
7473 7473 mutex_exit(&p->pcf_lock);
7474 7474 }
7475 7475 }
7476 7476 if (pcftotal_ret != NULL)
7477 7477 *pcftotal_ret = pcftotal;
7478 7478 return (0);
7479 7479 }
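Taken together, the two helpers above form a fast path and a slow path for the same freemem accounting. A hypothetical caller (name and policy assumed for illustration, not part of this patch) might combine them as follows.

/*
 * Hypothetical usage sketch: try the cheap single-bucket path first,
 * then fall back to walking and locking all pcf buckets, releasing
 * them again on failure (unlock != 0).
 */
static int
example_take_pages(pgcnt_t npages)
{
	pgcnt_t pcftotal;

	if (pcf_decrement_bucket(npages))
		return (1);	/* satisfied from a single bucket */
	if (pcf_decrement_multiple(&pcftotal, npages, 1))
		return (1);	/* satisfied across several buckets */
	/* pcftotal now holds the total pages found in all pcf buckets */
	return (0);
}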
(1907 lines elided)