combined Wdiff usr/src/uts/common/vm/vm_anon.c

Print this page

remove whole-process swapping
Long before Unix supported paging, it used whole-process swapping to reclaim
memory.  That code is still present and, in theory, runs when the system gets
*extremely* low on memory.  In practice it never runs, because the definition
of low-on-memory is antiquated. (XXX: define what antiquated means)
You can check the number of swapout/swapin events with kstats:
$ kstat -p ::vm:swapin ::vm:swapout

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/vm/vm_anon.c
          +++ new/usr/src/uts/common/vm/vm_anon.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  
  25   25  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  26   26  /*        All Rights Reserved   */
  27   27  
  28   28  /*
  29   29   * University Copyright- Copyright (c) 1982, 1986, 1988
  30   30   * The Regents of the University of California
  31   31   * All Rights Reserved
  32   32   *
  33   33   * University Acknowledgment- Portions of this document are derived from
  34   34   * software developed by the University of California, Berkeley, and its
  35   35   * contributors.
  36   36   */
  37   37  
  38   38  /*
  39   39   * VM - anonymous pages.
  40   40   *
  41   41   * This layer sits immediately above the vm_swap layer.  It manages
  42   42   * physical pages that have no permanent identity in the file system
  43   43   * name space, using the services of the vm_swap layer to allocate
  44   44   * backing storage for these pages.  Since these pages have no external
  45   45   * identity, they are discarded when the last reference is removed.
  46   46   *
  47   47   * An important function of this layer is to manage low-level sharing
  48   48   * of pages that are logically distinct but that happen to be
  49   49   * physically identical (e.g., the corresponding pages of the processes
  50   50   * resulting from a fork before one process or the other changes their
  51   51   * contents).  This pseudo-sharing is present only as an optimization
  52   52   * and is not to be confused with true sharing in which multiple
  53   53   * address spaces deliberately contain references to the same object;
  54   54   * such sharing is managed at a higher level.
  55   55   *
  56   56   * The key data structure here is the anon struct, which contains a
  57   57   * reference count for its associated physical page and a hint about
  58   58   * the identity of that page.  Anon structs typically live in arrays,
  59   59   * with an instance's position in its array determining where the

↓ open down ↓

59 lines elided

↑ open up ↑

  60   60   * corresponding backing storage is allocated; however, the swap_xlate()
  61   61   * routine abstracts away this representation information so that the
  62   62   * rest of the anon layer need not know it.  (See the swap layer for
  63   63   * more details on anon struct layout.)
  64   64   *
  65   65   * In future versions of the system, the association between an
  66   66   * anon struct and its position on backing store will change so that
  67   67   * we don't require backing store for all anonymous pages in the system.
  68   68   * This is an important consideration for large memory systems.
  69   69   * We can also use this technique to delay binding physical locations
  70      - * to anonymous pages until pageout/swapout time where we can make
  71      - * smarter allocation decisions to improve anonymous klustering.
       70 + * to anonymous pages until pageout time where we can make smarter
       71 + * allocation decisions to improve anonymous klustering.
  72   72   *
  73   73   * Many of the routines defined here take a (struct anon **) argument,
  74   74   * which allows the code at this level to manage anon pages directly,
  75   75   * so that callers can regard anon structs as opaque objects and not be
  76   76   * concerned with assigning or inspecting their contents.
  77   77   *
  78   78   * Clients of this layer refer to anon pages indirectly.  That is, they
  79   79   * maintain arrays of pointers to anon structs rather than maintaining
  80   80   * anon structs themselves.  The (struct anon **) arguments mentioned
  81   81   * above are pointers to entries in these arrays.  It is these arrays

  82   82   * that capture the mapping between offsets within a given segment and
  83   83   * the corresponding anonymous backing storage address.
  84   84   */
  85   85  
  86   86  #ifdef DEBUG
  87   87  #define ANON_DEBUG
  88   88  #endif
  89   89  
  90   90  #include <sys/types.h>
  91   91  #include <sys/t_lock.h>
  92   92  #include <sys/param.h>
  93   93  #include <sys/systm.h>
  94   94  #include <sys/mman.h>
  95   95  #include <sys/cred.h>
  96   96  #include <sys/thread.h>
  97   97  #include <sys/vnode.h>
  98   98  #include <sys/cpuvar.h>
  99   99  #include <sys/swap.h>
 100  100  #include <sys/cmn_err.h>
 101  101  #include <sys/vtrace.h>
 102  102  #include <sys/kmem.h>
 103  103  #include <sys/sysmacros.h>
 104  104  #include <sys/bitmap.h>
 105  105  #include <sys/vmsystm.h>
 106  106  #include <sys/tuneable.h>
 107  107  #include <sys/debug.h>
 108  108  #include <sys/fs/swapnode.h>
 109  109  #include <sys/tnf_probe.h>
 110  110  #include <sys/lgrp.h>
 111  111  #include <sys/policy.h>
 112  112  #include <sys/condvar_impl.h>
 113  113  #include <sys/mutex_impl.h>
 114  114  #include <sys/rctl.h>
 115  115  
 116  116  #include <vm/as.h>
 117  117  #include <vm/hat.h>
 118  118  #include <vm/anon.h>
 119  119  #include <vm/page.h>
 120  120  #include <vm/vpage.h>
 121  121  #include <vm/seg.h>
 122  122  #include <vm/rm.h>
 123  123  
 124  124  #include <fs/fs_subr.h>
 125  125  
/*
 * The vnode set up by anon_init(): a VREG vnode using swap_vnodeops and
 * flagged VISSWAP|VISSWAPFS.
 */
struct vnode *anon_vp;

int anon_debug;

kmutex_t        anoninfo_lock;
struct          k_anoninfo k_anoninfo;
/* ANI_MAX_POOL entries, aligned to a 64-byte boundary by anon_init(). */
ani_free_t      *ani_free_pool;
/* Both arrays below are initialized entry by entry in anon_init(). */
pad_mutex_t     anon_array_lock[ANON_LOCKSIZE];
kcondvar_t      anon_array_cv[ANON_LOCKSIZE];

/*
 * Global hash table for (vp, off) -> anon slot
 *
 * anon_hash has anon_hash_size buckets, where anon_hash_size is
 * 1 << anon_hash_shift; both values are computed in anon_init().
 * Each bucket is a singly linked chain threaded through an_hash.
 */
extern  int swap_maxcontig;
size_t  anon_hash_size;
unsigned int anon_hash_shift;
struct anon **anon_hash;

/* kmem caches created in anon_init() for struct anon and struct anon_map. */
static struct kmem_cache *anon_cache;
static struct kmem_cache *anonmap_cache;

/*
 * AH_LOCK_SIZE padded mutexes, 64-byte aligned; allocated together with
 * anonpages_hash_lock in anon_init().
 */
pad_mutex_t     *anonhash_lock;
 148  148  
/*
 * Used to make the increment of all refcnts of all anon slots of a large
 * page appear to be atomic.  The lock is grabbed for the first anon slot of
 * a large page.
 *
 * The array has AH_LOCK_SIZE entries and is carved out of the same
 * allocation as anonhash_lock by anon_init().
 */
pad_mutex_t     *anonpages_hash_lock;

/*
 * Select the anonpages_hash_lock mutex that covers (vp, off): the
 * ANON_HASH() value is masked with (AH_LOCK_SIZE - 1) to index the array.
 * NOTE(review): the mask assumes AH_LOCK_SIZE is a power of two - confirm
 * against its definition.
 */
#define APH_MUTEX(vp, off)                              \
        (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \
            (AH_LOCK_SIZE - 1))].pad_mutex)
 159  159  
/*
 * Event counters that exist only when the kernel is built with VM_STATS.
 * Each array groups the counters for one area of the anon code; the array
 * sizes fix how many distinct counters each area has.
 * NOTE(review): presumably each index counts one code path in the
 * similarly named routine - confirm at the increment sites.
 */
#ifdef VM_STATS
static struct anonvmstats_str {
        ulong_t getpages[30];
        ulong_t privatepages[10];
        ulong_t demotepages[9];
        ulong_t decrefpages[9];
        ulong_t dupfillholes[4];
        ulong_t freepages[1];
} anonvmstats;
#endif /* VM_STATS */
 170  170  
 171  171  /*ARGSUSED*/
 172  172  static int
 173  173  anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
 174  174  {
 175  175          struct anon_map *amp = buf;
 176  176  
 177  177          rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
 178  178          cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
 179  179          mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
 180  180          mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
 181  181          return (0);
 182  182  }
 183  183  
 184  184  /*ARGSUSED1*/
 185  185  static void
 186  186  anonmap_cache_destructor(void *buf, void *cdrarg)
 187  187  {
 188  188          struct anon_map *amp = buf;
 189  189  
 190  190          rw_destroy(&amp->a_rwlock);
 191  191          cv_destroy(&amp->a_purgecv);
 192  192          mutex_destroy(&amp->a_pmtx);
 193  193          mutex_destroy(&amp->a_purgemtx);
 194  194  }
 195  195  
 196  196  void
 197  197  anon_init(void)
 198  198  {
 199  199          int i;
 200  200          pad_mutex_t *tmp;
 201  201  
 202  202          /* These both need to be powers of 2 so round up to the next power */
 203  203          anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1);
 204  204          anon_hash_size = 1L << anon_hash_shift;
 205  205  
 206  206          /*
 207  207           * We need to align the anonhash_lock and anonpages_hash_lock arrays
 208  208           * to a 64B boundary to avoid false sharing.  We add 63B to our
 209  209           * allocation so that we can get a 64B aligned address to use.
 210  210           * We allocate both of these together to avoid wasting an additional
 211  211           * 63B.
 212  212           */
 213  213          tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63,
 214  214              KM_SLEEP);
 215  215          anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64);
 216  216          anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE;
 217  217  
 218  218          for (i = 0; i < AH_LOCK_SIZE; i++) {
 219  219                  mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT,
 220  220                      NULL);
 221  221                  mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL,
 222  222                      MUTEX_DEFAULT, NULL);
 223  223          }
 224  224  
 225  225          for (i = 0; i < ANON_LOCKSIZE; i++) {
 226  226                  mutex_init(&anon_array_lock[i].pad_mutex, NULL,
 227  227                      MUTEX_DEFAULT, NULL);
 228  228                  cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
 229  229          }
 230  230  
 231  231          anon_hash = (struct anon **)
 232  232              kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
 233  233          anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
 234  234              AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
 235  235          anonmap_cache = kmem_cache_create("anonmap_cache",
 236  236              sizeof (struct anon_map), 0,
 237  237              anonmap_cache_constructor, anonmap_cache_destructor, NULL,
 238  238              NULL, NULL, 0);
 239  239          swap_maxcontig = (1024 * 1024) >> PAGESHIFT;    /* 1MB of pages */
 240  240  
 241  241          tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP);
 242  242          /* Round ani_free_pool to cacheline boundary to avoid false sharing. */
 243  243          ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64);
 244  244  
 245  245          anon_vp = vn_alloc(KM_SLEEP);
 246  246          vn_setops(anon_vp, swap_vnodeops);
 247  247          anon_vp->v_type = VREG;
 248  248          anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
 249  249  }
 250  250  
 251  251  /*
 252  252   * Global anon slot hash table manipulation.
 253  253   */
 254  254  
 255  255  static void
 256  256  anon_addhash(struct anon *ap)
 257  257  {
 258  258          int index;
 259  259  
 260  260          ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
 261  261          index = ANON_HASH(ap->an_vp, ap->an_off);
 262  262          ap->an_hash = anon_hash[index];
 263  263          anon_hash[index] = ap;
 264  264  }
 265  265  
 266  266  static void
 267  267  anon_rmhash(struct anon *ap)
 268  268  {
 269  269          struct anon **app;
 270  270  
 271  271          ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
 272  272  
 273  273          for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
 274  274              *app; app = &((*app)->an_hash)) {
 275  275                  if (*app == ap) {
 276  276                          *app = ap->an_hash;
 277  277                          break;
 278  278                  }
 279  279          }
 280  280  }
 281  281  
 282  282  /*
 283  283   * The anon array interfaces. Functions allocating,
 284  284   * freeing array of pointers, and returning/setting
 285  285   * entries in the array of pointers for a given offset.
 286  286   *
 287  287   * Create the list of pointers
 288  288   */
 289  289  struct anon_hdr *
 290  290  anon_create(pgcnt_t npages, int flags)
 291  291  {
 292  292          struct anon_hdr *ahp;
 293  293          ulong_t nchunks;
 294  294          int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
 295  295  
 296  296          if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
 297  297                  return (NULL);
 298  298          }
 299  299  
 300  300          mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
 301  301          /*
 302  302           * Single level case.
 303  303           */
 304  304          ahp->size = npages;
 305  305          if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
 306  306  
 307  307                  if (flags & ANON_ALLOC_FORCE)
 308  308                          ahp->flags |= ANON_ALLOC_FORCE;
 309  309  
 310  310                  ahp->array_chunk = kmem_zalloc(
 311  311                      ahp->size * sizeof (struct anon *), kmemflags);
 312  312  
 313  313                  if (ahp->array_chunk == NULL) {
 314  314                          kmem_free(ahp, sizeof (struct anon_hdr));
 315  315                          return (NULL);
 316  316                  }
 317  317          } else {
 318  318                  /*
 319  319                   * 2 Level case.
 320  320                   * anon hdr size needs to be rounded off  to be a multiple
 321  321                   * of ANON_CHUNK_SIZE. This is important as various anon
 322  322                   * related functions depend on this.
 323  323                   * NOTE -
 324  324                   * anon_grow()  makes anon hdr size a multiple of
 325  325                   * ANON_CHUNK_SIZE.
 326  326                   * amp size is <= anon hdr size.
 327  327                   * anon_index + seg_pgs <= anon hdr size.
 328  328                   */
 329  329                  ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE);
 330  330                  nchunks = ahp->size >> ANON_CHUNK_SHIFT;
 331  331  
 332  332                  ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
 333  333                      kmemflags);
 334  334  
 335  335                  if (ahp->array_chunk == NULL) {
 336  336                          kmem_free(ahp, sizeof (struct anon_hdr));
 337  337                          return (NULL);
 338  338                  }
 339  339          }
 340  340          return (ahp);
 341  341  }
 342  342  
 343  343  /*
 344  344   * Free the array of pointers
 345  345   */
 346  346  void
 347  347  anon_release(struct anon_hdr *ahp, pgcnt_t npages)
 348  348  {
 349  349          ulong_t i;
 350  350          void **ppp;
 351  351          ulong_t nchunks;
 352  352  
 353  353          ASSERT(npages <= ahp->size);
 354  354  
 355  355          /*
 356  356           * Single level case.
 357  357           */
 358  358          if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
 359  359                  kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
 360  360          } else {
 361  361                  /*
 362  362                   * 2 level case.
 363  363                   */
 364  364                  nchunks = ahp->size >> ANON_CHUNK_SHIFT;
 365  365                  for (i = 0; i < nchunks; i++) {
 366  366                          ppp = &ahp->array_chunk[i];
 367  367                          if (*ppp != NULL)
 368  368                                  kmem_free(*ppp, PAGESIZE);
 369  369                  }
 370  370                  kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
 371  371          }
 372  372          mutex_destroy(&ahp->serial_lock);
 373  373          kmem_free(ahp, sizeof (struct anon_hdr));
 374  374  }
 375  375  
 376  376  /*
 377  377   * Return the pointer from the list for a
 378  378   * specified anon index.
 379  379   */
 380  380  struct anon *
 381  381  anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
 382  382  {
 383  383          struct anon **app;
 384  384  
 385  385          ASSERT(an_idx < ahp->size);
 386  386  
 387  387          /*
 388  388           * Single level case.
 389  389           */
 390  390          if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
 391  391                  return ((struct anon *)
 392  392                      ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
 393  393          } else {
 394  394  
 395  395                  /*
 396  396                   * 2 level case.
 397  397                   */
 398  398                  app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
 399  399                  if (app) {
 400  400                          return ((struct anon *)
 401  401                              ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
 402  402                              ANON_PTRMASK));
 403  403                  } else {
 404  404                          return (NULL);
 405  405                  }
 406  406          }
 407  407  }
 408  408  
 409  409  /*
 410  410   * Return the anon pointer for the first valid entry in the anon list,
 411  411   * starting from the given index.
 412  412   */
 413  413  struct anon *
 414  414  anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
 415  415  {
 416  416          struct anon *ap;
 417  417          struct anon **app;
 418  418          ulong_t chunkoff;
 419  419          ulong_t i;
 420  420          ulong_t j;
 421  421          pgcnt_t size;
 422  422  
 423  423          i = *index;
 424  424          size = ahp->size;
 425  425  
 426  426          ASSERT(i < size);
 427  427  
 428  428          if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
 429  429                  /*
 430  430                   * 1 level case
 431  431                   */
 432  432                  while (i < size) {
 433  433                          ap = (struct anon *)
 434  434                              ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
 435  435                          if (ap) {
 436  436                                  *index = i;
 437  437                                  return (ap);
 438  438                          }
 439  439                          i++;
 440  440                  }
 441  441          } else {
 442  442                  /*
 443  443                   * 2 level case
 444  444                   */
 445  445                  chunkoff = i & ANON_CHUNK_OFF;
 446  446                  while (i < size) {
 447  447                          app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
 448  448                          if (app)
 449  449                                  for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
 450  450                                          ap = (struct anon *)
 451  451                                              ((uintptr_t)app[j] & ANON_PTRMASK);
 452  452                                          if (ap) {
 453  453                                                  *index = i + (j - chunkoff);
 454  454                                                  return (ap);
 455  455                                          }
 456  456                                  }
 457  457                          chunkoff = 0;
 458  458                          i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
 459  459                  }
 460  460          }
 461  461          *index = size;
 462  462          return (NULL);
 463  463  }
 464  464  
/*
 * Set list entry with a given pointer for a specified offset
 *
 * Stores ap in slot an_idx of the anon array.  In the 2 level case the
 * second level chunk holding the slot is allocated on demand.
 *
 *   flags - ANON_NOSLEEP selects a KM_NOSLEEP chunk allocation,
 *           otherwise KM_SLEEP is used.
 *
 * Returns 0 on success, or ENOMEM if the second level chunk could not be
 * allocated (the slot is left unmodified in that case).
 */
int
anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
{
        void            **ppp;
        struct anon     **app;
        int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
        uintptr_t       *ap_addr;

        ASSERT(an_idx < ahp->size);

        /*
         * Single level case.
         */
        if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
                ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
        } else {

                /*
                 * 2 level case.
                 */
                ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];

                ASSERT(ppp != NULL);
                /*
                 * The chunk pointer is first tested without the lock; only
                 * if it looks unallocated is serial_lock taken and the
                 * test repeated, so that a chunk is allocated at most once
                 * even if several threads get here for the same chunk.
                 */
                if (*ppp == NULL) {
                        mutex_enter(&ahp->serial_lock);
                        ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
                        if (*ppp == NULL) {
                                *ppp = kmem_zalloc(PAGESIZE, kmemflags);
                                if (*ppp == NULL) {
                                        mutex_exit(&ahp->serial_lock);
                                        return (ENOMEM);
                                }
                        }
                        mutex_exit(&ahp->serial_lock);
                }
                app = *ppp;
                ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
        }
        /*
         * Replace only the pointer bits of the slot; any bits of the
         * existing entry that lie outside ANON_PTRMASK are preserved.
         */
        *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
        return (0);
}
 509  509  
/*
 * Copy anon array into a given new anon array
 *
 * Copies npages entries from sahp, starting at s_idx, into dahp, starting
 * at d_idx.
 *
 *   flags - ANON_NOSLEEP selects KM_NOSLEEP for any second level chunk
 *           that has to be allocated in dahp, otherwise KM_SLEEP.
 *
 * Returns 0 on success or ENOMEM if a destination chunk could not be
 * allocated; entries copied before the failure remain in dahp.
 *
 * Note that the two bcopy() based paths copy the stored entries verbatim,
 * whereas the final mixed path goes through anon_get_ptr() and
 * anon_set_ptr() and therefore transfers only the ANON_PTRMASK bits.
 */
int
anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx,
        struct anon_hdr *dahp, ulong_t d_idx,
        pgcnt_t npages, int flags)
{
        void **sapp, **dapp;
        void *ap;
        int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

        ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
        ASSERT((npages <= sahp->size) && (npages <= dahp->size));

        /*
         * Both arrays are 1 level.
         */
        if (((sahp->size <= ANON_CHUNK_SIZE) &&
            (dahp->size <= ANON_CHUNK_SIZE)) ||
            ((sahp->flags & ANON_ALLOC_FORCE) &&
            (dahp->flags & ANON_ALLOC_FORCE))) {

                bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
                    npages * sizeof (struct anon *));
                return (0);
        }

        /*
         * Both arrays are 2 levels.
         */
        if (sahp->size > ANON_CHUNK_SIZE &&
            dahp->size > ANON_CHUNK_SIZE &&
            ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
            ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {

                ulong_t sapidx, dapidx;
                ulong_t *sap, *dap;
                ulong_t chknp;

                while (npages != 0) {

                        /*
                         * chknp is the largest run that can be copied
                         * without crossing a chunk boundary in either the
                         * source or the destination, capped at the number
                         * of pages still to copy.
                         */
                        sapidx = s_idx & ANON_CHUNK_OFF;
                        dapidx = d_idx & ANON_CHUNK_OFF;
                        chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
                        if (chknp > npages)
                                chknp = npages;

                        /*
                         * A missing source chunk has nothing to copy, so
                         * the run is skipped and no destination chunk is
                         * allocated for it.
                         */
                        sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
                        if ((sap = *sapp) != NULL) {
                                dapp = &dahp->array_chunk[d_idx
                                    >> ANON_CHUNK_SHIFT];
                                if ((dap = *dapp) == NULL) {
                                        *dapp = kmem_zalloc(PAGESIZE,
                                            kmemflags);
                                        if ((dap = *dapp) == NULL)
                                                return (ENOMEM);
                                }
                                /* chknp entries, converted to bytes */
                                bcopy((sap + sapidx), (dap + dapidx),
                                    chknp << ANON_PTRSHIFT);
                        }
                        s_idx += chknp;
                        d_idx += chknp;
                        npages -= chknp;
                }
                return (0);
        }

        /*
         * At least one of the arrays is 2 level.
         * The layouts differ, so copy one entry at a time; only non-NULL
         * source entries are written to the destination.
         */
        while (npages--) {
                if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
                        ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
                        if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
                                        return (ENOMEM);
                }
                s_idx++;
                d_idx++;
        }
        return (0);
}
 592  592  
 593  593  
 594  594  /*
 595  595   * ANON_INITBUF is a convenience macro for anon_grow() below. It
 596  596   * takes a buffer dst, which is at least as large as buffer src. It
 597  597   * does a bcopy from src into dst, and then bzeros the extra bytes
 598  598   * of dst. If tail is set, the data in src is tail aligned within
 599  599   * dst instead of head aligned.
 600  600   */
 601  601  
 602  602  #define ANON_INITBUF(src, srclen, dst, dstsize, tail)                         \
 603  603          if (tail) {                                                           \
 604  604                  bzero((dst), (dstsize) - (srclen));                           \
 605  605                  bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
 606  606          } else {                                                              \
 607  607                  bcopy((src), (dst), (srclen));                                \
 608  608                  bzero((char *)(dst) + (srclen), (dstsize) - (srclen));        \
 609  609          }
 610  610  
 611  611  #define ANON_1_LEVEL_INC        (ANON_CHUNK_SIZE / 8)
 612  612  #define ANON_2_LEVEL_INC        (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
 613  613  
 614  614  /*
 615  615   * anon_grow() is used to efficiently extend an existing anon array.
 616  616   * startidx_p points to the index into the anon array of the first page
 617  617   * that is in use. oldseg_pgs is the number of pages in use, starting at
 618  618   * *startidx_p. newpages is the number of additional pages desired.
 619  619   *
 620  620   * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
 621  621   *
 622  622   * The growth is done by creating a new top level of the anon array,
 623  623   * and (if the array is 2-level) reusing the existing second level arrays.
 624  624   *
 625  625   * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
 626  626   *
 627  627   * Returns the new number of pages in the anon array.
 628  628   */
 629  629  pgcnt_t
 630  630  anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs,
 631  631      pgcnt_t newseg_pgs, int flags)
 632  632  {
 633  633          ulong_t startidx = startidx_p ? *startidx_p : 0;
 634  634          pgcnt_t oldamp_pgs = ahp->size, newamp_pgs;
 635  635          pgcnt_t oelems, nelems, totpages;
 636  636          void **level1;
 637  637          int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
 638  638          int growdown = (flags & ANON_GROWDOWN);
 639  639          size_t newarrsz, oldarrsz;
 640  640          void *level2;
 641  641  
 642  642          ASSERT(!(startidx_p == NULL && growdown));
 643  643          ASSERT(startidx + oldseg_pgs <= ahp->size);
 644  644  
 645  645          /*
 646  646           * Determine the total number of pages needed in the new
 647  647           * anon array. If growing down, totpages is all pages from
 648  648           * startidx through the end of the array, plus <newseg_pgs>
 649  649           * pages. If growing up, keep all pages from page 0 through
 650  650           * the last page currently in use, plus <newseg_pgs> pages.
 651  651           */
 652  652          if (growdown)
 653  653                  totpages = oldamp_pgs - startidx + newseg_pgs;
 654  654          else
 655  655                  totpages = startidx + oldseg_pgs + newseg_pgs;
 656  656  
 657  657          /* If the array is already large enough, just return. */
 658  658  
 659  659          if (oldamp_pgs >= totpages) {
 660  660                  if (growdown)
 661  661                          *startidx_p = oldamp_pgs - totpages;
 662  662                  return (oldamp_pgs);
 663  663          }
 664  664  
 665  665          /*
 666  666           * oldamp_pgs/newamp_pgs are the total numbers of pages represented
 667  667           * by the corresponding arrays.
 668  668           * oelems/nelems are the number of pointers in the top level arrays
 669  669           * which may be either level 1 or level 2.
 670  670           * Will the new anon array be one level or two levels?
 671  671           */
 672  672          if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
 673  673                  newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
 674  674                  oelems = oldamp_pgs;
 675  675                  nelems = newamp_pgs;
 676  676          } else {
 677  677                  newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
 678  678                  oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
 679  679                  nelems = newamp_pgs >> ANON_CHUNK_SHIFT;
 680  680          }
 681  681  
 682  682          newarrsz = nelems * sizeof (void *);
 683  683          level1 = kmem_alloc(newarrsz, kmemflags);
 684  684          if (level1 == NULL)
 685  685                  return (0);
 686  686  
 687  687          /* Are we converting from a one level to a two level anon array? */
 688  688  
 689  689          if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE &&
 690  690              !(ahp->flags & ANON_ALLOC_FORCE)) {
 691  691  
 692  692                  /*
 693  693                   * Yes, we're converting to a two level. Reuse old level 1
 694  694                   * as new level 2 if it is exactly PAGESIZE. Otherwise
 695  695                   * alloc a new level 2 and copy the old level 1 data into it.
 696  696                   */
 697  697                  if (oldamp_pgs == ANON_CHUNK_SIZE) {
 698  698                          level2 = (void *)ahp->array_chunk;
 699  699                  } else {
 700  700                          level2 = kmem_alloc(PAGESIZE, kmemflags);
 701  701                          if (level2 == NULL) {
 702  702                                  kmem_free(level1, newarrsz);
 703  703                                  return (0);
 704  704                          }
 705  705                          oldarrsz = oldamp_pgs * sizeof (void *);
 706  706  
 707  707                          ANON_INITBUF(ahp->array_chunk, oldarrsz,
 708  708                              level2, PAGESIZE, growdown);
 709  709                          kmem_free(ahp->array_chunk, oldarrsz);
 710  710                  }
 711  711                  bzero(level1, newarrsz);
 712  712                  if (growdown)
 713  713                          level1[nelems - 1] = level2;
 714  714                  else
 715  715                          level1[0] = level2;
 716  716          } else {
 717  717                  oldarrsz = oelems * sizeof (void *);
 718  718  
 719  719                  ANON_INITBUF(ahp->array_chunk, oldarrsz,
 720  720                      level1, newarrsz, growdown);
 721  721                  kmem_free(ahp->array_chunk, oldarrsz);
 722  722          }
 723  723  
 724  724          ahp->array_chunk = level1;
 725  725          ahp->size = newamp_pgs;
 726  726          if (growdown)
 727  727                  *startidx_p = newamp_pgs - totpages;
 728  728  
 729  729          return (newamp_pgs);
 730  730  }
 731  731  
 732  732  
 733  733  /*
 734  734   * Called to sync ani_free value.
            *
            * Recomputes k_anoninfo.ani_free as the sum of ani_count over
            * ani_free_pool[0 .. max_cpu_seqid_ever].  Returns without doing
            * anything if ani_free_pool is NULL, and skips the recomputation
            * unless ddi_get_lbolt() has advanced past the value recorded by the
            * previous recomputation.
            *
            * NOTE(review): last_time is a function-local static that is read and
            * written here without taking any lock -- confirm that callers
            * tolerate concurrent invocations.
 735  735   */
 736  736  
 737  737  void
 738  738  set_anoninfo(void)
 739  739  {
 740  740          processorid_t   ix, max_seqid;
 741  741          pgcnt_t         total = 0;
 742  742          static clock_t  last_time;
 743  743          clock_t         new_time;
 744  744  
 745  745          if (ani_free_pool == NULL)
 746  746                  return;
 747  747  
 748  748          /*
 749  749           * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to
 750  750           * identify the maximum number of CPUs that were ever online.
 751  751           */
 752  752          new_time = ddi_get_lbolt();
 753  753          if (new_time > last_time) {
 754  754  
 755  755                  max_seqid = max_cpu_seqid_ever;
 756  756                  ASSERT(ANI_MAX_POOL > max_seqid);
                           /* Sum every per-slot counter, inclusive of max_seqid. */
 757  757                  for (ix = 0; ix <= max_seqid; ix++)
 758  758                          total += ani_free_pool[ix].ani_count;
 759  759  
 760  760                  last_time = new_time;
 761  761                  k_anoninfo.ani_free = total;
 762  762          }
 763  763  }
 764  764  
 765  765  /*
 766  766   * Reserve anon space.
 767  767   *
 768  768   * It's no longer simply a matter of incrementing ani_resv to
 769  769   * reserve swap space, we need to check memory-based as well
 770  770   * as disk-backed (physical) swap.  The following algorithm
 771  771   * is used:
 772  772   *      Check the space on physical swap
 773  773   *              i.e. amount needed < ani_max - ani_phys_resv
 774  774   *      If we are swapping on swapfs check
 775  775   *              amount needed < (availrmem - swapfs_minfree)
 776  776   * Since the algorithm to check for the quantity of swap space is
 777  777   * almost the same as that for reserving it, we'll just use anon_resvmem
 778  778   * with a flag to decrement availrmem.
 779  779   *
 780  780   * Return non-zero on success.
            *
            * size:     number of bytes wanted; rounded up to pages with btopr().
            * takemem:  if set, the reservation is committed (ani_phys_resv,
            *           ani_mem_resv and availrmem are updated); if clear, only
            *           the availability check is performed.
            * zone:     if non-NULL and takemem is set, the request is first
            *           charged to the zone with rctl_incr_swap(); that charge is
            *           backed out with rctl_decr_swap() if the reservation fails
            *           later on.
            * tryhard:  if set, page_reclaim_mem() is called before availrmem is
            *           examined.
            *
            * Returns 1 on success and 0 on failure.
 781  781   */
 782  782  int
 783  783  anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
 784  784  {
 785  785          pgcnt_t npages = btopr(size);
 786  786          pgcnt_t mswap_pages = 0;
 787  787          pgcnt_t pswap_pages = 0;
 788  788          proc_t *p = curproc;
 789  789  
 790  790          if (zone != NULL && takemem) {
 791  791                  /* test zone.max-swap resource control */
 792  792                  mutex_enter(&p->p_lock);
 793  793                  if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
 794  794                          mutex_exit(&p->p_lock);
 795  795                          return (0);
 796  796                  }
 797  797                  mutex_exit(&p->p_lock);
 798  798          }
 799  799          mutex_enter(&anoninfo_lock);
 800  800  
 801  801          /*
 802  802           * pswap_pages is the number of pages we can take from
 803  803           * physical (i.e. disk-backed) swap.
 804  804           */
 805  805          ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
 806  806          pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
 807  807  
 808  808          ANON_PRINT(A_RESV,
 809  809              ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
 810  810              npages, takemem, pswap_pages, (void *)caller()));
 811  811  
 812  812          if (npages <= pswap_pages) {
 813  813                  /*
 814  814                   * we have enough space on a physical swap
 815  815                   */
 816  816                  if (takemem)
 817  817                          k_anoninfo.ani_phys_resv += npages;
 818  818                  mutex_exit(&anoninfo_lock);
 819  819                  return (1);
 820  820          } else if (pswap_pages != 0) {
 821  821                  /*
 822  822                   * we have some space on a physical swap
 823  823                   */
 824  824                  if (takemem) {
 825  825                          /*
 826  826                           * use up remainder of phys swap
 827  827                           */
 828  828                          k_anoninfo.ani_phys_resv += pswap_pages;
 829  829                          ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
 830  830                  }
 831  831          }
 832  832          /*
 833  833           * since (npages > pswap_pages) we need mem swap
 834  834           * mswap_pages is the number of pages needed from availrmem
 835  835           */
 836  836          ASSERT(npages > pswap_pages);
 837  837          mswap_pages = npages - pswap_pages;
 838  838  
 839  839          ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
 840  840              mswap_pages));
 841  841  
 842  842          /*
 843  843           * priv processes can reserve memory as swap as long as availrmem
 844  844           * remains greater than swapfs_minfree; in the case of non-priv
 845  845           * processes, memory can be reserved as swap only if availrmem
 846  846           * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
 847  847           * swapfs_reserve amount of memswap is not available to non-priv
 848  848           * processes. This protects daemons such as automounter dying
 849  849           * as a result of application processes eating away almost entire
 850  850           * membased swap. This safeguard becomes useless if apps are run
 851  851           * with root access.
 852  852           *
 853  853           * swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
 854  854           *
 855  855           */
 856  856          if (tryhard) {
 857  857                  pgcnt_t floor_pages;
 858  858  
                           /*
                            * NOTE(review): this test uses
                            * secpolicy_resource_anon_mem() (non-zero selects the
                            * lower floor), whereas the availrmem check further
                            * down uses secpolicy_resource() == 0 -- confirm the
                            * two helpers are meant to differ.
                            */
 859  859                  if (secpolicy_resource_anon_mem(CRED())) {
 860  860                          floor_pages = swapfs_minfree;
 861  861                  } else {
 862  862                          floor_pages = swapfs_minfree + swapfs_reserve;
 863  863                  }
 864  864  
                           /*
                            * anoninfo_lock is dropped around page_reclaim_mem().
                            * Any physical swap taken above stays accounted in
                            * ani_phys_resv while the lock is not held; the result
                            * of the reclaim attempt is deliberately ignored and
                            * availrmem is simply re-tested below.
                            */
 865  865                  mutex_exit(&anoninfo_lock);
 866  866                  (void) page_reclaim_mem(mswap_pages, floor_pages, 0);
 867  867                  mutex_enter(&anoninfo_lock);
 868  868          }
 869  869  
                   /*
                    * freemem_lock is taken while anoninfo_lock is held; availrmem
                    * is only examined and modified under freemem_lock here.
                    */
 870  870          mutex_enter(&freemem_lock);
 871  871          if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
 872  872              (availrmem > (swapfs_minfree + mswap_pages) &&
 873  873              secpolicy_resource(CRED()) == 0)) {
 874  874  
 875  875                  if (takemem) {
 876  876                          /*
 877  877                           * Take the memory from the rest of the system.
 878  878                           */
 879  879                          availrmem -= mswap_pages;
 880  880                          mutex_exit(&freemem_lock);
 881  881                          k_anoninfo.ani_mem_resv += mswap_pages;
 882  882                          ANI_ADD(mswap_pages);
 883  883                          ANON_PRINT((A_RESV | A_MRESV),
 884  884                              ("anon_resvmem: took %ld pages of availrmem\n",
 885  885                              mswap_pages));
 886  886                  } else {
 887  887                          mutex_exit(&freemem_lock);
 888  888                  }
 889  889  
 890  890                  ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
 891  891                  mutex_exit(&anoninfo_lock);
 892  892                  return (1);
 893  893          } else {
 894  894                  /*
 895  895                   * Fail if not enough memory
 896  896                   */
                           /*
                            * Back out the partial physical swap reservation made
                            * above (pswap_pages may be 0, making this a no-op).
                            */
 897  897                  if (takemem) {
 898  898                          k_anoninfo.ani_phys_resv -= pswap_pages;
 899  899                  }
 900  900  
 901  901                  mutex_exit(&freemem_lock);
 902  902                  mutex_exit(&anoninfo_lock);
 903  903                  ANON_PRINT(A_RESV,
 904  904                      ("anon_resvmem: not enough space from swapfs\n"));
                           /* Undo the zone charge taken at the top of the function. */
 905  905                  if (zone != NULL && takemem)
 906  906                          rctl_decr_swap(zone, ptob(npages));
 907  907                  return (0);
 908  908          }
 909  909  }
 910  910  
 911  911  /*
 912  912   * Give back an anon reservation.
            *
            * size is in bytes and is rounded up to pages with btopr().  If zone
            * is non-NULL the same amount is first returned to the zone with
            * rctl_decr_swap().
            *
            * Under anoninfo_lock, up to (ani_mem_resv - ani_locked_swap) pages
            * are returned to availrmem (under freemem_lock) and removed from
            * ani_mem_resv; whatever is left of npages is subtracted from
            * ani_phys_resv.
 913  913   */
 914  914  void
 915  915  anon_unresvmem(size_t size, zone_t *zone)
 916  916  {
 917  917          pgcnt_t npages = btopr(size);
 918  918          spgcnt_t mem_free_pages = 0;
 919  919          pgcnt_t phys_free_slots;
 920  920  #ifdef  ANON_DEBUG
 921  921          pgcnt_t mem_resv;
 922  922  #endif
 923  923          if (zone != NULL)
 924  924                  rctl_decr_swap(zone, ptob(npages));
 925  925  
 926  926          mutex_enter(&anoninfo_lock);
 927  927  
 928  928          ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
 929  929  
 930  930          /*
 931  931           * If some of this reservation belonged to swapfs
 932  932           * give it back to availrmem.
 933  933           * ani_mem_resv is the amount of availrmem swapfs has reserved.
 934  934           * but some of that memory could be locked by segspt so we can only
 935  935           * return non locked ani_mem_resv back to availrmem
 936  936           */
 937  937          if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
                           /*
                            * NOTE(review): the count printed here is
                            * MIN(ani_mem_resv, npages), but the amount actually
                            * returned just below is bounded by
                            * (ani_mem_resv - ani_locked_swap), so the message can
                            * overstate the growth.
                            */
 938  938                  ANON_PRINT((A_RESV | A_MRESV),
 939  939                      ("anon_unresv: growing availrmem by %ld pages\n",
 940  940                      MIN(k_anoninfo.ani_mem_resv, npages)));
 941  941  
 942  942                  mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
 943  943                      k_anoninfo.ani_locked_swap), npages);
 944  944                  mutex_enter(&freemem_lock);
 945  945                  availrmem += mem_free_pages;
 946  946                  mutex_exit(&freemem_lock);
 947  947                  k_anoninfo.ani_mem_resv -= mem_free_pages;
 948  948  
 949  949                  ANI_ADD(-mem_free_pages);
 950  950          }
 951  951          /*
 952  952           * The remainder of the pages is returned to phys swap
 953  953           */
 954  954          ASSERT(npages >= mem_free_pages);
 955  955          phys_free_slots = npages - mem_free_pages;
 956  956  
 957  957          if (phys_free_slots) {
 958  958                  k_anoninfo.ani_phys_resv -= phys_free_slots;
 959  959          }
 960  960  
                   /*
                    * Snapshot ani_mem_resv while anoninfo_lock is still held so the
                    * debug message below can be printed after the lock is dropped.
                    */
 961  961  #ifdef  ANON_DEBUG
 962  962          mem_resv = k_anoninfo.ani_mem_resv;
 963  963  #endif
 964  964  
 965  965          ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
 966  966          ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
 967  967  
 968  968          mutex_exit(&anoninfo_lock);
 969  969  
 970  970          ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
 971  971              npages, mem_resv, (void *)caller()));
 972  972  }
 973  973  
 974  974  /*
 975  975   * Allocate an anon slot and return it with the lock held.
            *
            * The slot comes from anon_cache (KM_SLEEP).  If vp is NULL,
            * swap_alloc() is called on the new slot (presumably it assigns
            * an_vp/an_off, which are read right afterwards -- confirm);
            * otherwise an_vp/an_off are set from vp/off.  The slot is returned
            * with an_refcnt == 1, no physical swap assigned (an_pvp NULL,
            * an_poff 0), entered in the anon hash, and with one subtracted via
            * ANI_ADD(-1).
            *
            * NOTE(review): despite the first sentence above, the only lock taken
            * in this function (the AH_MUTEX hash mutex) is released before
            * returning -- confirm which lock, if any, that sentence refers to.
 976  976   */
 977  977  struct anon *
 978  978  anon_alloc(struct vnode *vp, anoff_t off)
 979  979  {
 980  980          struct anon     *ap;
 981  981          kmutex_t        *ahm;
 982  982  
 983  983          ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
 984  984          if (vp == NULL) {
 985  985                  swap_alloc(ap);
 986  986          } else {
 987  987                  ap->an_vp = vp;
 988  988                  ap->an_off = off;
 989  989          }
 990  990          ap->an_refcnt = 1;
 991  991          ap->an_pvp = NULL;
 992  992          ap->an_poff = 0;
                   /* The hash mutex is held only around the hash insertion. */
 993  993          ahm = AH_MUTEX(ap->an_vp, ap->an_off);
 994  994          mutex_enter(ahm);
 995  995          anon_addhash(ap);
 996  996          mutex_exit(ahm);
 997  997          ANI_ADD(-1);
                   /*
                    * ap has already been dereferenced above, so the "ap ?" test in
                    * the debug message is redundant.
                    */
 998  998          ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
 999  999              (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
1000 1000          return (ap);
1001 1001  }
1002 1002  
1003 1003  /*
1004 1004   * Called for pages locked in memory via softlock/pagelock/mlock to make sure
1005 1005   * such pages don't consume any physical swap resources needed for swapping
1006 1006   * unlocked pages.
            *
            * ap is the anon slot and pp the page backing it; the caller must hold
            * pp locked and pp must be the swapfs page named by ap (see the
            * assertions below).  If the slot has a physical swap location
            * (an_pvp != NULL) it is released with swap_phys_free(), an_pvp/an_poff
            * are cleared and hat_setmod() is called on the page.
1007 1007   */
1008 1008  void
1009 1009  anon_swap_free(struct anon *ap, page_t *pp)
1010 1010  {
1011 1011          kmutex_t *ahm;
1012 1012  
1013 1013          ASSERT(ap != NULL);
1014 1014          ASSERT(pp != NULL);
1015 1015          ASSERT(PAGE_LOCKED(pp));
1016 1016          ASSERT(pp->p_vnode != NULL);
1017 1017          ASSERT(IS_SWAPFSVP(pp->p_vnode));
1018 1018          ASSERT(ap->an_refcnt != 0);
1019 1019          ASSERT(pp->p_vnode == ap->an_vp);
1020 1020          ASSERT(pp->p_offset == ap->an_off);
1021 1021  
                   /* Unlocked fast path: nothing to free if no physical slot is set. */
1022 1022          if (ap->an_pvp == NULL)
1023 1023                  return;
1024 1024  
                   /*
                    * Take the page i/o lock first, then the anon hash mutex, and
                    * re-check an_pvp under the mutex since the test above was made
                    * without it.
                    */
1025 1025          page_io_lock(pp);
1026 1026          ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1027 1027          mutex_enter(ahm);
1028 1028  
1029 1029          ASSERT(ap->an_refcnt != 0);
1030 1030          ASSERT(pp->p_vnode == ap->an_vp);
1031 1031          ASSERT(pp->p_offset == ap->an_off);
1032 1032  
1033 1033          if (ap->an_pvp != NULL) {
1034 1034                  swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1035 1035                  ap->an_pvp = NULL;
1036 1036                  ap->an_poff = 0;
1037 1037                  mutex_exit(ahm);
                           /*
                            * Presumably marks the page modified so that it is
                            * written out again now that its backing slot is gone
                            * -- confirm against hat_setmod().
                            */
1038 1038                  hat_setmod(pp);
1039 1039          } else {
1040 1040                  mutex_exit(ahm);
1041 1041          }
1042 1042          page_io_unlock(pp);
1043 1043  }
1044 1044  
1045 1045  /*
1046 1046   * Decrement the reference count of an anon page.
1047 1047   * If reference count goes to zero, free it and
1048 1048   * its associated page (if any).
            *
            * The decrement is done under the slot's AH_MUTEX hash mutex.  When
            * the count reaches zero the slot is removed from the anon hash, any
            * physical swap it holds is released with swap_phys_free(), the slot
            * is returned to anon_cache and ANI_ADD(1) is applied.
1049 1049   */
1050 1050  void
1051 1051  anon_decref(struct anon *ap)
1052 1052  {
1053 1053          page_t *pp;
1054 1054          struct vnode *vp;
1055 1055          anoff_t off;
1056 1056          kmutex_t *ahm;
1057 1057  
1058 1058          ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1059 1059          mutex_enter(ahm);
                   /*
                    * The same condition is checked twice: by ASSERT and by an
                    * explicit panic (presumably so it is still caught when ASSERT
                    * is compiled out -- confirm).
                    */
1060 1060          ASSERT(ap->an_refcnt != 0);
1061 1061          if (ap->an_refcnt == 0)
1062 1062                  panic("anon_decref: slot count 0");
1063 1063          if (--ap->an_refcnt == 0) {
                           /*
                            * Capture the <vp, off> identity of the slot before the
                            * hash mutex is dropped; it is used for the page lookup
                            * below.
                            */
1064 1064                  swap_xlate(ap, &vp, &off);
1065 1065                  anon_rmhash(ap);
1066 1066                  if (ap->an_pvp != NULL)
1067 1067                          swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1068 1068                  mutex_exit(ahm);
1069 1069  
1070 1070                  /*
1071 1071                   * If there is a page for this anon slot we will need to
1072 1072                   * call VN_DISPOSE to get rid of the vp association and
1073 1073                   * put the page back on the free list as really free.
1074 1074                   * Acquire the "exclusive" lock to ensure that any
1075 1075                   * pending i/o always completes before the swap slot
1076 1076                   * is freed.
1077 1077                   */
1078 1078                  pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
1079 1079                  if (pp != NULL) {
1080 1080                          /*LINTED: constant in conditional context */
1081 1081                          VN_DISPOSE(pp, B_INVAL, 0, kcred);
1082 1082                  }
                           /* ap is still valid here; it is freed just below. */
1083 1083                  ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
1084 1084                      (void *)ap, (void *)ap->an_vp));
1085 1085  
1086 1086                  kmem_cache_free(anon_cache, ap);
1087 1087  
1088 1088                  ANI_ADD(1);
1089 1089          } else {
1090 1090                  mutex_exit(ahm);
1091 1091          }
1092 1092  }
1093 1093  
1094 1094  
1095 1095  /*
1096 1096   * check an_refcnt of the root anon slot (anon_index argument is aligned at
1097 1097   * seg->s_szc level) to determine whether COW processing is required.
1098 1098   * anonpages_hash_lock[] held on the root ap ensures that if root's
1099 1099   * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
1100 1100   * later since this process can't fork while its AS lock is held).
1101 1101   *
1102 1102   * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0.
            * Also returns 0 if there is no anon slot at anon_index.
1103 1103   */
1104 1104  int
1105 1105  anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
1106 1106  {
1107 1107          struct anon     *ap;
1108 1108          kmutex_t        *ahmpages = NULL;
1109 1109  
1110 1110          ap = anon_get_ptr(ahp, anon_index);
1111 1111          if (ap == NULL)
1112 1112                  return (0);
1113 1113  
                   /*
                    * Note that this is the APH_MUTEX ("pages") mutex, not the
                    * AH_MUTEX hash mutex; an_refcnt is sampled while it is held.
                    */
1114 1114          ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1115 1115          mutex_enter(ahmpages);
1116 1116          ASSERT(ap->an_refcnt >= 1);
1117 1117          if (ap->an_refcnt == 1) {
1118 1118                  mutex_exit(ahmpages);
1119 1119                  return (0);
1120 1120          }
1121 1121          mutex_exit(ahmpages);
1122 1122          return (1);
1123 1123  }
1124 1124  /*
1125 1125   * Check 'nslots' anon slots for refcnt > 1.
1126 1126   *
1127 1127   * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
1128 1128   * returns 0.
            *
            * The slots examined are anon_index .. anon_index + nslots - 1 of ahp;
            * empty (NULL) slots are skipped.  No lock is taken here, so an_refcnt
            * is read without the hash mutex.
1129 1129   */
1130 1130  static int
1131 1131  anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
1132 1132  {
1133 1133          struct anon *ap;
1134 1134  
1135 1135          while (nslots-- > 0) {
1136 1136                  if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
1137 1137                      ap->an_refcnt > 1)
1138 1138                          return (1);
1139 1139                  anon_index++;
1140 1140          }
1141 1141  
1142 1142          return (0);
1143 1143  }
1144 1144  
           /*
            * Drop one reference from every anon slot in a large-page-sized region
            * of ahp and clear the corresponding array entries.
            *
            * The region starts at an_idx (which must be aligned to the page count
            * of size code szc) and covers page_get_pagecnt(szc) slots, clamped to
            * the end of the array.
            *
            * The root slot (at an_idx) decides which path is taken:
            *   - root missing or root an_refcnt == 1: every slot is expected to
            *     have refcnt 1, so each slot is torn down (removed from the hash,
            *     physical swap freed, slot returned to anon_cache) and its page,
            *     if any, is disposed of.  Constituents of a large page are
            *     gathered and released together.
            *   - root an_refcnt > 1: the APH_MUTEX of the root is kept held for
            *     the whole loop and each slot's refcnt is merely decremented;
            *     pages are left alone.
            */
1145 1145  static void
1146 1146  anon_decref_pages(
1147 1147          struct anon_hdr *ahp,
1148 1148          ulong_t an_idx,
1149 1149          uint_t szc)
1150 1150  {
1151 1151          struct anon *ap = anon_get_ptr(ahp, an_idx);
1152 1152          kmutex_t *ahmpages = NULL;
1153 1153          page_t *pp;
1154 1154          pgcnt_t pgcnt = page_get_pagecnt(szc);
1155 1155          pgcnt_t i;
1156 1156          struct vnode *vp;
1157 1157          anoff_t   off;
1158 1158          kmutex_t *ahm;
1159 1159  #ifdef DEBUG
1160 1160          int refcnt = 1;
1161 1161  #endif
1162 1162  
1163 1163          ASSERT(szc != 0);
1164 1164          ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1165 1165          ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1166 1166          ASSERT(an_idx < ahp->size);
1167 1167  
1168 1168          if (ahp->size - an_idx < pgcnt) {
1169 1169                  /*
1170 1170                   * In case of shared mappings total anon map size may not be
1171 1171                   * the largest page size aligned.
1172 1172                   */
1173 1173                  pgcnt = ahp->size - an_idx;
1174 1174          }
1175 1175  
1176 1176          VM_STAT_ADD(anonvmstats.decrefpages[0]);
1177 1177  
                   /*
                    * Sample the root slot's refcnt under APH_MUTEX.  The mutex is
                    * dropped again (and ahmpages reset to NULL) only when the count
                    * is 1; from here on "ahmpages != NULL" means "region is shared".
                    */
1178 1178          if (ap != NULL) {
1179 1179                  ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1180 1180                  mutex_enter(ahmpages);
                           /* The assignment lives inside ASSERT; refcnt is DEBUG-only. */
1181 1181                  ASSERT((refcnt = ap->an_refcnt) != 0);
1182 1182                  VM_STAT_ADD(anonvmstats.decrefpages[1]);
1183 1183                  if (ap->an_refcnt == 1) {
1184 1184                          VM_STAT_ADD(anonvmstats.decrefpages[2]);
1185 1185                          ASSERT(!anon_share(ahp, an_idx, pgcnt));
1186 1186                          mutex_exit(ahmpages);
1187 1187                          ahmpages = NULL;
1188 1188                  }
1189 1189          }
1190 1190  
1191 1191          i = 0;
1192 1192          while (i < pgcnt) {
1193 1193                  if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
1194 1194                          ASSERT(refcnt == 1 && ahmpages == NULL);
1195 1195                          i++;
1196 1196                          continue;
1197 1197                  }
1198 1198                  ASSERT(ap->an_refcnt == refcnt);
1199 1199                  ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1200 1200                  ASSERT(ahmpages == NULL || ap->an_refcnt > 1);
1201 1201  
1202 1202                  if (ahmpages == NULL) {
1203 1203                          swap_xlate(ap, &vp, &off);
1204 1204                          pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
1205 1205                          if (pp == NULL || pp->p_szc == 0) {
                                           /*
                                            * No page, or a small page: tear down
                                            * this single slot.
                                            */
1206 1206                                  VM_STAT_ADD(anonvmstats.decrefpages[3]);
1207 1207                                  ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1208 1208                                  (void) anon_set_ptr(ahp, an_idx + i, NULL,
1209 1209                                      ANON_SLEEP);
1210 1210                                  mutex_enter(ahm);
1211 1211                                  ap->an_refcnt--;
1212 1212                                  ASSERT(ap->an_refcnt == 0);
1213 1213                                  anon_rmhash(ap);
1214 1214                                  if (ap->an_pvp)
1215 1215                                          swap_phys_free(ap->an_pvp, ap->an_poff,
1216 1216                                              PAGESIZE);
1217 1217                                  mutex_exit(ahm);
                                           /*
                                            * Look the page up a second time if none
                                            * was found before the slot was unhashed.
                                            */
1218 1218                                  if (pp == NULL) {
1219 1219                                          pp = page_lookup(vp, (u_offset_t)off,
1220 1220                                              SE_EXCL);
1221 1221                                          ASSERT(pp == NULL || pp->p_szc == 0);
1222 1222                                  }
1223 1223                                  if (pp != NULL) {
1224 1224                                          VM_STAT_ADD(anonvmstats.decrefpages[4]);
1225 1225                                          /*LINTED*/
1226 1226                                          VN_DISPOSE(pp, B_INVAL, 0, kcred);
1227 1227                                  }
1228 1228                                  kmem_cache_free(anon_cache, ap);
1229 1229                                  ANI_ADD(1);
1230 1230                                  i++;
1231 1231                          } else {
                                           /*
                                            * Large page: collect all curpgcnt
                                            * constituent pages into ppa[] (ppa[0] is
                                            * the page already looked up), tear down
                                            * their slots, then release the pages as
                                            * a group.
                                            */
1232 1232                                  pgcnt_t j;
1233 1233                                  pgcnt_t curpgcnt =
1234 1234                                      page_get_pagecnt(pp->p_szc);
1235 1235                                  size_t ppasize = curpgcnt * sizeof (page_t *);
1236 1236                                  page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
1237 1237                                  int dispose = 0;
1238 1238  
1239 1239                                  VM_STAT_ADD(anonvmstats.decrefpages[5]);
1240 1240  
1241 1241                                  ASSERT(pp->p_szc <= szc);
1242 1242                                  ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
1243 1243                                  ASSERT(IS_P2ALIGNED(i, curpgcnt));
1244 1244                                  ASSERT(i + curpgcnt <= pgcnt);
1245 1245                                  ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
1246 1246                                  ppa[0] = pp;
                                           /*
                                            * NOTE(review): this loop starts at
                                            * i + 1, so the first constituent is
                                            * neither passed to hat_pageunload() nor
                                            * checked for a non-fs_dispose an_pvp
                                            * here -- confirm that is intended.
                                            */
1247 1247                                  for (j = i + 1; j < i + curpgcnt; j++) {
1248 1248                                          ap = anon_get_ptr(ahp, an_idx + j);
1249 1249                                          ASSERT(ap != NULL &&
1250 1250                                              ap->an_refcnt == 1);
1251 1251                                          swap_xlate(ap, &vp, &off);
1252 1252                                          pp = page_lookup(vp, (u_offset_t)off,
1253 1253                                              SE_EXCL);
1254 1254                                          if (pp == NULL)
1255 1255                                                  panic("anon_decref_pages: "
1256 1256                                                      "no page");
1257 1257  
1258 1258                                          (void) hat_pageunload(pp,
1259 1259                                              HAT_FORCE_PGUNLOAD);
1260 1260                                          ASSERT(pp->p_szc == ppa[0]->p_szc);
1261 1261                                          ASSERT(page_pptonum(pp) - 1 ==
1262 1262                                              page_pptonum(ppa[j - i - 1]));
1263 1263                                          ppa[j - i] = pp;
1264 1264                                          if (ap->an_pvp != NULL &&
1265 1265                                              !vn_matchopval(ap->an_pvp,
1266 1266                                              VOPNAME_DISPOSE,
1267 1267                                              (fs_generic_func_p)fs_dispose))
1268 1268                                                  dispose = 1;
1269 1269                                  }
                                           /* Second pass: free every slot of the group. */
1270 1270                                  for (j = i; j < i + curpgcnt; j++) {
1271 1271                                          ap = anon_get_ptr(ahp, an_idx + j);
1272 1272                                          ASSERT(ap != NULL &&
1273 1273                                              ap->an_refcnt == 1);
1274 1274                                          ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1275 1275                                          (void) anon_set_ptr(ahp, an_idx + j,
1276 1276                                              NULL, ANON_SLEEP);
1277 1277                                          mutex_enter(ahm);
1278 1278                                          ap->an_refcnt--;
1279 1279                                          ASSERT(ap->an_refcnt == 0);
1280 1280                                          anon_rmhash(ap);
1281 1281                                          if (ap->an_pvp)
1282 1282                                                  swap_phys_free(ap->an_pvp,
1283 1283                                                      ap->an_poff, PAGESIZE);
1284 1284                                          mutex_exit(ahm);
1285 1285                                          kmem_cache_free(anon_cache, ap);
1286 1286                                          ANI_ADD(1);
1287 1287                                  }
                                           /*
                                            * Without the dispose flag the whole large
                                            * page is destroyed in one call; otherwise
                                            * every constituent is first reset to
                                            * p_szc 0 and then disposed individually.
                                            */
1288 1288                                  if (!dispose) {
1289 1289                                          VM_STAT_ADD(anonvmstats.decrefpages[6]);
1290 1290                                          page_destroy_pages(ppa[0]);
1291 1291                                  } else {
1292 1292                                          VM_STAT_ADD(anonvmstats.decrefpages[7]);
1293 1293                                          for (j = 0; j < curpgcnt; j++) {
1294 1294                                                  ASSERT(PAGE_EXCL(ppa[j]));
1295 1295                                                  ppa[j]->p_szc = 0;
1296 1296                                          }
1297 1297                                          for (j = 0; j < curpgcnt; j++) {
1298 1298                                                  ASSERT(!hat_page_is_mapped(
1299 1299                                                      ppa[j]));
1300 1300                                                  /*LINTED*/
1301 1301                                                  VN_DISPOSE(ppa[j], B_INVAL, 0,
1302 1302                                                      kcred);
1303 1303                                          }
1304 1304                                  }
1305 1305                                  kmem_free(ppa, ppasize);
1306 1306                                  i += curpgcnt;
1307 1307                          }
1308 1308                  } else {
                                   /*
                                    * Shared region: just clear the array entry and
                                    * drop this one reference; the slot stays alive.
                                    */
1309 1309                          VM_STAT_ADD(anonvmstats.decrefpages[8]);
1310 1310                          (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
1311 1311                          ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1312 1312                          mutex_enter(ahm);
1313 1313                          ap->an_refcnt--;
1314 1314                          mutex_exit(ahm);
1315 1315                          i++;
1316 1316                  }
1317 1317          }
1318 1318  
1319 1319          if (ahmpages != NULL) {
1320 1320                  mutex_exit(ahmpages);
1321 1321          }
1322 1322  }
1323 1323  
1324 1324  /*
1325 1325   * Duplicate references to size bytes worth of anon pages.
1326 1326   * Used when duplicating a segment that contains private anon pages.
1327 1327   * This code assumes that the procedure calling this one has already used
1328 1328   * hat_chgprot() to disable write access to the range of addresses that
1329 1329   * *old actually refers to.
            *
            * For every allocated slot found in the range of old that starts at
            * old_idx, the same anon pointer is stored at the corresponding
            * position in new (relative to new_idx) and its an_refcnt is
            * incremented under the slot's AH_MUTEX.  Empty slots in old are
            * skipped, leaving the matching entries of new untouched.
1330 1330   */
1331 1331  void
1332 1332  anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
1333 1333                          ulong_t new_idx, size_t size)
1334 1334  {
1335 1335          spgcnt_t npages;
1336 1336          kmutex_t *ahm;
1337 1337          struct anon *ap;
1338 1338          ulong_t off;
1339 1339          ulong_t index;
1340 1340  
1341 1341          npages = btopr(size);
1342 1342          while (npages > 0) {
                           /*
                            * anon_get_next_ptr() presumably advances index to the
                            * next allocated slot at or after old_idx and returns
                            * NULL when there is none -- confirm against its
                            * definition.
                            */
1343 1343                  index = old_idx;
1344 1344                  if ((ap = anon_get_next_ptr(old, &index)) == NULL)
1345 1345                          break;
1346 1346  
1347 1347                  ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
                           /*
                            * off is the number of slots skipped to reach the one
                            * found; stop if that slot lies beyond the requested
                            * range.
                            */
1348 1348                  off = index - old_idx;
1349 1349                  npages -= off;
1350 1350                  if (npages <= 0)
1351 1351                          break;
1352 1352  
1353 1353                  (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
1354 1354                  ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1355 1355  
1356 1356                  mutex_enter(ahm);
1357 1357                  ap->an_refcnt++;
1358 1358                  mutex_exit(ahm);
1359 1359  
                           /* Step both indices past the slot that was just duplicated. */
1360 1360                  off++;
1361 1361                  new_idx += off;
1362 1362                  old_idx += off;
1363 1363                  npages--;
1364 1364          }
1365 1365  }
1366 1366  
1367 1367  /*
1368 1368   * Just like anon_dup but also guarantees there are no holes (unallocated anon
1369 1369   * slots) within any large page region. That means if a large page region is
1370 1370   * empty in the old array it will skip it. If there are 1 or more valid slots
1371 1371   * in the large page region of the old array it will make sure to fill in any
1372 1372   * unallocated ones and also copy them to the new array. If noalloc is 1 large
1373 1373   * page region should either have no valid anon slots or all slots should be
1374 1374   * valid.
1375 1375   */
1376 1376  void
1377 1377  anon_dup_fill_holes(
1378 1378          struct anon_hdr *old,
1379 1379          ulong_t old_idx,
1380 1380          struct anon_hdr *new,
1381 1381          ulong_t new_idx,
1382 1382          size_t size,
1383 1383          uint_t szc,
1384 1384          int noalloc)
1385 1385  {
1386 1386          struct anon     *ap;
1387 1387          spgcnt_t        npages;
1388 1388          kmutex_t        *ahm, *ahmpages = NULL;
1389 1389          pgcnt_t         pgcnt, i;
1390 1390          ulong_t         index, off;
1391 1391  #ifdef DEBUG
1392 1392          int             refcnt;
1393 1393  #endif
1394 1394  
1395 1395          ASSERT(szc != 0);
1396 1396          pgcnt = page_get_pagecnt(szc);
1397 1397          ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1398 1398          npages = btopr(size);
1399 1399          ASSERT(IS_P2ALIGNED(npages, pgcnt));
1400 1400          ASSERT(IS_P2ALIGNED(old_idx, pgcnt));
1401 1401  
1402 1402          VM_STAT_ADD(anonvmstats.dupfillholes[0]);
1403 1403  
1404 1404          while (npages > 0) {
1405 1405                  index = old_idx;
1406 1406  
1407 1407                  /*
1408 1408                   * Find the next valid slot.
1409 1409                   */
1410 1410                  if (anon_get_next_ptr(old, &index) == NULL)
1411 1411                          break;
1412 1412  
1413 1413                  ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1414 1414                  /*
1415 1415                   * Now backup index to the beginning of the
1416 1416                   * current large page region of the old array.
1417 1417                   */
1418 1418                  index = P2ALIGN(index, pgcnt);
1419 1419                  off = index - old_idx;
1420 1420                  ASSERT(IS_P2ALIGNED(off, pgcnt));
1421 1421                  npages -= off;
1422 1422                  if (npages <= 0)
1423 1423                          break;
1424 1424  
1425 1425                  /*
1426 1426                   * Fill and copy a large page regions worth
1427 1427                   * of anon slots.
1428 1428                   */
1429 1429                  for (i = 0; i < pgcnt; i++) {
1430 1430                          if ((ap = anon_get_ptr(old, index + i)) == NULL) {
1431 1431                                  if (noalloc) {
1432 1432                                          panic("anon_dup_fill_holes: "
1433 1433                                              "empty anon slot\n");
1434 1434                                  }
1435 1435                                  VM_STAT_ADD(anonvmstats.dupfillholes[1]);
1436 1436                                  ap = anon_alloc(NULL, 0);
1437 1437                                  (void) anon_set_ptr(old, index + i, ap,
1438 1438                                      ANON_SLEEP);
1439 1439                          } else if (i == 0) {
1440 1440                                  /*
1441 1441                                   * make the increment of all refcnts of all
1442 1442                                   * anon slots of a large page appear atomic by
1443 1443                                   * getting an anonpages_hash_lock for the
1444 1444                                   * first anon slot of a large page.
1445 1445                                   */
1446 1446                                  VM_STAT_ADD(anonvmstats.dupfillholes[2]);
1447 1447  
1448 1448                                  ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1449 1449                                  mutex_enter(ahmpages);
1450 1450                                  /*LINTED*/
1451 1451                                  ASSERT(refcnt = ap->an_refcnt);
1452 1452  
1453 1453                                  VM_STAT_COND_ADD(ap->an_refcnt > 1,
1454 1454                                      anonvmstats.dupfillholes[3]);
1455 1455                          }
1456 1456                          (void) anon_set_ptr(new, new_idx + off + i, ap,
1457 1457                              ANON_SLEEP);
1458 1458                          ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1459 1459                          mutex_enter(ahm);
1460 1460                          ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1461 1461                          ASSERT(i == 0 || ahmpages == NULL ||
1462 1462                              refcnt == ap->an_refcnt);
1463 1463                          ap->an_refcnt++;
1464 1464                          mutex_exit(ahm);
1465 1465                  }
1466 1466                  if (ahmpages != NULL) {
1467 1467                          mutex_exit(ahmpages);
1468 1468                          ahmpages = NULL;
1469 1469                  }
1470 1470                  off += pgcnt;
1471 1471                  new_idx += off;
1472 1472                  old_idx += off;
1473 1473                  npages -= pgcnt;
1474 1474          }
1475 1475  }
1476 1476  
1477 1477  /*
1478 1478   * Used when a segment with a vnode changes szc. similarly to
1479 1479   * anon_dup_fill_holes() makes sure each large page region either has no anon
1480 1480   * slots or all of them. but new slots are created by COWing the file
1481 1481   * pages. on entrance no anon slots should be shared.
1482 1482   */
1483 1483  int
1484 1484  anon_fill_cow_holes(
1485 1485          struct seg *seg,
1486 1486          caddr_t addr,
1487 1487          struct anon_hdr *ahp,
1488 1488          ulong_t an_idx,
1489 1489          struct vnode *vp,
1490 1490          u_offset_t vp_off,
1491 1491          size_t size,
1492 1492          uint_t szc,
1493 1493          uint_t prot,
1494 1494          struct vpage vpage[],
1495 1495          struct cred *cred)
1496 1496  {
1497 1497          struct anon     *ap;
1498 1498          spgcnt_t        npages;
1499 1499          pgcnt_t         pgcnt, i;
1500 1500          ulong_t         index, off;
1501 1501          int             err = 0;
1502 1502          int             pageflags = 0;
1503 1503  
1504 1504          ASSERT(szc != 0);
1505 1505          pgcnt = page_get_pagecnt(szc);
1506 1506          ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1507 1507          npages = btopr(size);
1508 1508          ASSERT(IS_P2ALIGNED(npages, pgcnt));
1509 1509          ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1510 1510  
1511 1511          while (npages > 0) {
1512 1512                  index = an_idx;
1513 1513  
1514 1514                  /*
1515 1515                   * Find the next valid slot.
1516 1516                   */
1517 1517                  if (anon_get_next_ptr(ahp, &index) == NULL) {
1518 1518                          break;
1519 1519                  }
1520 1520  
1521 1521                  ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1522 1522                  /*
1523 1523                   * Now backup index to the beginning of the
1524 1524                   * current large page region of the anon array.
1525 1525                   */
1526 1526                  index = P2ALIGN(index, pgcnt);
1527 1527                  off = index - an_idx;
1528 1528                  ASSERT(IS_P2ALIGNED(off, pgcnt));
1529 1529                  npages -= off;
1530 1530                  if (npages <= 0)
1531 1531                          break;
1532 1532                  an_idx += off;
1533 1533                  vp_off += ptob(off);
1534 1534                  addr += ptob(off);
1535 1535                  if (vpage != NULL) {
1536 1536                          vpage += off;
1537 1537                  }
1538 1538  
1539 1539                  for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) {
1540 1540                          if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) {
1541 1541                                  page_t *pl[1 + 1];
1542 1542                                  page_t *pp;
1543 1543  
1544 1544                                  err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL,
1545 1545                                      pl, PAGESIZE, seg, addr, S_READ, cred,
1546 1546                                      NULL);
1547 1547                                  if (err) {
1548 1548                                          break;
1549 1549                                  }
1550 1550                                  if (vpage != NULL) {
1551 1551                                          prot = VPP_PROT(vpage);
1552 1552                                          pageflags = VPP_ISPPLOCK(vpage) ?
1553 1553                                              LOCK_PAGE : 0;
1554 1554                                  }
1555 1555                                  pp = anon_private(&ap, seg, addr, prot, pl[0],
1556 1556                                      pageflags, cred);
1557 1557                                  if (pp == NULL) {
1558 1558                                          err = ENOMEM;
1559 1559                                          break;
1560 1560                                  }
1561 1561                                  (void) anon_set_ptr(ahp, an_idx, ap,
1562 1562                                      ANON_SLEEP);
1563 1563                                  page_unlock(pp);
1564 1564                          }
1565 1565                          ASSERT(ap->an_refcnt == 1);
1566 1566                          addr += PAGESIZE;
1567 1567                          if (vpage != NULL) {
1568 1568                                  vpage++;
1569 1569                          }
1570 1570                  }
1571 1571                  npages -= pgcnt;
1572 1572          }
1573 1573  
1574 1574          return (err);
1575 1575  }
1576 1576  
1577 1577  /*
1578 1578   * Free a group of "size" anon pages, size in bytes,
1579 1579   * and clear out the pointers to the anon entries.
1580 1580   */
1581 1581  void
1582 1582  anon_free(struct anon_hdr *ahp, ulong_t index, size_t size)
1583 1583  {
1584 1584          spgcnt_t npages;
1585 1585          struct anon *ap;
1586 1586          ulong_t old;
1587 1587  
1588 1588          npages = btopr(size);
1589 1589  
1590 1590          while (npages > 0) {
1591 1591                  old = index;
1592 1592                  if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1593 1593                          break;
1594 1594  
1595 1595                  ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1596 1596                  npages -= index - old;
1597 1597                  if (npages <= 0)
1598 1598                          break;
1599 1599  
1600 1600                  (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP);
1601 1601                  anon_decref(ap);
1602 1602                  /*
1603 1603                   * Bump index and decrement page count
1604 1604                   */
1605 1605                  index++;
1606 1606                  npages--;
1607 1607          }
1608 1608  }
1609 1609  
1610 1610  void
1611 1611  anon_free_pages(
1612 1612          struct anon_hdr *ahp,
1613 1613          ulong_t an_idx,
1614 1614          size_t size,
1615 1615          uint_t szc)
1616 1616  {
1617 1617          spgcnt_t        npages;
1618 1618          pgcnt_t         pgcnt;
1619 1619          ulong_t         index, off;
1620 1620  
1621 1621          ASSERT(szc != 0);
1622 1622          pgcnt = page_get_pagecnt(szc);
1623 1623          ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1624 1624          npages = btopr(size);
1625 1625          ASSERT(IS_P2ALIGNED(npages, pgcnt));
1626 1626          ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1627 1627          ASSERT(an_idx < ahp->size);
1628 1628  
1629 1629          VM_STAT_ADD(anonvmstats.freepages[0]);
1630 1630  
1631 1631          while (npages > 0) {
1632 1632                  index = an_idx;
1633 1633  
1634 1634                  /*
1635 1635                   * Find the next valid slot.
1636 1636                   */
1637 1637                  if (anon_get_next_ptr(ahp, &index) == NULL)
1638 1638                          break;
1639 1639  
1640 1640                  ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1641 1641                  /*
1642 1642                   * Now backup index to the beginning of the
1643 1643                   * current large page region of the old array.
1644 1644                   */
1645 1645                  index = P2ALIGN(index, pgcnt);
1646 1646                  off = index - an_idx;
1647 1647                  ASSERT(IS_P2ALIGNED(off, pgcnt));
1648 1648                  npages -= off;
1649 1649                  if (npages <= 0)
1650 1650                          break;
1651 1651  
1652 1652                  anon_decref_pages(ahp, index, szc);
1653 1653  
1654 1654                  off += pgcnt;
1655 1655                  an_idx += off;
1656 1656                  npages -= pgcnt;
1657 1657          }
1658 1658  }
1659 1659  
1660 1660  /*
1661 1661   * Make anonymous pages discardable
            *
            * For each allocated anon slot in the "size" bytes worth of slots of
            * amp starting at "index", look up the backing page without blocking.
            * If the page can be locked SE_EXCL, is not locked down (p_lckcnt and
            * p_cowcnt are both zero) and its anon slot is not shared
            * (an_refcnt == 1), free the slot's physical swap space and dispose
            * of the page.  Pages that do not qualify are skipped and counted in
            * segadvstat.MADV_FREE_miss; disposed pages are counted in
            * segadvstat.MADV_FREE_hit.
            *
            * A large page is destroyed as a unit only when the remaining range
            * covers it entirely; otherwise an attempt is made to demote it and
            * only the current constituent page is freed.
            *
            * The caller must hold amp->a_rwlock as reader.
1662 1662   */
1663 1663  void
1664 1664  anon_disclaim(struct anon_map *amp, ulong_t index, size_t size)
1665 1665  {
1666 1666          spgcnt_t npages = btopr(size);
1667 1667          struct anon *ap;
1668 1668          struct vnode *vp;
1669 1669          anoff_t off;
1670 1670          page_t *pp, *root_pp;
1671 1671          kmutex_t *ahm;
1672 1672          pgcnt_t pgcnt;
1673 1673          ulong_t old_idx, idx, i;
1674 1674          struct anon_hdr *ahp = amp->ahp;
1675 1675          anon_sync_obj_t cookie;
1676 1676  
1677 1677          ASSERT(RW_READ_HELD(&amp->a_rwlock));
                   /*
                    * pgcnt drives the loop update below: with pgcnt == 1 index
                    * advances by one slot, otherwise it is rounded up to the next
                    * pgcnt boundary; npages drops by pgcnt either way.  Every
                    * "continue" therefore relies on the value pgcnt has at that
                    * point.
                    */
1678 1678          pgcnt = 1;
1679 1679          for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
1680 1680              P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {
1681 1681  
1682 1682                  /*
1683 1683                   * get anon pointer and index for the first valid entry
1684 1684                   * in the anon list, starting from "index"
1685 1685                   */
1686 1686                  old_idx = index;
1687 1687                  if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1688 1688                          break;
1689 1689  
1690 1690                  /*
1691 1691                   * decrement npages by number of NULL anon slots we skipped
1692 1692                   */
1693 1693                  npages -= index - old_idx;
1694 1694                  if (npages <= 0)
1695 1695                          break;
1696 1696  
                           /*
                            * Re-read the slot once the anon array entry is held;
                            * every exit from this iteration below pairs with an
                            * anon_array_exit().
                            */
1697 1697                  anon_array_enter(amp, index, &cookie);
1698 1698                  ap = anon_get_ptr(ahp, index);
1699 1699                  ASSERT(ap != NULL);
1700 1700  
1701 1701                  /*
1702 1702                   * Get anonymous page and try to lock it SE_EXCL;
1703 1703                   * if we couldn't grab the lock we skip to next page.
1704 1704                   */
1705 1705                  swap_xlate(ap, &vp, &off);
1706 1706                  pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
1707 1707                  if (pp == NULL) {
1708 1708                          segadvstat.MADV_FREE_miss.value.ul++;
1709 1709                          pgcnt = 1;
1710 1710                          anon_array_exit(&cookie);
1711 1711                          continue;
1712 1712                  }
1713 1713                  pgcnt = page_get_pagecnt(pp->p_szc);
1714 1714  
1715 1715                  /*
1716 1716                   * we cannot free a page which is permanently locked.
1717 1717                   * The page_struct_lock need not be acquired to examine
1718 1718                   * these fields since the page has an "exclusive" lock.
1719 1719                   */
1720 1720                  if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1721 1721                          page_unlock(pp);
1722 1722                          segadvstat.MADV_FREE_miss.value.ul++;
1723 1723                          anon_array_exit(&cookie);
1724 1724                          continue;
1725 1725                  }
1726 1726  
1727 1727                  ahm = AH_MUTEX(vp, off);
1728 1728                  mutex_enter(ahm);
1729 1729                  ASSERT(ap->an_refcnt != 0);
1730 1730                  /*
1731 1731                   * skip this one if copy-on-write is not yet broken.
1732 1732                   */
1733 1733                  if (ap->an_refcnt > 1) {
1734 1734                          mutex_exit(ahm);
1735 1735                          page_unlock(pp);
1736 1736                          segadvstat.MADV_FREE_miss.value.ul++;
1737 1737                          anon_array_exit(&cookie);
1738 1738                          continue;
1739 1739                  }
1740 1740  
                           /*
                            * Small (p_szc == 0) page: free it on its own.
                            */
1741 1741                  if (pp->p_szc == 0) {
1742 1742                          pgcnt = 1;
1743 1743  
1744 1744                          /*
1745 1745                           * free swap slot;
1746 1746                           */
1747 1747                          if (ap->an_pvp) {
1748 1748                                  swap_phys_free(ap->an_pvp, ap->an_poff,
1749 1749                                      PAGESIZE);
1750 1750                                  ap->an_pvp = NULL;
1751 1751                                  ap->an_poff = 0;
1752 1752                          }
1753 1753                          mutex_exit(ahm);
1754 1754                          segadvstat.MADV_FREE_hit.value.ul++;
1755 1755  
1756 1756                          /*
1757 1757                           * while we are at it, unload all the translations
1758 1758                           * and attempt to free the page.
1759 1759                           */
1760 1760                          (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1761 1761                          /*LINTED: constant in conditional context */
1762 1762                          VN_DISPOSE(pp, B_FREE, 0, kcred);
1763 1763                          anon_array_exit(&cookie);
1764 1764                          continue;
1765 1765                  }
1766 1766  
                           /*
                            * Large page that the remaining range does not fully
                            * cover (index is not pgcnt aligned, or fewer than pgcnt
                            * pages are left): try to demote it and, if that works,
                            * free only this constituent page.
                            */
1767 1767                  pgcnt = page_get_pagecnt(pp->p_szc);
1768 1768                  if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) {
1769 1769                          if (!page_try_demote_pages(pp)) {
1770 1770                                  mutex_exit(ahm);
1771 1771                                  page_unlock(pp);
1772 1772                                  segadvstat.MADV_FREE_miss.value.ul++;
1773 1773                                  anon_array_exit(&cookie);
1774 1774                                  continue;
1775 1775                          } else {
1776 1776                                  pgcnt = 1;
1777 1777                                  if (ap->an_pvp) {
1778 1778                                          swap_phys_free(ap->an_pvp,
1779 1779                                              ap->an_poff, PAGESIZE);
1780 1780                                          ap->an_pvp = NULL;
1781 1781                                          ap->an_poff = 0;
1782 1782                                  }
1783 1783                                  mutex_exit(ahm);
1784 1784                                  (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1785 1785                                  /*LINTED*/
1786 1786                                  VN_DISPOSE(pp, B_FREE, 0, kcred);
1787 1787                                  segadvstat.MADV_FREE_hit.value.ul++;
1788 1788                                  anon_array_exit(&cookie);
1789 1789                                  continue;
1790 1790                          }
1791 1791                  }
1792 1792                  mutex_exit(ahm);
1793 1793                  root_pp = pp;
1794 1794  
1795 1795                  /*
1796 1796                   * try to lock remaining pages
1797 1797                   */
1798 1798                  for (idx = 1; idx < pgcnt; idx++) {
1799 1799                          pp++;
1800 1800                          if (!page_trylock(pp, SE_EXCL))
1801 1801                                  break;
1802 1802                          if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1803 1803                                  page_unlock(pp);
1804 1804                                  break;
1805 1805                          }
1806 1806                  }
1807 1807  
                           /*
                            * idx == pgcnt means the loop above locked every
                            * constituent page.  Free the swap space of each slot
                            * and destroy the large page, unless one of the slots
                            * turns out to be shared.
                            */
1808 1808                  if (idx == pgcnt) {
1809 1809                          for (i = 0; i < pgcnt; i++) {
1810 1810                                  ap = anon_get_ptr(ahp, index + i);
1811 1811                                  if (ap == NULL)
1812 1812                                          break;
1813 1813                                  swap_xlate(ap, &vp, &off);
1814 1814                                  ahm = AH_MUTEX(vp, off);
1815 1815                                  mutex_enter(ahm);
1816 1816                                  ASSERT(ap->an_refcnt != 0);
1817 1817  
1818 1818                                  /*
1819 1819                                   * skip this one if copy-on-write
1820 1820                                   * is not yet broken.
1821 1821                                   */
1822 1822                                  if (ap->an_refcnt > 1) {
1823 1823                                          mutex_exit(ahm);
1824 1824                                          goto skiplp;
1825 1825                                  }
1826 1826                                  if (ap->an_pvp) {
1827 1827                                          swap_phys_free(ap->an_pvp,
1828 1828                                              ap->an_poff, PAGESIZE);
1829 1829                                          ap->an_pvp = NULL;
1830 1830                                          ap->an_poff = 0;
1831 1831                                  }
1832 1832                                  mutex_exit(ahm);
1833 1833                          }
1834 1834                          page_destroy_pages(root_pp);
1835 1835                          segadvstat.MADV_FREE_hit.value.ul += pgcnt;
1836 1836                          anon_array_exit(&cookie);
1837 1837                          continue;
1838 1838                  }
1839 1839  skiplp:
                           /*
                            * The large page is being skipped: unlock the first idx
                            * constituent pages, which are the ones still locked at
                            * this point.
                            */
1840 1840                  segadvstat.MADV_FREE_miss.value.ul += pgcnt;
1841 1841                  for (i = 0, pp = root_pp; i < idx; pp++, i++)
1842 1842                          page_unlock(pp);
1843 1843                  anon_array_exit(&cookie);
1844 1844          }
1845 1845  }
1846 1846  
1847 1847  /*
1848 1848   * Return the kept page(s) and protections back to the segment driver.
1849 1849   */
1850 1850  int
1851 1851  anon_getpage(
1852 1852          struct anon **app,
1853 1853          uint_t *protp,
1854 1854          page_t *pl[],
1855 1855          size_t plsz,
1856 1856          struct seg *seg,
1857 1857          caddr_t addr,
1858 1858          enum seg_rw rw,
1859 1859          struct cred *cred)
1860 1860  {
1861 1861          page_t *pp;
1862 1862          struct anon *ap = *app;
1863 1863          struct vnode *vp;
1864 1864          anoff_t off;
1865 1865          int err;
1866 1866          kmutex_t *ahm;
1867 1867  
1868 1868          swap_xlate(ap, &vp, &off);
1869 1869  
1870 1870          /*
1871 1871           * Lookup the page. If page is being paged in,
1872 1872           * wait for it to finish as we must return a list of
1873 1873           * pages since this routine acts like the VOP_GETPAGE
1874 1874           * routine does.
1875 1875           */
1876 1876          if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) {
1877 1877                  ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1878 1878                  mutex_enter(ahm);
1879 1879                  if (ap->an_refcnt == 1)
1880 1880                          *protp = PROT_ALL;
1881 1881                  else
1882 1882                          *protp = PROT_ALL & ~PROT_WRITE;
1883 1883                  mutex_exit(ahm);
1884 1884                  pl[0] = pp;
1885 1885                  pl[1] = NULL;
1886 1886                  return (0);
1887 1887          }
1888 1888  
1889 1889          /*
1890 1890           * Simply treat it as a vnode fault on the anon vp.
1891 1891           */
1892 1892  
1893 1893          TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
1894 1894              "anon_getpage:seg %x addr %x vp %x",
1895 1895              seg, addr, vp);
1896 1896  
1897 1897          err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
1898 1898              seg, addr, rw, cred, NULL);
1899 1899  
1900 1900          if (err == 0 && pl != NULL) {
1901 1901                  ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1902 1902                  mutex_enter(ahm);
1903 1903                  if (ap->an_refcnt != 1)
1904 1904                          *protp &= ~PROT_WRITE;  /* make read-only */
1905 1905                  mutex_exit(ahm);
1906 1906          }
1907 1907          return (err);
1908 1908  }
1909 1909  
1910 1910  /*
1911 1911   * Creates or returns kept pages to the segment driver.  returns -1 if a large
1912 1912   * page cannot be allocated. returns -2 if some other process has allocated a
1913 1913   * larger page.
1914 1914   *
1915 1915   * For cowfault it will allocate any size pages to fill the requested area to
1916 1916   * avoid partially overwriting anon slots (i.e. sharing only some of the anon
1917 1917   * slots within a large page with other processes). This policy greatly
1918 1918   * simplifies large page freeing (which is only freed when all anon slot
1919 1919   * refcnts are 0).
1920 1920   */
1921 1921  int
1922 1922  anon_map_getpages(
1923 1923          struct anon_map *amp,
1924 1924          ulong_t start_idx,
1925 1925          uint_t  szc,
1926 1926          struct seg *seg,
1927 1927          caddr_t addr,
1928 1928          uint_t prot,
1929 1929          uint_t *protp,
1930 1930          page_t  *ppa[],
1931 1931          uint_t  *ppa_szc,
1932 1932          struct vpage vpage[],
1933 1933          enum seg_rw rw,
1934 1934          int brkcow,
1935 1935          int anypgsz,
1936 1936          int pgflags,
1937 1937          struct cred *cred)
1938 1938  {
1939 1939          pgcnt_t         pgcnt;
1940 1940          struct anon     *ap;
1941 1941          struct vnode    *vp;
1942 1942          anoff_t         off;
1943 1943          page_t          *pp, *pl[2], *conpp = NULL;
1944 1944          caddr_t         vaddr;
1945 1945          ulong_t         pg_idx, an_idx, i;
1946 1946          spgcnt_t        nreloc = 0;
1947 1947          int             prealloc = 1;
1948 1948          int             err, slotcreate;
1949 1949          uint_t          vpprot;
1950 1950          int             upsize = (szc < seg->s_szc);
1951 1951  
1952 1952  #if !defined(__i386) && !defined(__amd64)
1953 1953          ASSERT(seg->s_szc != 0);
1954 1954  #endif
1955 1955          ASSERT(szc <= seg->s_szc);
1956 1956          ASSERT(ppa_szc != NULL);
1957 1957          ASSERT(rw != S_CREATE);
1958 1958  
1959 1959          *protp = PROT_ALL;
1960 1960  
1961 1961          VM_STAT_ADD(anonvmstats.getpages[0]);
1962 1962  
1963 1963          if (szc == 0) {
1964 1964                  VM_STAT_ADD(anonvmstats.getpages[1]);
1965 1965                  if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
1966 1966                          err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
1967 1967                              addr, rw, cred);
1968 1968                          if (err)
1969 1969                                  return (err);
1970 1970                          ppa[0] = pl[0];
1971 1971                          if (brkcow == 0 || (*protp & PROT_WRITE)) {
1972 1972                                  VM_STAT_ADD(anonvmstats.getpages[2]);
1973 1973                                  if (ppa[0]->p_szc != 0 && upsize) {
1974 1974                                          VM_STAT_ADD(anonvmstats.getpages[3]);
1975 1975                                          *ppa_szc = MIN(ppa[0]->p_szc,
1976 1976                                              seg->s_szc);
1977 1977                                          page_unlock(ppa[0]);
1978 1978                                          return (-2);
1979 1979                                  }
1980 1980                                  return (0);
1981 1981                          }
1982 1982                          panic("anon_map_getpages: cowfault for szc 0");
1983 1983                  } else {
1984 1984                          VM_STAT_ADD(anonvmstats.getpages[4]);
1985 1985                          ppa[0] = anon_zero(seg, addr, &ap, cred);
1986 1986                          if (ppa[0] == NULL)
1987 1987                                  return (ENOMEM);
1988 1988                          (void) anon_set_ptr(amp->ahp, start_idx, ap,
1989 1989                              ANON_SLEEP);
1990 1990                          return (0);
1991 1991                  }
1992 1992          }
1993 1993  
1994 1994          pgcnt = page_get_pagecnt(szc);
1995 1995          ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1996 1996          ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
1997 1997  
1998 1998          /*
1999 1999           * First we check for the case that the requested large
2000 2000           * page or larger page already exists in the system.
2001 2001           * Actually we only check if the first constituent page
2002 2002           * exists and only preallocate if it's not found.
2003 2003           */
2004 2004          ap = anon_get_ptr(amp->ahp, start_idx);
2005 2005          if (ap) {
2006 2006                  uint_t pszc;
2007 2007                  swap_xlate(ap, &vp, &off);
2008 2008                  if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
2009 2009                          if (pszc > szc && upsize) {
2010 2010                                  *ppa_szc = MIN(pszc, seg->s_szc);
2011 2011                                  return (-2);
2012 2012                          }
2013 2013                          if (pszc >= szc) {
2014 2014                                  prealloc = 0;
2015 2015                          }
2016 2016                  }
2017 2017          }
2018 2018  
2019 2019          VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
2020 2020          VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);
2021 2021  
2022 2022  top:
2023 2023          /*
2024 2024           * If a smaller page or no page at all was found,
2025 2025           * grab a large page off the freelist.
2026 2026           */
2027 2027          if (prealloc) {
2028 2028                  ASSERT(conpp == NULL);
2029 2029                  if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
2030 2030                      szc, 0, pgflags) != 0) {
2031 2031                          VM_STAT_ADD(anonvmstats.getpages[7]);
2032 2032                          if (brkcow == 0 || szc < seg->s_szc ||
2033 2033                              !anon_szcshare(amp->ahp, start_idx)) {
2034 2034                                  /*
2035 2035                                   * If the refcnt's of all anon slots are <= 1
2036 2036                                   * they can't increase since we are holding
2037 2037                                   * the address space's lock. So segvn can
2038 2038                                   * safely decrease szc without risking to
2039 2039                                   * generate a cow fault for the region smaller
2040 2040                                   * than the segment's largest page size.
2041 2041                                   */
2042 2042                                  VM_STAT_ADD(anonvmstats.getpages[8]);
2043 2043                                  return (-1);
2044 2044                          }
2045 2045                  docow:
2046 2046                          /*
2047 2047                           * This is a cow fault. Copy away the entire 1 large
2048 2048                           * page region of this segment.
2049 2049                           */
2050 2050                          if (szc != seg->s_szc)
2051 2051                                  panic("anon_map_getpages: cowfault for szc %d",
2052 2052                                      szc);
2053 2053                          vaddr = addr;
2054 2054                          for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
2055 2055                              pg_idx++, an_idx++, vaddr += PAGESIZE) {
2056 2056                                  if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
2057 2057                                      NULL) {
2058 2058                                          err = anon_getpage(&ap, &vpprot, pl,
2059 2059                                              PAGESIZE, seg, vaddr, rw, cred);
2060 2060                                          if (err) {
2061 2061                                                  for (i = 0; i < pg_idx; i++) {
2062 2062                                                          if ((pp = ppa[i]) !=
2063 2063                                                              NULL)
2064 2064                                                                  page_unlock(pp);
2065 2065                                                  }
2066 2066                                                  return (err);
2067 2067                                          }
2068 2068                                          ppa[pg_idx] = pl[0];
2069 2069                                  } else {
2070 2070                                          /*
2071 2071                                           * Since this is a cowfault we know
2072 2072                                           * that this address space has a
2073 2073                                           * parent or children which means
2074 2074                                           * anon_dup_fill_holes() has initialized
2075 2075                                           * all anon slots within a large page
2076 2076                                           * region that had at least one anon
2077 2077                                           * slot at the time of fork().
2078 2078                                           */
2079 2079                                          panic("anon_map_getpages: "
2080 2080                                              "cowfault but anon slot is empty");
2081 2081                                  }
2082 2082                          }
2083 2083                          VM_STAT_ADD(anonvmstats.getpages[9]);
2084 2084                          *protp = PROT_ALL;
2085 2085                          return (anon_map_privatepages(amp, start_idx, szc, seg,
2086 2086                              addr, prot, ppa, vpage, anypgsz, pgflags, cred));
2087 2087                  }
2088 2088          }
2089 2089  
2090 2090          VM_STAT_ADD(anonvmstats.getpages[10]);
2091 2091  
2092 2092          an_idx = start_idx;
2093 2093          pg_idx = 0;
2094 2094          vaddr = addr;
2095 2095          while (pg_idx < pgcnt) {
2096 2096                  slotcreate = 0;
2097 2097                  if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
2098 2098                          VM_STAT_ADD(anonvmstats.getpages[11]);
2099 2099                          /*
2100 2100                           * For us to have decided not to preallocate
2101 2101                           * would have meant that a large page
2102 2102                           * was found. Which also means that all of the
2103 2103                           * anon slots for that page would have been
2104 2104                           * already created for us.
2105 2105                           */
2106 2106                          if (prealloc == 0)
2107 2107                                  panic("anon_map_getpages: prealloc = 0");
2108 2108  
2109 2109                          slotcreate = 1;
2110 2110                          ap = anon_alloc(NULL, 0);
2111 2111                  }
2112 2112                  swap_xlate(ap, &vp, &off);
2113 2113  
2114 2114                  /*
2115 2115                   * Now setup our preallocated page to pass down
2116 2116                   * to swap_getpage().
2117 2117                   */
2118 2118                  if (prealloc) {
2119 2119                          ASSERT(ppa[pg_idx]->p_szc == szc);
2120 2120                          conpp = ppa[pg_idx];
2121 2121                  }
2122 2122                  ASSERT(prealloc || conpp == NULL);
2123 2123  
2124 2124                  /*
2125 2125                   * If we just created this anon slot then call
2126 2126                   * with S_CREATE to prevent doing IO on the page.
2127 2127                   * Similar to the anon_zero case.
2128 2128                   */
2129 2129                  err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
2130 2130                      NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
2131 2131                      slotcreate == 1 ? S_CREATE : rw, cred);
2132 2132  
2133 2133                  if (err) {
2134 2134                          ASSERT(err != -2 || upsize);
2135 2135                          VM_STAT_ADD(anonvmstats.getpages[12]);
2136 2136                          ASSERT(slotcreate == 0);
2137 2137                          goto io_err;
2138 2138                  }
2139 2139  
2140 2140                  pp = pl[0];
2141 2141  
2142 2142                  if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
2143 2143                          VM_STAT_ADD(anonvmstats.getpages[13]);
2144 2144                          ASSERT(slotcreate == 0);
2145 2145                          ASSERT(prealloc == 0);
2146 2146                          ASSERT(pg_idx == 0);
2147 2147                          if (pp->p_szc > szc) {
2148 2148                                  ASSERT(upsize);
2149 2149                                  *ppa_szc = MIN(pp->p_szc, seg->s_szc);
2150 2150                                  page_unlock(pp);
2151 2151                                  VM_STAT_ADD(anonvmstats.getpages[14]);
2152 2152                                  return (-2);
2153 2153                          }
2154 2154                          page_unlock(pp);
2155 2155                          prealloc = 1;
2156 2156                          goto top;
2157 2157                  }
2158 2158  
2159 2159                  /*
2160 2160                   * If we decided to preallocate but VOP_GETPAGE
2161 2161                   * found a page in the system that satisfies our
2162 2162                   * request then free up our preallocated large page
2163 2163                   * and continue looping across the existing large
2164 2164                   * page via VOP_GETPAGE.
2165 2165                   */
2166 2166                  if (prealloc && pp != ppa[pg_idx]) {
2167 2167                          VM_STAT_ADD(anonvmstats.getpages[15]);
2168 2168                          ASSERT(slotcreate == 0);
2169 2169                          ASSERT(pg_idx == 0);
2170 2170                          conpp = NULL;
2171 2171                          prealloc = 0;
2172 2172                          page_free_pages(ppa[0]);
2173 2173                  }
2174 2174  
2175 2175                  if (prealloc && nreloc > 1) {
2176 2176                          /*
2177 2177                           * we have relocated out of a smaller large page.
2178 2178                           * skip npgs - 1 iterations and continue which will
2179 2179                           * increment by one the loop indices.
2180 2180                           */
2181 2181                          spgcnt_t npgs = nreloc;
2182 2182  
2183 2183                          VM_STAT_ADD(anonvmstats.getpages[16]);
2184 2184  
2185 2185                          ASSERT(pp == ppa[pg_idx]);
2186 2186                          ASSERT(slotcreate == 0);
2187 2187                          ASSERT(pg_idx + npgs <= pgcnt);
2188 2188                          if ((*protp & PROT_WRITE) &&
2189 2189                              anon_share(amp->ahp, an_idx, npgs)) {
2190 2190                                  *protp &= ~PROT_WRITE;
2191 2191                          }
2192 2192                          pg_idx += npgs;
2193 2193                          an_idx += npgs;
2194 2194                          vaddr += PAGESIZE * npgs;
2195 2195                          continue;
2196 2196                  }
2197 2197  
2198 2198                  VM_STAT_ADD(anonvmstats.getpages[17]);
2199 2199  
2200 2200                  /*
2201 2201                   * Anon_zero case.
2202 2202                   */
2203 2203                  if (slotcreate) {
2204 2204                          ASSERT(prealloc);
2205 2205                          pagezero(pp, 0, PAGESIZE);
2206 2206                          CPU_STATS_ADD_K(vm, zfod, 1);
2207 2207                          hat_setrefmod(pp);
2208 2208                  }
2209 2209  
2210 2210                  ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
2211 2211                  ASSERT(prealloc != 0 || PAGE_SHARED(pp));
2212 2212                  ASSERT(prealloc == 0 || PAGE_EXCL(pp));
2213 2213  
2214 2214                  if (pg_idx > 0 &&
2215 2215                      ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
2216 2216                      (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
2217 2217                          panic("anon_map_getpages: unexpected page");
2218 2218                  } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
2219 2219                          panic("anon_map_getpages: unaligned page");
2220 2220                  }
2221 2221  
2222 2222                  if (prealloc == 0) {
2223 2223                          ppa[pg_idx] = pp;
2224 2224                  }
2225 2225  
2226 2226                  if (ap->an_refcnt > 1) {
2227 2227                          VM_STAT_ADD(anonvmstats.getpages[18]);
2228 2228                          *protp &= ~PROT_WRITE;
2229 2229                  }
2230 2230  
2231 2231                  /*
2232 2232                   * If this is a new anon slot then initialize
2233 2233                   * the anon array entry.
2234 2234                   */
2235 2235                  if (slotcreate) {
2236 2236                          (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2237 2237                  }
2238 2238                  pg_idx++;
2239 2239                  an_idx++;
2240 2240                  vaddr += PAGESIZE;
2241 2241          }
2242 2242  
2243 2243          /*
2244 2244           * Since preallocated pages come off the freelist
2245 2245           * they are locked SE_EXCL. Simply downgrade and return.
2246 2246           */
2247 2247          if (prealloc) {
2248 2248                  VM_STAT_ADD(anonvmstats.getpages[19]);
2249 2249                  conpp = NULL;
2250 2250                  for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2251 2251                          page_downgrade(ppa[pg_idx]);
2252 2252                  }
2253 2253          }
2254 2254          ASSERT(conpp == NULL);
2255 2255  
2256 2256          if (brkcow == 0 || (*protp & PROT_WRITE)) {
2257 2257                  VM_STAT_ADD(anonvmstats.getpages[20]);
2258 2258                  return (0);
2259 2259          }
2260 2260  
2261 2261          if (szc < seg->s_szc)
2262 2262                  panic("anon_map_getpages: cowfault for szc %d", szc);
2263 2263  
2264 2264          VM_STAT_ADD(anonvmstats.getpages[21]);
2265 2265  
2266 2266          *protp = PROT_ALL;
2267 2267          return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
2268 2268              ppa, vpage, anypgsz, pgflags, cred));
2269 2269  io_err:
2270 2270          /*
2271 2271           * We got an IO error somewhere in our large page.
2272 2272           * If we were using a preallocated page then just demote
2273 2273           * all the constituent pages that we've succeeded with so far
2274 2274           * to PAGESIZE pages and leave them in the system
2275 2275           * unlocked.
2276 2276           */
2277 2277  
2278 2278          ASSERT(err != -2 || ((pg_idx == 0) && upsize));
2279 2279  
2280 2280          VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
2281 2281          VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
2282 2282          VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);
2283 2283  
2284 2284          if (prealloc) {
2285 2285                  conpp = NULL;
2286 2286                  if (pg_idx > 0) {
2287 2287                          VM_STAT_ADD(anonvmstats.getpages[25]);
2288 2288                          for (i = 0; i < pgcnt; i++) {
2289 2289                                  pp = ppa[i];
2290 2290                                  ASSERT(PAGE_EXCL(pp));
2291 2291                                  ASSERT(pp->p_szc == szc);
2292 2292                                  pp->p_szc = 0;
2293 2293                          }
2294 2294                          for (i = 0; i < pg_idx; i++) {
2295 2295                                  ASSERT(!hat_page_is_mapped(ppa[i]));
2296 2296                                  page_unlock(ppa[i]);
2297 2297                          }
2298 2298                          /*
2299 2299                           * Now free up the remaining unused constituent
2300 2300                           * pages.
2301 2301                           */
2302 2302                          while (pg_idx < pgcnt) {
2303 2303                                  ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
2304 2304                                  page_free(ppa[pg_idx], 0);
2305 2305                                  pg_idx++;
2306 2306                          }
2307 2307                  } else {
2308 2308                          VM_STAT_ADD(anonvmstats.getpages[26]);
2309 2309                          page_free_pages(ppa[0]);
2310 2310                  }
2311 2311          } else {
2312 2312                  VM_STAT_ADD(anonvmstats.getpages[27]);
2313 2313                  ASSERT(err > 0);
2314 2314                  for (i = 0; i < pg_idx; i++)
2315 2315                          page_unlock(ppa[i]);
2316 2316          }
2317 2317          ASSERT(conpp == NULL);
2318 2318          if (err != -1)
2319 2319                  return (err);
2320 2320          /*
2321 2321           * we are here because we failed to relocate.
2322 2322           */
2323 2323          ASSERT(prealloc);
2324 2324          if (brkcow == 0 || szc < seg->s_szc ||
2325 2325              !anon_szcshare(amp->ahp, start_idx)) {
2326 2326                  VM_STAT_ADD(anonvmstats.getpages[28]);
2327 2327                  return (-1);
2328 2328          }
2329 2329          VM_STAT_ADD(anonvmstats.getpages[29]);
2330 2330          goto docow;
2331 2331  }
2332 2332  
2333 2333  
2334 2334  /*
2335 2335   * Turn a reference to an object or shared anon page
2336 2336   * into a private page with a copy of the data from the
2337 2337   * original page which is always locked by the caller.
2338 2338   * This routine unloads the translation and unlocks the
2339 2339   * original page, if it isn't being stolen, before returning
2340 2340   * to the caller.
2341 2341   *
2342 2342   * NOTE:  The original anon slot is not freed by this routine
2343 2343   *        It must be freed by the caller while holding the
2344 2344   *        "anon_map" lock to prevent races which can occur if
2345 2345   *        a process has multiple lwps in its address space.
2346 2346   */
2347 2347  page_t *
2348 2348  anon_private(
2349 2349          struct anon **app,
2350 2350          struct seg *seg,
2351 2351          caddr_t addr,
2352 2352          uint_t  prot,
2353 2353          page_t *opp,
2354 2354          int oppflags,
2355 2355          struct cred *cred)
2356 2356  {
2357 2357          struct anon *old = *app;
2358 2358          struct anon *new;
2359 2359          page_t *pp = NULL;
2360 2360          struct vnode *vp;
2361 2361          anoff_t off;
2362 2362          page_t *anon_pl[1 + 1];
2363 2363          int err;
2364 2364  
2365 2365          if (oppflags & STEAL_PAGE)
2366 2366                  ASSERT(PAGE_EXCL(opp));
2367 2367          else
2368 2368                  ASSERT(PAGE_LOCKED(opp));
2369 2369  
2370 2370          CPU_STATS_ADD_K(vm, cow_fault, 1);
2371 2371  
2372 2372          /* Kernel probe */
2373 2373          TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
2374 2374                  tnf_opaque,     address,        addr);
2375 2375  
2376 2376          *app = new = anon_alloc(NULL, 0);
2377 2377          swap_xlate(new, &vp, &off);
2378 2378  
2379 2379          if (oppflags & STEAL_PAGE) {
2380 2380                  page_rename(opp, vp, (u_offset_t)off);
2381 2381                  pp = opp;
2382 2382                  TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
2383 2383                      "anon_private:seg %p addr %x pp %p vp %p off %lx",
2384 2384                      seg, addr, pp, vp, off);
2385 2385                  hat_setmod(pp);
2386 2386  
2387 2387                  /* bug 4026339 */
2388 2388                  page_downgrade(pp);
2389 2389                  return (pp);
2390 2390          }
2391 2391  
2392 2392          /*
2393 2393           * Call the VOP_GETPAGE routine to create the page, thereby
2394 2394           * enabling the vnode driver to allocate any filesystem
2395 2395           * space (e.g., disk block allocation for UFS).  This also
2396 2396           * prevents more than one page from being added to the
2397 2397           * vnode at the same time.
2398 2398           */
2399 2399          err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
2400 2400              anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2401 2401          if (err)
2402 2402                  goto out;
2403 2403  
2404 2404          pp = anon_pl[0];
2405 2405  
2406 2406          /*
2407 2407           * If the original page was locked, we need to move the lock
2408 2408           * to the new page by transfering 'cowcnt/lckcnt' of the original
2409 2409           * page to 'cowcnt/lckcnt' of the new page.
2410 2410           *
2411 2411           * See Statement at the beginning of segvn_lockop() and
2412 2412           * comments in page_pp_useclaim() regarding the way
2413 2413           * cowcnts/lckcnts are handled.
2414 2414           *
2415 2415           * Also availrmem must be decremented up front for read only mapping
2416 2416           * before calling page_pp_useclaim. page_pp_useclaim will bump it back
2417 2417           * if availrmem did not need to be decremented after all.
2418 2418           */
2419 2419          if (oppflags & LOCK_PAGE) {
2420 2420                  if ((prot & PROT_WRITE) == 0) {
2421 2421                          mutex_enter(&freemem_lock);
2422 2422                          if (availrmem > pages_pp_maximum) {
2423 2423                                  availrmem--;
2424 2424                                  pages_useclaim++;
2425 2425                          } else {
2426 2426                                  mutex_exit(&freemem_lock);
2427 2427                                  goto out;
2428 2428                          }
2429 2429                          mutex_exit(&freemem_lock);
2430 2430                  }
2431 2431                  page_pp_useclaim(opp, pp, prot & PROT_WRITE);
2432 2432          }
2433 2433  
2434 2434          /*
2435 2435           * Now copy the contents from the original page,
2436 2436           * which is locked and loaded in the MMU by
2437 2437           * the caller to prevent yet another page fault.
2438 2438           */
2439 2439          /* XXX - should set mod bit in here */
2440 2440          if (ppcopy(opp, pp) == 0) {
2441 2441                  /*
2442 2442                   * Before ppcopy could hanlde UE or other faults, we
2443 2443                   * would have panicked here, and still have no option
2444 2444                   * but to do so now.
2445 2445                   */
2446 2446                  panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
2447 2447                      (void *)opp, (void *)pp);
2448 2448          }
2449 2449  
2450 2450          hat_setrefmod(pp);              /* mark as modified */
2451 2451  
2452 2452          /*
2453 2453           * Unload the old translation.
2454 2454           */
2455 2455          hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);
2456 2456  
2457 2457          /*
2458 2458           * Free unmapped, unmodified original page.
2459 2459           * or release the lock on the original page,
2460 2460           * otherwise the process will sleep forever in
2461 2461           * anon_decref() waiting for the "exclusive" lock
2462 2462           * on the page.
2463 2463           */
2464 2464          (void) page_release(opp, 1);
2465 2465  
2466 2466          /*
2467 2467           * we are done with page creation so downgrade the new
2468 2468           * page's selock to shared, this helps when multiple
2469 2469           * as_fault(...SOFTLOCK...) are done to the same
2470 2470           * page(aio)
2471 2471           */
2472 2472          page_downgrade(pp);
2473 2473  
2474 2474          /*
2475 2475           * NOTE:  The original anon slot must be freed by the
2476 2476           * caller while holding the "anon_map" lock, if we
2477 2477           * copied away from an anonymous page.
2478 2478           */
2479 2479          return (pp);
2480 2480  
2481 2481  out:
2482 2482          *app = old;
2483 2483          if (pp)
2484 2484                  page_unlock(pp);
2485 2485          anon_decref(new);
2486 2486          page_unlock(opp);
2487 2487          return ((page_t *)NULL);
2488 2488  }
2489 2489  
2490 2490  int
2491 2491  anon_map_privatepages(
2492 2492          struct anon_map *amp,
2493 2493          ulong_t start_idx,
2494 2494          uint_t  szc,
2495 2495          struct seg *seg,
2496 2496          caddr_t addr,
2497 2497          uint_t  prot,
2498 2498          page_t  *ppa[],
2499 2499          struct vpage vpage[],
2500 2500          int anypgsz,
2501 2501          int pgflags,
2502 2502          struct cred *cred)
2503 2503  {
2504 2504          pgcnt_t         pgcnt;
2505 2505          struct vnode    *vp;
2506 2506          anoff_t         off;
2507 2507          page_t          *pl[2], *conpp = NULL;
2508 2508          int             err;
2509 2509          int             prealloc = 1;
2510 2510          struct anon     *ap, *oldap;
2511 2511          caddr_t         vaddr;
2512 2512          page_t          *pplist, *pp;
2513 2513          ulong_t         pg_idx, an_idx;
2514 2514          spgcnt_t        nreloc = 0;
2515 2515          int             pagelock = 0;
2516 2516          kmutex_t        *ahmpages = NULL;
2517 2517  #ifdef DEBUG
2518 2518          int             refcnt;
2519 2519  #endif
2520 2520  
2521 2521          ASSERT(szc != 0);
2522 2522          ASSERT(szc == seg->s_szc);
2523 2523  
2524 2524          VM_STAT_ADD(anonvmstats.privatepages[0]);
2525 2525  
2526 2526          pgcnt = page_get_pagecnt(szc);
2527 2527          ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
2528 2528          ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
2529 2529  
2530 2530          ASSERT(amp != NULL);
2531 2531          ap = anon_get_ptr(amp->ahp, start_idx);
2532 2532          ASSERT(ap == NULL || ap->an_refcnt >= 1);
2533 2533  
2534 2534          VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);
2535 2535  
2536 2536          /*
2537 2537           * Now try and allocate the large page. If we fail then just
2538 2538           * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
2539 2539           * the caller make this decision but to avoid added complexity
2540 2540           * it's simpler to handle that case here.
2541 2541           */
2542 2542          if (anypgsz == -1) {
2543 2543                  VM_STAT_ADD(anonvmstats.privatepages[2]);
2544 2544                  prealloc = 0;
2545 2545          } else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
2546 2546              anypgsz, pgflags) != 0) {
2547 2547                  VM_STAT_ADD(anonvmstats.privatepages[3]);
2548 2548                  prealloc = 0;
2549 2549          }
2550 2550  
2551 2551          /*
2552 2552           * make the decrement of all refcnts of all
2553 2553           * anon slots of a large page appear atomic by
2554 2554           * getting an anonpages_hash_lock for the
2555 2555           * first anon slot of a large page.
2556 2556           */
2557 2557          if (ap != NULL) {
2558 2558                  ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
2559 2559                  mutex_enter(ahmpages);
2560 2560                  if (ap->an_refcnt == 1) {
2561 2561                          VM_STAT_ADD(anonvmstats.privatepages[4]);
2562 2562                          ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
2563 2563                          mutex_exit(ahmpages);
2564 2564  
2565 2565                          if (prealloc) {
2566 2566                                  page_free_replacement_page(pplist);
2567 2567                                  page_create_putback(pgcnt);
2568 2568                          }
2569 2569                          ASSERT(ppa[0]->p_szc <= szc);
2570 2570                          if (ppa[0]->p_szc == szc) {
2571 2571                                  VM_STAT_ADD(anonvmstats.privatepages[5]);
2572 2572                                  return (0);
2573 2573                          }
2574 2574                          for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2575 2575                                  ASSERT(ppa[pg_idx] != NULL);
2576 2576                                  page_unlock(ppa[pg_idx]);
2577 2577                          }
2578 2578                          return (-1);
2579 2579                  }
2580 2580          }
2581 2581  
2582 2582          /*
2583 2583           * If we are passed in the vpage array and this is
2584 2584           * not PROT_WRITE then we need to decrement availrmem
2585 2585           * up front before we try anything. If we need to and
2586 2586           * can't decrement availrmem then its better to fail now
2587 2587           * than in the middle of processing the new large page.
2588 2588           * page_pp_useclaim() on behalf of each constituent page
2589 2589           * below will adjust availrmem back for the cases not needed.
2590 2590           */
2591 2591          if (vpage != NULL && (prot & PROT_WRITE) == 0) {
2592 2592                  for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2593 2593                          if (VPP_ISPPLOCK(&vpage[pg_idx])) {
2594 2594                                  pagelock = 1;
2595 2595                                  break;
2596 2596                          }
2597 2597                  }
2598 2598                  if (pagelock) {
2599 2599                          VM_STAT_ADD(anonvmstats.privatepages[6]);
2600 2600                          mutex_enter(&freemem_lock);
2601 2601                          if (availrmem >= pages_pp_maximum + pgcnt) {
2602 2602                                  availrmem -= pgcnt;
2603 2603                                  pages_useclaim += pgcnt;
2604 2604                          } else {
2605 2605                                  VM_STAT_ADD(anonvmstats.privatepages[7]);
2606 2606                                  mutex_exit(&freemem_lock);
2607 2607                                  if (ahmpages != NULL) {
2608 2608                                          mutex_exit(ahmpages);
2609 2609                                  }
2610 2610                                  if (prealloc) {
2611 2611                                          page_free_replacement_page(pplist);
2612 2612                                          page_create_putback(pgcnt);
2613 2613                                  }
2614 2614                                  for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
2615 2615                                          if (ppa[pg_idx] != NULL)
2616 2616                                                  page_unlock(ppa[pg_idx]);
2617 2617                                  return (ENOMEM);
2618 2618                          }
2619 2619                          mutex_exit(&freemem_lock);
2620 2620                  }
2621 2621          }
2622 2622  
2623 2623          CPU_STATS_ADD_K(vm, cow_fault, pgcnt);
2624 2624  
2625 2625          VM_STAT_ADD(anonvmstats.privatepages[8]);
2626 2626  
2627 2627          an_idx = start_idx;
2628 2628          pg_idx = 0;
2629 2629          vaddr = addr;
2630 2630          for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
2631 2631                  ASSERT(ppa[pg_idx] != NULL);
2632 2632                  oldap = anon_get_ptr(amp->ahp, an_idx);
2633 2633                  ASSERT(ahmpages != NULL || oldap == NULL);
2634 2634                  ASSERT(ahmpages == NULL || oldap != NULL);
2635 2635                  ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
2636 2636                  ASSERT(ahmpages == NULL || pg_idx != 0 ||
2637 2637                      (refcnt = oldap->an_refcnt));
2638 2638                  ASSERT(ahmpages == NULL || pg_idx == 0 ||
2639 2639                      refcnt == oldap->an_refcnt);
2640 2640  
2641 2641                  ap = anon_alloc(NULL, 0);
2642 2642  
2643 2643                  swap_xlate(ap, &vp, &off);
2644 2644  
2645 2645                  /*
2646 2646                   * Now setup our preallocated page to pass down to
2647 2647                   * swap_getpage().
2648 2648                   */
2649 2649                  if (prealloc) {
2650 2650                          pp = pplist;
2651 2651                          page_sub(&pplist, pp);
2652 2652                          conpp = pp;
2653 2653                  }
2654 2654  
2655 2655                  err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
2656 2656                      PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
2657 2657                      S_CREATE, cred);
2658 2658  
2659 2659                  /*
2660 2660                   * Impossible to fail this is S_CREATE.
2661 2661                   */
2662 2662                  if (err)
2663 2663                          panic("anon_map_privatepages: VOP_GETPAGE failed");
2664 2664  
2665 2665                  ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
2666 2666                  ASSERT(prealloc == 0 || nreloc == 1);
2667 2667  
2668 2668                  pp = pl[0];
2669 2669  
2670 2670                  /*
2671 2671                   * If the original page was locked, we need to move
2672 2672                   * the lock to the new page by transfering
2673 2673                   * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
2674 2674                   * of the new page. pg_idx can be used to index
2675 2675                   * into the vpage array since the caller will guarantee
2676 2676                   * that vpage struct passed in corresponds to addr
2677 2677                   * and forward.
2678 2678                   */
2679 2679                  if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
2680 2680                          page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
2681 2681                  } else if (pagelock) {
2682 2682                          mutex_enter(&freemem_lock);
2683 2683                          availrmem++;
2684 2684                          pages_useclaim--;
2685 2685                          mutex_exit(&freemem_lock);
2686 2686                  }
2687 2687  
2688 2688                  /*
2689 2689                   * Now copy the contents from the original page.
2690 2690                   */
2691 2691                  if (ppcopy(ppa[pg_idx], pp) == 0) {
2692 2692                          /*
2693 2693                           * Before ppcopy could hanlde UE or other faults, we
2694 2694                           * would have panicked here, and still have no option
2695 2695                           * but to do so now.
2696 2696                           */
2697 2697                          panic("anon_map_privatepages, ppcopy failed");
2698 2698                  }
2699 2699  
2700 2700                  hat_setrefmod(pp);              /* mark as modified */
2701 2701  
2702 2702                  /*
2703 2703                   * Release the lock on the original page,
2704 2704                   * derement the old slot, and down grade the lock
2705 2705                   * on the new copy.
2706 2706                   */
2707 2707                  page_unlock(ppa[pg_idx]);
2708 2708  
2709 2709                  if (!prealloc)
2710 2710                          page_downgrade(pp);
2711 2711  
2712 2712                  ppa[pg_idx] = pp;
2713 2713  
2714 2714                  /*
2715 2715                   * Now reflect the copy in the new anon array.
2716 2716                   */
2717 2717                  ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
2718 2718                  if (oldap != NULL)
2719 2719                          anon_decref(oldap);
2720 2720                  (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2721 2721          }
2722 2722  
2723 2723          /*
2724 2724           * Unload the old large page translation.
2725 2725           */
2726 2726          hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
2727 2727  
2728 2728          if (ahmpages != NULL) {
2729 2729                  mutex_exit(ahmpages);
2730 2730          }
2731 2731          ASSERT(prealloc == 0 || pplist == NULL);
2732 2732          if (prealloc) {
2733 2733                  VM_STAT_ADD(anonvmstats.privatepages[9]);
2734 2734                  for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2735 2735                          page_downgrade(ppa[pg_idx]);
2736 2736                  }
2737 2737          }
2738 2738  
2739 2739          return (0);
2740 2740  }
2741 2741  
2742 2742  /*
2743 2743   * Allocate a private zero-filled anon page.
2744 2744   */
2745 2745  page_t *
2746 2746  anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
2747 2747  {
2748 2748          struct anon *ap;
2749 2749          page_t *pp;
2750 2750          struct vnode *vp;
2751 2751          anoff_t off;
2752 2752          page_t *anon_pl[1 + 1];
2753 2753          int err;
2754 2754  
2755 2755          /* Kernel probe */
2756 2756          TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
2757 2757                  tnf_opaque,     address,        addr);
2758 2758  
2759 2759          *app = ap = anon_alloc(NULL, 0);
2760 2760          swap_xlate(ap, &vp, &off);
2761 2761  
2762 2762          /*
2763 2763           * Call the VOP_GETPAGE routine to create the page, thereby
2764 2764           * enabling the vnode driver to allocate any filesystem
2765 2765           * dependent structures (e.g., disk block allocation for UFS).
2766 2766           * This also prevents more than on page from being added to
2767 2767           * the vnode at the same time since it is locked.
2768 2768           */
2769 2769          err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
2770 2770              anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2771 2771          if (err) {
2772 2772                  *app = NULL;
2773 2773                  anon_decref(ap);
2774 2774                  return (NULL);
2775 2775          }
2776 2776          pp = anon_pl[0];
2777 2777  
2778 2778          pagezero(pp, 0, PAGESIZE);      /* XXX - should set mod bit */
2779 2779          page_downgrade(pp);
2780 2780          CPU_STATS_ADD_K(vm, zfod, 1);
2781 2781          hat_setrefmod(pp);      /* mark as modified so pageout writes back */
2782 2782          return (pp);
2783 2783  }
2784 2784  
2785 2785  
2786 2786  /*
2787 2787   * Allocate array of private zero-filled anon pages for empty slots
2788 2788   * and kept pages for non empty slots within given range.
2789 2789   *
2790 2790   * NOTE: This rontine will try and use large pages
2791 2791   *      if available and supported by underlying platform.
2792 2792   */
2793 2793  int
2794 2794  anon_map_createpages(
2795 2795          struct anon_map *amp,
2796 2796          ulong_t start_index,
2797 2797          size_t len,
2798 2798          page_t *ppa[],
2799 2799          struct seg *seg,
2800 2800          caddr_t addr,
2801 2801          enum seg_rw rw,
2802 2802          struct cred *cred)
2803 2803  {
2804 2804  
2805 2805          struct anon     *ap;
2806 2806          struct vnode    *ap_vp;
2807 2807          page_t          *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
2808 2808          int             err = 0;
2809 2809          ulong_t         p_index, index;
2810 2810          pgcnt_t         npgs, pg_cnt;
2811 2811          spgcnt_t        nreloc = 0;
2812 2812          uint_t          l_szc, szc, prot;
2813 2813          anoff_t         ap_off;
2814 2814          size_t          pgsz;
2815 2815          lgrp_t          *lgrp;
2816 2816          kmutex_t        *ahm;
2817 2817  
2818 2818          /*
2819 2819           * XXX For now only handle S_CREATE.
2820 2820           */
2821 2821          ASSERT(rw == S_CREATE);
2822 2822  
2823 2823          index   = start_index;
2824 2824          p_index = 0;
2825 2825          npgs = btopr(len);
2826 2826  
2827 2827          /*
2828 2828           * If this platform supports multiple page sizes
2829 2829           * then try and allocate directly from the free
2830 2830           * list for pages larger than PAGESIZE.
2831 2831           *
2832 2832           * NOTE:When we have page_create_ru we can stop
2833 2833           *      directly allocating from the freelist.
2834 2834           */
2835 2835          l_szc  = seg->s_szc;
2836 2836          ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2837 2837          while (npgs) {
2838 2838  
2839 2839                  /*
2840 2840                   * if anon slot already exists
2841 2841                   *   (means page has been created)
2842 2842                   * so 1) look up the page
2843 2843                   *    2) if the page is still in memory, get it.
2844 2844                   *    3) if not, create a page and
2845 2845                   *        page in from physical swap device.
2846 2846                   * These are done in anon_getpage().
2847 2847                   */
2848 2848                  ap = anon_get_ptr(amp->ahp, index);
2849 2849                  if (ap) {
2850 2850                          err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
2851 2851                              seg, addr, S_READ, cred);
2852 2852                          if (err) {
2853 2853                                  ANON_LOCK_EXIT(&amp->a_rwlock);
2854 2854                                  panic("anon_map_createpages: anon_getpage");
2855 2855                          }
2856 2856                          pp = anon_pl[0];
2857 2857                          ppa[p_index++] = pp;
2858 2858  
2859 2859                          /*
2860 2860                           * an_pvp can become non-NULL after SysV's page was
2861 2861                           * paged out before ISM was attached to this SysV
2862 2862                           * shared memory segment. So free swap slot if needed.
2863 2863                           */
2864 2864                          if (ap->an_pvp != NULL) {
2865 2865                                  page_io_lock(pp);
2866 2866                                  ahm = AH_MUTEX(ap->an_vp, ap->an_off);
2867 2867                                  mutex_enter(ahm);
2868 2868                                  if (ap->an_pvp != NULL) {
2869 2869                                          swap_phys_free(ap->an_pvp,
2870 2870                                              ap->an_poff, PAGESIZE);
2871 2871                                          ap->an_pvp = NULL;
2872 2872                                          ap->an_poff = 0;
2873 2873                                          mutex_exit(ahm);
2874 2874                                          hat_setmod(pp);
2875 2875                                  } else {
2876 2876                                          mutex_exit(ahm);
2877 2877                                  }
2878 2878                                  page_io_unlock(pp);
2879 2879                          }
2880 2880  
2881 2881                          addr += PAGESIZE;
2882 2882                          index++;
2883 2883                          npgs--;
2884 2884                          continue;
2885 2885                  }
2886 2886                  /*
2887 2887                   * Now try and allocate the largest page possible
2888 2888                   * for the current address and range.
2889 2889                   * Keep dropping down in page size until:
2890 2890                   *
2891 2891                   *      1) Properly aligned
2892 2892                   *      2) Does not overlap existing anon pages
2893 2893                   *      3) Fits in remaining range.
2894 2894                   *      4) able to allocate one.
2895 2895                   *
2896 2896                   * NOTE: XXX When page_create_ru is completed this code
2897 2897                   *       will change.
2898 2898                   */
2899 2899                  szc    = l_szc;
2900 2900                  pplist = NULL;
2901 2901                  pg_cnt = 0;
2902 2902                  while (szc) {
2903 2903                          pgsz    = page_get_pagesize(szc);
2904 2904                          pg_cnt  = pgsz >> PAGESHIFT;
2905 2905                          if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
2906 2906                              anon_pages(amp->ahp, index, pg_cnt) == 0) {
2907 2907                                  /*
2908 2908                                   * XXX
2909 2909                                   * Since we are faking page_create()
2910 2910                                   * we also need to do the freemem and
2911 2911                                   * pcf accounting.
2912 2912                                   */
2913 2913                                  (void) page_create_wait(pg_cnt, PG_WAIT);
2914 2914  
2915 2915                                  /*
2916 2916                                   * Get lgroup to allocate next page of shared
2917 2917                                   * memory from and use it to specify where to
2918 2918                                   * allocate the physical memory
2919 2919                                   */
2920 2920                                  lgrp = lgrp_mem_choose(seg, addr, pgsz);
2921 2921  
2922 2922                                  pplist = page_get_freelist(
2923 2923                                      anon_vp, (u_offset_t)0, seg,
2924 2924                                      addr, pgsz, 0, lgrp);
2925 2925  
2926 2926                                  if (pplist == NULL) {
2927 2927                                          page_create_putback(pg_cnt);
2928 2928                                  }
2929 2929  
2930 2930                                  /*
2931 2931                                   * If a request for a page of size
2932 2932                                   * larger than PAGESIZE failed
2933 2933                                   * then don't try that size anymore.
2934 2934                                   */
2935 2935                                  if (pplist == NULL) {
2936 2936                                          l_szc = szc - 1;
2937 2937                                  } else {
2938 2938                                          break;
2939 2939                                  }
2940 2940                          }
2941 2941                          szc--;
2942 2942                  }
2943 2943  
2944 2944                  /*
2945 2945                   * If just using PAGESIZE pages then don't
2946 2946                   * directly allocate from the free list.
2947 2947                   */
2948 2948                  if (pplist == NULL) {
2949 2949                          ASSERT(szc == 0);
2950 2950                          pp = anon_zero(seg, addr, &ap, cred);
2951 2951                          if (pp == NULL) {
2952 2952                                  ANON_LOCK_EXIT(&amp->a_rwlock);
2953 2953                                  panic("anon_map_createpages: anon_zero");
2954 2954                          }
2955 2955                          ppa[p_index++] = pp;
2956 2956  
2957 2957                          ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
2958 2958                          (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
2959 2959  
2960 2960                          addr += PAGESIZE;
2961 2961                          index++;
2962 2962                          npgs--;
2963 2963                          continue;
2964 2964                  }
2965 2965  
2966 2966                  /*
2967 2967                   * pplist is a list of pg_cnt PAGESIZE pages.
2968 2968                   * These pages are locked SE_EXCL since they
2969 2969                   * came directly off the free list.
2970 2970                   */
2971 2971                  ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
2972 2972                  ASSERT(IS_P2ALIGNED(index, pg_cnt));
2973 2973                  ASSERT(conpp == NULL);
2974 2974                  while (pg_cnt--) {
2975 2975  
2976 2976                          ap = anon_alloc(NULL, 0);
2977 2977                          swap_xlate(ap, &ap_vp, &ap_off);
2978 2978  
2979 2979                          ASSERT(pplist != NULL);
2980 2980                          pp = pplist;
2981 2981                          page_sub(&pplist, pp);
2982 2982                          PP_CLRFREE(pp);
2983 2983                          PP_CLRAGED(pp);
2984 2984                          conpp = pp;
2985 2985  
2986 2986                          err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
2987 2987                              (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL,
2988 2988                              &nreloc, seg, addr, S_CREATE, cred);
2989 2989  
2990 2990                          if (err) {
2991 2991                                  ANON_LOCK_EXIT(&amp->a_rwlock);
2992 2992                                  panic("anon_map_createpages: S_CREATE");
2993 2993                          }
2994 2994  
2995 2995                          ASSERT(anon_pl[0] == pp);
2996 2996                          ASSERT(nreloc == 1);
2997 2997                          pagezero(pp, 0, PAGESIZE);
2998 2998                          CPU_STATS_ADD_K(vm, zfod, 1);
2999 2999                          hat_setrefmod(pp);
3000 3000  
3001 3001                          ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
3002 3002                          (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
3003 3003  
3004 3004                          ppa[p_index++] = pp;
3005 3005  
3006 3006                          addr += PAGESIZE;
3007 3007                          index++;
3008 3008                          npgs--;
3009 3009                  }
3010 3010                  conpp = NULL;
3011 3011                  pg_cnt  = pgsz >> PAGESHIFT;
3012 3012                  p_index = p_index - pg_cnt;
3013 3013                  while (pg_cnt--) {
3014 3014                          page_downgrade(ppa[p_index++]);
3015 3015                  }
3016 3016          }
3017 3017          ANON_LOCK_EXIT(&amp->a_rwlock);
3018 3018          return (0);
3019 3019  }
3020 3020  
3021 3021  static int
3022 3022  anon_try_demote_pages(
3023 3023          struct anon_hdr *ahp,
3024 3024          ulong_t sidx,
3025 3025          uint_t szc,
3026 3026          page_t **ppa,
3027 3027          int private)
3028 3028  {
3029 3029          struct anon     *ap;
3030 3030          pgcnt_t         pgcnt = page_get_pagecnt(szc);
3031 3031          page_t          *pp;
3032 3032          pgcnt_t         i;
3033 3033          kmutex_t        *ahmpages = NULL;
3034 3034          int             root = 0;
3035 3035          pgcnt_t         npgs;
3036 3036          pgcnt_t         curnpgs = 0;
3037 3037          size_t          ppasize = 0;
3038 3038  
3039 3039          ASSERT(szc != 0);
3040 3040          ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3041 3041          ASSERT(IS_P2ALIGNED(sidx, pgcnt));
3042 3042          ASSERT(sidx < ahp->size);
3043 3043  
3044 3044          if (ppa == NULL) {
3045 3045                  ppasize = pgcnt * sizeof (page_t *);
3046 3046                  ppa = kmem_alloc(ppasize, KM_SLEEP);
3047 3047          }
3048 3048  
3049 3049          ap = anon_get_ptr(ahp, sidx);
3050 3050          if (ap != NULL && private) {
3051 3051                  VM_STAT_ADD(anonvmstats.demotepages[1]);
3052 3052                  ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
3053 3053                  mutex_enter(ahmpages);
3054 3054          }
3055 3055  
3056 3056          if (ap != NULL && ap->an_refcnt > 1) {
3057 3057                  if (ahmpages != NULL) {
3058 3058                          VM_STAT_ADD(anonvmstats.demotepages[2]);
3059 3059                          mutex_exit(ahmpages);
3060 3060                  }
3061 3061                  if (ppasize != 0) {
3062 3062                          kmem_free(ppa, ppasize);
3063 3063                  }
3064 3064                  return (0);
3065 3065          }
3066 3066          if (ahmpages != NULL) {
3067 3067                  mutex_exit(ahmpages);
3068 3068          }
3069 3069          if (ahp->size - sidx < pgcnt) {
3070 3070                  ASSERT(private == 0);
3071 3071                  pgcnt = ahp->size - sidx;
3072 3072          }
3073 3073          for (i = 0; i < pgcnt; i++, sidx++) {
3074 3074                  ap = anon_get_ptr(ahp, sidx);
3075 3075                  if (ap != NULL) {
3076 3076                          if (ap->an_refcnt != 1) {
3077 3077                                  panic("anon_try_demote_pages: an_refcnt != 1");
3078 3078                          }
3079 3079                          pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
3080 3080                              SE_EXCL);
3081 3081                          if (pp != NULL) {
3082 3082                                  (void) hat_pageunload(pp,
3083 3083                                      HAT_FORCE_PGUNLOAD);
3084 3084                          }
3085 3085                  } else {
3086 3086                          ppa[i] = NULL;
3087 3087                  }
3088 3088          }
3089 3089          for (i = 0; i < pgcnt; i++) {
3090 3090                  if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
3091 3091                          ASSERT(pp->p_szc <= szc);
3092 3092                          if (!root) {
3093 3093                                  VM_STAT_ADD(anonvmstats.demotepages[3]);
3094 3094                                  if (curnpgs != 0)
3095 3095                                          panic("anon_try_demote_pages: "
3096 3096                                              "bad large page");
3097 3097  
3098 3098                                  root = 1;
3099 3099                                  curnpgs = npgs =
3100 3100                                      page_get_pagecnt(pp->p_szc);
3101 3101  
3102 3102                                  ASSERT(npgs <= pgcnt);
3103 3103                                  ASSERT(IS_P2ALIGNED(npgs, npgs));
3104 3104                                  ASSERT(!(page_pptonum(pp) & (npgs - 1)));
3105 3105                          } else {
3106 3106                                  ASSERT(i > 0);
3107 3107                                  ASSERT(page_pptonum(pp) - 1 ==
3108 3108                                      page_pptonum(ppa[i - 1]));
3109 3109                                  if ((page_pptonum(pp) & (npgs - 1)) ==
3110 3110                                      npgs - 1)
3111 3111                                          root = 0;
3112 3112                          }
3113 3113                          ASSERT(PAGE_EXCL(pp));
3114 3114                          pp->p_szc = 0;
3115 3115                          ASSERT(curnpgs > 0);
3116 3116                          curnpgs--;
3117 3117                  }
3118 3118          }
3119 3119          if (root != 0 || curnpgs != 0)
3120 3120                  panic("anon_try_demote_pages: bad large page");
3121 3121  
3122 3122          for (i = 0; i < pgcnt; i++) {
3123 3123                  if ((pp = ppa[i]) != NULL) {
3124 3124                          ASSERT(!hat_page_is_mapped(pp));
3125 3125                          ASSERT(pp->p_szc == 0);
3126 3126                          page_unlock(pp);
3127 3127                  }
3128 3128          }
3129 3129          if (ppasize != 0) {
3130 3130                  kmem_free(ppa, ppasize);
3131 3131          }
3132 3132          return (1);
3133 3133  }
3134 3134  
3135 3135  /*
3136 3136   * anon_map_demotepages() can only be called by MAP_PRIVATE segments.
3137 3137   */
3138 3138  int
3139 3139  anon_map_demotepages(
3140 3140          struct anon_map *amp,
3141 3141          ulong_t start_idx,
3142 3142          struct seg *seg,
3143 3143          caddr_t addr,
3144 3144          uint_t prot,
3145 3145          struct vpage vpage[],
3146 3146          struct cred *cred)
3147 3147  {
3148 3148          struct anon     *ap;
3149 3149          uint_t          szc = seg->s_szc;
3150 3150          pgcnt_t         pgcnt = page_get_pagecnt(szc);
3151 3151          size_t          ppasize = pgcnt * sizeof (page_t *);
3152 3152          page_t          **ppa = kmem_alloc(ppasize, KM_SLEEP);
3153 3153          page_t          *pp;
3154 3154          page_t          *pl[2];
3155 3155          pgcnt_t         i, pg_idx;
3156 3156          ulong_t         an_idx;
3157 3157          caddr_t         vaddr;
3158 3158          int             err;
3159 3159          int             retry = 0;
3160 3160          uint_t          vpprot;
3161 3161  
3162 3162          ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
3163 3163          ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3164 3164          ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
3165 3165          ASSERT(ppa != NULL);
3166 3166          ASSERT(szc != 0);
3167 3167          ASSERT(szc == amp->a_szc);
3168 3168  
3169 3169          VM_STAT_ADD(anonvmstats.demotepages[0]);
3170 3170  
3171 3171  top:
3172 3172          if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) {
3173 3173                  kmem_free(ppa, ppasize);
3174 3174                  return (0);
3175 3175          }
3176 3176  
3177 3177          VM_STAT_ADD(anonvmstats.demotepages[4]);
3178 3178  
3179 3179          ASSERT(retry == 0); /* we can be here only once */
3180 3180  
3181 3181          vaddr = addr;
3182 3182          for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
3183 3183              pg_idx++, an_idx++, vaddr += PAGESIZE) {
3184 3184                  ap = anon_get_ptr(amp->ahp, an_idx);
3185 3185                  if (ap == NULL)
3186 3186                          panic("anon_map_demotepages: no anon slot");
3187 3187                  err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
3188 3188                      S_READ, cred);
3189 3189                  if (err) {
3190 3190                          for (i = 0; i < pg_idx; i++) {
3191 3191                                  if ((pp = ppa[i]) != NULL)
3192 3192                                          page_unlock(pp);
3193 3193                          }
3194 3194                          kmem_free(ppa, ppasize);
3195 3195                          return (err);
3196 3196                  }
3197 3197                  ppa[pg_idx] = pl[0];
3198 3198          }
3199 3199  
3200 3200          err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
3201 3201              vpage, -1, 0, cred);
3202 3202          if (err > 0) {
3203 3203                  VM_STAT_ADD(anonvmstats.demotepages[5]);
3204 3204                  kmem_free(ppa, ppasize);
3205 3205                  return (err);
3206 3206          }
3207 3207          ASSERT(err == 0 || err == -1);
3208 3208          if (err == -1) {
3209 3209                  VM_STAT_ADD(anonvmstats.demotepages[6]);
3210 3210                  retry = 1;
3211 3211                  goto top;
3212 3212          }
3213 3213          for (i = 0; i < pgcnt; i++) {
3214 3214                  ASSERT(ppa[i] != NULL);
3215 3215                  if (ppa[i]->p_szc != 0)
3216 3216                          retry = 1;
3217 3217                  page_unlock(ppa[i]);
3218 3218          }
3219 3219          if (retry) {
3220 3220                  VM_STAT_ADD(anonvmstats.demotepages[7]);
3221 3221                  goto top;
3222 3222          }
3223 3223  
3224 3224          VM_STAT_ADD(anonvmstats.demotepages[8]);
3225 3225  
3226 3226          kmem_free(ppa, ppasize);
3227 3227  
3228 3228          return (0);
3229 3229  }
3230 3230  
3231 3231  /*
3232 3232   * Free pages of shared anon map. It's assumed that anon maps don't share anon
3233 3233   * structures with private anon maps. Therefore all anon structures should
3234 3234   * have at most one reference at this point. This means underlying pages can
3235 3235   * be exclusively locked and demoted or freed.  If not freeing the entire
3236 3236   * large pages demote the ends of the region we free to be able to free
3237 3237   * subpages. Page roots correspond to aligned index positions in anon map.
3238 3238   */
3239 3239  void
3240 3240  anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
3241 3241  {
3242 3242          ulong_t eidx = sidx + btopr(len);
3243 3243          pgcnt_t pages = page_get_pagecnt(amp->a_szc);
3244 3244          struct anon_hdr *ahp = amp->ahp;
3245 3245          ulong_t tidx;
3246 3246          size_t size;
3247 3247          ulong_t sidx_aligned;
3248 3248          ulong_t eidx_aligned;
3249 3249  
3250 3250          ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3251 3251          ASSERT(amp->refcnt <= 1);
3252 3252          ASSERT(amp->a_szc > 0);
3253 3253          ASSERT(eidx <= ahp->size);
3254 3254          ASSERT(!anon_share(ahp, sidx, btopr(len)));
3255 3255  
3256 3256          if (len == 0) { /* XXX */
3257 3257                  return;
3258 3258          }
3259 3259  
3260 3260          sidx_aligned = P2ALIGN(sidx, pages);
3261 3261          if (sidx_aligned != sidx ||
3262 3262              (eidx < sidx_aligned + pages && eidx < ahp->size)) {
3263 3263                  if (!anon_try_demote_pages(ahp, sidx_aligned,
3264 3264                      amp->a_szc, NULL, 0)) {
3265 3265                          panic("anon_shmap_free_pages: demote failed");
3266 3266                  }
3267 3267                  size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) :
3268 3268                      P2NPHASE(sidx, pages);
3269 3269                  size <<= PAGESHIFT;
3270 3270                  anon_free(ahp, sidx, size);
3271 3271                  sidx = sidx_aligned + pages;
3272 3272                  if (eidx <= sidx) {
3273 3273                          return;
3274 3274                  }
3275 3275          }
3276 3276          eidx_aligned = P2ALIGN(eidx, pages);
3277 3277          if (sidx < eidx_aligned) {
3278 3278                  anon_free_pages(ahp, sidx,
3279 3279                      (eidx_aligned - sidx) << PAGESHIFT,
3280 3280                      amp->a_szc);
3281 3281                  sidx = eidx_aligned;
3282 3282          }
3283 3283          ASSERT(sidx == eidx_aligned);
3284 3284          if (eidx == eidx_aligned) {
3285 3285                  return;
3286 3286          }
3287 3287          tidx = eidx;
3288 3288          if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL &&
3289 3289              tidx - sidx < pages) {
3290 3290                  if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) {
3291 3291                          panic("anon_shmap_free_pages: demote failed");
3292 3292                  }
3293 3293                  size = (eidx - sidx) << PAGESHIFT;
3294 3294                  anon_free(ahp, sidx, size);
3295 3295          } else {
3296 3296                  anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc);
3297 3297          }
3298 3298  }
3299 3299  
3300 3300  /*
3301 3301   * This routine should be called with amp's writer lock when there're no other
3302 3302   * users of amp.  All pcache entries of this amp must have been already
3303 3303   * inactivated. We must not drop a_rwlock here to prevent new users from
3304 3304   * attaching to this amp.
3305 3305   */
3306 3306  void
3307 3307  anonmap_purge(struct anon_map *amp)
3308 3308  {
3309 3309          ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3310 3310          ASSERT(amp->refcnt <= 1);
3311 3311  
3312 3312          if (amp->a_softlockcnt != 0) {
3313 3313                  seg_ppurge(NULL, amp, 0);
3314 3314          }
3315 3315  
3316 3316          /*
3317 3317           * Since all pcache entries were already inactive before this routine
3318 3318           * was called seg_ppurge() couldn't return while there're still
3319 3319           * entries that can be found via the list anchored at a_phead. So we
3320 3320           * can assert this list is empty now. a_softlockcnt may be still non 0
3321 3321           * if asynchronous thread that manages pcache already removed pcache
3322 3322           * entries but hasn't unlocked the pages yet. If a_softlockcnt is non
3323 3323           * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if
3324 3324           * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map
3325 3325           * before shamp_reclaim() is done with it. a_purgemtx also taken by
3326 3326           * shamp_reclaim() while a_softlockcnt was still not 0 acts as a
3327 3327           * barrier that prevents anonmap_purge() to complete while
3328 3328           * shamp_reclaim() may still be referencing this amp.
3329 3329           */
3330 3330          ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3331 3331          ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3332 3332  
3333 3333          mutex_enter(&amp->a_purgemtx);
3334 3334          while (amp->a_softlockcnt != 0) {
3335 3335                  ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3336 3336                  ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3337 3337                  amp->a_purgewait = 1;
3338 3338                  cv_wait(&amp->a_purgecv, &amp->a_purgemtx);
3339 3339          }
3340 3340          mutex_exit(&amp->a_purgemtx);
3341 3341  
3342 3342          ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3343 3343          ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3344 3344          ASSERT(amp->a_softlockcnt == 0);
3345 3345  }
3346 3346  
3347 3347  /*
3348 3348   * Allocate and initialize an anon_map structure for seg
3349 3349   * associating the given swap reservation with the new anon_map.
            *
            * The anon_map comes from anonmap_cache and its anon header is created
            * by anon_create(btopr(size), flags).  If ANON_NOSLEEP is set in flags
            * the cache allocation uses KM_NOSLEEP, otherwise KM_SLEEP.
            *
            * Returns the new anon_map with refcnt set to 1, a_softlockcnt set to 0
            * and an empty a_phead list, or NULL if either the cache allocation or
            * anon_create() fails.  On the anon_create() failure path the anon_map
            * is returned to anonmap_cache before NULL is returned.
3350 3350   */
3351 3351  struct anon_map *
3352 3352  anonmap_alloc(size_t size, size_t swresv, int flags)
3353 3353  {
3354 3354          struct anon_map *amp;
3355 3355          int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
3356 3356  
3357 3357          amp = kmem_cache_alloc(anonmap_cache, kmflags);
3358 3358          if (amp == NULL) {
                           /* A NULL result is only expected for KM_NOSLEEP. */
3359 3359                  ASSERT(kmflags == KM_NOSLEEP);
3360 3360                  return (NULL);
3361 3361          }
3362 3362  
3363 3363          amp->ahp = anon_create(btopr(size), flags);
3364 3364          if (amp->ahp == NULL) {
                           /*
                            * NOTE(review): this asserts that flags is exactly
                            * ANON_NOSLEEP, whereas kmflags above only tests the
                            * ANON_NOSLEEP bit.  Confirm that no caller passes
                            * ANON_NOSLEEP combined with other flag bits.
                            */
3365 3365                  ASSERT(flags == ANON_NOSLEEP);
3366 3366                  kmem_cache_free(anonmap_cache, amp);
3367 3367                  return (NULL);
3368 3368          }
3369 3369          amp->refcnt = 1;
3370 3370          amp->size = size;
3371 3371          amp->swresv = swresv;
3372 3372          amp->locality = 0;
3373 3373          amp->a_szc = 0;
3374 3374          amp->a_sp = NULL;
3375 3375          amp->a_softlockcnt = 0;
3376 3376          amp->a_purgewait = 0;
                   /* A list head that points at itself represents an empty list. */
3377 3377          amp->a_phead.p_lnext = &amp->a_phead;
3378 3378          amp->a_phead.p_lprev = &amp->a_phead;
3379 3379  
3380 3380          return (amp);
3381 3381  }
3382 3382  
3383 3383  void
3384 3384  anonmap_free(struct anon_map *amp)
3385 3385  {
                   /*
                    * Tear down an anon_map and return it to anonmap_cache.
                    *
                    * The asserts below state the preconditions: the map still has
                    * its anon header, its reference count has dropped to 0, there
                    * are no outstanding soft locks, and the a_phead list is empty
                    * (the head points at itself).
                    */
3386 3386          ASSERT(amp->ahp != NULL);
3387 3387          ASSERT(amp->refcnt == 0);
3388 3388          ASSERT(amp->a_softlockcnt == 0);
3389 3389          ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3390 3390          ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3391 3391  
                   /*
                    * Release the lgrp policy state and the anon header (sized with
                    * the same btopr(size) used at allocation) before freeing the
                    * anon_map itself.
                    */
3392 3392          lgrp_shm_policy_fini(amp, NULL);
3393 3393          anon_release(amp->ahp, btopr(amp->size));
3394 3394          kmem_cache_free(anonmap_cache, amp);
3395 3395  }
3396 3396  
3397 3397  /*
3398 3398   * Returns true if the anon array ahp has some empty slots.
3399 3399   * The offp and lenp parameters are in/out parameters.  On entry
3400 3400   * these values represent the starting offset and length of the
3401 3401   * mapping.  When true is returned, these values are modified to
3402 3402   * describe the range that runs from the first empty slot through the
            * last empty slot found (slots in between may be populated).
            *
            * anon_idx is the anon array index that corresponds to *offp.  The
            * scan advances one slot per PAGESIZE bytes of *lenp.  When no empty
            * slot is found, 0 is returned and *offp and *lenp are left unchanged.
3403 3403   */
3404 3404  int
3405 3405  non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
3406 3406                                  size_t *lenp)
3407 3407  {
3408 3408          ulong_t i, el;
3409 3409          ssize_t low, high;
3410 3410          struct anon *ap;
3411 3411  
                   /*
                    * low and high are byte offsets (relative to *offp) of the first
                    * and last empty slot seen; low == -1 means "none seen yet".
                    * high is only read when low != -1, by which point it has been
                    * assigned.
                    */
3412 3412          low = -1;
3413 3413          for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
3414 3414                  ap = anon_get_ptr(ahp, anon_idx);
3415 3415                  if (ap == NULL) {
3416 3416                          if (low == -1)
3417 3417                                  low = i;
3418 3418                          high = i;
3419 3419                  }
3420 3420          }
3421 3421          if (low != -1) {
3422 3422                  /*
3423 3423                   * Found at least one non-anon page.
3424 3424                   * Set up the off and len return values.
3425 3425                   */
3426 3426                  if (low != 0)
3427 3427                          *offp += low;
                           /* The range is inclusive of the page starting at high. */
3428 3428                  *lenp = high - low + PAGESIZE;
3429 3429                  return (1);
3430 3430          }
3431 3431          return (0);
3432 3432  }
3433 3433  
3434 3434  /*
3435 3435   * Return a count of the number of existing anon pages in the anon array
3436 3436   * ahp in the slot range [anon_index, anon_index + nslots). The array and
3437 3437   * slots must be guaranteed stable by the caller.
            *
            * A slot is counted when anon_get_ptr() returns non-NULL for it.
3438 3438   */
3439 3439  pgcnt_t
3440 3440  anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
3441 3441  {
3442 3442          pgcnt_t cnt = 0;
3443 3443  
3444 3444          while (nslots-- > 0) {
3445 3445                  if ((anon_get_ptr(ahp, anon_index)) != NULL)
3446 3446                          cnt++;
3447 3447                  anon_index++;
3448 3448          }
3449 3449          return (cnt);
3450 3450  }
3451 3451  
3452 3452  /*
3453 3453   * Move reserved phys swap into memory swap (unreserve phys swap
3454 3454   * and reserve mem swap by the same amount).
3455 3455   * Used by segspt when it needs to lock reserved swap npages in memory
            *
            * Returns 0 on success.  Returns ENOMEM if page_reclaim_mem() reports
            * failure; in that case none of the k_anoninfo counters have been
            * changed.  anoninfo_lock is held for the whole operation.
3456 3456   */
3457 3457  int
3458 3458  anon_swap_adjust(pgcnt_t npages)
3459 3459  {
3460 3460          pgcnt_t unlocked_mem_swap;
3461 3461  
3462 3462          mutex_enter(&anoninfo_lock);
3463 3463  
3464 3464          ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3465 3465          ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3466 3466  
                   /* Mem swap that is reserved but not yet counted as locked. */
3467 3467          unlocked_mem_swap = k_anoninfo.ani_mem_resv
3468 3468              - k_anoninfo.ani_locked_swap;
3469 3469          if (npages > unlocked_mem_swap) {
                           /* Shortfall that has to be moved over from phys swap. */
3470 3470                  spgcnt_t adjusted_swap = npages - unlocked_mem_swap;
3471 3471  
3472 3472                  /*
3473 3473                   * if there is not enough unlocked mem swap we take missing
3474 3474                   * amount from phys swap and give it to mem swap
3475 3475                   */
3476 3476                  if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) {
3477 3477                          mutex_exit(&anoninfo_lock);
3478 3478                          return (ENOMEM);
3479 3479                  }
3480 3480  
3481 3481                  k_anoninfo.ani_mem_resv += adjusted_swap;
3482 3482                  ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
3483 3483                  k_anoninfo.ani_phys_resv -= adjusted_swap;
3484 3484  
                           /*
                            * ANI_ADD() is defined outside this function; see its
                            * definition for the counter it updates.
                            */
3485 3485                  ANI_ADD(adjusted_swap);
3486 3486          }
                   /* All npages are now accounted as locked mem swap. */
3487 3487          k_anoninfo.ani_locked_swap += npages;
3488 3488  
3489 3489          ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3490 3490          ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3491 3491  
3492 3492          mutex_exit(&anoninfo_lock);
3493 3493  
3494 3494          return (0);
3495 3495  }
3496 3496  
3497 3497  /*
3498 3498   * Unlock reserved mem swap so that when it is unreserved it
3499 3499   * can be moved back to phys (disk) swap.
            *
            * This routine only decrements k_anoninfo.ani_locked_swap by npages
            * under anoninfo_lock; it does not itself change ani_mem_resv or
            * ani_phys_resv.  The caller must not restore more than is currently
            * locked (asserted).
3500 3500   */
3501 3501  void
3502 3502  anon_swap_restore(pgcnt_t npages)
3503 3503  {
3504 3504          mutex_enter(&anoninfo_lock);
3505 3505  
3506 3506          ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3507 3507  
3508 3508          ASSERT(k_anoninfo.ani_locked_swap >= npages);
3509 3509          k_anoninfo.ani_locked_swap -= npages;
3510 3510  
3511 3511          ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3512 3512  
3513 3513          mutex_exit(&anoninfo_lock);
3514 3514  }
3515 3515  
3516 3516  /*
3517 3517   * Return a pointer to the slot in ahp for the specified anon index
3518 3518   * an_idx, as a ulong_t pointer.  an_idx must be less than ahp->size
            * (asserted).
            *
            * In the single level case the slot lives directly in array_chunk.
            * In the 2 level case array_chunk holds pointers to chunks; a chunk
            * that does not exist yet is allocated here with
            * kmem_zalloc(PAGESIZE, KM_SLEEP) while holding ahp->serial_lock.
3519 3519   */
3520 3520  ulong_t *
3521 3521  anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
3522 3522  {
3523 3523          struct anon     **app;
3524 3524          void            **ppp;
3525 3525  
3526 3526          ASSERT(an_idx < ahp->size);
3527 3527  
3528 3528          /*
3529 3529           * Single level case.
3530 3530           */
3531 3531          if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
3532 3532                  return ((ulong_t *)&ahp->array_chunk[an_idx]);
3533 3533          } else {
3534 3534  
3535 3535                  /*
3536 3536                   * 2 level case.
3537 3537                   */
3538 3538                  ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3539 3539                  if (*ppp == NULL) {
                                   /*
                                    * The chunk pointer is re-read and re-checked
                                    * under serial_lock so that only one thread
                                    * allocates the missing chunk.
                                    */
3540 3540                          mutex_enter(&ahp->serial_lock);
3541 3541                          ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3542 3542                          if (*ppp == NULL)
3543 3543                                  *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
3544 3544                          mutex_exit(&ahp->serial_lock);
3545 3545                  }
                           /* Index within the chunk is the low bits of an_idx. */
3546 3546                  app = *ppp;
3547 3547                  return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
3548 3548          }
3549 3549  }
3550 3550  
3551 3551  void
3552 3552  anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
3553 3553  {
                   /*
                    * Mark the anon slot covering an_idx busy, waiting for any
                    * current holder to finish first.  sobj is filled in with the
                    * mutex, condition variable and slot pointer that
                    * anon_array_exit() later uses to clear the busy state.
                    * The caller must hold amp->a_rwlock as reader (asserted).
                    */
3554 3554          ulong_t         *ap_slot;
3555 3555          kmutex_t        *mtx;
3556 3556          kcondvar_t      *cv;
3557 3557          int             hash;
3558 3558  
3559 3559          /*
3560 3560           * Use szc to determine anon slot(s) to appear atomic.
3561 3561           * If szc = 0, then lock the anon slot and mark it busy.
3562 3562           * If szc > 0, then lock the range of slots by getting the
3563 3563           * anon_array_lock for the first anon slot, and mark only the
3564 3564           * first anon slot busy to represent whole range being busy.
3565 3565           */
3566 3566  
3567 3567          ASSERT(RW_READ_HELD(&amp->a_rwlock));
3568 3568          an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));

↓ open down ↓

3487 lines elided

↑ open up ↑

                   /*
                    * The hash of (amp, an_idx) selects the mutex and condition
                    * variable pair; both are recorded in sobj.
                    */
3569 3569          hash = ANON_ARRAY_HASH(amp, an_idx);
3570 3570          sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3571 3571          sobj->sync_cv = cv = &anon_array_cv[hash];
3572 3572          mutex_enter(mtx);
3573 3573          ap_slot = anon_get_slot(amp->ahp, an_idx);
                   /* Wait until the slot is no longer marked busy. */
3574 3574          while (ANON_ISBUSY(ap_slot))
3575 3575                  cv_wait(cv, mtx);
3576 3576          ANON_SETBUSY(ap_slot);
3577 3577          sobj->sync_data = ap_slot;
                   /*
                    * The mutex is dropped before returning; only the busy mark on
                    * the slot remains set.
                    */
3578 3578          mutex_exit(mtx);
3579      -}
3580      -
3581      -int
3582      -anon_array_try_enter(struct anon_map *amp, ulong_t an_idx,
3583      -                        anon_sync_obj_t *sobj)
3584      -{
3585      -        ulong_t         *ap_slot;
3586      -        kmutex_t        *mtx;
3587      -        int             hash;
3588      -
3589      -        /*
3590      -         * Try to lock a range of anon slots.
3591      -         * Use szc to determine anon slot(s) to appear atomic.
3592      -         * If szc = 0, then lock the anon slot and mark it busy.
3593      -         * If szc > 0, then lock the range of slots by getting the
3594      -         * anon_array_lock for the first anon slot, and mark only the
3595      -         * first anon slot busy to represent whole range being busy.
3596      -         * Fail if the mutex or the anon_array are busy.
3597      -         */
3598      -
3599      -        ASSERT(RW_READ_HELD(&amp->a_rwlock));
3600      -        an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3601      -        hash = ANON_ARRAY_HASH(amp, an_idx);
3602      -        sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3603      -        sobj->sync_cv = &anon_array_cv[hash];
3604      -        if (!mutex_tryenter(mtx)) {
3605      -                return (EWOULDBLOCK);
3606      -        }
3607      -        ap_slot = anon_get_slot(amp->ahp, an_idx);
3608      -        if (ANON_ISBUSY(ap_slot)) {
3609      -                mutex_exit(mtx);
3610      -                return (EWOULDBLOCK);
3611      -        }
3612      -        ANON_SETBUSY(ap_slot);
3613      -        sobj->sync_data = ap_slot;
3614      -        mutex_exit(mtx);
3615      -        return (0);
3616 3579  }
3617 3580  
3618 3581  void
3619 3582  anon_array_exit(anon_sync_obj_t *sobj)
3620 3583  {
                   /*
                    * Clear the busy mark on the slot recorded in sobj->sync_data
                    * and wake any threads waiting on sobj->sync_cv.  The slot must
                    * currently be marked busy (asserted).  The busy state is
                    * changed while holding sobj->sync_mutex.
                    */
3621 3584          mutex_enter(sobj->sync_mutex);
3622 3585          ASSERT(ANON_ISBUSY(sobj->sync_data));
3623 3586          ANON_CLRBUSY(sobj->sync_data);
                   /* Broadcast only if there are waiters on the cv. */
3624 3587          if (CV_HAS_WAITERS(sobj->sync_cv))
3625 3588                  cv_broadcast(sobj->sync_cv);
3626 3589          mutex_exit(sobj->sync_mutex);
3627 3590  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX