6145 instead of using SEGOP_* macros, define full-fledged segop_* functions
--- old/usr/src/uts/common/vm/vm_seg.c
+++ new/usr/src/uts/common/vm/vm_seg.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright (c) 2015, Joyent, Inc.
25 + * Copyright 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
25 26 */
26 27
27 28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 29 /* All Rights Reserved */
29 30
30 31 /*
31 32 * University Copyright- Copyright (c) 1982, 1986, 1988
32 33 * The Regents of the University of California
33 34 * All Rights Reserved
34 35 *
35 36 * University Acknowledgment- Portions of this document are derived from
36 37 * software developed by the University of California, Berkeley, and its
37 38 * contributors.
38 39 */
39 40
40 41 /*
41 42 * VM - segment management.
42 43 */
43 44
44 45 #include <sys/types.h>
45 46 #include <sys/inttypes.h>
46 47 #include <sys/t_lock.h>
47 48 #include <sys/param.h>
48 49 #include <sys/systm.h>
49 50 #include <sys/kmem.h>
50 51 #include <sys/sysmacros.h>
51 52 #include <sys/vmsystm.h>
52 53 #include <sys/tuneable.h>
53 54 #include <sys/debug.h>
54 55 #include <sys/fs/swapnode.h>
55 56 #include <sys/cmn_err.h>
56 57 #include <sys/callb.h>
57 58 #include <sys/mem_config.h>
58 59 #include <sys/mman.h>
59 60
60 61 #include <vm/hat.h>
61 62 #include <vm/as.h>
62 63 #include <vm/seg.h>
63 64 #include <vm/seg_kmem.h>
64 65 #include <vm/seg_spt.h>
65 66 #include <vm/seg_vn.h>
66 67 #include <vm/anon.h>
67 68
68 69 /*
69 70 * kstats for segment advise
70 71 */
71 72 segadvstat_t segadvstat = {
72 73 { "MADV_FREE_hit", KSTAT_DATA_ULONG },
73 74 { "MADV_FREE_miss", KSTAT_DATA_ULONG },
74 75 };
75 76
76 77 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
77 78 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
78 79
79 80 /*
80 81 * entry in the segment page cache
81 82 */
82 83 struct seg_pcache {
83 84 struct seg_pcache *p_hnext; /* list for hashed blocks */
84 85 struct seg_pcache *p_hprev;
85 86 pcache_link_t p_plink; /* per segment/amp list */
86 87 void *p_htag0; /* segment/amp pointer */
87 88 caddr_t p_addr; /* base address/anon_idx */
88 89 size_t p_len; /* total bytes */
89 90 size_t p_wlen; /* writable bytes at p_addr */
90 91 struct page **p_pp; /* pp shadow list */
91 92 seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
92 93 clock_t p_lbolt; /* lbolt from last use */
93 94 struct seg_phash *p_hashp; /* our pcache hash bucket */
94 95 uint_t p_active; /* active count */
95 96 uchar_t p_write; /* true if S_WRITE */
96 97 uchar_t p_ref; /* reference byte */
97 98 ushort_t p_flags; /* bit flags */
98 99 };
99 100
100 101 struct seg_phash {
101 102 struct seg_pcache *p_hnext; /* list for hashed blocks */
102 103 struct seg_pcache *p_hprev;
103 104 kmutex_t p_hmutex; /* protects hash bucket */
104 105 pcache_link_t p_halink[2]; /* active bucket linkages */
105 106 };
106 107
107 108 struct seg_phash_wired {
108 109 struct seg_pcache *p_hnext; /* list for hashed blocks */
109 110 struct seg_pcache *p_hprev;
110 111 kmutex_t p_hmutex; /* protects hash bucket */
111 112 };
112 113
113 114 /*
114 115 * A parameter to control a maximum number of bytes that can be
115 116 * purged from pcache at a time.
116 117 */
117 118 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
118 119
119 120 /*
120 121 * log2(fraction of pcache to reclaim at a time).
121 122 */
122 123 #define P_SHRINK_SHFT (5)
123 124
124 125 /*
125 126 * The following variables can be tuned via /etc/system.
126 127 */
127 128
128 129 int segpcache_enabled = 1; /* if 1, shadow lists are cached */
129 130 pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */
130 131 ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */
131 132 ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
132 133 int segpcache_reap_sec = 1; /* reap check rate in secs */
133 134 clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */
134 135 int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */
135 136 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
136 137 int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
137 138 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
138 139
139 140 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
140 141 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
141 142 static kcondvar_t seg_pasync_cv;
142 143
143 144 #pragma align 64(pctrl1)
144 145 #pragma align 64(pctrl2)
145 146 #pragma align 64(pctrl3)
146 147
147 148 /*
148 149 * Keep frequently used variables together in one cache line.
149 150 */
150 151 static struct p_ctrl1 {
151 152 uint_t p_disabled; /* if not 0, caching temporarily off */
152 153 pgcnt_t p_maxwin; /* max # of pages that can be cached */
153 154 size_t p_hashwin_sz; /* # of non wired buckets */
154 155 struct seg_phash *p_htabwin; /* hash table for non wired entries */
155 156 size_t p_hashwired_sz; /* # of wired buckets */
156 157 struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
157 158 kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */
158 159 #ifdef _LP64
159 160 ulong_t pad[1];
160 161 #endif /* _LP64 */
161 162 } pctrl1;
162 163
163 164 static struct p_ctrl2 {
164 165 kmutex_t p_mem_mtx; /* protects window counter and p_halinks */
165 166 pgcnt_t p_locked_win; /* # pages from window */
166 167 pgcnt_t p_locked; /* # of pages cached by pagelock */
167 168 uchar_t p_ahcur; /* current active links for insert/delete */
168 169 uchar_t p_athr_on; /* async reclaim thread is running. */
169 170 pcache_link_t p_ahhead[2]; /* active buckets linkages */
170 171 } pctrl2;
171 172
172 173 static struct p_ctrl3 {
173 174 clock_t p_pcp_maxage; /* max pcp age in ticks */
174 175 ulong_t p_athr_empty_ahb; /* athread walk stats */
175 176 ulong_t p_athr_full_ahb; /* athread walk stats */
176 177 pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */
177 178 int p_shrink_shft; /* reap shift factor */
178 179 #ifdef _LP64
179 180 ulong_t pad[3];
180 181 #endif /* _LP64 */
181 182 } pctrl3;
182 183
183 184 #define seg_pdisabled pctrl1.p_disabled
184 185 #define seg_pmaxwindow pctrl1.p_maxwin
185 186 #define seg_phashsize_win pctrl1.p_hashwin_sz
186 187 #define seg_phashtab_win pctrl1.p_htabwin
187 188 #define seg_phashsize_wired pctrl1.p_hashwired_sz
188 189 #define seg_phashtab_wired pctrl1.p_htabwired
189 190 #define seg_pkmcache pctrl1.p_kmcache
190 191 #define seg_pmem_mtx pctrl2.p_mem_mtx
191 192 #define seg_plocked_window pctrl2.p_locked_win
192 193 #define seg_plocked pctrl2.p_locked
193 194 #define seg_pahcur pctrl2.p_ahcur
194 195 #define seg_pathr_on pctrl2.p_athr_on
195 196 #define seg_pahhead pctrl2.p_ahhead
196 197 #define seg_pmax_pcpage pctrl3.p_pcp_maxage
197 198 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
198 199 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
199 200 #define seg_pshrink_shift pctrl3.p_shrink_shft
200 201 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
201 202
202 203 #define P_HASHWIN_MASK (seg_phashsize_win - 1)
203 204 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
204 205 #define P_BASESHIFT (6)
205 206
206 207 kthread_t *seg_pasync_thr;
207 208
208 209 extern struct seg_ops segvn_ops;
209 210 extern struct seg_ops segspt_shmops;
210 211
211 212 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
212 213 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
213 214
214 215 #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t)))
215 216
216 217 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
217 218
218 219 /*
219 220 * htag0 argument can be a seg or amp pointer.
220 221 */
221 222 #define P_HASHBP(seg, htag0, addr, flags) \
222 223 (IS_PFLAGS_WIRED((flags)) ? \
223 224 ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
224 225 ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
225 226 (&seg_phashtab_win[P_HASHWIN_MASK & \
226 227 (((uintptr_t)(htag0) >> 3) ^ \
227 228 ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \
228 229 (flags >> 16) : page_get_shift((seg)->s_szc))))]))
229 230
230 231 /*
231 232 * htag0 argument can be a seg or amp pointer.
232 233 */
233 234 #define P_MATCH(pcp, htag0, addr, len) \
234 235 ((pcp)->p_htag0 == (htag0) && \
235 236 (pcp)->p_addr == (addr) && \
236 237 (pcp)->p_len >= (len))
237 238
238 239 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \
239 240 ((pcp)->p_pp == (pp) && \
240 241 (pcp)->p_htag0 == (htag0) && \
241 242 (pcp)->p_addr == (addr) && \
242 243 (pcp)->p_len >= (len))
243 244
244 245 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
245 246 offsetof(struct seg_pcache, p_plink)))
246 247
247 248 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
248 249 offsetof(struct seg_phash, p_halink[l])))
249 250
250 251 /*
251 252 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
252 253 * active hash bucket lists. We maintain active bucket lists to reduce the
253 254 * overhead of finding active buckets during asynchronous purging since there
254 255 * can be 10s of millions of buckets on a large system but only a small subset
255 256 * of them in actual use.
256 257 *
257 258 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
258 259 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
259 260 * buckets. The other list is used by asynchronous purge thread. This allows
260 261 * the purge thread to walk its active list without holding seg_pmem_mtx for a
261 262 * long time. When asynchronous thread is done with its list it switches to
262 263 * current active list and makes the list it just finished processing as
263 264 * current active list.
264 265 *
265 266 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
266 267 * yet on any list. seg_premove_abuck() may remove the bucket from either
267 268 * list. If the bucket is on current list it will be always removed. Otherwise
268 269 * the bucket is only removed if asynchronous purge thread is not currently
269 270 * running or seg_premove_abuck() is called by asynchronous purge thread
270 271 * itself. A given bucket can only be on one of active lists at a time. These
271 272 * routines should be called with per bucket lock held. The routines use
272 273 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
273 274 * the first entry is added to the bucket chain and seg_premove_abuck() must
274 275 * be called after the last pcp entry is deleted from its chain. Per bucket
275 276 * lock should be held by the callers. This avoids a potential race condition
276 277 * when seg_premove_abuck() removes a bucket after pcp entries are added to
277 278 * its list after the caller checked that the bucket has no entries. (this
278 279 * race would cause a loss of an active bucket from the active lists).
279 280 *
280 281 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
281 282 * New entries are added to the end of the list since LRU is used as the
282 283 * purging policy.
283 284 */
284 285 static void
285 286 seg_padd_abuck(struct seg_phash *hp)
286 287 {
287 288 int lix;
288 289
289 290 ASSERT(MUTEX_HELD(&hp->p_hmutex));
290 291 ASSERT((struct seg_phash *)hp->p_hnext != hp);
291 292 ASSERT((struct seg_phash *)hp->p_hprev != hp);
292 293 ASSERT(hp->p_hnext == hp->p_hprev);
293 294 ASSERT(!IS_PCP_WIRED(hp->p_hnext));
294 295 ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
295 296 ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
296 297 ASSERT(hp >= seg_phashtab_win &&
297 298 hp < &seg_phashtab_win[seg_phashsize_win]);
298 299
299 300 /*
300 301 * This bucket can already be on one of active lists
301 302 * since seg_premove_abuck() may have failed to remove it
302 303 * before.
303 304 */
304 305 mutex_enter(&seg_pmem_mtx);
305 306 lix = seg_pahcur;
306 307 ASSERT(lix >= 0 && lix <= 1);
307 308 if (hp->p_halink[lix].p_lnext != NULL) {
308 309 ASSERT(hp->p_halink[lix].p_lprev != NULL);
309 310 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
310 311 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
311 312 mutex_exit(&seg_pmem_mtx);
312 313 return;
313 314 }
314 315 ASSERT(hp->p_halink[lix].p_lprev == NULL);
315 316
316 317 /*
317 318 * If this bucket is still on list !lix async thread can't yet remove
318 319 * it since we hold here per bucket lock. In this case just return
319 320 * since async thread will eventually find and process this bucket.
320 321 */
321 322 if (hp->p_halink[!lix].p_lnext != NULL) {
322 323 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
323 324 mutex_exit(&seg_pmem_mtx);
324 325 return;
325 326 }
326 327 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
327 328 /*
328 329 * This bucket is not on any active bucket list yet.
329 330 * Add the bucket to the tail of current active list.
330 331 */
331 332 hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
332 333 hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
333 334 seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
334 335 seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
335 336 mutex_exit(&seg_pmem_mtx);
336 337 }
337 338
338 339 static void
339 340 seg_premove_abuck(struct seg_phash *hp, int athr)
340 341 {
341 342 int lix;
342 343
343 344 ASSERT(MUTEX_HELD(&hp->p_hmutex));
344 345 ASSERT((struct seg_phash *)hp->p_hnext == hp);
345 346 ASSERT((struct seg_phash *)hp->p_hprev == hp);
346 347 ASSERT(hp >= seg_phashtab_win &&
347 348 hp < &seg_phashtab_win[seg_phashsize_win]);
348 349
349 350 if (athr) {
350 351 ASSERT(seg_pathr_on);
351 352 ASSERT(seg_pahcur <= 1);
352 353 /*
353 354 * We are called by asynchronous thread that found this bucket
354 355 * on not currently active (i.e. !seg_pahcur) list. Remove it
355 356 * from there. Per bucket lock we are holding makes sure
356 357 * seg_pinsert() can't sneak in and add pcp entries to this
357 358 * bucket right before we remove the bucket from its list.
358 359 */
359 360 lix = !seg_pahcur;
360 361 ASSERT(hp->p_halink[lix].p_lnext != NULL);
361 362 ASSERT(hp->p_halink[lix].p_lprev != NULL);
362 363 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
363 364 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
364 365 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
365 366 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
366 367 hp->p_halink[lix].p_lnext = NULL;
367 368 hp->p_halink[lix].p_lprev = NULL;
368 369 return;
369 370 }
370 371
371 372 mutex_enter(&seg_pmem_mtx);
372 373 lix = seg_pahcur;
373 374 ASSERT(lix >= 0 && lix <= 1);
374 375
375 376 /*
376 377 * If the bucket is on currently active list just remove it from
377 378 * there.
378 379 */
379 380 if (hp->p_halink[lix].p_lnext != NULL) {
380 381 ASSERT(hp->p_halink[lix].p_lprev != NULL);
381 382 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
382 383 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
383 384 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
384 385 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
385 386 hp->p_halink[lix].p_lnext = NULL;
386 387 hp->p_halink[lix].p_lprev = NULL;
387 388 mutex_exit(&seg_pmem_mtx);
388 389 return;
389 390 }
390 391 ASSERT(hp->p_halink[lix].p_lprev == NULL);
391 392
392 393 /*
393 394 * If asynchronous thread is not running we can remove the bucket from
394 395 * not currently active list. The bucket must be on this list since we
395 396 * already checked that it's not on the other list and the bucket from
396 397 * which we just deleted the last pcp entry must be still on one of the
397 398 * active bucket lists.
398 399 */
399 400 lix = !lix;
400 401 ASSERT(hp->p_halink[lix].p_lnext != NULL);
401 402 ASSERT(hp->p_halink[lix].p_lprev != NULL);
402 403
403 404 if (!seg_pathr_on) {
404 405 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
405 406 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
406 407 hp->p_halink[lix].p_lnext = NULL;
407 408 hp->p_halink[lix].p_lprev = NULL;
408 409 }
409 410 mutex_exit(&seg_pmem_mtx);
410 411 }
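
The hash chains and the active bucket lists above all rely on the same sentinel-anchored, circular doubly-linked list idiom: an empty list points back at its own anchor, new entries go on the tail (LRU order), and unlinking needs no head or empty special cases. A minimal standalone sketch of that idiom follows; the node and head types are hypothetical stand-ins, not the seg_pcache/pcache_link_t structures in this file.

struct node {
	struct node *next;
	struct node *prev;
};

/* An empty list is an anchor whose links point back at itself. */
static void
list_init(struct node *head)
{
	head->next = head;
	head->prev = head;
}

/* Mirrors the hp->p_hnext == (struct seg_pcache *)hp test above. */
static int
list_is_empty(struct node *head)
{
	return (head->next == head);
}

/* Tail insert, as seg_padd_abuck() does to keep LRU ordering. */
static void
list_insert_tail(struct node *head, struct node *np)
{
	np->next = head;
	np->prev = head->prev;
	head->prev->next = np;
	head->prev = np;
}

/* Unlink, as seg_premove_abuck() does; works for any list position. */
static void
list_remove(struct node *np)
{
	np->prev->next = np->next;
	np->next->prev = np->prev;
	np->next = NULL;
	np->prev = NULL;
}
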
411 412
412 413 /*
413 414 * Check if bucket pointed by hp already has a pcp entry that matches request
414 415 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
415 416 * Also delete matching entries that cover smaller address range but start
416 417 * at the same address as addr argument. Return the list of deleted entries if
417 418 * any. This is an internal helper function called from seg_pinsert() only
418 419 * for non wired shadow lists. The caller already holds a per seg/amp list
419 420 * lock.
420 421 */
421 422 static struct seg_pcache *
422 423 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
423 424 caddr_t addr, size_t len, int *found)
424 425 {
425 426 struct seg_pcache *pcp;
426 427 struct seg_pcache *delcallb_list = NULL;
427 428
428 429 ASSERT(MUTEX_HELD(&hp->p_hmutex));
429 430
430 431 *found = 0;
431 432 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
432 433 pcp = pcp->p_hnext) {
433 434 ASSERT(pcp->p_hashp == hp);
434 435 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
435 436 ASSERT(!IS_PCP_WIRED(pcp));
436 437 if (pcp->p_len < len) {
437 438 pcache_link_t *plinkp;
438 439 if (pcp->p_active) {
439 440 continue;
440 441 }
441 442 plinkp = &pcp->p_plink;
442 443 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
443 444 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
444 445 pcp->p_hprev->p_hnext = pcp->p_hnext;
445 446 pcp->p_hnext->p_hprev = pcp->p_hprev;
446 447 pcp->p_hprev = delcallb_list;
447 448 delcallb_list = pcp;
448 449 } else {
449 450 *found = 1;
450 451 break;
451 452 }
452 453 }
453 454 }
454 455 return (delcallb_list);
455 456 }
456 457
457 458 /*
458 459 * lookup an address range in pagelock cache. Return shadow list and bump up
459 460 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
460 461 * as a lookup tag.
461 462 */
462 463 struct page **
463 464 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
464 465 enum seg_rw rw, uint_t flags)
465 466 {
466 467 struct seg_pcache *pcp;
467 468 struct seg_phash *hp;
468 469 void *htag0;
469 470
470 471 ASSERT(seg != NULL);
471 472 ASSERT(rw == S_READ || rw == S_WRITE);
472 473
473 474 /*
474 475 * Skip pagelock cache, while DR is in progress or
475 476 * seg_pcache is off.
476 477 */
477 478 if (seg_pdisabled) {
478 479 return (NULL);
479 480 }
480 481 ASSERT(seg_phashsize_win != 0);
481 482
482 483 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
483 484 hp = P_HASHBP(seg, htag0, addr, flags);
484 485 mutex_enter(&hp->p_hmutex);
485 486 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
486 487 pcp = pcp->p_hnext) {
487 488 ASSERT(pcp->p_hashp == hp);
488 489 if (P_MATCH(pcp, htag0, addr, len)) {
489 490 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
490 491 /*
491 492 * If this request wants to write pages
492 493 * but write permissions starting from
493 494 * addr don't cover the entire length len
494 495 * return lookup failure back to the caller.
495 496 * It will check protections and fail this
496 497 * pagelock operation with EACCES error.
497 498 */
498 499 if (rw == S_WRITE && pcp->p_wlen < len) {
499 500 break;
500 501 }
501 502 if (pcp->p_active == UINT_MAX) {
502 503 break;
503 504 }
504 505 pcp->p_active++;
505 506 if (rw == S_WRITE && !pcp->p_write) {
506 507 pcp->p_write = 1;
507 508 }
508 509 mutex_exit(&hp->p_hmutex);
509 510 return (pcp->p_pp);
510 511 }
511 512 }
512 513 mutex_exit(&hp->p_hmutex);
513 514 return (NULL);
514 515 }
515 516
516 517 /*
517 518 * mark address range inactive. If the cache is off or the address range is
518 519 * not in the cache or another shadow list that covers bigger range is found
519 520 * we call the segment driver to reclaim the pages. Otherwise just decrement
520 521 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
521 522 * otherwise use seg as a lookup tag.
522 523 */
523 524 void
524 525 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
525 526 size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
526 527 seg_preclaim_cbfunc_t callback)
527 528 {
528 529 struct seg_pcache *pcp;
529 530 struct seg_phash *hp;
530 531 kmutex_t *pmtx = NULL;
531 532 pcache_link_t *pheadp;
532 533 void *htag0;
533 534 pgcnt_t npages = 0;
534 535 int keep = 0;
535 536
536 537 ASSERT(seg != NULL);
537 538 ASSERT(rw == S_READ || rw == S_WRITE);
538 539
539 540 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
540 541
541 542 /*
542 543 * Skip lookup if pcache is not configured.
543 544 */
544 545 if (seg_phashsize_win == 0) {
545 546 goto out;
546 547 }
547 548
548 549 /*
549 550 * Grab per seg/amp lock before hash lock if we are going to remove
550 551 * inactive entry from pcache.
551 552 */
552 553 if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
553 554 if (amp == NULL) {
554 555 pheadp = &seg->s_phead;
555 556 pmtx = &seg->s_pmtx;
556 557 } else {
557 558 pheadp = &amp->a_phead;
558 559 pmtx = &amp->a_pmtx;
559 560 }
560 561 mutex_enter(pmtx);
561 562 }
562 563
563 564 hp = P_HASHBP(seg, htag0, addr, flags);
564 565 mutex_enter(&hp->p_hmutex);
565 566 again:
566 567 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
567 568 pcp = pcp->p_hnext) {
568 569 ASSERT(pcp->p_hashp == hp);
569 570 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
570 571 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
571 572 ASSERT(pcp->p_active);
572 573 if (keep) {
573 574 /*
574 575 * Don't remove this pcp entry
575 576 * if we didn't find duplicate
576 577 * shadow lists on second search.
577 578 * Somebody removed those duplicates
578 579 * since we dropped hash lock after first
579 580 * search.
580 581 */
581 582 ASSERT(pmtx != NULL);
582 583 ASSERT(!IS_PFLAGS_WIRED(flags));
583 584 mutex_exit(pmtx);
584 585 pmtx = NULL;
585 586 }
586 587 pcp->p_active--;
587 588 if (pcp->p_active == 0 && (pmtx != NULL ||
588 589 (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
589 590
590 591 /*
591 592 * This entry is no longer active. Remove it
592 593 * now either because pcaching is temporarily
593 594 * disabled or there're other pcp entries that
594 595 * can match this pagelock request (i.e. this
595 596 * entry is a duplicate).
596 597 */
597 598
598 599 ASSERT(callback == pcp->p_callback);
599 600 if (pmtx != NULL) {
600 601 pcache_link_t *plinkp = &pcp->p_plink;
601 602 ASSERT(!IS_PCP_WIRED(pcp));
602 603 ASSERT(pheadp->p_lnext != pheadp);
603 604 ASSERT(pheadp->p_lprev != pheadp);
604 605 plinkp->p_lprev->p_lnext =
605 606 plinkp->p_lnext;
606 607 plinkp->p_lnext->p_lprev =
607 608 plinkp->p_lprev;
608 609 }
609 610 pcp->p_hprev->p_hnext = pcp->p_hnext;
610 611 pcp->p_hnext->p_hprev = pcp->p_hprev;
611 612 if (!IS_PCP_WIRED(pcp) &&
612 613 hp->p_hnext == (struct seg_pcache *)hp) {
613 614 /*
614 615 * We removed the last entry from this
615 616 * bucket. Now remove the bucket from
616 617 * its active list.
617 618 */
618 619 seg_premove_abuck(hp, 0);
619 620 }
620 621 mutex_exit(&hp->p_hmutex);
621 622 if (pmtx != NULL) {
622 623 mutex_exit(pmtx);
623 624 }
624 625 len = pcp->p_len;
625 626 npages = btop(len);
626 627 if (rw != S_WRITE && pcp->p_write) {
627 628 rw = S_WRITE;
628 629 }
629 630 kmem_cache_free(seg_pkmcache, pcp);
630 631 goto out;
631 632 } else {
632 633 /*
633 634 * We found a matching pcp entry but will not
634 635 * free it right away even if it's no longer
635 636 * active.
636 637 */
637 638 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
638 639 /*
639 640 * Set the reference bit and mark the
640 641 * time of last access to this pcp
641 642 * so that asynchronous thread doesn't
642 643 * free it immediately since
643 644 * it may be reactivated very soon.
644 645 */
645 646 pcp->p_lbolt = ddi_get_lbolt();
646 647 pcp->p_ref = 1;
647 648 }
648 649 mutex_exit(&hp->p_hmutex);
649 650 if (pmtx != NULL) {
650 651 mutex_exit(pmtx);
651 652 }
652 653 return;
653 654 }
654 655 } else if (!IS_PFLAGS_WIRED(flags) &&
655 656 P_MATCH(pcp, htag0, addr, len)) {
656 657 /*
657 658 * This is a duplicate pcp entry. This situation may
658 659 * happen if a bigger shadow list that covers our
659 660 * range was added while our entry was still active.
660 661 * Now we can free our pcp entry if it becomes
661 662 * inactive.
662 663 */
663 664 if (!pcp->p_active) {
664 665 /*
665 666 * Mark this entry as referenced just in case
666 667 * we'll free our own pcp entry soon.
667 668 */
668 669 pcp->p_lbolt = ddi_get_lbolt();
669 670 pcp->p_ref = 1;
670 671 }
671 672 if (pmtx != NULL) {
672 673 /*
673 674 * we are already holding pmtx and found a
674 675 * duplicate. Don't keep our own pcp entry.
675 676 */
676 677 keep = 0;
677 678 continue;
678 679 }
679 680 /*
680 681 * We have to use mutex_tryenter to attempt to lock
681 682 * seg/amp list lock since we already hold hash lock
682 683 * and seg/amp list lock is above hash lock in lock
683 684 * order. If mutex_tryenter fails drop hash lock and
684 685 * retake both locks in correct order and research
685 686 * retake both locks in correct order and re-search
686 687 */
687 688 ASSERT(keep == 0);
688 689 if (amp == NULL) {
689 690 pheadp = &seg->s_phead;
690 691 pmtx = &seg->s_pmtx;
691 692 } else {
692 693 pheadp = &amp->a_phead;
693 694 pmtx = &amp->a_pmtx;
694 695 }
695 696 if (!mutex_tryenter(pmtx)) {
696 697 mutex_exit(&hp->p_hmutex);
697 698 mutex_enter(pmtx);
698 699 mutex_enter(&hp->p_hmutex);
699 700 /*
700 701 * If we don't find bigger shadow list on
701 702 * second search (it may happen since we
702 703 * dropped bucket lock) keep the entry that
703 704 * matches our own shadow list.
704 705 */
705 706 keep = 1;
706 707 goto again;
707 708 }
708 709 }
709 710 }
710 711 mutex_exit(&hp->p_hmutex);
711 712 if (pmtx != NULL) {
712 713 mutex_exit(pmtx);
713 714 }
714 715 out:
715 716 (*callback)(htag0, addr, len, pp, rw, 0);
716 717 if (npages) {
717 718 mutex_enter(&seg_pmem_mtx);
718 719 ASSERT(seg_plocked >= npages);
719 720 seg_plocked -= npages;
720 721 if (!IS_PFLAGS_WIRED(flags)) {
721 722 ASSERT(seg_plocked_window >= npages);
722 723 seg_plocked_window -= npages;
723 724 }
724 725 mutex_exit(&seg_pmem_mtx);
725 726 }
726 727
727 728 }
728 729
729 730 #ifdef DEBUG
730 731 static uint32_t p_insert_chk_mtbf = 0;
731 732 #endif
732 733
733 734 /*
734 735 * The seg_pinsert_check() is used by segment drivers to predict whether
735 736 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
736 737 */
737 738 /*ARGSUSED*/
738 739 int
739 740 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
740 741 size_t len, uint_t flags)
741 742 {
742 743 ASSERT(seg != NULL);
743 744
744 745 #ifdef DEBUG
745 746 if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
746 747 return (SEGP_FAIL);
747 748 }
748 749 #endif
749 750
750 751 if (seg_pdisabled) {
751 752 return (SEGP_FAIL);
752 753 }
753 754 ASSERT(seg_phashsize_win != 0);
754 755
755 756 if (IS_PFLAGS_WIRED(flags)) {
756 757 return (SEGP_SUCCESS);
757 758 }
758 759
759 760 if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
760 761 return (SEGP_FAIL);
761 762 }
762 763
763 764 if (freemem < desfree) {
764 765 return (SEGP_FAIL);
765 766 }
766 767
767 768 return (SEGP_SUCCESS);
768 769 }
769 770
770 771 #ifdef DEBUG
771 772 static uint32_t p_insert_mtbf = 0;
772 773 #endif
773 774
774 775 /*
775 776 * Insert address range with shadow list into pagelock cache if there's no
776 777 * shadow list already cached for this address range. If the cache is off or
777 778 * caching is temporarily disabled or the allowed 'window' is exceeded return
778 779 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
779 780 *
780 781 * For non wired shadow lists (segvn case) include address in the hashing
781 782 * function to avoid linking all the entries from the same segment or amp on
782 783 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
783 784 * pcache entries are also linked on a per segment/amp list so that all
784 785 * entries can be found quickly during seg/amp purge without walking the
785 786 * entire pcache hash table. For wired shadow lists (segspt case) we
786 787 * don't use address hashing and per segment linking because the caller
787 788 * currently inserts only one entry per segment that covers the entire
788 789 * segment. If we used per segment linking even for segspt it would complicate
789 790 * seg_ppurge_wiredpp() locking.
790 791 *
791 792 * Both hash bucket and per seg/amp locks need to be held before adding a non
792 793 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
793 794 * first.
794 795 *
795 796 * This function will also remove from pcache old inactive shadow lists that
796 797 * overlap with this request but cover smaller range for the same start
797 798 * address.
798 799 */
799 800 int
800 801 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
801 802 size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
802 803 seg_preclaim_cbfunc_t callback)
803 804 {
804 805 struct seg_pcache *pcp;
805 806 struct seg_phash *hp;
806 807 pgcnt_t npages;
807 808 pcache_link_t *pheadp;
808 809 kmutex_t *pmtx;
809 810 struct seg_pcache *delcallb_list = NULL;
810 811
811 812 ASSERT(seg != NULL);
812 813 ASSERT(rw == S_READ || rw == S_WRITE);
813 814 ASSERT(rw == S_READ || wlen == len);
814 815 ASSERT(rw == S_WRITE || wlen <= len);
815 816 ASSERT(amp == NULL || wlen == len);
816 817
817 818 #ifdef DEBUG
818 819 if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
819 820 return (SEGP_FAIL);
820 821 }
821 822 #endif
822 823
823 824 if (seg_pdisabled) {
824 825 return (SEGP_FAIL);
825 826 }
826 827 ASSERT(seg_phashsize_win != 0);
827 828
828 829 ASSERT((len & PAGEOFFSET) == 0);
829 830 npages = btop(len);
830 831 mutex_enter(&seg_pmem_mtx);
831 832 if (!IS_PFLAGS_WIRED(flags)) {
832 833 if (seg_plocked_window + npages > seg_pmaxwindow) {
833 834 mutex_exit(&seg_pmem_mtx);
834 835 return (SEGP_FAIL);
835 836 }
836 837 seg_plocked_window += npages;
837 838 }
838 839 seg_plocked += npages;
839 840 mutex_exit(&seg_pmem_mtx);
840 841
841 842 pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
842 843 /*
843 844 * If amp is not NULL set htag0 to amp otherwise set it to seg.
844 845 */
845 846 if (amp == NULL) {
846 847 pcp->p_htag0 = (void *)seg;
847 848 pcp->p_flags = flags & 0xffff;
848 849 } else {
849 850 pcp->p_htag0 = (void *)amp;
850 851 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
851 852 }
852 853 pcp->p_addr = addr;
853 854 pcp->p_len = len;
854 855 pcp->p_wlen = wlen;
855 856 pcp->p_pp = pp;
856 857 pcp->p_write = (rw == S_WRITE);
857 858 pcp->p_callback = callback;
858 859 pcp->p_active = 1;
859 860
860 861 hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
861 862 if (!IS_PFLAGS_WIRED(flags)) {
862 863 int found;
863 864 void *htag0;
864 865 if (amp == NULL) {
865 866 pheadp = &seg->s_phead;
866 867 pmtx = &seg->s_pmtx;
867 868 htag0 = (void *)seg;
868 869 } else {
869 870 pheadp = &amp->a_phead;
870 871 pmtx = &amp->a_pmtx;
871 872 htag0 = (void *)amp;
872 873 }
873 874 mutex_enter(pmtx);
874 875 mutex_enter(&hp->p_hmutex);
875 876 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
876 877 len, &found);
877 878 if (found) {
878 879 mutex_exit(&hp->p_hmutex);
879 880 mutex_exit(pmtx);
880 881 mutex_enter(&seg_pmem_mtx);
881 882 seg_plocked -= npages;
882 883 seg_plocked_window -= npages;
883 884 mutex_exit(&seg_pmem_mtx);
884 885 kmem_cache_free(seg_pkmcache, pcp);
885 886 goto out;
886 887 }
887 888 pcp->p_plink.p_lnext = pheadp->p_lnext;
888 889 pcp->p_plink.p_lprev = pheadp;
889 890 pheadp->p_lnext->p_lprev = &pcp->p_plink;
890 891 pheadp->p_lnext = &pcp->p_plink;
891 892 } else {
892 893 mutex_enter(&hp->p_hmutex);
893 894 }
894 895 pcp->p_hashp = hp;
895 896 pcp->p_hnext = hp->p_hnext;
896 897 pcp->p_hprev = (struct seg_pcache *)hp;
897 898 hp->p_hnext->p_hprev = pcp;
898 899 hp->p_hnext = pcp;
899 900 if (!IS_PFLAGS_WIRED(flags) &&
900 901 hp->p_hprev == pcp) {
901 902 seg_padd_abuck(hp);
902 903 }
903 904 mutex_exit(&hp->p_hmutex);
904 905 if (!IS_PFLAGS_WIRED(flags)) {
905 906 mutex_exit(pmtx);
906 907 }
907 908
908 909 out:
909 910 npages = 0;
910 911 while (delcallb_list != NULL) {
911 912 pcp = delcallb_list;
912 913 delcallb_list = pcp->p_hprev;
913 914 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
914 915 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
915 916 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
916 917 npages += btop(pcp->p_len);
917 918 kmem_cache_free(seg_pkmcache, pcp);
918 919 }
919 920 if (npages) {
920 921 ASSERT(!IS_PFLAGS_WIRED(flags));
921 922 mutex_enter(&seg_pmem_mtx);
922 923 ASSERT(seg_plocked >= npages);
923 924 ASSERT(seg_plocked_window >= npages);
924 925 seg_plocked -= npages;
925 926 seg_plocked_window -= npages;
926 927 mutex_exit(&seg_pmem_mtx);
927 928 }
928 929
929 930 return (SEGP_SUCCESS);
930 931 }
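
A rough sketch of how a segment driver's pagelock path is meant to drive this interface; the real consumer (segvn_pagelock()) is considerably more involved, and my_lock_pages(), my_unlock_pages() and the errno choices below are hypothetical, assuming the headers this file already includes plus <sys/errno.h>.

/* Hypothetical helpers that do the actual page locking/unlocking. */
extern struct page **my_lock_pages(struct seg *, caddr_t, size_t, enum seg_rw);
extern void my_unlock_pages(struct page **, pgcnt_t, enum seg_rw);

/*
 * Reclaim callback handed to seg_pinsert()/seg_pinactive(); pcache calls
 * it when the shadow list is finally dropped.
 */
static int
my_reclaim(void *htag0, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw, int async)
{
	my_unlock_pages(pp, btop(len), rw);
	kmem_free(pp, btop(len) * sizeof (struct page *));
	return (0);
}

static int
my_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
    enum seg_rw rw)
{
	struct page **pp;

	/* Fast path: reuse a cached shadow list covering [addr, addr + len). */
	if ((pp = seg_plookup(seg, NULL, addr, len, rw, 0)) != NULL) {
		*ppp = pp;
		return (0);
	}

	/* Cheap pre-check before doing the expensive locking work. */
	if (seg_pinsert_check(seg, NULL, addr, len, 0) == SEGP_FAIL)
		return (ENOTSUP);	/* caller falls back to a slow path */

	if ((pp = my_lock_pages(seg, addr, len, rw)) == NULL)
		return (EFAULT);

	/*
	 * Cache the shadow list; on SEGP_FAIL it simply isn't cached and
	 * the matching unlock tears it down via the callback.
	 */
	(void) seg_pinsert(seg, NULL, addr, len, len, pp, rw, 0, my_reclaim);
	*ppp = pp;
	return (0);
}

static void
my_pageunlock(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw)
{
	/* Drops the active count; may reclaim now or leave it to pcache. */
	seg_pinactive(seg, NULL, addr, len, pp, rw, 0, my_reclaim);
}
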
931 932
932 933 /*
933 934 * purge entries from the pagelock cache if not active
934 935 * and not recently used.
935 936 */
936 937 static void
937 938 seg_ppurge_async(int force)
938 939 {
939 940 struct seg_pcache *delcallb_list = NULL;
940 941 struct seg_pcache *pcp;
941 942 struct seg_phash *hp;
942 943 pgcnt_t npages = 0;
943 944 pgcnt_t npages_window = 0;
944 945 pgcnt_t npgs_to_purge;
945 946 pgcnt_t npgs_purged = 0;
946 947 int hlinks = 0;
947 948 int hlix;
948 949 pcache_link_t *hlinkp;
949 950 pcache_link_t *hlnextp = NULL;
950 951 int lowmem;
951 952 int trim;
952 953
953 954 ASSERT(seg_phashsize_win != 0);
954 955
955 956 /*
956 957 * if the cache is off or empty, return
957 958 */
958 959 if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
959 960 return;
960 961 }
961 962
962 963 if (!force) {
963 964 lowmem = 0;
964 965 trim = 0;
965 966 if (freemem < lotsfree + needfree) {
966 967 spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
967 968 if (fmem <= 5 * (desfree >> 2)) {
968 969 lowmem = 1;
969 970 } else if (fmem <= 7 * (lotsfree >> 3)) {
970 971 if (seg_plocked_window >=
971 972 (availrmem_initial >> 1)) {
972 973 lowmem = 1;
973 974 }
974 975 } else if (fmem < lotsfree) {
975 976 if (seg_plocked_window >=
976 977 3 * (availrmem_initial >> 2)) {
977 978 lowmem = 1;
978 979 }
979 980 }
980 981 }
981 982 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
982 983 trim = 1;
983 984 }
984 985 if (!lowmem && !trim) {
985 986 return;
986 987 }
987 988 npgs_to_purge = seg_plocked_window >>
988 989 seg_pshrink_shift;
989 990 if (lowmem) {
990 991 npgs_to_purge = MIN(npgs_to_purge,
991 992 MAX(seg_pmaxapurge_npages, desfree));
992 993 } else {
993 994 npgs_to_purge = MIN(npgs_to_purge,
994 995 seg_pmaxapurge_npages);
995 996 }
996 997 if (npgs_to_purge == 0) {
997 998 return;
998 999 }
999 1000 } else {
1000 1001 struct seg_phash_wired *hpw;
1001 1002
1002 1003 ASSERT(seg_phashsize_wired != 0);
1003 1004
1004 1005 for (hpw = seg_phashtab_wired;
1005 1006 hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1006 1007
1007 1008 if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1008 1009 continue;
1009 1010 }
1010 1011
1011 1012 mutex_enter(&hpw->p_hmutex);
1012 1013
1013 1014 for (pcp = hpw->p_hnext;
1014 1015 pcp != (struct seg_pcache *)hpw;
1015 1016 pcp = pcp->p_hnext) {
1016 1017
1017 1018 ASSERT(IS_PCP_WIRED(pcp));
1018 1019 ASSERT(pcp->p_hashp ==
1019 1020 (struct seg_phash *)hpw);
1020 1021
1021 1022 if (pcp->p_active) {
1022 1023 continue;
1023 1024 }
1024 1025 pcp->p_hprev->p_hnext = pcp->p_hnext;
1025 1026 pcp->p_hnext->p_hprev = pcp->p_hprev;
1026 1027 pcp->p_hprev = delcallb_list;
1027 1028 delcallb_list = pcp;
1028 1029 }
1029 1030 mutex_exit(&hpw->p_hmutex);
1030 1031 }
1031 1032 }
1032 1033
1033 1034 mutex_enter(&seg_pmem_mtx);
1034 1035 if (seg_pathr_on) {
1035 1036 mutex_exit(&seg_pmem_mtx);
1036 1037 goto runcb;
1037 1038 }
1038 1039 seg_pathr_on = 1;
1039 1040 mutex_exit(&seg_pmem_mtx);
1040 1041 ASSERT(seg_pahcur <= 1);
1041 1042 hlix = !seg_pahcur;
1042 1043
1043 1044 again:
1044 1045 for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1045 1046 hlinkp = hlnextp) {
1046 1047
1047 1048 hlnextp = hlinkp->p_lnext;
1048 1049 ASSERT(hlnextp != NULL);
1049 1050
1050 1051 hp = hlink2phash(hlinkp, hlix);
1051 1052 if (hp->p_hnext == (struct seg_pcache *)hp) {
1052 1053 seg_pathr_empty_ahb++;
1053 1054 continue;
1054 1055 }
1055 1056 seg_pathr_full_ahb++;
1056 1057 mutex_enter(&hp->p_hmutex);
1057 1058
1058 1059 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1059 1060 pcp = pcp->p_hnext) {
1060 1061 pcache_link_t *pheadp;
1061 1062 pcache_link_t *plinkp;
1062 1063 void *htag0;
1063 1064 kmutex_t *pmtx;
1064 1065
1065 1066 ASSERT(!IS_PCP_WIRED(pcp));
1066 1067 ASSERT(pcp->p_hashp == hp);
1067 1068
1068 1069 if (pcp->p_active) {
1069 1070 continue;
1070 1071 }
1071 1072 if (!force && pcp->p_ref &&
1072 1073 PCP_AGE(pcp) < seg_pmax_pcpage) {
1073 1074 pcp->p_ref = 0;
1074 1075 continue;
1075 1076 }
1076 1077 plinkp = &pcp->p_plink;
1077 1078 htag0 = pcp->p_htag0;
1078 1079 if (pcp->p_flags & SEGP_AMP) {
1079 1080 pheadp = &((amp_t *)htag0)->a_phead;
1080 1081 pmtx = &((amp_t *)htag0)->a_pmtx;
1081 1082 } else {
1082 1083 pheadp = &((seg_t *)htag0)->s_phead;
1083 1084 pmtx = &((seg_t *)htag0)->s_pmtx;
1084 1085 }
1085 1086 if (!mutex_tryenter(pmtx)) {
1086 1087 continue;
1087 1088 }
1088 1089 ASSERT(pheadp->p_lnext != pheadp);
1089 1090 ASSERT(pheadp->p_lprev != pheadp);
1090 1091 plinkp->p_lprev->p_lnext =
1091 1092 plinkp->p_lnext;
1092 1093 plinkp->p_lnext->p_lprev =
1093 1094 plinkp->p_lprev;
1094 1095 pcp->p_hprev->p_hnext = pcp->p_hnext;
1095 1096 pcp->p_hnext->p_hprev = pcp->p_hprev;
1096 1097 mutex_exit(pmtx);
1097 1098 pcp->p_hprev = delcallb_list;
1098 1099 delcallb_list = pcp;
1099 1100 npgs_purged += btop(pcp->p_len);
1100 1101 }
1101 1102 if (hp->p_hnext == (struct seg_pcache *)hp) {
1102 1103 seg_premove_abuck(hp, 1);
1103 1104 }
1104 1105 mutex_exit(&hp->p_hmutex);
1105 1106 if (npgs_purged >= seg_plocked_window) {
1106 1107 break;
1107 1108 }
1108 1109 if (!force) {
1109 1110 if (npgs_purged >= npgs_to_purge) {
1110 1111 break;
1111 1112 }
1112 1113 if (!trim && !(seg_pathr_full_ahb & 15)) {
1113 1114 ASSERT(lowmem);
1114 1115 if (freemem >= lotsfree + needfree) {
1115 1116 break;
1116 1117 }
1117 1118 }
1118 1119 }
1119 1120 }
1120 1121
1121 1122 if (hlinkp == &seg_pahhead[hlix]) {
1122 1123 /*
1123 1124 * We processed the entire hlix active bucket list
1124 1125 * but didn't find enough pages to reclaim.
1125 1126 * Switch the lists and walk the other list
1126 1127 * if we haven't done it yet.
1127 1128 */
1128 1129 mutex_enter(&seg_pmem_mtx);
1129 1130 ASSERT(seg_pathr_on);
1130 1131 ASSERT(seg_pahcur == !hlix);
1131 1132 seg_pahcur = hlix;
1132 1133 mutex_exit(&seg_pmem_mtx);
1133 1134 if (++hlinks < 2) {
1134 1135 hlix = !hlix;
1135 1136 goto again;
1136 1137 }
1137 1138 } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1138 1139 seg_pahhead[hlix].p_lnext != hlinkp) {
1139 1140 ASSERT(hlinkp != NULL);
1140 1141 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1141 1142 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1142 1143 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1143 1144
1144 1145 /*
1145 1146 * Reinsert the header to point to hlinkp
1146 1147 * so that we start from hlinkp bucket next time around.
1147 1148 */
1148 1149 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1149 1150 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1150 1151 seg_pahhead[hlix].p_lnext = hlinkp;
1151 1152 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1152 1153 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1153 1154 hlinkp->p_lprev = &seg_pahhead[hlix];
1154 1155 }
1155 1156
1156 1157 mutex_enter(&seg_pmem_mtx);
1157 1158 ASSERT(seg_pathr_on);
1158 1159 seg_pathr_on = 0;
1159 1160 mutex_exit(&seg_pmem_mtx);
1160 1161
1161 1162 runcb:
1162 1163 /*
1163 1164 * Run the delayed callback list. segments/amps can't go away until
1164 1165 * callback is executed since they must have non 0 softlockcnt. That's
1165 1166 * why we don't need to hold as/seg/amp locks to execute the callback.
1166 1167 */
1167 1168 while (delcallb_list != NULL) {
1168 1169 pcp = delcallb_list;
1169 1170 delcallb_list = pcp->p_hprev;
1170 1171 ASSERT(!pcp->p_active);
1171 1172 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1172 1173 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1173 1174 npages += btop(pcp->p_len);
1174 1175 if (!IS_PCP_WIRED(pcp)) {
1175 1176 npages_window += btop(pcp->p_len);
1176 1177 }
1177 1178 kmem_cache_free(seg_pkmcache, pcp);
1178 1179 }
1179 1180 if (npages) {
1180 1181 mutex_enter(&seg_pmem_mtx);
1181 1182 ASSERT(seg_plocked >= npages);
1182 1183 ASSERT(seg_plocked_window >= npages_window);
1183 1184 seg_plocked -= npages;
1184 1185 seg_plocked_window -= npages_window;
1185 1186 mutex_exit(&seg_pmem_mtx);
1186 1187 }
1187 1188 }
1188 1189
1189 1190 /*
1190 1191 * Remove cached pages for segment(s) entries from hashtable. The segments
1191 1192 * are identified by pp array. This is useful for multiple seg's cached on
1192 1193 * behalf of dummy segment (ISM/DISM) with common pp array.
1193 1194 */
1194 1195 void
1195 1196 seg_ppurge_wiredpp(struct page **pp)
1196 1197 {
1197 1198 struct seg_pcache *pcp;
1198 1199 struct seg_phash_wired *hp;
1199 1200 pgcnt_t npages = 0;
1200 1201 struct seg_pcache *delcallb_list = NULL;
1201 1202
1202 1203 /*
1203 1204 * if the cache is empty, return
1204 1205 */
1205 1206 if (seg_plocked == 0) {
1206 1207 return;
1207 1208 }
1208 1209 ASSERT(seg_phashsize_wired != 0);
1209 1210
1210 1211 for (hp = seg_phashtab_wired;
1211 1212 hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1212 1213 if (hp->p_hnext == (struct seg_pcache *)hp) {
1213 1214 continue;
1214 1215 }
1215 1216 mutex_enter(&hp->p_hmutex);
1216 1217 pcp = hp->p_hnext;
1217 1218 while (pcp != (struct seg_pcache *)hp) {
1218 1219 ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1219 1220 ASSERT(IS_PCP_WIRED(pcp));
1220 1221 /*
1221 1222 * purge entries which are not active
1222 1223 */
1223 1224 if (!pcp->p_active && pcp->p_pp == pp) {
1224 1225 ASSERT(pcp->p_htag0 != NULL);
1225 1226 pcp->p_hprev->p_hnext = pcp->p_hnext;
1226 1227 pcp->p_hnext->p_hprev = pcp->p_hprev;
1227 1228 pcp->p_hprev = delcallb_list;
1228 1229 delcallb_list = pcp;
1229 1230 }
1230 1231 pcp = pcp->p_hnext;
1231 1232 }
1232 1233 mutex_exit(&hp->p_hmutex);
1233 1234 /*
1234 1235 * segments can't go away until callback is executed since
1235 1236 * they must have non 0 softlockcnt. That's why we don't
1236 1237 * need to hold as/seg locks to execute the callback.
1237 1238 */
1238 1239 while (delcallb_list != NULL) {
1239 1240 int done;
1240 1241 pcp = delcallb_list;
1241 1242 delcallb_list = pcp->p_hprev;
1242 1243 ASSERT(!pcp->p_active);
1243 1244 done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1244 1245 pcp->p_len, pcp->p_pp,
1245 1246 pcp->p_write ? S_WRITE : S_READ, 1);
1246 1247 npages += btop(pcp->p_len);
1247 1248 ASSERT(IS_PCP_WIRED(pcp));
1248 1249 kmem_cache_free(seg_pkmcache, pcp);
1249 1250 if (done) {
1250 1251 ASSERT(delcallb_list == NULL);
1251 1252 goto out;
1252 1253 }
1253 1254 }
1254 1255 }
1255 1256
1256 1257 out:
1257 1258 mutex_enter(&seg_pmem_mtx);
1258 1259 ASSERT(seg_plocked >= npages);
1259 1260 seg_plocked -= npages;
1260 1261 mutex_exit(&seg_pmem_mtx);
1261 1262 }
1262 1263
1263 1264 /*
1264 1265 * purge all entries for a given segment. Since we
1265 1266 * callback into the segment driver directly for page
1266 1267 * reclaim the caller needs to hold the right locks.
1267 1268 */
1268 1269 void
1269 1270 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1270 1271 {
1271 1272 struct seg_pcache *delcallb_list = NULL;
1272 1273 struct seg_pcache *pcp;
1273 1274 struct seg_phash *hp;
1274 1275 pgcnt_t npages = 0;
1275 1276 void *htag0;
1276 1277
1277 1278 if (seg_plocked == 0) {
1278 1279 return;
1279 1280 }
1280 1281 ASSERT(seg_phashsize_win != 0);
1281 1282
1282 1283 /*
1283 1284 * If amp is not NULL use amp as a lookup tag otherwise use seg
1284 1285 * as a lookup tag.
1285 1286 */
1286 1287 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1287 1288 ASSERT(htag0 != NULL);
1288 1289 if (IS_PFLAGS_WIRED(flags)) {
1289 1290 hp = P_HASHBP(seg, htag0, 0, flags);
1290 1291 mutex_enter(&hp->p_hmutex);
1291 1292 pcp = hp->p_hnext;
1292 1293 while (pcp != (struct seg_pcache *)hp) {
1293 1294 ASSERT(pcp->p_hashp == hp);
1294 1295 ASSERT(IS_PCP_WIRED(pcp));
1295 1296 if (pcp->p_htag0 == htag0) {
1296 1297 if (pcp->p_active) {
1297 1298 break;
1298 1299 }
1299 1300 pcp->p_hprev->p_hnext = pcp->p_hnext;
1300 1301 pcp->p_hnext->p_hprev = pcp->p_hprev;
1301 1302 pcp->p_hprev = delcallb_list;
1302 1303 delcallb_list = pcp;
1303 1304 }
1304 1305 pcp = pcp->p_hnext;
1305 1306 }
1306 1307 mutex_exit(&hp->p_hmutex);
1307 1308 } else {
1308 1309 pcache_link_t *plinkp;
1309 1310 pcache_link_t *pheadp;
1310 1311 kmutex_t *pmtx;
1311 1312
1312 1313 if (amp == NULL) {
1313 1314 ASSERT(seg != NULL);
1314 1315 pheadp = &seg->s_phead;
1315 1316 pmtx = &seg->s_pmtx;
1316 1317 } else {
1317 1318 pheadp = &amp->a_phead;
1318 1319 pmtx = &amp->a_pmtx;
1319 1320 }
1320 1321 mutex_enter(pmtx);
1321 1322 while ((plinkp = pheadp->p_lnext) != pheadp) {
1322 1323 pcp = plink2pcache(plinkp);
1323 1324 ASSERT(!IS_PCP_WIRED(pcp));
1324 1325 ASSERT(pcp->p_htag0 == htag0);
1325 1326 hp = pcp->p_hashp;
1326 1327 mutex_enter(&hp->p_hmutex);
1327 1328 if (pcp->p_active) {
1328 1329 mutex_exit(&hp->p_hmutex);
1329 1330 break;
1330 1331 }
1331 1332 ASSERT(plinkp->p_lprev == pheadp);
1332 1333 pheadp->p_lnext = plinkp->p_lnext;
1333 1334 plinkp->p_lnext->p_lprev = pheadp;
1334 1335 pcp->p_hprev->p_hnext = pcp->p_hnext;
1335 1336 pcp->p_hnext->p_hprev = pcp->p_hprev;
1336 1337 pcp->p_hprev = delcallb_list;
1337 1338 delcallb_list = pcp;
1338 1339 if (hp->p_hnext == (struct seg_pcache *)hp) {
1339 1340 seg_premove_abuck(hp, 0);
1340 1341 }
1341 1342 mutex_exit(&hp->p_hmutex);
1342 1343 }
1343 1344 mutex_exit(pmtx);
1344 1345 }
1345 1346 while (delcallb_list != NULL) {
1346 1347 pcp = delcallb_list;
1347 1348 delcallb_list = pcp->p_hprev;
1348 1349 ASSERT(!pcp->p_active);
1349 1350 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1350 1351 pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1351 1352 npages += btop(pcp->p_len);
1352 1353 kmem_cache_free(seg_pkmcache, pcp);
1353 1354 }
1354 1355 mutex_enter(&seg_pmem_mtx);
1355 1356 ASSERT(seg_plocked >= npages);
1356 1357 seg_plocked -= npages;
1357 1358 if (!IS_PFLAGS_WIRED(flags)) {
1358 1359 ASSERT(seg_plocked_window >= npages);
1359 1360 seg_plocked_window -= npages;
1360 1361 }
1361 1362 mutex_exit(&seg_pmem_mtx);
1362 1363 }
1363 1364
1364 1365 static void seg_pinit_mem_config(void);
1365 1366
1366 1367 /*
1367 1368 * setup the pagelock cache
1368 1369 */
1369 1370 static void
1370 1371 seg_pinit(void)
1371 1372 {
1372 1373 struct seg_phash *hp;
1373 1374 ulong_t i;
1374 1375 pgcnt_t physmegs;
1375 1376
1376 1377 seg_plocked = 0;
1377 1378 seg_plocked_window = 0;
1378 1379
1379 1380 if (segpcache_enabled == 0) {
1380 1381 seg_phashsize_win = 0;
1381 1382 seg_phashsize_wired = 0;
1382 1383 seg_pdisabled = 1;
1383 1384 return;
1384 1385 }
1385 1386
1386 1387 seg_pdisabled = 0;
1387 1388 seg_pkmcache = kmem_cache_create("seg_pcache",
1388 1389 sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1389 1390 if (segpcache_pcp_maxage_ticks <= 0) {
1390 1391 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1391 1392 }
1392 1393 seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1393 1394 seg_pathr_empty_ahb = 0;
1394 1395 seg_pathr_full_ahb = 0;
1395 1396 seg_pshrink_shift = segpcache_shrink_shift;
1396 1397 seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1397 1398
1398 1399 mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1399 1400 mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 1401 mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1401 1402 cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1402 1403
1403 1404 physmegs = physmem >> (20 - PAGESHIFT);
1404 1405
1405 1406 /*
1406 1407 * If segpcache_hashsize_win was not set in /etc/system or it has
1407 1408 * absurd value set it to a default.
1408 1409 */
1409 1410 if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1410 1411 /*
1411 1412 * Create one bucket per 32K (or at least per 8 pages) of
1412 1413 * available memory.
1413 1414 */
1414 1415 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1415 1416 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1416 1417 }
1417 1418 if (!ISP2(segpcache_hashsize_win)) {
1418 1419 ulong_t rndfac = ~(1UL <<
1419 1420 (highbit(segpcache_hashsize_win) - 1));
1420 1421 rndfac &= segpcache_hashsize_win;
1421 1422 segpcache_hashsize_win += rndfac;
1422 1423 segpcache_hashsize_win = 1 <<
1423 1424 (highbit(segpcache_hashsize_win) - 1);
1424 1425 }
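	/*
	 * Worked example of the rounding above (illustrative): a non power
	 * of two rounds to the nearest power of two. E.g. 1500: top bit
	 * 1024, remainder 476, 1500 + 476 = 1976, highbit(1976) = 11,
	 * result 1 << 10 = 1024 (rounds down); 1600: remainder 576,
	 * 1600 + 576 = 2176, highbit(2176) = 12, result 1 << 11 = 2048
	 * (rounds up).
	 */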
1425 1426 seg_phashsize_win = segpcache_hashsize_win;
1426 1427 seg_phashtab_win = kmem_zalloc(
1427 1428 seg_phashsize_win * sizeof (struct seg_phash),
1428 1429 KM_SLEEP);
1429 1430 for (i = 0; i < seg_phashsize_win; i++) {
1430 1431 hp = &seg_phashtab_win[i];
1431 1432 hp->p_hnext = (struct seg_pcache *)hp;
1432 1433 hp->p_hprev = (struct seg_pcache *)hp;
1433 1434 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1434 1435 }
1435 1436
1436 1437 seg_pahcur = 0;
1437 1438 seg_pathr_on = 0;
1438 1439 seg_pahhead[0].p_lnext = &seg_pahhead[0];
1439 1440 seg_pahhead[0].p_lprev = &seg_pahhead[0];
1440 1441 seg_pahhead[1].p_lnext = &seg_pahhead[1];
1441 1442 seg_pahhead[1].p_lprev = &seg_pahhead[1];
1442 1443
1443 1444 /*
1444 1445 * If segpcache_hashsize_wired was not set in /etc/system or it has
1445 1446 * absurd value set it to a default.
1446 1447 */
1447 1448 if (segpcache_hashsize_wired == 0 ||
1448 1449 segpcache_hashsize_wired > physmem / 4) {
1449 1450 /*
1450 1451 * Choose segpcache_hashsize_wired based on physmem.
1451 1452 * Create a bucket per 128K bytes, up to 256K buckets.
1452 1453 */
1453 1454 if (physmegs < 20 * 1024) {
1454 1455 segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1455 1456 } else {
1456 1457 segpcache_hashsize_wired = 256 * 1024;
1457 1458 }
1458 1459 }
1459 1460 if (!ISP2(segpcache_hashsize_wired)) {
1460 1461 segpcache_hashsize_wired = 1 <<
1461 1462 highbit(segpcache_hashsize_wired);
1462 1463 }
1463 1464 seg_phashsize_wired = segpcache_hashsize_wired;
1464 1465 seg_phashtab_wired = kmem_zalloc(
1465 1466 seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1466 1467 for (i = 0; i < seg_phashsize_wired; i++) {
1467 1468 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1468 1469 hp->p_hnext = (struct seg_pcache *)hp;
1469 1470 hp->p_hprev = (struct seg_pcache *)hp;
1470 1471 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1471 1472 }
1472 1473
1473 1474 if (segpcache_maxwindow == 0) {
1474 1475 if (physmegs < 64) {
1475 1476 /* 3% of memory */
1476 1477 segpcache_maxwindow = availrmem >> 5;
1477 1478 } else if (physmegs < 512) {
1478 1479 /* 12% of memory */
1479 1480 segpcache_maxwindow = availrmem >> 3;
1480 1481 } else if (physmegs < 1024) {
1481 1482 /* 25% of memory */
1482 1483 segpcache_maxwindow = availrmem >> 2;
1483 1484 } else if (physmegs < 2048) {
1484 1485 /* 50% of memory */
1485 1486 segpcache_maxwindow = availrmem >> 1;
1486 1487 } else {
1487 1488 /* no limit */
1488 1489 segpcache_maxwindow = (pgcnt_t)-1;
1489 1490 }
1490 1491 }
1491 1492 seg_pmaxwindow = segpcache_maxwindow;
1492 1493 seg_pinit_mem_config();
1493 1494 }
1494 1495
1495 1496 /*
1496 1497 * called by pageout if memory is low
1497 1498 */
1498 1499 void
1499 1500 seg_preap(void)
1500 1501 {
1501 1502 /*
1502 1503 * if the cache is off or empty, return
1503 1504 */
1504 1505 if (seg_plocked_window == 0) {
1505 1506 return;
1506 1507 }
1507 1508 ASSERT(seg_phashsize_win != 0);
1508 1509
1509 1510 /*
1510 1511 * If somebody is already purging pcache
1511 1512 * just return.
1512 1513 */
1513 1514 if (seg_pdisabled) {
1514 1515 return;
1515 1516 }
1516 1517
1517 1518 cv_signal(&seg_pasync_cv);
1518 1519 }
1519 1520
1520 1521 /*
1521 1522 * run as a background thread and reclaim pagelock
1522 1523 * pages which have not been used recently
1523 1524 */
1524 1525 void
1525 1526 seg_pasync_thread(void)
1526 1527 {
1527 1528 callb_cpr_t cpr_info;
1528 1529
1529 1530 if (seg_phashsize_win == 0) {
1530 1531 thread_exit();
1531 1532 /*NOTREACHED*/
1532 1533 }
1533 1534
1534 1535 seg_pasync_thr = curthread;
1535 1536
1536 1537 CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1537 1538 callb_generic_cpr, "seg_pasync");
1538 1539
1539 1540 if (segpcache_reap_ticks <= 0) {
1540 1541 segpcache_reap_ticks = segpcache_reap_sec * hz;
1541 1542 }
1542 1543
1543 1544 mutex_enter(&seg_pasync_mtx);
1544 1545 for (;;) {
1545 1546 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1546 1547 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1547 1548 segpcache_reap_ticks, TR_CLOCK_TICK);
1548 1549 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1549 1550 if (seg_pdisabled == 0) {
1550 1551 seg_ppurge_async(0);
1551 1552 }
1552 1553 }
1553 1554 }
1554 1555
1555 1556 static struct kmem_cache *seg_cache;
1556 1557
1557 1558 /*
1558 1559 * Initialize segment management data structures.
1559 1560 */
1560 1561 void
1561 1562 seg_init(void)
1562 1563 {
1563 1564 kstat_t *ksp;
1564 1565
1565 1566 seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566 1567 0, NULL, NULL, NULL, NULL, NULL, 0);
1567 1568
1568 1569 ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1569 1570 segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1570 1571 if (ksp) {
1571 1572 ksp->ks_data = (void *)segadvstat_ptr;
1572 1573 kstat_install(ksp);
1573 1574 }
1574 1575
1575 1576 seg_pinit();
1576 1577 }
1577 1578
1578 1579 /*
1579 1580 * Allocate a segment to cover [base, base+size]
1580 1581 * and attach it to the specified address space.
1581 1582 */
1582 1583 struct seg *
1583 1584 seg_alloc(struct as *as, caddr_t base, size_t size)
1584 1585 {
1585 1586 struct seg *new;
1586 1587 caddr_t segbase;
1587 1588 size_t segsize;
1588 1589
1589 1590 segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1590 1591 segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1591 1592 (uintptr_t)segbase;
1592 1593
1593 1594 if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1594 1595 return ((struct seg *)NULL); /* bad virtual addr range */
1595 1596
1596 1597 if (as != &kas &&
1597 1598 valid_usr_range(segbase, segsize, 0, as,
1598 1599 as->a_userlimit) != RANGE_OKAY)
1599 1600 return ((struct seg *)NULL); /* bad virtual addr range */
1600 1601
1601 1602 new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1602 1603 new->s_ops = NULL;
1603 1604 new->s_data = NULL;
1604 1605 new->s_szc = 0;
1605 1606 new->s_flags = 0;
1606 1607 mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607 1608 new->s_phead.p_lnext = &new->s_phead;
1608 1609 new->s_phead.p_lprev = &new->s_phead;
1609 1610 if (seg_attach(as, segbase, segsize, new) < 0) {
1610 1611 kmem_cache_free(seg_cache, new);
1611 1612 return ((struct seg *)NULL);
1612 1613 }
1613 1614 /* caller must fill in ops, data */
1614 1615 return (new);
1615 1616 }
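
A sketch of the intended calling pattern for seg_alloc(): the caller holds the address space locked as writer, lets a segment driver's create routine fill in s_ops and s_data, and uses seg_free() on failure since no mapping was established yet. segfoo_create() and the errno choice are hypothetical stand-ins (the real path is as_map() calling e.g. segvn_create()).

/* Hypothetical driver create routine that sets seg->s_ops and seg->s_data. */
extern int segfoo_create(struct seg *, void *);

static int
map_with_foo(struct as *as, caddr_t base, size_t size, void *crargs)
{
	struct seg *seg;
	int err;

	/* Page-aligns the range and links an empty segment into 'as'. */
	if ((seg = seg_alloc(as, base, size)) == NULL)
		return (ENOMEM);

	if ((err = segfoo_create(seg, crargs)) != 0) {
		/* No mapping established yet, so seg_free(), not seg_unmap(). */
		seg_free(seg);
		return (err);
	}
	return (0);
}
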
1616 1617
1617 1618 /*
1618 1619 * Attach a segment to the address space. Used by seg_alloc()
1619 1620 * and for kernel startup to attach to static segments.
1620 1621 */
1621 1622 int
1622 1623 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1623 1624 {
1624 1625 seg->s_as = as;
1625 1626 seg->s_base = base;
1626 1627 seg->s_size = size;
1627 1628
1628 1629 /*
1629 1630 * as_addseg() will add the segment at the appropriate point
1630 1631 * in the list. It will return -1 if there is overlap with
1631 1632 * an already existing segment.
1632 1633 */
1633 1634 return (as_addseg(as, seg));
1634 1635 }
1635 1636
1636 1637 /*
1637 1638 * Unmap a segment and free it from its associated address space.
1638 1639 * This should be called by anybody who's finished with a whole segment's
1639 1640 * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
1640 1641 * responsibility of the segment driver to unlink the segment
1641 1642 * from the address space, and to free public and private data structures
1642 1643 * associated with the segment. (This is typically done by a call to
1643 1644 * seg_free()).
1644 1645 */
1645 1646 void
1646 1647 seg_unmap(struct seg *seg)
1647 1648 {
1648 1649 #ifdef DEBUG
1649 1650 int ret;
1650 1651 #endif /* DEBUG */
1651 1652
1652 1653 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1653 1654
1654 1655 /* Shouldn't have called seg_unmap if mapping isn't yet established */
1655 1656 ASSERT(seg->s_data != NULL);
1656 1657
1657 1658 /* Unmap the whole mapping */
1658 1659 #ifdef DEBUG
1659 1660 ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1660 1661 ASSERT(ret == 0);
1661 1662 #else
1662 1663 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1663 1664 #endif /* DEBUG */
1664 1665 }
1665 1666
1666 1667 /*
1667 1668 * Free the segment from its associated as. This should only be called
1668 1669 * if a mapping to the segment has not yet been established (e.g., if
1669 1670 * an error occurs in the middle of doing an as_map when the segment
1670 1671 * has already been partially set up) or if it has already been deleted
1671 1672 * (e.g., from a segment driver unmap routine if the unmap applies to the
1672 1673 * entire segment). If the mapping is currently set up then seg_unmap() should
1673 1674 * be called instead.
1674 1675 */
1675 1676 void
1676 1677 seg_free(struct seg *seg)
1677 1678 {
1678 1679 register struct as *as = seg->s_as;
1679 1680 struct seg *tseg = as_removeseg(as, seg);
1680 1681
1681 1682 ASSERT(tseg == seg);
1682 1683
1683 1684 /*
1684 1685 * If the segment private data field is NULL,
1685 1686 * then segment driver is not attached yet.
1686 1687 */
1687 1688 if (seg->s_data != NULL)
1688 1689 SEGOP_FREE(seg);
1689 1690
1690 1691 mutex_destroy(&seg->s_pmtx);
1691 1692 ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692 1693 ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1693 1694 kmem_cache_free(seg_cache, seg);
1694 1695 }
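
A hedged sketch of the division of labor described in the seg_unmap()/seg_free() comments above, assuming a hypothetical driver (my_seg_unmap() and my_segdata_t are illustrative names): when the unmap covers the entire segment, the driver unloads the translations, frees its private data, and finishes with seg_free().

	/*
	 * Hedged sketch only: a hypothetical driver unmap entry point
	 * handling the whole-segment case and ending with seg_free().
	 */
	static int
	my_seg_unmap(struct seg *seg, caddr_t addr, size_t len)
	{
		my_segdata_t *data = seg->s_data;

		ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

		/* only the whole-segment case is sketched here */
		if (addr != seg->s_base || len != seg->s_size)
			return (EINVAL);

		hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
		kmem_free(data, sizeof (*data));
		seg->s_data = NULL;

		seg_free(seg);	/* unlinks the segment from its as and frees it */
		return (0);
	}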
1695 1696
1696 1697 /*ARGSUSED*/
1697 1698 static void
1698 1699 seg_p_mem_config_post_add(
1699 1700 void *arg,
1700 1701 pgcnt_t delta_pages)
1701 1702 {
1702 1703 /* Nothing to do. */
1703 1704 }
1704 1705
1705 1706 void
1706 1707 seg_p_enable(void)
1707 1708 {
1708 1709 mutex_enter(&seg_pcache_mtx);
1709 1710 ASSERT(seg_pdisabled != 0);
1710 1711 seg_pdisabled--;
1711 1712 mutex_exit(&seg_pcache_mtx);
1712 1713 }
1713 1714
1714 1715 /*
1715 1716 * seg_p_disable - disables seg_pcache, and then attempts to empty the
1716 1717 * cache.
1717 1718 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1718 1719 * SEGP_FAIL if the cache could not be emptied.
1719 1720 */
1720 1721 int
1721 1722 seg_p_disable(void)
1722 1723 {
1723 1724 pgcnt_t old_plocked;
1724 1725 int stall_count = 0;
1725 1726
1726 1727 mutex_enter(&seg_pcache_mtx);
1727 1728 seg_pdisabled++;
1728 1729 ASSERT(seg_pdisabled != 0);
1729 1730 mutex_exit(&seg_pcache_mtx);
1730 1731
1731 1732 /*
1732 1733 * Attempt to empty the cache. Terminate if seg_plocked does not
1733 1734 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1734 1735 */
1735 1736 while (seg_plocked != 0) {
1736 1737 ASSERT(seg_phashsize_win != 0);
1737 1738 old_plocked = seg_plocked;
1738 1739 seg_ppurge_async(1);
1739 1740 if (seg_plocked == old_plocked) {
1740 1741 if (stall_count++ > SEGP_STALL_THRESHOLD) {
1741 1742 return (SEGP_FAIL);
1742 1743 }
1743 1744 } else
1744 1745 stall_count = 0;
1745 1746 if (seg_plocked != 0)
1746 1747 delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1747 1748 }
1748 1749 return (SEGP_SUCCESS);
1749 1750 }
1750 1751
1751 1752 /*
1752 1753 * Attempt to purge seg_pcache. May need to return before this has
1753 1754 * completed to allow other pre_del callbacks to unlock pages. This is
1754 1755 * ok because:
1755 1756 * 1) The seg_pdisabled flag has been set so at least we won't
1756 1757 * cache any more locks and the locks we couldn't purge
1757 1758 * will not be held if they do get released by a subsequent
1758 1759 * pre-delete callback.
1759 1760 *
1760 1761 * 2) The rest of the memory delete thread processing does not
1761 1762 * depend on the changes made in this pre-delete callback. No
1762 1763 * panics will result, the worst that will happen is that the
1763 1764 * DR code will timeout and cancel the delete.
1764 1765 */
1765 1766 /*ARGSUSED*/
1766 1767 static int
1767 1768 seg_p_mem_config_pre_del(
1768 1769 void *arg,
1769 1770 pgcnt_t delta_pages)
1770 1771 {
1771 1772 if (seg_phashsize_win == 0) {
1772 1773 return (0);
1773 1774 }
1774 1775 if (seg_p_disable() != SEGP_SUCCESS)
1775 1776 cmn_err(CE_NOTE,
1776 1777 "!Pre-delete couldn't purge"" pagelock cache - continuing");
1777 1778 return (0);
1778 1779 }
1779 1780
1780 1781 /*ARGSUSED*/
1781 1782 static void
1782 1783 seg_p_mem_config_post_del(
1783 1784 void *arg,
1784 1785 pgcnt_t delta_pages,
1785 1786 int cancelled)
1786 1787 {
1787 1788 if (seg_phashsize_win == 0) {
1788 1789 return;
1789 1790 }
1790 1791 seg_p_enable();
1791 1792 }
1792 1793
1793 1794 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1794 1795 KPHYSM_SETUP_VECTOR_VERSION,
1795 1796 seg_p_mem_config_post_add,
1796 1797 seg_p_mem_config_pre_del,
1797 1798 seg_p_mem_config_post_del,
1798 1799 };
1799 1800
1800 1801 static void
1801 1802 seg_pinit_mem_config(void)
1802 1803 {
1803 1804 int ret;
1804 1805
1805 1806 ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1806 1807 /*
1807 1808 * Want to catch this in the debug kernel. At run time, if the
1808 1809 * callbacks don't get run all will be OK as the disable just makes
1809 1810 * it more likely that the pages can be collected.
1810 1811 */
1811 1812 ASSERT(ret == 0);
1812 1813 }
1813 1814
1814 1815 /*
1815 1816 * Verify that segment is not a shared anonymous segment which reserves
1816 1817 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
1817 1818 * from one zone to another if any segments are shared. This is because the
1818 1819 * last process to exit will credit the swap reservation. This could lead
1819 1820 * to the swap being reserved by one zone, and credited to another.
1820 1821 */
1821 1822 boolean_t
1822 1823 seg_can_change_zones(struct seg *seg)
1823 1824 {
1824 1825 struct segvn_data *svd;
1825 1826
1826 1827 if (seg->s_ops == &segspt_shmops)
1827 1828 return (B_FALSE);
1828 1829
1829 1830 if (seg->s_ops == &segvn_ops) {
1830 1831 svd = (struct segvn_data *)seg->s_data;
1831 1832 if (svd->type == MAP_SHARED &&
1832 1833 svd->amp != NULL &&
1833 1834 svd->amp->swresv > 0)
1834 1835 return (B_FALSE);
1835 1836 }
1836 1837 return (B_TRUE);
1837 1838 }
1838 1839
1839 1840 /*
1840 1841 * Return swap reserved by a segment backing a private mapping.
1841 1842 */
1842 1843 size_t
1843 1844 seg_swresv(struct seg *seg)
1844 1845 {
1845 1846 struct segvn_data *svd;
1846 1847 size_t swap = 0;
1847 1848
1848 1849 if (seg->s_ops == &segvn_ops) {
1849 1850 svd = (struct segvn_data *)seg->s_data;
1850 1851 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1851 1852 swap = svd->swresv;
1852 1853 }
1853 1854 return (swap);
1854 1855 }
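
A hedged sketch of how a caller might combine the two helpers above when moving a process's zone.max-swap accounting between zones; move_swap_accounting() and its AS_SEGFIRST()/AS_SEGNEXT() walk are illustrative, not taken from this change.

	/*
	 * Hedged sketch only: refuse the move if any segment is a
	 * swap-reserving shared anonymous segment, otherwise total the
	 * private swap so the caller can debit one zone and credit the other.
	 */
	static int
	move_swap_accounting(struct as *as, size_t *swresvp)
	{
		struct seg *seg;
		size_t swresv = 0;

		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		for (seg = AS_SEGFIRST(as); seg != NULL;
		    seg = AS_SEGNEXT(as, seg)) {
			if (!seg_can_change_zones(seg)) {
				AS_LOCK_EXIT(as, &as->a_lock);
				return (EINVAL);
			}
			swresv += seg_swresv(seg);
		}
		AS_LOCK_EXIT(as, &as->a_lock);

		*swresvp = swresv;	/* caller adjusts the zones' swap counts */
		return (0);
	}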
1855 1856
1856 1857 /*
1857 1858 * General not supported function for SEGOP_INHERIT
1858 1859 */
1859 1860 /* ARGSUSED */
1860 1861 int
1861 1862 seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1862 1863 {
1863 1864 return (ENOTSUP);
1865 +}
1866 +
1867 +/*
1868 + * segop wrappers
1869 + */
1870 +int
1871 +segop_dup(struct seg *seg, struct seg *new)
1872 +{
1873 + return (seg->s_ops->dup(seg, new));
1874 +}
1875 +
1876 +int
1877 +segop_unmap(struct seg *seg, caddr_t addr, size_t len)
1878 +{
1879 + return (seg->s_ops->unmap(seg, addr, len));
1880 +}
1881 +
1882 +void
1883 +segop_free(struct seg *seg)
1884 +{
1885 + seg->s_ops->free(seg);
1886 +}
1887 +
1888 +faultcode_t
1889 +segop_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
1890 + enum fault_type type, enum seg_rw rw)
1891 +{
1892 + return (seg->s_ops->fault(hat, seg, addr, len, type, rw));
1893 +}
1894 +
1895 +faultcode_t
1896 +segop_faulta(struct seg *seg, caddr_t addr)
1897 +{
1898 + return (seg->s_ops->faulta(seg, addr));
1899 +}
1900 +
1901 +int
1902 +segop_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1903 +{
1904 + return (seg->s_ops->setprot(seg, addr, len, prot));
1905 +}
1906 +
1907 +int
1908 +segop_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1909 +{
1910 + return (seg->s_ops->checkprot(seg, addr, len, prot));
1911 +}
1912 +
1913 +int
1914 +segop_kluster(struct seg *seg, caddr_t addr, ssize_t d)
1915 +{
1916 + return (seg->s_ops->kluster(seg, addr, d));
1917 +}
1918 +
1919 +size_t
1920 +segop_swapout(struct seg *seg)
1921 +{
1922 + return (seg->s_ops->swapout(seg));
1923 +}
1924 +
1925 +int
1926 +segop_sync(struct seg *seg, caddr_t addr, size_t len, int atr, uint_t f)
1927 +{
1928 + return (seg->s_ops->sync(seg, addr, len, atr, f));
1929 +}
1930 +
1931 +size_t
1932 +segop_incore(struct seg *seg, caddr_t addr, size_t len, char *v)
1933 +{
1934 + return (seg->s_ops->incore(seg, addr, len, v));
1935 +}
1936 +
1937 +int
1938 +segop_lockop(struct seg *seg, caddr_t addr, size_t len, int atr, int op,
1939 + ulong_t *b, size_t p)
1940 +{
1941 + return (seg->s_ops->lockop(seg, addr, len, atr, op, b, p));
1942 +}
1943 +
1944 +int
1945 +segop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *p)
1946 +{
1947 + return (seg->s_ops->getprot(seg, addr, len, p));
1948 +}
1949 +
1950 +u_offset_t
1951 +segop_getoffset(struct seg *seg, caddr_t addr)
1952 +{
1953 + return (seg->s_ops->getoffset(seg, addr));
1954 +}
1955 +
1956 +int
1957 +segop_gettype(struct seg *seg, caddr_t addr)
1958 +{
1959 + return (seg->s_ops->gettype(seg, addr));
1960 +}
1961 +
1962 +int
1963 +segop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
1964 +{
1965 + return (seg->s_ops->getvp(seg, addr, vpp));
1966 +}
1967 +
1968 +int
1969 +segop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t b)
1970 +{
1971 + return (seg->s_ops->advise(seg, addr, len, b));
1972 +}
1973 +
1974 +void
1975 +segop_dump(struct seg *seg)
1976 +{
1977 + seg->s_ops->dump(seg);
1978 +}
1979 +
1980 +int
1981 +segop_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***page,
1982 + enum lock_type type, enum seg_rw rw)
1983 +{
1984 + return (seg->s_ops->pagelock(seg, addr, len, page, type, rw));
1985 +}
1986 +
1987 +int
1988 +segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
1989 +{
1990 + return (seg->s_ops->setpagesize(seg, addr, len, szc));
1991 +}
1992 +
1993 +int
1994 +segop_getmemid(struct seg *seg, caddr_t addr, memid_t *mp)
1995 +{
1996 + return (seg->s_ops->getmemid(seg, addr, mp));
1997 +}
1998 +
1999 +struct lgrp_mem_policy_info *
2000 +segop_getpolicy(struct seg *seg, caddr_t addr)
2001 +{
2002 + if (seg->s_ops->getpolicy == NULL)
2003 + return (NULL);
2004 +
2005 + return (seg->s_ops->getpolicy(seg, addr));
2006 +}
2007 +
2008 +int
2009 +segop_capable(struct seg *seg, segcapability_t cap)
2010 +{
2011 + return (seg->s_ops->capable(seg, cap));
2012 +}
2013 +
2014 +int
2015 +segop_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t op)
2016 +{
2017 + if (seg->s_ops->inherit == NULL)
2018 + return (ENOTSUP);
2019 +
2020 + return (seg->s_ops->inherit(seg, addr, len, op));
1864 2021 }
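
A hedged illustration of what these wrappers change at call sites (example_protect() is a hypothetical caller, not part of this webrev): a SEGOP_* macro invocation becomes a call to the corresponding segop_* function with the same arguments and return value, giving one real function per operation that can be instrumented or extended.

	/*
	 * Hedged sketch only: converting a hypothetical call site from the
	 * SEGOP_SETPROT() macro to the segop_setprot() wrapper defined above.
	 */
	static int
	example_protect(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
	{
		/* was: return (SEGOP_SETPROT(seg, addr, len, prot)); */
		return (segop_setprot(seg, addr, len, prot));
	}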