1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright (c) 2015, Joyent, Inc.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 /*
  41  * VM - segment management.
  42  */
  43 
  44 #include <sys/types.h>
  45 #include <sys/inttypes.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/kmem.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/vmsystm.h>
  52 #include <sys/tuneable.h>
  53 #include <sys/debug.h>
  54 #include <sys/fs/swapnode.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/callb.h>
  57 #include <sys/mem_config.h>
  58 #include <sys/mman.h>
  59 
  60 #include <vm/hat.h>
  61 #include <vm/as.h>
  62 #include <vm/seg.h>
  63 #include <vm/seg_kmem.h>
  64 #include <vm/seg_spt.h>
  65 #include <vm/seg_vn.h>
  66 #include <vm/anon.h>
  67 
  68 /*
  69  * kstats for segment advise
  70  */
  71 segadvstat_t segadvstat = {
  72         { "MADV_FREE_hit",      KSTAT_DATA_ULONG },
  73         { "MADV_FREE_miss",     KSTAT_DATA_ULONG },
  74 };
  75 
  76 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
  77 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
  78 
  79 /*
  80  * entry in the segment page cache
  81  */
  82 struct seg_pcache {
  83         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
  84         struct seg_pcache       *p_hprev;
  85         pcache_link_t           p_plink;        /* per segment/amp list */
  86         void                    *p_htag0;       /* segment/amp pointer */
  87         caddr_t                 p_addr;         /* base address/anon_idx */
  88         size_t                  p_len;          /* total bytes */
  89         size_t                  p_wlen;         /* writable bytes at p_addr */
  90         struct page             **p_pp;         /* pp shadow list */
  91         seg_preclaim_cbfunc_t   p_callback;     /* reclaim callback function */
  92         clock_t                 p_lbolt;        /* lbolt from last use */
  93         struct seg_phash        *p_hashp;       /* our pcache hash bucket */
  94         uint_t                  p_active;       /* active count */
  95         uchar_t                 p_write;        /* true if S_WRITE */
  96         uchar_t                 p_ref;          /* reference byte */
  97         ushort_t                p_flags;        /* bit flags */
  98 };
  99 
 100 struct seg_phash {
 101         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
 102         struct seg_pcache       *p_hprev;
 103         kmutex_t                p_hmutex;       /* protects hash bucket */
 104         pcache_link_t           p_halink[2];    /* active bucket linkages */
 105 };
 106 
 107 struct seg_phash_wired {
 108         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
 109         struct seg_pcache       *p_hprev;
 110         kmutex_t                p_hmutex;       /* protects hash bucket */
 111 };
 112 
 113 /*
 114  * A parameter to control the maximum number of bytes that can be
 115  * purged from pcache at a time.
 116  */
 117 #define P_MAX_APURGE_BYTES      (1024 * 1024 * 1024)
 118 
 119 /*
 120  * log2(fraction of pcache to reclaim at a time).
 121  */
 122 #define P_SHRINK_SHFT           (5)
 123 
 124 /*
 125  * The following variables can be tuned via /etc/system.
 126  */
 127 
 128 int     segpcache_enabled = 1;          /* if 1, shadow lists are cached */
 129 ulong_t segpcache_hashsize_win = 0;     /* # of non wired buckets */
 130 ulong_t segpcache_hashsize_wired = 0;   /* # of wired buckets */
 131 int     segpcache_reap_sec = 1;         /* reap check rate in secs */
 132 clock_t segpcache_reap_ticks = 0;       /* reap interval in ticks */
 133 int     segpcache_pcp_maxage_sec = 1;   /* pcp max age in secs */
 134 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
 135 int     segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
 136 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
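
/*
 * As an illustrative example (not part of the original source), shadow list
 * caching could be disabled from /etc/system with:
 *
 *	set segpcache_enabled = 0
 *
 * Any of the other tunables listed above can be set the same way.
 */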
 137 
 138 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
 139 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
 140 static kcondvar_t seg_pasync_cv;
 141 
 142 #pragma align 64(pctrl1)
 143 #pragma align 64(pctrl2)
 144 #pragma align 64(pctrl3)
 145 
 146 /*
 147  * Keep frequently used variables together in one cache line.
 148  */
 149 static struct p_ctrl1 {
 150         uint_t p_disabled;              /* if not 0, caching temporarily off */
 151         size_t p_hashwin_sz;            /* # of non wired buckets */
 152         struct seg_phash *p_htabwin;    /* hash table for non wired entries */
 153         size_t p_hashwired_sz;          /* # of wired buckets */
 154         struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
 155         kmem_cache_t *p_kmcache;        /* kmem cache for seg_pcache structs */
 156 #ifdef _LP64
 157         ulong_t pad[2];
 158 #endif /* _LP64 */
 159 } pctrl1;
 160 
 161 static struct p_ctrl2 {
 162         kmutex_t p_mem_mtx;     /* protects window counter and p_halinks */
 163         pgcnt_t  p_locked_win;  /* # pages from window */
 164         pgcnt_t  p_locked;      /* # of pages cached by pagelock */
 165         uchar_t  p_ahcur;       /* current active links for insert/delete */
 166         uchar_t  p_athr_on;     /* async reclaim thread is running. */
 167         pcache_link_t p_ahhead[2]; /* active buckets linkages */
 168 } pctrl2;
 169 
 170 static struct p_ctrl3 {
 171         clock_t p_pcp_maxage;           /* max pcp age in ticks */
 172         ulong_t p_athr_empty_ahb;       /* athread walk stats */
 173         ulong_t p_athr_full_ahb;        /* athread walk stats */
 174         pgcnt_t p_maxapurge_npages;     /* max pages to purge at a time */
 175         int     p_shrink_shft;          /* reap shift factor */
 176 #ifdef _LP64
 177         ulong_t pad[3];
 178 #endif /* _LP64 */
 179 } pctrl3;
 180 
 181 #define seg_pdisabled                   pctrl1.p_disabled
 182 #define seg_phashsize_win               pctrl1.p_hashwin_sz
 183 #define seg_phashtab_win                pctrl1.p_htabwin
 184 #define seg_phashsize_wired             pctrl1.p_hashwired_sz
 185 #define seg_phashtab_wired              pctrl1.p_htabwired
 186 #define seg_pkmcache                    pctrl1.p_kmcache
 187 #define seg_pmem_mtx                    pctrl2.p_mem_mtx
 188 #define seg_plocked_window              pctrl2.p_locked_win
 189 #define seg_plocked                     pctrl2.p_locked
 190 #define seg_pahcur                      pctrl2.p_ahcur
 191 #define seg_pathr_on                    pctrl2.p_athr_on
 192 #define seg_pahhead                     pctrl2.p_ahhead
 193 #define seg_pmax_pcpage                 pctrl3.p_pcp_maxage
 194 #define seg_pathr_empty_ahb             pctrl3.p_athr_empty_ahb
 195 #define seg_pathr_full_ahb              pctrl3.p_athr_full_ahb
 196 #define seg_pshrink_shift               pctrl3.p_shrink_shft
 197 #define seg_pmaxapurge_npages           pctrl3.p_maxapurge_npages
 198 
 199 #define P_HASHWIN_MASK                  (seg_phashsize_win - 1)
 200 #define P_HASHWIRED_MASK                (seg_phashsize_wired - 1)
 201 #define P_BASESHIFT                     (6)
 202 
 203 kthread_t *seg_pasync_thr;
 204 
 205 extern const struct seg_ops segvn_ops;
 206 extern const struct seg_ops segspt_shmops;
 207 
 208 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
 209 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
 210 
 211 #define LBOLT_DELTA(t)  ((ulong_t)(ddi_get_lbolt() - (t)))
 212 
 213 #define PCP_AGE(pcp)    LBOLT_DELTA((pcp)->p_lbolt)
 214 
 215 /*
 216  * htag0 argument can be a seg or amp pointer.
 217  */
 218 #define P_HASHBP(seg, htag0, addr, flags)                               \
 219         (IS_PFLAGS_WIRED((flags)) ?                                     \
 220             ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
 221             ((uintptr_t)(htag0) >> P_BASESHIFT)]) :                       \
 222             (&seg_phashtab_win[P_HASHWIN_MASK &                         \
 223             (((uintptr_t)(htag0) >> 3) ^                          \
 224             ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?            \
 225             (flags >> 16) : page_get_shift((seg)->s_szc))))]))
 226 
 227 /*
 228  * htag0 argument can be a seg or amp pointer.
 229  */
 230 #define P_MATCH(pcp, htag0, addr, len)                                  \
 231         ((pcp)->p_htag0 == (htag0) &&                                        \
 232         (pcp)->p_addr == (addr) &&                                   \
 233         (pcp)->p_len >= (len))
 234 
 235 #define P_MATCH_PP(pcp, htag0, addr, len, pp)                           \
 236         ((pcp)->p_pp == (pp) &&                                              \
 237         (pcp)->p_htag0 == (htag0) &&                                 \
 238         (pcp)->p_addr == (addr) &&                                   \
 239         (pcp)->p_len >= (len))
 240 
 241 #define plink2pcache(pl)        ((struct seg_pcache *)((uintptr_t)(pl) - \
 242     offsetof(struct seg_pcache, p_plink)))
 243 
 244 #define hlink2phash(hl, l)      ((struct seg_phash *)((uintptr_t)(hl) - \
 245     offsetof(struct seg_phash, p_halink[l])))
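
/*
 * For example (illustrative only, not part of the original source), given a
 * pcache_link_t pointer taken off a per seg/amp list, the enclosing entry is
 * recovered with:
 *
 *	struct seg_pcache *pcp = plink2pcache(plinkp);
 */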
 246 
 247 /*
 248  * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
 249  * active hash bucket lists. We maintain active bucket lists to reduce the
 250  * overhead of finding active buckets during asynchronous purging since there
 251  * can be 10s of millions of buckets on a large system but only a small subset
 252  * of them in actual use.
 253  *
 254  * There are two active bucket lists. The current active list (as per
 255  * seg_pahcur) is used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and
 256  * delete buckets. The other list is used by the asynchronous purge thread.
 257  * This allows the purge thread to walk its active list without holding
 258  * seg_pmem_mtx for a long time. When the asynchronous thread is done with its
 259  * list it makes the list it just finished processing the new current active
 260  * list and switches over to the other one.
 261  *
 262  * seg_padd_abuck() only adds the bucket to the current list if the bucket is
 263  * not yet on any list.  seg_premove_abuck() may remove the bucket from either
 264  * list. If the bucket is on the current list it will always be removed.
 265  * Otherwise the bucket is only removed if the asynchronous purge thread is not
 266  * currently running or seg_premove_abuck() is called by the asynchronous purge
 267  * thread itself. A given bucket can only be on one of the active lists at a
 268  * time. These routines must be called with the per bucket lock held; they use
 269  * seg_pmem_mtx to protect the list updates. seg_padd_abuck() must be called
 270  * after the first entry is added to the bucket chain and seg_premove_abuck()
 271  * must be called after the last pcp entry is deleted from its chain. Holding
 272  * the per bucket lock across these operations avoids a potential race in which
 273  * pcp entries are added to a bucket after the caller has seen it empty but
 274  * before seg_premove_abuck() removes it from its list (such a race would lose
 275  * an active bucket from the active lists).
 276  *
 277  * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
 278  * New entries are added to the end of the list since LRU is used as the
 279  * purging policy.
 280  */
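/*
 * A minimal sketch of the intended calling pattern (illustrative only, not
 * part of the original source; pcp entry setup and error handling omitted):
 *
 *	mutex_enter(&hp->p_hmutex);
 *	<link the first pcp entry onto hp's chain>
 *	seg_padd_abuck(hp);		(bucket joins the current active list)
 *	...
 *	<unlink the last pcp entry from hp's chain>
 *	seg_premove_abuck(hp, 0);	(bucket leaves the active lists)
 *	mutex_exit(&hp->p_hmutex);
 */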
 281 static void
 282 seg_padd_abuck(struct seg_phash *hp)
 283 {
 284         int lix;
 285 
 286         ASSERT(MUTEX_HELD(&hp->p_hmutex));
 287         ASSERT((struct seg_phash *)hp->p_hnext != hp);
 288         ASSERT((struct seg_phash *)hp->p_hprev != hp);
 289         ASSERT(hp->p_hnext == hp->p_hprev);
 290         ASSERT(!IS_PCP_WIRED(hp->p_hnext));
 291         ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
 292         ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
 293         ASSERT(hp >= seg_phashtab_win &&
 294             hp < &seg_phashtab_win[seg_phashsize_win]);
 295 
 296         /*
 297          * This bucket may already be on one of the active lists
 298          * since an earlier seg_premove_abuck() may have failed to
 299          * remove it.
 300          */
 301         mutex_enter(&seg_pmem_mtx);
 302         lix = seg_pahcur;
 303         ASSERT(lix >= 0 && lix <= 1);
 304         if (hp->p_halink[lix].p_lnext != NULL) {
 305                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
 306                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
 307                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
 308                 mutex_exit(&seg_pmem_mtx);
 309                 return;
 310         }
 311         ASSERT(hp->p_halink[lix].p_lprev == NULL);
 312 
 313         /*
 314          * If this bucket is still on list !lix the async thread can't yet
 315          * remove it since we hold the per bucket lock. In this case just return
 316          * since the async thread will eventually find and process this bucket.
 317          */
 318         if (hp->p_halink[!lix].p_lnext != NULL) {
 319                 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
 320                 mutex_exit(&seg_pmem_mtx);
 321                 return;
 322         }
 323         ASSERT(hp->p_halink[!lix].p_lprev == NULL);
 324         /*
 325          * This bucket is not on any active bucket list yet.
 326          * Add the bucket to the tail of current active list.
 327          */
 328         hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
 329         hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
 330         seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
 331         seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
 332         mutex_exit(&seg_pmem_mtx);
 333 }
 334 
 335 static void
 336 seg_premove_abuck(struct seg_phash *hp, int athr)
 337 {
 338         int lix;
 339 
 340         ASSERT(MUTEX_HELD(&hp->p_hmutex));
 341         ASSERT((struct seg_phash *)hp->p_hnext == hp);
 342         ASSERT((struct seg_phash *)hp->p_hprev == hp);
 343         ASSERT(hp >= seg_phashtab_win &&
 344             hp < &seg_phashtab_win[seg_phashsize_win]);
 345 
 346         if (athr) {
 347                 ASSERT(seg_pathr_on);
 348                 ASSERT(seg_pahcur <= 1);
 349                 /*
 350                  * We are called by the asynchronous thread that found this
 351                  * bucket on the currently inactive (i.e. !seg_pahcur) list.
 352                  * Remove it from there.  The per bucket lock we are holding
 353                  * makes sure seg_pinsert() can't sneak in and add pcp entries
 354                  * to this bucket right before we remove it from its list.
 355                  */
 356                 lix = !seg_pahcur;
 357                 ASSERT(hp->p_halink[lix].p_lnext != NULL);
 358                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
 359                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
 360                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
 361                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
 362                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
 363                 hp->p_halink[lix].p_lnext = NULL;
 364                 hp->p_halink[lix].p_lprev = NULL;
 365                 return;
 366         }
 367 
 368         mutex_enter(&seg_pmem_mtx);
 369         lix = seg_pahcur;
 370         ASSERT(lix >= 0 && lix <= 1);
 371 
 372         /*
 373          * If the bucket is on the currently active list just remove it
 374          * from there.
 375          */
 376         if (hp->p_halink[lix].p_lnext != NULL) {
 377                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
 378                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
 379                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
 380                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
 381                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
 382                 hp->p_halink[lix].p_lnext = NULL;
 383                 hp->p_halink[lix].p_lprev = NULL;
 384                 mutex_exit(&seg_pmem_mtx);
 385                 return;
 386         }
 387         ASSERT(hp->p_halink[lix].p_lprev == NULL);
 388 
 389         /*
 390          * If the asynchronous thread is not running we can remove the bucket
 391          * from the currently inactive list. The bucket must be on this list
 392          * since we already checked that it's not on the other list, and the
 393          * bucket from which we just deleted the last pcp entry must still be
 394          * on one of the active bucket lists.
 395          */
 396         lix = !lix;
 397         ASSERT(hp->p_halink[lix].p_lnext != NULL);
 398         ASSERT(hp->p_halink[lix].p_lprev != NULL);
 399 
 400         if (!seg_pathr_on) {
 401                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
 402                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
 403                 hp->p_halink[lix].p_lnext = NULL;
 404                 hp->p_halink[lix].p_lprev = NULL;
 405         }
 406         mutex_exit(&seg_pmem_mtx);
 407 }
 408 
 409 /*
 410  * Check if the bucket pointed to by hp already has a pcp entry that matches
 411  * the request (htag0, addr and len). Set *found to 1 if a match is found and
 412  * to 0 otherwise. Also delete matching entries that cover a smaller address
 413  * range but start at the same address as the addr argument. Return the list of
 414  * deleted entries, if any. This is an internal helper function called from
 415  * seg_pinsert() only for non wired shadow lists. The caller already holds the
 416  * per seg/amp list lock.
 417  */
 418 static struct seg_pcache *
 419 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
 420     caddr_t addr, size_t len, int *found)
 421 {
 422         struct seg_pcache *pcp;
 423         struct seg_pcache *delcallb_list = NULL;
 424 
 425         ASSERT(MUTEX_HELD(&hp->p_hmutex));
 426 
 427         *found = 0;
 428         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
 429             pcp = pcp->p_hnext) {
 430                 ASSERT(pcp->p_hashp == hp);
 431                 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
 432                         ASSERT(!IS_PCP_WIRED(pcp));
 433                         if (pcp->p_len < len) {
 434                                 pcache_link_t *plinkp;
 435                                 if (pcp->p_active) {
 436                                         continue;
 437                                 }
 438                                 plinkp = &pcp->p_plink;
 439                                 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
 440                                 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
 441                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
 442                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
 443                                 pcp->p_hprev = delcallb_list;
 444                                 delcallb_list = pcp;
 445                         } else {
 446                                 *found = 1;
 447                                 break;
 448                         }
 449                 }
 450         }
 451         return (delcallb_list);
 452 }
 453 
 454 /*
 455  * Look up an address range in the pagelock cache. Return the shadow list and
 456  * bump up its active count. If amp is not NULL use amp as the lookup tag,
 457  * otherwise use seg as the lookup tag.
 458  */
 459 struct page **
 460 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
 461     enum seg_rw rw, uint_t flags)
 462 {
 463         struct seg_pcache *pcp;
 464         struct seg_phash *hp;
 465         void *htag0;
 466 
 467         ASSERT(seg != NULL);
 468         ASSERT(rw == S_READ || rw == S_WRITE);
 469 
 470         /*
 471          * Skip the pagelock cache while DR is in progress or
 472          * seg_pcache is off.
 473          */
 474         if (seg_pdisabled) {
 475                 return (NULL);
 476         }
 477         ASSERT(seg_phashsize_win != 0);
 478 
 479         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
 480         hp = P_HASHBP(seg, htag0, addr, flags);
 481         mutex_enter(&hp->p_hmutex);
 482         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
 483             pcp = pcp->p_hnext) {
 484                 ASSERT(pcp->p_hashp == hp);
 485                 if (P_MATCH(pcp, htag0, addr, len)) {
 486                         ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
 487                         /*
 488                          * If this request wants to write pages
 489                          * but write permissions starting from
 490                          * addr don't cover the entire length len,
 491                          * return lookup failure back to the caller.
 492                          * It will check protections and fail this
 493                          * pagelock operation with an EACCES error.
 494                          */
 495                         if (rw == S_WRITE && pcp->p_wlen < len) {
 496                                 break;
 497                         }
 498                         if (pcp->p_active == UINT_MAX) {
 499                                 break;
 500                         }
 501                         pcp->p_active++;
 502                         if (rw == S_WRITE && !pcp->p_write) {
 503                                 pcp->p_write = 1;
 504                         }
 505                         mutex_exit(&hp->p_hmutex);
 506                         return (pcp->p_pp);
 507                 }
 508         }
 509         mutex_exit(&hp->p_hmutex);
 510         return (NULL);
 511 }
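
/*
 * A minimal sketch of how a segment driver might use seg_plookup() on its
 * pagelock (L_PAGELOCK) fast path (illustrative only, not part of the
 * original source; segvn's real pagelock code does considerably more work):
 *
 *	struct page **pplist;
 *
 *	pplist = seg_plookup(seg, amp, addr, len, rw, 0);
 *	if (pplist != NULL) {
 *		*ppp = pplist;	(cache hit: seg_plookup() bumped p_active)
 *		return (0);
 *	}
 *	(cache miss: fall through to fault in and lock the pages)
 */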
 512 
 513 /*
 514  * Mark an address range inactive. If the cache is off, the address range is
 515  * not in the cache, or another shadow list that covers a bigger range is found,
 516  * we call the segment driver to reclaim the pages. Otherwise just decrement the
 517  * active count and set the ref bit.  If amp is not NULL use amp as the lookup
 518  * tag, otherwise use seg as the lookup tag.
 519  */
 520 void
 521 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
 522     size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
 523     seg_preclaim_cbfunc_t callback)
 524 {
 525         struct seg_pcache *pcp;
 526         struct seg_phash *hp;
 527         kmutex_t *pmtx = NULL;
 528         pcache_link_t *pheadp;
 529         void *htag0;
 530         pgcnt_t npages = 0;
 531         int keep = 0;
 532 
 533         ASSERT(seg != NULL);
 534         ASSERT(rw == S_READ || rw == S_WRITE);
 535 
 536         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
 537 
 538         /*
 539          * Skip lookup if pcache is not configured.
 540          */
 541         if (seg_phashsize_win == 0) {
 542                 goto out;
 543         }
 544 
 545         /*
 546          * Grab the per seg/amp lock before the hash lock if we are going to
 547          * remove an inactive entry from pcache.
 548          */
 549         if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
 550                 if (amp == NULL) {
 551                         pheadp = &seg->s_phead;
 552                         pmtx = &seg->s_pmtx;
 553                 } else {
 554                         pheadp = &amp->a_phead;
 555                         pmtx = &amp->a_pmtx;
 556                 }
 557                 mutex_enter(pmtx);
 558         }
 559 
 560         hp = P_HASHBP(seg, htag0, addr, flags);
 561         mutex_enter(&hp->p_hmutex);
 562 again:
 563         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
 564             pcp = pcp->p_hnext) {
 565                 ASSERT(pcp->p_hashp == hp);
 566                 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
 567                         ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
 568                         ASSERT(pcp->p_active);
 569                         if (keep) {
 570                                 /*
 571                                  * Don't remove this pcp entry
 572                                  * if we didn't find duplicate
 573                                  * shadow lists on the second
 574                                  * search. Somebody removed those
 575                                  * duplicates since we dropped the
 576                                  * hash lock after the first search.
 577                                  */
 578                                 ASSERT(pmtx != NULL);
 579                                 ASSERT(!IS_PFLAGS_WIRED(flags));
 580                                 mutex_exit(pmtx);
 581                                 pmtx = NULL;
 582                         }
 583                         pcp->p_active--;
 584                         if (pcp->p_active == 0 && (pmtx != NULL ||
 585                             (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
 586 
 587                                 /*
 588                                  * This entry is no longer active.  Remove it
 589                                  * now either because pcaching is temporarily
 590                                  * disabled or there are other pcp entries that
 591                                  * can match this pagelock request (i.e. this
 592                                  * entry is a duplicate).
 593                                  */
 594 
 595                                 ASSERT(callback == pcp->p_callback);
 596                                 if (pmtx != NULL) {
 597                                         pcache_link_t *plinkp = &pcp->p_plink;
 598                                         ASSERT(!IS_PCP_WIRED(pcp));
 599                                         ASSERT(pheadp->p_lnext != pheadp);
 600                                         ASSERT(pheadp->p_lprev != pheadp);
 601                                         plinkp->p_lprev->p_lnext =
 602                                             plinkp->p_lnext;
 603                                         plinkp->p_lnext->p_lprev =
 604                                             plinkp->p_lprev;
 605                                 }
 606                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
 607                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
 608                                 if (!IS_PCP_WIRED(pcp) &&
 609                                     hp->p_hnext == (struct seg_pcache *)hp) {
 610                                         /*
 611                                          * We removed the last entry from this
 612                                          * bucket.  Now remove the bucket from
 613                                          * its active list.
 614                                          */
 615                                         seg_premove_abuck(hp, 0);
 616                                 }
 617                                 mutex_exit(&hp->p_hmutex);
 618                                 if (pmtx != NULL) {
 619                                         mutex_exit(pmtx);
 620                                 }
 621                                 len = pcp->p_len;
 622                                 npages = btop(len);
 623                                 if (rw != S_WRITE && pcp->p_write) {
 624                                         rw = S_WRITE;
 625                                 }
 626                                 kmem_cache_free(seg_pkmcache, pcp);
 627                                 goto out;
 628                         } else {
 629                                 /*
 630                                  * We found a matching pcp entry but will not
 631                                  * free it right away even if it's no longer
 632                                  * active.
 633                                  */
 634                                 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
 635                                         /*
 636                                          * Set the reference bit and mark the
 637                                          * time of last access to this pcp
 638                                          * so that the asynchronous thread doesn't
 639                                          * free it immediately since
 640                                          * it may be reactivated very soon.
 641                                          */
 642                                         pcp->p_lbolt = ddi_get_lbolt();
 643                                         pcp->p_ref = 1;
 644                                 }
 645                                 mutex_exit(&hp->p_hmutex);
 646                                 if (pmtx != NULL) {
 647                                         mutex_exit(pmtx);
 648                                 }
 649                                 return;
 650                         }
 651                 } else if (!IS_PFLAGS_WIRED(flags) &&
 652                     P_MATCH(pcp, htag0, addr, len)) {
 653                         /*
 654                          * This is a duplicate pcp entry.  This situation may
 655                          * happen if a bigger shadow list that covers our
 656                          * range was added while our entry was still active.
 657                          * Now we can free our pcp entry if it becomes
 658                          * inactive.
 659                          */
 660                         if (!pcp->p_active) {
 661                                 /*
 662                                  * Mark this entry as referenced just in case
 663                                  * we'll free our own pcp entry soon.
 664                                  */
 665                                 pcp->p_lbolt = ddi_get_lbolt();
 666                                 pcp->p_ref = 1;
 667                         }
 668                         if (pmtx != NULL) {
 669                                 /*
 670                          * We are already holding pmtx and found a
 671                                  * duplicate.  Don't keep our own pcp entry.
 672                                  */
 673                                 keep = 0;
 674                                 continue;
 675                         }
 676                         /*
 677                          * We have to use mutex_tryenter to attempt to take
 678                          * the seg/amp list lock since we already hold the
 679                          * hash lock and the seg/amp list lock is above the
 680                          * hash lock in the lock order.  If mutex_tryenter
 681                          * fails, drop the hash lock, retake both locks in
 682                          * the correct order and re-search this hash chain.
 683                          */
 684                         ASSERT(keep == 0);
 685                         if (amp == NULL) {
 686                                 pheadp = &seg->s_phead;
 687                                 pmtx = &seg->s_pmtx;
 688                         } else {
 689                                 pheadp = &amp->a_phead;
 690                                 pmtx = &amp->a_pmtx;
 691                         }
 692                         if (!mutex_tryenter(pmtx)) {
 693                                 mutex_exit(&hp->p_hmutex);
 694                                 mutex_enter(pmtx);
 695                                 mutex_enter(&hp->p_hmutex);
 696                                 /*
 697                                  * If we don't find a bigger shadow list
 698                                  * on the second search (which may happen
 699                                  * since we dropped the bucket lock) keep
 700                                  * the entry that matches our own shadow list.
 701                                  */
 702                                 keep = 1;
 703                                 goto again;
 704                         }
 705                 }
 706         }
 707         mutex_exit(&hp->p_hmutex);
 708         if (pmtx != NULL) {
 709                 mutex_exit(pmtx);
 710         }
 711 out:
 712         (*callback)(htag0, addr, len, pp, rw, 0);
 713         if (npages) {
 714                 mutex_enter(&seg_pmem_mtx);
 715                 ASSERT(seg_plocked >= npages);
 716                 seg_plocked -= npages;
 717                 if (!IS_PFLAGS_WIRED(flags)) {
 718                         ASSERT(seg_plocked_window >= npages);
 719                         seg_plocked_window -= npages;
 720                 }
 721                 mutex_exit(&seg_pmem_mtx);
 722         }
 723 
 724 }
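
/*
 * A minimal sketch of the matching unlock (L_PAGEUNLOCK) side (illustrative
 * only, not part of the original source; my_reclaim_cb is a hypothetical
 * seg_preclaim_cbfunc_t that the driver passed to seg_pinsert() earlier):
 *
 *	seg_pinactive(seg, amp, addr, len, pplist, rw, 0, my_reclaim_cb);
 *
 * If the shadow list is still cached this merely drops the active count and
 * sets the reference byte; otherwise the callback is invoked to unlock and
 * release the pages.
 */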
 725 
 726 #ifdef DEBUG
 727 static uint32_t p_insert_chk_mtbf = 0;
 728 #endif
 729 
 730 /*
 731  * seg_pinsert_check() is used by segment drivers to predict whether a call
 732  * to seg_pinsert() will fail and thereby avoid wasteful pre-processing.
 733  */
 734 /*ARGSUSED*/
 735 int
 736 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
 737     size_t len, uint_t flags)
 738 {
 739         ASSERT(seg != NULL);
 740 
 741 #ifdef DEBUG
 742         if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
 743                 return (SEGP_FAIL);
 744         }
 745 #endif
 746 
 747         if (seg_pdisabled) {
 748                 return (SEGP_FAIL);
 749         }
 750         ASSERT(seg_phashsize_win != 0);
 751 
 752         if (IS_PFLAGS_WIRED(flags)) {
 753                 return (SEGP_SUCCESS);
 754         }
 755 
 756         if (freemem < desfree) {
 757                 return (SEGP_FAIL);
 758         }
 759 
 760         return (SEGP_SUCCESS);
 761 }
 762 
 763 #ifdef DEBUG
 764 static uint32_t p_insert_mtbf = 0;
 765 #endif
 766 
 767 /*
 768  * Insert an address range with its shadow list into the pagelock cache if
 769  * there is no shadow list already cached for this address range. If the cache
 770  * is off, caching is temporarily disabled, or the allowed 'window' is exceeded,
 771  * return SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 772  *
 773  * For non wired shadow lists (segvn case) include the address in the hashing
 774  * function to avoid linking all the entries from the same segment or amp onto
 775  * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
 776  * pcache entries are also linked on a per segment/amp list so that all
 777  * entries can be found quickly during seg/amp purge without walking the
 778  * entire pcache hash table.  For wired shadow lists (segspt case) we
 779  * don't use address hashing and per segment linking because the caller
 780  * currently inserts only one entry per segment that covers the entire
 781  * segment. If we used per segment linking even for segspt it would complicate
 782  * seg_ppurge_wiredpp() locking.
 783  *
 784  * Both the hash bucket and per seg/amp locks need to be held before adding a
 785  * non wired entry to the hash and per seg/amp lists. The per seg/amp lock
 786  * should be taken first.
 787  *
 788  * This function will also remove from pcache old inactive shadow lists that
 789  * overlap with this request but cover a smaller range for the same start
 790  * address.
 791  */
 792 int
 793 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
 794     size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
 795     seg_preclaim_cbfunc_t callback)
 796 {
 797         struct seg_pcache *pcp;
 798         struct seg_phash *hp;
 799         pgcnt_t npages;
 800         pcache_link_t *pheadp;
 801         kmutex_t *pmtx;
 802         struct seg_pcache *delcallb_list = NULL;
 803 
 804         ASSERT(seg != NULL);
 805         ASSERT(rw == S_READ || rw == S_WRITE);
 806         ASSERT(rw == S_READ || wlen == len);
 807         ASSERT(rw == S_WRITE || wlen <= len);
 808         ASSERT(amp == NULL || wlen == len);
 809 
 810 #ifdef DEBUG
 811         if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
 812                 return (SEGP_FAIL);
 813         }
 814 #endif
 815 
 816         if (seg_pdisabled) {
 817                 return (SEGP_FAIL);
 818         }
 819         ASSERT(seg_phashsize_win != 0);
 820 
 821         ASSERT((len & PAGEOFFSET) == 0);
 822         npages = btop(len);
 823         mutex_enter(&seg_pmem_mtx);
 824         if (!IS_PFLAGS_WIRED(flags)) {
 825                 seg_plocked_window += npages;
 826         }
 827         seg_plocked += npages;
 828         mutex_exit(&seg_pmem_mtx);
 829 
 830         pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
 831         /*
 832          * If amp is not NULL set htag0 to amp otherwise set it to seg.
 833          */
 834         if (amp == NULL) {
 835                 pcp->p_htag0 = (void *)seg;
 836                 pcp->p_flags = flags & 0xffff;
 837         } else {
 838                 pcp->p_htag0 = (void *)amp;
 839                 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
 840         }
 841         pcp->p_addr = addr;
 842         pcp->p_len = len;
 843         pcp->p_wlen = wlen;
 844         pcp->p_pp = pp;
 845         pcp->p_write = (rw == S_WRITE);
 846         pcp->p_callback = callback;
 847         pcp->p_active = 1;
 848 
 849         hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
 850         if (!IS_PFLAGS_WIRED(flags)) {
 851                 int found;
 852                 void *htag0;
 853                 if (amp == NULL) {
 854                         pheadp = &seg->s_phead;
 855                         pmtx = &seg->s_pmtx;
 856                         htag0 = (void *)seg;
 857                 } else {
 858                         pheadp = &amp->a_phead;
 859                         pmtx = &amp->a_pmtx;
 860                         htag0 = (void *)amp;
 861                 }
 862                 mutex_enter(pmtx);
 863                 mutex_enter(&hp->p_hmutex);
 864                 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
 865                     len, &found);
 866                 if (found) {
 867                         mutex_exit(&hp->p_hmutex);
 868                         mutex_exit(pmtx);
 869                         mutex_enter(&seg_pmem_mtx);
 870                         seg_plocked -= npages;
 871                         seg_plocked_window -= npages;
 872                         mutex_exit(&seg_pmem_mtx);
 873                         kmem_cache_free(seg_pkmcache, pcp);
 874                         goto out;
 875                 }
 876                 pcp->p_plink.p_lnext = pheadp->p_lnext;
 877                 pcp->p_plink.p_lprev = pheadp;
 878                 pheadp->p_lnext->p_lprev = &pcp->p_plink;
 879                 pheadp->p_lnext = &pcp->p_plink;
 880         } else {
 881                 mutex_enter(&hp->p_hmutex);
 882         }
 883         pcp->p_hashp = hp;
 884         pcp->p_hnext = hp->p_hnext;
 885         pcp->p_hprev = (struct seg_pcache *)hp;
 886         hp->p_hnext->p_hprev = pcp;
 887         hp->p_hnext = pcp;
 888         if (!IS_PFLAGS_WIRED(flags) &&
 889             hp->p_hprev == pcp) {
 890                 seg_padd_abuck(hp);
 891         }
 892         mutex_exit(&hp->p_hmutex);
 893         if (!IS_PFLAGS_WIRED(flags)) {
 894                 mutex_exit(pmtx);
 895         }
 896 
 897 out:
 898         npages = 0;
 899         while (delcallb_list != NULL) {
 900                 pcp = delcallb_list;
 901                 delcallb_list = pcp->p_hprev;
 902                 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
 903                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
 904                     pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
 905                 npages += btop(pcp->p_len);
 906                 kmem_cache_free(seg_pkmcache, pcp);
 907         }
 908         if (npages) {
 909                 ASSERT(!IS_PFLAGS_WIRED(flags));
 910                 mutex_enter(&seg_pmem_mtx);
 911                 ASSERT(seg_plocked >= npages);
 912                 ASSERT(seg_plocked_window >= npages);
 913                 seg_plocked -= npages;
 914                 seg_plocked_window -= npages;
 915                 mutex_exit(&seg_pmem_mtx);
 916         }
 917 
 918         return (SEGP_SUCCESS);
 919 }
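
/*
 * A minimal sketch of the cache-fill side after a lookup miss (illustrative
 * only, not part of the original source; my_reclaim_cb and the page locking
 * step are hypothetical placeholders):
 *
 *	if (seg_pinsert_check(seg, amp, addr, len, 0) == SEGP_FAIL)
 *		(skip building a shadow list, pcache won't take it)
 *
 *	<fault in and lock the pages, build the pplist shadow list>
 *
 *	if (seg_pinsert(seg, amp, addr, len, wlen, pplist, rw, 0,
 *	    my_reclaim_cb) == SEGP_FAIL)
 *		(not cached; the driver keeps tracking pplist itself)
 */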
 920 
 921 /*
 922  * Purge entries from the pagelock cache that are not active
 923  * and not recently used.
 924  */
 925 static void
 926 seg_ppurge_async(int force)
 927 {
 928         struct seg_pcache *delcallb_list = NULL;
 929         struct seg_pcache *pcp;
 930         struct seg_phash *hp;
 931         pgcnt_t npages = 0;
 932         pgcnt_t npages_window = 0;
 933         pgcnt_t npgs_to_purge;
 934         pgcnt_t npgs_purged = 0;
 935         int hlinks = 0;
 936         int hlix;
 937         pcache_link_t *hlinkp;
 938         pcache_link_t *hlnextp = NULL;
 939         int lowmem;
 940 
 941         ASSERT(seg_phashsize_win != 0);
 942 
 943         /*
 944          * if the cache is off or empty, return
 945          */
 946         if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
 947                 return;
 948         }
 949 
 950         if (!force) {
 951                 lowmem = 0;
 952                 if (freemem < lotsfree + needfree) {
 953                         spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
 954                         if (fmem <= 5 * (desfree >> 2)) {
 955                                 lowmem = 1;
 956                         } else if (fmem <= 7 * (lotsfree >> 3)) {
 957                                 if (seg_plocked_window >=
 958                                     (availrmem_initial >> 1)) {
 959                                         lowmem = 1;
 960                                 }
 961                         } else if (fmem < lotsfree) {
 962                                 if (seg_plocked_window >=
 963                                     3 * (availrmem_initial >> 2)) {
 964                                         lowmem = 1;
 965                                 }
 966                         }
 967                 }
 968                 if (!lowmem) {
 969                         return;
 970                 }
 971                 npgs_to_purge = seg_plocked_window >>
 972                     seg_pshrink_shift;
 973                 if (lowmem) {
 974                         npgs_to_purge = MIN(npgs_to_purge,
 975                             MAX(seg_pmaxapurge_npages, desfree));
 976                 } else {
 977                         npgs_to_purge = MIN(npgs_to_purge,
 978                             seg_pmaxapurge_npages);
 979                 }
 980                 if (npgs_to_purge == 0) {
 981                         return;
 982                 }
 983         } else {
 984                 struct seg_phash_wired *hpw;
 985 
 986                 ASSERT(seg_phashsize_wired != 0);
 987 
 988                 for (hpw = seg_phashtab_wired;
 989                     hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
 990 
 991                         if (hpw->p_hnext == (struct seg_pcache *)hpw) {
 992                                 continue;
 993                         }
 994 
 995                         mutex_enter(&hpw->p_hmutex);
 996 
 997                         for (pcp = hpw->p_hnext;
 998                             pcp != (struct seg_pcache *)hpw;
 999                             pcp = pcp->p_hnext) {
1000 
1001                                 ASSERT(IS_PCP_WIRED(pcp));
1002                                 ASSERT(pcp->p_hashp ==
1003                                     (struct seg_phash *)hpw);
1004 
1005                                 if (pcp->p_active) {
1006                                         continue;
1007                                 }
1008                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
1009                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
1010                                 pcp->p_hprev = delcallb_list;
1011                                 delcallb_list = pcp;
1012                         }
1013                         mutex_exit(&hpw->p_hmutex);
1014                 }
1015         }
1016 
1017         mutex_enter(&seg_pmem_mtx);
1018         if (seg_pathr_on) {
1019                 mutex_exit(&seg_pmem_mtx);
1020                 goto runcb;
1021         }
1022         seg_pathr_on = 1;
1023         mutex_exit(&seg_pmem_mtx);
1024         ASSERT(seg_pahcur <= 1);
1025         hlix = !seg_pahcur;
1026 
1027 again:
1028         for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1029             hlinkp = hlnextp) {
1030 
1031                 hlnextp = hlinkp->p_lnext;
1032                 ASSERT(hlnextp != NULL);
1033 
1034                 hp = hlink2phash(hlinkp, hlix);
1035                 if (hp->p_hnext == (struct seg_pcache *)hp) {
1036                         seg_pathr_empty_ahb++;
1037                         continue;
1038                 }
1039                 seg_pathr_full_ahb++;
1040                 mutex_enter(&hp->p_hmutex);
1041 
1042                 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1043                     pcp = pcp->p_hnext) {
1044                         pcache_link_t *pheadp;
1045                         pcache_link_t *plinkp;
1046                         void *htag0;
1047                         kmutex_t *pmtx;
1048 
1049                         ASSERT(!IS_PCP_WIRED(pcp));
1050                         ASSERT(pcp->p_hashp == hp);
1051 
1052                         if (pcp->p_active) {
1053                                 continue;
1054                         }
1055                         if (!force && pcp->p_ref &&
1056                             PCP_AGE(pcp) < seg_pmax_pcpage) {
1057                                 pcp->p_ref = 0;
1058                                 continue;
1059                         }
1060                         plinkp = &pcp->p_plink;
1061                         htag0 = pcp->p_htag0;
1062                         if (pcp->p_flags & SEGP_AMP) {
1063                                 pheadp = &((amp_t *)htag0)->a_phead;
1064                                 pmtx = &((amp_t *)htag0)->a_pmtx;
1065                         } else {
1066                                 pheadp = &((seg_t *)htag0)->s_phead;
1067                                 pmtx = &((seg_t *)htag0)->s_pmtx;
1068                         }
1069                         if (!mutex_tryenter(pmtx)) {
1070                                 continue;
1071                         }
1072                         ASSERT(pheadp->p_lnext != pheadp);
1073                         ASSERT(pheadp->p_lprev != pheadp);
1074                         plinkp->p_lprev->p_lnext =
1075                             plinkp->p_lnext;
1076                         plinkp->p_lnext->p_lprev =
1077                             plinkp->p_lprev;
1078                         pcp->p_hprev->p_hnext = pcp->p_hnext;
1079                         pcp->p_hnext->p_hprev = pcp->p_hprev;
1080                         mutex_exit(pmtx);
1081                         pcp->p_hprev = delcallb_list;
1082                         delcallb_list = pcp;
1083                         npgs_purged += btop(pcp->p_len);
1084                 }
1085                 if (hp->p_hnext == (struct seg_pcache *)hp) {
1086                         seg_premove_abuck(hp, 1);
1087                 }
1088                 mutex_exit(&hp->p_hmutex);
1089                 if (npgs_purged >= seg_plocked_window) {
1090                         break;
1091                 }
1092                 if (!force) {
1093                         if (npgs_purged >= npgs_to_purge) {
1094                                 break;
1095                         }
1096                         if (!(seg_pathr_full_ahb & 15)) {
1097                                 ASSERT(lowmem);
1098                                 if (freemem >= lotsfree + needfree) {
1099                                         break;
1100                                 }
1101                         }
1102                 }
1103         }
1104 
1105         if (hlinkp == &seg_pahhead[hlix]) {
1106                 /*
1107                  * We processed the entire hlix active bucket list
1108                  * but didn't find enough pages to reclaim.
1109                  * Switch the lists and walk the other list
1110                  * if we haven't done it yet.
1111                  */
1112                 mutex_enter(&seg_pmem_mtx);
1113                 ASSERT(seg_pathr_on);
1114                 ASSERT(seg_pahcur == !hlix);
1115                 seg_pahcur = hlix;
1116                 mutex_exit(&seg_pmem_mtx);
1117                 if (++hlinks < 2) {
1118                         hlix = !hlix;
1119                         goto again;
1120                 }
1121         } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1122             seg_pahhead[hlix].p_lnext != hlinkp) {
1123                 ASSERT(hlinkp != NULL);
1124                 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1125                 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1126                 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1127 
1128                 /*
1129                  * Reinsert the header to point to hlinkp
1130                  * so that we start from hlinkp bucket next time around.
1131                  */
1132                 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1133                 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1134                 seg_pahhead[hlix].p_lnext = hlinkp;
1135                 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1136                 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1137                 hlinkp->p_lprev = &seg_pahhead[hlix];
1138         }
1139 
1140         mutex_enter(&seg_pmem_mtx);
1141         ASSERT(seg_pathr_on);
1142         seg_pathr_on = 0;
1143         mutex_exit(&seg_pmem_mtx);
1144 
1145 runcb:
1146         /*
1147          * Run the delayed callback list. Segments/amps can't go away until the
1148          * callback is executed since they must have a non-zero softlockcnt. That's
1149          * why we don't need to hold as/seg/amp locks to execute the callback.
1150          */
1151         while (delcallb_list != NULL) {
1152                 pcp = delcallb_list;
1153                 delcallb_list = pcp->p_hprev;
1154                 ASSERT(!pcp->p_active);
1155                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1156                     pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1157                 npages += btop(pcp->p_len);
1158                 if (!IS_PCP_WIRED(pcp)) {
1159                         npages_window += btop(pcp->p_len);
1160                 }
1161                 kmem_cache_free(seg_pkmcache, pcp);
1162         }
1163         if (npages) {
1164                 mutex_enter(&seg_pmem_mtx);
1165                 ASSERT(seg_plocked >= npages);
1166                 ASSERT(seg_plocked_window >= npages_window);
1167                 seg_plocked -= npages;
1168                 seg_plocked_window -= npages_window;
1169                 mutex_exit(&seg_pmem_mtx);
1170         }
1171 }
1172 
1173 /*
1174  * Remove cached shadow list entries for segment(s) from the wired hashtable.
1175  * The segments are identified by the pp array. This is useful for multiple
1176  * segs cached on behalf of a dummy segment (ISM/DISM) with a common pp array.
1177  */
1178 void
1179 seg_ppurge_wiredpp(struct page **pp)
1180 {
1181         struct seg_pcache *pcp;
1182         struct seg_phash_wired *hp;
1183         pgcnt_t npages = 0;
1184         struct  seg_pcache *delcallb_list = NULL;
1185 
1186         /*
1187          * if the cache is empty, return
1188          */
1189         if (seg_plocked == 0) {
1190                 return;
1191         }
1192         ASSERT(seg_phashsize_wired != 0);
1193 
1194         for (hp = seg_phashtab_wired;
1195             hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1196                 if (hp->p_hnext == (struct seg_pcache *)hp) {
1197                         continue;
1198                 }
1199                 mutex_enter(&hp->p_hmutex);
1200                 pcp = hp->p_hnext;
1201                 while (pcp != (struct seg_pcache *)hp) {
1202                         ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1203                         ASSERT(IS_PCP_WIRED(pcp));
1204                         /*
1205                          * purge entries which are not active
1206                          */
1207                         if (!pcp->p_active && pcp->p_pp == pp) {
1208                                 ASSERT(pcp->p_htag0 != NULL);
1209                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
1210                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
1211                                 pcp->p_hprev = delcallb_list;
1212                                 delcallb_list = pcp;
1213                         }
1214                         pcp = pcp->p_hnext;
1215                 }
1216                 mutex_exit(&hp->p_hmutex);
1217                 /*
1218                  * Segments can't go away until the callback is executed since
1219                  * they must have a non-zero softlockcnt. That's why we don't
1220                  * need to hold as/seg locks to execute the callback.
1221                  */
1222                 while (delcallb_list != NULL) {
1223                         int done;
1224                         pcp = delcallb_list;
1225                         delcallb_list = pcp->p_hprev;
1226                         ASSERT(!pcp->p_active);
1227                         done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1228                             pcp->p_len, pcp->p_pp,
1229                             pcp->p_write ? S_WRITE : S_READ, 1);
1230                         npages += btop(pcp->p_len);
1231                         ASSERT(IS_PCP_WIRED(pcp));
1232                         kmem_cache_free(seg_pkmcache, pcp);
1233                         if (done) {
1234                                 ASSERT(delcallb_list == NULL);
1235                                 goto out;
1236                         }
1237                 }
1238         }
1239 
1240 out:
1241         mutex_enter(&seg_pmem_mtx);
1242         ASSERT(seg_plocked >= npages);
1243         seg_plocked -= npages;
1244         mutex_exit(&seg_pmem_mtx);
1245 }
1246 
1247 /*
1248  * Purge all entries for a given segment or amp. Since we
1249  * call back into the segment driver directly for page
1250  * reclaim, the caller needs to hold the right locks.
1251  */
1252 void
1253 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1254 {
1255         struct seg_pcache *delcallb_list = NULL;
1256         struct seg_pcache *pcp;
1257         struct seg_phash *hp;
1258         pgcnt_t npages = 0;
1259         void *htag0;
1260 
1261         if (seg_plocked == 0) {
1262                 return;
1263         }
1264         ASSERT(seg_phashsize_win != 0);
1265 
1266         /*
1267          * If amp is not NULL, use amp as the lookup tag; otherwise use
1268          * seg as the lookup tag.
1269          */
1270         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1271         ASSERT(htag0 != NULL);
1272         if (IS_PFLAGS_WIRED(flags)) {
1273                 hp = P_HASHBP(seg, htag0, 0, flags);
1274                 mutex_enter(&hp->p_hmutex);
1275                 pcp = hp->p_hnext;
1276                 while (pcp != (struct seg_pcache *)hp) {
1277                         ASSERT(pcp->p_hashp == hp);
1278                         ASSERT(IS_PCP_WIRED(pcp));
1279                         if (pcp->p_htag0 == htag0) {
1280                                 if (pcp->p_active) {
1281                                         break;
1282                                 }
1283                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
1284                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
1285                                 pcp->p_hprev = delcallb_list;
1286                                 delcallb_list = pcp;
1287                         }
1288                         pcp = pcp->p_hnext;
1289                 }
1290                 mutex_exit(&hp->p_hmutex);
1291         } else {
1292                 pcache_link_t *plinkp;
1293                 pcache_link_t *pheadp;
1294                 kmutex_t *pmtx;
1295 
1296                 if (amp == NULL) {
1297                         ASSERT(seg != NULL);
1298                         pheadp = &seg->s_phead;
1299                         pmtx = &seg->s_pmtx;
1300                 } else {
1301                         pheadp = &amp->a_phead;
1302                         pmtx = &amp->a_pmtx;
1303                 }
1304                 mutex_enter(pmtx);
1305                 while ((plinkp = pheadp->p_lnext) != pheadp) {
1306                         pcp = plink2pcache(plinkp);
1307                         ASSERT(!IS_PCP_WIRED(pcp));
1308                         ASSERT(pcp->p_htag0 == htag0);
1309                         hp = pcp->p_hashp;
1310                         mutex_enter(&hp->p_hmutex);
1311                         if (pcp->p_active) {
1312                                 mutex_exit(&hp->p_hmutex);
1313                                 break;
1314                         }
1315                         ASSERT(plinkp->p_lprev == pheadp);
1316                         pheadp->p_lnext = plinkp->p_lnext;
1317                         plinkp->p_lnext->p_lprev = pheadp;
1318                         pcp->p_hprev->p_hnext = pcp->p_hnext;
1319                         pcp->p_hnext->p_hprev = pcp->p_hprev;
1320                         pcp->p_hprev = delcallb_list;
1321                         delcallb_list = pcp;
1322                         if (hp->p_hnext == (struct seg_pcache *)hp) {
1323                                 seg_premove_abuck(hp, 0);
1324                         }
1325                         mutex_exit(&hp->p_hmutex);
1326                 }
1327                 mutex_exit(pmtx);
1328         }
1329         while (delcallb_list != NULL) {
1330                 pcp = delcallb_list;
1331                 delcallb_list = pcp->p_hprev;
1332                 ASSERT(!pcp->p_active);
1333                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1334                     pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1335                 npages += btop(pcp->p_len);
1336                 kmem_cache_free(seg_pkmcache, pcp);
1337         }
1338         mutex_enter(&seg_pmem_mtx);
1339         ASSERT(seg_plocked >= npages);
1340         seg_plocked -= npages;
1341         if (!IS_PFLAGS_WIRED(flags)) {
1342                 ASSERT(seg_plocked_window >= npages);
1343                 seg_plocked_window -= npages;
1344         }
1345         mutex_exit(&seg_pmem_mtx);
1346 }
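
     /*
      * An illustrative sketch (hypothetical driver "foo") of how a segment
      * driver might call seg_ppurge() from its unmap entry point, under the
      * convention that the caller already holds the address space lock:
      *
      *         static int
      *         foo_unmap(struct seg *seg, caddr_t addr, size_t len)
      *         {
      *                 ASSERT(seg->s_as != NULL &&
      *                     AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
      *
      *                 (drop any cached pagelock shadow lists for this
      *                 segment before its mappings are torn down)
      *                 seg_ppurge(seg, NULL, 0);
      *
      *                 (driver-specific teardown continues here)
      *         }
      */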
1347 
1348 static void seg_pinit_mem_config(void);
1349 
1350 /*
1351  * set up the pagelock cache
1352  */
1353 static void
1354 seg_pinit(void)
1355 {
1356         struct seg_phash *hp;
1357         ulong_t i;
1358         pgcnt_t physmegs;
1359 
1360         seg_plocked = 0;
1361         seg_plocked_window = 0;
1362 
1363         if (segpcache_enabled == 0) {
1364                 seg_phashsize_win = 0;
1365                 seg_phashsize_wired = 0;
1366                 seg_pdisabled = 1;
1367                 return;
1368         }
1369 
1370         seg_pdisabled = 0;
1371         seg_pkmcache = kmem_cache_create("seg_pcache",
1372             sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1373         if (segpcache_pcp_maxage_ticks <= 0) {
1374                 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1375         }
1376         seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1377         seg_pathr_empty_ahb = 0;
1378         seg_pathr_full_ahb = 0;
1379         seg_pshrink_shift = segpcache_shrink_shift;
1380         seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1381 
1382         mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1383         mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1384         mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1385         cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1386 
1387         physmegs = physmem >> (20 - PAGESHIFT);
1388 
1389         /*
1390          * If segpcache_hashsize_win was not set in /etc/system or has an
1391          * absurd value, set it to a default.
1392          */
1393         if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1394                 /*
1395                  * Create one bucket per 32K (or at least per 8 pages) of
1396                  * available memory.
1397                  */
1398                 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1399                 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1400         }
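             /*
              * Round the hash size to a power of two; the adjustment
              * below picks the nearest one.
              */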
1401         if (!ISP2(segpcache_hashsize_win)) {
1402                 ulong_t rndfac = ~(1UL <<
1403                     (highbit(segpcache_hashsize_win) - 1));
1404                 rndfac &= segpcache_hashsize_win;
1405                 segpcache_hashsize_win += rndfac;
1406                 segpcache_hashsize_win = 1 <<
1407                     (highbit(segpcache_hashsize_win) - 1);
1408         }
1409         seg_phashsize_win = segpcache_hashsize_win;
1410         seg_phashtab_win = kmem_zalloc(
1411             seg_phashsize_win * sizeof (struct seg_phash),
1412             KM_SLEEP);
1413         for (i = 0; i < seg_phashsize_win; i++) {
1414                 hp = &seg_phashtab_win[i];
1415                 hp->p_hnext = (struct seg_pcache *)hp;
1416                 hp->p_hprev = (struct seg_pcache *)hp;
1417                 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1418         }
1419 
1420         seg_pahcur = 0;
1421         seg_pathr_on = 0;
1422         seg_pahhead[0].p_lnext = &seg_pahhead[0];
1423         seg_pahhead[0].p_lprev = &seg_pahhead[0];
1424         seg_pahhead[1].p_lnext = &seg_pahhead[1];
1425         seg_pahhead[1].p_lprev = &seg_pahhead[1];
1426 
1427         /*
1428          * If segpcache_hashsize_wired was not set in /etc/system or has
1429          * an absurd value, set it to a default.
1430          */
1431         if (segpcache_hashsize_wired == 0 ||
1432             segpcache_hashsize_wired > physmem / 4) {
1433                 /*
1434                  * Choose segpcache_hashsize_wired based on physmem.
1435                  * Create one bucket per 128K bytes, up to 256K buckets.
1436                  */
1437                 if (physmegs < 20 * 1024) {
1438                         segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1439                 } else {
1440                         segpcache_hashsize_wired = 256 * 1024;
1441                 }
1442         }
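             /*
              * Round the hash size up to the next power of two.
              */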
1443         if (!ISP2(segpcache_hashsize_wired)) {
1444                 segpcache_hashsize_wired = 1 <<
1445                     highbit(segpcache_hashsize_wired);
1446         }
1447         seg_phashsize_wired = segpcache_hashsize_wired;
1448         seg_phashtab_wired = kmem_zalloc(
1449             seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1450         for (i = 0; i < seg_phashsize_wired; i++) {
1451                 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1452                 hp->p_hnext = (struct seg_pcache *)hp;
1453                 hp->p_hprev = (struct seg_pcache *)hp;
1454                 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1455         }
1456 
1457         seg_pinit_mem_config();
1458 }
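
     /*
      * The pagelock cache tunables consumed by seg_pinit() above can be
      * overridden from /etc/system.  A hypothetical fragment (values are
      * illustrative only; non-power-of-two hash sizes are rounded):
      *
      *         set segpcache_enabled = 1
      *         set segpcache_hashsize_win = 0x4000
      *         set segpcache_hashsize_wired = 0x8000
      */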
1459 
1460 /*
1461  * called by pageout if memory is low
1462  */
1463 void
1464 seg_preap(void)
1465 {
1466         /*
1467          * if the cache is off or empty, return
1468          */
1469         if (seg_plocked_window == 0) {
1470                 return;
1471         }
1472         ASSERT(seg_phashsize_win != 0);
1473 
1474         /*
1475          * If pcache is disabled (e.g. somebody is already purging it),
1476          * just return.
1477          */
1478         if (seg_pdisabled) {
1479                 return;
1480         }
1481 
1482         cv_signal(&seg_pasync_cv);
1483 }
1484 
1485 /*
1486  * run as a background thread and reclaim pagelock
1487  * pages that have not been used recently
1488  */
1489 void
1490 seg_pasync_thread(void)
1491 {
1492         callb_cpr_t cpr_info;
1493 
1494         if (seg_phashsize_win == 0) {
1495                 thread_exit();
1496                 /*NOTREACHED*/
1497         }
1498 
1499         seg_pasync_thr = curthread;
1500 
1501         CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1502             callb_generic_cpr, "seg_pasync");
1503 
1504         if (segpcache_reap_ticks <= 0) {
1505                 segpcache_reap_ticks = segpcache_reap_sec * hz;
1506         }
1507 
1508         mutex_enter(&seg_pasync_mtx);
1509         for (;;) {
1510                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1511                 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1512                     segpcache_reap_ticks, TR_CLOCK_TICK);
1513                 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1514                 if (seg_pdisabled == 0) {
1515                         seg_ppurge_async(0);
1516                 }
1517         }
1518 }
1519 
1520 static struct kmem_cache *seg_cache;
1521 
1522 /*
1523  * Initialize segment management data structures.
1524  */
1525 void
1526 seg_init(void)
1527 {
1528         kstat_t *ksp;
1529 
1530         seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1531             0, NULL, NULL, NULL, NULL, NULL, 0);
1532 
1533         ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1534             segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1535         if (ksp) {
1536                 ksp->ks_data = (void *)segadvstat_ptr;
1537                 kstat_install(ksp);
1538         }
1539 
1540         seg_pinit();
1541 }
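
     /*
      * The "segadvstat" kstat registered above can be inspected from
      * userland with, for example, "kstat -m unix -n segadvstat".
      */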
1542 
1543 /*
1544  * Allocate a segment to cover [base, base+size]
1545  * and attach it to the specified address space.
1546  */
1547 struct seg *
1548 seg_alloc(struct as *as, caddr_t base, size_t size)
1549 {
1550         struct seg *new;
1551         caddr_t segbase;
1552         size_t segsize;
1553 
1554         segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1555         segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1556             (uintptr_t)segbase;
1557 
1558         if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1559                 return ((struct seg *)NULL);    /* bad virtual addr range */
1560 
1561         if (as != &kas &&
1562             valid_usr_range(segbase, segsize, 0, as,
1563             as->a_userlimit) != RANGE_OKAY)
1564                 return ((struct seg *)NULL);    /* bad virtual addr range */
1565 
1566         new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1567         new->s_ops = NULL;
1568         new->s_data = NULL;
1569         new->s_szc = 0;
1570         new->s_flags = 0;
1571         mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1572         new->s_phead.p_lnext = &new->s_phead;
1573         new->s_phead.p_lprev = &new->s_phead;
1574         if (seg_attach(as, segbase, segsize, new) < 0) {
1575                 kmem_cache_free(seg_cache, new);
1576                 return ((struct seg *)NULL);
1577         }
1578         /* caller must fill in ops, data */
1579         return (new);
1580 }
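
     /*
      * A minimal usage sketch of seg_alloc() (hypothetical segment driver
      * "foo" with create routine foo_create(); the real callers are the
      * as_map()-style paths).  The address space is typically held as a
      * writer, and the driver's create routine fills in s_ops and s_data:
      *
      *         struct seg *seg;
      *         int error;
      *
      *         seg = seg_alloc(as, addr, len);
      *         if (seg == NULL)
      *                 return (ENOMEM);
      *         error = foo_create(seg, foo_args);
      *         if (error != 0)
      *                 seg_free(seg);
      *         return (error);
      */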
1581 
1582 /*
1583  * Attach a segment to the address space.  Used by seg_alloc()
1584  * and for kernel startup to attach to static segments.
1585  */
1586 int
1587 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1588 {
1589         seg->s_as = as;
1590         seg->s_base = base;
1591         seg->s_size = size;
1592 
1593         /*
1594          * as_addseg() will add the segment at the appropriate point
1595          * in the list. It will return -1 if there is overlap with
1596          * an already existing segment.
1597          */
1598         return (as_addseg(as, seg));
1599 }
1600 
1601 /*
1602  * Unmap a segment and free it from its associated address space.
1603  * This should be called by anybody who's finished with a whole segment's
1604  * mapping.  Just calls segop_unmap() on the whole mapping.  It is the
1605  * responsibility of the segment driver to unlink the segment
1606  * from the address space, and to free public and private data structures
1607  * associated with the segment.  (This is typically done by a call to
1608  * seg_free()).
1609  */
1610 void
1611 seg_unmap(struct seg *seg)
1612 {
1613 #ifdef DEBUG
1614         int ret;
1615 #endif /* DEBUG */
1616 
1617         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1618 
1619         /* Shouldn't have called seg_unmap if mapping isn't yet established */
1620         ASSERT(seg->s_data != NULL);
1621 
1622         /* Unmap the whole mapping */
1623 #ifdef DEBUG
1624         ret = segop_unmap(seg, seg->s_base, seg->s_size);
1625         ASSERT(ret == 0);
1626 #else
1627         segop_unmap(seg, seg->s_base, seg->s_size);
1628 #endif /* DEBUG */
1629 }
1630 
1631 /*
1632  * Free the segment from its associated as. This should only be called
1633  * if a mapping to the segment has not yet been established (e.g., if
1634  * an error occurs in the middle of doing an as_map when the segment
1635  * has already been partially set up) or if it has already been deleted
1636  * (e.g., from a segment driver unmap routine if the unmap applies to the
1637  * entire segment). If the mapping is currently set up then seg_unmap() should
1638  * be called instead.
1639  */
1640 void
1641 seg_free(struct seg *seg)
1642 {
1643         register struct as *as = seg->s_as;
1644         struct seg *tseg = as_removeseg(as, seg);
1645 
1646         ASSERT(tseg == seg);
1647 
1648         /*
1649          * If the segment private data field is NULL,
1650          * then the segment driver is not attached yet.
1651          */
1652         if (seg->s_data != NULL)
1653                 segop_free(seg);
1654 
1655         mutex_destroy(&seg->s_pmtx);
1656         ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1657         ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1658         kmem_cache_free(seg_cache, seg);
1659 }
1660 
1661 /*ARGSUSED*/
1662 static void
1663 seg_p_mem_config_post_add(
1664         void *arg,
1665         pgcnt_t delta_pages)
1666 {
1667         /* Nothing to do. */
1668 }
1669 
1670 void
1671 seg_p_enable(void)
1672 {
1673         mutex_enter(&seg_pcache_mtx);
1674         ASSERT(seg_pdisabled != 0);
1675         seg_pdisabled--;
1676         mutex_exit(&seg_pcache_mtx);
1677 }
1678 
1679 /*
1680  * seg_p_disable - disables seg_pcache, and then attempts to empty the
1681  * cache.
1682  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1683  * SEGP_FAIL if the cache could not be emptied.
1684  */
1685 int
1686 seg_p_disable(void)
1687 {
1688         pgcnt_t old_plocked;
1689         int stall_count = 0;
1690 
1691         mutex_enter(&seg_pcache_mtx);
1692         seg_pdisabled++;
1693         ASSERT(seg_pdisabled != 0);
1694         mutex_exit(&seg_pcache_mtx);
1695 
1696         /*
1697          * Attempt to empty the cache. Terminate if seg_plocked does not
1698          * diminish for SEGP_STALL_THRESHOLD consecutive attempts.
1699          */
1700         while (seg_plocked != 0) {
1701                 ASSERT(seg_phashsize_win != 0);
1702                 old_plocked = seg_plocked;
1703                 seg_ppurge_async(1);
1704                 if (seg_plocked == old_plocked) {
1705                         if (stall_count++ > SEGP_STALL_THRESHOLD) {
1706                                 return (SEGP_FAIL);
1707                         }
1708                 } else
1709                         stall_count = 0;
1710                 if (seg_plocked != 0)
1711                         delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1712         }
1713         return (SEGP_SUCCESS);
1714 }
1715 
1716 /*
1717  * Attempt to purge seg_pcache.  May need to return before this has
1718  * completed to allow other pre_del callbacks to unlock pages. This is
1719  * ok because:
1720  *      1) The seg_pdisabled flag has been set, so at least we won't
1721  *      cache any more locks, and the locks we couldn't purge
1722  *      will not be held if they do get released by a subsequent
1723  *      pre-delete callback.
1724  *
1725  *      2) The rest of the memory delete thread processing does not
1726  *      depend on the changes made in this pre-delete callback. No
1727  *      panics will result, the worst that will happen is that the
1728  *      DR code will timeout and cancel the delete.
1729  */
1730 /*ARGSUSED*/
1731 static int
1732 seg_p_mem_config_pre_del(
1733         void *arg,
1734         pgcnt_t delta_pages)
1735 {
1736         if (seg_phashsize_win == 0) {
1737                 return (0);
1738         }
1739         if (seg_p_disable() != SEGP_SUCCESS)
1740                 cmn_err(CE_NOTE,
1741                     "!Pre-delete couldn't purge pagelock cache - continuing");
1742         return (0);
1743 }
1744 
1745 /*ARGSUSED*/
1746 static void
1747 seg_p_mem_config_post_del(
1748         void *arg,
1749         pgcnt_t delta_pages,
1750         int cancelled)
1751 {
1752         if (seg_phashsize_win == 0) {
1753                 return;
1754         }
1755         seg_p_enable();
1756 }
1757 
1758 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1759         KPHYSM_SETUP_VECTOR_VERSION,
1760         seg_p_mem_config_post_add,
1761         seg_p_mem_config_pre_del,
1762         seg_p_mem_config_post_del,
1763 };
1764 
1765 static void
1766 seg_pinit_mem_config(void)
1767 {
1768         int ret;
1769 
1770         ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1771         /*
1772          * Want to catch this in the debug kernel. At run time, if the
1773          * callbacks don't get run all will be OK as the disable just makes
1774          * it more likely that the pages can be collected.
1775          */
1776         ASSERT(ret == 0);
1777 }
1778 
1779 /*
1780  * Verify that the segment is not a shared anonymous segment that reserves
1781  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
1782  * from one zone to another if any segments are shared.  This is because the
1783  * last process to exit will credit the swap reservation.  This could lead
1784  * to the swap being reserved by one zone, and credited to another.
1785  */
1786 boolean_t
1787 seg_can_change_zones(struct seg *seg)
1788 {
1789         struct segvn_data *svd;
1790 
1791         if (seg->s_ops == &segspt_shmops)
1792                 return (B_FALSE);
1793 
1794         if (seg->s_ops == &segvn_ops) {
1795                 svd = (struct segvn_data *)seg->s_data;
1796                 if (svd->type == MAP_SHARED &&
1797                     svd->amp != NULL &&
1798                     svd->amp->swresv > 0)
1799                         return (B_FALSE);
1800         }
1801         return (B_TRUE);
1802 }
1803 
1804 /*
1805  * Return swap reserved by a segment backing a private mapping.
1806  */
1807 size_t
1808 seg_swresv(struct seg *seg)
1809 {
1810         struct segvn_data *svd;
1811         size_t swap = 0;
1812 
1813         if (seg->s_ops == &segvn_ops) {
1814                 svd = (struct segvn_data *)seg->s_data;
1815                 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1816                         swap = svd->swresv;
1817         }
1818         return (swap);
1819 }
1820 
1821 /*
1822  * segop wrappers
1823  */
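
     /*
      * These wrappers dispatch through the segment driver's seg_ops vector.
      * Entries that every driver must supply are VERIFY()d to be non-NULL;
      * optional entries (dump, setpagesize, getmemid, getpolicy, capable and
      * inherit) may be left NULL, in which case the wrapper supplies a
      * default.  An illustrative (hypothetical) driver ops vector:
      *
      *         static struct seg_ops foo_ops = {
      *                 .dup            = foo_dup,
      *                 .unmap          = foo_unmap,
      *                 .free           = foo_free,
      *                 .fault          = foo_fault,
      *                 .faulta         = foo_faulta,
      *                 .setprot        = foo_setprot,
      *                 .checkprot      = foo_checkprot,
      *                 .kluster        = foo_kluster,
      *                 .sync           = foo_sync,
      *                 .incore         = foo_incore,
      *                 .lockop         = foo_lockop,
      *                 .getprot        = foo_getprot,
      *                 .getoffset      = foo_getoffset,
      *                 .gettype        = foo_gettype,
      *                 .getvp          = foo_getvp,
      *                 .advise         = foo_advise,
      *                 .pagelock       = foo_pagelock,
      *         };
      */
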
1824 int
1825 segop_dup(struct seg *seg, struct seg *new)
1826 {
1827         VERIFY3P(seg->s_ops->dup, !=, NULL);
1828 
1829         return (seg->s_ops->dup(seg, new));
1830 }
1831 
1832 int
1833 segop_unmap(struct seg *seg, caddr_t addr, size_t len)
1834 {
1835         VERIFY3P(seg->s_ops->unmap, !=, NULL);
1836 
1837         return (seg->s_ops->unmap(seg, addr, len));
1838 }
1839 
1840 void
1841 segop_free(struct seg *seg)
1842 {
1843         VERIFY3P(seg->s_ops->free, !=, NULL);
1844 
1845         seg->s_ops->free(seg);
1846 }
1847 
1848 faultcode_t
1849 segop_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
1850     enum fault_type type, enum seg_rw rw)
1851 {
1852         VERIFY3P(seg->s_ops->fault, !=, NULL);
1853 
1854         return (seg->s_ops->fault(hat, seg, addr, len, type, rw));
1855 }
1856 
1857 faultcode_t
1858 segop_faulta(struct seg *seg, caddr_t addr)
1859 {
1860         VERIFY3P(seg->s_ops->faulta, !=, NULL);
1861 
1862         return (seg->s_ops->faulta(seg, addr));
1863 }
1864 
1865 int
1866 segop_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1867 {
1868         VERIFY3P(seg->s_ops->setprot, !=, NULL);
1869 
1870         return (seg->s_ops->setprot(seg, addr, len, prot));
1871 }
1872 
1873 int
1874 segop_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1875 {
1876         VERIFY3P(seg->s_ops->checkprot, !=, NULL);
1877 
1878         return (seg->s_ops->checkprot(seg, addr, len, prot));
1879 }
1880 
1881 int
1882 segop_kluster(struct seg *seg, caddr_t addr, ssize_t d)
1883 {
1884         VERIFY3P(seg->s_ops->kluster, !=, NULL);
1885 
1886         return (seg->s_ops->kluster(seg, addr, d));
1887 }
1888 
1889 int
1890 segop_sync(struct seg *seg, caddr_t addr, size_t len, int atr, uint_t f)
1891 {
1892         VERIFY3P(seg->s_ops->sync, !=, NULL);
1893 
1894         return (seg->s_ops->sync(seg, addr, len, atr, f));
1895 }
1896 
1897 size_t
1898 segop_incore(struct seg *seg, caddr_t addr, size_t len, char *v)
1899 {
1900         VERIFY3P(seg->s_ops->incore, !=, NULL);
1901 
1902         return (seg->s_ops->incore(seg, addr, len, v));
1903 }
1904 
1905 int
1906 segop_lockop(struct seg *seg, caddr_t addr, size_t len, int atr, int op,
1907     ulong_t *b, size_t p)
1908 {
1909         VERIFY3P(seg->s_ops->lockop, !=, NULL);
1910 
1911         return (seg->s_ops->lockop(seg, addr, len, atr, op, b, p));
1912 }
1913 
1914 int
1915 segop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *p)
1916 {
1917         VERIFY3P(seg->s_ops->getprot, !=, NULL);
1918 
1919         return (seg->s_ops->getprot(seg, addr, len, p));
1920 }
1921 
1922 u_offset_t
1923 segop_getoffset(struct seg *seg, caddr_t addr)
1924 {
1925         VERIFY3P(seg->s_ops->getoffset, !=, NULL);
1926 
1927         return (seg->s_ops->getoffset(seg, addr));
1928 }
1929 
1930 int
1931 segop_gettype(struct seg *seg, caddr_t addr)
1932 {
1933         VERIFY3P(seg->s_ops->gettype, !=, NULL);
1934 
1935         return (seg->s_ops->gettype(seg, addr));
1936 }
1937 
1938 int
1939 segop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
1940 {
1941         VERIFY3P(seg->s_ops->getvp, !=, NULL);
1942 
1943         return (seg->s_ops->getvp(seg, addr, vpp));
1944 }
1945 
1946 int
1947 segop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t b)
1948 {
1949         VERIFY3P(seg->s_ops->advise, !=, NULL);
1950 
1951         return (seg->s_ops->advise(seg, addr, len, b));
1952 }
1953 
1954 void
1955 segop_dump(struct seg *seg)
1956 {
1957         if (seg->s_ops->dump == NULL)
1958                 return;
1959 
1960         seg->s_ops->dump(seg);
1961 }
1962 
1963 int
1964 segop_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***page,
1965     enum lock_type type, enum seg_rw rw)
1966 {
1967         VERIFY3P(seg->s_ops->pagelock, !=, NULL);
1968 
1969         return (seg->s_ops->pagelock(seg, addr, len, page, type, rw));
1970 }
1971 
1972 int
1973 segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
1974 {
1975         if (seg->s_ops->setpagesize == NULL)
1976                 return (ENOTSUP);
1977 
1978         return (seg->s_ops->setpagesize(seg, addr, len, szc));
1979 }
1980 
1981 int
1982 segop_getmemid(struct seg *seg, caddr_t addr, memid_t *mp)
1983 {
1984         if (seg->s_ops->getmemid == NULL)
1985                 return (ENODEV);
1986 
1987         return (seg->s_ops->getmemid(seg, addr, mp));
1988 }
1989 
1990 struct lgrp_mem_policy_info *
1991 segop_getpolicy(struct seg *seg, caddr_t addr)
1992 {
1993         if (seg->s_ops->getpolicy == NULL)
1994                 return (NULL);
1995 
1996         return (seg->s_ops->getpolicy(seg, addr));
1997 }
1998 
1999 int
2000 segop_capable(struct seg *seg, segcapability_t cap)
2001 {
2002         if (seg->s_ops->capable == NULL)
2003                 return (0);
2004 
2005         return (seg->s_ops->capable(seg, cap));
2006 }
2007 
2008 int
2009 segop_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t op)
2010 {
2011         if (seg->s_ops->inherit == NULL)
2012                 return (ENOTSUP);
2013 
2014         return (seg->s_ops->inherit(seg, addr, len, op));
2015 }