1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2015, Joyent, Inc. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 /*
  41  * VM - shared or copy-on-write from a vnode/anonymous memory.
  42  */
  43 
  44 #include <sys/types.h>
  45 #include <sys/param.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/errno.h>
  48 #include <sys/systm.h>
  49 #include <sys/mman.h>
  50 #include <sys/debug.h>
  51 #include <sys/cred.h>
  52 #include <sys/vmsystm.h>
  53 #include <sys/tuneable.h>
  54 #include <sys/bitmap.h>
  55 #include <sys/swap.h>
  56 #include <sys/kmem.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/vtrace.h>
  59 #include <sys/cmn_err.h>
  60 #include <sys/callb.h>
  61 #include <sys/vm.h>
  62 #include <sys/dumphdr.h>
  63 #include <sys/lgrp.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_vn.h>
  69 #include <vm/pvn.h>
  70 #include <vm/anon.h>
  71 #include <vm/page.h>
  72 #include <vm/vpage.h>
  73 #include <sys/proc.h>
  74 #include <sys/task.h>
  75 #include <sys/project.h>
  76 #include <sys/zone.h>
  77 #include <sys/shm_impl.h>
  78 
  79 /*
  80  * segvn_fault needs a temporary page list array.  To avoid calling kmem all
  81  * the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if
  82  * it can.  In the rare case when this page list is not large enough, it
   83  * allocates a sufficiently large array from kmem.
  84  *
  85  * This small page list array covers either 8 pages or 64kB worth of pages -
  86  * whichever is smaller.
  87  */
  88 #define PVN_MAX_GETPAGE_SZ      0x10000
  89 #define PVN_MAX_GETPAGE_NUM     0x8
  90 
  91 #if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
  92 #define PVN_GETPAGE_SZ  ptob(PVN_MAX_GETPAGE_NUM)
  93 #define PVN_GETPAGE_NUM PVN_MAX_GETPAGE_NUM
  94 #else
  95 #define PVN_GETPAGE_SZ  PVN_MAX_GETPAGE_SZ
  96 #define PVN_GETPAGE_NUM btop(PVN_MAX_GETPAGE_SZ)
  97 #endif
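/*
 * As a worked example of the macros above (illustrative page sizes only,
 * not a statement about any particular platform):
 *
 *      PAGESIZE = 4K:  8 * 4K = 32K < 64K, so PVN_GETPAGE_NUM = 8 and
 *                      PVN_GETPAGE_SZ = ptob(8) = 32K.
 *      PAGESIZE = 8K:  8 * 8K = 64K, so the #else branch is taken and
 *                      PVN_GETPAGE_SZ = 64K, PVN_GETPAGE_NUM = btop(64K) = 8.
 *
 * Either way the preallocated array covers min(8 pages, 64K), as described
 * in the comment above.
 */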
  98 
  99 /*
 100  * Private seg op routines.
 101  */
 102 static int      segvn_dup(struct seg *seg, struct seg *newseg);
 103 static int      segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
 104 static void     segvn_free(struct seg *seg);
 105 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
 106                     caddr_t addr, size_t len, enum fault_type type,
 107                     enum seg_rw rw);
 108 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
 109 static int      segvn_setprot(struct seg *seg, caddr_t addr,
 110                     size_t len, uint_t prot);
 111 static int      segvn_checkprot(struct seg *seg, caddr_t addr,
 112                     size_t len, uint_t prot);
 113 static int      segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
 114 static int      segvn_sync(struct seg *seg, caddr_t addr, size_t len,
 115                     int attr, uint_t flags);
 116 static size_t   segvn_incore(struct seg *seg, caddr_t addr, size_t len,
 117                     char *vec);
 118 static int      segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
 119                     int attr, int op, ulong_t *lockmap, size_t pos);
 120 static int      segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
 121                     uint_t *protv);
 122 static u_offset_t       segvn_getoffset(struct seg *seg, caddr_t addr);
 123 static int      segvn_gettype(struct seg *seg, caddr_t addr);
 124 static int      segvn_getvp(struct seg *seg, caddr_t addr,
 125                     struct vnode **vpp);
 126 static int      segvn_advise(struct seg *seg, caddr_t addr, size_t len,
 127                     uint_t behav);
 128 static void     segvn_dump(struct seg *seg);
 129 static int      segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
 130                     struct page ***ppp, enum lock_type type, enum seg_rw rw);
 131 static int      segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
 132                     uint_t szc);
 133 static int      segvn_getmemid(struct seg *seg, caddr_t addr,
 134                     memid_t *memidp);
 135 static lgrp_mem_policy_info_t   *segvn_getpolicy(struct seg *, caddr_t);
 136 static int      segvn_inherit(struct seg *, caddr_t, size_t, uint_t);
 137 
 138 struct  seg_ops segvn_ops = {
 139         .dup            = segvn_dup,
 140         .unmap          = segvn_unmap,
 141         .free           = segvn_free,
 142         .fault          = segvn_fault,
 143         .faulta         = segvn_faulta,
 144         .setprot        = segvn_setprot,
 145         .checkprot      = segvn_checkprot,
 146         .kluster        = segvn_kluster,
 147         .sync           = segvn_sync,
 148         .incore         = segvn_incore,
 149         .lockop         = segvn_lockop,
 150         .getprot        = segvn_getprot,
 151         .getoffset      = segvn_getoffset,
 152         .gettype        = segvn_gettype,
 153         .getvp          = segvn_getvp,
 154         .advise         = segvn_advise,
 155         .dump           = segvn_dump,
 156         .pagelock       = segvn_pagelock,
 157         .setpagesize    = segvn_setpagesize,
 158         .getmemid       = segvn_getmemid,
 159         .getpolicy      = segvn_getpolicy,
 160         .inherit        = segvn_inherit,
 161 };
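/*
 * segvn_create() below installs this ops vector on every segment it builds
 * (seg->s_ops = &segvn_ops), after which the generic seg layer dispatches
 * all per-segment operations for the segment through these entry points.
 */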
 162 
 163 /*
 164  * Common zfod structures, provided as a shorthand for others to use.
 165  */
 166 static segvn_crargs_t zfod_segvn_crargs =
 167         SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 168 static segvn_crargs_t kzfod_segvn_crargs =
 169         SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
 170         PROT_ALL & ~PROT_USER);
 171 static segvn_crargs_t stack_noexec_crargs =
 172         SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
 173 
 174 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;   /* user zfod argsp */
 175 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */
 176 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;     /* executable stack */
 177 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
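/*
 * A sketch of how these shorthands are typically consumed (illustrative;
 * the authoritative callers live in the as/mmap paths, not in this file):
 *
 *      error = as_map(as, addr, len, segvn_create, zfod_argsp);
 *
 * which creates a zero-fill-on-demand segvn segment with PROT_ZFOD current
 * protections and PROT_ALL maximum protections.
 */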
 178 
 179 #define vpgtob(n)       ((n) * sizeof (struct vpage))   /* For brevity */
 180 
 181 size_t  segvn_comb_thrshld = UINT_MAX;  /* patchable -- see 1196681 */
 182 
 183 size_t  segvn_pglock_comb_thrshld = (1UL << 16);  /* 64K */
 184 size_t  segvn_pglock_comb_balign = (1UL << 16);           /* 64K */
 185 uint_t  segvn_pglock_comb_bshift;
 186 size_t  segvn_pglock_comb_palign;
 187 
 188 static int      segvn_concat(struct seg *, struct seg *, int);
 189 static int      segvn_extend_prev(struct seg *, struct seg *,
 190                     struct segvn_crargs *, size_t);
 191 static int      segvn_extend_next(struct seg *, struct seg *,
 192                     struct segvn_crargs *, size_t);
 193 static void     segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
 194 static void     segvn_pagelist_rele(page_t **);
 195 static void     segvn_setvnode_mpss(vnode_t *);
 196 static void     segvn_relocate_pages(page_t **, page_t *);
 197 static int      segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
 198 static int      segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
 199     uint_t, page_t **, page_t **, uint_t *, int *);
 200 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
 201     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
 202 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
 203     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
 204 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
 205     u_offset_t, struct vpage *, page_t **, uint_t,
 206     enum fault_type, enum seg_rw, int);
 207 static void     segvn_vpage(struct seg *);
 208 static size_t   segvn_count_swap_by_vpages(struct seg *);
 209 
 210 static void segvn_purge(struct seg *seg);
 211 static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
 212     enum seg_rw, int);
 213 static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
 214     enum seg_rw, int);
 215 
 216 static int sameprot(struct seg *, caddr_t, size_t);
 217 
 218 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
 219 static int segvn_clrszc(struct seg *);
 220 static struct seg *segvn_split_seg(struct seg *, caddr_t);
 221 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
 222     ulong_t, uint_t);
 223 
 224 static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
 225     size_t, void *, u_offset_t);
 226 
 227 static struct kmem_cache *segvn_cache;
 228 static struct kmem_cache **segvn_szc_cache;
 229 
 230 #ifdef VM_STATS
 231 static struct segvnvmstats_str {
 232         ulong_t fill_vp_pages[31];
 233         ulong_t fltvnpages[49];
 234         ulong_t fullszcpages[10];
 235         ulong_t relocatepages[3];
 236         ulong_t fltanpages[17];
 237         ulong_t pagelock[2];
 238         ulong_t demoterange[3];
 239 } segvnvmstats;
 240 #endif /* VM_STATS */
 241 
 242 #define SDR_RANGE       1               /* demote entire range */
 243 #define SDR_END         2               /* demote non aligned ends only */
 244 
 245 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {          \
 246                 if ((len) != 0) {                                             \
 247                         lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);  \
 248                         ASSERT(lpgaddr >= (seg)->s_base);               \
 249                         lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +    \
 250                             (len)), pgsz);                                    \
 251                         ASSERT(lpgeaddr > lpgaddr);                        \
 252                         ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);    \
 253                 } else {                                                      \
 254                         lpgeaddr = lpgaddr = (addr);                          \
 255                 }                                                             \
 256         }
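/*
 * Illustrative expansion of CALC_LPG_REGION (hypothetical numbers, assuming
 * a 4M large page size and a pgsz-aligned segment base):
 *
 *      pgsz = 0x400000, addr = s_base + 0x123000, len = 0x5000
 *      lpgaddr  = P2ALIGN(addr, pgsz)          rounds addr down to 4M
 *      lpgeaddr = P2ROUNDUP(addr + len, pgsz)  rounds the end up to 4M
 *
 * so [lpgaddr, lpgeaddr) is the smallest pgsz-aligned range that fully
 * contains [addr, addr + len).
 */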
 257 
 258 /*ARGSUSED*/
 259 static int
 260 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
 261 {
 262         struct segvn_data *svd = buf;
 263 
 264         rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
 265         mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
 266         svd->svn_trnext = svd->svn_trprev = NULL;
 267         return (0);
 268 }
 269 
 270 /*ARGSUSED1*/
 271 static void
 272 segvn_cache_destructor(void *buf, void *cdrarg)
 273 {
 274         struct segvn_data *svd = buf;
 275 
 276         rw_destroy(&svd->lock);
 277         mutex_destroy(&svd->segfree_syncmtx);
 278 }
 279 
 280 /*ARGSUSED*/
 281 static int
 282 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
 283 {
 284         bzero(buf, sizeof (svntr_t));
 285         return (0);
 286 }
 287 
 288 /*
 289  * Patching this variable to non-zero allows the system to run with
 290  * stacks marked as "not executable".  It's a bit of a kludge, but is
 291  * provided as a tweakable for platforms that export those ABIs
 292  * (e.g. sparc V8) that have executable stacks enabled by default.
 293  * There are also some restrictions for platforms that don't actually
 294  * implement 'noexec' protections.
 295  *
 296  * Once enabled, the system is (therefore) unable to provide a fully
 297  * ABI-compliant execution environment, though practically speaking,
 298  * most everything works.  The exceptions are generally some interpreters
 299  * and debuggers that create executable code on the stack and jump
 300  * into it (without explicitly mprotecting the address range to include
 301  * PROT_EXEC).
 302  *
  303  * One important class of applications that this disables is those that
  304  * have been transformed into malicious agents using one of the numerous
  305  * "buffer overflow" attacks.  See 4007890.
 306  */
 307 int noexec_user_stack = 0;
 308 int noexec_user_stack_log = 1;
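/*
 * For example, this is conventionally enabled system-wide by patching the
 * variables via /etc/system (shown for illustration; see the platform's
 * tunables documentation for the authoritative interface):
 *
 *      set noexec_user_stack = 1
 *      set noexec_user_stack_log = 1
 */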
 309 
 310 int segvn_lpg_disable = 0;
 311 uint_t segvn_maxpgszc = 0;
 312 
 313 ulong_t segvn_vmpss_clrszc_cnt;
 314 ulong_t segvn_vmpss_clrszc_err;
 315 ulong_t segvn_fltvnpages_clrszc_cnt;
 316 ulong_t segvn_fltvnpages_clrszc_err;
 317 ulong_t segvn_setpgsz_align_err;
 318 ulong_t segvn_setpgsz_anon_align_err;
 319 ulong_t segvn_setpgsz_getattr_err;
 320 ulong_t segvn_setpgsz_eof_err;
 321 ulong_t segvn_faultvnmpss_align_err1;
 322 ulong_t segvn_faultvnmpss_align_err2;
 323 ulong_t segvn_faultvnmpss_align_err3;
 324 ulong_t segvn_faultvnmpss_align_err4;
 325 ulong_t segvn_faultvnmpss_align_err5;
 326 ulong_t segvn_vmpss_pageio_deadlk_err;
 327 
 328 int segvn_use_regions = 1;
 329 
 330 /*
  331  * Segvn supports a text replication optimization for NUMA platforms. Text
  332  * replicas are represented by anon maps (amp). There's one amp per text file
  333  * region per lgroup. A process chooses the amp for each of its text mappings
  334  * based on the lgroup assignment of its main thread (t_tid = 1). All
  335  * processes that want a replica on a particular lgroup for the same text file
  336  * mapping share the same amp. amp's are looked up in the svntr_hashtab hash
  337  * table with vp,off,size,szc used as the key. Text replication segments are
  338  * read-only MAP_PRIVATE|MAP_TEXT segments that map a vnode. Replication is
  339  * achieved by forcing COW faults from the vnode to the amp and mapping amp
  340  * pages instead of vnode pages. A replication amp is assigned to a segment
  341  * when it takes its first pagefault. To handle main thread lgroup rehoming,
  342  * segvn_trasync_thread periodically rechecks whether the process still maps
  343  * an amp local to the main thread. If not, the async thread forces the
  344  * process to remap to an amp in the new home lgroup of the main thread. The
  345  * current text replication implementation only benefits workloads that do
  346  * most of their work in the main thread of a process, or whose threads all
  347  * run in the same lgroup. Extending the benefit to other types of
  348  * multithreaded workloads would require further work in the hat layer to
  349  * allow the same virtual address in the same hat to simultaneously map
  350  * different physical addresses (i.e. page table replication would be needed
  351  * for x86).
  352  *
  353  * amp pages are used instead of vnode pages only as long as the segment has
  354  * a very simple life cycle: it's created via segvn_create(), handles S_EXEC
  355  * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
  356  * happens, such as a protection change, a real COW fault, a pagesize change,
  357  * an MC_LOCK request or a partial unmap, we turn off text replication by
  358  * converting the segment back to a vnode-only segment (unmap the segment's
  359  * address range and set svd->amp to NULL).
  360  *
  361  * The original file can change after an amp is inserted into
  362  * svntr_hashtab. Processes launched after the file has changed can't
  363  * use replicas created prior to the change. To implement this, hash
  364  * entries are timestamped: a replica can only be used if the current
  365  * file modification time matches the timestamp saved when the hash
  366  * entry was created. However, timestamps alone are not sufficient to
  367  * detect file modification via mmap(MAP_SHARED) mappings, so we handle
  368  * file changes via MAP_SHARED mappings differently. When writable
  369  * MAP_SHARED mappings are created to vnodes marked as executable, we
  370  * mark all existing replicas for this vnode as not usable for future
  371  * text mappings, and we don't create new replicas for files that
  372  * currently have potentially writable MAP_SHARED mappings
  373  * (i.e. vn_is_mapped(V_WRITE) is true).
 374  */
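/*
 * A rough sketch of the life cycle described above (pseudocode only; the
 * function names are the declarations further down in this file, and the
 * key layout is descriptive rather than a definition):
 *
 *      segvn_create(MAP_PRIVATE|MAP_TEXT, read-only vnode mapping)
 *          -> svd->tr_state = SEGVN_TR_INIT
 *      first S_READ/S_EXEC fault -> segvn_textrepl()
 *          look up or create the amp in svntr_hashtab keyed by
 *          (vp, off, size, szc) for the main thread's home lgroup;
 *          later faults are satisfied from that amp instead of the vnode
 *      protection change, real COW, pagesize change, MC_LOCK, partial unmap
 *          -> segvn_textunrepl(); the segment reverts to a plain vnode
 *             mapping and svd->amp is set back to NULL
 */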
 375 
 376 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR  (20)
 377 size_t  segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
 378 
 379 static ulong_t                  svntr_hashtab_sz = 512;
 380 static svntr_bucket_t           *svntr_hashtab = NULL;
 381 static struct kmem_cache        *svntr_cache;
 382 static svntr_stats_t            *segvn_textrepl_stats;
 383 static ksema_t                  segvn_trasync_sem;
 384 
 385 int                             segvn_disable_textrepl = 1;
 386 size_t                          textrepl_size_thresh = (size_t)-1;
 387 size_t                          segvn_textrepl_bytes = 0;
 388 size_t                          segvn_textrepl_max_bytes = 0;
 389 clock_t                         segvn_update_textrepl_interval = 0;
 390 int                             segvn_update_tr_time = 10;
 391 int                             segvn_disable_textrepl_update = 0;
 392 
 393 static void segvn_textrepl(struct seg *);
 394 static void segvn_textunrepl(struct seg *, int);
 395 static void segvn_inval_trcache(vnode_t *);
 396 static void segvn_trasync_thread(void);
 397 static void segvn_trupdate_wakeup(void *);
 398 static void segvn_trupdate(void);
 399 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
 400     ulong_t);
 401 
 402 /*
 403  * Initialize segvn data structures
 404  */
 405 void
 406 segvn_init(void)
 407 {
 408         uint_t maxszc;
 409         uint_t szc;
 410         size_t pgsz;
 411 
 412         segvn_cache = kmem_cache_create("segvn_cache",
 413             sizeof (struct segvn_data), 0,
 414             segvn_cache_constructor, segvn_cache_destructor, NULL,
 415             NULL, NULL, 0);
 416 
 417         if (segvn_lpg_disable == 0) {
 418                 szc = maxszc = page_num_pagesizes() - 1;
 419                 if (szc == 0) {
 420                         segvn_lpg_disable = 1;
 421                 }
 422                 if (page_get_pagesize(0) != PAGESIZE) {
 423                         panic("segvn_init: bad szc 0");
 424                         /*NOTREACHED*/
 425                 }
 426                 while (szc != 0) {
 427                         pgsz = page_get_pagesize(szc);
 428                         if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
 429                                 panic("segvn_init: bad szc %d", szc);
 430                                 /*NOTREACHED*/
 431                         }
 432                         szc--;
 433                 }
 434                 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
 435                         segvn_maxpgszc = maxszc;
 436         }
 437 
 438         if (segvn_maxpgszc) {
 439                 segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
 440                     (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
 441                     KM_SLEEP);
 442         }
 443 
 444         for (szc = 1; szc <= segvn_maxpgszc; szc++) {
 445                 char    str[32];
 446 
 447                 (void) sprintf(str, "segvn_szc_cache%d", szc);
 448                 segvn_szc_cache[szc] = kmem_cache_create(str,
 449                     page_get_pagecnt(szc) * sizeof (page_t *), 0,
 450                     NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
 451         }
 452 
 453 
 454         if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
 455                 segvn_use_regions = 0;
 456 
 457         /*
 458          * For now shared regions and text replication segvn support
  459          * are mutually exclusive. This is acceptable because a
  460          * significant benefit from text replication has so far only
  461          * been observed on AMD64 NUMA platforms (due to their
  462          * relatively small L2$ size), and we don't currently support
  463          * shared regions on x86.
 464          */
 465         if (segvn_use_regions && !segvn_disable_textrepl) {
 466                 segvn_disable_textrepl = 1;
 467         }
 468 
 469 #if defined(_LP64)
 470         if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
 471             !segvn_disable_textrepl) {
 472                 ulong_t i;
 473                 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);
 474 
 475                 svntr_cache = kmem_cache_create("svntr_cache",
 476                     sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
 477                     NULL, NULL, NULL, 0);
 478                 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
 479                 for (i = 0; i < svntr_hashtab_sz; i++) {
 480                         mutex_init(&svntr_hashtab[i].tr_lock,  NULL,
 481                             MUTEX_DEFAULT, NULL);
 482                 }
 483                 segvn_textrepl_max_bytes = ptob(physmem) /
 484                     segvn_textrepl_max_bytes_factor;
 485                 segvn_textrepl_stats = kmem_zalloc(NCPU *
 486                     sizeof (svntr_stats_t), KM_SLEEP);
 487                 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
 488                 (void) thread_create(NULL, 0, segvn_trasync_thread,
 489                     NULL, 0, &p0, TS_RUN, minclsyspri);
 490         }
 491 #endif
 492 
 493         if (!ISP2(segvn_pglock_comb_balign) ||
 494             segvn_pglock_comb_balign < PAGESIZE) {
 495                 segvn_pglock_comb_balign = 1UL << 16; /* 64K */
 496         }
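        /*
         * Derived values: for the default 64K balign (and assuming a 4K
         * PAGESIZE for the arithmetic), highbit(0x10000) - 1 = 16, so
         * combining happens on 64K boundaries, and btop(0x10000) = 16
         * pages of alignment for the page-aligned variant.
         */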
 497         segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
 498         segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
 499 }
 500 
 501 #define SEGVN_PAGEIO    ((void *)0x1)
 502 #define SEGVN_NOPAGEIO  ((void *)0x2)
 503 
 504 static void
 505 segvn_setvnode_mpss(vnode_t *vp)
 506 {
 507         int err;
 508 
 509         ASSERT(vp->v_mpssdata == NULL ||
 510             vp->v_mpssdata == SEGVN_PAGEIO ||
 511             vp->v_mpssdata == SEGVN_NOPAGEIO);
 512 
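        /*
         * Probe with a zero-length VOP_PAGEIO(): a filesystem that supports
         * pageio is expected to fail this degenerate request with EINVAL,
         * which is what the err check below keys on; anything else (including
         * the ENOSYS set when pageio isn't usable) marks the vnode NOPAGEIO.
         */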
 513         if (vp->v_mpssdata == NULL) {
 514                 if (vn_vmpss_usepageio(vp)) {
 515                         err = VOP_PAGEIO(vp, (page_t *)NULL,
 516                             (u_offset_t)0, 0, 0, CRED(), NULL);
 517                 } else {
 518                         err = ENOSYS;
 519                 }
 520                 /*
 521                  * set v_mpssdata just once per vnode life
 522                  * so that it never changes.
 523                  */
 524                 mutex_enter(&vp->v_lock);
 525                 if (vp->v_mpssdata == NULL) {
 526                         if (err == EINVAL) {
 527                                 vp->v_mpssdata = SEGVN_PAGEIO;
 528                         } else {
 529                                 vp->v_mpssdata = SEGVN_NOPAGEIO;
 530                         }
 531                 }
 532                 mutex_exit(&vp->v_lock);
 533         }
 534 }
 535 
 536 int
 537 segvn_create(struct seg *seg, void *argsp)
 538 {
 539         struct segvn_crargs *a = (struct segvn_crargs *)argsp;
 540         struct segvn_data *svd;
 541         size_t swresv = 0;
 542         struct cred *cred;
 543         struct anon_map *amp;
 544         int error = 0;
 545         size_t pgsz;
 546         lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
 547         int use_rgn = 0;
 548         int trok = 0;
 549 
 550         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
 551 
 552         if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
 553                 panic("segvn_create type");
 554                 /*NOTREACHED*/
 555         }
 556 
 557         /*
 558          * Check arguments.  If a shared anon structure is given then
 559          * it is illegal to also specify a vp.
 560          */
 561         if (a->amp != NULL && a->vp != NULL) {
 562                 panic("segvn_create anon_map");
 563                 /*NOTREACHED*/
 564         }
 565 
 566         if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
 567             a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
 568             segvn_use_regions) {
 569                 use_rgn = 1;
 570         }
 571 
 572         /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
 573         if (a->type == MAP_SHARED)
 574                 a->flags &= ~MAP_NORESERVE;
 575 
 576         if (a->szc != 0) {
 577                 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
 578                     (a->amp != NULL && a->type == MAP_PRIVATE) ||
 579                     (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
 580                         a->szc = 0;
 581                 } else {
 582                         if (a->szc > segvn_maxpgszc)
 583                                 a->szc = segvn_maxpgszc;
 584                         pgsz = page_get_pagesize(a->szc);
 585                         if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
 586                             !IS_P2ALIGNED(seg->s_size, pgsz)) {
 587                                 a->szc = 0;
 588                         } else if (a->vp != NULL) {
 589                                 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
 590                                         /*
 591                                          * paranoid check.
 592                                          * hat_page_demote() is not supported
 593                                          * on swapfs pages.
 594                                          */
 595                                         a->szc = 0;
 596                                 } else if (map_addr_vacalign_check(seg->s_base,
 597                                     a->offset & PAGEMASK)) {
 598                                         a->szc = 0;
 599                                 }
 600                         } else if (a->amp != NULL) {
 601                                 pgcnt_t anum = btopr(a->offset);
 602                                 pgcnt_t pgcnt = page_get_pagecnt(a->szc);
 603                                 if (!IS_P2ALIGNED(anum, pgcnt)) {
 604                                         a->szc = 0;
 605                                 }
 606                         }
 607                 }
 608         }
 609 
 610         /*
 611          * If segment may need private pages, reserve them now.
 612          */
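        /*
         * Two cases need swap reserved up front: a purely anonymous (zfod)
         * mapping, where every page will be private, and a writable
         * MAP_PRIVATE mapping (typically of a vnode or explicit anon_map),
         * where copy-on-write copies may be created later.
         */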
 613         if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
 614             (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
 615                 if (anon_resv_zone(seg->s_size,
 616                     seg->s_as->a_proc->p_zone) == 0)
 617                         return (EAGAIN);
 618                 swresv = seg->s_size;
 619                 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
 620                     seg, swresv, 1);
 621         }
 622 
 623         /*
 624          * Reserve any mapping structures that may be required.
 625          *
 626          * Don't do it for segments that may use regions. It's currently a
 627          * noop in the hat implementations anyway.
 628          */
 629         if (!use_rgn) {
 630                 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
 631         }
 632 
 633         if (a->cred) {
 634                 cred = a->cred;
 635                 crhold(cred);
 636         } else {
 637                 crhold(cred = CRED());
 638         }
 639 
 640         /* Inform the vnode of the new mapping */
 641         if (a->vp != NULL) {
 642                 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
 643                     seg->s_as, seg->s_base, seg->s_size, a->prot,
 644                     a->maxprot, a->type, cred, NULL);
 645                 if (error) {
 646                         if (swresv != 0) {
 647                                 anon_unresv_zone(swresv,
 648                                     seg->s_as->a_proc->p_zone);
 649                                 TRACE_3(TR_FAC_VM, TR_ANON_PROC,
 650                                     "anon proc:%p %lu %u", seg, swresv, 0);
 651                         }
 652                         crfree(cred);
 653                         if (!use_rgn) {
 654                                 hat_unload(seg->s_as->a_hat, seg->s_base,
 655                                     seg->s_size, HAT_UNLOAD_UNMAP);
 656                         }
 657                         return (error);
 658                 }
 659                 /*
 660                  * svntr_hashtab will be NULL if we support shared regions.
 661                  */
 662                 trok = ((a->flags & MAP_TEXT) &&
 663                     (seg->s_size > textrepl_size_thresh ||
 664                     (a->flags & _MAP_TEXTREPL)) &&
 665                     lgrp_optimizations() && svntr_hashtab != NULL &&
 666                     a->type == MAP_PRIVATE && swresv == 0 &&
 667                     !(a->flags & MAP_NORESERVE) &&
 668                     seg->s_as != &kas && a->vp->v_type == VREG);
 669 
 670                 ASSERT(!trok || !use_rgn);
 671         }
 672 
 673         /*
 674          * MAP_NORESERVE mappings don't count towards the VSZ of a process
 675          * until we fault the pages in.
 676          */
 677         if ((a->vp == NULL || a->vp->v_type != VREG) &&
 678             a->flags & MAP_NORESERVE) {
 679                 seg->s_as->a_resvsize -= seg->s_size;
 680         }
 681 
 682         /*
 683          * If more than one segment in the address space, and they're adjacent
 684          * virtually, try to concatenate them.  Don't concatenate if an
 685          * explicit anon_map structure was supplied (e.g., SystemV shared
 686          * memory) or if we'll use text replication for this segment.
 687          */
 688         if (a->amp == NULL && !use_rgn && !trok) {
 689                 struct seg *pseg, *nseg;
 690                 struct segvn_data *psvd, *nsvd;
 691                 lgrp_mem_policy_t ppolicy, npolicy;
 692                 uint_t  lgrp_mem_policy_flags = 0;
 693                 extern lgrp_mem_policy_t lgrp_mem_default_policy;
 694 
 695                 /*
  696                  * Memory policy flags (lgrp_mem_policy_flags) are valid when
 697                  * extending stack/heap segments.
 698                  */
 699                 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
 700                     !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
 701                         lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
 702                 } else {
 703                         /*
 704                          * Get policy when not extending it from another segment
 705                          */
 706                         mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
 707                 }
 708 
 709                 /*
 710                  * First, try to concatenate the previous and new segments
 711                  */
 712                 pseg = AS_SEGPREV(seg->s_as, seg);
 713                 if (pseg != NULL &&
 714                     pseg->s_base + pseg->s_size == seg->s_base &&
 715                     pseg->s_ops == &segvn_ops) {
 716                         /*
 717                          * Get memory allocation policy from previous segment.
 718                          * When extension is specified (e.g. for heap) apply
 719                          * this policy to the new segment regardless of the
 720                          * outcome of segment concatenation.  Extension occurs
  721                          * for a non-default policy; otherwise the default
  722                          * policy is used, based on the extended segment size.
 723                          */
 724                         psvd = (struct segvn_data *)pseg->s_data;
 725                         ppolicy = psvd->policy_info.mem_policy;
 726                         if (lgrp_mem_policy_flags ==
 727                             LGRP_MP_FLAG_EXTEND_UP) {
 728                                 if (ppolicy != lgrp_mem_default_policy) {
 729                                         mpolicy = ppolicy;
 730                                 } else {
 731                                         mpolicy = lgrp_mem_policy_default(
 732                                             pseg->s_size + seg->s_size,
 733                                             a->type);
 734                                 }
 735                         }
 736 
 737                         if (mpolicy == ppolicy &&
 738                             (pseg->s_size + seg->s_size <=
 739                             segvn_comb_thrshld || psvd->amp == NULL) &&
 740                             segvn_extend_prev(pseg, seg, a, swresv) == 0) {
 741                                 /*
 742                                  * success! now try to concatenate
 743                                  * with following seg
 744                                  */
 745                                 crfree(cred);
 746                                 nseg = AS_SEGNEXT(pseg->s_as, pseg);
 747                                 if (nseg != NULL &&
 748                                     nseg != pseg &&
 749                                     nseg->s_ops == &segvn_ops &&
 750                                     pseg->s_base + pseg->s_size ==
 751                                     nseg->s_base)
 752                                         (void) segvn_concat(pseg, nseg, 0);
 753                                 ASSERT(pseg->s_szc == 0 ||
 754                                     (a->szc == pseg->s_szc &&
 755                                     IS_P2ALIGNED(pseg->s_base, pgsz) &&
 756                                     IS_P2ALIGNED(pseg->s_size, pgsz)));
 757                                 return (0);
 758                         }
 759                 }
 760 
 761                 /*
 762                  * Failed, so try to concatenate with following seg
 763                  */
 764                 nseg = AS_SEGNEXT(seg->s_as, seg);
 765                 if (nseg != NULL &&
 766                     seg->s_base + seg->s_size == nseg->s_base &&
 767                     nseg->s_ops == &segvn_ops) {
 768                         /*
 769                          * Get memory allocation policy from next segment.
 770                          * When extension is specified (e.g. for stack) apply
 771                          * this policy to the new segment regardless of the
 772                          * outcome of segment concatenation.  Extension occurs
  773                          * for a non-default policy; otherwise the default
  774                          * policy is used, based on the extended segment size.
 775                          */
 776                         nsvd = (struct segvn_data *)nseg->s_data;
 777                         npolicy = nsvd->policy_info.mem_policy;
 778                         if (lgrp_mem_policy_flags ==
 779                             LGRP_MP_FLAG_EXTEND_DOWN) {
 780                                 if (npolicy != lgrp_mem_default_policy) {
 781                                         mpolicy = npolicy;
 782                                 } else {
 783                                         mpolicy = lgrp_mem_policy_default(
 784                                             nseg->s_size + seg->s_size,
 785                                             a->type);
 786                                 }
 787                         }
 788 
 789                         if (mpolicy == npolicy &&
 790                             segvn_extend_next(seg, nseg, a, swresv) == 0) {
 791                                 crfree(cred);
 792                                 ASSERT(nseg->s_szc == 0 ||
 793                                     (a->szc == nseg->s_szc &&
 794                                     IS_P2ALIGNED(nseg->s_base, pgsz) &&
 795                                     IS_P2ALIGNED(nseg->s_size, pgsz)));
 796                                 return (0);
 797                         }
 798                 }
 799         }
 800 
 801         if (a->vp != NULL) {
 802                 VN_HOLD(a->vp);
 803                 if (a->type == MAP_SHARED)
 804                         lgrp_shm_policy_init(NULL, a->vp);
 805         }
 806         svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
 807 
 808         seg->s_ops = &segvn_ops;
 809         seg->s_data = (void *)svd;
 810         seg->s_szc = a->szc;
 811 
 812         svd->seg = seg;
 813         svd->vp = a->vp;
 814         /*
 815          * Anonymous mappings have no backing file so the offset is meaningless.
 816          */
 817         svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
 818         svd->prot = a->prot;
 819         svd->maxprot = a->maxprot;
 820         svd->pageprot = 0;
 821         svd->type = a->type;
 822         svd->vpage = NULL;
 823         svd->cred = cred;
 824         svd->advice = MADV_NORMAL;
 825         svd->pageadvice = 0;
 826         svd->flags = (ushort_t)a->flags;
 827         svd->softlockcnt = 0;
 828         svd->softlockcnt_sbase = 0;
 829         svd->softlockcnt_send = 0;
 830         svd->svn_inz = 0;
 831         svd->rcookie = HAT_INVALID_REGION_COOKIE;
 832         svd->pageswap = 0;
 833 
 834         if (a->szc != 0 && a->vp != NULL) {
 835                 segvn_setvnode_mpss(a->vp);
 836         }
 837         if (svd->type == MAP_SHARED && svd->vp != NULL &&
 838             (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
 839                 ASSERT(vn_is_mapped(svd->vp, V_WRITE));
 840                 segvn_inval_trcache(svd->vp);
 841         }
 842 
 843         amp = a->amp;
 844         if ((svd->amp = amp) == NULL) {
 845                 svd->anon_index = 0;
 846                 if (svd->type == MAP_SHARED) {
 847                         svd->swresv = 0;
 848                         /*
 849                          * Shared mappings to a vp need no other setup.
 850                          * If we have a shared mapping to an anon_map object
 851                          * which hasn't been allocated yet,  allocate the
 852                          * struct now so that it will be properly shared
 853                          * by remembering the swap reservation there.
 854                          */
 855                         if (a->vp == NULL) {
 856                                 svd->amp = anonmap_alloc(seg->s_size, swresv,
 857                                     ANON_SLEEP);
 858                                 svd->amp->a_szc = seg->s_szc;
 859                         }
 860                 } else {
 861                         /*
 862                          * Private mapping (with or without a vp).
 863                          * Allocate anon_map when needed.
 864                          */
 865                         svd->swresv = swresv;
 866                 }
 867         } else {
 868                 pgcnt_t anon_num;
 869 
 870                 /*
 871                  * Mapping to an existing anon_map structure without a vp.
  872                  * For now we will ensure that the segment size isn't larger
 873                  * than the size - offset gives us.  Later on we may wish to
 874                  * have the anon array dynamically allocated itself so that
 875                  * we don't always have to allocate all the anon pointer slots.
 876                  * This of course involves adding extra code to check that we
 877                  * aren't trying to use an anon pointer slot beyond the end
 878                  * of the currently allocated anon array.
 879                  */
 880                 if ((amp->size - a->offset) < seg->s_size) {
 881                         panic("segvn_create anon_map size");
 882                         /*NOTREACHED*/
 883                 }
 884 
 885                 anon_num = btopr(a->offset);
 886 
 887                 if (a->type == MAP_SHARED) {
 888                         /*
 889                          * SHARED mapping to a given anon_map.
 890                          */
 891                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 892                         amp->refcnt++;
 893                         if (a->szc > amp->a_szc) {
 894                                 amp->a_szc = a->szc;
 895                         }
 896                         ANON_LOCK_EXIT(&amp->a_rwlock);
 897                         svd->anon_index = anon_num;
 898                         svd->swresv = 0;
 899                 } else {
 900                         /*
 901                          * PRIVATE mapping to a given anon_map.
 902                          * Make sure that all the needed anon
 903                          * structures are created (so that we will
 904                          * share the underlying pages if nothing
 905                          * is written by this mapping) and then
 906                          * duplicate the anon array as is done
 907                          * when a privately mapped segment is dup'ed.
 908                          */
 909                         struct anon *ap;
 910                         caddr_t addr;
 911                         caddr_t eaddr;
 912                         ulong_t anon_idx;
 913                         int hat_flag = HAT_LOAD;
 914 
 915                         if (svd->flags & MAP_TEXT) {
 916                                 hat_flag |= HAT_LOAD_TEXT;
 917                         }
 918 
 919                         svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
 920                         svd->amp->a_szc = seg->s_szc;
 921                         svd->anon_index = 0;
 922                         svd->swresv = swresv;
 923 
 924                         /*
 925                          * Prevent 2 threads from allocating anon
 926                          * slots simultaneously.
 927                          */
 928                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 929                         eaddr = seg->s_base + seg->s_size;
 930 
 931                         for (anon_idx = anon_num, addr = seg->s_base;
 932                             addr < eaddr; addr += PAGESIZE, anon_idx++) {
 933                                 page_t *pp;
 934 
 935                                 if ((ap = anon_get_ptr(amp->ahp,
 936                                     anon_idx)) != NULL)
 937                                         continue;
 938 
 939                                 /*
 940                                  * Allocate the anon struct now.
 941                                  * Might as well load up translation
 942                                  * to the page while we're at it...
 943                                  */
 944                                 pp = anon_zero(seg, addr, &ap, cred);
 945                                 if (ap == NULL || pp == NULL) {
 946                                         panic("segvn_create anon_zero");
 947                                         /*NOTREACHED*/
 948                                 }
 949 
 950                                 /*
 951                                  * Re-acquire the anon_map lock and
 952                                  * initialize the anon array entry.
 953                                  */
 954                                 ASSERT(anon_get_ptr(amp->ahp,
 955                                     anon_idx) == NULL);
 956                                 (void) anon_set_ptr(amp->ahp, anon_idx, ap,
 957                                     ANON_SLEEP);
 958 
 959                                 ASSERT(seg->s_szc == 0);
 960                                 ASSERT(!IS_VMODSORT(pp->p_vnode));
 961 
 962                                 ASSERT(use_rgn == 0);
 963                                 hat_memload(seg->s_as->a_hat, addr, pp,
 964                                     svd->prot & ~PROT_WRITE, hat_flag);
 965 
 966                                 page_unlock(pp);
 967                         }
 968                         ASSERT(seg->s_szc == 0);
 969                         anon_dup(amp->ahp, anon_num, svd->amp->ahp,
 970                             0, seg->s_size);
 971                         ANON_LOCK_EXIT(&amp->a_rwlock);
 972                 }
 973         }
 974 
 975         /*
 976          * Set default memory allocation policy for segment
 977          *
 978          * Always set policy for private memory at least for initialization
 979          * even if this is a shared memory segment
 980          */
 981         (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
 982 
 983         if (svd->type == MAP_SHARED)
 984                 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
 985                     svd->vp, svd->offset, seg->s_size);
 986 
 987         if (use_rgn) {
 988                 ASSERT(!trok);
 989                 ASSERT(svd->amp == NULL);
 990                 svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
 991                     seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
 992                     (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
 993                     HAT_REGION_TEXT);
 994         }
 995 
 996         ASSERT(!trok || !(svd->prot & PROT_WRITE));
 997         svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;
 998 
 999         return (0);
1000 }
1001 
1002 /*
1003  * Concatenate two existing segments, if possible.
1004  * Return 0 on success, -1 if two segments are not compatible
1005  * or -2 on memory allocation failure.
1006  * If amp_cat == 1 then try and concat segments with anon maps
1007  */
1008 static int
1009 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
1010 {
1011         struct segvn_data *svd1 = seg1->s_data;
1012         struct segvn_data *svd2 = seg2->s_data;
1013         struct anon_map *amp1 = svd1->amp;
1014         struct anon_map *amp2 = svd2->amp;
1015         struct vpage *vpage1 = svd1->vpage;
1016         struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
1017         size_t size, nvpsize;
1018         pgcnt_t npages1, npages2;
1019 
1020         ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
1021         ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
1022         ASSERT(seg1->s_ops == seg2->s_ops);
1023 
1024         if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
1025             HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
1026                 return (-1);
1027         }
1028 
1029         /* both segments exist, try to merge them */
1030 #define incompat(x)     (svd1->x != svd2->x)
1031         if (incompat(vp) || incompat(maxprot) ||
1032             (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
1033             (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
1034             incompat(type) || incompat(cred) || incompat(flags) ||
1035             seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
1036             (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
1037                 return (-1);
1038 #undef incompat
1039 
1040         /*
1041          * vp == NULL implies zfod, offset doesn't matter
1042          */
1043         if (svd1->vp != NULL &&
1044             svd1->offset + seg1->s_size != svd2->offset) {
1045                 return (-1);
1046         }
1047 
1048         /*
1049          * Don't concatenate if either segment uses text replication.
1050          */
1051         if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
1052                 return (-1);
1053         }
1054 
1055         /*
1056          * Fail early if we're not supposed to concatenate
1057          * segments with non NULL amp.
1058          */
1059         if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
1060                 return (-1);
1061         }
1062 
1063         if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
1064                 if (amp1 != amp2) {
1065                         return (-1);
1066                 }
1067                 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
1068                     svd2->anon_index) {
1069                         return (-1);
1070                 }
1071                 ASSERT(amp1 == NULL || amp1->refcnt >= 2);
1072         }
1073 
1074         /*
1075          * If either seg has vpages, create a new merged vpage array.
1076          */
1077         if (vpage1 != NULL || vpage2 != NULL) {
1078                 struct vpage *vp, *evp;
1079 
1080                 npages1 = seg_pages(seg1);
1081                 npages2 = seg_pages(seg2);
1082                 nvpsize = vpgtob(npages1 + npages2);
1083 
1084                 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
1085                         return (-2);
1086                 }
1087 
1088                 if (vpage1 != NULL) {
1089                         bcopy(vpage1, nvpage, vpgtob(npages1));
1090                 } else {
1091                         evp = nvpage + npages1;
1092                         for (vp = nvpage; vp < evp; vp++) {
1093                                 VPP_SETPROT(vp, svd1->prot);
1094                                 VPP_SETADVICE(vp, svd1->advice);
1095                         }
1096                 }
1097 
1098                 if (vpage2 != NULL) {
1099                         bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
1100                 } else {
1101                         evp = nvpage + npages1 + npages2;
1102                         for (vp = nvpage + npages1; vp < evp; vp++) {
1103                                 VPP_SETPROT(vp, svd2->prot);
1104                                 VPP_SETADVICE(vp, svd2->advice);
1105                         }
1106                 }
1107 
1108                 if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
1109                         ASSERT(svd1->swresv == seg1->s_size);
1110                         ASSERT(!(svd1->flags & MAP_NORESERVE));
1111                         ASSERT(!(svd2->flags & MAP_NORESERVE));
1112                         evp = nvpage + npages1;
1113                         for (vp = nvpage; vp < evp; vp++) {
1114                                 VPP_SETSWAPRES(vp);
1115                         }
1116                 }
1117 
1118                 if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
1119                         ASSERT(svd2->swresv == seg2->s_size);
1120                         ASSERT(!(svd1->flags & MAP_NORESERVE));
1121                         ASSERT(!(svd2->flags & MAP_NORESERVE));
1122                         vp = nvpage + npages1;
1123                         evp = vp + npages2;
1124                         for (; vp < evp; vp++) {
1125                                 VPP_SETSWAPRES(vp);
1126                         }
1127                 }
1128         }
1129         ASSERT((vpage1 != NULL || vpage2 != NULL) ||
1130             (svd1->pageswap == 0 && svd2->pageswap == 0));
1131 
1132         /*
1133          * If either segment has private pages, create a new merged anon
 1134  * array. If merging shared anon segments, just decrement the anon map's
1135          * refcnt.
1136          */
1137         if (amp1 != NULL && svd1->type == MAP_SHARED) {
1138                 ASSERT(amp1 == amp2 && svd1->vp == NULL);
1139                 ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1140                 ASSERT(amp1->refcnt >= 2);
1141                 amp1->refcnt--;
1142                 ANON_LOCK_EXIT(&amp1->a_rwlock);
1143                 svd2->amp = NULL;
1144         } else if (amp1 != NULL || amp2 != NULL) {
1145                 struct anon_hdr *nahp;
1146                 struct anon_map *namp = NULL;
1147                 size_t asize;
1148 
1149                 ASSERT(svd1->type == MAP_PRIVATE);
1150 
1151                 asize = seg1->s_size + seg2->s_size;
1152                 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
1153                         if (nvpage != NULL) {
1154                                 kmem_free(nvpage, nvpsize);
1155                         }
1156                         return (-2);
1157                 }
1158                 if (amp1 != NULL) {
1159                         /*
1160                          * XXX anon rwlock is not really needed because
1161                          * this is a private segment and we are writers.
1162                          */
1163                         ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1164                         ASSERT(amp1->refcnt == 1);
1165                         if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
1166                             nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
1167                                 anon_release(nahp, btop(asize));
1168                                 ANON_LOCK_EXIT(&amp1->a_rwlock);
1169                                 if (nvpage != NULL) {
1170                                         kmem_free(nvpage, nvpsize);
1171                                 }
1172                                 return (-2);
1173                         }
1174                 }
1175                 if (amp2 != NULL) {
1176                         ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
1177                         ASSERT(amp2->refcnt == 1);
1178                         if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
1179                             nahp, btop(seg1->s_size), btop(seg2->s_size),
1180                             ANON_NOSLEEP)) {
1181                                 anon_release(nahp, btop(asize));
1182                                 ANON_LOCK_EXIT(&amp2->a_rwlock);
1183                                 if (amp1 != NULL) {
1184                                         ANON_LOCK_EXIT(&amp1->a_rwlock);
1185                                 }
1186                                 if (nvpage != NULL) {
1187                                         kmem_free(nvpage, nvpsize);
1188                                 }
1189                                 return (-2);
1190                         }
1191                 }
1192                 if (amp1 != NULL) {
1193                         namp = amp1;
1194                         anon_release(amp1->ahp, btop(amp1->size));
1195                 }
1196                 if (amp2 != NULL) {
1197                         if (namp == NULL) {
1198                                 ASSERT(amp1 == NULL);
1199                                 namp = amp2;
1200                                 anon_release(amp2->ahp, btop(amp2->size));
1201                         } else {
1202                                 amp2->refcnt--;
1203                                 ANON_LOCK_EXIT(&amp2->a_rwlock);
1204                                 anonmap_free(amp2);
1205                         }
1206                         svd2->amp = NULL; /* needed for seg_free */
1207                 }
1208                 namp->ahp = nahp;
1209                 namp->size = asize;
1210                 svd1->amp = namp;
1211                 svd1->anon_index = 0;
1212                 ANON_LOCK_EXIT(&namp->a_rwlock);
1213         }
1214         /*
1215          * Now free the old vpage structures.
1216          */
1217         if (nvpage != NULL) {
1218                 if (vpage1 != NULL) {
1219                         kmem_free(vpage1, vpgtob(npages1));
1220                 }
1221                 if (vpage2 != NULL) {
1222                         svd2->vpage = NULL;
1223                         kmem_free(vpage2, vpgtob(npages2));
1224                 }
1225                 if (svd2->pageprot) {
1226                         svd1->pageprot = 1;
1227                 }
1228                 if (svd2->pageadvice) {
1229                         svd1->pageadvice = 1;
1230                 }
1231                 if (svd2->pageswap) {
1232                         svd1->pageswap = 1;
1233                 }
1234                 svd1->vpage = nvpage;
1235         }
1236 
1237         /* all looks ok, merge segments */
1238         svd1->swresv += svd2->swresv;
1239         svd2->swresv = 0;  /* so seg_free doesn't release swap space */
1240         size = seg2->s_size;
1241         seg_free(seg2);
1242         seg1->s_size += size;
1243         return (0);
1244 }
1245 
1246 /*
1247  * Extend the previous segment (seg1) to include the
1248  * new segment (seg2 + a), if possible.
1249  * Return 0 on success.
1250  */
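     /*
      * Illustrative (non-normative) sketch of how this path can be exercised
      * from user level: a second mapping of the same file created directly
      * above an existing one, with identical protections and flags, may be
      * coalesced into the existing segment.  The fd, offsets and use of
      * MAP_FIXED below are assumptions of the example only; coalescing is an
      * internal optimization, not something mmap(2) guarantees.
      *
      *     size_t pg = sysconf(_SC_PAGESIZE);
      *     char *base = mmap(NULL, pg, PROT_READ, MAP_PRIVATE, fd, 0);
      *     (void) mmap(base + pg, pg, PROT_READ, MAP_PRIVATE | MAP_FIXED,
      *         fd, pg);
      *
      * If the checks below all pass, the first segment simply grows by one
      * page instead of a second segment being created.
      */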
1251 static int
1252 segvn_extend_prev(
1253         struct seg *seg1, struct seg *seg2,
1254         struct segvn_crargs *a,
1255         size_t swresv)
1256 {
1257         struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
1258         size_t size;
1259         struct anon_map *amp1;
1260         struct vpage *new_vpage;
1261 
1262         /*
1263          * We don't need any segment level locks for "segvn" data
1264          * since the address space is "write" locked.
1265          */
1266         ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
1267 
1268         if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
1269                 return (-1);
1270         }
1271 
1272         /* second segment is new, try to extend first */
1273         /* XXX - should also check cred */
1274         if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
1275             (!svd1->pageprot && (svd1->prot != a->prot)) ||
1276             svd1->type != a->type || svd1->flags != a->flags ||
1277             seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
1278                 return (-1);
1279 
1280         /* vp == NULL implies zfod, offset doesn't matter */
1281         if (svd1->vp != NULL &&
1282             svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
1283                 return (-1);
1284 
1285         if (svd1->tr_state != SEGVN_TR_OFF) {
1286                 return (-1);
1287         }
1288 
1289         amp1 = svd1->amp;
1290         if (amp1) {
1291                 pgcnt_t newpgs;
1292 
1293                 /*
1294                  * Segment has private pages, can data structures
1295                  * be expanded?
1296                  *
1297                  * Acquire the anon_map lock to prevent it from changing,
1298                  * if it is shared.  This ensures that the anon_map
1299                  * will not change while a thread which has a read/write
1300                  * lock on an address space references it.
1301                  * XXX - Don't need the anon_map lock at all if "refcnt"
1302                  * is 1.
1303                  *
1304                  * Can't grow a MAP_SHARED segment with an anonmap because
1305                  * there may be existing anon slots where we want to extend
1306                  * the segment and we wouldn't know what to do with them
1307                  * (e.g., for tmpfs the right thing is to just leave them there,
1308                  * for /dev/zero they should be cleared out).
1309                  */
1310                 if (svd1->type == MAP_SHARED)
1311                         return (-1);
1312 
1313                 ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1314                 if (amp1->refcnt > 1) {
1315                         ANON_LOCK_EXIT(&amp1->a_rwlock);
1316                         return (-1);
1317                 }
1318                 newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
1319                     btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
1320 
1321                 if (newpgs == 0) {
1322                         ANON_LOCK_EXIT(&amp1->a_rwlock);
1323                         return (-1);
1324                 }
1325                 amp1->size = ptob(newpgs);
1326                 ANON_LOCK_EXIT(&amp1->a_rwlock);
1327         }
1328         if (svd1->vpage != NULL) {
1329                 struct vpage *vp, *evp;
1330                 new_vpage =
1331                     kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
1332                     KM_NOSLEEP);
1333                 if (new_vpage == NULL)
1334                         return (-1);
1335                 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
1336                 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
1337                 svd1->vpage = new_vpage;
1338 
1339                 vp = new_vpage + seg_pages(seg1);
1340                 evp = vp + seg_pages(seg2);
1341                 for (; vp < evp; vp++)
1342                         VPP_SETPROT(vp, a->prot);
1343                 if (svd1->pageswap && swresv) {
1344                         ASSERT(!(svd1->flags & MAP_NORESERVE));
1345                         ASSERT(swresv == seg2->s_size);
1346                         vp = new_vpage + seg_pages(seg1);
1347                         for (; vp < evp; vp++) {
1348                                 VPP_SETSWAPRES(vp);
1349                         }
1350                 }
1351         }
1352         ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
1353         size = seg2->s_size;
1354         seg_free(seg2);
1355         seg1->s_size += size;
1356         svd1->swresv += swresv;
1357         if (svd1->pageprot && (a->prot & PROT_WRITE) &&
1358             svd1->type == MAP_SHARED && svd1->vp != NULL &&
1359             (svd1->vp->v_flag & VVMEXEC)) {
1360                 ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
1361                 segvn_inval_trcache(svd1->vp);
1362         }
1363         return (0);
1364 }
1365 
1366 /*
1367  * Extend the next segment (seg2) to include the
1368  * new segment (seg1 + a), if possible.
1369  * Return 0 on success.
1370  */
1371 static int
1372 segvn_extend_next(
1373         struct seg *seg1,
1374         struct seg *seg2,
1375         struct segvn_crargs *a,
1376         size_t swresv)
1377 {
1378         struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
1379         size_t size;
1380         struct anon_map *amp2;
1381         struct vpage *new_vpage;
1382 
1383         /*
1384          * We don't need any segment level locks for "segvn" data
1385          * since the address space is "write" locked.
1386          */
1387         ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));
1388 
1389         if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
1390                 return (-1);
1391         }
1392 
1393         /* first segment is new, try to extend second */
1394         /* XXX - should also check cred */
1395         if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
1396             (!svd2->pageprot && (svd2->prot != a->prot)) ||
1397             svd2->type != a->type || svd2->flags != a->flags ||
1398             seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
1399                 return (-1);
1400         /* vp == NULL implies zfod, offset doesn't matter */
1401         if (svd2->vp != NULL &&
1402             (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
1403                 return (-1);
1404 
1405         if (svd2->tr_state != SEGVN_TR_OFF) {
1406                 return (-1);
1407         }
1408 
1409         amp2 = svd2->amp;
1410         if (amp2) {
1411                 pgcnt_t newpgs;
1412 
1413                 /*
1414                  * Segment has private pages, can data structures
1415                  * be expanded?
1416                  *
1417                  * Acquire the anon_map lock to prevent it from changing,
1418                  * if it is shared.  This ensures that the anon_map
1419                  * will not change while a thread which has a read/write
1420                  * lock on an address space references it.
1421                  *
1422                  * XXX - Don't need the anon_map lock at all if "refcnt"
1423                  * is 1.
1424                  */
1425                 if (svd2->type == MAP_SHARED)
1426                         return (-1);
1427 
1428                 ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
1429                 if (amp2->refcnt > 1) {
1430                         ANON_LOCK_EXIT(&amp2->a_rwlock);
1431                         return (-1);
1432                 }
1433                 newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
1434                     btop(seg2->s_size), btop(seg1->s_size),
1435                     ANON_NOSLEEP | ANON_GROWDOWN);
1436 
1437                 if (newpgs == 0) {
1438                         ANON_LOCK_EXIT(&amp2->a_rwlock);
1439                         return (-1);
1440                 }
1441                 amp2->size = ptob(newpgs);
1442                 ANON_LOCK_EXIT(&amp2->a_rwlock);
1443         }
1444         if (svd2->vpage != NULL) {
1445                 struct vpage *vp, *evp;
1446                 new_vpage =
1447                     kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
1448                     KM_NOSLEEP);
1449                 if (new_vpage == NULL) {
1450                         /* Not merging segments so adjust anon_index back */
1451                         if (amp2)
1452                                 svd2->anon_index += seg_pages(seg1);
1453                         return (-1);
1454                 }
1455                 bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
1456                     vpgtob(seg_pages(seg2)));
1457                 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
1458                 svd2->vpage = new_vpage;
1459 
1460                 vp = new_vpage;
1461                 evp = vp + seg_pages(seg1);
1462                 for (; vp < evp; vp++)
1463                         VPP_SETPROT(vp, a->prot);
1464                 if (svd2->pageswap && swresv) {
1465                         ASSERT(!(svd2->flags & MAP_NORESERVE));
1466                         ASSERT(swresv == seg1->s_size);
1467                         vp = new_vpage;
1468                         for (; vp < evp; vp++) {
1469                                 VPP_SETSWAPRES(vp);
1470                         }
1471                 }
1472         }
1473         ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
1474         size = seg1->s_size;
1475         seg_free(seg1);
1476         seg2->s_size += size;
1477         seg2->s_base -= size;
1478         svd2->offset -= size;
1479         svd2->swresv += swresv;
1480         if (svd2->pageprot && (a->prot & PROT_WRITE) &&
1481             svd2->type == MAP_SHARED && svd2->vp != NULL &&
1482             (svd2->vp->v_flag & VVMEXEC)) {
1483                 ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
1484                 segvn_inval_trcache(svd2->vp);
1485         }
1486         return (0);
1487 }
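
     /*
      * Worked example of the downward extension above (hypothetical values):
      * if seg1 is a single 8K page immediately below seg2, the merge grows
      * seg2->s_size by 0x2000, drops seg2->s_base and svd2->offset by 0x2000,
      * and the vpage entries for the new page occupy the front of new_vpage.
      */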
1488 
1489 /*
1490  * Duplicate all the pages in the segment. This may break COW sharing for a
1491  * given page. If the page is marked with inherit zero set, then instead of
1492  * duplicating the page, we zero the page.
1493  */
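     /*
      * The inherit-zero state checked by this function is normally set from
      * user level with memcntl(2).  A hedged, minimal sketch (the exact
      * argument usage is an assumption of this example; see memcntl(2) for
      * the authoritative interface):
      *
      *     (void) memcntl(addr, len, MC_INHERIT_ZERO, 0, 0, 0);
      *     if (fork() == 0) {
      *             // child: pages in [addr, addr + len) read as zeroes
      *             // instead of as copies of the parent's data
      *     }
      */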
1494 static int
1495 segvn_dup_pages(struct seg *seg, struct seg *newseg)
1496 {
1497         int error;
1498         uint_t prot;
1499         page_t *pp;
1500         struct anon *ap, *newap;
1501         size_t i;
1502         caddr_t addr;
1503 
1504         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1505         struct segvn_data *newsvd = (struct segvn_data *)newseg->s_data;
1506         ulong_t old_idx = svd->anon_index;
1507         ulong_t new_idx = 0;
1508 
1509         i = btopr(seg->s_size);
1510         addr = seg->s_base;
1511 
1512         /*
1513          * XXX break cow sharing using PAGESIZE
1514          * pages. They will be relocated into larger
1515          * pages at fault time.
1516          */
1517         while (i-- > 0) {
1518                 if ((ap = anon_get_ptr(svd->amp->ahp, old_idx)) != NULL) {
1519                         struct vpage *vpp;
1520 
1521                         vpp = &svd->vpage[seg_page(seg, addr)];
1522 
1523                         /*
1524                          * prot need not be computed below because anon_private()
1525                          * ignores it anyway, as the child doesn't inherit the
1526                          * pagelock from the parent.
1527                          */
1528                         prot = svd->pageprot ? VPP_PROT(vpp) : svd->prot;
1529 
1530                         /*
1531                          * Check whether we should zero this or dup it.
1532                          */
1533                         if (svd->svn_inz == SEGVN_INZ_ALL ||
1534                             (svd->svn_inz == SEGVN_INZ_VPP &&
1535                             VPP_ISINHZERO(vpp))) {
1536                                 pp = anon_zero(newseg, addr, &newap,
1537                                     newsvd->cred);
1538                         } else {
1539                                 page_t *anon_pl[1+1];
1540                                 uint_t vpprot;
1541                                 error = anon_getpage(&ap, &vpprot, anon_pl,
1542                                     PAGESIZE, seg, addr, S_READ, svd->cred);
1543                                 if (error != 0)
1544                                         return (error);
1545 
1546                                 pp = anon_private(&newap, newseg, addr, prot,
1547                                     anon_pl[0], 0, newsvd->cred);
1548                         }
1549                         if (pp == NULL) {
1550                                 return (ENOMEM);
1551                         }
1552                         (void) anon_set_ptr(newsvd->amp->ahp, new_idx, newap,
1553                             ANON_SLEEP);
1554                         page_unlock(pp);
1555                 }
1556                 addr += PAGESIZE;
1557                 old_idx++;
1558                 new_idx++;
1559         }
1560 
1561         return (0);
1562 }
1563 
1564 static int
1565 segvn_dup(struct seg *seg, struct seg *newseg)
1566 {
1567         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1568         struct segvn_data *newsvd;
1569         pgcnt_t npages = seg_pages(seg);
1570         int error = 0;
1571         size_t len;
1572         struct anon_map *amp;
1573 
1574         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1575         ASSERT(newseg->s_as->a_proc->p_parent == curproc);
1576 
1577         /*
1578          * If segment has anon reserved, reserve more for the new seg.
1579          * For a MAP_NORESERVE segment, swresv will be a count of all the
1580          * allocated anon slots; thus we reserve for the child as many slots
1581          * as the parent has allocated. This semantic prevents the child or
1582          * parent from dying during a copy-on-write fault caused by trying
1583          * to write a shared pre-existing anon page.
1584          */
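             /*
              * Worked example (illustrative numbers only): if the parent's
              * segment has svd->swresv == 0x20000 (128K of reserved swap),
              * anon_resv(0x20000) is called below so that the child's copy of
              * the segment is backed by its own reservation before any
              * copy-on-write fault can happen.
              */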
1585         if ((len = svd->swresv) != 0) {
1586                 if (anon_resv(svd->swresv) == 0)
1587                         return (ENOMEM);
1588 
1589                 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
1590                     seg, len, 0);
1591         }
1592 
1593         newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
1594 
1595         newseg->s_ops = &segvn_ops;
1596         newseg->s_data = (void *)newsvd;
1597         newseg->s_szc = seg->s_szc;
1598 
1599         newsvd->seg = newseg;
1600         if ((newsvd->vp = svd->vp) != NULL) {
1601                 VN_HOLD(svd->vp);
1602                 if (svd->type == MAP_SHARED)
1603                         lgrp_shm_policy_init(NULL, svd->vp);
1604         }
1605         newsvd->offset = svd->offset;
1606         newsvd->prot = svd->prot;
1607         newsvd->maxprot = svd->maxprot;
1608         newsvd->pageprot = svd->pageprot;
1609         newsvd->type = svd->type;
1610         newsvd->cred = svd->cred;
1611         crhold(newsvd->cred);
1612         newsvd->advice = svd->advice;
1613         newsvd->pageadvice = svd->pageadvice;
1614         newsvd->svn_inz = svd->svn_inz;
1615         newsvd->swresv = svd->swresv;
1616         newsvd->pageswap = svd->pageswap;
1617         newsvd->flags = svd->flags;
1618         newsvd->softlockcnt = 0;
1619         newsvd->softlockcnt_sbase = 0;
1620         newsvd->softlockcnt_send = 0;
1621         newsvd->policy_info = svd->policy_info;
1622         newsvd->rcookie = HAT_INVALID_REGION_COOKIE;
1623 
1624         if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
1625                 /*
1626                  * Not attaching to a shared anon object.
1627                  */
1628                 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
1629                     svd->tr_state == SEGVN_TR_OFF);
1630                 if (svd->tr_state == SEGVN_TR_ON) {
1631                         ASSERT(newsvd->vp != NULL && amp != NULL);
1632                         newsvd->tr_state = SEGVN_TR_INIT;
1633                 } else {
1634                         newsvd->tr_state = svd->tr_state;
1635                 }
1636                 newsvd->amp = NULL;
1637                 newsvd->anon_index = 0;
1638         } else {
1639                 /* regions for now are only used on pure vnode segments */
1640                 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
1641                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1642                 newsvd->tr_state = SEGVN_TR_OFF;
1643                 if (svd->type == MAP_SHARED) {
1644                         ASSERT(svd->svn_inz == SEGVN_INZ_NONE);
1645                         newsvd->amp = amp;
1646                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1647                         amp->refcnt++;
1648                         ANON_LOCK_EXIT(&amp->a_rwlock);
1649                         newsvd->anon_index = svd->anon_index;
1650                 } else {
1651                         int reclaim = 1;
1652 
1653                         /*
1654                          * Allocate and initialize new anon_map structure.
1655                          */
1656                         newsvd->amp = anonmap_alloc(newseg->s_size, 0,
1657                             ANON_SLEEP);
1658                         newsvd->amp->a_szc = newseg->s_szc;
1659                         newsvd->anon_index = 0;
1660                         ASSERT(svd->svn_inz == SEGVN_INZ_NONE ||
1661                             svd->svn_inz == SEGVN_INZ_ALL ||
1662                             svd->svn_inz == SEGVN_INZ_VPP);
1663 
1664                         /*
1665                          * We don't have to acquire the anon_map lock
1666                          * for the new segment (since it belongs to an
1667                          * address space that is still not associated
1668                          * with any process), or the segment in the old
1669                          * address space (since all threads in it
1670                          * are stopped while duplicating the address space).
1671                          */
1672 
1673                         /*
1674                          * The goal of the following code is to make sure that
1675                          * softlocked pages do not end up as copy on write
1676                          * pages.  This would cause problems where one
1677                          * thread writes to a page that is COW and a different
1678                          * thread in the same process has softlocked it.  The
1679                          * softlocked page would move away from this process
1680                          * because the write would cause this process to get
1681                          * a copy (without the softlock).
1682                          *
1683                          * The strategy here is to just break the
1684                          * sharing on pages that could possibly be
1685                          * softlocked.
1686                          *
1687                          * In addition, if any pages have been marked that they
1688                          * should be inherited as zero, then we immediately go
1689                          * ahead and break COW and zero them. In the case of a
1690                          * softlocked page that should be inherited zero, we
1691                          * break COW and just get a zero page.
1692                          */
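                             /*
                              * A concrete (hypothetical) instance of the
                              * problem above: thread T1 starts aio_read(3C)
                              * into a private page, softlocking it; another
                              * thread forks; a later write by the parent would
                              * COW the page, so the in-flight I/O would land
                              * in a page the parent no longer maps.  Breaking
                              * the sharing here avoids that.
                              */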
1693 retry:
1694                         if (svd->softlockcnt ||
1695                             svd->svn_inz != SEGVN_INZ_NONE) {
1696                                 /*
1697                                  * The softlock count might be non-zero
1698                                  * because some pages are still stuck in the
1699                                  * cache for lazy reclaim. Flush the cache
1700                                  * now; this should drop the count to zero
1701                                  * (or there really is I/O going on to these
1702                                  * pages). Note that we hold the writer's lock,
1703                                  * so nothing gets inserted during the flush.
1704                                  */
1705                                 if (svd->softlockcnt && reclaim == 1) {
1706                                         segvn_purge(seg);
1707                                         reclaim = 0;
1708                                         goto retry;
1709                                 }
1710 
1711                                 error = segvn_dup_pages(seg, newseg);
1712                                 if (error != 0) {
1713                                         newsvd->vpage = NULL;
1714                                         goto out;
1715                                 }
1716                         } else {        /* common case */
1717                                 if (seg->s_szc != 0) {
1718                                         /*
1719                                          * If at least one of anon slots of a
1720                                          * large page exists then make sure
1721                                          * all anon slots of a large page
1722                                          * exist to avoid partial cow sharing
1723                                          * of a large page in the future.
1724                                          */
1725                                         anon_dup_fill_holes(amp->ahp,
1726                                             svd->anon_index, newsvd->amp->ahp,
1727                                             0, seg->s_size, seg->s_szc,
1728                                             svd->vp != NULL);
1729                                 } else {
1730                                         anon_dup(amp->ahp, svd->anon_index,
1731                                             newsvd->amp->ahp, 0, seg->s_size);
1732                                 }
1733 
1734                                 hat_clrattr(seg->s_as->a_hat, seg->s_base,
1735                                     seg->s_size, PROT_WRITE);
1736                         }
1737                 }
1738         }
1739         /*
1740          * If necessary, create a vpage structure for the new segment.
1741          * Do not copy any page lock indications.
1742          */
1743         if (svd->vpage != NULL) {
1744                 uint_t i;
1745                 struct vpage *ovp = svd->vpage;
1746                 struct vpage *nvp;
1747 
1748                 nvp = newsvd->vpage =
1749                     kmem_alloc(vpgtob(npages), KM_SLEEP);
1750                 for (i = 0; i < npages; i++) {
1751                         *nvp = *ovp++;
1752                         VPP_CLRPPLOCK(nvp++);
1753                 }
1754         } else
1755                 newsvd->vpage = NULL;
1756 
1757         /* Inform the vnode of the new mapping */
1758         if (newsvd->vp != NULL) {
1759                 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
1760                     newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
1761                     newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
1762         }
1763 out:
1764         if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1765                 ASSERT(newsvd->amp == NULL);
1766                 ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
1767                 newsvd->rcookie = svd->rcookie;
1768                 hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
1769         }
1770         return (error);
1771 }
1772 
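     /*
      * User-visible effect of segvn_dup() (hedged, illustrative only): after
      * fork(2), MAP_PRIVATE pages are copy-on-write, so a post-fork write by
      * the parent is not seen by the child.  A minimal user-level sketch,
      * assuming a page-sized private anonymous mapping:
      *
      *     char *p = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
      *         MAP_PRIVATE | MAP_ANON, -1, 0);
      *     p[0] = 'A';
      *     if (fork() == 0) {
      *             sleep(1);
      *             assert(p[0] == 'A');    // still the pre-fork value
      *             _exit(0);
      *     }
      *     p[0] = 'B';                     // parent's write breaks sharing
      */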
1773 
1774 /*
1775  * Callback function to invoke free_vp_pages() for only those pages actually
1776  * processed by the HAT when a shared region is destroyed.
1777  */
1778 extern int free_pages;
1779 
1780 static void
1781 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
1782     size_t r_size, void *r_obj, u_offset_t r_objoff)
1783 {
1784         u_offset_t off;
1785         size_t len;
1786         vnode_t *vp = (vnode_t *)r_obj;
1787 
1788         ASSERT(eaddr > saddr);
1789         ASSERT(saddr >= r_saddr);
1790         ASSERT(saddr < r_saddr + r_size);
1791         ASSERT(eaddr > r_saddr);
1792         ASSERT(eaddr <= r_saddr + r_size);
1793         ASSERT(vp != NULL);
1794 
1795         if (!free_pages) {
1796                 return;
1797         }
1798 
1799         len = eaddr - saddr;
1800         off = (saddr - r_saddr) + r_objoff;
1801         free_vp_pages(vp, off, len);
1802 }
1803 
1804 /*
1805  * Callback function used by segvn_unmap() to invoke free_vp_pages() for only
1806  * those pages actually processed by the HAT.
1807  */
1808 static void
1809 segvn_hat_unload_callback(hat_callback_t *cb)
1810 {
1811         struct seg              *seg = cb->hcb_data;
1812         struct segvn_data       *svd = (struct segvn_data *)seg->s_data;
1813         size_t                  len;
1814         u_offset_t              off;
1815 
1816         ASSERT(svd->vp != NULL);
1817         ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
1818         ASSERT(cb->hcb_start_addr >= seg->s_base);
1819 
1820         len = cb->hcb_end_addr - cb->hcb_start_addr;
1821         off = cb->hcb_start_addr - seg->s_base;
1822         free_vp_pages(svd->vp, svd->offset + off, len);
1823 }
1824 
1825 /*
1826  * This function determines the number of bytes of swap reserved by
1827  * a segment for which per-page accounting is present. It is used to
1828  * calculate the correct value of a segvn_data's swresv.
1829  */
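     /*
      * Example of the accounting below (assuming 4K pages, PAGESHIFT == 12):
      * a vpage array with 5 entries marked VPP_ISSWAPRES yields
      * 5 << 12 == 0x5000 bytes of reserved swap.
      */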
1830 static size_t
1831 segvn_count_swap_by_vpages(struct seg *seg)
1832 {
1833         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1834         struct vpage *vp, *evp;
1835         size_t nswappages = 0;
1836 
1837         ASSERT(svd->pageswap);
1838         ASSERT(svd->vpage != NULL);
1839 
1840         evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
1841 
1842         for (vp = svd->vpage; vp < evp; vp++) {
1843                 if (VPP_ISSWAPRES(vp))
1844                         nswappages++;
1845         }
1846 
1847         return (nswappages << PAGESHIFT);
1848 }
1849 
1850 static int
1851 segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
1852 {
1853         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1854         struct segvn_data *nsvd;
1855         struct seg *nseg;
1856         struct anon_map *amp;
1857         pgcnt_t opages;         /* old segment size in pages */
1858         pgcnt_t npages;         /* new segment size in pages */
1859         pgcnt_t dpages;         /* pages being deleted (unmapped) */
1860         hat_callback_t callback;        /* used for free_vp_pages() */
1861         hat_callback_t *cbp = NULL;
1862         caddr_t nbase;
1863         size_t nsize;
1864         size_t oswresv;
1865         int reclaim = 1;
1866 
1867         /*
1868          * We don't need any segment level locks for "segvn" data
1869          * since the address space is "write" locked.
1870          */
1871         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1872 
1873         /*
1874          * Fail the unmap if pages are SOFTLOCKed through this mapping.
1875          * softlockcnt is protected from change by the as write lock.
1876          */
1877 retry:
1878         if (svd->softlockcnt > 0) {
1879                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1880 
1881                 /*
1882                  * If this is a shared segment, a non-zero softlockcnt
1883                  * means locked pages are still in use.
1884                  */
1885                 if (svd->type == MAP_SHARED) {
1886                         return (EAGAIN);
1887                 }
1888 
1889                 /*
1890                  * Since we hold the writer's lock, nobody can fill
1891                  * the cache during the purge. The flush either succeeds
1892                  * or we still have pending I/Os.
1893                  */
1894                 if (reclaim == 1) {
1895                         segvn_purge(seg);
1896                         reclaim = 0;
1897                         goto retry;
1898                 }
1899                 return (EAGAIN);
1900         }
1901 
1902         /*
1903          * Check for bad sizes
1904          */
1905         if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
1906             (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
1907                 panic("segvn_unmap");
1908                 /*NOTREACHED*/
1909         }
1910 
1911         if (seg->s_szc != 0) {
1912                 size_t pgsz = page_get_pagesize(seg->s_szc);
1913                 int err;
1914                 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
1915                         ASSERT(seg->s_base != addr || seg->s_size != len);
1916                         if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1917                                 ASSERT(svd->amp == NULL);
1918                                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1919                                 hat_leave_region(seg->s_as->a_hat,
1920                                     svd->rcookie, HAT_REGION_TEXT);
1921                                 svd->rcookie = HAT_INVALID_REGION_COOKIE;
1922                                 /*
1923                                  * could pass a flag to segvn_demote_range()
1924                                  * below to tell it not to do any unloads but
1925                                  * this case is rare enough to not bother for
1926                                  * now.
1927                                  */
1928                         } else if (svd->tr_state == SEGVN_TR_INIT) {
1929                                 svd->tr_state = SEGVN_TR_OFF;
1930                         } else if (svd->tr_state == SEGVN_TR_ON) {
1931                                 ASSERT(svd->amp != NULL);
1932                                 segvn_textunrepl(seg, 1);
1933                                 ASSERT(svd->amp == NULL);
1934                                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1935                         }
1936                         VM_STAT_ADD(segvnvmstats.demoterange[0]);
1937                         err = segvn_demote_range(seg, addr, len, SDR_END, 0);
1938                         if (err == 0) {
1939                                 return (IE_RETRY);
1940                         }
1941                         return (err);
1942                 }
1943         }
1944 
1945         /* Inform the vnode of the unmapping. */
1946         if (svd->vp) {
1947                 int error;
1948 
1949                 error = VOP_DELMAP(svd->vp,
1950                     (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
1951                     seg->s_as, addr, len, svd->prot, svd->maxprot,
1952                     svd->type, svd->cred, NULL);
1953 
1954                 if (error == EAGAIN)
1955                         return (error);
1956         }
1957 
1958         /*
1959          * Remove any page locks set through this mapping.
1960          * If text replication is not off, no page locks could have been
1961          * established via this mapping.
1962          */
1963         if (svd->tr_state == SEGVN_TR_OFF) {
1964                 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);
1965         }
1966 
1967         if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1968                 ASSERT(svd->amp == NULL);
1969                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
1970                 ASSERT(svd->type == MAP_PRIVATE);
1971                 hat_leave_region(seg->s_as->a_hat, svd->rcookie,
1972                     HAT_REGION_TEXT);
1973                 svd->rcookie = HAT_INVALID_REGION_COOKIE;
1974         } else if (svd->tr_state == SEGVN_TR_ON) {
1975                 ASSERT(svd->amp != NULL);
1976                 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE));
1977                 segvn_textunrepl(seg, 1);
1978                 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
1979         } else {
1980                 if (svd->tr_state != SEGVN_TR_OFF) {
1981                         ASSERT(svd->tr_state == SEGVN_TR_INIT);
1982                         svd->tr_state = SEGVN_TR_OFF;
1983                 }
1984                 /*
1985                  * Unload any hardware translations in the range to be taken
1986                  * out. Use a callback to invoke free_vp_pages() effectively.
1987                  */
1988                 if (svd->vp != NULL && free_pages != 0) {
1989                         callback.hcb_data = seg;
1990                         callback.hcb_function = segvn_hat_unload_callback;
1991                         cbp = &callback;
1992                 }
1993                 hat_unload_callback(seg->s_as->a_hat, addr, len,
1994                     HAT_UNLOAD_UNMAP, cbp);
1995 
1996                 if (svd->type == MAP_SHARED && svd->vp != NULL &&
1997                     (svd->vp->v_flag & VVMEXEC) &&
1998                     ((svd->prot & PROT_WRITE) || svd->pageprot)) {
1999                         segvn_inval_trcache(svd->vp);
2000                 }
2001         }
2002 
2003         /*
2004          * Check for entire segment
2005          */
2006         if (addr == seg->s_base && len == seg->s_size) {
2007                 seg_free(seg);
2008                 return (0);
2009         }
2010 
2011         opages = seg_pages(seg);
2012         dpages = btop(len);
2013         npages = opages - dpages;
2014         amp = svd->amp;
2015         ASSERT(amp == NULL || amp->a_szc >= seg->s_szc);
2016 
2017         /*
2018          * Check for beginning of segment
2019          */
2020         if (addr == seg->s_base) {
2021                 if (svd->vpage != NULL) {
2022                         size_t nbytes;
2023                         struct vpage *ovpage;
2024 
2025                         ovpage = svd->vpage; /* keep pointer to vpage */
2026 
2027                         nbytes = vpgtob(npages);
2028                         svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2029                         bcopy(&ovpage[dpages], svd->vpage, nbytes);
2030 
2031                         /* free up old vpage */
2032                         kmem_free(ovpage, vpgtob(opages));
2033                 }
2034                 if (amp != NULL) {
2035                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2036                         if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2037                                 /*
2038                                  * Shared anon map is no longer in use. Before
2039                                  * freeing its pages purge all entries from
2040                                  * pcache that belong to this amp.
2041                                  */
2042                                 if (svd->type == MAP_SHARED) {
2043                                         ASSERT(amp->refcnt == 1);
2044                                         ASSERT(svd->softlockcnt == 0);
2045                                         anonmap_purge(amp);
2046                                 }
2047                                 /*
2048                                  * Free up now unused parts of anon_map array.
2049                                  */
2050                                 if (amp->a_szc == seg->s_szc) {
2051                                         if (seg->s_szc != 0) {
2052                                                 anon_free_pages(amp->ahp,
2053                                                     svd->anon_index, len,
2054                                                     seg->s_szc);
2055                                         } else {
2056                                                 anon_free(amp->ahp,
2057                                                     svd->anon_index,
2058                                                     len);
2059                                         }
2060                                 } else {
2061                                         ASSERT(svd->type == MAP_SHARED);
2062                                         ASSERT(amp->a_szc > seg->s_szc);
2063                                         anon_shmap_free_pages(amp,
2064                                             svd->anon_index, len);
2065                                 }
2066 
2067                                 /*
2068                                  * Unreserve swap space for the
2069                                  * unmapped chunk of this segment in
2070                                  * case it's MAP_SHARED
2071                                  */
2072                                 if (svd->type == MAP_SHARED) {
2073                                         anon_unresv_zone(len,
2074                                             seg->s_as->a_proc->p_zone);
2075                                         amp->swresv -= len;
2076                                 }
2077                         }
2078                         ANON_LOCK_EXIT(&amp->a_rwlock);
2079                         svd->anon_index += dpages;
2080                 }
2081                 if (svd->vp != NULL)
2082                         svd->offset += len;
2083 
2084                 seg->s_base += len;
2085                 seg->s_size -= len;
2086 
2087                 if (svd->swresv) {
2088                         if (svd->flags & MAP_NORESERVE) {
2089                                 ASSERT(amp);
2090                                 oswresv = svd->swresv;
2091 
2092                                 svd->swresv = ptob(anon_pages(amp->ahp,
2093                                     svd->anon_index, npages));
2094                                 anon_unresv_zone(oswresv - svd->swresv,
2095                                     seg->s_as->a_proc->p_zone);
2096                                 if (SEG_IS_PARTIAL_RESV(seg))
2097                                         seg->s_as->a_resvsize -= oswresv -
2098                                             svd->swresv;
2099                         } else {
2100                                 size_t unlen;
2101 
2102                                 if (svd->pageswap) {
2103                                         oswresv = svd->swresv;
2104                                         svd->swresv =
2105                                             segvn_count_swap_by_vpages(seg);
2106                                         ASSERT(oswresv >= svd->swresv);
2107                                         unlen = oswresv - svd->swresv;
2108                                 } else {
2109                                         svd->swresv -= len;
2110                                         ASSERT(svd->swresv == seg->s_size);
2111                                         unlen = len;
2112                                 }
2113                                 anon_unresv_zone(unlen,
2114                                     seg->s_as->a_proc->p_zone);
2115                         }
2116                         TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2117                             seg, len, 0);
2118                 }
2119 
2120                 return (0);
2121         }
2122 
2123         /*
2124          * Check for end of segment
2125          */
2126         if (addr + len == seg->s_base + seg->s_size) {
2127                 if (svd->vpage != NULL) {
2128                         size_t nbytes;
2129                         struct vpage *ovpage;
2130 
2131                         ovpage = svd->vpage; /* keep pointer to vpage */
2132 
2133                         nbytes = vpgtob(npages);
2134                         svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2135                         bcopy(ovpage, svd->vpage, nbytes);
2136 
2137                         /* free up old vpage */
2138                         kmem_free(ovpage, vpgtob(opages));
2139 
2140                 }
2141                 if (amp != NULL) {
2142                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2143                         if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2144                                 /*
2145                                  * Free up now unused parts of anon_map array.
2146                                  */
2147                                 ulong_t an_idx = svd->anon_index + npages;
2148 
2149                                 /*
2150                                  * Shared anon map is no longer in use. Before
2151                                  * freeing its pages purge all entries from
2152                                  * pcache that belong to this amp.
2153                                  */
2154                                 if (svd->type == MAP_SHARED) {
2155                                         ASSERT(amp->refcnt == 1);
2156                                         ASSERT(svd->softlockcnt == 0);
2157                                         anonmap_purge(amp);
2158                                 }
2159 
2160                                 if (amp->a_szc == seg->s_szc) {
2161                                         if (seg->s_szc != 0) {
2162                                                 anon_free_pages(amp->ahp,
2163                                                     an_idx, len,
2164                                                     seg->s_szc);
2165                                         } else {
2166                                                 anon_free(amp->ahp, an_idx,
2167                                                     len);
2168                                         }
2169                                 } else {
2170                                         ASSERT(svd->type == MAP_SHARED);
2171                                         ASSERT(amp->a_szc > seg->s_szc);
2172                                         anon_shmap_free_pages(amp,
2173                                             an_idx, len);
2174                                 }
2175 
2176                                 /*
2177                                  * Unreserve swap space for the
2178                                  * unmapped chunk of this segment in
2179                                  * case it's MAP_SHARED
2180                                  */
2181                                 if (svd->type == MAP_SHARED) {
2182                                         anon_unresv_zone(len,
2183                                             seg->s_as->a_proc->p_zone);
2184                                         amp->swresv -= len;
2185                                 }
2186                         }
2187                         ANON_LOCK_EXIT(&amp->a_rwlock);
2188                 }
2189 
2190                 seg->s_size -= len;
2191 
2192                 if (svd->swresv) {
2193                         if (svd->flags & MAP_NORESERVE) {
2194                                 ASSERT(amp);
2195                                 oswresv = svd->swresv;
2196                                 svd->swresv = ptob(anon_pages(amp->ahp,
2197                                     svd->anon_index, npages));
2198                                 anon_unresv_zone(oswresv - svd->swresv,
2199                                     seg->s_as->a_proc->p_zone);
2200                                 if (SEG_IS_PARTIAL_RESV(seg))
2201                                         seg->s_as->a_resvsize -= oswresv -
2202                                             svd->swresv;
2203                         } else {
2204                                 size_t unlen;
2205 
2206                                 if (svd->pageswap) {
2207                                         oswresv = svd->swresv;
2208                                         svd->swresv =
2209                                             segvn_count_swap_by_vpages(seg);
2210                                         ASSERT(oswresv >= svd->swresv);
2211                                         unlen = oswresv - svd->swresv;
2212                                 } else {
2213                                         svd->swresv -= len;
2214                                         ASSERT(svd->swresv == seg->s_size);
2215                                         unlen = len;
2216                                 }
2217                                 anon_unresv_zone(unlen,
2218                                     seg->s_as->a_proc->p_zone);
2219                         }
2220                         TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2221                             "anon proc:%p %lu %u", seg, len, 0);
2222                 }
2223 
2224                 return (0);
2225         }
2226 
2227         /*
2228          * The section to go is in the middle of the segment, so we
2229          * have to split it into two segments.  nseg is made for
2230          * the high end while seg is cut down at the low end.
2231          */
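             /*
              * Worked example (hypothetical addresses): for a segment
              * covering [0x10000, 0x50000), unmapping addr == 0x20000,
              * len == 0x10000 gives nbase == 0x30000, nsize == 0x20000, and
              * shrinks seg to s_size == 0x10000, i.e. [0x10000, 0x20000).
              */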
2232         nbase = addr + len;                             /* new seg base */
2233         nsize = (seg->s_base + seg->s_size) - nbase;      /* new seg size */
2234         seg->s_size = addr - seg->s_base;         /* shrink old seg */
2235         nseg = seg_alloc(seg->s_as, nbase, nsize);
2236         if (nseg == NULL) {
2237                 panic("segvn_unmap seg_alloc");
2238                 /*NOTREACHED*/
2239         }
2240         nseg->s_ops = seg->s_ops;
2241         nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
2242         nseg->s_data = (void *)nsvd;
2243         nseg->s_szc = seg->s_szc;
2244         *nsvd = *svd;
2245         nsvd->seg = nseg;
2246         nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
2247         nsvd->swresv = 0;
2248         nsvd->softlockcnt = 0;
2249         nsvd->softlockcnt_sbase = 0;
2250         nsvd->softlockcnt_send = 0;
2251         nsvd->svn_inz = svd->svn_inz;
2252         ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
2253 
2254         if (svd->vp != NULL) {
2255                 VN_HOLD(nsvd->vp);
2256                 if (nsvd->type == MAP_SHARED)
2257                         lgrp_shm_policy_init(NULL, nsvd->vp);
2258         }
2259         crhold(svd->cred);
2260 
2261         if (svd->vpage == NULL) {
2262                 nsvd->vpage = NULL;
2263         } else {
2264                 /* need to split vpage into two arrays */
2265                 size_t nbytes;
2266                 struct vpage *ovpage;
2267 
2268                 ovpage = svd->vpage;         /* keep pointer to vpage */
2269 
2270                 npages = seg_pages(seg);        /* seg has shrunk */
2271                 nbytes = vpgtob(npages);
2272                 svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2273 
2274                 bcopy(ovpage, svd->vpage, nbytes);
2275 
2276                 npages = seg_pages(nseg);
2277                 nbytes = vpgtob(npages);
2278                 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2279 
2280                 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
2281 
2282                 /* free up old vpage */
2283                 kmem_free(ovpage, vpgtob(opages));
2284         }
2285 
2286         if (amp == NULL) {
2287                 nsvd->amp = NULL;
2288                 nsvd->anon_index = 0;
2289         } else {
2290                 /*
2291                  * Need to create a new anon map for the new segment.
2292                  * We'll also allocate a new smaller array for the old
2293                  * smaller segment to save space.
2294                  */
2295                 opages = btop((uintptr_t)(addr - seg->s_base));
2296                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2297                 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2298                         /*
2299                          * Free up now unused parts of anon_map array.
2300                          */
2301                         ulong_t an_idx = svd->anon_index + opages;
2302 
2303                         /*
2304                          * Shared anon map is no longer in use. Before
2305                          * freeing its pages purge all entries from
2306                          * pcache that belong to this amp.
2307                          */
2308                         if (svd->type == MAP_SHARED) {
2309                                 ASSERT(amp->refcnt == 1);
2310                                 ASSERT(svd->softlockcnt == 0);
2311                                 anonmap_purge(amp);
2312                         }
2313 
2314                         if (amp->a_szc == seg->s_szc) {
2315                                 if (seg->s_szc != 0) {
2316                                         anon_free_pages(amp->ahp, an_idx, len,
2317                                             seg->s_szc);
2318                                 } else {
2319                                         anon_free(amp->ahp, an_idx,
2320                                             len);
2321                                 }
2322                         } else {
2323                                 ASSERT(svd->type == MAP_SHARED);
2324                                 ASSERT(amp->a_szc > seg->s_szc);
2325                                 anon_shmap_free_pages(amp, an_idx, len);
2326                         }
2327 
2328                         /*
2329                          * Unreserve swap space for the
2330                          * unmapped chunk of this segment in
2331                          * case it's MAP_SHARED
2332                          */
2333                         if (svd->type == MAP_SHARED) {
2334                                 anon_unresv_zone(len,
2335                                     seg->s_as->a_proc->p_zone);
2336                                 amp->swresv -= len;
2337                         }
2338                 }
2339                 nsvd->anon_index = svd->anon_index +
2340                     btop((uintptr_t)(nseg->s_base - seg->s_base));
2341                 if (svd->type == MAP_SHARED) {
2342                         amp->refcnt++;
2343                         nsvd->amp = amp;
2344                 } else {
2345                         struct anon_map *namp;
2346                         struct anon_hdr *nahp;
2347 
2348                         ASSERT(svd->type == MAP_PRIVATE);
2349                         nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
2350                         namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
2351                         namp->a_szc = seg->s_szc;
2352                         (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
2353                             0, btop(seg->s_size), ANON_SLEEP);
2354                         (void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
2355                             namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
2356                         anon_release(amp->ahp, btop(amp->size));
2357                         svd->anon_index = 0;
2358                         nsvd->anon_index = 0;
2359                         amp->ahp = nahp;
2360                         amp->size = seg->s_size;
2361                         nsvd->amp = namp;
2362                 }
2363                 ANON_LOCK_EXIT(&amp->a_rwlock);
2364         }
2365         if (svd->swresv) {
2366                 if (svd->flags & MAP_NORESERVE) {
2367                         ASSERT(amp);
2368                         oswresv = svd->swresv;
2369                         svd->swresv = ptob(anon_pages(amp->ahp,
2370                             svd->anon_index, btop(seg->s_size)));
2371                         nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
2372                             nsvd->anon_index, btop(nseg->s_size)));
2373                         ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
2374                         anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv),
2375                             seg->s_as->a_proc->p_zone);
2376                         if (SEG_IS_PARTIAL_RESV(seg))
2377                                 seg->s_as->a_resvsize -= oswresv -
2378                                     (svd->swresv + nsvd->swresv);
2379                 } else {
2380                         size_t unlen;
2381 
2382                         if (svd->pageswap) {
2383                                 oswresv = svd->swresv;
2384                                 svd->swresv = segvn_count_swap_by_vpages(seg);
2385                                 nsvd->swresv = segvn_count_swap_by_vpages(nseg);
2386                                 ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
2387                                 unlen = oswresv - (svd->swresv + nsvd->swresv);
2388                         } else {
2389                                 if (seg->s_size + nseg->s_size + len !=
2390                                     svd->swresv) {
2391                                         panic("segvn_unmap: cannot split "
2392                                             "swap reservation");
2393                                         /*NOTREACHED*/
2394                                 }
2395                                 svd->swresv = seg->s_size;
2396                                 nsvd->swresv = nseg->s_size;
2397                                 unlen = len;
2398                         }
2399                         anon_unresv_zone(unlen,
2400                             seg->s_as->a_proc->p_zone);
2401                 }
2402                 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2403                     seg, len, 0);
2404         }
2405 
2406         return (0);                     /* I'm glad that's all over with! */
2407 }
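
     /*
      * Hedged user-level illustration of the middle-split case handled at the
      * end of segvn_unmap() above: unmapping the middle page of a three-page
      * private anonymous mapping leaves two segments behind.  The flags used
      * below are assumptions of the example, not requirements on callers.
      *
      *     size_t pg = sysconf(_SC_PAGESIZE);
      *     char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
      *         MAP_PRIVATE | MAP_ANON, -1, 0);
      *     (void) munmap(p + pg, pg);
      *     // p[0] and p[2 * pg] are still valid; p[pg] now faults
      */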
2408 
2409 static void
2410 segvn_free(struct seg *seg)
2411 {
2412         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2413         pgcnt_t npages = seg_pages(seg);
2414         struct anon_map *amp;
2415         size_t len;
2416 
2417         /*
2418          * We don't need any segment level locks for "segvn" data
2419          * since the address space is "write" locked.
2420          */
2421         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
2422         ASSERT(svd->tr_state == SEGVN_TR_OFF);
2423 
2424         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2425 
2426         /*
2427          * Be sure to unlock pages. XXX Why do things get freed instead
2428          * of unmapped? XXX
2429          */
2430         (void) segvn_lockop(seg, seg->s_base, seg->s_size,
2431             0, MC_UNLOCK, NULL, 0);
2432 
2433         /*
2434          * Deallocate the vpage and anon pointers if necessary and possible.
2435          */
2436         if (svd->vpage != NULL) {
2437                 kmem_free(svd->vpage, vpgtob(npages));
2438                 svd->vpage = NULL;
2439         }
2440         if ((amp = svd->amp) != NULL) {
2441                 /*
2442                  * If there are no more references to this anon_map
2443                  * structure, then deallocate the structure after freeing
2444                  * up all the anon slot pointers that we can.
2445                  */
2446                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2447                 ASSERT(amp->a_szc >= seg->s_szc);
2448                 if (--amp->refcnt == 0) {
2449                         if (svd->type == MAP_PRIVATE) {
2450                                 /*
2451                                  * Private - we only need to anon_free
2452                                  * the part that this segment refers to.
2453                                  */
2454                                 if (seg->s_szc != 0) {
2455                                         anon_free_pages(amp->ahp,
2456                                             svd->anon_index, seg->s_size,
2457                                             seg->s_szc);
2458                                 } else {
2459                                         anon_free(amp->ahp, svd->anon_index,
2460                                             seg->s_size);
2461                                 }
2462                         } else {
2463 
2464                                 /*
2465                                  * Shared anon map is no longer in use. Before
2466                                  * freeing its pages purge all entries from
2467                                  * pcache that belong to this amp.
2468                                  */
2469                                 ASSERT(svd->softlockcnt == 0);
2470                                 anonmap_purge(amp);
2471 
2472                                 /*
2473                                  * Shared - anon_free the entire
2474                                  * anon_map's worth of stuff and
2475                                  * release any swap reservation.
2476                                  */
2477                                 if (amp->a_szc != 0) {
2478                                         anon_shmap_free_pages(amp, 0,
2479                                             amp->size);
2480                                 } else {
2481                                         anon_free(amp->ahp, 0, amp->size);
2482                                 }
2483                                 if ((len = amp->swresv) != 0) {
2484                                         anon_unresv_zone(len,
2485                                             seg->s_as->a_proc->p_zone);
2486                                         TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2487                                             "anon proc:%p %lu %u", seg, len, 0);
2488                                 }
2489                         }
2490                         svd->amp = NULL;
2491                         ANON_LOCK_EXIT(&amp->a_rwlock);
2492                         anonmap_free(amp);
2493                 } else if (svd->type == MAP_PRIVATE) {
2494                         /*
2495                          * We had a private mapping which still has
2496                          * a held anon_map so just free up all the
2497                          * anon slot pointers that we were using.
2498                          */
2499                         if (seg->s_szc != 0) {
2500                                 anon_free_pages(amp->ahp, svd->anon_index,
2501                                     seg->s_size, seg->s_szc);
2502                         } else {
2503                                 anon_free(amp->ahp, svd->anon_index,
2504                                     seg->s_size);
2505                         }
2506                         ANON_LOCK_EXIT(&amp->a_rwlock);
2507                 } else {
2508                         ANON_LOCK_EXIT(&amp->a_rwlock);
2509                 }
2510         }
2511 
2512         /*
2513          * Release swap reservation.
2514          */
2515         if ((len = svd->swresv) != 0) {
2516                 anon_unresv_zone(svd->swresv,
2517                     seg->s_as->a_proc->p_zone);
2518                 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2519                     seg, len, 0);
2520                 if (SEG_IS_PARTIAL_RESV(seg))
2521                         seg->s_as->a_resvsize -= svd->swresv;
2522                 svd->swresv = 0;
2523         }
2524         /*
2525          * Release claim on vnode, credentials, and finally free the
2526          * private data.
2527          */
2528         if (svd->vp != NULL) {
2529                 if (svd->type == MAP_SHARED)
2530                         lgrp_shm_policy_fini(NULL, svd->vp);
2531                 VN_RELE(svd->vp);
2532                 svd->vp = NULL;
2533         }
2534         crfree(svd->cred);
2535         svd->pageprot = 0;
2536         svd->pageadvice = 0;
2537         svd->pageswap = 0;
2538         svd->cred = NULL;
2539 
2540         /*
2541          * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's
2542          * still working with this segment without holding as lock (in case
2543          * it's called by pcache async thread).
2544          */
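             /*
              * The empty mutex_enter/mutex_exit pair below acts as a
              * barrier: once segfree_syncmtx can be acquired, any
              * segvn_reclaim() call that was still using this segment has
              * dropped the mutex, so freeing svd afterwards is safe.
              */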
2545         ASSERT(svd->softlockcnt == 0);
2546         mutex_enter(&svd->segfree_syncmtx);
2547         mutex_exit(&svd->segfree_syncmtx);
2548 
2549         seg->s_data = NULL;
2550         kmem_cache_free(segvn_cache, svd);
2551 }
2552 
2553 /*
2554  * Do a F_SOFTUNLOCK call over the range requested.  The range must have
2555  * already been F_SOFTLOCK'ed.
2556  * Caller must always match addr and len of a softunlock with a previous
2557  * softlock with exactly the same addr and len.
2558  */
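     /*
      * For example, a caller that soft-locked [addr, addr + len) with an
      * F_SOFTLOCK fault must unlock with exactly the same addr and len so
      * that the btop(len) decrement of svd->softlockcnt below matches the
      * increments made earlier at fault time (one per page in
      * segvn_faultpage(), for example).
      */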
2559 static void
2560 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
2561 {
2562         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2563         page_t *pp;
2564         caddr_t adr;
2565         struct vnode *vp;
2566         u_offset_t offset;
2567         ulong_t anon_index;
2568         struct anon_map *amp;
2569         struct anon *ap = NULL;
2570 
2571         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2572         ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
2573 
2574         if ((amp = svd->amp) != NULL)
2575                 anon_index = svd->anon_index + seg_page(seg, addr);
2576 
2577         if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
2578                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
2579                 hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
2580         } else {
2581                 hat_unlock(seg->s_as->a_hat, addr, len);
2582         }
2583         for (adr = addr; adr < addr + len; adr += PAGESIZE) {
2584                 if (amp != NULL) {
2585                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2586                         if ((ap = anon_get_ptr(amp->ahp, anon_index++))
2587                             != NULL) {
2588                                 swap_xlate(ap, &vp, &offset);
2589                         } else {
2590                                 vp = svd->vp;
2591                                 offset = svd->offset +
2592                                     (uintptr_t)(adr - seg->s_base);
2593                         }
2594                         ANON_LOCK_EXIT(&amp->a_rwlock);
2595                 } else {
2596                         vp = svd->vp;
2597                         offset = svd->offset +
2598                             (uintptr_t)(adr - seg->s_base);
2599                 }
2600 
2601                 /*
2602                  * Use page_find() instead of page_lookup() to
2603                  * find the page since we know that it is locked.
2604                  */
2605                 pp = page_find(vp, offset);
2606                 if (pp == NULL) {
2607                         panic(
2608                             "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
2609                             (void *)adr, (void *)ap, (void *)vp, offset);
2610                         /*NOTREACHED*/
2611                 }
2612 
2613                 if (rw == S_WRITE) {
2614                         hat_setrefmod(pp);
2615                         if (seg->s_as->a_vbits)
2616                                 hat_setstat(seg->s_as, adr, PAGESIZE,
2617                                     P_REF | P_MOD);
2618                 } else if (rw != S_OTHER) {
2619                         hat_setref(pp);
2620                         if (seg->s_as->a_vbits)
2621                                 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
2622                 }
2623                 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2624                     "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
2625                 page_unlock(pp);
2626         }
2627         ASSERT(svd->softlockcnt >= btop(len));
2628         if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) {
2629                 /*
2630                  * All SOFTLOCKS are gone. Wakeup any waiting
2631                  * unmappers so they can try again to unmap.
2632                  * Check for waiters first without the mutex
2633                  * held so we don't always grab the mutex on
2634                  * softunlocks.
2635                  */
2636                 if (AS_ISUNMAPWAIT(seg->s_as)) {
2637                         mutex_enter(&seg->s_as->a_contents);
2638                         if (AS_ISUNMAPWAIT(seg->s_as)) {
2639                                 AS_CLRUNMAPWAIT(seg->s_as);
2640                                 cv_broadcast(&seg->s_as->a_cv);
2641                         }
2642                         mutex_exit(&seg->s_as->a_contents);
2643                 }
2644         }
2645 }
2646 
2647 #define PAGE_HANDLED    ((page_t *)-1)
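     /*
      * PAGE_HANDLED is a sentinel stored into the pl[] page list by
      * segvn_faultpage() once it has consumed an entry, so that a later
      * segvn_pagelist_rele() pass knows not to page_unlock() it again.
      */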
2648 
2649 /*
2650  * Release all the pages in the NULL terminated ppp list
2651  * which haven't already been converted to PAGE_HANDLED.
2652  */
2653 static void
2654 segvn_pagelist_rele(page_t **ppp)
2655 {
2656         for (; *ppp != NULL; ppp++) {
2657                 if (*ppp != PAGE_HANDLED)
2658                         page_unlock(*ppp);
2659         }
2660 }
2661 
2662 static int stealcow = 1;
2663 
2664 /*
2665  * Workaround for viking chip bug.  See bug id 1220902.
2666  * To fix this down in pagefault() would require importing so
2667  * much of the as (address space) and segvn code as to be unmaintainable.
2668  */
2669 int enable_mbit_wa = 0;
2670 
2671 /*
2672  * Handles all the dirty work of getting the right
2673  * anonymous pages and loading up the translations.
2674  * This routine is called only from segvn_fault()
2675  * when looping over the range of addresses requested.
2676  *
2677  * The basic algorithm here is:
2678  *      If this is an anon_zero case
2679  *              Call anon_zero to allocate page
2680  *              Load up translation
2681  *              Return
2682  *      endif
2683  *      If this is an anon page
2684  *              Use anon_getpage to get the page
2685  *      else
2686  *              Find page in pl[] list passed in
2687  *      endif
2688  *      If not a cow
2689  *              Load up the translation to the page
2690  *              return
2691  *      endif
2692  *      Call anon_private to handle cow
2693  *      Load up (writable) translation to new page
2694  */
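     /*
      * Returns 0 on success, FC_PROT for an illegal access type, or
      * FC_MAKE_ERR(err) (e.g. ENOMEM from swap reservation or anon page
      * allocation, or an anon_getpage() error) on failure.
      */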
2695 static faultcode_t
2696 segvn_faultpage(
2697         struct hat *hat,                /* the hat to use for mapping */
2698         struct seg *seg,                /* seg_vn of interest */
2699         caddr_t addr,                   /* address in as */
2700         u_offset_t off,                 /* offset in vp */
2701         struct vpage *vpage,            /* pointer to vpage for vp, off */
2702         page_t *pl[],                   /* object source page pointer */
2703         uint_t vpprot,                  /* access allowed to object pages */
2704         enum fault_type type,           /* type of fault */
2705         enum seg_rw rw,                 /* type of access at fault */
2706         int brkcow)                     /* we may need to break cow */
2707 {
2708         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2709         page_t *pp, **ppp;
2710         uint_t pageflags = 0;
2711         page_t *anon_pl[1 + 1];
2712         page_t *opp = NULL;             /* original page */
2713         uint_t prot;
2714         int err;
2715         int cow;
2716         int claim;
2717         int steal = 0;
2718         ulong_t anon_index;
2719         struct anon *ap, *oldap;
2720         struct anon_map *amp;
2721         int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
2722         int anon_lock = 0;
2723         anon_sync_obj_t cookie;
2724 
2725         if (svd->flags & MAP_TEXT) {
2726                 hat_flag |= HAT_LOAD_TEXT;
2727         }
2728 
2729         ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
2730         ASSERT(seg->s_szc == 0);
2731         ASSERT(svd->tr_state != SEGVN_TR_INIT);
2732 
2733         /*
2734          * Initialize protection value for this page.
2735          * If we have per page protection values check it now.
2736          */
2737         if (svd->pageprot) {
2738                 uint_t protchk;
2739 
2740                 switch (rw) {
2741                 case S_READ:
2742                         protchk = PROT_READ;
2743                         break;
2744                 case S_WRITE:
2745                         protchk = PROT_WRITE;
2746                         break;
2747                 case S_EXEC:
2748                         protchk = PROT_EXEC;
2749                         break;
2750                 case S_OTHER:
2751                 default:
2752                         protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
2753                         break;
2754                 }
2755 
2756                 prot = VPP_PROT(vpage);
2757                 if ((prot & protchk) == 0)
2758                         return (FC_PROT);       /* illegal access type */
2759         } else {
2760                 prot = svd->prot;
2761         }
2762 
2763         if (type == F_SOFTLOCK) {
2764                 atomic_inc_ulong((ulong_t *)&svd->softlockcnt);
2765         }
2766 
2767         /*
2768          * Always acquire the anon array lock to prevent 2 threads from
2769          * allocating separate anon slots for the same "addr".
2770          */
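             /*
              * For instance, without this serialization two threads
              * faulting the same unbacked address could each allocate a
              * zero-filled page and an anon slot for it; anon_array_enter()
              * makes the later thread wait and then find the slot already
              * populated.
              */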
2771 
2772         if ((amp = svd->amp) != NULL) {
2773                 ASSERT(RW_READ_HELD(&amp->a_rwlock));
2774                 anon_index = svd->anon_index + seg_page(seg, addr);
2775                 anon_array_enter(amp, anon_index, &cookie);
2776                 anon_lock = 1;
2777         }
2778 
2779         if (svd->vp == NULL && amp != NULL) {
2780                 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) {
2781                         /*
2782                          * Allocate a (normally) writable anonymous page of
2783                          * zeroes. If no advance reservations, reserve now.
2784                          */
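                             /*
                              * Accounting sketch for the MAP_NORESERVE case:
                              * each first-touch zero-fill page reserves
                              * ptob(1) of swap against the zone and grows
                              * both svd->swresv and the address space's
                              * a_resvsize by one page, so the unmap/free
                              * paths can later release exactly what was
                              * reserved here.
                              */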
2785                         if (svd->flags & MAP_NORESERVE) {
2786                                 if (anon_resv_zone(ptob(1),
2787                                     seg->s_as->a_proc->p_zone)) {
2788                                         atomic_add_long(&svd->swresv, ptob(1));
2789                                         atomic_add_long(&seg->s_as->a_resvsize,
2790                                             ptob(1));
2791                                 } else {
2792                                         err = ENOMEM;
2793                                         goto out;
2794                                 }
2795                         }
2796                         if ((pp = anon_zero(seg, addr, &ap,
2797                             svd->cred)) == NULL) {
2798                                 err = ENOMEM;
2799                                 goto out;       /* out of swap space */
2800                         }
2801                         /*
2802                          * Re-acquire the anon_map lock and
2803                          * initialize the anon array entry.
2804                          */
2805                         (void) anon_set_ptr(amp->ahp, anon_index, ap,
2806                             ANON_SLEEP);
2807 
2808                         ASSERT(pp->p_szc == 0);
2809 
2810                         /*
2811                          * Handle pages that have been marked for migration
2812                          */
2813                         if (lgrp_optimizations())
2814                                 page_migrate(seg, addr, &pp, 1);
2815 
2816                         if (enable_mbit_wa) {
2817                                 if (rw == S_WRITE)
2818                                         hat_setmod(pp);
2819                                 else if (!hat_ismod(pp))
2820                                         prot &= ~PROT_WRITE;
2821                         }
2822                         /*
2823                          * If AS_PAGLCK is set in a_flags (via memcntl(2)
2824                          * with MC_LOCKAS, MCL_FUTURE) and this is a
2825                          * MAP_NORESERVE segment, we may need to
2826                          * permanently lock the page as it is being faulted
2827                          * for the first time. The following text applies
2828                          * only to MAP_NORESERVE segments:
2829                          *
2830                          * As per memcntl(2), if this segment was created
2831                          * after MCL_FUTURE was applied (a "future"
2832                          * segment), its pages must be locked.  If this
2833                          * segment existed at MCL_FUTURE application (a
2834                          * "past" segment), the interface is unclear.
2835                          *
2836                          * We decide to lock only if vpage is present:
2837                          *
2838                          * - "future" segments will have a vpage array (see
2839                          *    as_map), and so will be locked as required
2840                          *
2841                          * - "past" segments may not have a vpage array,
2842                          *    depending on whether events (such as
2843                          *    mprotect) have occurred. Locking if vpage
2844                          *    exists will preserve legacy behavior.  Not
2845                          *    locking if vpage is absent, will not break
2846                          *    the interface or legacy behavior.  Note that
2847                          *    allocating vpage here if it's absent requires
2848                          *    upgrading the segvn reader lock, the cost of
2849                          *    which does not seem worthwhile.
2850                          *
2851                          * Usually testing and setting VPP_ISPPLOCK and
2852                          * VPP_SETPPLOCK requires holding the segvn lock as
2853                          * writer, but in this case all readers are
2854                          * serializing on the anon array lock.
2855                          */
2856                         if (AS_ISPGLCK(seg->s_as) && vpage != NULL &&
2857                             (svd->flags & MAP_NORESERVE) &&
2858                             !VPP_ISPPLOCK(vpage)) {
2859                                 proc_t *p = seg->s_as->a_proc;
2860                                 ASSERT(svd->type == MAP_PRIVATE);
2861                                 mutex_enter(&p->p_lock);
2862                                 if (rctl_incr_locked_mem(p, NULL, PAGESIZE,
2863                                     1) == 0) {
2864                                         claim = VPP_PROT(vpage) & PROT_WRITE;
2865                                         if (page_pp_lock(pp, claim, 0)) {
2866                                                 VPP_SETPPLOCK(vpage);
2867                                         } else {
2868                                                 rctl_decr_locked_mem(p, NULL,
2869                                                     PAGESIZE, 1);
2870                                         }
2871                                 }
2872                                 mutex_exit(&p->p_lock);
2873                         }
2874 
2875                         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2876                         hat_memload(hat, addr, pp, prot, hat_flag);
2877 
2878                         if (!(hat_flag & HAT_LOAD_LOCK))
2879                                 page_unlock(pp);
2880 
2881                         anon_array_exit(&cookie);
2882                         return (0);
2883                 }
2884         }
2885 
2886         /*
2887          * Obtain the page structure via anon_getpage() if it is
2888          * a private copy of an object (the result of a previous
2889          * copy-on-write).
2890          */
2891         if (amp != NULL) {
2892                 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) {
2893                         err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE,
2894                             seg, addr, rw, svd->cred);
2895                         if (err)
2896                                 goto out;
2897 
2898                         if (svd->type == MAP_SHARED) {
2899                                 /*
2900                                  * If this is a shared mapping to an
2901                                  * anon_map, then ignore the write
2902                                  * permissions returned by anon_getpage().
2903                                  * They apply to the private mappings
2904                                  * of this anon_map.
2905                                  */
2906                                 vpprot |= PROT_WRITE;
2907                         }
2908                         opp = anon_pl[0];
2909                 }
2910         }
2911 
2912         /*
2913          * Search the pl[] list passed in if it is from the
2914          * original object (i.e., not a private copy).
2915          */
2916         if (opp == NULL) {
2917                 /*
2918                  * Find original page.  We must be bringing it in
2919                  * from the list in pl[].
2920                  */
2921                 for (ppp = pl; (opp = *ppp) != NULL; ppp++) {
2922                         if (opp == PAGE_HANDLED)
2923                                 continue;
2924                         ASSERT(opp->p_vnode == svd->vp); /* XXX */
2925                         if (opp->p_offset == off)
2926                                 break;
2927                 }
2928                 if (opp == NULL) {
2929                         panic("segvn_faultpage not found");
2930                         /*NOTREACHED*/
2931                 }
2932                 *ppp = PAGE_HANDLED;
2933 
2934         }
2935 
2936         ASSERT(PAGE_LOCKED(opp));
2937 
2938         TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2939             "segvn_fault:pp %p vp %p offset %llx", opp, NULL, 0);
2940 
2941         /*
2942          * The fault is treated as a copy-on-write fault if a
2943          * write occurs on a private segment and the object
2944          * page (i.e., mapping) is write protected.  We assume
2945          * that fatal protection checks have already been made.
2946          */
2947 
2948         if (brkcow) {
2949                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
2950                 cow = !(vpprot & PROT_WRITE);
2951         } else if (svd->tr_state == SEGVN_TR_ON) {
2952                 /*
2953                  * If we are doing text replication, COW on first touch.
2954                  */
2955                 ASSERT(amp != NULL);
2956                 ASSERT(svd->vp != NULL);
2957                 ASSERT(rw != S_WRITE);
2958                 cow = (ap == NULL);
2959         } else {
2960                 cow = 0;
2961         }
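             /*
              * At this point cow != 0 either for a write to a write-protected
              * private mapping (brkcow) or for a first touch under text
              * replication, where a local copy is made even though the access
              * is a read.
              */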
2962 
2963         /*
2964          * If not a copy-on-write case load the translation
2965          * and return.
2966          */
2967         if (cow == 0) {
2968 
2969                 /*
2970                  * Handle pages that have been marked for migration
2971                  */
2972                 if (lgrp_optimizations())
2973                         page_migrate(seg, addr, &opp, 1);
2974 
2975                 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
2976                         if (rw == S_WRITE)
2977                                 hat_setmod(opp);
2978                         else if (rw != S_OTHER && !hat_ismod(opp))
2979                                 prot &= ~PROT_WRITE;
2980                 }
2981 
2982                 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
2983                     (!svd->pageprot && svd->prot == (prot & vpprot)));
2984                 ASSERT(amp == NULL ||
2985                     svd->rcookie == HAT_INVALID_REGION_COOKIE);
2986                 hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag,
2987                     svd->rcookie);
2988 
2989                 if (!(hat_flag & HAT_LOAD_LOCK))
2990                         page_unlock(opp);
2991 
2992                 if (anon_lock) {
2993                         anon_array_exit(&cookie);
2994                 }
2995                 return (0);
2996         }
2997 
2998         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2999 
3000         hat_setref(opp);
3001 
3002         ASSERT(amp != NULL && anon_lock);
3003 
3004         /*
3005          * Steal the page only if it isn't a private page
3006          * since stealing a private page is not worth the effort.
3007          */
3008         if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL)
3009                 steal = 1;
3010 
3011         /*
3012          * Steal the original page if the following conditions are true:
3013          *
3014          * We are low on memory, the page is not private, page is not large,
3015          * not shared, not modified, not `locked' or if we have it `locked'
3016          * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
3017          * that the page is not shared) and if it doesn't have any
3018          * translations. page_struct_lock isn't needed to look at p_cowcnt
3019          * and p_lckcnt because we first get exclusive lock on page.
3020          */
3021         (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
3022 
3023         if (stealcow && freemem < minfree && steal && opp->p_szc == 0 &&
3024             page_tryupgrade(opp) && !hat_ismod(opp) &&
3025             ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) ||
3026             (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 &&
3027             vpage != NULL && VPP_ISPPLOCK(vpage)))) {
3028                 /*
3029                  * Check if this page has other translations
3030                  * after unloading our translation.
3031                  */
3032                 if (hat_page_is_mapped(opp)) {
3033                         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
3034                         hat_unload(seg->s_as->a_hat, addr, PAGESIZE,
3035                             HAT_UNLOAD);
3036                 }
3037 
3038                 /*
3039                  * hat_unload() might sync back someone else's recent
3040                  * modification, so check again.
3041                  */
3042                 if (!hat_ismod(opp) && !hat_page_is_mapped(opp))
3043                         pageflags |= STEAL_PAGE;
3044         }
3045 
3046         /*
3047          * If we have a vpage pointer, see if it indicates that we have
3048          * ``locked'' the page we map -- if so, tell anon_private to
3049          * transfer the locking resource to the new page.
3050          *
3051          * See Statement at the beginning of segvn_lockop regarding
3052          * the way lockcnts/cowcnts are handled during COW.
3053          *
3054          */
3055         if (vpage != NULL && VPP_ISPPLOCK(vpage))
3056                 pageflags |= LOCK_PAGE;
3057 
3058         /*
3059          * Allocate a private page and perform the copy.
3060          * For MAP_NORESERVE reserve swap space now, unless this
3061          * is a cow fault on an existing anon page in which case
3062          * MAP_NORESERVE will have made advance reservations.
3063          */
3064         if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) {
3065                 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) {
3066                         atomic_add_long(&svd->swresv, ptob(1));
3067                         atomic_add_long(&seg->s_as->a_resvsize, ptob(1));
3068                 } else {
3069                         page_unlock(opp);
3070                         err = ENOMEM;
3071                         goto out;
3072                 }
3073         }
3074         oldap = ap;
3075         pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred);
3076         if (pp == NULL) {
3077                 err = ENOMEM;   /* out of swap space */
3078                 goto out;
3079         }
3080 
3081         /*
3082          * If we copied away from an anonymous page, then
3083          * we are one step closer to freeing up an anon slot.
3084          *
3085          * NOTE:  The original anon slot must be released while
3086          * holding the "anon_map" lock.  This is necessary to prevent
3087          * other threads from obtaining a pointer to the anon slot
3088          * which may be freed if its "refcnt" is 1.
3089          */
3090         if (oldap != NULL)
3091                 anon_decref(oldap);
3092 
3093         (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3094 
3095         /*
3096          * Handle pages that have been marked for migration
3097          */
3098         if (lgrp_optimizations())
3099                 page_migrate(seg, addr, &pp, 1);
3100 
3101         ASSERT(pp->p_szc == 0);
3102 
3103         ASSERT(!IS_VMODSORT(pp->p_vnode));
3104         if (enable_mbit_wa) {
3105                 if (rw == S_WRITE)
3106                         hat_setmod(pp);
3107                 else if (!hat_ismod(pp))
3108                         prot &= ~PROT_WRITE;
3109         }
3110 
3111         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
3112         hat_memload(hat, addr, pp, prot, hat_flag);
3113 
3114         if (!(hat_flag & HAT_LOAD_LOCK))
3115                 page_unlock(pp);
3116 
3117         ASSERT(anon_lock);
3118         anon_array_exit(&cookie);
3119         return (0);
3120 out:
3121         if (anon_lock)
3122                 anon_array_exit(&cookie);
3123 
3124         if (type == F_SOFTLOCK) {
3125                 atomic_dec_ulong((ulong_t *)&svd->softlockcnt);
3126         }
3127         return (FC_MAKE_ERR(err));
3128 }
3129 
3130 /*
3131  * Relocate a bunch of smaller targ pages into one large repl page. All targ
3132  * pages must be complete pages smaller than the replacement page.
3133  * It's assumed that no page's szc can change since they are all PAGESIZE or
3134  * complete large pages locked SHARED.
3135  */
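     /*
      * For illustration: if the replacement is a group of, say, 8 constituent
      * pages and targ[] holds 8 PAGESIZE pages, the loop below consumes one
      * replacement constituent per target page; if a targ[] entry is instead
      * the root of a complete smaller large page, a whole group of replacement
      * constituents is consumed for it at once.
      */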
3136 static void
3137 segvn_relocate_pages(page_t **targ, page_t *replacement)
3138 {
3139         page_t *pp;
3140         pgcnt_t repl_npgs, curnpgs;
3141         pgcnt_t i;
3142         uint_t repl_szc = replacement->p_szc;
3143         page_t *first_repl = replacement;
3144         page_t *repl;
3145         spgcnt_t npgs;
3146 
3147         VM_STAT_ADD(segvnvmstats.relocatepages[0]);
3148 
3149         ASSERT(repl_szc != 0);
3150         npgs = repl_npgs = page_get_pagecnt(repl_szc);
3151 
3152         i = 0;
3153         while (repl_npgs) {
3154                 spgcnt_t nreloc;
3155                 int err;
3156                 ASSERT(replacement != NULL);
3157                 pp = targ[i];
3158                 ASSERT(pp->p_szc < repl_szc);
3159                 ASSERT(PAGE_EXCL(pp));
3160                 ASSERT(!PP_ISFREE(pp));
3161                 curnpgs = page_get_pagecnt(pp->p_szc);
3162                 if (curnpgs == 1) {
3163                         VM_STAT_ADD(segvnvmstats.relocatepages[1]);
3164                         repl = replacement;
3165                         page_sub(&replacement, repl);
3166                         ASSERT(PAGE_EXCL(repl));
3167                         ASSERT(!PP_ISFREE(repl));
3168                         ASSERT(repl->p_szc == repl_szc);
3169                 } else {
3170                         page_t *repl_savepp;
3171                         int j;
3172                         VM_STAT_ADD(segvnvmstats.relocatepages[2]);
3173                         repl_savepp = replacement;
3174                         for (j = 0; j < curnpgs; j++) {
3175                                 repl = replacement;
3176                                 page_sub(&replacement, repl);
3177                                 ASSERT(PAGE_EXCL(repl));
3178                                 ASSERT(!PP_ISFREE(repl));
3179                                 ASSERT(repl->p_szc == repl_szc);
3180                                 ASSERT(page_pptonum(targ[i + j]) ==
3181                                     page_pptonum(targ[i]) + j);
3182                         }
3183                         repl = repl_savepp;
3184                         ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs));
3185                 }
3186                 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL);
3187                 if (err || nreloc != curnpgs) {
3188                         panic("segvn_relocate_pages: "
3189                             "page_relocate failed err=%d curnpgs=%ld "
3190                             "nreloc=%ld", err, curnpgs, nreloc);
3191                 }
3192                 ASSERT(curnpgs <= repl_npgs);
3193                 repl_npgs -= curnpgs;
3194                 i += curnpgs;
3195         }
3196         ASSERT(replacement == NULL);
3197 
3198         repl = first_repl;
3199         repl_npgs = npgs;
3200         for (i = 0; i < repl_npgs; i++) {
3201                 ASSERT(PAGE_EXCL(repl));
3202                 ASSERT(!PP_ISFREE(repl));
3203                 targ[i] = repl;
3204                 page_downgrade(targ[i]);
3205                 repl++;
3206         }
3207 }
3208 
3209 /*
3210  * Check if all pages in the ppa array are complete pages smaller than szc
3211  * pages and whether their roots will still be aligned relative to their
3212  * current size if the entire ppa array is relocated into one szc page.  If
3213  * these conditions are not met, return 0.
3214  *
3215  * If all pages are properly aligned, attempt to upgrade their locks
3216  * to exclusive mode.  If that fails, set *upgrdfail to 1 and return 0.
3217  * The caller has already initialized *upgrdfail to 0.
3218  *
3219  * Return 1 if all pages are aligned and locked exclusively.
3220  *
3221  * If all the pages in ppa happen to be physically contiguous, making one szc
3222  * page, and all exclusive locks are successfully obtained, promote the page
3223  * size to szc and set *pszc to szc.  Return 1 with the pages locked shared.
3224  */
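     /*
      * For illustration, suppose page_get_pagecnt(szc) is 8: a ppa[] entry
      * that is the root of a 2-page sub-group must then sit at an index and a
      * pfn that are both multiples of 2, otherwise relocating the array into
      * one 8-page frame would leave that sub-group misaligned.
      */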
3225 static int
3226 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
3227 {
3228         page_t *pp;
3229         pfn_t pfn;
3230         pgcnt_t totnpgs = page_get_pagecnt(szc);
3231         pfn_t first_pfn;
3232         int contig = 1;
3233         pgcnt_t i;
3234         pgcnt_t j;
3235         uint_t curszc;
3236         pgcnt_t curnpgs;
3237         int root = 0;
3238 
3239         ASSERT(szc > 0);
3240 
3241         VM_STAT_ADD(segvnvmstats.fullszcpages[0]);
3242 
3243         for (i = 0; i < totnpgs; i++) {
3244                 pp = ppa[i];
3245                 ASSERT(PAGE_SHARED(pp));
3246                 ASSERT(!PP_ISFREE(pp));
3247                 pfn = page_pptonum(pp);
3248                 if (i == 0) {
3249                         if (!IS_P2ALIGNED(pfn, totnpgs)) {
3250                                 contig = 0;
3251                         } else {
3252                                 first_pfn = pfn;
3253                         }
3254                 } else if (contig && pfn != first_pfn + i) {
3255                         contig = 0;
3256                 }
3257                 if (pp->p_szc == 0) {
3258                         if (root) {
3259                                 VM_STAT_ADD(segvnvmstats.fullszcpages[1]);
3260                                 return (0);
3261                         }
3262                 } else if (!root) {
3263                         if ((curszc = pp->p_szc) >= szc) {
3264                                 VM_STAT_ADD(segvnvmstats.fullszcpages[2]);
3265                                 return (0);
3266                         }
3267                         if (curszc == 0) {
3268                                 /*
3269                                  * p_szc changed means we don't have all pages
3270                                  * locked. return failure.
3271                                  */
3272                                 VM_STAT_ADD(segvnvmstats.fullszcpages[3]);
3273                                 return (0);
3274                         }
3275                         curnpgs = page_get_pagecnt(curszc);
3276                         if (!IS_P2ALIGNED(pfn, curnpgs) ||
3277                             !IS_P2ALIGNED(i, curnpgs)) {
3278                                 VM_STAT_ADD(segvnvmstats.fullszcpages[4]);
3279                                 return (0);
3280                         }
3281                         root = 1;
3282                 } else {
3283                         ASSERT(i > 0);
3284                         VM_STAT_ADD(segvnvmstats.fullszcpages[5]);
3285                         if (pp->p_szc != curszc) {
3286                                 VM_STAT_ADD(segvnvmstats.fullszcpages[6]);
3287                                 return (0);
3288                         }
3289                         if (pfn - 1 != page_pptonum(ppa[i - 1])) {
3290                                 panic("segvn_full_szcpages: "
3291                                     "large page not physically contiguous");
3292                         }
3293                         if (P2PHASE(pfn, curnpgs) == curnpgs - 1) {
3294                                 root = 0;
3295                         }
3296                 }
3297         }
3298 
3299         for (i = 0; i < totnpgs; i++) {
3300                 ASSERT(ppa[i]->p_szc < szc);
3301                 if (!page_tryupgrade(ppa[i])) {
3302                         for (j = 0; j < i; j++) {
3303                                 page_downgrade(ppa[j]);
3304                         }
3305                         *pszc = ppa[i]->p_szc;
3306                         *upgrdfail = 1;
3307                         VM_STAT_ADD(segvnvmstats.fullszcpages[7]);
3308                         return (0);
3309                 }
3310         }
3311 
3312         /*
3313          * When a page is put on a free cachelist its szc is set to 0.  If the
3314          * file system reclaimed pages from the cachelist, the targ pages will
3315          * be physically contiguous with 0 p_szc.  In this case just upgrade
3316          * the szc of the targ pages without any relocations.
3317          * To avoid any hat issues with previous small mappings
3318          * hat_pageunload() the target pages first.
3319          */
3320         if (contig) {
3321                 VM_STAT_ADD(segvnvmstats.fullszcpages[8]);
3322                 for (i = 0; i < totnpgs; i++) {
3323                         (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD);
3324                 }
3325                 for (i = 0; i < totnpgs; i++) {
3326                         ppa[i]->p_szc = szc;
3327                 }
3328                 for (i = 0; i < totnpgs; i++) {
3329                         ASSERT(PAGE_EXCL(ppa[i]));
3330                         page_downgrade(ppa[i]);
3331                 }
3332                 if (pszc != NULL) {
3333                         *pszc = szc;
3334                 }
3335         }
3336         VM_STAT_ADD(segvnvmstats.fullszcpages[9]);
3337         return (1);
3338 }
3339 
3340 /*
3341  * Create physically contiguous pages for the [vp, off] - [vp, off +
3342  * page_size(szc)) range and, for a private segment, return them in the ppa
3343  * array.  Pages are created either via IO or relocations.
3344  *
3345  * Return 1 on success and 0 on failure.
3346  *
3347  * If physically contiguous pages already exist for this range, return 1
3348  * without filling the ppa array.  The caller initializes ppa[0] to NULL to
3349  * detect this case, and then fills the ppa array via VOP_GETPAGE() itself.
3350  */
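     /*
      * The routine below juggles several page lists: io_pplist collects
      * replacement pages that must be read in via VOP_PAGEIO(), done_pplist
      * holds pages whose IO has already completed, and targ_pplist/repl_pplist
      * pair existing misplaced pages with the replacement pages they will be
      * relocated into.
      */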
3351 
3352 static int
3353 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off,
3354     uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc,
3355     int *downsize)
3356 
3357 {
3358         page_t *pplist = *ppplist;
3359         size_t pgsz = page_get_pagesize(szc);
3360         pgcnt_t pages = btop(pgsz);
3361         ulong_t start_off = off;
3362         u_offset_t eoff = off + pgsz;
3363         spgcnt_t nreloc;
3364         u_offset_t io_off = off;
3365         size_t io_len;
3366         page_t *io_pplist = NULL;
3367         page_t *done_pplist = NULL;
3368         pgcnt_t pgidx = 0;
3369         page_t *pp;
3370         page_t *newpp;
3371         page_t *targpp;
3372         int io_err = 0;
3373         int i;
3374         pfn_t pfn;
3375         ulong_t ppages;
3376         page_t *targ_pplist = NULL;
3377         page_t *repl_pplist = NULL;
3378         page_t *tmp_pplist;
3379         int nios = 0;
3380         uint_t pszc;
3381         struct vattr va;
3382 
3383         VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]);
3384 
3385         ASSERT(szc != 0);
3386         ASSERT(pplist->p_szc == szc);
3387 
3388         /*
3389          * downsize will be set to 1 only if we fail to lock pages.  This will
3390          * allow subsequent faults to try to relocate the page again.  If we
3391          * fail due to misalignment, don't downsize; let the caller map the
3392          * whole region with small mappings to avoid more faults into the area
3393          * where we can't get large pages anyway.
3394          */
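             /*
              * For example, *downsize gets set further down when an existing
              * target page (or its whole large-page group) can't be locked
              * exclusively, and also when the file turns out to be too short
              * to cover the IO range; the caller then retries with a smaller
              * page size.
              */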
3395         *downsize = 0;
3396 
3397         while (off < eoff) {
3398                 newpp = pplist;
3399                 ASSERT(newpp != NULL);
3400                 ASSERT(PAGE_EXCL(newpp));
3401                 ASSERT(!PP_ISFREE(newpp));
3402                 /*
3403                  * we pass NULL for nrelocp to page_lookup_create()
3404                  * so that it doesn't relocate. We relocate here
3405                  * later only after we make sure we can lock all
3406                  * pages in the range we handle and they are all
3407                  * aligned.
3408                  */
3409                 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0);
3410                 ASSERT(pp != NULL);
3411                 ASSERT(!PP_ISFREE(pp));
3412                 ASSERT(pp->p_vnode == vp);
3413                 ASSERT(pp->p_offset == off);
3414                 if (pp == newpp) {
3415                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]);
3416                         page_sub(&pplist, pp);
3417                         ASSERT(PAGE_EXCL(pp));
3418                         ASSERT(page_iolock_assert(pp));
3419                         page_list_concat(&io_pplist, &pp);
3420                         off += PAGESIZE;
3421                         continue;
3422                 }
3423                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]);
3424                 pfn = page_pptonum(pp);
3425                 pszc = pp->p_szc;
3426                 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL &&
3427                     IS_P2ALIGNED(pfn, pages)) {
3428                         ASSERT(repl_pplist == NULL);
3429                         ASSERT(done_pplist == NULL);
3430                         ASSERT(pplist == *ppplist);
3431                         page_unlock(pp);
3432                         page_free_replacement_page(pplist);
3433                         page_create_putback(pages);
3434                         *ppplist = NULL;
3435                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]);
3436                         return (1);
3437                 }
3438                 if (pszc >= szc) {
3439                         page_unlock(pp);
3440                         segvn_faultvnmpss_align_err1++;
3441                         goto out;
3442                 }
3443                 ppages = page_get_pagecnt(pszc);
3444                 if (!IS_P2ALIGNED(pfn, ppages)) {
3445                         ASSERT(pszc > 0);
3446                         /*
3447                          * sizing down to pszc won't help.
3448                          */
3449                         page_unlock(pp);
3450                         segvn_faultvnmpss_align_err2++;
3451                         goto out;
3452                 }
3453                 pfn = page_pptonum(newpp);
3454                 if (!IS_P2ALIGNED(pfn, ppages)) {
3455                         ASSERT(pszc > 0);
3456                         /*
3457                          * sizing down to pszc won't help.
3458                          */
3459                         page_unlock(pp);
3460                         segvn_faultvnmpss_align_err3++;
3461                         goto out;
3462                 }
3463                 if (!PAGE_EXCL(pp)) {
3464                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]);
3465                         page_unlock(pp);
3466                         *downsize = 1;
3467                         *ret_pszc = pp->p_szc;
3468                         goto out;
3469                 }
3470                 targpp = pp;
3471                 if (io_pplist != NULL) {
3472                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]);
3473                         io_len = off - io_off;
3474                         /*
3475                          * Some file systems like NFS don't check EOF
3476                          * conditions in VOP_PAGEIO(). Check it here
3477                          * now that pages are locked SE_EXCL. Any file
3478                          * truncation will wait until the pages are
3479                          * unlocked so no need to worry that file will
3480                          * be truncated after we check its size here.
3481                          * XXX fix NFS to remove this check.
3482                          */
3483                         va.va_mask = AT_SIZE;
3484                         if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL)) {
3485                                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]);
3486                                 page_unlock(targpp);
3487                                 goto out;
3488                         }
3489                         if (btopr(va.va_size) < btopr(io_off + io_len)) {
3490                                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]);
3491                                 *downsize = 1;
3492                                 *ret_pszc = 0;
3493                                 page_unlock(targpp);
3494                                 goto out;
3495                         }
3496                         io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
3497                             B_READ, svd->cred, NULL);
3498                         if (io_err) {
3499                                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]);
3500                                 page_unlock(targpp);
3501                                 if (io_err == EDEADLK) {
3502                                         segvn_vmpss_pageio_deadlk_err++;
3503                                 }
3504                                 goto out;
3505                         }
3506                         nios++;
3507                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]);
3508                         while (io_pplist != NULL) {
3509                                 pp = io_pplist;
3510                                 page_sub(&io_pplist, pp);
3511                                 ASSERT(page_iolock_assert(pp));
3512                                 page_io_unlock(pp);
3513                                 pgidx = (pp->p_offset - start_off) >>
3514                                     PAGESHIFT;
3515                                 ASSERT(pgidx < pages);
3516                                 ppa[pgidx] = pp;
3517                                 page_list_concat(&done_pplist, &pp);
3518                         }
3519                 }
3520                 pp = targpp;
3521                 ASSERT(PAGE_EXCL(pp));
3522                 ASSERT(pp->p_szc <= pszc);
3523                 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) {
3524                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]);
3525                         page_unlock(pp);
3526                         *downsize = 1;
3527                         *ret_pszc = pp->p_szc;
3528                         goto out;
3529                 }
3530                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]);
3531                 /*
3532                  * The page szc could have changed before the entire group was
3533                  * locked.  Reread the page szc.
3534                  */
3535                 pszc = pp->p_szc;
3536                 ppages = page_get_pagecnt(pszc);
3537 
3538                 /* link just the roots */
3539                 page_list_concat(&targ_pplist, &pp);
3540                 page_sub(&pplist, newpp);
3541                 page_list_concat(&repl_pplist, &newpp);
3542                 off += PAGESIZE;
3543                 while (--ppages != 0) {
3544                         newpp = pplist;
3545                         page_sub(&pplist, newpp);
3546                         off += PAGESIZE;
3547                 }
3548                 io_off = off;
3549         }
3550         if (io_pplist != NULL) {
3551                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]);
3552                 io_len = eoff - io_off;
3553                 va.va_mask = AT_SIZE;
3554                 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL) != 0) {
3555                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]);
3556                         goto out;
3557                 }
3558                 if (btopr(va.va_size) < btopr(io_off + io_len)) {
3559                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]);
3560                         *downsize = 1;
3561                         *ret_pszc = 0;
3562                         goto out;
3563                 }
3564                 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
3565                     B_READ, svd->cred, NULL);
3566                 if (io_err) {
3567                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]);
3568                         if (io_err == EDEADLK) {
3569                                 segvn_vmpss_pageio_deadlk_err++;
3570                         }
3571                         goto out;
3572                 }
3573                 nios++;
3574                 while (io_pplist != NULL) {
3575                         pp = io_pplist;
3576                         page_sub(&io_pplist, pp);
3577                         ASSERT(page_iolock_assert(pp));
3578                         page_io_unlock(pp);
3579                         pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
3580                         ASSERT(pgidx < pages);
3581                         ppa[pgidx] = pp;
3582                 }
3583         }
3584         /*
3585          * We're now bound to succeed or panic.
3586          * Remove the pages from done_pplist; it's not needed anymore.
3587          */
3588         while (done_pplist != NULL) {
3589                 pp = done_pplist;
3590                 page_sub(&done_pplist, pp);
3591         }
3592         VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]);
3593         ASSERT(pplist == NULL);
3594         *ppplist = NULL;
3595         while (targ_pplist != NULL) {
3596                 int ret;
3597                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]);
3598                 ASSERT(repl_pplist);
3599                 pp = targ_pplist;
3600                 page_sub(&targ_pplist, pp);
3601                 pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
3602                 newpp = repl_pplist;
3603                 page_sub(&repl_pplist, newpp);
3604 #ifdef DEBUG
3605                 pfn = page_pptonum(pp);
3606                 pszc = pp->p_szc;
3607                 ppages = page_get_pagecnt(pszc);
3608                 ASSERT(IS_P2ALIGNED(pfn, ppages));
3609                 pfn = page_pptonum(newpp);
3610                 ASSERT(IS_P2ALIGNED(pfn, ppages));
3611                 ASSERT(P2PHASE(pfn, pages) == pgidx);
3612 #endif
3613                 nreloc = 0;
3614                 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL);
3615                 if (ret != 0 || nreloc == 0) {
3616                         panic("segvn_fill_vp_pages: "
3617                             "page_relocate failed");
3618                 }
3619                 pp = newpp;
3620                 while (nreloc-- != 0) {
3621                         ASSERT(PAGE_EXCL(pp));
3622                         ASSERT(pp->p_vnode == vp);
3623                         ASSERT(pgidx ==
3624                             ((pp->p_offset - start_off) >> PAGESHIFT));
3625                         ppa[pgidx++] = pp;
3626                         pp++;
3627                 }
3628         }
3629 
3630         if (svd->type == MAP_PRIVATE) {
3631                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]);
3632                 for (i = 0; i < pages; i++) {
3633                         ASSERT(ppa[i] != NULL);
3634                         ASSERT(PAGE_EXCL(ppa[i]));
3635                         ASSERT(ppa[i]->p_vnode == vp);
3636                         ASSERT(ppa[i]->p_offset ==
3637                             start_off + (i << PAGESHIFT));
3638                         page_downgrade(ppa[i]);
3639                 }
3640                 ppa[pages] = NULL;
3641         } else {
3642                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]);
3643                 /*
3644                  * The caller will still call VOP_GETPAGE() for shared segments
3645                  * to check FS write permissions.  For private segments we map
3646                  * the file read-only anyway, so no VOP_GETPAGE() is needed.
3647                  */
3648                 for (i = 0; i < pages; i++) {
3649                         ASSERT(ppa[i] != NULL);
3650                         ASSERT(PAGE_EXCL(ppa[i]));
3651                         ASSERT(ppa[i]->p_vnode == vp);
3652                         ASSERT(ppa[i]->p_offset ==
3653                             start_off + (i << PAGESHIFT));
3654                         page_unlock(ppa[i]);
3655                 }
3656                 ppa[0] = NULL;
3657         }
3658 
3659         return (1);
3660 out:
3661         /*
3662          * Do the cleanup.  Unlock the target pages we didn't relocate; they
3663          * are linked on targ_pplist by their root pages.  Reassemble unused
3664          * replacement and io pages back onto pplist.
3665          */
3666         if (io_pplist != NULL) {
3667                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]);
3668                 pp = io_pplist;
3669                 do {
3670                         ASSERT(pp->p_vnode == vp);
3671                         ASSERT(pp->p_offset == io_off);
3672                         ASSERT(page_iolock_assert(pp));
3673                         page_io_unlock(pp);
3674                         page_hashout(pp, NULL);
3675                         io_off += PAGESIZE;
3676                 } while ((pp = pp->p_next) != io_pplist);
3677                 page_list_concat(&io_pplist, &pplist);
3678                 pplist = io_pplist;
3679         }
3680         tmp_pplist = NULL;
3681         while (targ_pplist != NULL) {
3682                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]);
3683                 pp = targ_pplist;
3684                 ASSERT(PAGE_EXCL(pp));
3685                 page_sub(&targ_pplist, pp);
3686 
3687                 pszc = pp->p_szc;
3688                 ppages = page_get_pagecnt(pszc);
3689                 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));
3690 
3691                 if (pszc != 0) {
3692                         group_page_unlock(pp);
3693                 }
3694                 page_unlock(pp);
3695 
3696                 pp = repl_pplist;
3697                 ASSERT(pp != NULL);
3698                 ASSERT(PAGE_EXCL(pp));
3699                 ASSERT(pp->p_szc == szc);
3700                 page_sub(&repl_pplist, pp);
3701 
3702                 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));
3703 
3704                 /* relink replacement page */
3705                 page_list_concat(&tmp_pplist, &pp);
3706                 while (--ppages != 0) {
3707                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]);
3708                         pp++;
3709                         ASSERT(PAGE_EXCL(pp));
3710                         ASSERT(pp->p_szc == szc);
3711                         page_list_concat(&tmp_pplist, &pp);
3712                 }
3713         }
3714         if (tmp_pplist != NULL) {
3715                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]);
3716                 page_list_concat(&tmp_pplist, &pplist);
3717                 pplist = tmp_pplist;
3718         }
3719         /*
3720          * At this point all pages are on either done_pplist or
3721          * pplist. They can't all be on done_pplist; otherwise
3722          * we would already have been done.
3723          */
3724         ASSERT(pplist != NULL);
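             /*
              * nios != 0 means part of the preallocated large page was already
              * consumed by i/o (the vp pages on done_pplist), so it can't be
              * used as a whole: reset p_szc on the pieces, free the unused
              * replacement pages and unlock the pages that were read in.
              */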
3725         if (nios != 0) {
3726                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]);
3727                 pp = pplist;
3728                 do {
3729                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]);
3730                         ASSERT(pp->p_szc == szc);
3731                         ASSERT(PAGE_EXCL(pp));
3732                         ASSERT(pp->p_vnode != vp);
3733                         pp->p_szc = 0;
3734                 } while ((pp = pp->p_next) != pplist);
3735 
3736                 pp = done_pplist;
3737                 do {
3738                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]);
3739                         ASSERT(pp->p_szc == szc);
3740                         ASSERT(PAGE_EXCL(pp));
3741                         ASSERT(pp->p_vnode == vp);
3742                         pp->p_szc = 0;
3743                 } while ((pp = pp->p_next) != done_pplist);
3744 
3745                 while (pplist != NULL) {
3746                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]);
3747                         pp = pplist;
3748                         page_sub(&pplist, pp);
3749                         page_free(pp, 0);
3750                 }
3751 
3752                 while (done_pplist != NULL) {
3753                         VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]);
3754                         pp = done_pplist;
3755                         page_sub(&done_pplist, pp);
3756                         page_unlock(pp);
3757                 }
3758                 *ppplist = NULL;
3759                 return (0);
3760         }
3761         ASSERT(pplist == *ppplist);
3762         if (io_err) {
3763                 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]);
3764                 /*
3765                  * Don't downsize on an io error; see if
3766                  * VOP_GETPAGE() succeeds instead.
3767                  * pplist may still be used in this case
3768                  * for relocations.
3769                  */
3770                 return (0);
3771         }
3772         VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]);
3773         page_free_replacement_page(pplist);
3774         page_create_putback(pages);
3775         *ppplist = NULL;
3776         return (0);
3777 }
3778 
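     /*
      * When segvn_anypgsz is non-zero the anonymous large page fault path may
      * step through intermediate page sizes when sizing up or down instead of
      * jumping directly to 0 or seg->s_szc (see segvn_fault_anonpages()).
      */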
3779 int segvn_anypgsz = 0;
3780 
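     /*
      * Back out the softlockcnt bump taken for an F_SOFTLOCK fault when the
      * large page fault path has to give up on the current set of pages.
      */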
3781 #define SEGVN_RESTORE_SOFTLOCK_VP(type, pages)                          \
3782                 if ((type) == F_SOFTLOCK) {                             \
3783                         atomic_add_long((ulong_t *)&(svd)->softlockcnt, \
3784                             -(pages));                                  \
3785                 }
3786 
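     /*
      * Keep mod bit tracking consistent for VMODSORT vnodes: on a write fault
      * mark every constituent page modified; otherwise, if write permission
      * would be granted, strip PROT_WRITE unless all pages are already
      * modified so that the first real write still faults and is recorded.
      */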
3787 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot)              \
3788                 if (IS_VMODSORT((ppa)[0]->p_vnode)) {                        \
3789                         if ((rw) == S_WRITE) {                          \
3790                                 for (i = 0; i < (pages); i++) {              \
3791                                         ASSERT((ppa)[i]->p_vnode ==  \
3792                                             (ppa)[0]->p_vnode);              \
3793                                         hat_setmod((ppa)[i]);           \
3794                                 }                                       \
3795                         } else if ((rw) != S_OTHER &&                   \
3796                             ((prot) & (vpprot) & PROT_WRITE)) {         \
3797                                 for (i = 0; i < (pages); i++) {              \
3798                                         ASSERT((ppa)[i]->p_vnode ==  \
3799                                             (ppa)[0]->p_vnode);              \
3800                                         if (!hat_ismod((ppa)[i])) {     \
3801                                                 prot &= ~PROT_WRITE;        \
3802                                                 break;                  \
3803                                         }                               \
3804                                 }                                       \
3805                         }                                               \
3806                 }
3807 
3808 #ifdef  VM_STATS
3809 
3810 #define SEGVN_VMSTAT_FLTVNPAGES(idx)                                    \
3811                 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]);
3812 
3813 #else /* VM_STATS */
3814 
3815 #define SEGVN_VMSTAT_FLTVNPAGES(idx)
3816 
3817 #endif
3818 
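     /*
      * Attempt to fault in the range [lpgaddr, lpgeaddr) of a vnode backed
      * segment using large pages.  On allocation/relocation failures or page
      * size conflicts the main loop retries with a smaller (ierr == -1) or
      * larger (ierr == -2) page size.  Returns 0 on success, a fault code on
      * failure, or IE_RETRY after demoting the segment when a large page
      * would extend past the end of the file for a cow/softlock fault.
      */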
3819 static faultcode_t
3820 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
3821     caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
3822     caddr_t eaddr, int brkcow)
3823 {
3824         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
3825         struct anon_map *amp = svd->amp;
3826         uchar_t segtype = svd->type;
3827         uint_t szc = seg->s_szc;
3828         size_t pgsz = page_get_pagesize(szc);
3829         size_t maxpgsz = pgsz;
3830         pgcnt_t pages = btop(pgsz);
3831         pgcnt_t maxpages = pages;
3832         size_t ppasize = (pages + 1) * sizeof (page_t *);
3833         caddr_t a = lpgaddr;
3834         caddr_t maxlpgeaddr = lpgeaddr;
3835         u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base);
3836         ulong_t aindx = svd->anon_index + seg_page(seg, a);
3837         struct vpage *vpage = (svd->vpage != NULL) ?
3838             &svd->vpage[seg_page(seg, a)] : NULL;
3839         vnode_t *vp = svd->vp;
3840         page_t **ppa;
3841         uint_t  pszc;
3842         size_t  ppgsz;
3843         pgcnt_t ppages;
3844         faultcode_t err = 0;
3845         int ierr;
3846         int vop_size_err = 0;
3847         uint_t protchk, prot, vpprot;
3848         ulong_t i;
3849         int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
3850         anon_sync_obj_t an_cookie;
3851         enum seg_rw arw;
3852         int alloc_failed = 0;
3853         int adjszc_chk;
3854         struct vattr va;
3855         page_t *pplist;
3856         pfn_t pfn;
3857         int physcontig;
3858         int upgrdfail;
3859         int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */
3860         int tron = (svd->tr_state == SEGVN_TR_ON);
3861 
3862         ASSERT(szc != 0);
3863         ASSERT(vp != NULL);
3864         ASSERT(brkcow == 0 || amp != NULL);
3865         ASSERT(tron == 0 || amp != NULL);
3866         ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
3867         ASSERT(!(svd->flags & MAP_NORESERVE));
3868         ASSERT(type != F_SOFTUNLOCK);
3869         ASSERT(IS_P2ALIGNED(a, maxpgsz));
3870         ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages));
3871         ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
3872         ASSERT(seg->s_szc < NBBY * sizeof (int));
3873         ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz);
3874         ASSERT(svd->tr_state != SEGVN_TR_INIT);
3875 
3876         VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]);
3877         VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]);
3878 
3879         if (svd->flags & MAP_TEXT) {
3880                 hat_flag |= HAT_LOAD_TEXT;
3881         }
3882 
3883         if (svd->pageprot) {
3884                 switch (rw) {
3885                 case S_READ:
3886                         protchk = PROT_READ;
3887                         break;
3888                 case S_WRITE:
3889                         protchk = PROT_WRITE;
3890                         break;
3891                 case S_EXEC:
3892                         protchk = PROT_EXEC;
3893                         break;
3894                 case S_OTHER:
3895                 default:
3896                         protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
3897                         break;
3898                 }
3899         } else {
3900                 prot = svd->prot;
3901                 /* caller has already done segment level protection check. */
3902         }
3903 
3904         if (rw == S_WRITE && segtype == MAP_PRIVATE) {
3905                 SEGVN_VMSTAT_FLTVNPAGES(2);
3906                 arw = S_READ;
3907         } else {
3908                 arw = rw;
3909         }
3910 
3911         ppa = kmem_alloc(ppasize, KM_SLEEP);
3912 
3913         VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]);
3914 
3915         for (;;) {
3916                 adjszc_chk = 0;
3917                 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) {
3918                         if (adjszc_chk) {
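                                     /*
                                      * Once the address is aligned for a
                                      * larger page size (and that size
                                      * hasn't already failed allocation),
                                      * size the mapping back up toward
                                      * seg->s_szc.
                                      */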
3919                                 while (szc < seg->s_szc) {
3920                                         uintptr_t e;
3921                                         uint_t tszc;
3922                                         tszc = segvn_anypgsz_vnode ? szc + 1 :
3923                                             seg->s_szc;
3924                                         ppgsz = page_get_pagesize(tszc);
3925                                         if (!IS_P2ALIGNED(a, ppgsz) ||
3926                                             ((alloc_failed >> tszc) & 0x1)) {
3927                                                 break;
3928                                         }
3929                                         SEGVN_VMSTAT_FLTVNPAGES(4);
3930                                         szc = tszc;
3931                                         pgsz = ppgsz;
3932                                         pages = btop(pgsz);
3933                                         e = P2ROUNDUP((uintptr_t)eaddr, pgsz);
3934                                         lpgeaddr = (caddr_t)e;
3935                                 }
3936                         }
3937 
3938                 again:
3939                         if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) {
3940                                 ASSERT(IS_P2ALIGNED(aindx, maxpages));
3941                                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3942                                 anon_array_enter(amp, aindx, &an_cookie);
3943                                 if (anon_get_ptr(amp->ahp, aindx) != NULL) {
3944                                         SEGVN_VMSTAT_FLTVNPAGES(5);
3945                                         ASSERT(anon_pages(amp->ahp, aindx,
3946                                             maxpages) == maxpages);
3947                                         anon_array_exit(&an_cookie);
3948                                         ANON_LOCK_EXIT(&amp->a_rwlock);
3949                                         err = segvn_fault_anonpages(hat, seg,
3950                                             a, a + maxpgsz, type, rw,
3951                                             MAX(a, addr),
3952                                             MIN(a + maxpgsz, eaddr), brkcow);
3953                                         if (err != 0) {
3954                                                 SEGVN_VMSTAT_FLTVNPAGES(6);
3955                                                 goto out;
3956                                         }
3957                                         if (szc < seg->s_szc) {
3958                                                 szc = seg->s_szc;
3959                                                 pgsz = maxpgsz;
3960                                                 pages = maxpages;
3961                                                 lpgeaddr = maxlpgeaddr;
3962                                         }
3963                                         goto next;
3964                                 } else {
3965                                         ASSERT(anon_pages(amp->ahp, aindx,
3966                                             maxpages) == 0);
3967                                         SEGVN_VMSTAT_FLTVNPAGES(7);
3968                                         anon_array_exit(&an_cookie);
3969                                         ANON_LOCK_EXIT(&amp->a_rwlock);
3970                                 }
3971                         }
3972                         ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz));
3973                         ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz));
3974 
3975                         if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
3976                                 ASSERT(vpage != NULL);
3977                                 prot = VPP_PROT(vpage);
3978                                 ASSERT(sameprot(seg, a, maxpgsz));
3979                                 if ((prot & protchk) == 0) {
3980                                         SEGVN_VMSTAT_FLTVNPAGES(8);
3981                                         err = FC_PROT;
3982                                         goto out;
3983                                 }
3984                         }
3985                         if (type == F_SOFTLOCK) {
3986                                 atomic_add_long((ulong_t *)&svd->softlockcnt,
3987                                     pages);
3988                         }
3989 
3990                         pplist = NULL;
3991                         physcontig = 0;
3992                         ppa[0] = NULL;
3993                         if (!brkcow && !tron && szc &&
3994                             !page_exists_physcontig(vp, off, szc,
3995                             segtype == MAP_PRIVATE ? ppa : NULL)) {
3996                                 SEGVN_VMSTAT_FLTVNPAGES(9);
3997                                 if (page_alloc_pages(vp, seg, a, &pplist, NULL,
3998                                     szc, 0, 0) && type != F_SOFTLOCK) {
3999                                         SEGVN_VMSTAT_FLTVNPAGES(10);
4000                                         pszc = 0;
4001                                         ierr = -1;
4002                                         alloc_failed |= (1 << szc);
4003                                         break;
4004                                 }
4005                                 if (pplist != NULL &&
4006                                     vp->v_mpssdata == SEGVN_PAGEIO) {
4007                                         int downsize;
4008                                         SEGVN_VMSTAT_FLTVNPAGES(11);
4009                                         physcontig = segvn_fill_vp_pages(svd,
4010                                             vp, off, szc, ppa, &pplist,
4011                                             &pszc, &downsize);
4012                                         ASSERT(!physcontig || pplist == NULL);
4013                                         if (!physcontig && downsize &&
4014                                             type != F_SOFTLOCK) {
4015                                                 ASSERT(pplist == NULL);
4016                                                 SEGVN_VMSTAT_FLTVNPAGES(12);
4017                                                 ierr = -1;
4018                                                 break;
4019                                         }
4020                                         ASSERT(!physcontig ||
4021                                             segtype == MAP_PRIVATE ||
4022                                             ppa[0] == NULL);
4023                                         if (physcontig && ppa[0] == NULL) {
4024                                                 physcontig = 0;
4025                                         }
4026                                 }
4027                         } else if (!brkcow && !tron && szc && ppa[0] != NULL) {
4028                                 SEGVN_VMSTAT_FLTVNPAGES(13);
4029                                 ASSERT(segtype == MAP_PRIVATE);
4030                                 physcontig = 1;
4031                         }
4032 
4033                         if (!physcontig) {
4034                                 SEGVN_VMSTAT_FLTVNPAGES(14);
4035                                 ppa[0] = NULL;
4036                                 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz,
4037                                     &vpprot, ppa, pgsz, seg, a, arw,
4038                                     svd->cred, NULL);
4039 #ifdef DEBUG
4040                                 if (ierr == 0) {
4041                                         for (i = 0; i < pages; i++) {
4042                                                 ASSERT(PAGE_LOCKED(ppa[i]));
4043                                                 ASSERT(!PP_ISFREE(ppa[i]));
4044                                                 ASSERT(ppa[i]->p_vnode == vp);
4045                                                 ASSERT(ppa[i]->p_offset ==
4046                                                     off + (i << PAGESHIFT));
4047                                         }
4048                                 }
4049 #endif /* DEBUG */
4050                                 if (segtype == MAP_PRIVATE) {
4051                                         SEGVN_VMSTAT_FLTVNPAGES(15);
4052                                         vpprot &= ~PROT_WRITE;
4053                                 }
4054                         } else {
4055                                 ASSERT(segtype == MAP_PRIVATE);
4056                                 SEGVN_VMSTAT_FLTVNPAGES(16);
4057                                 vpprot = PROT_ALL & ~PROT_WRITE;
4058                                 ierr = 0;
4059                         }
4060 
4061                         if (ierr != 0) {
4062                                 SEGVN_VMSTAT_FLTVNPAGES(17);
4063                                 if (pplist != NULL) {
4064                                         SEGVN_VMSTAT_FLTVNPAGES(18);
4065                                         page_free_replacement_page(pplist);
4066                                         page_create_putback(pages);
4067                                 }
4068                                 SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
4069                                 if (a + pgsz <= eaddr) {
4070                                         SEGVN_VMSTAT_FLTVNPAGES(19);
4071                                         err = FC_MAKE_ERR(ierr);
4072                                         goto out;
4073                                 }
4074                                 va.va_mask = AT_SIZE;
4075                                 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL)) {
4076                                         SEGVN_VMSTAT_FLTVNPAGES(20);
4077                                         err = FC_MAKE_ERR(EIO);
4078                                         goto out;
4079                                 }
4080                                 if (btopr(va.va_size) >= btopr(off + pgsz)) {
4081                                         SEGVN_VMSTAT_FLTVNPAGES(21);
4082                                         err = FC_MAKE_ERR(ierr);
4083                                         goto out;
4084                                 }
4085                                 if (btopr(va.va_size) <
4086                                     btopr(off + (eaddr - a))) {
4087                                         SEGVN_VMSTAT_FLTVNPAGES(22);
4088                                         err = FC_MAKE_ERR(ierr);
4089                                         goto out;
4090                                 }
4091                                 if (brkcow || tron || type == F_SOFTLOCK) {
4092                                         /* can't reduce map area */
4093                                         SEGVN_VMSTAT_FLTVNPAGES(23);
4094                                         vop_size_err = 1;
4095                                         goto out;
4096                                 }
4097                                 SEGVN_VMSTAT_FLTVNPAGES(24);
4098                                 ASSERT(szc != 0);
4099                                 pszc = 0;
4100                                 ierr = -1;
4101                                 break;
4102                         }
4103 
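                             /*
                              * Re-check the anon layer: another thread may
                              * have installed anon pages for this range while
                              * we weren't holding the anon array lock.  If so,
                              * drop the vnode pages and retry or size up.
                              */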
4104                         if (amp != NULL) {
4105                                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
4106                                 anon_array_enter(amp, aindx, &an_cookie);
4107                         }
4108                         if (amp != NULL &&
4109                             anon_get_ptr(amp->ahp, aindx) != NULL) {
4110                                 ulong_t taindx = P2ALIGN(aindx, maxpages);
4111 
4112                                 SEGVN_VMSTAT_FLTVNPAGES(25);
4113                                 ASSERT(anon_pages(amp->ahp, taindx,
4114                                     maxpages) == maxpages);
4115                                 for (i = 0; i < pages; i++) {
4116                                         page_unlock(ppa[i]);
4117                                 }
4118                                 anon_array_exit(&an_cookie);
4119                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4120                                 if (pplist != NULL) {
4121                                         page_free_replacement_page(pplist);
4122                                         page_create_putback(pages);
4123                                 }
4124                                 SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
4125                                 if (szc < seg->s_szc) {
4126                                         SEGVN_VMSTAT_FLTVNPAGES(26);
4127                                         /*
4128                                          * For private segments SOFTLOCK
4129                                          * either always breaks cow (any rw
4130                                          * type except S_READ_NOCOW) or the
4131                                          * address space is locked as writer
4132                                          * (S_READ_NOCOW case) and anon slots
4133                                          * can't show up on the second check.
4134                                          * Therefore if we are here for the
4135                                          * SOFTLOCK case it must be a cow
4136                                          * break, but a cow break never
4137                                          * reduces szc. Text replication
4138                                          * (tron) works as a cow break here.
4139                                          * Thus the assert below.
4140                                          */
4141                                         ASSERT(!brkcow && !tron &&
4142                                             type != F_SOFTLOCK);
4143                                         pszc = seg->s_szc;
4144                                         ierr = -2;
4145                                         break;
4146                                 }
4147                                 ASSERT(IS_P2ALIGNED(a, maxpgsz));
4148                                 goto again;
4149                         }
4150 #ifdef DEBUG
4151                         if (amp != NULL) {
4152                                 ulong_t taindx = P2ALIGN(aindx, maxpages);
4153                                 ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
4154                         }
4155 #endif /* DEBUG */
4156 
4157                         if (brkcow || tron) {
4158                                 ASSERT(amp != NULL);
4159                                 ASSERT(pplist == NULL);
4160                                 ASSERT(szc == seg->s_szc);
4161                                 ASSERT(IS_P2ALIGNED(a, maxpgsz));
4162                                 ASSERT(IS_P2ALIGNED(aindx, maxpages));
4163                                 SEGVN_VMSTAT_FLTVNPAGES(27);
4164                                 ierr = anon_map_privatepages(amp, aindx, szc,
4165                                     seg, a, prot, ppa, vpage, segvn_anypgsz,
4166                                     tron ? PG_LOCAL : 0, svd->cred);
4167                                 if (ierr != 0) {
4168                                         SEGVN_VMSTAT_FLTVNPAGES(28);
4169                                         anon_array_exit(&an_cookie);
4170                                         ANON_LOCK_EXIT(&amp->a_rwlock);
4171                                         SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
4172                                         err = FC_MAKE_ERR(ierr);
4173                                         goto out;
4174                                 }
4175 
4176                                 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
4177                                 /*
4178                                  * p_szc can't be changed for locked
4179                                  * swapfs pages.
4180                                  */
4181                                 ASSERT(svd->rcookie ==
4182                                     HAT_INVALID_REGION_COOKIE);
4183                                 hat_memload_array(hat, a, pgsz, ppa, prot,
4184                                     hat_flag);
4185 
4186                                 if (!(hat_flag & HAT_LOAD_LOCK)) {
4187                                         SEGVN_VMSTAT_FLTVNPAGES(29);
4188                                         for (i = 0; i < pages; i++) {
4189                                                 page_unlock(ppa[i]);
4190                                         }
4191                                 }
4192                                 anon_array_exit(&an_cookie);
4193                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4194                                 goto next;
4195                         }
4196 
4197                         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
4198                             (!svd->pageprot && svd->prot == (prot & vpprot)));
4199 
4200                         pfn = page_pptonum(ppa[0]);
4201                         /*
4202                          * hat_page_demote() needs an SE_EXCL lock on one of
4203                          * the constituent page_t's and it decreases the
4204                          * root's p_szc last. This means that if the root's
4205                          * p_szc equals szc and all its constituent pages are
4206                          * locked, any hat_page_demote() that could have
4207                          * changed p_szc to szc is already done and no new
4208                          * hat_page_demote() can start for this large page.
4209                          */
4210 
4211                         /*
4212                          * We need to make sure the same mapping size is
4213                          * used for the same address range if there's a
4214                          * possibility the address is already mapped, because
4215                          * the hat layer panics when a translation is loaded
4216                          * for a range already mapped with a different page
4217                          * size.  We achieve this by always using the largest
4218                          * page size possible, subject to the constraints of
4219                          * page size, segment page size and page alignment.
4220                          * Since mappings are invalidated when those
4221                          * constraints change, making the old mapping size
4222                          * unusable, no mapping size conflicts should happen.
4223                          */
4224 
4225                 chkszc:
4226                         if ((pszc = ppa[0]->p_szc) == szc &&
4227                             IS_P2ALIGNED(pfn, pages)) {
4228 
4229                                 SEGVN_VMSTAT_FLTVNPAGES(30);
4230 #ifdef DEBUG
4231                                 for (i = 0; i < pages; i++) {
4232                                         ASSERT(PAGE_LOCKED(ppa[i]));
4233                                         ASSERT(!PP_ISFREE(ppa[i]));
4234                                         ASSERT(page_pptonum(ppa[i]) ==
4235                                             pfn + i);
4236                                         ASSERT(ppa[i]->p_szc == szc);
4237                                         ASSERT(ppa[i]->p_vnode == vp);
4238                                         ASSERT(ppa[i]->p_offset ==
4239                                             off + (i << PAGESHIFT));
4240                                 }
4241 #endif /* DEBUG */
4242                                 /*
4243                                  * All pages are of the szc we need and they
4244                                  * are all locked, so they can't change szc.
4245                                  * Load the translations.
4246                                  *
4247                                  * If the page got promoted since the last
4248                                  * check we don't need pplist.
4249                                  */
4250                                 if (pplist != NULL) {
4251                                         page_free_replacement_page(pplist);
4252                                         page_create_putback(pages);
4253                                 }
4254                                 if (PP_ISMIGRATE(ppa[0])) {
4255                                         page_migrate(seg, a, ppa, pages);
4256                                 }
4257                                 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4258                                     prot, vpprot);
4259                                 hat_memload_array_region(hat, a, pgsz,
4260                                     ppa, prot & vpprot, hat_flag,
4261                                     svd->rcookie);
4262 
4263                                 if (!(hat_flag & HAT_LOAD_LOCK)) {
4264                                         for (i = 0; i < pages; i++) {
4265                                                 page_unlock(ppa[i]);
4266                                         }
4267                                 }
4268                                 if (amp != NULL) {
4269                                         anon_array_exit(&an_cookie);
4270                                         ANON_LOCK_EXIT(&amp->a_rwlock);
4271                                 }
4272                                 goto next;
4273                         }
4274 
4275                         /*
4276                          * See if upsize is possible.
4277                          */
4278                         if (pszc > szc && szc < seg->s_szc &&
4279                             (segvn_anypgsz_vnode || pszc >= seg->s_szc)) {
4280                                 pgcnt_t aphase;
4281                                 uint_t pszc1 = MIN(pszc, seg->s_szc);
4282                                 ppgsz = page_get_pagesize(pszc1);
4283                                 ppages = btop(ppgsz);
4284                                 aphase = btop(P2PHASE((uintptr_t)a, ppgsz));
4285 
4286                                 ASSERT(type != F_SOFTLOCK);
4287 
4288                                 SEGVN_VMSTAT_FLTVNPAGES(31);
4289                                 if (aphase != P2PHASE(pfn, ppages)) {
4290                                         segvn_faultvnmpss_align_err4++;
4291                                 } else {
4292                                         SEGVN_VMSTAT_FLTVNPAGES(32);
4293                                         if (pplist != NULL) {
4294                                                 page_t *pl = pplist;
4295                                                 page_free_replacement_page(pl);
4296                                                 page_create_putback(pages);
4297                                         }
4298                                         for (i = 0; i < pages; i++) {
4299                                                 page_unlock(ppa[i]);
4300                                         }
4301                                         if (amp != NULL) {
4302                                                 anon_array_exit(&an_cookie);
4303                                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4304                                         }
4305                                         pszc = pszc1;
4306                                         ierr = -2;
4307                                         break;
4308                                 }
4309                         }
4310 
4311                         /*
4312                          * Check whether we should use the smallest mapping size.
4313                          */
4314                         upgrdfail = 0;
4315                         if (szc == 0 ||
4316                             (pszc >= szc &&
4317                             !IS_P2ALIGNED(pfn, pages)) ||
4318                             (pszc < szc &&
4319                             !segvn_full_szcpages(ppa, szc, &upgrdfail,
4320                             &pszc))) {
4321 
4322                                 if (upgrdfail && type != F_SOFTLOCK) {
4323                                         /*
4324                                          * segvn_full_szcpages failed to lock
4325                                          * all pages EXCL. Size down.
4326                                          */
4327                                         ASSERT(pszc < szc);
4328 
4329                                         SEGVN_VMSTAT_FLTVNPAGES(33);
4330 
4331                                         if (pplist != NULL) {
4332                                                 page_t *pl = pplist;
4333                                                 page_free_replacement_page(pl);
4334                                                 page_create_putback(pages);
4335                                         }
4336 
4337                                         for (i = 0; i < pages; i++) {
4338                                                 page_unlock(ppa[i]);
4339                                         }
4340                                         if (amp != NULL) {
4341                                                 anon_array_exit(&an_cookie);
4342                                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4343                                         }
4344                                         ierr = -1;
4345                                         break;
4346                                 }
4347                                 if (szc != 0 && !upgrdfail) {
4348                                         segvn_faultvnmpss_align_err5++;
4349                                 }
4350                                 SEGVN_VMSTAT_FLTVNPAGES(34);
4351                                 if (pplist != NULL) {
4352                                         page_free_replacement_page(pplist);
4353                                         page_create_putback(pages);
4354                                 }
4355                                 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4356                                     prot, vpprot);
4357                                 if (upgrdfail && segvn_anypgsz_vnode) {
4358                                         /* SOFTLOCK case */
4359                                         hat_memload_array_region(hat, a, pgsz,
4360                                             ppa, prot & vpprot, hat_flag,
4361                                             svd->rcookie);
4362                                 } else {
4363                                         for (i = 0; i < pages; i++) {
4364                                                 hat_memload_region(hat,
4365                                                     a + (i << PAGESHIFT),
4366                                                     ppa[i], prot & vpprot,
4367                                                     hat_flag, svd->rcookie);
4368                                         }
4369                                 }
4370                                 if (!(hat_flag & HAT_LOAD_LOCK)) {
4371                                         for (i = 0; i < pages; i++) {
4372                                                 page_unlock(ppa[i]);
4373                                         }
4374                                 }
4375                                 if (amp != NULL) {
4376                                         anon_array_exit(&an_cookie);
4377                                         ANON_LOCK_EXIT(&amp->a_rwlock);
4378                                 }
4379                                 goto next;
4380                         }
4381 
4382                         if (pszc == szc) {
4383                                 /*
4384                                  * segvn_full_szcpages() upgraded the pages' szc.
4385                                  */
4386                                 ASSERT(pszc == ppa[0]->p_szc);
4387                                 ASSERT(IS_P2ALIGNED(pfn, pages));
4388                                 goto chkszc;
4389                         }
4390 
4391                         if (pszc > szc) {
4392                                 kmutex_t *szcmtx;
4393                                 SEGVN_VMSTAT_FLTVNPAGES(35);
4394                                 /*
4395                                  * p_szc of ppa[0] can change since we haven't
4396                                  * locked all constituent pages. Call
4397                                  * page_szc_lock() to prevent szc changes.
4398                                  * This should be a rare case that happens when
4399                                  * multiple segments use different page sizes
4400                                  * to map the same file offsets.
4401                                  */
4402                                 szcmtx = page_szc_lock(ppa[0]);
4403                                 pszc = ppa[0]->p_szc;
4404                                 ASSERT(szcmtx != NULL || pszc == 0);
4405                                 ASSERT(ppa[0]->p_szc <= pszc);
4406                                 if (pszc <= szc) {
4407                                         SEGVN_VMSTAT_FLTVNPAGES(36);
4408                                         if (szcmtx != NULL) {
4409                                                 mutex_exit(szcmtx);
4410                                         }
4411                                         goto chkszc;
4412                                 }
4413                                 if (pplist != NULL) {
4414                                         /*
4415                                          * The page got promoted since the
4416                                          * last check; we don't need the
4417                                          * preallocated large page.
4418                                          */
4419                                         SEGVN_VMSTAT_FLTVNPAGES(37);
4420                                         page_free_replacement_page(pplist);
4421                                         page_create_putback(pages);
4422                                 }
4423                                 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4424                                     prot, vpprot);
4425                                 hat_memload_array_region(hat, a, pgsz, ppa,
4426                                     prot & vpprot, hat_flag, svd->rcookie);
4427                                 mutex_exit(szcmtx);
4428                                 if (!(hat_flag & HAT_LOAD_LOCK)) {
4429                                         for (i = 0; i < pages; i++) {
4430                                                 page_unlock(ppa[i]);
4431                                         }
4432                                 }
4433                                 if (amp != NULL) {
4434                                         anon_array_exit(&an_cookie);
4435                                         ANON_LOCK_EXIT(&amp->a_rwlock);
4436                                 }
4437                                 goto next;
4438                         }
4439 
4440                         /*
4441                          * If the page got demoted since the last check
4442                          * we may not have allocated a larger page yet.
4443                          * Allocate one now.
4444                          */
4445                         if (pplist == NULL &&
4446                             page_alloc_pages(vp, seg, a, &pplist, NULL,
4447                             szc, 0, 0) && type != F_SOFTLOCK) {
4448                                 SEGVN_VMSTAT_FLTVNPAGES(38);
4449                                 for (i = 0; i < pages; i++) {
4450                                         page_unlock(ppa[i]);
4451                                 }
4452                                 if (amp != NULL) {
4453                                         anon_array_exit(&an_cookie);
4454                                         ANON_LOCK_EXIT(&amp->a_rwlock);
4455                                 }
4456                                 ierr = -1;
4457                                 alloc_failed |= (1 << szc);
4458                                 break;
4459                         }
4460 
4461                         SEGVN_VMSTAT_FLTVNPAGES(39);
4462 
4463                         if (pplist != NULL) {
4464                                 segvn_relocate_pages(ppa, pplist);
4465 #ifdef DEBUG
4466                         } else {
4467                                 ASSERT(type == F_SOFTLOCK);
4468                                 SEGVN_VMSTAT_FLTVNPAGES(40);
4469 #endif /* DEBUG */
4470                         }
4471 
4472                         SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
4473 
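                             /*
                              * No preallocated replacement pages and
                              * intermediate sizes are not allowed: this is the
                              * softlock path, so map each constituent page
                              * with a base page mapping since the existing
                              * pages have a smaller szc.
                              */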
4474                         if (pplist == NULL && segvn_anypgsz_vnode == 0) {
4475                                 ASSERT(type == F_SOFTLOCK);
4476                                 for (i = 0; i < pages; i++) {
4477                                         ASSERT(ppa[i]->p_szc < szc);
4478                                         hat_memload_region(hat,
4479                                             a + (i << PAGESHIFT),
4480                                             ppa[i], prot & vpprot, hat_flag,
4481                                             svd->rcookie);
4482                                 }
4483                         } else {
4484                                 ASSERT(pplist != NULL || type == F_SOFTLOCK);
4485                                 hat_memload_array_region(hat, a, pgsz, ppa,
4486                                     prot & vpprot, hat_flag, svd->rcookie);
4487                         }
4488                         if (!(hat_flag & HAT_LOAD_LOCK)) {
4489                                 for (i = 0; i < pages; i++) {
4490                                         ASSERT(PAGE_SHARED(ppa[i]));
4491                                         page_unlock(ppa[i]);
4492                                 }
4493                         }
4494                         if (amp != NULL) {
4495                                 anon_array_exit(&an_cookie);
4496                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4497                         }
4498 
4499                 next:
4500                         if (vpage != NULL) {
4501                                 vpage += pages;
4502                         }
4503                         adjszc_chk = 1;
4504                 }
4505                 if (a == lpgeaddr)
4506                         break;
4507                 ASSERT(a < lpgeaddr);
4508 
4509                 ASSERT(!brkcow && !tron && type != F_SOFTLOCK);
4510 
4511                 /*
4512                  * ierr == -1 means we failed to map with a large page
4513                  * (either due to allocation/relocation failures or
4514                  * misalignment with other mappings to this file).
4515                  *
4516                  * ierr == -2 means some other thread allocated a large page
4517                  * after we gave up trying to map with a large page.  Retry
4518                  * with a larger mapping.
4519                  */
4520                 ASSERT(ierr == -1 || ierr == -2);
4521                 ASSERT(ierr == -2 || szc != 0);
4522                 ASSERT(ierr == -1 || szc < seg->s_szc);
4523                 if (ierr == -2) {
4524                         SEGVN_VMSTAT_FLTVNPAGES(41);
4525                         ASSERT(pszc > szc && pszc <= seg->s_szc);
4526                         szc = pszc;
4527                 } else if (segvn_anypgsz_vnode) {
4528                         SEGVN_VMSTAT_FLTVNPAGES(42);
4529                         szc--;
4530                 } else {
4531                         SEGVN_VMSTAT_FLTVNPAGES(43);
4532                         ASSERT(pszc < szc);
4533                         /*
4534                          * Another process created a pszc large page,
4535                          * but we still have to drop to szc 0.
4536                          */
4537                         szc = 0;
4538                 }
4539 
4540                 pgsz = page_get_pagesize(szc);
4541                 pages = btop(pgsz);
4542                 if (ierr == -2) {
4543                         /*
4544                          * Size up case. Note lpgaddr may only be needed for
4545                          * the softlock case, so we don't adjust it here.
4546                          */
4547                         a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
4548                         ASSERT(a >= lpgaddr);
4549                         lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4550                         off = svd->offset + (uintptr_t)(a - seg->s_base);
4551                         aindx = svd->anon_index + seg_page(seg, a);
4552                         vpage = (svd->vpage != NULL) ?
4553                             &svd->vpage[seg_page(seg, a)] : NULL;
4554                 } else {
4555                         /*
4556                          * Size down case. Note lpgaddr may only be needed for
4557                          * the softlock case, so we don't adjust it here.
4558                          */
4559                         ASSERT(IS_P2ALIGNED(a, pgsz));
4560                         ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
4561                         lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4562                         ASSERT(a < lpgeaddr);
4563                         if (a < addr) {
4564                                 SEGVN_VMSTAT_FLTVNPAGES(44);
4565                                 /*
4566                                  * The beginning of the large page region can
4567                                  * be pulled to the right to make a smaller
4568                                  * region. We haven't yet faulted a single
4569                                  * page.
4570                                  */
4571                                 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
4572                                 ASSERT(a >= lpgaddr);
4573                                 off = svd->offset +
4574                                     (uintptr_t)(a - seg->s_base);
4575                                 aindx = svd->anon_index + seg_page(seg, a);
4576                                 vpage = (svd->vpage != NULL) ?
4577                                     &svd->vpage[seg_page(seg, a)] : NULL;
4578                         }
4579                 }
4580         }
4581 out:
4582         kmem_free(ppa, ppasize);
4583         if (!err && !vop_size_err) {
4584                 SEGVN_VMSTAT_FLTVNPAGES(45);
4585                 return (0);
4586         }
4587         if (type == F_SOFTLOCK && a > lpgaddr) {
4588                 SEGVN_VMSTAT_FLTVNPAGES(46);
4589                 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
4590         }
4591         if (!vop_size_err) {
4592                 SEGVN_VMSTAT_FLTVNPAGES(47);
4593                 return (err);
4594         }
4595         ASSERT(brkcow || tron || type == F_SOFTLOCK);
4596         /*
4597          * The large page end is mapped beyond the end of the file and it's a
4598          * cow fault (possibly a text replication induced cow) or a softlock,
4599          * so we can't reduce the map area.  For now just demote the segment.
4600          * This should really only happen if the end of the file changed after
4601          * the mapping was established, since when large page segments are
4602          * created we make sure they don't extend beyond the end of the file.
4603          */
4604         SEGVN_VMSTAT_FLTVNPAGES(48);
4605 
4606         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4607         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
4608         err = 0;
4609         if (seg->s_szc != 0) {
4610                 segvn_fltvnpages_clrszc_cnt++;
4611                 ASSERT(svd->softlockcnt == 0);
4612                 err = segvn_clrszc(seg);
4613                 if (err != 0) {
4614                         segvn_fltvnpages_clrszc_err++;
4615                 }
4616         }
4617         ASSERT(err || seg->s_szc == 0);
4618         SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock);
4619         /* segvn_fault will do its job as if szc had been zero to begin with */
4620         return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err));
4621 }
4622 
4623 /*
4624  * This routine will attempt to fault in one large page.
4625  * It will use smaller pages if that fails.
4626  * It should only be called for pure anonymous segments.
4627  */
4628 static faultcode_t
4629 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
4630     caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
4631     caddr_t eaddr, int brkcow)
4632 {
4633         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
4634         struct anon_map *amp = svd->amp;
4635         uchar_t segtype = svd->type;
4636         uint_t szc = seg->s_szc;
4637         size_t pgsz = page_get_pagesize(szc);
4638         size_t maxpgsz = pgsz;
4639         pgcnt_t pages = btop(pgsz);
4640         uint_t ppaszc = szc;
4641         caddr_t a = lpgaddr;
4642         ulong_t aindx = svd->anon_index + seg_page(seg, a);
4643         struct vpage *vpage = (svd->vpage != NULL) ?
4644             &svd->vpage[seg_page(seg, a)] : NULL;
4645         page_t **ppa;
4646         uint_t  ppa_szc;
4647         faultcode_t err;
4648         int ierr;
4649         uint_t protchk, prot, vpprot;
4650         ulong_t i;
4651         int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
4652         anon_sync_obj_t cookie;
4653         int adjszc_chk;
4654         int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0;
4655 
4656         ASSERT(szc != 0);
4657         ASSERT(amp != NULL);
4658         ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
4659         ASSERT(!(svd->flags & MAP_NORESERVE));
4660         ASSERT(type != F_SOFTUNLOCK);
4661         ASSERT(IS_P2ALIGNED(a, maxpgsz));
4662         ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF);
4663         ASSERT(svd->tr_state != SEGVN_TR_INIT);
4664 
4665         ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
4666 
4667         VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]);
4668         VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]);
4669 
4670         if (svd->flags & MAP_TEXT) {
4671                 hat_flag |= HAT_LOAD_TEXT;
4672         }
4673 
4674         if (svd->pageprot) {
4675                 switch (rw) {
4676                 case S_READ:
4677                         protchk = PROT_READ;
4678                         break;
4679                 case S_WRITE:
4680                         protchk = PROT_WRITE;
4681                         break;
4682                 case S_EXEC:
4683                         protchk = PROT_EXEC;
4684                         break;
4685                 case S_OTHER:
4686                 default:
4687                         protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
4688                         break;
4689                 }
4690                 VM_STAT_ADD(segvnvmstats.fltanpages[2]);
4691         } else {
4692                 prot = svd->prot;
4693                 /* caller has already done segment level protection check. */
4694         }
4695 
4696         ppa = kmem_cache_alloc(segvn_szc_cache[ppaszc], KM_SLEEP);
4697         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
4698         for (;;) {
4699                 adjszc_chk = 0;
4700                 for (; a < lpgeaddr; a += pgsz, aindx += pages) {
4701                         if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
4702                                 VM_STAT_ADD(segvnvmstats.fltanpages[3]);
4703                                 ASSERT(vpage != NULL);
4704                                 prot = VPP_PROT(vpage);
4705                                 ASSERT(sameprot(seg, a, maxpgsz));
4706                                 if ((prot & protchk) == 0) {
4707                                         err = FC_PROT;
4708                                         goto error;
4709                                 }
4710                         }
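                             /*
                              * Once the address is aligned to the segment's
                              * maximum page size, switch back to faulting
                              * with maxpgsz pages.
                              */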
4711                         if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) &&
4712                             pgsz < maxpgsz) {
4713                                 ASSERT(a > lpgaddr);
4714                                 szc = seg->s_szc;
4715                                 pgsz = maxpgsz;
4716                                 pages = btop(pgsz);
4717                                 ASSERT(IS_P2ALIGNED(aindx, pages));
4718                                 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr,
4719                                     pgsz);
4720                         }
4721                         if (type == F_SOFTLOCK) {
4722                                 atomic_add_long((ulong_t *)&svd->softlockcnt,
4723                                     pages);
4724                         }
4725                         anon_array_enter(amp, aindx, &cookie);
4726                         ppa_szc = (uint_t)-1;
4727                         ierr = anon_map_getpages(amp, aindx, szc, seg, a,
4728                             prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow,
4729                             segvn_anypgsz, pgflags, svd->cred);
4730                         if (ierr != 0) {
4731                                 anon_array_exit(&cookie);
4732                                 VM_STAT_ADD(segvnvmstats.fltanpages[4]);
4733                                 if (type == F_SOFTLOCK) {
4734                                         atomic_add_long(
4735                                             (ulong_t *)&svd->softlockcnt,
4736                                             -pages);
4737                                 }
4738                                 if (ierr > 0) {
4739                                         VM_STAT_ADD(segvnvmstats.fltanpages[6]);
4740                                         err = FC_MAKE_ERR(ierr);
4741                                         goto error;
4742                                 }
4743                                 break;
4744                         }
4745 
4746                         ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
4747 
4748                         ASSERT(segtype == MAP_SHARED ||
4749                             ppa[0]->p_szc <= szc);
4750                         ASSERT(segtype == MAP_PRIVATE ||
4751                             ppa[0]->p_szc >= szc);
4752 
4753                         /*
4754                          * Handle pages that have been marked for migration
4755                          */
4756                         if (lgrp_optimizations())
4757                                 page_migrate(seg, a, ppa, pages);
4758 
4759                         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
4760 
4761                         if (segtype == MAP_SHARED) {
4762                                 vpprot |= PROT_WRITE;
4763                         }
4764 
4765                         hat_memload_array(hat, a, pgsz, ppa,
4766                             prot & vpprot, hat_flag);
4767 
4768                         if (hat_flag & HAT_LOAD_LOCK) {
4769                                 VM_STAT_ADD(segvnvmstats.fltanpages[7]);
4770                         } else {
4771                                 VM_STAT_ADD(segvnvmstats.fltanpages[8]);
4772                                 for (i = 0; i < pages; i++)
4773                                         page_unlock(ppa[i]);
4774                         }
4775                         if (vpage != NULL)
4776                                 vpage += pages;
4777 
4778                         anon_array_exit(&cookie);
4779                         adjszc_chk = 1;
4780                 }
4781                 if (a == lpgeaddr)
4782                         break;
4783                 ASSERT(a < lpgeaddr);
4784                 /*
4785                  * ierr == -1 means we failed to allocate a large page,
4786                  * so do a size down operation.
4787                  *
4788                  * ierr == -2 means some other process that privately shares
4789                  * pages with this process has allocated a larger page and we
4790                  * need to retry with larger pages. So do a size up
4791                  * operation. This relies on the fact that large pages are
4792                  * never partially shared, i.e., if we share any constituent
4793                  * page of a large page with another process we must share the
4794                  * entire large page. Note this cannot happen for SOFTLOCK
4795                  * case, unless current address (a) is at the beginning of the
4796                  * next page size boundary because the other process couldn't
4797                  * have relocated locked pages.
4798                  */
4799                 ASSERT(ierr == -1 || ierr == -2);
4800 
4801                 if (segvn_anypgsz) {
4802                         ASSERT(ierr == -2 || szc != 0);
4803                         ASSERT(ierr == -1 || szc < seg->s_szc);
4804                         szc = (ierr == -1) ? szc - 1 : szc + 1;
4805                 } else {
4806                         /*
4807                          * For non-COW faults and segvn_anypgsz == 0
4808                          * we need to be careful not to loop forever
4809                          * if an existing page is found with an szc other
4810                          * than 0 or seg->s_szc. This could be due
4811                          * to page relocations on behalf of DR or,
4812                          * more likely, large page creation. For this
4813                          * case simply re-size to the existing page's szc
4814                          * as returned by anon_map_getpages().
4815                          */
4816                         if (ppa_szc == (uint_t)-1) {
4817                                 szc = (ierr == -1) ? 0 : seg->s_szc;
4818                         } else {
4819                                 ASSERT(ppa_szc <= seg->s_szc);
4820                                 ASSERT(ierr == -2 || ppa_szc < szc);
4821                                 ASSERT(ierr == -1 || ppa_szc > szc);
4822                                 szc = ppa_szc;
4823                         }
4824                 }
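                     /*
                      * Illustrative aside (not part of this file): the new page
                      * size chosen above, for both the segvn_anypgsz and the
                      * !segvn_anypgsz cases, can be modeled by a small pure
                      * helper.  The helper name is hypothetical.
                      *
                      *    static uint_t
                      *    next_szc(int ierr, uint_t szc, uint_t seg_szc,
                      *        uint_t ppa_szc, int anypgsz)
                      *    {
                      *        if (anypgsz)
                      *            return ((ierr == -1) ? szc - 1 : szc + 1);
                      *        if (ppa_szc == (uint_t)-1)
                      *            return ((ierr == -1) ? 0 : seg_szc);
                      *        return (ppa_szc);
                      *    }
                      */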
4825 
4826                 pgsz = page_get_pagesize(szc);
4827                 pages = btop(pgsz);
4828                 ASSERT(type != F_SOFTLOCK || ierr == -1 ||
4829                     (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
4830                 if (type == F_SOFTLOCK) {
4831                         /*
4832                          * For softlocks we cannot reduce the fault area
4833                          * (calculated based on the largest page size for this
4834                          * segment) for the size down case, and a is already
4835                          * next page size aligned, as asserted above, for size
4836                          * ups. Therefore just continue in case of softlock.
4837                          */
4838                         VM_STAT_ADD(segvnvmstats.fltanpages[9]);
4839                         continue; /* keep lint happy */
4840                 } else if (ierr == -2) {
4841 
4842                         /*
4843                          * Size up case. Note lpgaddr may only be needed for
4844                          * softlock case so we don't adjust it here.
4845                          */
4846                         VM_STAT_ADD(segvnvmstats.fltanpages[10]);
4847                         a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
4848                         ASSERT(a >= lpgaddr);
4849                         lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4850                         aindx = svd->anon_index + seg_page(seg, a);
4851                         vpage = (svd->vpage != NULL) ?
4852                             &svd->vpage[seg_page(seg, a)] : NULL;
4853                 } else {
4854                         /*
4855                          * Size down case. Note lpgaddr may only be needed for
4856                          * softlock case so we don't adjust it here.
4857                          */
4858                         VM_STAT_ADD(segvnvmstats.fltanpages[11]);
4859                         ASSERT(IS_P2ALIGNED(a, pgsz));
4860                         ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
4861                         lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4862                         ASSERT(a < lpgeaddr);
4863                         if (a < addr) {
4864                                 /*
4865                                  * The beginning of the large page region can
4866                                  * be pulled to the right to make a smaller
4867                                  * region. We haven't yet faulted a single
4868                                  * page.
4869                                  */
4870                                 VM_STAT_ADD(segvnvmstats.fltanpages[12]);
4871                                 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
4872                                 ASSERT(a >= lpgaddr);
4873                                 aindx = svd->anon_index + seg_page(seg, a);
4874                                 vpage = (svd->vpage != NULL) ?
4875                                     &svd->vpage[seg_page(seg, a)] : NULL;
4876                         }
4877                 }
4878         }
4879         VM_STAT_ADD(segvnvmstats.fltanpages[13]);
4880         ANON_LOCK_EXIT(&amp->a_rwlock);
4881         kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
4882         return (0);
4883 error:
4884         VM_STAT_ADD(segvnvmstats.fltanpages[14]);
4885         ANON_LOCK_EXIT(&amp->a_rwlock);
4886         kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
4887         if (type == F_SOFTLOCK && a > lpgaddr) {
4888                 VM_STAT_ADD(segvnvmstats.fltanpages[15]);
4889                 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
4890         }
4891         return (err);
4892 }
4893 
4894 int fltadvice = 1;      /* set to free behind pages for sequential access */
4895 
4896 /*
4897  * This routine is called via a machine specific fault handling routine.
4898  * It is also called by software routines wishing to lock or unlock
4899  * a range of addresses.
4900  *
4901  * Here is the basic algorithm:
4902  *      If unlocking
4903  *              Call segvn_softunlock
4904  *              Return
4905  *      endif
4906  *      Checking and setup work
4907  *      If we will need some non-anonymous pages
4908  *              Call VOP_GETPAGE over the range of non-anonymous pages
4909  *      endif
4910  *      Loop over all addresses requested
4911  *              Call segvn_faultpage passing in page list
4912  *                  to load up translations and handle anonymous pages
4913  *      endloop
4914  *      Load up translation to any additional pages in page list not
4915  *          already handled that fit into this segment
4916  */
4917 static faultcode_t
4918 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
4919     enum fault_type type, enum seg_rw rw)
4920 {
4921         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
4922         page_t **plp, **ppp, *pp;
4923         u_offset_t off;
4924         caddr_t a;
4925         struct vpage *vpage;
4926         uint_t vpprot, prot;
4927         int err;
4928         page_t *pl[PVN_GETPAGE_NUM + 1];
4929         size_t plsz, pl_alloc_sz;
4930         size_t page;
4931         ulong_t anon_index;
4932         struct anon_map *amp;
4933         int dogetpage = 0;
4934         caddr_t lpgaddr, lpgeaddr;
4935         size_t pgsz;
4936         anon_sync_obj_t cookie;
4937         int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
4938 
4939         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
4940         ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE);
4941 
4942         /*
4943          * First handle the easy stuff
4944          */
4945         if (type == F_SOFTUNLOCK) {
4946                 if (rw == S_READ_NOCOW) {
4947                         rw = S_READ;
4948                         ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
4949                 }
4950                 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
4951                 pgsz = (seg->s_szc == 0) ? PAGESIZE :
4952                     page_get_pagesize(seg->s_szc);
4953                 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]);
4954                 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
4955                 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw);
4956                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4957                 return (0);
4958         }
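             /*
              * Illustrative aside: CALC_LPG_REGION is defined elsewhere in
              * this file; for a non-zero length it is expected to round the
              * faulting range out to large page boundaries, roughly as
              * sketched below with the P2 macros from <sys/sysmacros.h>.
              * Treat this as a model, not the authoritative definition.
              *
              *    lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
              *    lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)(addr + len), pgsz);
              *
              * assuming pgsz is a power of two, so both bounds fall on large
              * page boundaries that enclose [addr, addr + len).
              */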
4959 
4960         ASSERT(svd->tr_state == SEGVN_TR_OFF ||
4961             !HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
4962         if (brkcow == 0) {
4963                 if (svd->tr_state == SEGVN_TR_INIT) {
4964                         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
4965                         if (svd->tr_state == SEGVN_TR_INIT) {
4966                                 ASSERT(svd->vp != NULL && svd->amp == NULL);
4967                                 ASSERT(svd->flags & MAP_TEXT);
4968                                 ASSERT(svd->type == MAP_PRIVATE);
4969                                 segvn_textrepl(seg);
4970                                 ASSERT(svd->tr_state != SEGVN_TR_INIT);
4971                                 ASSERT(svd->tr_state != SEGVN_TR_ON ||
4972                                     svd->amp != NULL);
4973                         }
4974                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4975                 }
4976         } else if (svd->tr_state != SEGVN_TR_OFF) {
4977                 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
4978 
4979                 if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) {
4980                         ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
4981                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4982                         return (FC_PROT);
4983                 }
4984 
4985                 if (svd->tr_state == SEGVN_TR_ON) {
4986                         ASSERT(svd->vp != NULL && svd->amp != NULL);
4987                         segvn_textunrepl(seg, 0);
4988                         ASSERT(svd->amp == NULL &&
4989                             svd->tr_state == SEGVN_TR_OFF);
4990                 } else if (svd->tr_state != SEGVN_TR_OFF) {
4991                         svd->tr_state = SEGVN_TR_OFF;
4992                 }
4993                 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
4994                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4995         }
4996 
4997 top:
4998         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
4999 
5000         /*
5001          * If we have the same protections for the entire segment,
5002          * ensure that the access being attempted is legitimate.
5003          */
5004 
5005         if (svd->pageprot == 0) {
5006                 uint_t protchk;
5007 
5008                 switch (rw) {
5009                 case S_READ:
5010                 case S_READ_NOCOW:
5011                         protchk = PROT_READ;
5012                         break;
5013                 case S_WRITE:
5014                         protchk = PROT_WRITE;
5015                         break;
5016                 case S_EXEC:
5017                         protchk = PROT_EXEC;
5018                         break;
5019                 case S_OTHER:
5020                 default:
5021                         protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
5022                         break;
5023                 }
5024 
5025                 if ((svd->prot & protchk) == 0) {
5026                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5027                         return (FC_PROT);       /* illegal access type */
5028                 }
5029         }
5030 
5031         if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
5032                 /* this must be a SOFTLOCK S_READ fault */
5033                 ASSERT(svd->amp == NULL);
5034                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
5035                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5036                 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5037                 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
5038                         /*
5039                          * this must be the first ever non S_READ_NOCOW
5040                          * softlock for this segment.
5041                          */
5042                         ASSERT(svd->softlockcnt == 0);
5043                         hat_leave_region(seg->s_as->a_hat, svd->rcookie,
5044                             HAT_REGION_TEXT);
5045                         svd->rcookie = HAT_INVALID_REGION_COOKIE;
5046                 }
5047                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5048                 goto top;
5049         }
5050 
5051         /*
5052          * We can't allow the long term use of softlocks for vmpss segments,
5053          * because in some file truncation cases we should be able to demote
5054          * the segment, which requires that there are no softlocks.  The
5055          * only case where it's ok to allow a SOFTLOCK fault against a vmpss
5056          * segment is S_READ_NOCOW, where the caller holds the address space
5057          * locked as writer and calls softunlock before dropping the as lock.
5058          * S_READ_NOCOW is used by /proc to read memory from another user.
5059          *
5060          * Another deadlock between SOFTLOCK and file truncation can happen
5061          * because segvn_fault_vnodepages() calls the FS one pagesize at
5062          * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages()
5063          * can cause a deadlock because the first set of page_t's remain
5064          * locked SE_SHARED.  To avoid this, we demote segments on a first
5065          * SOFTLOCK if the faulted range is longer than the segment's
5066          * page size.
5067          *
5068          * So for now, we only avoid demoting a segment on a SOFTLOCK when
5069          * the access type is S_READ_NOCOW and the fault length is less than
5070          * or equal to the segment's page size. While this is quite restrictive,
5071          * it should be the most common case of SOFTLOCK against a vmpss
5072          * segment.
5073          *
5074          * For S_READ_NOCOW, it's safe not to do a copy on write because the
5075          * caller makes sure no COW will be caused by another thread for a
5076          * softlocked page.
5077          */
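             /*
              * Userland illustration of the S_READ_NOCOW path mentioned
              * above: /proc exposes another process's address space as its
              * "as" file, which (with sufficient privileges) can be read at
              * a virtual-address offset.  A minimal sketch, with a
              * hypothetical function name and trimmed error handling:
              *
              *    #include <sys/types.h>
              *    #include <stdint.h>
              *    #include <fcntl.h>
              *    #include <unistd.h>
              *    #include <stdio.h>
              *
              *    ssize_t
              *    proc_read(pid_t pid, uintptr_t vaddr, void *buf, size_t len)
              *    {
              *        char path[64];
              *        int fd;
              *        ssize_t n;
              *
              *        (void) snprintf(path, sizeof (path),
              *            "/proc/%d/as", (int)pid);
              *        if ((fd = open(path, O_RDONLY)) < 0)
              *            return (-1);
              *        n = pread(fd, buf, len, (off_t)vaddr);
              *        (void) close(fd);
              *        return (n);
              *    }
              */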
5078         if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) {
5079                 int demote = 0;
5080 
5081                 if (rw != S_READ_NOCOW) {
5082                         demote = 1;
5083                 }
5084                 if (!demote && len > PAGESIZE) {
5085                         pgsz = page_get_pagesize(seg->s_szc);
5086                         CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr,
5087                             lpgeaddr);
5088                         if (lpgeaddr - lpgaddr > pgsz) {
5089                                 demote = 1;
5090                         }
5091                 }
5092 
5093                 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
5094 
5095                 if (demote) {
5096                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5097                         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5098                         if (seg->s_szc != 0) {
5099                                 segvn_vmpss_clrszc_cnt++;
5100                                 ASSERT(svd->softlockcnt == 0);
5101                                 err = segvn_clrszc(seg);
5102                                 if (err) {
5103                                         segvn_vmpss_clrszc_err++;
5104                                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5105                                         return (FC_MAKE_ERR(err));
5106                                 }
5107                         }
5108                         ASSERT(seg->s_szc == 0);
5109                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5110                         goto top;
5111                 }
5112         }
5113 
5114         /*
5115          * Check to see if we need to allocate an anon_map structure.
5116          */
5117         if (svd->amp == NULL && (svd->vp == NULL || brkcow)) {
5118                 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
5119                 /*
5120                  * Drop the "read" lock on the segment and acquire
5121                  * the "write" version since we have to allocate the
5122                  * anon_map.
5123                  */
5124                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5125                 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5126 
5127                 if (svd->amp == NULL) {
5128                         svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
5129                         svd->amp->a_szc = seg->s_szc;
5130                 }
5131                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5132 
5133                 /*
5134                  * Start all over again since segment protections
5135                  * may have changed after we dropped the "read" lock.
5136                  */
5137                 goto top;
5138         }
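             /*
              * The drop-reader/acquire-writer/re-check/retry sequence above
              * is a generic pattern.  A userland sketch with pthreads (names
              * and the allocation are hypothetical), showing why both the
              * re-check and the restart are needed:
              *
              *    #include <pthread.h>
              *    #include <stdlib.h>
              *
              *    static pthread_rwlock_t rwl = PTHREAD_RWLOCK_INITIALIZER;
              *    static void *resource;        // lazily allocated
              *
              *    void *
              *    get_resource(void)
              *    {
              *        void *r;
              *    top:
              *        (void) pthread_rwlock_rdlock(&rwl);
              *        if (resource == NULL) {
              *            (void) pthread_rwlock_unlock(&rwl);
              *            (void) pthread_rwlock_wrlock(&rwl);
              *            if (resource == NULL)       // re-check: another
              *                resource = malloc(64);  // thread may have won
              *            (void) pthread_rwlock_unlock(&rwl);
              *            goto top;    // conditions may have changed
              *        }
              *        r = resource;
              *        (void) pthread_rwlock_unlock(&rwl);
              *        return (r);
              *    }
              */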
5139 
5140         /*
5141          * The S_READ_NOCOW vs. S_READ distinction was
5142          * only needed for the code above. From here
5143          * on we treat it as S_READ.
5144          */
5145         if (rw == S_READ_NOCOW) {
5146                 ASSERT(type == F_SOFTLOCK);
5147                 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
5148                 rw = S_READ;
5149         }
5150 
5151         amp = svd->amp;
5152 
5153         /*
5154          * MADV_SEQUENTIAL work is ignored for large page segments.
5155          */
5156         if (seg->s_szc != 0) {
5157                 pgsz = page_get_pagesize(seg->s_szc);
5158                 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
5159                 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
5160                 if (svd->vp == NULL) {
5161                         err = segvn_fault_anonpages(hat, seg, lpgaddr,
5162                             lpgeaddr, type, rw, addr, addr + len, brkcow);
5163                 } else {
5164                         err = segvn_fault_vnodepages(hat, seg, lpgaddr,
5165                             lpgeaddr, type, rw, addr, addr + len, brkcow);
5166                         if (err == IE_RETRY) {
5167                                 ASSERT(seg->s_szc == 0);
5168                                 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
5169                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5170                                 goto top;
5171                         }
5172                 }
5173                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5174                 return (err);
5175         }
5176 
5177         page = seg_page(seg, addr);
5178         if (amp != NULL) {
5179                 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
5180                 anon_index = svd->anon_index + page;
5181 
5182                 if (type == F_PROT && rw == S_READ &&
5183                     svd->tr_state == SEGVN_TR_OFF &&
5184                     svd->type == MAP_PRIVATE && svd->pageprot == 0) {
5185                         size_t index = anon_index;
5186                         struct anon *ap;
5187 
5188                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5189                         /*
5190                          * The fast path could apply to S_WRITE also, except
5191                          * that the protection fault could be caused by a lazy
5192                          * TLB flush when going RO->RW. In this case the PTE is
5193                          * already RW, but a stale RO entry in another CPU's TLB
5194                          * causes the fault. Since hat_chgprot won't do anything
5195                          * if the PTE doesn't change, we may end up faulting
5196                          * indefinitely until the RO TLB entry gets replaced.
5197                          */
5198                         for (a = addr; a < addr + len; a += PAGESIZE, index++) {
5199                                 anon_array_enter(amp, index, &cookie);
5200                                 ap = anon_get_ptr(amp->ahp, index);
5201                                 anon_array_exit(&cookie);
5202                                 if ((ap == NULL) || (ap->an_refcnt != 1)) {
5203                                         ANON_LOCK_EXIT(&amp->a_rwlock);
5204                                         goto slow;
5205                                 }
5206                         }
5207                         hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot);
5208                         ANON_LOCK_EXIT(&amp->a_rwlock);
5209                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5210                         return (0);
5211                 }
5212         }
5213 slow:
5214 
5215         if (svd->vpage == NULL)
5216                 vpage = NULL;
5217         else
5218                 vpage = &svd->vpage[page];
5219 
5220         off = svd->offset + (uintptr_t)(addr - seg->s_base);
5221 
5222         /*
5223          * If MADV_SEQUENTIAL has been set for the particular page we
5224          * are faulting on, free behind all pages in the segment and put
5225          * them on the free list.
5226          */
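             /*
              * Userland view of the free-behind handling below: a mapping is
              * advised MADV_SEQUENTIAL so that, as it is read in order,
              * already-consumed pages become candidates for free-behind.
              * Minimal sketch (function name hypothetical, error handling
              * trimmed):
              *
              *    #include <sys/mman.h>
              *    #include <sys/stat.h>
              *    #include <fcntl.h>
              *    #include <unistd.h>
              *
              *    void
              *    read_sequentially(const char *path)
              *    {
              *        int fd = open(path, O_RDONLY);
              *        struct stat st;
              *        char *p;
              *        volatile char c;
              *        long pg = sysconf(_SC_PAGESIZE);
              *        off_t i;
              *
              *        if (fd < 0)
              *            return;
              *        if (fstat(fd, &st) == 0 && st.st_size > 0) {
              *            p = mmap(NULL, st.st_size, PROT_READ,
              *                MAP_PRIVATE, fd, 0);
              *            if (p != MAP_FAILED) {
              *                (void) madvise(p, st.st_size, MADV_SEQUENTIAL);
              *                for (i = 0; i < st.st_size; i += pg)
              *                    c = p[i];    // touch pages in order
              *                (void) munmap(p, st.st_size);
              *            }
              *        }
              *        (void) close(fd);
              *    }
              */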
5227 
5228         if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) {
5229                 struct vpage *vpp;
5230                 ulong_t fanon_index;
5231                 size_t fpage;
5232                 u_offset_t pgoff, fpgoff;
5233                 struct vnode *fvp;
5234                 struct anon *fap = NULL;
5235 
5236                 if (svd->advice == MADV_SEQUENTIAL ||
5237                     (svd->pageadvice &&
5238                     VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) {
5239                         pgoff = off - PAGESIZE;
5240                         fpage = page - 1;
5241                         if (vpage != NULL)
5242                                 vpp = &svd->vpage[fpage];
5243                         if (amp != NULL)
5244                                 fanon_index = svd->anon_index + fpage;
5245 
5246                         while (pgoff > svd->offset) {
5247                                 if (svd->advice != MADV_SEQUENTIAL &&
5248                                     (!svd->pageadvice || (vpage &&
5249                                     VPP_ADVICE(vpp) != MADV_SEQUENTIAL)))
5250                                         break;
5251 
5252                                 /*
5253                                  * If this is an anon page, we must find the
5254                                  * correct <vp, offset> for it
5255                                  */
5256                                 fap = NULL;
5257                                 if (amp != NULL) {
5258                                         ANON_LOCK_ENTER(&amp->a_rwlock,
5259                                             RW_READER);
5260                                         anon_array_enter(amp, fanon_index,
5261                                             &cookie);
5262                                         fap = anon_get_ptr(amp->ahp,
5263                                             fanon_index);
5264                                         if (fap != NULL) {
5265                                                 swap_xlate(fap, &fvp, &fpgoff);
5266                                         } else {
5267                                                 fpgoff = pgoff;
5268                                                 fvp = svd->vp;
5269                                         }
5270                                         anon_array_exit(&cookie);
5271                                         ANON_LOCK_EXIT(&amp->a_rwlock);
5272                                 } else {
5273                                         fpgoff = pgoff;
5274                                         fvp = svd->vp;
5275                                 }
5276                                 if (fvp == NULL)
5277                                         break;  /* XXX */
5278                                 /*
5279                                  * Skip pages that are free or have an
5280                                  * "exclusive" lock.
5281                                  */
5282                                 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED);
5283                                 if (pp == NULL)
5284                                         break;
5285                                 /*
5286                                  * We don't need the page_struct_lock to test
5287                                  * as this is only advisory; even if we
5288                                  * acquire it someone might race in and lock
5289                                  * the page after we unlock and before the
5290                                  * PUTPAGE, then VOP_PUTPAGE will do nothing.
5291                                  */
5292                                 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
5293                                         /*
5294                                          * Hold the vnode before releasing
5295                                          * the page lock to prevent it from
5296                                          * being freed and re-used by some
5297                                          * other thread.
5298                                          */
5299                                         VN_HOLD(fvp);
5300                                         page_unlock(pp);
5301                                         /*
5302                                          * We should build a page list
5303                                          * to kluster putpages XXX
5304                                          */
5305                                         (void) VOP_PUTPAGE(fvp,
5306                                             (offset_t)fpgoff, PAGESIZE,
5307                                             (B_DONTNEED|B_FREE|B_ASYNC),
5308                                             svd->cred, NULL);
5309                                         VN_RELE(fvp);
5310                                 } else {
5311                                         /*
5312                                          * XXX - Should the loop terminate if
5313                                          * the page is `locked'?
5314                                          */
5315                                         page_unlock(pp);
5316                                 }
5317                                 --vpp;
5318                                 --fanon_index;
5319                                 pgoff -= PAGESIZE;
5320                         }
5321                 }
5322         }
5323 
5324         plp = pl;
5325         *plp = NULL;
5326         pl_alloc_sz = 0;
5327 
5328         /*
5329          * See if we need to call VOP_GETPAGE for
5330          * *any* of the range being faulted on.
5331          * We can skip all of this work if there
5332          * was no original vnode.
5333          */
5334         if (svd->vp != NULL) {
5335                 u_offset_t vp_off;
5336                 size_t vp_len;
5337                 struct anon *ap;
5338                 vnode_t *vp;
5339 
5340                 vp_off = off;
5341                 vp_len = len;
5342 
5343                 if (amp == NULL)
5344                         dogetpage = 1;
5345                 else {
5346                         /*
5347                          * Only acquire the reader lock, to prevent amp->ahp
5348                          * from being changed.  It's ok to miss pages,
5349                          * hence we don't call anon_array_enter().
5350                          */
5351                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5352                         ap = anon_get_ptr(amp->ahp, anon_index);
5353 
5354                         if (len <= PAGESIZE)
5355                                 /* inline non_anon() */
5356                                 dogetpage = (ap == NULL);
5357                         else
5358                                 dogetpage = non_anon(amp->ahp, anon_index,
5359                                     &vp_off, &vp_len);
5360                         ANON_LOCK_EXIT(&amp->a_rwlock);
5361                 }
5362 
5363                 if (dogetpage) {
5364                         enum seg_rw arw;
5365                         struct as *as = seg->s_as;
5366 
5367                         if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) {
5368                                 /*
5369                                  * Page list won't fit in local array,
5370                                  * allocate one of the needed size.
5371                                  */
5372                                 pl_alloc_sz =
5373                                     (btop(len) + 1) * sizeof (page_t *);
5374                                 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP);
5375                                 plp[0] = NULL;
5376                                 plsz = len;
5377                         } else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
5378                             svd->tr_state == SEGVN_TR_ON || rw == S_OTHER ||
5379                             (((size_t)(addr + PAGESIZE) <
5380                             (size_t)(seg->s_base + seg->s_size)) &&
5381                             hat_probe(as->a_hat, addr + PAGESIZE))) {
5382                                 /*
5383                                  * Ask VOP_GETPAGE to return the exact number
5384                                  * of pages if
5385                                  * (a) this is a COW fault, or
5386                                  * (b) this is a software fault, or
5387                                  * (c) next page is already mapped.
5388                                  */
5389                                 plsz = len;
5390                         } else {
5391                                 /*
5392                                  * Ask VOP_GETPAGE to return adjacent pages
5393                                  * within the segment.
5394                                  */
5395                                 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t)
5396                                     ((seg->s_base + seg->s_size) - addr));
5397                                 ASSERT((addr + plsz) <=
5398                                     (seg->s_base + seg->s_size));
5399                         }
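                             /*
                              * The plsz choice above can be modeled by a small
                              * helper (hypothetical name, illustrative only):
                              *
                              *    static size_t
                              *    choose_plsz(int want_exact, size_t len,
                              *        caddr_t addr, caddr_t seg_base,
                              *        size_t seg_size, size_t kluster_max)
                              *    {
                              *        size_t room =
                              *            (size_t)((seg_base + seg_size) - addr);
                              *
                              *        if (want_exact)   // COW, software fault,
                              *            return (len); // or next page mapped
                              *        return (room < kluster_max ?
                              *            room : kluster_max);
                              *    }
                              */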
5400 
5401                         /*
5402                          * Need to get some non-anonymous pages.
5403                          * We need to make only one call to VOP_GETPAGE here,
5404                          * to prevent certain deadlocking conditions that can
5405                          * arise when we are doing locking.  In this case
5406                          * non_anon() should have picked up the smallest
5407                          * range which includes all the non-anonymous
5408                          * pages in the requested range.  We have to
5409                          * be careful regarding which rw flag to pass in
5410                          * because on a private mapping, the underlying
5411                          * object is never allowed to be written.
5412                          */
5413                         if (rw == S_WRITE && svd->type == MAP_PRIVATE) {
5414                                 arw = S_READ;
5415                         } else {
5416                                 arw = rw;
5417                         }
5418                         vp = svd->vp;
5419                         TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
5420                             "segvn_getpage:seg %p addr %p vp %p",
5421                             seg, addr, vp);
5422                         err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len,
5423                             &vpprot, plp, plsz, seg, addr + (vp_off - off), arw,
5424                             svd->cred, NULL);
5425                         if (err) {
5426                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5427                                 segvn_pagelist_rele(plp);
5428                                 if (pl_alloc_sz)
5429                                         kmem_free(plp, pl_alloc_sz);
5430                                 return (FC_MAKE_ERR(err));
5431                         }
5432                         if (svd->type == MAP_PRIVATE)
5433                                 vpprot &= ~PROT_WRITE;
5434                 }
5435         }
5436 
5437         /*
5438          * N.B. at this time the plp array has all the needed non-anon
5439          * pages in addition to (possibly) having some adjacent pages.
5440          */
5441 
5442         /*
5443          * Always acquire the anon_array_lock to prevent
5444          * 2 threads from allocating separate anon slots for
5445          * the same "addr".
5446          *
5447          * If this is a copy-on-write fault and we don't already
5448          * have the anon_array_lock, acquire it to prevent the
5449          * fault routine from handling multiple copy-on-write faults
5450          * on the same "addr" in the same address space.
5451          *
5452          * Only one thread should deal with the fault since after
5453          * it is handled, the other threads can acquire a translation
5454          * to the newly created private page.  This prevents two or
5455          * more threads from creating different private pages for the
5456          * same fault.
5457          *
5458          * We grab "serialization" lock here if this is a MAP_PRIVATE segment
5459          * to prevent deadlock between this thread and another thread
5460          * which has soft-locked this page and wants to acquire serial_lock.
5461          * ( bug 4026339 )
5462          *
5463          * The fix for bug 4026339 becomes unnecessary when using the
5464          * locking scheme with a per-amp rwlock and a global set of hash
5465          * locks, anon_array_lock.  If we steal a vnode page when low
5466          * on memory and upgrade the page lock through page_rename,
5467          * then the page is PAGE_HANDLED, and nothing needs to be done
5468          * for this page after returning from segvn_faultpage.
5469          *
5470          * But really, the page lock should be downgraded after
5471          * the stolen page is page_rename'd.
5472          */
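             /*
              * The "global set of hash locks" idea referred to above can be
              * sketched in userland: rather than one lock per array slot, a
              * fault on index i serializes on a lock chosen by hashing i
              * into a small fixed pool.  Names are hypothetical and this
              * only models the anon_array_enter()/anon_array_exit() pattern;
              * each mutex must be set up with pthread_mutex_init() first.
              *
              *    #include <pthread.h>
              *
              *    #define IDX_NLOCKS    64
              *    static pthread_mutex_t idx_lock[IDX_NLOCKS];
              *
              *    void
              *    idx_enter(unsigned long index)
              *    {
              *        (void) pthread_mutex_lock(&idx_lock[index % IDX_NLOCKS]);
              *    }
              *
              *    void
              *    idx_exit(unsigned long index)
              *    {
              *        (void) pthread_mutex_unlock(&idx_lock[index % IDX_NLOCKS]);
              *    }
              */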
5473 
5474         if (amp != NULL)
5475                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5476 
5477         /*
5478          * Ok, now loop over the address range and handle faults
5479          */
5480         for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
5481                 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
5482                     type, rw, brkcow);
5483                 if (err) {
5484                         if (amp != NULL)
5485                                 ANON_LOCK_EXIT(&amp->a_rwlock);
5486                         if (type == F_SOFTLOCK && a > addr) {
5487                                 segvn_softunlock(seg, addr, (a - addr),
5488                                     S_OTHER);
5489                         }
5490                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5491                         segvn_pagelist_rele(plp);
5492                         if (pl_alloc_sz)
5493                                 kmem_free(plp, pl_alloc_sz);
5494                         return (err);
5495                 }
5496                 if (vpage) {
5497                         vpage++;
5498                 } else if (svd->vpage) {
5499                         page = seg_page(seg, addr);
5500                         vpage = &svd->vpage[++page];
5501                 }
5502         }
5503 
5504         /* Didn't get pages from the underlying fs so we're done */
5505         if (!dogetpage)
5506                 goto done;
5507 
5508         /*
5509          * Now handle any other pages in the list returned.
5510          * If the page can be used, load up the translations now.
5511          * Note that the for loop will only be entered if "plp"
5512          * is pointing to a non-NULL page pointer which means that
5513          * VOP_GETPAGE() was called and vpprot has been initialized.
5514          */
5515         if (svd->pageprot == 0)
5516                 prot = svd->prot & vpprot;
5517 
5518 
5519         /*
5520          * Large Files: diff should be an unsigned value because we started
5521          * supporting segment sizes > 2GB in 2.5.1, and when a
5522          * large file of size > 2GB gets mapped into the address space
5523          * the diff value can be > 2GB.
5524          */
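             /*
              * For example (illustrative only): with a 2.5GB file mapped at
              * offset 0, p_offset - svd->offset can be 0xA0000000.  A size_t
              * (32 or 64 bit) holds that exactly, while converting it to a
              * 32-bit signed type gives an implementation-defined (typically
              * negative) value:
              *
              *    u_offset_t p_off = 0xA0000000ULL;        // 2.5GB
              *    u_offset_t seg_off = 0;
              *    size_t diff = (size_t)(p_off - seg_off); // 0xA0000000
              *    int bad = (int)(p_off - seg_off);        // not usable
              */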
5525 
5526         for (ppp = plp; (pp = *ppp) != NULL; ppp++) {
5527                 size_t diff;
5528                 struct anon *ap;
5529                 int anon_index;
5530                 anon_sync_obj_t cookie;
5531                 int hat_flag = HAT_LOAD_ADV;
5532 
5533                 if (svd->flags & MAP_TEXT) {
5534                         hat_flag |= HAT_LOAD_TEXT;
5535                 }
5536 
5537                 if (pp == PAGE_HANDLED)
5538                         continue;
5539 
5540                 if (svd->tr_state != SEGVN_TR_ON &&
5541                     pp->p_offset >=  svd->offset &&
5542                     pp->p_offset < svd->offset + seg->s_size) {
5543 
5544                         diff = pp->p_offset - svd->offset;
5545 
5546                         /*
5547                          * Large Files: the following is the assertion
5548                          * validating the offset arithmetic above.
5549                          */
5550                         ASSERT(svd->vp == pp->p_vnode);
5551 
5552                         page = btop(diff);
5553                         if (svd->pageprot)
5554                                 prot = VPP_PROT(&svd->vpage[page]) & vpprot;
5555 
5556                         /*
5557                          * Prevent other threads in the address space from
5558                          * creating private pages (i.e., allocating anon slots)
5559                          * while we are in the process of loading translations
5560                          * to additional pages returned by the underlying
5561                          * object.
5562                          */
5563                         if (amp != NULL) {
5564                                 anon_index = svd->anon_index + page;
5565                                 anon_array_enter(amp, anon_index, &cookie);
5566                                 ap = anon_get_ptr(amp->ahp, anon_index);
5567                         }
5568                         if ((amp == NULL) || (ap == NULL)) {
5569                                 if (IS_VMODSORT(pp->p_vnode) ||
5570                                     enable_mbit_wa) {
5571                                         if (rw == S_WRITE)
5572                                                 hat_setmod(pp);
5573                                         else if (rw != S_OTHER &&
5574                                             !hat_ismod(pp))
5575                                                 prot &= ~PROT_WRITE;
5576                                 }
5577                                 /*
5578                                  * Skip mapping read ahead pages marked
5579                                  * for migration, so they will get migrated
5580                                  * properly on fault
5581                                  */
5582                                 ASSERT(amp == NULL ||
5583                                     svd->rcookie == HAT_INVALID_REGION_COOKIE);
5584                                 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) {
5585                                         hat_memload_region(hat,
5586                                             seg->s_base + diff,
5587                                             pp, prot, hat_flag,
5588                                             svd->rcookie);
5589                                 }
5590                         }
5591                         if (amp != NULL)
5592                                 anon_array_exit(&cookie);
5593                 }
5594                 page_unlock(pp);
5595         }
5596 done:
5597         if (amp != NULL)
5598                 ANON_LOCK_EXIT(&amp->a_rwlock);
5599         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5600         if (pl_alloc_sz)
5601                 kmem_free(plp, pl_alloc_sz);
5602         return (0);
5603 }
5604 
5605 /*
5606  * This routine is used to start I/O on pages asynchronously.  XXX it will
5607  * only create PAGESIZE pages. At fault time they will be relocated into
5608  * larger pages.
5609  */
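     /*
      * From userland, asynchronous fault-ahead of this sort is normally
      * requested with madvise(MADV_WILLNEED) on a mapped range; the exact
      * in-kernel call path is not shown in this file, so treat that as an
      * assumption.  Minimal sketch:
      *
      *    #include <sys/types.h>
      *    #include <sys/mman.h>
      *
      *    void
      *    prefetch_range(void *addr, size_t len)
      *    {
      *        // hint only; failure is not fatal to the caller
      *        (void) madvise((caddr_t)addr, len, MADV_WILLNEED);
      *    }
      */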
5610 static faultcode_t
5611 segvn_faulta(struct seg *seg, caddr_t addr)
5612 {
5613         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
5614         int err;
5615         struct anon_map *amp;
5616         vnode_t *vp;
5617 
5618         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5619 
5620         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
5621         if ((amp = svd->amp) != NULL) {
5622                 struct anon *ap;
5623 
5624                 /*
5625                  * Reader lock to prevent amp->ahp from being changed.
5626                  * This is advisory; it's ok to miss a page, so
5627                  * we don't take the anon_array_enter lock.
5628                  */
5629                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5630                 if ((ap = anon_get_ptr(amp->ahp,
5631                     svd->anon_index + seg_page(seg, addr))) != NULL) {
5632 
5633                         err = anon_getpage(&ap, NULL, NULL,
5634                             0, seg, addr, S_READ, svd->cred);
5635 
5636                         ANON_LOCK_EXIT(&amp->a_rwlock);
5637                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5638                         if (err)
5639                                 return (FC_MAKE_ERR(err));
5640                         return (0);
5641                 }
5642                 ANON_LOCK_EXIT(&amp->a_rwlock);
5643         }
5644 
5645         if (svd->vp == NULL) {
5646                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5647                 return (0);                     /* zfod page - do nothing now */
5648         }
5649 
5650         vp = svd->vp;
5651         TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
5652             "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp);
5653         err = VOP_GETPAGE(vp,
5654             (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)),
5655             PAGESIZE, NULL, NULL, 0, seg, addr,
5656             S_OTHER, svd->cred, NULL);
5657 
5658         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5659         if (err)
5660                 return (FC_MAKE_ERR(err));
5661         return (0);
5662 }
5663 
5664 static int
5665 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
5666 {
5667         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
5668         struct vpage *cvp, *svp, *evp;
5669         struct vnode *vp;
5670         size_t pgsz;
5671         pgcnt_t pgcnt;
5672         anon_sync_obj_t cookie;
5673         int unload_done = 0;
5674 
5675         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5676 
5677         if ((svd->maxprot & prot) != prot)
5678                 return (EACCES);                        /* violated maxprot */
5679 
5680         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5681 
5682         /* return if prot is the same */
5683         if (!svd->pageprot && svd->prot == prot) {
5684                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5685                 return (0);
5686         }
5687 
5688         /*
5689          * Since we change protections we first have to flush the cache.
5690          * This makes sure all the pagelock calls have to recheck
5691          * protections.
5692          */
5693         if (svd->softlockcnt > 0) {
5694                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
5695 
5696                 /*
5697                  * If this is a shared segment, a non-zero softlockcnt
5698                  * means locked pages are still in use.
5699                  */
5700                 if (svd->type == MAP_SHARED) {
5701                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5702                         return (EAGAIN);
5703                 }
5704 
5705                 /*
5706                  * Since we do have the segvn writers lock nobody can fill
5707                  * the cache with entries belonging to this seg during
5708                  * the purge. The flush either succeeds or we still have
5709                  * pending I/Os.
5710                  */
5711                 segvn_purge(seg);
5712                 if (svd->softlockcnt > 0) {
5713                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5714                         return (EAGAIN);
5715                 }
5716         }
5717 
5718         if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
5719                 ASSERT(svd->amp == NULL);
5720                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
5721                 hat_leave_region(seg->s_as->a_hat, svd->rcookie,
5722                     HAT_REGION_TEXT);
5723                 svd->rcookie = HAT_INVALID_REGION_COOKIE;
5724                 unload_done = 1;
5725         } else if (svd->tr_state == SEGVN_TR_INIT) {
5726                 svd->tr_state = SEGVN_TR_OFF;
5727         } else if (svd->tr_state == SEGVN_TR_ON) {
5728                 ASSERT(svd->amp != NULL);
5729                 segvn_textunrepl(seg, 0);
5730                 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
5731                 unload_done = 1;
5732         }
5733 
5734         if ((prot & PROT_WRITE) && svd->type == MAP_SHARED &&
5735             svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) {
5736                 ASSERT(vn_is_mapped(svd->vp, V_WRITE));
5737                 segvn_inval_trcache(svd->vp);
5738         }
5739         if (seg->s_szc != 0) {
5740                 int err;
5741                 pgsz = page_get_pagesize(seg->s_szc);
5742                 pgcnt = pgsz >> PAGESHIFT;
5743                 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
5744                 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
5745                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5746                         ASSERT(seg->s_base != addr || seg->s_size != len);
5747                         /*
5748                          * If we are holding the as lock as a reader then
5749                          * we need to return IE_RETRY and let the as
5750                          * layer drop and re-acquire the lock as a writer.
5751                          */
5752                         if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock))
5753                                 return (IE_RETRY);
5754                         VM_STAT_ADD(segvnvmstats.demoterange[1]);
5755                         if (svd->type == MAP_PRIVATE || svd->vp != NULL) {
5756                                 err = segvn_demote_range(seg, addr, len,
5757                                     SDR_END, 0);
5758                         } else {
5759                                 uint_t szcvec = map_pgszcvec(seg->s_base,
5760                                     pgsz, (uintptr_t)seg->s_base,
5761                                     (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0);
5762                                 err = segvn_demote_range(seg, addr, len,
5763                                     SDR_END, szcvec);
5764                         }
5765                         if (err == 0)
5766                                 return (IE_RETRY);
5767                         if (err == ENOMEM)
5768                                 return (IE_NOMEM);
5769                         return (err);
5770                 }
5771         }
5772 
5773 
5774         /*
5775          * If it's a private mapping and we're making it writable then we
5776          * may have to reserve the additional swap space now. If we are
5777          * making writable only a part of the segment then we use its vpage
5778          * array to keep a record of the pages for which we have reserved
5779          * swap. In this case we set the pageswap field in the segment's
5780          * segvn structure to record this.
5781          *
5782          * If it's a private mapping to a file (i.e., vp != NULL) and we're
5783          * removing write permission on the entire segment and we haven't
5784          * modified any pages, we can release the swap space.
5785          */
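             /*
              * Userland view of the reservation below: write-enabling a
              * private anonymous mapping is the point at which swap must be
              * reserved, so the mprotect() call itself can fail if the
              * reservation cannot be made (the exact errno is not specified
              * here).  Sketch with a hypothetical helper:
              *
              *    #include <sys/mman.h>
              *    #include <stdio.h>
              *
              *    int
              *    make_writable(void *p, size_t len)
              *    {
              *        // p was mapped PROT_READ, MAP_PRIVATE | MAP_ANON
              *        if (mprotect(p, len, PROT_READ | PROT_WRITE) != 0) {
              *            perror("mprotect");    // e.g. no swap to reserve
              *            return (-1);
              *        }
              *        return (0);
              *    }
              */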
5786         if (svd->type == MAP_PRIVATE) {
5787                 if (prot & PROT_WRITE) {
5788                         if (!(svd->flags & MAP_NORESERVE) &&
5789                             !(svd->swresv && svd->pageswap == 0)) {
5790                                 size_t sz = 0;
5791 
5792                                 /*
5793                                  * Start by determining how much swap
5794                                  * space is required.
5795                                  */
5796                                 if (addr == seg->s_base &&
5797                                     len == seg->s_size &&
5798                                     svd->pageswap == 0) {
5799                                         /* The whole segment */
5800                                         sz = seg->s_size;
5801                                 } else {
5802                                         /*
5803                                          * Make sure that the vpage array
5804                                          * exists, and make a note of the
5805                                          * range of elements corresponding
5806                                          * to len.
5807                                          */
5808                                         segvn_vpage(seg);
5809                                         if (svd->vpage == NULL) {
5810                                                 SEGVN_LOCK_EXIT(seg->s_as,
5811                                                     &svd->lock);
5812                                                 return (ENOMEM);
5813                                         }
5814                                         svp = &svd->vpage[seg_page(seg, addr)];
5815                                         evp = &svd->vpage[seg_page(seg,
5816                                             addr + len)];
5817 
5818                                         if (svd->pageswap == 0) {
5819                                                 /*
5820                                                  * This is the first time we've
5821                                                  * asked for a part of this
5822                                                  * segment, so we need to
5823                                                  * reserve everything we've
5824                                                  * been asked for.
5825                                                  */
5826                                                 sz = len;
5827                                         } else {
5828                                                 /*
5829                                                  * We have to count the number
5830                                                  * of pages required.
5831                                                  */
5832                                                 for (cvp = svp;  cvp < evp;
5833                                                     cvp++) {
5834                                                         if (!VPP_ISSWAPRES(cvp))
5835                                                                 sz++;
5836                                                 }
5837                                                 sz <<= PAGESHIFT;
5838                                         }
5839                                 }
5840 
5841                                 /* Try to reserve the necessary swap. */
5842                                 if (anon_resv_zone(sz,
5843                                     seg->s_as->a_proc->p_zone) == 0) {
5844                                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5845                                         return (IE_NOMEM);
5846                                 }
5847 
5848                                 /*
5849                                  * Make a note of how much swap space
5850                                  * we've reserved.
5851                                  */
5852                                 if (svd->pageswap == 0 && sz == seg->s_size) {
5853                                         svd->swresv = sz;
5854                                 } else {
5855                                         ASSERT(svd->vpage != NULL);
5856                                         svd->swresv += sz;
5857                                         svd->pageswap = 1;
5858                                         for (cvp = svp; cvp < evp; cvp++) {
5859                                                 if (!VPP_ISSWAPRES(cvp))
5860                                                         VPP_SETSWAPRES(cvp);
5861                                         }
5862                                 }
5863                         }
5864                 } else {
5865                         /*
5866                          * Swap space is released only if this segment
5867                          * does not map anonymous memory, since read faults
5868                          * on such segments still need an anon slot to read
5869                          * in the data.
5870                          */
5871                         if (svd->swresv != 0 && svd->vp != NULL &&
5872                             svd->amp == NULL && addr == seg->s_base &&
5873                             len == seg->s_size && svd->pageprot == 0) {
5874                                 ASSERT(svd->pageswap == 0);
5875                                 anon_unresv_zone(svd->swresv,
5876                                     seg->s_as->a_proc->p_zone);
5877                                 svd->swresv = 0;
5878                                 TRACE_3(TR_FAC_VM, TR_ANON_PROC,
5879                                     "anon proc:%p %lu %u", seg, 0, 0);
5880                         }
5881                 }
5882         }
5883 
5884         if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) {
5885                 if (svd->prot == prot) {
5886                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5887                         return (0);                     /* all done */
5888                 }
5889                 svd->prot = (uchar_t)prot;
5890         } else if (svd->type == MAP_PRIVATE) {
5891                 struct anon *ap = NULL;
5892                 page_t *pp;
5893                 u_offset_t offset, off;
5894                 struct anon_map *amp;
5895                 ulong_t anon_idx = 0;
5896 
5897                 /*
5898                  * A vpage structure exists or else the change does not
5899                  * involve the entire segment.  Establish a vpage structure
5900                  * if none is there.  Then, for each page in the range,
5901                  * adjust its individual permissions.  Note that write-
5902                  * enabling a MAP_PRIVATE page can affect the claims for
5903                  * locked down memory.  Overcommitting memory terminates
5904                  * the operation.
5905                  */
5906                 segvn_vpage(seg);
5907                 if (svd->vpage == NULL) {
5908                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5909                         return (ENOMEM);
5910                 }
5911                 svd->pageprot = 1;
5912                 if ((amp = svd->amp) != NULL) {
5913                         anon_idx = svd->anon_index + seg_page(seg, addr);
5914                         ASSERT(seg->s_szc == 0 ||
5915                             IS_P2ALIGNED(anon_idx, pgcnt));
5916                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5917                 }
5918 
5919                 offset = svd->offset + (uintptr_t)(addr - seg->s_base);
5920                 evp = &svd->vpage[seg_page(seg, addr + len)];
5921 
5922                 /*
5923                  * See the statement at the beginning of segvn_lockop regarding
5924                  * the way cowcnts and lckcnts are handled.
5925                  */
5926                 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
5927 
5928                         if (seg->s_szc != 0) {
5929                                 if (amp != NULL) {
5930                                         anon_array_enter(amp, anon_idx,
5931                                             &cookie);
5932                                 }
5933                                 if (IS_P2ALIGNED(anon_idx, pgcnt) &&
5934                                     !segvn_claim_pages(seg, svp, offset,
5935                                     anon_idx, prot)) {
5936                                         if (amp != NULL) {
5937                                                 anon_array_exit(&cookie);
5938                                         }
5939                                         break;
5940                                 }
5941                                 if (amp != NULL) {
5942                                         anon_array_exit(&cookie);
5943                                 }
5944                                 anon_idx++;
5945                         } else {
5946                                 if (amp != NULL) {
5947                                         anon_array_enter(amp, anon_idx,
5948                                             &cookie);
5949                                         ap = anon_get_ptr(amp->ahp, anon_idx++);
5950                                 }
5951 
5952                                 if (VPP_ISPPLOCK(svp) &&
5953                                     VPP_PROT(svp) != prot) {
5954 
5955                                         if (amp == NULL || ap == NULL) {
5956                                                 vp = svd->vp;
5957                                                 off = offset;
5958                                         } else
5959                                                 swap_xlate(ap, &vp, &off);
5960                                         if (amp != NULL)
5961                                                 anon_array_exit(&cookie);
5962 
5963                                         if ((pp = page_lookup(vp, off,
5964                                             SE_SHARED)) == NULL) {
5965                                                 panic("segvn_setprot: no page");
5966                                                 /*NOTREACHED*/
5967                                         }
5968                                         ASSERT(seg->s_szc == 0);
5969                                         if ((VPP_PROT(svp) ^ prot) &
5970                                             PROT_WRITE) {
5971                                                 if (prot & PROT_WRITE) {
5972                                                         if (!page_addclaim(
5973                                                             pp)) {
5974                                                                 page_unlock(pp);
5975                                                                 break;
5976                                                         }
5977                                                 } else {
5978                                                         if (!page_subclaim(
5979                                                             pp)) {
5980                                                                 page_unlock(pp);
5981                                                                 break;
5982                                                         }
5983                                                 }
5984                                         }
5985                                         page_unlock(pp);
5986                                 } else if (amp != NULL)
5987                                         anon_array_exit(&cookie);
5988                         }
5989                         VPP_SETPROT(svp, prot);
5990                         offset += PAGESIZE;
5991                 }
5992                 if (amp != NULL)
5993                         ANON_LOCK_EXIT(&amp->a_rwlock);
5994 
5995                 /*
5996                  * Did we terminate prematurely?  If so, simply unload
5997                  * the translations for the pages we've updated so far.
5998                  */
5999                 if (svp != evp) {
6000                         if (unload_done) {
6001                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6002                                 return (IE_NOMEM);
6003                         }
6004                         len = (svp - &svd->vpage[seg_page(seg, addr)]) *
6005                             PAGESIZE;
6006                         ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz));
6007                         if (len != 0)
6008                                 hat_unload(seg->s_as->a_hat, addr,
6009                                     len, HAT_UNLOAD);
6010                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6011                         return (IE_NOMEM);
6012                 }
6013         } else {
6014                 segvn_vpage(seg);
6015                 if (svd->vpage == NULL) {
6016                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6017                         return (ENOMEM);
6018                 }
6019                 svd->pageprot = 1;
6020                 evp = &svd->vpage[seg_page(seg, addr + len)];
6021                 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
6022                         VPP_SETPROT(svp, prot);
6023                 }
6024         }
6025 
6026         if (unload_done) {
6027                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6028                 return (0);
6029         }
6030 
6031         if (((prot & PROT_WRITE) != 0 &&
6032             (svd->vp != NULL || svd->type == MAP_PRIVATE)) ||
6033             (prot & ~PROT_USER) == PROT_NONE) {
6034                 /*
6035                  * Either this is private or shared data and write access
6036                  * is being granted, in which case we must throw out all
6037                  * former translations so that the right ones are set up
6038                  * on fault and so that write access is not allowed to any
6039                  * copy-on-write pages that might be around or to pages
6040                  * representing holes in a file; or we don't have permission
6041                  * to access the memory at all, in which case we must
6042                  * unload any current translations that might exist.
6043                  */
6044                 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
6045         } else {
6046                 /*
6047                  * A shared mapping, or a private mapping for which write
6048                  * access is being denied - just change the protections
6049                  * over the range of addresses in question.  segvn does
6050                  * not support any attributes other than prot, so we can
6051                  * use hat_chgattr.
6052                  */
6053                 hat_chgattr(seg->s_as->a_hat, addr, len, prot);
6054         }
6055 
6056         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6057 
6058         return (0);
6059 }
6060 
6061 /*
6062  * segvn_setpagesize is called via segop_setpagesize from as_setpagesize,
6063  * to determine if the seg is capable of mapping the requested szc.
6064  */
6065 static int
6066 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
6067 {
6068         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6069         struct segvn_data *nsvd;
6070         struct anon_map *amp = svd->amp;
6071         struct seg *nseg;
6072         caddr_t eaddr = addr + len, a;
6073         size_t pgsz = page_get_pagesize(szc);
6074         pgcnt_t pgcnt = page_get_pagecnt(szc);
6075         int err;
6076         u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base);
6077 
6078         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
6079         ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
6080 
6081         if (seg->s_szc == szc || segvn_lpg_disable != 0) {
6082                 return (0);
6083         }
6084 
6085         /*
6086          * addr should always be pgsz aligned but eaddr may be misaligned if
6087          * it's at the end of the segment.
6088          *
6089          * XXX we should assert this condition since as_setpagesize() logic
6090          * guarantees it.
6091          */
6092         if (!IS_P2ALIGNED(addr, pgsz) ||
6093             (!IS_P2ALIGNED(eaddr, pgsz) &&
6094             eaddr != seg->s_base + seg->s_size)) {
6095 
6096                 segvn_setpgsz_align_err++;
6097                 return (EINVAL);
6098         }
6099 
6100         if (amp != NULL && svd->type == MAP_SHARED) {
6101                 ulong_t an_idx = svd->anon_index + seg_page(seg, addr);
6102                 if (!IS_P2ALIGNED(an_idx, pgcnt)) {
6103 
6104                         segvn_setpgsz_anon_align_err++;
6105                         return (EINVAL);
6106                 }
6107         }
6108 
6109         if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas ||
6110             szc > segvn_maxpgszc) {
6111                 return (EINVAL);
6112         }
6113 
6114         /* paranoid check */
6115         if (svd->vp != NULL &&
6116             (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) {
6117                 return (EINVAL);
6118         }
6119 
6120         if (seg->s_szc == 0 && svd->vp != NULL &&
6121             map_addr_vacalign_check(addr, off)) {
6122                 return (EINVAL);
6123         }
6124 
6125         /*
6126          * Check that protections are the same within new page
6127          * size boundaries.
6128          */
6129         if (svd->pageprot) {
6130                 for (a = addr; a < eaddr; a += pgsz) {
6131                         if ((a + pgsz) > eaddr) {
6132                                 if (!sameprot(seg, a, eaddr - a)) {
6133                                         return (EINVAL);
6134                                 }
6135                         } else {
6136                                 if (!sameprot(seg, a, pgsz)) {
6137                                         return (EINVAL);
6138                                 }
6139                         }
6140                 }
6141         }
6142 
6143         /*
6144          * Since we are changing the page size we first have to flush
6145          * the pagelock cache.  This forces all subsequent pagelock
6146          * calls to recheck protections.
6147          */
6148         if (svd->softlockcnt > 0) {
6149                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
6150 
6151                 /*
6152                  * If this is a shared segment, a nonzero softlockcnt
6153                  * means locked pages are still in use.
6154                  */
6155                 if (svd->type == MAP_SHARED) {
6156                         return (EAGAIN);
6157                 }
6158 
6159                 /*
6160                  * Since we hold the segvn writer's lock, nobody can fill
6161                  * the cache with entries belonging to this seg during
6162                  * the purge.  Either the flush succeeds or we still have
6163                  * pending I/Os.
6164                  */
6165                 segvn_purge(seg);
6166                 if (svd->softlockcnt > 0) {
6167                         return (EAGAIN);
6168                 }
6169         }
6170 
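        /*
         * Tear down any shared HAT text region and turn off text
         * replication before changing the page size.
         */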
6171         if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
6172                 ASSERT(svd->amp == NULL);
6173                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
6174                 hat_leave_region(seg->s_as->a_hat, svd->rcookie,
6175                     HAT_REGION_TEXT);
6176                 svd->rcookie = HAT_INVALID_REGION_COOKIE;
6177         } else if (svd->tr_state == SEGVN_TR_INIT) {
6178                 svd->tr_state = SEGVN_TR_OFF;
6179         } else if (svd->tr_state == SEGVN_TR_ON) {
6180                 ASSERT(svd->amp != NULL);
6181                 segvn_textunrepl(seg, 1);
6182                 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
6183                 amp = NULL;
6184         }
6185 
6186         /*
6187          * Operation on a sub-range of the existing segment.
6188          */
6189         if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) {
6190                 if (szc < seg->s_szc) {
6191                         VM_STAT_ADD(segvnvmstats.demoterange[2]);
6192                         err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0);
6193                         if (err == 0) {
6194                                 return (IE_RETRY);
6195                         }
6196                         if (err == ENOMEM) {
6197                                 return (IE_NOMEM);
6198                         }
6199                         return (err);
6200                 }
6201                 if (addr != seg->s_base) {
6202                         nseg = segvn_split_seg(seg, addr);
6203                         if (eaddr != (nseg->s_base + nseg->s_size)) {
6204                                 /* eaddr is szc aligned */
6205                                 (void) segvn_split_seg(nseg, eaddr);
6206                         }
6207                         return (IE_RETRY);
6208                 }
6209                 if (eaddr != (seg->s_base + seg->s_size)) {
6210                         /* eaddr is szc aligned */
6211                         (void) segvn_split_seg(seg, eaddr);
6212                 }
6213                 return (IE_RETRY);
6214         }
6215 
6216         /*
6217          * Break any low level sharing and reset seg->s_szc to 0.
6218          */
6219         if ((err = segvn_clrszc(seg)) != 0) {
6220                 if (err == ENOMEM) {
6221                         err = IE_NOMEM;
6222                 }
6223                 return (err);
6224         }
6225         ASSERT(seg->s_szc == 0);
6226 
6227         /*
6228          * If the end of the current segment is not pgsz aligned
6229          * then attempt to concatenate with the next segment.
6230          */
6231         if (!IS_P2ALIGNED(eaddr, pgsz)) {
6232                 nseg = AS_SEGNEXT(seg->s_as, seg);
6233                 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) {
6234                         return (ENOMEM);
6235                 }
6236                 if (nseg->s_ops != &segvn_ops) {
6237                         return (EINVAL);
6238                 }
6239                 nsvd = (struct segvn_data *)nseg->s_data;
6240                 if (nsvd->softlockcnt > 0) {
6241                         /*
6242                          * If this is a shared segment, a nonzero softlockcnt
6243                          * means locked pages are still in use.
6244                          */
6245                         if (nsvd->type == MAP_SHARED) {
6246                                 return (EAGAIN);
6247                         }
6248                         segvn_purge(nseg);
6249                         if (nsvd->softlockcnt > 0) {
6250                                 return (EAGAIN);
6251                         }
6252                 }
6253                 err = segvn_clrszc(nseg);
6254                 if (err == ENOMEM) {
6255                         err = IE_NOMEM;
6256                 }
6257                 if (err != 0) {
6258                         return (err);
6259                 }
6260                 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
6261                 err = segvn_concat(seg, nseg, 1);
6262                 if (err == -1) {
6263                         return (EINVAL);
6264                 }
6265                 if (err == -2) {
6266                         return (IE_NOMEM);
6267                 }
6268                 return (IE_RETRY);
6269         }
6270 
6271         /*
6272          * May need to re-align anon array to
6273          * new szc.
6274          */
6275         if (amp != NULL) {
6276                 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
6277                         struct anon_hdr *nahp;
6278 
6279                         ASSERT(svd->type == MAP_PRIVATE);
6280 
6281                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6282                         ASSERT(amp->refcnt == 1);
6283                         nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
6284                         if (nahp == NULL) {
6285                                 ANON_LOCK_EXIT(&amp->a_rwlock);
6286                                 return (IE_NOMEM);
6287                         }
6288                         if (anon_copy_ptr(amp->ahp, svd->anon_index,
6289                             nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) {
6290                                 anon_release(nahp, btop(amp->size));
6291                                 ANON_LOCK_EXIT(&amp->a_rwlock);
6292                                 return (IE_NOMEM);
6293                         }
6294                         anon_release(amp->ahp, btop(amp->size));
6295                         amp->ahp = nahp;
6296                         svd->anon_index = 0;
6297                         ANON_LOCK_EXIT(&amp->a_rwlock);
6298                 }
6299         }
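        /*
         * For vnode backed segments make sure the file extends at least
         * to the end of the segment and fill any copy-on-write holes in
         * the anon map before enabling the larger page size.
         */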
6300         if (svd->vp != NULL && szc != 0) {
6301                 struct vattr va;
6302                 u_offset_t eoffpage = svd->offset;
6303                 va.va_mask = AT_SIZE;
6304                 eoffpage += seg->s_size;
6305                 eoffpage = btopr(eoffpage);
6306                 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred, NULL) != 0) {
6307                         segvn_setpgsz_getattr_err++;
6308                         return (EINVAL);
6309                 }
6310                 if (btopr(va.va_size) < eoffpage) {
6311                         segvn_setpgsz_eof_err++;
6312                         return (EINVAL);
6313                 }
6314                 if (amp != NULL) {
6315                         /*
6316                          * anon_fill_cow_holes() may call VOP_GETPAGE().
6317                          * Don't take the anon map lock here to avoid holding
6318                          * it across VOP_GETPAGE() calls that may call back
6319                          * into segvn for klustering checks.  We don't really
6320                          * need the anon map lock here since this is a private
6321                          * segment and we hold the as-level lock as writers.
6322                          */
6323                         if ((err = anon_fill_cow_holes(seg, seg->s_base,
6324                             amp->ahp, svd->anon_index, svd->vp, svd->offset,
6325                             seg->s_size, szc, svd->prot, svd->vpage,
6326                             svd->cred)) != 0) {
6327                                 return (EINVAL);
6328                         }
6329                 }
6330                 segvn_setvnode_mpss(svd->vp);
6331         }
6332 
6333         if (amp != NULL) {
6334                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6335                 if (svd->type == MAP_PRIVATE) {
6336                         amp->a_szc = szc;
6337                 } else if (szc > amp->a_szc) {
6338                         amp->a_szc = szc;
6339                 }
6340                 ANON_LOCK_EXIT(&amp->a_rwlock);
6341         }
6342 
6343         seg->s_szc = szc;
6344 
6345         return (0);
6346 }
6347 
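/*
 * Break up any large pages in the segment and reset its page size back to
 * the base page size.  For private segments with anon memory this demotes
 * the underlying anon pages (or replaces still-shared slots with private
 * copies); existing mappings are unloaded so they get re-established at
 * the new size on the next fault.
 */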
6348 static int
6349 segvn_clrszc(struct seg *seg)
6350 {
6351         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6352         struct anon_map *amp = svd->amp;
6353         size_t pgsz;
6354         pgcnt_t pages;
6355         int err = 0;
6356         caddr_t a = seg->s_base;
6357         caddr_t ea = a + seg->s_size;
6358         ulong_t an_idx = svd->anon_index;
6359         vnode_t *vp = svd->vp;
6360         struct vpage *vpage = svd->vpage;
6361         page_t *anon_pl[1 + 1], *pp;
6362         struct anon *ap, *oldap;
6363         uint_t prot = svd->prot, vpprot;
6364         int pageflag = 0;
6365 
6366         ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
6367             SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
6368         ASSERT(svd->softlockcnt == 0);
6369 
6370         if (vp == NULL && amp == NULL) {
6371                 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6372                 seg->s_szc = 0;
6373                 return (0);
6374         }
6375 
6376         if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
6377                 ASSERT(svd->amp == NULL);
6378                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
6379                 hat_leave_region(seg->s_as->a_hat, svd->rcookie,
6380                     HAT_REGION_TEXT);
6381                 svd->rcookie = HAT_INVALID_REGION_COOKIE;
6382         } else if (svd->tr_state == SEGVN_TR_ON) {
6383                 ASSERT(svd->amp != NULL);
6384                 segvn_textunrepl(seg, 1);
6385                 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
6386                 amp = NULL;
6387         } else {
6388                 if (svd->tr_state != SEGVN_TR_OFF) {
6389                         ASSERT(svd->tr_state == SEGVN_TR_INIT);
6390                         svd->tr_state = SEGVN_TR_OFF;
6391                 }
6392 
6393                 /*
6394                  * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
6395                  * unload argument is 0 when we are freeing the segment
6396                  * and unload was already done.
6397                  */
6398                 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
6399                     HAT_UNLOAD_UNMAP);
6400         }
6401 
6402         if (amp == NULL || svd->type == MAP_SHARED) {
6403                 seg->s_szc = 0;
6404                 return (0);
6405         }
6406 
6407         pgsz = page_get_pagesize(seg->s_szc);
6408         pages = btop(pgsz);
6409 
6410         /*
6411          * XXX anon rwlock is not really needed because this is a
6412          * private segment and we are writers.
6413          */
6414         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6415 
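        /*
         * Walk the segment one large page region at a time.  If the
         * segment still has a large page size, demote the constituent
         * anon pages; otherwise replace any anon slots that are still
         * shared (refcnt > 1) with private copies.
         */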
6416         for (; a < ea; a += pgsz, an_idx += pages) {
6417                 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) {
6418                         ASSERT(vpage != NULL || svd->pageprot == 0);
6419                         if (vpage != NULL) {
6420                                 ASSERT(sameprot(seg, a, pgsz));
6421                                 prot = VPP_PROT(vpage);
6422                                 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0;
6423                         }
6424                         if (seg->s_szc != 0) {
6425                                 ASSERT(vp == NULL || anon_pages(amp->ahp,
6426                                     an_idx, pages) == pages);
6427                                 if ((err = anon_map_demotepages(amp, an_idx,
6428                                     seg, a, prot, vpage, svd->cred)) != 0) {
6429                                         goto out;
6430                                 }
6431                         } else {
6432                                 if (oldap->an_refcnt == 1) {
6433                                         continue;
6434                                 }
6435                                 if ((err = anon_getpage(&oldap, &vpprot,
6436                                     anon_pl, PAGESIZE, seg, a, S_READ,
6437                                     svd->cred))) {
6438                                         goto out;
6439                                 }
6440                                 if ((pp = anon_private(&ap, seg, a, prot,
6441                                     anon_pl[0], pageflag, svd->cred)) == NULL) {
6442                                         err = ENOMEM;
6443                                         goto out;
6444                                 }
6445                                 anon_decref(oldap);
6446                                 (void) anon_set_ptr(amp->ahp, an_idx, ap,
6447                                     ANON_SLEEP);
6448                                 page_unlock(pp);
6449                         }
6450                 }
6451                 vpage = (vpage == NULL) ? NULL : vpage + pages;
6452         }
6453 
6454         amp->a_szc = 0;
6455         seg->s_szc = 0;
6456 out:
6457         ANON_LOCK_EXIT(&amp->a_rwlock);
6458         return (err);
6459 }
6460 
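/*
 * Adjust the lock/COW claims on the locked pages within one large page
 * when PROT_WRITE is being added to or removed from the protections of a
 * MAP_PRIVATE segment.  Returns nonzero on success and 0 on failure.
 */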
6461 static int
6462 segvn_claim_pages(
6463         struct seg *seg,
6464         struct vpage *svp,
6465         u_offset_t off,
6466         ulong_t anon_idx,
6467         uint_t prot)
6468 {
6469         pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
6470         size_t ppasize = (pgcnt + 1) * sizeof (page_t *);
6471         page_t  **ppa;
6472         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6473         struct anon_map *amp = svd->amp;
6474         struct vpage *evp = svp + pgcnt;
6475         caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT)
6476             + seg->s_base;
6477         struct anon *ap;
6478         struct vnode *vp = svd->vp;
6479         page_t *pp;
6480         pgcnt_t pg_idx, i;
6481         int err = 0;
6482         anoff_t aoff;
6483         int anon = (amp != NULL) ? 1 : 0;
6484 
6485         ASSERT(svd->type == MAP_PRIVATE);
6486         ASSERT(svd->vpage != NULL);
6487         ASSERT(seg->s_szc != 0);
6488         ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
6489         ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt));
6490         ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT));
6491 
6492         if (VPP_PROT(svp) == prot)
6493                 return (1);
6494         if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE))
6495                 return (1);
6496 
6497         ppa = kmem_alloc(ppasize, KM_SLEEP);
6498         if (anon && vp != NULL) {
6499                 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) {
6500                         anon = 0;
6501                         ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt));
6502                 }
6503                 ASSERT(!anon ||
6504                     anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt);
6505         }
6506 
6507         for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) {
6508                 if (!VPP_ISPPLOCK(svp))
6509                         continue;
6510                 if (anon) {
6511                         ap = anon_get_ptr(amp->ahp, anon_idx);
6512                         if (ap == NULL) {
6513                                 panic("segvn_claim_pages: no anon slot");
6514                         }
6515                         swap_xlate(ap, &vp, &aoff);
6516                         off = (u_offset_t)aoff;
6517                 }
6518                 ASSERT(vp != NULL);
6519                 if ((pp = page_lookup(vp,
6520                     (u_offset_t)off, SE_SHARED)) == NULL) {
6521                         panic("segvn_claim_pages: no page");
6522                 }
6523                 ppa[pg_idx++] = pp;
6524                 off += PAGESIZE;
6525         }
6526 
6527         if (ppa[0] == NULL) {
6528                 kmem_free(ppa, ppasize);
6529                 return (1);
6530         }
6531 
6532         ASSERT(pg_idx <= pgcnt);
6533         ppa[pg_idx] = NULL;
6534
6536         /* Find each large page within ppa, and adjust its claim */
6537 
6538         /* Does ppa cover a single large page? */
6539         if (ppa[0]->p_szc == seg->s_szc) {
6540                 if (prot & PROT_WRITE)
6541                         err = page_addclaim_pages(ppa);
6542                 else
6543                         err = page_subclaim_pages(ppa);
6544         } else {
6545                 for (i = 0; ppa[i]; i += pgcnt) {
6546                         ASSERT(IS_P2ALIGNED(page_pptonum(ppa[i]), pgcnt));
6547                         if (prot & PROT_WRITE)
6548                                 err = page_addclaim_pages(&ppa[i]);
6549                         else
6550                                 err = page_subclaim_pages(&ppa[i]);
6551                         if (err == 0)
6552                                 break;
6553                 }
6554         }
6555 
6556         for (i = 0; i < pg_idx; i++) {
6557                 ASSERT(ppa[i] != NULL);
6558                 page_unlock(ppa[i]);
6559         }
6560 
6561         kmem_free(ppa, ppasize);
6562         return (err);
6563 }
6564 
6565 /*
6566  * Returns the right (upper address) segment if a split occurred.
6567  * If the address is equal to the beginning or end of the segment, the
6568  * current segment is returned.
6569  */
6570 static struct seg *
6571 segvn_split_seg(struct seg *seg, caddr_t addr)
6572 {
6573         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6574         struct seg *nseg;
6575         size_t nsize;
6576         struct segvn_data *nsvd;
6577 
6578         ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
6579         ASSERT(svd->tr_state == SEGVN_TR_OFF);
6580 
6581         ASSERT(addr >= seg->s_base);
6582         ASSERT(addr <= seg->s_base + seg->s_size);
6583         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6584 
6585         if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
6586                 return (seg);
6587 
6588         nsize = seg->s_base + seg->s_size - addr;
6589         seg->s_size = addr - seg->s_base;
6590         nseg = seg_alloc(seg->s_as, addr, nsize);
6591         ASSERT(nseg != NULL);
6592         nseg->s_ops = seg->s_ops;
6593         nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
6594         nseg->s_data = (void *)nsvd;
6595         nseg->s_szc = seg->s_szc;
6596         *nsvd = *svd;
6597         ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
6598         nsvd->seg = nseg;
6599         rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);
6600 
6601         if (nsvd->vp != NULL) {
6602                 VN_HOLD(nsvd->vp);
6603                 nsvd->offset = svd->offset +
6604                     (uintptr_t)(nseg->s_base - seg->s_base);
6605                 if (nsvd->type == MAP_SHARED)
6606                         lgrp_shm_policy_init(NULL, nsvd->vp);
6607         } else {
6608                 /*
6609                  * The offset for an anonymous segment has no significance in
6610                  * terms of an offset into a file. If we were to use the above
6611                  * calculation instead, the structures read out of
6612                  * /proc/<pid>/xmap would be more difficult to decipher since
6613                  * it would be unclear whether two seemingly contiguous
6614                  * prxmap_t structures represented different segments or a
6615                  * single segment that had been split up into multiple prxmap_t
6616                  * structures (e.g. if some part of the segment had not yet
6617                  * been faulted in).
6618                  */
6619                 nsvd->offset = 0;
6620         }
6621 
6622         ASSERT(svd->softlockcnt == 0);
6623         ASSERT(svd->softlockcnt_sbase == 0);
6624         ASSERT(svd->softlockcnt_send == 0);
6625         crhold(svd->cred);
6626 
6627         if (svd->vpage != NULL) {
6628                 size_t bytes = vpgtob(seg_pages(seg));
6629                 size_t nbytes = vpgtob(seg_pages(nseg));
6630                 struct vpage *ovpage = svd->vpage;
6631 
6632                 svd->vpage = kmem_alloc(bytes, KM_SLEEP);
6633                 bcopy(ovpage, svd->vpage, bytes);
6634                 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
6635                 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes);
6636                 kmem_free(ovpage, bytes + nbytes);
6637         }
6638         if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
6639                 struct anon_map *oamp = svd->amp, *namp;
6640                 struct anon_hdr *nahp;
6641 
6642                 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER);
6643                 ASSERT(oamp->refcnt == 1);
6644                 nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
6645                 (void) anon_copy_ptr(oamp->ahp, svd->anon_index,
6646                     nahp, 0, btop(seg->s_size), ANON_SLEEP);
6647 
6648                 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
6649                 namp->a_szc = nseg->s_szc;
6650                 (void) anon_copy_ptr(oamp->ahp,
6651                     svd->anon_index + btop(seg->s_size),
6652                     namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
6653                 anon_release(oamp->ahp, btop(oamp->size));
6654                 oamp->ahp = nahp;
6655                 oamp->size = seg->s_size;
6656                 svd->anon_index = 0;
6657                 nsvd->amp = namp;
6658                 nsvd->anon_index = 0;
6659                 ANON_LOCK_EXIT(&oamp->a_rwlock);
6660         } else if (svd->amp != NULL) {
6661                 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
6662                 ASSERT(svd->amp == nsvd->amp);
6663                 ASSERT(seg->s_szc <= svd->amp->a_szc);
6664                 nsvd->anon_index = svd->anon_index + seg_pages(seg);
6665                 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt));
6666                 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER);
6667                 svd->amp->refcnt++;
6668                 ANON_LOCK_EXIT(&svd->amp->a_rwlock);
6669         }
6670 
6671         /*
6672          * Split the amount of swap reserved.
6673          */
6674         if (svd->swresv) {
6675                 /*
6676                  * For MAP_NORESERVE, only allocate swap reserve for pages
6677                  * being used.  Other segments get enough to cover whole
6678                  * segment.
6679                  */
6680                 if (svd->flags & MAP_NORESERVE) {
6681                         size_t  oswresv;
6682 
6683                         ASSERT(svd->amp);
6684                         oswresv = svd->swresv;
6685                         svd->swresv = ptob(anon_pages(svd->amp->ahp,
6686                             svd->anon_index, btop(seg->s_size)));
6687                         nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
6688                             nsvd->anon_index, btop(nseg->s_size)));
6689                         ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
6690                 } else {
6691                         if (svd->pageswap) {
6692                                 svd->swresv = segvn_count_swap_by_vpages(seg);
6693                                 ASSERT(nsvd->swresv >= svd->swresv);
6694                                 nsvd->swresv -= svd->swresv;
6695                         } else {
6696                                 ASSERT(svd->swresv == seg->s_size +
6697                                     nseg->s_size);
6698                                 svd->swresv = seg->s_size;
6699                                 nsvd->swresv = nseg->s_size;
6700                         }
6701                 }
6702         }
6703 
6704         return (nseg);
6705 }
6706 
6707 /*
6708  * Called on memory operations (unmap, setprot, setpagesize) for a subset
6709  * of a large page segment to either demote the memory range (SDR_RANGE)
6710  * or the ends (SDR_END) by addr/len.
6711  *
6712  * Returns 0 on success; returns errno, including ENOMEM, on failure.
6713  */
6714 static int
6715 segvn_demote_range(
6716         struct seg *seg,
6717         caddr_t addr,
6718         size_t len,
6719         int flag,
6720         uint_t szcvec)
6721 {
6722         caddr_t eaddr = addr + len;
6723         caddr_t lpgaddr, lpgeaddr;
6724         struct seg *nseg;
6725         struct seg *badseg1 = NULL;
6726         struct seg *badseg2 = NULL;
6727         size_t pgsz;
6728         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6729         int err;
6730         uint_t szc = seg->s_szc;
6731         uint_t tszcvec;
6732 
6733         ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
6734         ASSERT(svd->tr_state == SEGVN_TR_OFF);
6735         ASSERT(szc != 0);
6736         pgsz = page_get_pagesize(szc);
6737         ASSERT(seg->s_base != addr || seg->s_size != len);
6738         ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
6739         ASSERT(svd->softlockcnt == 0);
6740         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6741         ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED));
6742 
6743         CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
6744         ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr);
6745         if (flag == SDR_RANGE) {
6746                 /* demote entire range */
6747                 badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
6748                 (void) segvn_split_seg(nseg, lpgeaddr);
6749                 ASSERT(badseg1->s_base == lpgaddr);
6750                 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr);
6751         } else if (addr != lpgaddr) {
6752                 ASSERT(flag == SDR_END);
6753                 badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
6754                 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz &&
6755                     eaddr < lpgaddr + 2 * pgsz) {
6756                         (void) segvn_split_seg(nseg, lpgeaddr);
6757                         ASSERT(badseg1->s_base == lpgaddr);
6758                         ASSERT(badseg1->s_size == 2 * pgsz);
6759                 } else {
6760                         nseg = segvn_split_seg(nseg, lpgaddr + pgsz);
6761                         ASSERT(badseg1->s_base == lpgaddr);
6762                         ASSERT(badseg1->s_size == pgsz);
6763                         if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) {
6764                                 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz);
6765                                 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz);
6766                                 badseg2 = nseg;
6767                                 (void) segvn_split_seg(nseg, lpgeaddr);
6768                                 ASSERT(badseg2->s_base == lpgeaddr - pgsz);
6769                                 ASSERT(badseg2->s_size == pgsz);
6770                         }
6771                 }
6772         } else {
6773                 ASSERT(flag == SDR_END);
6774                 ASSERT(eaddr < lpgeaddr);
6775                 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz);
6776                 (void) segvn_split_seg(nseg, lpgeaddr);
6777                 ASSERT(badseg1->s_base == lpgeaddr - pgsz);
6778                 ASSERT(badseg1->s_size == pgsz);
6779         }
6780 
6781         ASSERT(badseg1 != NULL);
6782         ASSERT(badseg1->s_szc == szc);
6783         ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz ||
6784             badseg1->s_size == 2 * pgsz);
6785         ASSERT(sameprot(badseg1, badseg1->s_base, pgsz));
6786         ASSERT(badseg1->s_size == pgsz ||
6787             sameprot(badseg1, badseg1->s_base + pgsz, pgsz));
6788         if (err = segvn_clrszc(badseg1)) {
6789                 return (err);
6790         }
6791         ASSERT(badseg1->s_szc == 0);
6792 
6793         if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
6794                 uint_t tszc = highbit(tszcvec) - 1;
6795                 caddr_t ta = MAX(addr, badseg1->s_base);
6796                 caddr_t te;
6797                 size_t tpgsz = page_get_pagesize(tszc);
6798 
6799                 ASSERT(svd->type == MAP_SHARED);
6800                 ASSERT(flag == SDR_END);
6801                 ASSERT(tszc < szc && tszc > 0);
6802 
6803                 if (eaddr > badseg1->s_base + badseg1->s_size) {
6804                         te = badseg1->s_base + badseg1->s_size;
6805                 } else {
6806                         te = eaddr;
6807                 }
6808 
6809                 ASSERT(ta <= te);
6810                 badseg1->s_szc = tszc;
6811                 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) {
6812                         if (badseg2 != NULL) {
6813                                 err = segvn_demote_range(badseg1, ta, te - ta,
6814                                     SDR_END, tszcvec);
6815                                 if (err != 0) {
6816                                         return (err);
6817                                 }
6818                         } else {
6819                                 return (segvn_demote_range(badseg1, ta,
6820                                     te - ta, SDR_END, tszcvec));
6821                         }
6822                 }
6823         }
6824 
6825         if (badseg2 == NULL)
6826                 return (0);
6827         ASSERT(badseg2->s_szc == szc);
6828         ASSERT(badseg2->s_size == pgsz);
6829         ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size));
6830         if (err = segvn_clrszc(badseg2)) {
6831                 return (err);
6832         }
6833         ASSERT(badseg2->s_szc == 0);
6834 
6835         if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
6836                 uint_t tszc = highbit(tszcvec) - 1;
6837                 size_t tpgsz = page_get_pagesize(tszc);
6838 
6839                 ASSERT(svd->type == MAP_SHARED);
6840                 ASSERT(flag == SDR_END);
6841                 ASSERT(tszc < szc && tszc > 0);
6842                 ASSERT(badseg2->s_base > addr);
6843                 ASSERT(eaddr > badseg2->s_base);
6844                 ASSERT(eaddr < badseg2->s_base + badseg2->s_size);
6845 
6846                 badseg2->s_szc = tszc;
6847                 if (!IS_P2ALIGNED(eaddr, tpgsz)) {
6848                         return (segvn_demote_range(badseg2, badseg2->s_base,
6849                             eaddr - badseg2->s_base, SDR_END, tszcvec));
6850                 }
6851         }
6852 
6853         return (0);
6854 }
6855 
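/*
 * Check that the segment grants at least the requested protections over
 * the given range, using either the segment-wide protections or the
 * per-page vpage protections.  Returns 0 if so, EACCES otherwise.
 */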
6856 static int
6857 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
6858 {
6859         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6860         struct vpage *vp, *evp;
6861 
6862         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6863 
6864         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
6865         /*
6866          * If segment protections can be used, simply check against them.
6867          */
6868         if (svd->pageprot == 0) {
6869                 int err;
6870 
6871                 err = ((svd->prot & prot) != prot) ? EACCES : 0;
6872                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6873                 return (err);
6874         }
6875 
6876         /*
6877          * Have to check down to the vpage level.
6878          */
6879         evp = &svd->vpage[seg_page(seg, addr + len)];
6880         for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) {
6881                 if ((VPP_PROT(vp) & prot) != prot) {
6882                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6883                         return (EACCES);
6884                 }
6885         }
6886         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6887         return (0);
6888 }
6889 
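/*
 * Fill in protv[] with the protections of each page in the range, using
 * the segment-wide protection when there are no per-page protections.
 */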
6890 static int
6891 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
6892 {
6893         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6894         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
6895 
6896         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6897 
6898         if (pgno != 0) {
6899                 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
6900                 if (svd->pageprot == 0) {
6901                         do {
6902                                 protv[--pgno] = svd->prot;
6903                         } while (pgno != 0);
6904                 } else {
6905                         size_t pgoff = seg_page(seg, addr);
6906 
6907                         do {
6908                                 pgno--;
6909                                 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]);
6910                         } while (pgno != 0);
6911                 }
6912                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6913         }
6914         return (0);
6915 }
6916 
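/*
 * Return the vnode offset corresponding to the given address within the
 * segment.
 */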
6917 static u_offset_t
6918 segvn_getoffset(struct seg *seg, caddr_t addr)
6919 {
6920         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6921 
6922         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6923 
6924         return (svd->offset + (uintptr_t)(addr - seg->s_base));
6925 }
6926 
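/*
 * Return the mapping type (MAP_SHARED or MAP_PRIVATE) along with the
 * MAP_NORESERVE, MAP_TEXT and MAP_INITDATA flags for this segment.
 */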
6927 /*ARGSUSED*/
6928 static int
6929 segvn_gettype(struct seg *seg, caddr_t addr)
6930 {
6931         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6932 
6933         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6934 
6935         return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT |
6936             MAP_INITDATA)));
6937 }
6938 
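/*
 * Return the vnode backing this segment (NULL for anonymous memory).
 */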
6939 /*ARGSUSED*/
6940 static int
6941 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
6942 {
6943         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6944 
6945         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6946 
6947         *vpp = svd->vp;
6948         return (0);
6949 }
6950 
6951 /*
6952  * Check to see if it makes sense to do kluster/read ahead to
6953  * addr + delta relative to the mapping at addr.  We assume here
6954  * that delta is a signed PAGESIZE'd multiple (which can be negative).
6955  *
6956  * For segvn, we currently "approve" of the action if we are
6957  * still in the segment and it maps from the same vp/off,
6958  * or if the advice stored in segvn_data or vpages allows it.
6959  * Klustering is disallowed by MADV_RANDOM, or MADV_SEQUENTIAL with delta < 0.
6960  */
6961 static int
6962 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
6963 {
6964         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6965         struct anon *oap, *ap;
6966         ssize_t pd;
6967         size_t page;
6968         struct vnode *vp1, *vp2;
6969         u_offset_t off1, off2;
6970         struct anon_map *amp;
6971 
6972         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6973         ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
6974             SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
6975 
6976         if (addr + delta < seg->s_base ||
6977             addr + delta >= (seg->s_base + seg->s_size))
6978                 return (-1);            /* exceeded segment bounds */
6979 
6980         pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */
6981         page = seg_page(seg, addr);
6982 
6983         /*
6984          * Check to see if either of the pages addr or addr + delta
6985          * have advice set that prevents klustering (if MADV_RANDOM advice
6986          * is set for entire segment, or MADV_SEQUENTIAL is set and delta
6987          * is negative).
6988          */
6989         if (svd->advice == MADV_RANDOM ||
6990             svd->advice == MADV_SEQUENTIAL && delta < 0)
6991                 return (-1);
6992         else if (svd->pageadvice && svd->vpage) {
6993                 struct vpage *bvpp, *evpp;
6994 
6995                 bvpp = &svd->vpage[page];
6996                 evpp = &svd->vpage[page + pd];
6997                 if (VPP_ADVICE(bvpp) == MADV_RANDOM ||
6998                     VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0)
6999                         return (-1);
7000                 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) &&
7001                     VPP_ADVICE(evpp) == MADV_RANDOM)
7002                         return (-1);
7003         }
7004 
7005         if (svd->type == MAP_SHARED)
7006                 return (0);             /* shared mapping - all ok */
7007 
7008         if ((amp = svd->amp) == NULL)
7009                 return (0);             /* off original vnode */
7010 
7011         page += svd->anon_index;
7012 
7013         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7014 
7015         oap = anon_get_ptr(amp->ahp, page);
7016         ap = anon_get_ptr(amp->ahp, page + pd);
7017 
7018         ANON_LOCK_EXIT(&amp->a_rwlock);
7019 
7020         if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) {
7021                 return (-1);            /* one with and one without an anon */
7022         }
7023 
7024         if (oap == NULL) {              /* implies that ap == NULL */
7025                 return (0);             /* off original vnode */
7026         }
7027 
7028         /*
7029          * Now we know we have two anon pointers - check to
7030          * see if they happen to be properly allocated.
7031          */
7032 
7033         /*
7034          * XXX We cheat here and don't lock the anon slots. We can't because
7035          * we may have been called from the anon layer which might already
7036          * have locked them. We are holding a refcnt on the slots so they
7037          * can't disappear. The worst that will happen is we'll get the wrong
7038          * names (vp, off) for the slots and make a poor klustering decision.
7039          */
7040         swap_xlate(ap, &vp1, &off1);
7041         swap_xlate(oap, &vp2, &off2);
7042
7044         if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta)
7045                 return (-1);
7046         return (0);
7047 }
7048 
7049 /*
7050  * Synchronize primary storage cache with real object in virtual memory.
7051  *
7052  * XXX - Anonymous pages should not be sync'ed out at all.
7053  */
7054 static int
7055 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
7056 {
7057         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7058         struct vpage *vpp;
7059         page_t *pp;
7060         u_offset_t offset;
7061         struct vnode *vp;
7062         u_offset_t off;
7063         caddr_t eaddr;
7064         int bflags;
7065         int err = 0;
7066         int segtype;
7067         int pageprot;
7068         int prot;
7069         ulong_t anon_index;
7070         struct anon_map *amp;
7071         struct anon *ap;
7072         anon_sync_obj_t cookie;
7073 
7074         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7075 
7076         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
7077 
7078         if (svd->softlockcnt > 0) {
7079                 /*
7080                  * If this is a shared segment, a nonzero softlockcnt
7081                  * means locked pages are still in use.
7082                  */
7083                 if (svd->type == MAP_SHARED) {
7084                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7085                         return (EAGAIN);
7086                 }
7087 
7088                 /*
7089                  * flush all pages from seg cache
7090                  * otherwise we may deadlock in swap_putpage
7091                  * for B_INVAL page (4175402).
7092                  *
7093                  * Even if we grab segvn WRITER's lock
7094                  * Even if we grab the segvn WRITER's lock
7095                  * here, another thread might have successfully
7096                  * performed a lookup/insert just before we
7097                  * acquired the lock, so grabbing either lock
7098                  * here is of little use.  Until we devise
7099                  * synchronization issues completely, we expect
7100                  * applications to handle this appropriately.
7101                  */
7102                 segvn_purge(seg);
7103                 if (svd->softlockcnt > 0) {
7104                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7105                         return (EAGAIN);
7106                 }
7107         } else if (svd->type == MAP_SHARED && svd->amp != NULL &&
7108             svd->amp->a_softlockcnt > 0) {
7109                 /*
7110                  * Try to purge this amp's entries from pcache. It will
7111                  * succeed only if other segments that share the amp have no
7112                  * outstanding softlock's.
7113                  */
7114                 segvn_purge(seg);
7115                 if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) {
7116                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7117                         return (EAGAIN);
7118                 }
7119         }
7120 
7121         vpp = svd->vpage;
7122         offset = svd->offset + (uintptr_t)(addr - seg->s_base);
7123         bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
7124             ((flags & MS_INVALIDATE) ? B_INVAL : 0);
7125 
7126         if (attr) {
7127                 pageprot = attr & ~(SHARED|PRIVATE);
7128                 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE;
7129 
7130                 /*
7131                  * We are done if the segment types don't match
7132                  * or if we have segment level protections and
7133                  * they don't match.
7134                  */
7135                 if (svd->type != segtype) {
7136                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7137                         return (0);
7138                 }
7139                 if (vpp == NULL) {
7140                         if (svd->prot != pageprot) {
7141                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7142                                 return (0);
7143                         }
7144                         prot = svd->prot;
7145                 } else
7146                         vpp = &svd->vpage[seg_page(seg, addr)];
7147 
7148         } else if (svd->vp && svd->amp == NULL &&
7149             (flags & MS_INVALIDATE) == 0) {
7150 
7151                 /*
7152                  * No attributes, no anonymous pages, and the MS_INVALIDATE
7153                  * flag is not set - just use one big request.
7154                  */
7155                 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
7156                     bflags, svd->cred, NULL);
7157                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7158                 return (err);
7159         }
7160 
7161         if ((amp = svd->amp) != NULL)
7162                 anon_index = svd->anon_index + seg_page(seg, addr);
7163 
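        /*
         * Walk the range a page at a time.  For pages backed by anon
         * slots use the swap vnode and offset as the page's name;
         * otherwise use the segment's vnode and offset.
         */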
7164         for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) {
7165                 ap = NULL;
7166                 if (amp != NULL) {
7167                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7168                         anon_array_enter(amp, anon_index, &cookie);
7169                         ap = anon_get_ptr(amp->ahp, anon_index++);
7170                         if (ap != NULL) {
7171                                 swap_xlate(ap, &vp, &off);
7172                         } else {
7173                                 vp = svd->vp;
7174                                 off = offset;
7175                         }
7176                         anon_array_exit(&cookie);
7177                         ANON_LOCK_EXIT(&amp->a_rwlock);
7178                 } else {
7179                         vp = svd->vp;
7180                         off = offset;
7181                 }
7182                 offset += PAGESIZE;
7183 
7184                 if (vp == NULL)         /* untouched zfod page */
7185                         continue;
7186 
7187                 if (attr) {
7188                         if (vpp) {
7189                                 prot = VPP_PROT(vpp);
7190                                 vpp++;
7191                         }
7192                         if (prot != pageprot) {
7193                                 continue;
7194                         }
7195                 }
7196 
7197                 /*
7198                  * See if any of these pages are locked --  if so, then we
7199                  * will have to truncate an invalidate request at the first
7200                  * locked one. We don't need the page_struct_lock to test
7201                  * as this is only advisory; even if we acquire it, someone
7202                  * might race in and lock the page after we unlock and before
7203                  * we do the PUTPAGE, and then the PUTPAGE simply does nothing.
7204                  */
7205                 if (flags & MS_INVALIDATE) {
7206                         if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
7207                                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
7208                                         page_unlock(pp);
7209                                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7210                                         return (EBUSY);
7211                                 }
7212                                 if (ap != NULL && pp->p_szc != 0 &&
7213                                     page_tryupgrade(pp)) {
7214                                         if (pp->p_lckcnt == 0 &&
7215                                             pp->p_cowcnt == 0) {
7216                                                 /*
7217                                                  * swapfs VN_DISPOSE() won't
7218                                                  * invalidate large pages.
7219                                                  * Attempt to demote.
7220                                                  * XXX can't help it if it
7221                                                  * fails. But for swapfs
7222                                                  * pages it is no big deal.
7223                                                  */
7224                                                 (void) page_try_demote_pages(
7225                                                     pp);
7226                                         }
7227                                 }
7228                                 page_unlock(pp);
7229                         }
7230                 } else if (svd->type == MAP_SHARED && amp != NULL) {
7231                         /*
7232                          * Avoid writing ISM's large pages out to disk
7233                          * because segspt_free_pages() relies on the an_pvp
7234                          * of such pages' anon slots being NULL.
7235                          */
7236 
7237                         ASSERT(svd->vp == NULL);
7238                         /*
7239                          * swapfs uses page_lookup_nowait if not freeing or
7240                          * invalidating and skips a page if
7241                          * page_lookup_nowait returns NULL.
7242                          */
7243                         pp = page_lookup_nowait(vp, off, SE_SHARED);
7244                         if (pp == NULL) {
7245                                 continue;
7246                         }
7247                         if (pp->p_szc != 0) {
7248                                 page_unlock(pp);
7249                                 continue;
7250                         }
7251 
7252                         /*
7253                          * Note ISM pages are created large so (vp, off)'s
7254                          * page cannot suddenly become large after we unlock
7255                          * pp.
7256                          */
7257                         page_unlock(pp);
7258                 }
7259                 /*
7260                  * XXX - Should ultimately try to kluster
7261                  * calls to VOP_PUTPAGE() for performance.
7262                  */
7263                 VN_HOLD(vp);
7264                 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
7265                     (bflags | (IS_SWAPFSVP(vp) ? B_PAGE_NOWAIT : 0)),
7266                     svd->cred, NULL);
7267 
7268                 VN_RELE(vp);
7269                 if (err)
7270                         break;
7271         }
7272         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7273         return (err);
7274 }
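
     /*
      * Illustrative sketch, not part of the original code: one userland path
      * that reaches the sync routine above is msync(3C), assumed here to be
      * routed through memcntl(2) and as_ctl().  Its flags map onto the bflags
      * computed above (MS_ASYNC -> B_ASYNC, MS_INVALIDATE -> B_INVAL):
      *
      *        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
      *            MAP_SHARED, fd, 0);
      *        ...
      *        (void) msync(p, len, MS_ASYNC);                 async write-back
      *        (void) msync(p, len, MS_SYNC | MS_INVALIDATE);  write back, then
      *                                                        drop cached pages
      */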
7275 
7276 /*
7277  * Determine if we have data corresponding to pages in the
7278  * primary storage virtual memory cache (i.e., "in core").
7279  */
7280 static size_t
7281 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
7282 {
7283         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7284         struct vnode *vp, *avp;
7285         u_offset_t offset, aoffset;
7286         size_t p, ep;
7287         int ret;
7288         struct vpage *vpp;
7289         page_t *pp;
7290         uint_t start;
7291         struct anon_map *amp;           /* XXX - for locknest */
7292         struct anon *ap;
7293         uint_t attr;
7294         anon_sync_obj_t cookie;
7295 
7296         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7297 
7298         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
7299         if (svd->amp == NULL && svd->vp == NULL) {
7300                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7301                 bzero(vec, btopr(len));
7302                 return (len);   /* no anonymous pages created yet */
7303         }
7304 
7305         p = seg_page(seg, addr);
7306         ep = seg_page(seg, addr + len);
7307         start = svd->vp ? SEG_PAGE_VNODEBACKED : 0;
7308 
7309         amp = svd->amp;
7310         for (; p < ep; p++, addr += PAGESIZE) {
7311                 vpp = (svd->vpage) ? &svd->vpage[p]: NULL;
7312                 ret = start;
7313                 ap = NULL;
7314                 avp = NULL;
7315                 /* Grab the vnode/offset for the anon slot */
7316                 if (amp != NULL) {
7317                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7318                         anon_array_enter(amp, svd->anon_index + p, &cookie);
7319                         ap = anon_get_ptr(amp->ahp, svd->anon_index + p);
7320                         if (ap != NULL) {
7321                                 swap_xlate(ap, &avp, &aoffset);
7322                         }
7323                         anon_array_exit(&cookie);
7324                         ANON_LOCK_EXIT(&amp->a_rwlock);
7325                 }
7326                 if ((avp != NULL) && page_exists(avp, aoffset)) {
7327                         /* A page exists for the anon slot */
7328                         ret |= SEG_PAGE_INCORE;
7329 
7330                         /*
7331                          * If the page is mapped and writable, report SEG_PAGE_ANON.
7332                          */
7333                         attr = (uint_t)0;
7334                         if ((hat_getattr(seg->s_as->a_hat, addr,
7335                             &attr) != -1) && (attr & PROT_WRITE)) {
7336                                 ret |= SEG_PAGE_ANON;
7337                         }
7338                         /*
7339                          * Don't get page_struct lock for lckcnt and cowcnt,
7340                          * since this is purely advisory.
7341                          */
7342                         if ((pp = page_lookup_nowait(avp, aoffset,
7343                             SE_SHARED)) != NULL) {
7344                                 if (pp->p_lckcnt)
7345                                         ret |= SEG_PAGE_SOFTLOCK;
7346                                 if (pp->p_cowcnt)
7347                                         ret |= SEG_PAGE_HASCOW;
7348                                 page_unlock(pp);
7349                         }
7350                 }
7351 
7352                 /* Gather vnode statistics */
7353                 vp = svd->vp;
7354                 offset = svd->offset + (uintptr_t)(addr - seg->s_base);
7355 
7356                 if (vp != NULL) {
7357                         /*
7358                          * Try to obtain a "shared" lock on the page
7359                          * without blocking.  If this fails, determine
7360                          * if the page is in memory.
7361                          */
7362                         pp = page_lookup_nowait(vp, offset, SE_SHARED);
7363                         if ((pp == NULL) && (page_exists(vp, offset))) {
7364                                 /* Page is incore, and is named */
7365                                 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
7366                         }
7367                         /*
7368                          * Don't get page_struct lock for lckcnt and cowcnt,
7369                          * since this is purely advisory.
7370                          */
7371                         if (pp != NULL) {
7372                                 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
7373                                 if (pp->p_lckcnt)
7374                                         ret |= SEG_PAGE_SOFTLOCK;
7375                                 if (pp->p_cowcnt)
7376                                         ret |= SEG_PAGE_HASCOW;
7377                                 page_unlock(pp);
7378                         }
7379                 }
7380 
7381                 /* Gather virtual page information */
7382                 if (vpp) {
7383                         if (VPP_ISPPLOCK(vpp))
7384                                 ret |= SEG_PAGE_LOCKED;
7385                         vpp++;
7386                 }
7387 
7388                 *vec++ = (char)ret;
7389         }
7390         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7391         return (len);
7392 }
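
     /*
      * Illustrative sketch, not part of the original code: the per-page
      * SEG_PAGE_* flags gathered above are returned through as_incore(); the
      * simplest userland view of them is assumed to be mincore(2), which only
      * reports residency:
      *
      *        long pgsz = sysconf(_SC_PAGESIZE);
      *        char *vec = malloc((len + pgsz - 1) / pgsz);
      *
      *        if (vec != NULL && mincore(addr, len, vec) == 0) {
      *                vec[i] & 1 is set for each page of [addr, addr + len)
      *                that is "in core"
      *        }
      */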
7393 
7394 /*
7395  * Statement of the p_cowcnt/p_lckcnt accounting rules.
7396  *
7397  * p_cowcnt is updated while mlock/munlocking a MAP_PRIVATE and PROT_WRITE region
7398  * irrespective of the following factors or anything else:
7399  *
7400  *      (1) anon slots are populated or not
7401  *      (2) cow is broken or not
7402  *      (3) refcnt on ap is 1 or greater than 1
7403  *
7404  * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock
7405  * and munlock.
7406  *
7407  *
7408  * Handling p_cowcnts/p_lckcnts during copy-on-write fault:
7409  *
7410  *      if vpage has PROT_WRITE
7411  *              transfer cowcnt on the oldpage -> cowcnt on the newpage
7412  *      else
7413  *              transfer lckcnt on the oldpage -> lckcnt on the newpage
7414  *
7415  *      During copy-on-write, decrement p_cowcnt on the oldpage and increment
7416  *      p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE.
7417  *
7418  *      We may also break COW if softlocking on read access in the physio case.
7419  *      In this case, vpage may not have PROT_WRITE. So, we need to decrement
7420  *      p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the
7421  *      vpage doesn't have PROT_WRITE.
7422  *
7423  *
7424  * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region:
7425  *
7426  *      If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and
7427  *      increment p_lckcnt by calling page_subclaim() which takes care of
7428  *      availrmem accounting and p_lckcnt overflow.
7429  *
7430  *      If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and
7431  *      increment p_cowcnt by calling page_addclaim() which takes care of
7432  *      availrmem availability and p_cowcnt overflow.
7433  */
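
     /*
      * A minimal sketch restating the rules above as they are applied by the
      * lockop routine below, using that routine's own names (no new behavior,
      * only the mapping from the Statement to the calls):
      *
      *        claim = (VPP_PROT(vpp) & PROT_WRITE) && svd->type == MAP_PRIVATE;
      *        if (op == MC_LOCK)
      *                (void) page_pp_lock(pp, claim, 0);      claim: p_cowcnt++,
      *                                                        otherwise p_lckcnt++
      *        else
      *                page_pp_unlock(pp, claim, 0);           claim: p_cowcnt--,
      *                                                        otherwise p_lckcnt--
      */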
7434 
7435 /*
7436  * Lock down (or unlock) pages mapped by this segment.
7437  *
7438  * XXX only creates PAGESIZE pages if anon slots are not initialized.
7439  * At fault time they will be relocated into larger pages.
7440  */
7441 static int
7442 segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
7443     int attr, int op, ulong_t *lockmap, size_t pos)
7444 {
7445         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7446         struct vpage *vpp;
7447         struct vpage *evp;
7448         page_t *pp;
7449         u_offset_t offset;
7450         u_offset_t off;
7451         int segtype;
7452         int pageprot;
7453         int claim;
7454         struct vnode *vp;
7455         ulong_t anon_index;
7456         struct anon_map *amp;
7457         struct anon *ap;
7458         struct vattr va;
7459         anon_sync_obj_t cookie;
7460         struct kshmid *sp = NULL;
7461         struct proc     *p = curproc;
7462         kproject_t      *proj = NULL;
7463         int chargeproc = 1;
7464         size_t locked_bytes = 0;
7465         size_t unlocked_bytes = 0;
7466         int err = 0;
7467 
7468         /*
7469          * Hold the write lock on the address space because we may split or
7470          * concatenate segments.
7471          */
7472         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7473 
7474         /*
7475          * If this is a shm, use shm's project and zone, else use
7476          * project and zone of calling process
7477          */
7478 
7479         /* Determine if this segment backs a sysV shm */
7480         if (svd->amp != NULL && svd->amp->a_sp != NULL) {
7481                 ASSERT(svd->type == MAP_SHARED);
7482                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
7483                 sp = svd->amp->a_sp;
7484                 proj = sp->shm_perm.ipc_proj;
7485                 chargeproc = 0;
7486         }
7487 
7488         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
7489         if (attr) {
7490                 pageprot = attr & ~(SHARED|PRIVATE);
7491                 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE;
7492 
7493                 /*
7494                  * We are done if the segment types don't match
7495                  * or if we have segment level protections and
7496                  * they don't match.
7497                  */
7498                 if (svd->type != segtype) {
7499                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7500                         return (0);
7501                 }
7502                 if (svd->pageprot == 0 && svd->prot != pageprot) {
7503                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7504                         return (0);
7505                 }
7506         }
7507 
7508         if (op == MC_LOCK) {
7509                 if (svd->tr_state == SEGVN_TR_INIT) {
7510                         svd->tr_state = SEGVN_TR_OFF;
7511                 } else if (svd->tr_state == SEGVN_TR_ON) {
7512                         ASSERT(svd->amp != NULL);
7513                         segvn_textunrepl(seg, 0);
7514                         ASSERT(svd->amp == NULL &&
7515                             svd->tr_state == SEGVN_TR_OFF);
7516                 }
7517         }
7518 
7519         /*
7520          * If we're locking, then we must create a vpage structure if
7521          * none exists.  If we're unlocking, then check to see if there
7522          * is a vpage --  if not, then we could not have locked anything.
7523          */
7524 
7525         if ((vpp = svd->vpage) == NULL) {
7526                 if (op == MC_LOCK) {
7527                         segvn_vpage(seg);
7528                         if (svd->vpage == NULL) {
7529                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7530                                 return (ENOMEM);
7531                         }
7532                 } else {
7533                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7534                         return (0);
7535                 }
7536         }
7537 
7538         /*
7539          * The anonymous data vector (i.e., a previously
7540          * unreferenced mapping to swap space) is allocated
7541          * lazily, so create it now if it does not yet exist.
7542          */
7543         if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
7544                 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
7545                 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
7546                 svd->amp->a_szc = seg->s_szc;
7547         }
7548 
7549         if ((amp = svd->amp) != NULL) {
7550                 anon_index = svd->anon_index + seg_page(seg, addr);
7551         }
7552 
7553         offset = svd->offset + (uintptr_t)(addr - seg->s_base);
7554         evp = &svd->vpage[seg_page(seg, addr + len)];
7555 
7556         if (sp != NULL)
7557                 mutex_enter(&sp->shm_mlock);
7558 
7559         /* determine number of unlocked bytes in range for lock operation */
7560         if (op == MC_LOCK) {
7561 
7562                 if (sp == NULL) {
7563                         for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7564                             vpp++) {
7565                                 if (!VPP_ISPPLOCK(vpp))
7566                                         unlocked_bytes += PAGESIZE;
7567                         }
7568                 } else {
7569                         ulong_t         i_idx, i_edx;
7570                         anon_sync_obj_t i_cookie;
7571                         struct anon     *i_ap;
7572                         struct vnode    *i_vp;
7573                         u_offset_t      i_off;
7574 
7575                         /* Only count sysV pages once for locked memory */
7576                         i_edx = svd->anon_index + seg_page(seg, addr + len);
7577                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7578                         for (i_idx = anon_index; i_idx < i_edx; i_idx++) {
7579                                 anon_array_enter(amp, i_idx, &i_cookie);
7580                                 i_ap = anon_get_ptr(amp->ahp, i_idx);
7581                                 if (i_ap == NULL) {
7582                                         unlocked_bytes += PAGESIZE;
7583                                         anon_array_exit(&i_cookie);
7584                                         continue;
7585                                 }
7586                                 swap_xlate(i_ap, &i_vp, &i_off);
7587                                 anon_array_exit(&i_cookie);
7588                                 pp = page_lookup(i_vp, i_off, SE_SHARED);
7589                                 if (pp == NULL) {
7590                                         unlocked_bytes += PAGESIZE;
7591                                         continue;
7592                                 } else if (pp->p_lckcnt == 0)
7593                                         unlocked_bytes += PAGESIZE;
7594                                 page_unlock(pp);
7595                         }
7596                         ANON_LOCK_EXIT(&amp->a_rwlock);
7597                 }
7598 
7599                 mutex_enter(&p->p_lock);
7600                 err = rctl_incr_locked_mem(p, proj, unlocked_bytes,
7601                     chargeproc);
7602                 mutex_exit(&p->p_lock);
7603 
7604                 if (err) {
7605                         if (sp != NULL)
7606                                 mutex_exit(&sp->shm_mlock);
7607                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7608                         return (err);
7609                 }
7610         }
7611         /*
7612          * Loop over all pages in the range.  Process if we're locking and
7613          * page has not already been locked in this mapping; or if we're
7614          * unlocking and the page has been locked.
7615          */
7616         for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7617             vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
7618                 if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
7619                     ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
7620                     (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {
7621 
7622                         if (amp != NULL)
7623                                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7624                         /*
7625                          * If this isn't a MAP_NORESERVE segment and
7626                          * we're locking, allocate anon slots if they
7627                          * don't exist.  The page is brought in later on.
7628                          */
7629                         if (op == MC_LOCK && svd->vp == NULL &&
7630                             ((svd->flags & MAP_NORESERVE) == 0) &&
7631                             amp != NULL &&
7632                             ((ap = anon_get_ptr(amp->ahp, anon_index))
7633                             == NULL)) {
7634                                 anon_array_enter(amp, anon_index, &cookie);
7635 
7636                                 if ((ap = anon_get_ptr(amp->ahp,
7637                                     anon_index)) == NULL) {
7638                                         pp = anon_zero(seg, addr, &ap,
7639                                             svd->cred);
7640                                         if (pp == NULL) {
7641                                                 anon_array_exit(&cookie);
7642                                                 ANON_LOCK_EXIT(&amp->a_rwlock);
7643                                                 err = ENOMEM;
7644                                                 goto out;
7645                                         }
7646                                         ASSERT(anon_get_ptr(amp->ahp,
7647                                             anon_index) == NULL);
7648                                         (void) anon_set_ptr(amp->ahp,
7649                                             anon_index, ap, ANON_SLEEP);
7650                                         page_unlock(pp);
7651                                 }
7652                                 anon_array_exit(&cookie);
7653                         }
7654 
7655                         /*
7656                          * Get name for page, accounting for
7657                          * existence of private copy.
7658                          */
7659                         ap = NULL;
7660                         if (amp != NULL) {
7661                                 anon_array_enter(amp, anon_index, &cookie);
7662                                 ap = anon_get_ptr(amp->ahp, anon_index);
7663                                 if (ap != NULL) {
7664                                         swap_xlate(ap, &vp, &off);
7665                                 } else {
7666                                         if (svd->vp == NULL &&
7667                                             (svd->flags & MAP_NORESERVE)) {
7668                                                 anon_array_exit(&cookie);
7669                                                 ANON_LOCK_EXIT(&amp->a_rwlock);
7670                                                 continue;
7671                                         }
7672                                         vp = svd->vp;
7673                                         off = offset;
7674                                 }
7675                                 if (op != MC_LOCK || ap == NULL) {
7676                                         anon_array_exit(&cookie);
7677                                         ANON_LOCK_EXIT(&amp->a_rwlock);
7678                                 }
7679                         } else {
7680                                 vp = svd->vp;
7681                                 off = offset;
7682                         }
7683 
7684                         /*
7685                          * Get page frame.  It's ok if the page is
7686                          * not available when we're unlocking, as this
7687                          * may simply mean that a page we locked got
7688                          * truncated out of existence after we locked it.
7689                          *
7690                          * Invoke VOP_GETPAGE() to obtain the page struct
7691                          * since we may need to read it from disk if it's
7692                          * been paged out.
7693                          */
7694                         if (op != MC_LOCK)
7695                                 pp = page_lookup(vp, off, SE_SHARED);
7696                         else {
7697                                 page_t *pl[1 + 1];
7698                                 int error;
7699 
7700                                 ASSERT(vp != NULL);
7701 
7702                                 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
7703                                     (uint_t *)NULL, pl, PAGESIZE, seg, addr,
7704                                     S_OTHER, svd->cred, NULL);
7705 
7706                                 if (error && ap != NULL) {
7707                                         anon_array_exit(&cookie);
7708                                         ANON_LOCK_EXIT(&amp->a_rwlock);
7709                                 }
7710 
7711                                 /*
7712                                  * If the error is EDEADLK then we must bounce
7713                                  * up and drop all vm subsystem locks and then
7714                                  * retry the operation later.
7715                                  * This behavior is a temporary measure because
7716                                  * ufs/sds logging is badly designed and will
7717                                  * deadlock if we don't allow this bounce to
7718                                  * happen.  The real solution is to re-design
7719                                  * the logging code to work properly.  See bug
7720                                  * 4125102 for details of the problem.
7721                                  */
7722                                 if (error == EDEADLK) {
7723                                         err = error;
7724                                         goto out;
7725                                 }
7726                                 /*
7727                                  * Quit if we fail to fault in the page.  Treat
7728                                  * the failure as an error, unless the addr
7729                                  * is mapped beyond the end of a file.
7730                                  */
7731                                 if (error && svd->vp) {
7732                                         va.va_mask = AT_SIZE;
7733                                         if (VOP_GETATTR(svd->vp, &va, 0,
7734                                             svd->cred, NULL) != 0) {
7735                                                 err = EIO;
7736                                                 goto out;
7737                                         }
7738                                         if (btopr(va.va_size) >=
7739                                             btopr(off + 1)) {
7740                                                 err = EIO;
7741                                                 goto out;
7742                                         }
7743                                         goto out;
7744 
7745                                 } else if (error) {
7746                                         err = EIO;
7747                                         goto out;
7748                                 }
7749                                 pp = pl[0];
7750                                 ASSERT(pp != NULL);
7751                         }
7752 
7753                         /*
7754                          * See the Statement preceding this routine.
7755                          *
7756                          * claim is always set if MAP_PRIVATE and PROT_WRITE,
7757                          * irrespective of the following factors:
7758                          *
7759                          * (1) anon slots are populated or not
7760                          * (2) cow is broken or not
7761                          * (3) refcnt on ap is 1 or greater than 1
7762                          *
7763                          * See 4140683 for details
7764                          */
7765                         claim = ((VPP_PROT(vpp) & PROT_WRITE) &&
7766                             (svd->type == MAP_PRIVATE));
7767 
7768                         /*
7769                          * Perform the page-level operation appropriate to
7770                          * the requested operation.  If locking, undo the
7771                          * SOFTLOCK performed to bring the page into memory
7772                          * after setting the lock.  If unlocking
7773                          * and no page was found, account for the claim
7774                          * separately.
7775                          */
7776                         if (op == MC_LOCK) {
7777                                 int ret = 1;    /* Assume success */
7778 
7779                                 ASSERT(!VPP_ISPPLOCK(vpp));
7780 
7781                                 ret = page_pp_lock(pp, claim, 0);
7782                                 if (ap != NULL) {
7783                                         if (ap->an_pvp != NULL) {
7784                                                 anon_swap_free(ap, pp);
7785                                         }
7786                                         anon_array_exit(&cookie);
7787                                         ANON_LOCK_EXIT(&amp->a_rwlock);
7788                                 }
7789                                 if (ret == 0) {
7790                                         /* locking page failed */
7791                                         page_unlock(pp);
7792                                         err = EAGAIN;
7793                                         goto out;
7794                                 }
7795                                 VPP_SETPPLOCK(vpp);
7796                                 if (sp != NULL) {
7797                                         if (pp->p_lckcnt == 1)
7798                                                 locked_bytes += PAGESIZE;
7799                                 } else
7800                                         locked_bytes += PAGESIZE;
7801 
7802                                 if (lockmap != (ulong_t *)NULL)
7803                                         BT_SET(lockmap, pos);
7804 
7805                                 page_unlock(pp);
7806                         } else {
7807                                 ASSERT(VPP_ISPPLOCK(vpp));
7808                                 if (pp != NULL) {
7809                                         /* sysV pages should be locked */
7810                                         ASSERT(sp == NULL || pp->p_lckcnt > 0);
7811                                         page_pp_unlock(pp, claim, 0);
7812                                         if (sp != NULL) {
7813                                                 if (pp->p_lckcnt == 0)
7814                                                         unlocked_bytes
7815                                                             += PAGESIZE;
7816                                         } else
7817                                                 unlocked_bytes += PAGESIZE;
7818                                         page_unlock(pp);
7819                                 } else {
7820                                         ASSERT(sp == NULL);
7821                                         unlocked_bytes += PAGESIZE;
7822                                 }
7823                                 VPP_CLRPPLOCK(vpp);
7824                         }
7825                 }
7826         }
7827 out:
7828         if (op == MC_LOCK) {
7829                 /* Credit back bytes that did not get locked */
7830                 if ((unlocked_bytes - locked_bytes) > 0) {
7831                         if (proj == NULL)
7832                                 mutex_enter(&p->p_lock);
7833                         rctl_decr_locked_mem(p, proj,
7834                             (unlocked_bytes - locked_bytes), chargeproc);
7835                         if (proj == NULL)
7836                                 mutex_exit(&p->p_lock);
7837                 }
7838 
7839         } else {
7840                 /* Account bytes that were unlocked */
7841                 if (unlocked_bytes > 0) {
7842                         if (proj == NULL)
7843                                 mutex_enter(&p->p_lock);
7844                         rctl_decr_locked_mem(p, proj, unlocked_bytes,
7845                             chargeproc);
7846                         if (proj == NULL)
7847                                 mutex_exit(&p->p_lock);
7848                 }
7849         }
7850         if (sp != NULL)
7851                 mutex_exit(&sp->shm_mlock);
7852         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7853 
7854         return (err);
7855 }
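
     /*
      * Illustrative userland path into the lockop routine above; a sketch that
      * assumes mlock(3C)/munlock(3C) reach it via memcntl(2) MC_LOCK/MC_UNLOCK
      * and as_ctl():
      *
      *        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
      *            MAP_PRIVATE | MAP_ANON, -1, 0);
      *
      *        if (p != MAP_FAILED && mlock(p, len) == 0) {
      *                ...                     pages are faulted in and pp-locked
      *                (void) munlock(p, len); pp locks and the locked-memory
      *                                        rctl charge are dropped
      *        }
      */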
7856 
7857 /*
7858  * Set advice from user for specified pages
7859  * There are 9 types of advice:
7860  *      MADV_NORMAL     - Normal (default) behavior (whatever that is)
7861  *      MADV_RANDOM     - Random page references
7862  *                              do not allow readahead or 'klustering'
7863  *      MADV_SEQUENTIAL - Sequential page references
7864  *                              Pages previous to the one currently being
7865  *                              accessed (determined by fault) are 'not needed'
7866  *                              and are freed immediately
7867  *      MADV_WILLNEED   - Pages are likely to be used (fault ahead in mctl)
7868  *      MADV_DONTNEED   - Pages are not needed (synced out in mctl)
7869  *      MADV_FREE       - Contents can be discarded
7870  *      MADV_ACCESS_DEFAULT - Default access
7871  *      MADV_ACCESS_LWP - Next LWP will access heavily
7872  *      MADV_ACCESS_MANY - Many LWPs or processes will access heavily
7873  */
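
     /*
      * Illustrative sketch, not part of the original code: these requests
      * normally arrive here from madvise(3C), assumed to be routed through
      * memcntl(2) and as_ctl(), e.g.:
      *
      *        (void) madvise(addr, len, MADV_SEQUENTIAL);     free-behind access
      *        (void) madvise(addr, len, MADV_ACCESS_LWP);     lgroup placement hint
      *        (void) madvise(addr, len, MADV_FREE);           anon contents may be
      *                                                        discarded
      */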
7874 static int
7875 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
7876 {
7877         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7878         size_t page;
7879         int err = 0;
7880         int already_set;
7881         struct anon_map *amp;
7882         ulong_t anon_index;
7883         struct seg *next;
7884         lgrp_mem_policy_t policy;
7885         struct seg *prev;
7886         struct vnode *vp;
7887 
7888         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7889 
7890         /*
7891          * In the case of MADV_FREE, we won't be modifying any segment
7892          * private data structures, so we only need to grab the reader's lock.
7893          */
7894         if (behav != MADV_FREE) {
7895                 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
7896                 if (svd->tr_state != SEGVN_TR_OFF) {
7897                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7898                         return (0);
7899                 }
7900         } else {
7901                 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
7902         }
7903 
7904         /*
7905          * Large pages are assumed to be only turned on when accesses to the
7906          * segment's address range have spatial and temporal locality. That
7907          * justifies ignoring MADV_SEQUENTIAL for large page segments.
7908          * Also, ignore advice affecting lgroup memory allocation
7909          * if we don't need to do lgroup optimizations on this system.
7910          */
7911 
7912         if ((behav == MADV_SEQUENTIAL &&
7913             (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) ||
7914             (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT ||
7915             behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) {
7916                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7917                 return (0);
7918         }
7919 
7920         if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT ||
7921             behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) {
7922                 /*
7923                  * Since we are going to unload hat mappings,
7924                  * we first have to flush the cache. Otherwise
7925                  * this might lead to a system panic if another
7926                  * thread is doing physio on the range whose
7927                  * mappings are unloaded by madvise(3C).
7928                  */
7929                 if (svd->softlockcnt > 0) {
7930                         /*
7931                          * If this is a shared segment, a non-zero softlockcnt
7932                          * means locked pages are still in use.
7933                          */
7934                         if (svd->type == MAP_SHARED) {
7935                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7936                                 return (EAGAIN);
7937                         }
7938                         /*
7939                          * Since we hold the segvn writer's lock,
7940                          * nobody can fill the cache with entries
7941                          * belonging to this seg during the purge.
7942                          * The flush either succeeds or we still
7943                          * have pending I/Os. In the latter case,
7944                          * madvise(3C) fails.
7945                          */
7946                         segvn_purge(seg);
7947                         if (svd->softlockcnt > 0) {
7948                                 /*
7949                                  * Since madvise(3C) is advisory and
7950                                  * it's not part of UNIX98, madvise(3C)
7951                                  * failure here doesn't cause any hardship.
7952                                  * Note that we don't block in the "as" layer.
7953                                  */
7954                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7955                                 return (EAGAIN);
7956                         }
7957                 } else if (svd->type == MAP_SHARED && svd->amp != NULL &&
7958                     svd->amp->a_softlockcnt > 0) {
7959                         /*
7960                          * Try to purge this amp's entries from pcache. It
7961                          * will succeed only if other segments that share the
7962                          * amp have no outstanding softlocks.
7963                          */
7964                         segvn_purge(seg);
7965                 }
7966         }
7967 
7968         amp = svd->amp;
7969         vp = svd->vp;
7970         if (behav == MADV_FREE) {
7971                 /*
7972                  * MADV_FREE is not supported for segments with an
7973                  * underlying object; if the anonmap is NULL, anon slots
7974                  * are not yet populated and there is nothing for
7975                  * us to do. As MADV_FREE is advisory, we don't
7976                  * return an error in either case.
7977                  */
7978                 if (vp != NULL || amp == NULL) {
7979                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7980                         return (0);
7981                 }
7982 
7983                 segvn_purge(seg);
7984 
7985                 page = seg_page(seg, addr);
7986                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7987                 anon_disclaim(amp, svd->anon_index + page, len);
7988                 ANON_LOCK_EXIT(&amp->a_rwlock);
7989                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7990                 return (0);
7991         }
7992 
7993         /*
7994          * If the advice is to be applied to the entire segment,
7995          * use the advice field in the segvn_data structure;
7996          * otherwise use the appropriate vpage entry.
7997          */
7998         if ((addr == seg->s_base) && (len == seg->s_size)) {
7999                 switch (behav) {
8000                 case MADV_ACCESS_LWP:
8001                 case MADV_ACCESS_MANY:
8002                 case MADV_ACCESS_DEFAULT:
8003                         /*
8004                          * Set memory allocation policy for this segment
8005                          */
8006                         policy = lgrp_madv_to_policy(behav, len, svd->type);
8007                         if (svd->type == MAP_SHARED)
8008                                 already_set = lgrp_shm_policy_set(policy, amp,
8009                                     svd->anon_index, vp, svd->offset, len);
8010                         else {
8011                                 /*
8012                                  * For private memory, need writers lock on
8013                                  * address space because the segment may be
8014                                  * split or concatenated when changing policy
8015                                  */
8016                                 if (AS_READ_HELD(seg->s_as,
8017                                     &seg->s_as->a_lock)) {
8018                                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8019                                         return (IE_RETRY);
8020                                 }
8021 
8022                                 already_set = lgrp_privm_policy_set(policy,
8023                                     &svd->policy_info, len);
8024                         }
8025 
8026                         /*
8027                          * If policy set already and it shouldn't be reapplied,
8028                          * don't do anything.
8029                          */
8030                         if (already_set &&
8031                             !LGRP_MEM_POLICY_REAPPLICABLE(policy))
8032                                 break;
8033 
8034                         /*
8035                          * Mark any existing pages in given range for
8036                          * migration
8037                          */
8038                         page_mark_migrate(seg, addr, len, amp, svd->anon_index,
8039                             vp, svd->offset, 1);
8040 
8041                         /*
8042                          * If same policy set already or this is a shared
8043                          * memory segment, don't need to try to concatenate
8044                          * segment with adjacent ones.
8045                          */
8046                         if (already_set || svd->type == MAP_SHARED)
8047                                 break;
8048 
8049                         /*
8050                          * Try to concatenate this segment with previous
8051                          * one and next one, since we changed policy for
8052                          * this one and it may be compatible with adjacent
8053                          * ones now.
8054                          */
8055                         prev = AS_SEGPREV(seg->s_as, seg);
8056                         next = AS_SEGNEXT(seg->s_as, seg);
8057 
8058                         if (next && next->s_ops == &segvn_ops &&
8059                             addr + len == next->s_base)
8060                                 (void) segvn_concat(seg, next, 1);
8061 
8062                         if (prev && prev->s_ops == &segvn_ops &&
8063                             addr == prev->s_base + prev->s_size) {
8064                                 /*
8065                                  * Drop lock for private data of current
8066                                  * segment before concatenating (deleting) it
8067                                  * and return IE_REATTACH to tell as_ctl() that
8068                                  * current segment has changed
8069                                  */
8070                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8071                                 if (!segvn_concat(prev, seg, 1))
8072                                         err = IE_REATTACH;
8073 
8074                                 return (err);
8075                         }
8076                         break;
8077 
8078                 case MADV_SEQUENTIAL:
8079                         /*
8080                          * Unloading the mappings guarantees that subsequent
8081                          * accesses fault, so MADV_SEQUENTIAL is detected in segvn_fault().
8082                          */
8083                         ASSERT(seg->s_szc == 0);
8084                         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
8085                         hat_unload(seg->s_as->a_hat, addr, len,
8086                             HAT_UNLOAD);
8087                         /* FALLTHROUGH */
8088                 case MADV_NORMAL:
8089                 case MADV_RANDOM:
8090                         svd->advice = (uchar_t)behav;
8091                         svd->pageadvice = 0;
8092                         break;
8093                 case MADV_WILLNEED:     /* handled in memcntl */
8094                 case MADV_DONTNEED:     /* handled in memcntl */
8095                 case MADV_FREE:         /* handled above */
8096                         break;
8097                 default:
8098                         err = EINVAL;
8099                 }
8100         } else {
8101                 caddr_t                 eaddr;
8102                 struct seg              *new_seg;
8103                 struct segvn_data       *new_svd;
8104                 u_offset_t              off;
8105                 caddr_t                 oldeaddr;
8106 
8107                 page = seg_page(seg, addr);
8108 
8109                 segvn_vpage(seg);
8110                 if (svd->vpage == NULL) {
8111                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8112                         return (ENOMEM);
8113                 }
8114 
8115                 switch (behav) {
8116                         struct vpage *bvpp, *evpp;
8117 
8118                 case MADV_ACCESS_LWP:
8119                 case MADV_ACCESS_MANY:
8120                 case MADV_ACCESS_DEFAULT:
8121                         /*
8122                          * Set memory allocation policy for portion of this
8123                          * segment
8124                          */
8125 
8126                         /*
8127                          * Align address and length of advice to page
8128                          * boundaries for large pages
8129                          */
8130                         if (seg->s_szc != 0) {
8131                                 size_t  pgsz;
8132 
8133                                 pgsz = page_get_pagesize(seg->s_szc);
8134                                 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
8135                                 len = P2ROUNDUP(len, pgsz);
8136                         }
8137 
8138                         /*
8139                          * Check to see whether policy is set already
8140                          */
8141                         policy = lgrp_madv_to_policy(behav, len, svd->type);
8142 
8143                         anon_index = svd->anon_index + page;
8144                         off = svd->offset + (uintptr_t)(addr - seg->s_base);
8145 
8146                         if (svd->type == MAP_SHARED)
8147                                 already_set = lgrp_shm_policy_set(policy, amp,
8148                                     anon_index, vp, off, len);
8149                         else
8150                                 already_set =
8151                                     (policy == svd->policy_info.mem_policy);
8152 
8153                         /*
8154                          * If policy set already and it shouldn't be reapplied,
8155                          * don't do anything.
8156                          */
8157                         if (already_set &&
8158                             !LGRP_MEM_POLICY_REAPPLICABLE(policy))
8159                                 break;
8160 
8161                         /*
8162                          * For private memory, need writers lock on
8163                          * address space because the segment may be
8164                          * split or concatenated when changing policy
8165                          */
8166                         if (svd->type == MAP_PRIVATE &&
8167                             AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) {
8168                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8169                                 return (IE_RETRY);
8170                         }
8171 
8172                         /*
8173                          * Mark any existing pages in given range for
8174                          * migration
8175                          */
8176                         page_mark_migrate(seg, addr, len, amp, svd->anon_index,
8177                             vp, svd->offset, 1);
8178 
8179                         /*
8180                          * Don't need to try to split or concatenate
8181                          * segments, since policy is same or this is a shared
8182                          * memory segment
8183                          */
8184                         if (already_set || svd->type == MAP_SHARED)
8185                                 break;
8186 
8187                         if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
8188                                 ASSERT(svd->amp == NULL);
8189                                 ASSERT(svd->tr_state == SEGVN_TR_OFF);
8190                                 ASSERT(svd->softlockcnt == 0);
8191                                 hat_leave_region(seg->s_as->a_hat, svd->rcookie,
8192                                     HAT_REGION_TEXT);
8193                                 svd->rcookie = HAT_INVALID_REGION_COOKIE;
8194                         }
8195 
8196                         /*
8197                          * Split off a new segment if the advice only applies to
8198                          * a portion of the existing segment starting in the middle.
8199                          */
8200                         new_seg = NULL;
8201                         eaddr = addr + len;
8202                         oldeaddr = seg->s_base + seg->s_size;
8203                         if (addr > seg->s_base) {
8204                                 /*
8205                                  * Must flush I/O page cache
8206                                  * before splitting segment
8207                                  */
8208                                 if (svd->softlockcnt > 0)
8209                                         segvn_purge(seg);
8210 
8211                                 /*
8212                                  * Split segment and return IE_REATTACH to tell
8213                                  * as_ctl() that current segment changed
8214                                  */
8215                                 new_seg = segvn_split_seg(seg, addr);
8216                                 new_svd = (struct segvn_data *)new_seg->s_data;
8217                                 err = IE_REATTACH;
8218 
8219                                 /*
8220                                  * If new segment ends where old one
8221                                  * did, try to concatenate the new
8222                                  * segment with next one.
8223                                  */
8224                                 if (eaddr == oldeaddr) {
8225                                         /*
8226                                          * Set policy for new segment
8227                                          */
8228                                         (void) lgrp_privm_policy_set(policy,
8229                                             &new_svd->policy_info,
8230                                             new_seg->s_size);
8231 
8232                                         next = AS_SEGNEXT(new_seg->s_as,
8233                                             new_seg);
8234 
8235                                         if (next &&
8236                                             next->s_ops == &segvn_ops &&
8237                                             eaddr == next->s_base)
8238                                                 (void) segvn_concat(new_seg,
8239                                                     next, 1);
8240                                 }
8241                         }
8242 
8243                         /*
8244                          * Split off the end of the existing segment if the
8245                          * advice only applies to a portion that ends before
8246                          * the end of the existing segment.
8247                          */
8248                         if (eaddr < oldeaddr) {
8249                                 /*
8250                                  * Must flush I/O page cache
8251                                  * before splitting segment
8252                                  */
8253                                 if (svd->softlockcnt > 0)
8254                                         segvn_purge(seg);
8255 
8256                                 /*
8257                                  * If the beginning of the old segment was
8258                                  * already split off, split the end off from
8259                                  * the new segment instead.
8260                                  */
8261                                 if (new_seg != NULL && new_seg != seg) {
8262                                         /*
8263                                          * Split segment
8264                                          */
8265                                         (void) segvn_split_seg(new_seg, eaddr);
8266 
8267                                         /*
8268                                          * Set policy for new segment
8269                                          */
8270                                         (void) lgrp_privm_policy_set(policy,
8271                                             &new_svd->policy_info,
8272                                             new_seg->s_size);
8273                                 } else {
8274                                         /*
8275                                          * Split segment and return IE_REATTACH
8276                                          * to tell as_ctl() that current
8277                                          * segment changed
8278                                          */
8279                                         (void) segvn_split_seg(seg, eaddr);
8280                                         err = IE_REATTACH;
8281 
8282                                         (void) lgrp_privm_policy_set(policy,
8283                                             &svd->policy_info, seg->s_size);
8284 
8285                                         /*
8286                                          * If new segment starts where old one
8287                                          * did, try to concatenate it with
8288                                          * previous segment.
8289                                          */
8290                                         if (addr == seg->s_base) {
8291                                                 prev = AS_SEGPREV(seg->s_as,
8292                                                     seg);
8293 
8294                                                 /*
8295                                                  * Drop lock for private data
8296                                                  * of current segment before
8297                                                  * concatenating (deleting) it
8298                                                  */
8299                                                 if (prev &&
8300                                                     prev->s_ops ==
8301                                                     &segvn_ops &&
8302                                                     addr == prev->s_base +
8303                                                     prev->s_size) {
8304                                                         SEGVN_LOCK_EXIT(
8305                                                             seg->s_as,
8306                                                             &svd->lock);
8307                                                         (void) segvn_concat(
8308                                                             prev, seg, 1);
8309                                                         return (err);
8310                                                 }
8311                                         }
8312                                 }
8313                         }
8314                         break;
8315                 case MADV_SEQUENTIAL:
8316                         ASSERT(seg->s_szc == 0);
8317                         ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
8318                         hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
8319                         /* FALLTHROUGH */
8320                 case MADV_NORMAL:
8321                 case MADV_RANDOM:
8322                         bvpp = &svd->vpage[page];
8323                         evpp = &svd->vpage[page + (len >> PAGESHIFT)];
8324                         for (; bvpp < evpp; bvpp++)
8325                                 VPP_SETADVICE(bvpp, behav);
8326                         svd->advice = MADV_NORMAL;
8327                         break;
8328                 case MADV_WILLNEED:     /* handled in memcntl */
8329                 case MADV_DONTNEED:     /* handled in memcntl */
8330                 case MADV_FREE:         /* handled above */
8331                         break;
8332                 default:
8333                         err = EINVAL;
8334                 }
8335         }
8336         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8337         return (err);
8338 }
8339 
8340 /*
8341  * There is one kind of inheritance that can be specified for pages:
8342  *
8343  *     SEGP_INH_ZERO - Pages should be zeroed in the child
8344  */
8345 static int
8346 segvn_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
8347 {
8348         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8349         struct vpage *bvpp, *evpp;
8350         size_t page;
8351         int ret = 0;
8352 
8353         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
8354 
8355         /* Can't support something we don't know about */
8356         if (behav != SEGP_INH_ZERO)
8357                 return (ENOTSUP);
8358 
8359         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
8360 
8361         /*
8362          * This must be a straightforward anonymous segment that is mapped
8363          * privately and is not backed by a vnode.
8364          */
8365         if (svd->tr_state != SEGVN_TR_OFF ||
8366             svd->type != MAP_PRIVATE ||
8367             svd->vp != NULL) {
8368                 ret = EINVAL;
8369                 goto out;
8370         }
8371 
8372         /*
8373          * If the entire segment has been marked as inherit zero, there is
8374          * nothing more to do.
8375          */
8376         if (svd->svn_inz == SEGVN_INZ_ALL) {
8377                 ret = 0;
8378                 goto out;
8379         }
8380 
8381         /*
8382          * If this applies to the entire segment, simply mark it and we're done.
8383          */
8384         if ((addr == seg->s_base) && (len == seg->s_size)) {
8385                 svd->svn_inz = SEGVN_INZ_ALL;
8386                 ret = 0;
8387                 goto out;
8388         }
8389 
8390         /*
8391          * We've been asked to mark a subset of this segment as inherit zero,
8392          * therefore we need to manipulate its vpages.
8393          */
8394         if (svd->vpage == NULL) {
8395                 segvn_vpage(seg);
8396                 if (svd->vpage == NULL) {
8397                         ret = ENOMEM;
8398                         goto out;
8399                 }
8400         }
8401 
8402         svd->svn_inz = SEGVN_INZ_VPP;
8403         page = seg_page(seg, addr);
8404         bvpp = &svd->vpage[page];
8405         evpp = &svd->vpage[page + (len >> PAGESHIFT)];
8406         for (; bvpp < evpp; bvpp++)
8407                 VPP_SETINHZERO(bvpp);
8408         ret = 0;
8409 
8410 out:
8411         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8412         return (ret);
8413 }
8414 
8415 /*
8416  * Create a vpage structure for this seg.
8417  */
8418 static void
8419 segvn_vpage(struct seg *seg)
8420 {
8421         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8422         struct vpage *vp, *evp;
8423         static pgcnt_t page_limit = 0;
8424 
8425         ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
8426 
8427         /*
8428          * If no vpage structure exists, allocate one.  Copy the protections
8429          * and the advice from the segment itself to the individual pages.
8430          */
8431         if (svd->vpage == NULL) {
8432                 /*
8433                  * Start by calculating the number of pages we must allocate to
8434                  * track the per-page vpage structs needed for this entire
8435                  * segment. If we know now that it will require more than our
8436                  * heuristic for the maximum amount of kmem we can consume,
8437                  * then fail. We do this here, instead of trying to detect this
8438                  * deep in page_resv and propagating the error up, since the
8439                  * entire memory allocation stack is not amenable to passing
8440                  * this back. Instead, it wants to keep trying.
8441                  *
8442                  * As a heuristic we set a page limit of 5/8 of total_pages
8443                  * for this allocation. We use shifts so that no floating
8444                  * point conversion takes place and we only need to do the
8445                  * calculation once.
8446                  */
8447                 ulong_t mem_needed = seg_pages(seg) * sizeof (struct vpage);
8448                 pgcnt_t npages = mem_needed >> PAGESHIFT;
8449 
8450                 if (page_limit == 0)
8451                         page_limit = (total_pages >> 1) + (total_pages >> 3);
8452 
8453                 if (npages > page_limit)
8454                         return;
8455 
8456                 svd->pageadvice = 1;
8457                 svd->vpage = kmem_zalloc(mem_needed, KM_SLEEP);
8458                 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
8459                 for (vp = svd->vpage; vp < evp; vp++) {
8460                         VPP_SETPROT(vp, svd->prot);
8461                         VPP_SETADVICE(vp, svd->advice);
8462                 }
8463         }
8464 }
8465 
8466 /*
8467  * Dump the pages belonging to this segvn segment.
8468  */
8469 static void
8470 segvn_dump(struct seg *seg)
8471 {
8472         struct segvn_data *svd;
8473         page_t *pp;
8474         struct anon_map *amp;
8475         ulong_t anon_index;
8476         struct vnode *vp;
8477         u_offset_t off, offset;
8478         pfn_t pfn;
8479         pgcnt_t page, npages;
8480         caddr_t addr;
8481 
8482         npages = seg_pages(seg);
8483         svd = (struct segvn_data *)seg->s_data;
8484         vp = svd->vp;
8485         off = offset = svd->offset;
8486         addr = seg->s_base;
8487 
8488         if ((amp = svd->amp) != NULL) {
8489                 anon_index = svd->anon_index;
8490                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8491         }
8492 
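             /*
              * Walk the segment a page at a time. Pages backed by anon slots
              * are looked up by their swap vnode and offset; otherwise the
              * segment's vnode and offset are used.
              */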
8493         for (page = 0; page < npages; page++, offset += PAGESIZE) {
8494                 struct anon *ap;
8495                 int we_own_it = 0;
8496 
8497                 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) {
8498                         swap_xlate_nopanic(ap, &vp, &off);
8499                 } else {
8500                         vp = svd->vp;
8501                         off = offset;
8502                 }
8503 
8504                 /*
8505                  * page_lookup_nowait() returns NULL if the page either does
8506                  * not exist or is exclusively locked, so fall back to
8507                  * page_exists() to find it without acquiring a lock.
8508                  */
8509 
8510                 if ((pp = page_lookup_nowait(vp, off, SE_SHARED)))
8511                         we_own_it = 1;
8512                 else
8513                         pp = page_exists(vp, off);
8514 
8515                 if (pp) {
8516                         pfn = page_pptonum(pp);
8517                         dump_addpage(seg->s_as, addr, pfn);
8518                         if (we_own_it)
8519                                 page_unlock(pp);
8520                 }
8521                 addr += PAGESIZE;
8522                 dump_timeleft = dump_timeout;
8523         }
8524 
8525         if (amp != NULL)
8526                 ANON_LOCK_EXIT(&amp->a_rwlock);
8527 }
8528 
8529 #ifdef DEBUG
8530 static uint32_t segvn_pglock_mtbf = 0;
8531 #endif
8532 
8533 #define PCACHE_SHWLIST          ((page_t *)-2)
8534 #define NOPCACHE_SHWLIST        ((page_t *)-1)
8535 
8536 /*
8537  * Lock/Unlock anon pages over a given range and return a shadow list. This
8538  * routine uses the global segment pcache to cache shadow lists (i.e. pp
8539  * arrays) of pages to avoid the overhead of per-page locking and unlocking
8540  * for subsequent IOs to the same parts of the segment. Currently shadow list
8541  * creation is only supported for pure anon segments. MAP_PRIVATE segment
8542  * pcache entries are tagged with the segment pointer, starting virtual
8543  * address and length. For MAP_SHARED segments this approach may add many
8544  * pcache entries for the same set of pages and lead to long hash chains that
8545  * decrease pcache lookup performance. To avoid this issue, shared segments
8546  * use the shared anon map and starting anon index for pcache entry tagging.
8547  * This allows all segments to share pcache entries for the same anon range
8548  * and reduces pcache chain length as well as memory overhead from duplicate
8549  * shadow lists and pcache entries.
8550  *
8551  * The softlockcnt field in the segvn_data structure counts the number of
8552  * pages F_SOFTLOCK'd via segvn_fault() and pagelock'd via this routine, but
8553  * the pagelock part of softlockcnt accounting is done differently for
8554  * private and shared segments. In the private segment case softlockcnt is
8555  * only incremented when a new shadow list is created, not when an existing
8556  * one is found via seg_plookup(). pcache entries have a reference count
8557  * incremented/decremented by each seg_plookup()/seg_pinactive() operation.
8558  * Only entries with a 0 reference count can be purged (and purging is
8559  * needed before a segment can be freed). When a private segment pcache
8560  * entry is purged, segvn_reclaim() decrements softlockcnt. Since in the
8561  * private segment case each pcache entry belongs only to this segment, we
8562  * can expect that once segvn_pagelock(L_PAGEUNLOCK) has been called for
8563  * all outstanding IOs in this segment, a purge will succeed and softlockcnt
8564  * will drop to 0. In the shared segment case the reference count in a
8565  * pcache entry counts active locks from many different segments, so we
8566  * can't expect segment purging to succeed even after
8567  * segvn_pagelock(L_PAGEUNLOCK) has been called for all outstanding IOs. To
8568  * determine when there are no pending pagelocks in the shared segment case
8569  * we don't rely on purging; instead softlockcnt is incremented and
8570  * decremented for every segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call,
8571  * regardless of whether a new shadow list was created or an existing one
8572  * was found. When softlockcnt drops to 0 this segment no longer has any
8573  * claims on pcached shadow lists and the segment can be freed even if
8574  * there are still active pcache entries shared by this segment's anon map.
8575  * Shared segment pcache entries belong to the anon map and are typically
8576  * removed when the anon map is freed after its segments are destroyed.
8577  */
8578 static int
8579 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
8580     enum lock_type type, enum seg_rw rw)
8581 {
8582         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8583         size_t np;
8584         pgcnt_t adjustpages;
8585         pgcnt_t npages;
8586         ulong_t anon_index;
8587         uint_t protchk = (rw == S_READ) ? PROT_READ : PROT_WRITE;
8588         uint_t error;
8589         struct anon_map *amp;
8590         pgcnt_t anpgcnt;
8591         struct page **pplist, **pl, *pp;
8592         caddr_t a;
8593         size_t page;
8594         caddr_t lpgaddr, lpgeaddr;
8595         anon_sync_obj_t cookie;
8596         int anlock;
8597         struct anon_map *pamp;
8598         caddr_t paddr;
8599         seg_preclaim_cbfunc_t preclaim_callback;
8600         size_t pgsz;
8601         int use_pcache;
8602         size_t wlen;
8603         uint_t pflags = 0;
8604         int sftlck_sbase = 0;
8605         int sftlck_send = 0;
8606 
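             /*
              * Fault injection for DEBUG kernels: when segvn_pglock_mtbf is
              * non-zero, pseudo-randomly fail L_PAGELOCK requests with ENOTSUP
              * or EFAULT (based on the current hrtime) so that callers' error
              * and fallback paths get exercised.
              */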
8607 #ifdef DEBUG
8608         if (type == L_PAGELOCK && segvn_pglock_mtbf) {
8609                 hrtime_t ts = gethrtime();
8610                 if ((ts % segvn_pglock_mtbf) == 0) {
8611                         return (ENOTSUP);
8612                 }
8613                 if ((ts % segvn_pglock_mtbf) == 1) {
8614                         return (EFAULT);
8615                 }
8616         }
8617 #endif
8618 
8619         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START,
8620             "segvn_pagelock: start seg %p addr %p", seg, addr);
8621 
8622         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
8623         ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
8624 
8625         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8626 
8627         /*
8628          * For now we only support pagelock to anon memory. We would have to
8629          * check protections for vnode objects and call into the vnode driver.
8630          * That's too much for a fast path. Let the fault entry point handle
8631          * it.
8632          */
8633         if (svd->vp != NULL) {
8634                 if (type == L_PAGELOCK) {
8635                         error = ENOTSUP;
8636                         goto out;
8637                 }
8638                 panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL");
8639         }
8640         if ((amp = svd->amp) == NULL) {
8641                 if (type == L_PAGELOCK) {
8642                         error = EFAULT;
8643                         goto out;
8644                 }
8645                 panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL");
8646         }
8647         if (rw != S_READ && rw != S_WRITE) {
8648                 if (type == L_PAGELOCK) {
8649                         error = ENOTSUP;
8650                         goto out;
8651                 }
8652                 panic("segvn_pagelock(L_PAGEUNLOCK): bad rw");
8653         }
8654 
8655         if (seg->s_szc != 0) {
8656                 /*
8657                  * We are adjusting the pagelock region to the large page size
8658                  * boundary because the unlocked part of a large page cannot
8659                  * be freed anyway unless all constituent pages of a large
8660                  * page are locked. Bigger regions reduce pcache chain length
8661                  * and improve lookup performance. The tradeoff is that the
8662                  * very first segvn_pagelock() call for a given page is more
8663                  * expensive if only 1 page_t is needed for IO. This is only
8664                  * an issue if pcache entry doesn't get reused by several
8665                  * subsequent calls. We optimize here for the case when pcache
8666                  * is heavily used by repeated IOs to the same address range.
8667                  *
8668                  * Note the segment's page size cannot change while we are
8669                  * holding the as lock, nor can it change while softlockcnt
8670                  * is not 0. This allows us to correctly recalculate the
8671                  * large page size region for the matching pageunlock/reclaim
8672                  * call, since the as_pageunlock() caller must always match
8673                  * the addr and len of the as_pagelock() call.
8674                  *
8675                  * For both pagelock and pageunlock calls, *ppp must point
8676                  * to the page_t pointer that corresponds to the real,
8677                  * unadjusted start address, i.e. the address before large
8678                  * page alignment was applied.
8679                  */
8680                 pgsz = page_get_pagesize(seg->s_szc);
8681                 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
8682                 adjustpages = btop((uintptr_t)(addr - lpgaddr));
8683         } else if (len < segvn_pglock_comb_thrshld) {
8684                 lpgaddr = addr;
8685                 lpgeaddr = addr + len;
8686                 adjustpages = 0;
8687                 pgsz = PAGESIZE;
8688         } else {
8689                 /*
8690                  * Align the address range of large enough requests to allow
8691                  * combining of different shadow lists into 1 to reduce memory
8692                  * overhead from potentially overlapping large shadow lists
8693                  * (worst case is we have a 1MB IO into buffers with start
8694                  * addresses separated by 4K).  Alignment is only possible if
8695                  * padded chunks have sufficient access permissions. Note
8696                  * permissions won't change between L_PAGELOCK and
8697                  * L_PAGEUNLOCK calls since non 0 softlockcnt will force
8698                  * segvn_setprot() to wait until softlockcnt drops to 0. This
8699                  * allows us to determine in L_PAGEUNLOCK the same range we
8700                  * computed in L_PAGELOCK.
8701                  *
8702                  * If alignment is limited by the segment ends, set the
8703                  * sftlck_sbase/sftlck_send flags. In the L_PAGELOCK case,
8704                  * when these flags are set, bump the per segment
8705                  * softlockcnt_sbase/softlockcnt_send counters. In the
8706                  * L_PAGEUNLOCK case decrease those counters if the
8707                  * sftlck_sbase/sftlck_send flags are set.  When
8708                  * softlockcnt_sbase/softlockcnt_send are non 0,
8709                  * segvn_concat()/segvn_extend_prev()/segvn_extend_next()
8710                  * won't merge the segments. This restriction, combined with
8711                  * the restriction on unmapping and splitting segments that
8712                  * have a non 0 softlockcnt, allows L_PAGEUNLOCK to correctly
8713                  * determine the same range that was previously locked by the
8714                  * matching L_PAGELOCK.
8715                  */
8716                 pflags = SEGP_PSHIFT | (segvn_pglock_comb_bshift << 16);
8717                 pgsz = PAGESIZE;
8718                 if (svd->type == MAP_PRIVATE) {
8719                         lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr,
8720                             segvn_pglock_comb_balign);
8721                         if (lpgaddr < seg->s_base) {
8722                                 lpgaddr = seg->s_base;
8723                                 sftlck_sbase = 1;
8724                         }
8725                 } else {
8726                         ulong_t aix = svd->anon_index + seg_page(seg, addr);
8727                         ulong_t aaix = P2ALIGN(aix, segvn_pglock_comb_palign);
8728                         if (aaix < svd->anon_index) {
8729                                 lpgaddr = seg->s_base;
8730                                 sftlck_sbase = 1;
8731                         } else {
8732                                 lpgaddr = addr - ptob(aix - aaix);
8733                                 ASSERT(lpgaddr >= seg->s_base);
8734                         }
8735                 }
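                     /*
                      * If the padded region in front of the request lacks the
                      * required protections, fall back to the unadjusted start
                      * address and disable shadow list combining.
                      */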
8736                 if (svd->pageprot && lpgaddr != addr) {
8737                         struct vpage *vp = &svd->vpage[seg_page(seg, lpgaddr)];
8738                         struct vpage *evp = &svd->vpage[seg_page(seg, addr)];
8739                         while (vp < evp) {
8740                                 if ((VPP_PROT(vp) & protchk) == 0) {
8741                                         break;
8742                                 }
8743                                 vp++;
8744                         }
8745                         if (vp < evp) {
8746                                 lpgaddr = addr;
8747                                 pflags = 0;
8748                         }
8749                 }
8750                 lpgeaddr = addr + len;
8751                 if (pflags) {
8752                         if (svd->type == MAP_PRIVATE) {
8753                                 lpgeaddr = (caddr_t)P2ROUNDUP(
8754                                     (uintptr_t)lpgeaddr,
8755                                     segvn_pglock_comb_balign);
8756                         } else {
8757                                 ulong_t aix = svd->anon_index +
8758                                     seg_page(seg, lpgeaddr);
8759                                 ulong_t aaix = P2ROUNDUP(aix,
8760                                     segvn_pglock_comb_palign);
8761                                 if (aaix < aix) {
8762                                         lpgeaddr = 0;
8763                                 } else {
8764                                         lpgeaddr += ptob(aaix - aix);
8765                                 }
8766                         }
8767                         if (lpgeaddr == 0 ||
8768                             lpgeaddr > seg->s_base + seg->s_size) {
8769                                 lpgeaddr = seg->s_base + seg->s_size;
8770                                 sftlck_send = 1;
8771                         }
8772                 }
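                     /*
                      * Likewise, if the padded region past the end of the
                      * request lacks the required protections, fall back to the
                      * unadjusted end address.
                      */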
8773                 if (svd->pageprot && lpgeaddr != addr + len) {
8774                         struct vpage *vp;
8775                         struct vpage *evp;
8776 
8777                         vp = &svd->vpage[seg_page(seg, addr + len)];
8778                         evp = &svd->vpage[seg_page(seg, lpgeaddr)];
8779 
8780                         while (vp < evp) {
8781                                 if ((VPP_PROT(vp) & protchk) == 0) {
8782                                         break;
8783                                 }
8784                                 vp++;
8785                         }
8786                         if (vp < evp) {
8787                                 lpgeaddr = addr + len;
8788                         }
8789                 }
8790                 adjustpages = btop((uintptr_t)(addr - lpgaddr));
8791         }
8792 
8793         /*
8794          * For MAP_SHARED segments we create pcache entries tagged by amp and
8795          * anon index so that we can share pcache entries with other segments
8796          * that map this amp.  For private segments pcache entries are tagged
8797          * with segment and virtual address.
8798          */
8799         if (svd->type == MAP_SHARED) {
8800                 pamp = amp;
8801                 paddr = (caddr_t)((lpgaddr - seg->s_base) +
8802                     ptob(svd->anon_index));
8803                 preclaim_callback = shamp_reclaim;
8804         } else {
8805                 pamp = NULL;
8806                 paddr = lpgaddr;
8807                 preclaim_callback = segvn_reclaim;
8808         }
8809 
8810         if (type == L_PAGEUNLOCK) {
8811                 VM_STAT_ADD(segvnvmstats.pagelock[0]);
8812 
8813                 /*
8814                  * Update hat ref bits for /proc. We need to make sure
8815                  * that threads tracing the ref and mod bits of the
8816                  * address space get the right data.
8817                  * Note: page ref and mod bits are updated at reclaim time.
8818                  */
8819                 if (seg->s_as->a_vbits) {
8820                         for (a = addr; a < addr + len; a += PAGESIZE) {
8821                                 if (rw == S_WRITE) {
8822                                         hat_setstat(seg->s_as, a,
8823                                             PAGESIZE, P_REF | P_MOD);
8824                                 } else {
8825                                         hat_setstat(seg->s_as, a,
8826                                             PAGESIZE, P_REF);
8827                                 }
8828                         }
8829                 }
8830 
8831                 /*
8832                  * Check the shadow list entry after the last page used in
8833                  * this IO request. If it's NOPCACHE_SHWLIST the shadow list
8834                  * was not inserted into pcache and is not large page
8835                  * adjusted.  In this case call reclaim callback directly and
8836                  * don't adjust the shadow list start and size for large
8837                  * pages.
8838                  */
8839                 npages = btop(len);
8840                 if ((*ppp)[npages] == NOPCACHE_SHWLIST) {
8841                         void *ptag;
8842                         if (pamp != NULL) {
8843                                 ASSERT(svd->type == MAP_SHARED);
8844                                 ptag = (void *)pamp;
8845                                 paddr = (caddr_t)((addr - seg->s_base) +
8846                                     ptob(svd->anon_index));
8847                         } else {
8848                                 ptag = (void *)seg;
8849                                 paddr = addr;
8850                         }
8851                         (*preclaim_callback)(ptag, paddr, len, *ppp, rw, 0);
8852                 } else {
8853                         ASSERT((*ppp)[npages] == PCACHE_SHWLIST ||
8854                             IS_SWAPFSVP((*ppp)[npages]->p_vnode));
8855                         len = lpgeaddr - lpgaddr;
8856                         npages = btop(len);
8857                         seg_pinactive(seg, pamp, paddr, len,
8858                             *ppp - adjustpages, rw, pflags, preclaim_callback);
8859                 }
8860 
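                     /*
                      * For shared segments softlockcnt is bumped on every
                      * pagelock call, so drop it here on every unlock. For
                      * private segments the corresponding decrement is done by
                      * segvn_reclaim().
                      */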
8861                 if (pamp != NULL) {
8862                         ASSERT(svd->type == MAP_SHARED);
8863                         ASSERT(svd->softlockcnt >= npages);
8864                         atomic_add_long((ulong_t *)&svd->softlockcnt, -npages);
8865                 }
8866 
8867                 if (sftlck_sbase) {
8868                         ASSERT(svd->softlockcnt_sbase > 0);
8869                         atomic_dec_ulong((ulong_t *)&svd->softlockcnt_sbase);
8870                 }
8871                 if (sftlck_send) {
8872                         ASSERT(svd->softlockcnt_send > 0);
8873                         atomic_dec_ulong((ulong_t *)&svd->softlockcnt_send);
8874                 }
8875 
8876                 /*
8877                  * If someone is blocked while unmapping, we purge the segment
8878                  * page cache and thus reclaim the pplist synchronously without
8879                  * waiting for seg_pasync_thread. This speeds up unmapping in
8880                  * cases where munmap(2) is called while raw async i/o is still
8881                  * in progress, or where a thread exits on a data fault in a
8882                  * multithreaded application.
8883                  */
8884                 if (AS_ISUNMAPWAIT(seg->s_as)) {
8885                         if (svd->softlockcnt == 0) {
8886                                 mutex_enter(&seg->s_as->a_contents);
8887                                 if (AS_ISUNMAPWAIT(seg->s_as)) {
8888                                         AS_CLRUNMAPWAIT(seg->s_as);
8889                                         cv_broadcast(&seg->s_as->a_cv);
8890                                 }
8891                                 mutex_exit(&seg->s_as->a_contents);
8892                         } else if (pamp == NULL) {
8893                                 /*
8894                                  * softlockcnt is not 0 and this is a
8895                                  * MAP_PRIVATE segment. Try to purge its
8896                                  * pcache entries to reduce softlockcnt.
8897                                  * If it drops to 0 segvn_reclaim()
8898                                  * will wake up a thread waiting on
8899                                  * unmapwait flag.
8900                                  *
8901                                  * We don't purge MAP_SHARED segments with non
8902                                  * 0 softlockcnt since IO is still in progress
8903                                  * for such segments.
8904                                  */
8905                                 ASSERT(svd->type == MAP_PRIVATE);
8906                                 segvn_purge(seg);
8907                         }
8908                 }
8909                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8910                 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END,
8911                     "segvn_pagelock: unlock seg %p addr %p", seg, addr);
8912                 return (0);
8913         }
8914 
8915         /* The L_PAGELOCK case ... */
8916 
8917         VM_STAT_ADD(segvnvmstats.pagelock[1]);
8918 
8919         /*
8920          * For MAP_SHARED segments we have to check protections before
8921          * seg_plookup() since pcache entries may be shared by many segments
8922          * with potentially different page protections.
8923          */
8924         if (pamp != NULL) {
8925                 ASSERT(svd->type == MAP_SHARED);
8926                 if (svd->pageprot == 0) {
8927                         if ((svd->prot & protchk) == 0) {
8928                                 error = EACCES;
8929                                 goto out;
8930                         }
8931                 } else {
8932                         /*
8933                          * check page protections
8934                          */
8935                         caddr_t ea;
8936 
8937                         if (seg->s_szc) {
8938                                 a = lpgaddr;
8939                                 ea = lpgeaddr;
8940                         } else {
8941                                 a = addr;
8942                                 ea = addr + len;
8943                         }
8944                         for (; a < ea; a += pgsz) {
8945                                 struct vpage *vp;
8946 
8947                                 ASSERT(seg->s_szc == 0 ||
8948                                     sameprot(seg, a, pgsz));
8949                                 vp = &svd->vpage[seg_page(seg, a)];
8950                                 if ((VPP_PROT(vp) & protchk) == 0) {
8951                                         error = EACCES;
8952                                         goto out;
8953                                 }
8954                         }
8955                 }
8956         }
8957 
8958         /*
8959          * try to find pages in segment page cache
8960          */
8961         pplist = seg_plookup(seg, pamp, paddr, lpgeaddr - lpgaddr, rw, pflags);
8962         if (pplist != NULL) {
8963                 if (pamp != NULL) {
8964                         npages = btop((uintptr_t)(lpgeaddr - lpgaddr));
8965                         ASSERT(svd->type == MAP_SHARED);
8966                         atomic_add_long((ulong_t *)&svd->softlockcnt,
8967                             npages);
8968                 }
8969                 if (sftlck_sbase) {
8970                         atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase);
8971                 }
8972                 if (sftlck_send) {
8973                         atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send);
8974                 }
8975                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8976                 *ppp = pplist + adjustpages;
8977                 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END,
8978                     "segvn_pagelock: cache hit seg %p addr %p", seg, addr);
8979                 return (0);
8980         }
8981 
8982         /*
8983          * For MAP_SHARED segments we already verified above that segment
8984          * protections allow this pagelock operation.
8985          */
8986         if (pamp == NULL) {
8987                 ASSERT(svd->type == MAP_PRIVATE);
8988                 if (svd->pageprot == 0) {
8989                         if ((svd->prot & protchk) == 0) {
8990                                 error = EACCES;
8991                                 goto out;
8992                         }
8993                         if (svd->prot & PROT_WRITE) {
8994                                 wlen = lpgeaddr - lpgaddr;
8995                         } else {
8996                                 wlen = 0;
8997                                 ASSERT(rw == S_READ);
8998                         }
8999                 } else {
9000                         int wcont = 1;
9001                         /*
9002                          * check page protections
9003                          */
9004                         for (a = lpgaddr, wlen = 0; a < lpgeaddr; a += pgsz) {
9005                                 struct vpage *vp;
9006 
9007                                 ASSERT(seg->s_szc == 0 ||
9008                                     sameprot(seg, a, pgsz));
9009                                 vp = &svd->vpage[seg_page(seg, a)];
9010                                 if ((VPP_PROT(vp) & protchk) == 0) {
9011                                         error = EACCES;
9012                                         goto out;
9013                                 }
9014                                 if (wcont && (VPP_PROT(vp) & PROT_WRITE)) {
9015                                         wlen += pgsz;
9016                                 } else {
9017                                         wcont = 0;
9018                                         ASSERT(rw == S_READ);
9019                                 }
9020                         }
9021                 }
9022                 ASSERT(rw == S_READ || wlen == lpgeaddr - lpgaddr);
9023                 ASSERT(rw == S_WRITE || wlen <= lpgeaddr - lpgaddr);
9024         }
9025 
9026         /*
9027          * Only build large page adjusted shadow list if we expect to insert
9028          * it into pcache. For large enough pages it's a big overhead to
9029          * create a shadow list of the entire large page. But this overhead
9030          * should be amortized over repeated pcache hits on subsequent reuse
9031          * of this shadow list (IO into any range within this shadow list will
9032          * find it in pcache since we large page align the request for pcache
9033          * lookups). pcache performance is improved with bigger shadow lists
9034          * as it reduces the time to pcache the entire big segment and reduces
9035          * pcache chain length.
9036          */
9037         if (seg_pinsert_check(seg, pamp, paddr,
9038             lpgeaddr - lpgaddr, pflags) == SEGP_SUCCESS) {
9039                 addr = lpgaddr;
9040                 len = lpgeaddr - lpgaddr;
9041                 use_pcache = 1;
9042         } else {
9043                 use_pcache = 0;
9044                 /*
9045                  * Since this entry will not be inserted into the pcache, we
9046                  * will not do any adjustments to the starting address or
9047                  * size of the memory to be locked.
9048                  */
9049                 adjustpages = 0;
9050         }
9051         npages = btop(len);
9052 
9053         pplist = kmem_alloc(sizeof (page_t *) * (npages + 1), KM_SLEEP);
9054         pl = pplist;
9055         *ppp = pplist + adjustpages;
9056         /*
9057          * If use_pcache is 0 this shadow list is not large page adjusted.
9058          * Record this info in the last entry of shadow array so that
9059          * L_PAGEUNLOCK can determine if it should large page adjust the
9060          * address range to find the real range that was locked.
9061          */
9062         pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST;
9063 
9064         page = seg_page(seg, addr);
9065         anon_index = svd->anon_index + page;
9066 
9067         anlock = 0;
9068         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
9069         ASSERT(amp->a_szc >= seg->s_szc);
9070         anpgcnt = page_get_pagecnt(amp->a_szc);
9071         for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
9072                 struct anon *ap;
9073                 struct vnode *vp;
9074                 u_offset_t off;
9075 
9076                 /*
9077                  * Lock and unlock the anon array only once per large page.
9078                  * anon_array_enter() locks the root anon slot according to
9079                  * a_szc, which can't change while the anon map is locked.  We
9080                  * lock the anon array the first time through this loop and
9081                  * each time we reach an anon index that corresponds to the
9082                  * root of a large page.
9083                  */
9084                 if (a == addr || P2PHASE(anon_index, anpgcnt) == 0) {
9085                         ASSERT(anlock == 0);
9086                         anon_array_enter(amp, anon_index, &cookie);
9087                         anlock = 1;
9088                 }
9089                 ap = anon_get_ptr(amp->ahp, anon_index);
9090 
9091                 /*
9092                  * We must never use seg_pcache for COW pages
9093                  * because we might end up with the original page
9094                  * still lying in seg_pcache even after the private
9095                  * page is created. This leads to data corruption as
9096                  * aio_write refers to the page still in the cache
9097                  * while all other accesses refer to the private
9098                  * page.
9099                  */
9100                 if (ap == NULL || ap->an_refcnt != 1) {
9101                         struct vpage *vpage;
9102 
9103                         if (seg->s_szc) {
9104                                 error = EFAULT;
9105                                 break;
9106                         }
9107                         if (svd->vpage != NULL) {
9108                                 vpage = &svd->vpage[seg_page(seg, a)];
9109                         } else {
9110                                 vpage = NULL;
9111                         }
9112                         ASSERT(anlock);
9113                         anon_array_exit(&cookie);
9114                         anlock = 0;
9115                         pp = NULL;
9116                         error = segvn_faultpage(seg->s_as->a_hat, seg, a, 0,
9117                             vpage, &pp, 0, F_INVAL, rw, 1);
9118                         if (error) {
9119                                 error = fc_decode(error);
9120                                 break;
9121                         }
9122                         anon_array_enter(amp, anon_index, &cookie);
9123                         anlock = 1;
9124                         ap = anon_get_ptr(amp->ahp, anon_index);
9125                         if (ap == NULL || ap->an_refcnt != 1) {
9126                                 error = EFAULT;
9127                                 break;
9128                         }
9129                 }
9130                 swap_xlate(ap, &vp, &off);
9131                 pp = page_lookup_nowait(vp, off, SE_SHARED);
9132                 if (pp == NULL) {
9133                         error = EFAULT;
9134                         break;
9135                 }
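                     /*
                      * If a physical swap slot was allocated for this anon page,
                      * free it now that the page is locked in memory.
                      */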
9136                 if (ap->an_pvp != NULL) {
9137                         anon_swap_free(ap, pp);
9138                 }
9139                 /*
9140                  * Unlock anon if this is the last slot in a large page.
9141                  */
9142                 if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) {
9143                         ASSERT(anlock);
9144                         anon_array_exit(&cookie);
9145                         anlock = 0;
9146                 }
9147                 *pplist++ = pp;
9148         }
9149         if (anlock) {           /* Ensure the lock is dropped */
9150                 anon_array_exit(&cookie);
9151         }
9152         ANON_LOCK_EXIT(&amp->a_rwlock);
9153 
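             /*
              * If we made it through the entire range, every page is locked
              * and the shadow list is complete; update the softlock counts
              * and optionally insert the list into pcache.
              */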
9154         if (a >= addr + len) {
9155                 atomic_add_long((ulong_t *)&svd->softlockcnt, npages);
9156                 if (pamp != NULL) {
9157                         ASSERT(svd->type == MAP_SHARED);
9158                         atomic_add_long((ulong_t *)&pamp->a_softlockcnt,
9159                             npages);
9160                         wlen = len;
9161                 }
9162                 if (sftlck_sbase) {
9163                         atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase);
9164                 }
9165                 if (sftlck_send) {
9166                         atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send);
9167                 }
9168                 if (use_pcache) {
9169                         (void) seg_pinsert(seg, pamp, paddr, len, wlen, pl,
9170                             rw, pflags, preclaim_callback);
9171                 }
9172                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9173                 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
9174                     "segvn_pagelock: cache fill seg %p addr %p", seg, addr);
9175                 return (0);
9176         }
9177 
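             /*
              * We failed part way through; unlock the pages we did lock and
              * free the shadow list before returning the error.
              */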
9178         pplist = pl;
9179         np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
9180         while (np > (uint_t)0) {
9181                 ASSERT(PAGE_LOCKED(*pplist));
9182                 page_unlock(*pplist);
9183                 np--;
9184                 pplist++;
9185         }
9186         kmem_free(pl, sizeof (page_t *) * (npages + 1));
9187 out:
9188         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9189         *ppp = NULL;
9190         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
9191             "segvn_pagelock: cache miss seg %p addr %p", seg, addr);
9192         return (error);
9193 }
9194 
9195 /*
9196  * purge any cached pages in the I/O page cache
9197  */
9198 static void
9199 segvn_purge(struct seg *seg)
9200 {
9201         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
9202 
9203         /*
9204          * pcache is only used by pure anon segments.
9205          */
9206         if (svd->amp == NULL || svd->vp != NULL) {
9207                 return;
9208         }
9209 
9210         /*
9211          * For MAP_SHARED segments a non 0 softlockcnt means that
9212          * active IO is still in progress via this segment. So we only
9213          * purge MAP_SHARED segments when their softlockcnt is 0.
9214          */
9215         if (svd->type == MAP_PRIVATE) {
9216                 if (svd->softlockcnt) {
9217                         seg_ppurge(seg, NULL, 0);
9218                 }
9219         } else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) {
9220                 seg_ppurge(seg, svd->amp, 0);
9221         }
9222 }
9223 
9224 /*
9225  * If the async argument is not 0 we are called from the pcache async thread
9226  * and don't hold the AS lock.
9227  */
9228 
9229 /*ARGSUSED*/
9230 static int
9231 segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
9232         enum seg_rw rw, int async)
9233 {
9234         struct seg *seg = (struct seg *)ptag;
9235         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
9236         pgcnt_t np, npages;
9237         struct page **pl;
9238 
9239         npages = np = btop(len);
9240         ASSERT(npages);
9241 
9242         ASSERT(svd->vp == NULL && svd->amp != NULL);
9243         ASSERT(svd->softlockcnt >= npages);
9244         ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
9245 
9246         pl = pplist;
9247 
9248         ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST);
9249         ASSERT(!async || pl[np] == PCACHE_SHWLIST);
9250 
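             /*
              * Record the ref (and, for writes, mod) bit for each page in the
              * shadow list and drop its shared lock.
              */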
9251         while (np > (uint_t)0) {
9252                 if (rw == S_WRITE) {
9253                         hat_setrefmod(*pplist);
9254                 } else {
9255                         hat_setref(*pplist);
9256                 }
9257                 page_unlock(*pplist);
9258                 np--;
9259                 pplist++;
9260         }
9261 
9262         kmem_free(pl, sizeof (page_t *) * (npages + 1));
9263 
9264         /*
9265          * If we are the pcache async thread we don't hold the AS lock, so if
9266          * softlockcnt drops to 0 after the decrement below the address space
9267          * may get freed. We can't allow that since, even after softlockcnt
9268          * drops to 0, we still need to access the as structure to possibly
9269          * wake up unmap waiters. To prevent the as from disappearing we take
9270          * this segment's segfree_syncmtx. segvn_free() also takes this mutex
9271          * as a barrier to make sure this routine completes before the segment
9272          * is freed.
9273          *
9274          * The second complication in the async case is a possible missed
9275          * wakeup of an unmap wait thread: without the as lock we may take the
9276          * a_contents lock before an unmap wait thread that was first to see a
9277          * non 0 softlockcnt, and so fail to wake it up. To avoid this race we
9278          * set the nounmapwait flag in the as structure if we drop softlockcnt
9279          * to 0 while called by the pcache async thread. An unmap wait thread
9280          * will not block if this flag is set.
9281          */
9282         if (async) {
9283                 mutex_enter(&svd->segfree_syncmtx);
9284         }
9285 
9286         if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) {
9287                 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
9288                         mutex_enter(&seg->s_as->a_contents);
9289                         if (async) {
9290                                 AS_SETNOUNMAPWAIT(seg->s_as);
9291                         }
9292                         if (AS_ISUNMAPWAIT(seg->s_as)) {
9293                                 AS_CLRUNMAPWAIT(seg->s_as);
9294                                 cv_broadcast(&seg->s_as->a_cv);
9295                         }
9296                         mutex_exit(&seg->s_as->a_contents);
9297                 }
9298         }
9299 
9300         if (async) {
9301                 mutex_exit(&svd->segfree_syncmtx);
9302         }
9303         return (0);
9304 }
9305 
9306 /*ARGSUSED*/
9307 static int
9308 shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
9309         enum seg_rw rw, int async)
9310 {
9311         amp_t *amp = (amp_t *)ptag;
9312         pgcnt_t np, npages;
9313         struct page **pl;
9314 
9315         npages = np = btop(len);
9316         ASSERT(npages);
9317         ASSERT(amp->a_softlockcnt >= npages);
9318 
9319         pl = pplist;
9320 
9321         ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST);
9322         ASSERT(!async || pl[np] == PCACHE_SHWLIST);
9323 
9324         while (np > (uint_t)0) {
9325                 if (rw == S_WRITE) {
9326                         hat_setrefmod(*pplist);
9327                 } else {
9328                         hat_setref(*pplist);
9329                 }
9330                 page_unlock(*pplist);
9331                 np--;
9332                 pplist++;
9333         }
9334 
9335         kmem_free(pl, sizeof (page_t *) * (npages + 1));
9336 
9337         /*
9338          * If somebody sleeps in anonmap_purge(), wake them up if
9339          * a_softlockcnt drops to 0. The anon map can't be freed until
9340          * a_softlockcnt drops to 0 and anonmap_purge() acquires a_purgemtx.
9341          */
9342         mutex_enter(&amp->a_purgemtx);
9343         if (!atomic_add_long_nv((ulong_t *)&amp->a_softlockcnt, -npages) &&
9344             amp->a_purgewait) {
9345                 amp->a_purgewait = 0;
9346                 cv_broadcast(&amp->a_purgecv);
9347         }
9348         mutex_exit(&amp->a_purgemtx);
9349         return (0);
9350 }
9351 
9352 /*
9353  * get a memory ID for an addr in a given segment
9354  *
9355  * XXX only creates PAGESIZE pages if anon slots are not initialized.
9356  * At fault time they will be relocated into larger pages.
9357  */
9358 static int
9359 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
9360 {
9361         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
9362         struct anon     *ap = NULL;
9363         ulong_t         anon_index;
9364         struct anon_map *amp;
9365         anon_sync_obj_t cookie;
9366 
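             /*
              * For private mappings, identify the memory by address space and
              * virtual address.
              */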
9367         if (svd->type == MAP_PRIVATE) {
9368                 memidp->val[0] = (uintptr_t)seg->s_as;
9369                 memidp->val[1] = (uintptr_t)addr;
9370                 return (0);
9371         }
9372 
9373         if (svd->type == MAP_SHARED) {
9374                 if (svd->vp) {
9375                         memidp->val[0] = (uintptr_t)svd->vp;
9376                         memidp->val[1] = (u_longlong_t)svd->offset +
9377                             (uintptr_t)(addr - seg->s_base);
9378                         return (0);
9379                 } else {
9380 
9381                         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
9382                         if ((amp = svd->amp) != NULL) {
9383                                 anon_index = svd->anon_index +
9384                                     seg_page(seg, addr);
9385                         }
9386                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9387 
9388                         ASSERT(amp != NULL);
9389 
9390                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
9391                         anon_array_enter(amp, anon_index, &cookie);
9392                         ap = anon_get_ptr(amp->ahp, anon_index);
9393                         if (ap == NULL) {
9394                                 page_t          *pp;
9395 
9396                                 pp = anon_zero(seg, addr, &ap, svd->cred);
9397                                 if (pp == NULL) {
9398                                         anon_array_exit(&cookie);
9399                                         ANON_LOCK_EXIT(&amp->a_rwlock);
9400                                         return (ENOMEM);
9401                                 }
9402                                 ASSERT(anon_get_ptr(amp->ahp, anon_index)
9403                                     == NULL);
9404                                 (void) anon_set_ptr(amp->ahp, anon_index,
9405                                     ap, ANON_SLEEP);
9406                                 page_unlock(pp);
9407                         }
9408 
9409                         anon_array_exit(&cookie);
9410                         ANON_LOCK_EXIT(&amp->a_rwlock);
9411 
9412                         memidp->val[0] = (uintptr_t)ap;
9413                         memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
9414                         return (0);
9415                 }
9416         }
9417         return (EINVAL);
9418 }
9419 
9420 static int
9421 sameprot(struct seg *seg, caddr_t a, size_t len)
9422 {
9423         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
9424         struct vpage *vpage;
9425         spgcnt_t pages = btop(len);
9426         uint_t prot;
9427 
9428         if (svd->pageprot == 0)
9429                 return (1);
9430 
9431         ASSERT(svd->vpage != NULL);
9432 
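             /*
              * Compare the protections of every vpage in the range against
              * those of the first page; any mismatch means the range is not
              * uniformly protected.
              */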
9433         vpage = &svd->vpage[seg_page(seg, a)];
9434         prot = VPP_PROT(vpage);
9435         vpage++;
9436         pages--;
9437         while (pages-- > 0) {
9438                 if (prot != VPP_PROT(vpage))
9439                         return (0);
9440                 vpage++;
9441         }
9442         return (1);
9443 }
9444 
9445 /*
9446  * Get memory allocation policy info for specified address in given segment
9447  */
9448 static lgrp_mem_policy_info_t *
9449 segvn_getpolicy(struct seg *seg, caddr_t addr)
9450 {
9451         struct anon_map         *amp;
9452         ulong_t                 anon_index;
9453         lgrp_mem_policy_info_t  *policy_info;
9454         struct segvn_data       *svn_data;
9455         u_offset_t              vn_off;
9456         vnode_t                 *vp;
9457 
9458         ASSERT(seg != NULL);
9459 
9460         svn_data = (struct segvn_data *)seg->s_data;
9461         if (svn_data == NULL)
9462                 return (NULL);
9463 
9464         /*
9465          * Get policy info for private or shared memory
9466          */
9467         if (svn_data->type != MAP_SHARED) {
9468                 if (svn_data->tr_state != SEGVN_TR_ON) {
9469                         policy_info = &svn_data->policy_info;
9470                 } else {
9471                         policy_info = &svn_data->tr_policy_info;
9472                         ASSERT(policy_info->mem_policy ==
9473                             LGRP_MEM_POLICY_NEXT_SEG);
9474                 }
9475         } else {
9476                 amp = svn_data->amp;
9477                 anon_index = svn_data->anon_index + seg_page(seg, addr);
9478                 vp = svn_data->vp;
9479                 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
9480                 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
9481         }
9482 
9483         return (policy_info);
9484 }
9485 
9486 /*
9487  * Bind a text vnode segment to an amp. If we bind successfully, mappings will
9488  * be established to pages of the per-vnode, per-lgroup amp rather than to the
9489  * vnode pages themselves. There's one amp per vnode text mapping per lgroup.
9490  * Many processes may share the same text replication amp. If a suitable amp
9491  * doesn't already exist in the svntr hash table, create a new one.  We may
9492  * fail to bind to an amp if the segment is not eligible for text replication;
9493  * the code below first checks for these conditions. If binding is successful
9494  * the segment tr_state is set to on and svd->amp points to the amp to use.
9495  * Otherwise tr_state is set to off and svd->amp remains NULL.
9496  */
9497 static void
9498 segvn_textrepl(struct seg *seg)
9499 {
9500         struct segvn_data       *svd = (struct segvn_data *)seg->s_data;
9501         vnode_t                 *vp = svd->vp;
9502         u_offset_t              off = svd->offset;
9503         size_t                  size = seg->s_size;
9504         u_offset_t              eoff = off + size;
9505         uint_t                  szc = seg->s_szc;
9506         ulong_t                 hash = SVNTR_HASH_FUNC(vp);
9507         svntr_t                 *svntrp;
9508         struct vattr            va;
9509         proc_t                  *p = seg->s_as->a_proc;
9510         lgrp_id_t               lgrp_id;
9511         lgrp_id_t               olid;
9512         int                     first;
9513         struct anon_map         *amp;
9514 
9515         ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
9516         ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
9517         ASSERT(p != NULL);
9518         ASSERT(svd->tr_state == SEGVN_TR_INIT);
9519         ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
9520         ASSERT(svd->flags & MAP_TEXT);
9521         ASSERT(svd->type == MAP_PRIVATE);
9522         ASSERT(vp != NULL && svd->amp == NULL);
9523         ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
9524         ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0);
9525         ASSERT(seg->s_as != &kas);
9526         ASSERT(off < eoff);
9527         ASSERT(svntr_hashtab != NULL);
9528 
9529         /*
9530          * If NUMA optimizations are no longer desired, bail out.
9531          */
9532         if (!lgrp_optimizations()) {
9533                 svd->tr_state = SEGVN_TR_OFF;
9534                 return;
9535         }
9536 
9537         /*
9538          * Avoid creating anon maps with a size bigger than the file size.
9539          * If the VOP_GETATTR() call fails, bail out.
9540          */
9541         va.va_mask = AT_SIZE | AT_MTIME | AT_CTIME;
9542         if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL) != 0) {
9543                 svd->tr_state = SEGVN_TR_OFF;
9544                 SEGVN_TR_ADDSTAT(gaerr);
9545                 return;
9546         }
9547         if (btopr(va.va_size) < btopr(eoff)) {
9548                 svd->tr_state = SEGVN_TR_OFF;
9549                 SEGVN_TR_ADDSTAT(overmap);
9550                 return;
9551         }
9552 
9553         /*
9554          * VVMEXEC may not be set yet if exec() prefaults the text segment.
9555          * Set this flag now, before vn_is_mapped(V_WRITE), so that a
9556          * MAP_SHARED mapping that checks whether the trcache for this vnode
9557          * needs to be invalidated can't miss us.
9558          */
9559         if (!(vp->v_flag & VVMEXEC)) {
9560                 mutex_enter(&vp->v_lock);
9561                 vp->v_flag |= VVMEXEC;
9562                 mutex_exit(&vp->v_lock);
9563         }
9564         mutex_enter(&svntr_hashtab[hash].tr_lock);
9565         /*
9566          * Bail out if potentially writable MAP_SHARED mappings exist to this
9567          * vnode.  We don't want to use old file contents from existing
9568          * replicas if this mapping was established after the original file
9569          * was changed.
9570          */
9571         if (vn_is_mapped(vp, V_WRITE)) {
9572                 mutex_exit(&svntr_hashtab[hash].tr_lock);
9573                 svd->tr_state = SEGVN_TR_OFF;
9574                 SEGVN_TR_ADDSTAT(wrcnt);
9575                 return;
9576         }
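             /*
              * Search this hash bucket's chain for an existing svntr entry
              * that covers the same vnode range with the same page size.
              */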
9577         svntrp = svntr_hashtab[hash].tr_head;
9578         for (; svntrp != NULL; svntrp = svntrp->tr_next) {
9579                 ASSERT(svntrp->tr_refcnt != 0);
9580                 if (svntrp->tr_vp != vp) {
9581                         continue;
9582                 }
9583 
9584                  * Bail out if the file or its attributes were changed after
9585                  * this replication entry was created, since we need to use the
9586                  * latest file contents.  Note that the mtime test alone is not
9587                  * sufficient because a user can explicitly change mtime back to
9588                  * its old value via the utimes(2) interfaces after modifying
9589                  * the file contents.  To detect this case we also have to test
9590                  * ctime, which among other things records the time of the last
9591                  * mtime change by utimes(2).  ctime is not changed when the file
9592                  * is only read or executed, so we expect that existing
9593                  * replication amps can typically be used.
9594                  * replication amp's can be used most of the time.
9595                  */
9596                 if (!svntrp->tr_valid ||
9597                     svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec ||
9598                     svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec ||
9599                     svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec ||
9600                     svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) {
9601                         mutex_exit(&svntr_hashtab[hash].tr_lock);
9602                         svd->tr_state = SEGVN_TR_OFF;
9603                         SEGVN_TR_ADDSTAT(stale);
9604                         return;
9605                 }
9606                 /*
9607                  * If off, eoff and szc match the current segment, we found an
9608                  * existing entry we can use.
9609                  */
9610                 if (svntrp->tr_off == off && svntrp->tr_eoff == eoff &&
9611                     svntrp->tr_szc == szc) {
9612                         break;
9613                 }
9614                 /*
9615                  * Don't create entries that differ but overlap in file
9616                  * offsets, to avoid replicating the same file pages more
9617                  * than once per lgroup.
9618                  */
9619                 if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) ||
9620                     (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) {
9621                         mutex_exit(&svntr_hashtab[hash].tr_lock);
9622                         svd->tr_state = SEGVN_TR_OFF;
9623                         SEGVN_TR_ADDSTAT(overlap);
9624                         return;
9625                 }
9626         }
9627         /*
9628          * If we didn't find an existing entry, create a new one.
9629          */
9630         if (svntrp == NULL) {
9631                 svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP);
9632                 if (svntrp == NULL) {
9633                         mutex_exit(&svntr_hashtab[hash].tr_lock);
9634                         svd->tr_state = SEGVN_TR_OFF;
9635                         SEGVN_TR_ADDSTAT(nokmem);
9636                         return;
9637                 }
9638 #ifdef DEBUG
9639                 {
9640                         lgrp_id_t i;
9641                         for (i = 0; i < NLGRPS_MAX; i++) {
9642                                 ASSERT(svntrp->tr_amp[i] == NULL);
9643                         }
9644                 }
9645 #endif /* DEBUG */
9646                 svntrp->tr_vp = vp;
9647                 svntrp->tr_off = off;
9648                 svntrp->tr_eoff = eoff;
9649                 svntrp->tr_szc = szc;
9650                 svntrp->tr_valid = 1;
9651                 svntrp->tr_mtime = va.va_mtime;
9652                 svntrp->tr_ctime = va.va_ctime;
9653                 svntrp->tr_refcnt = 0;
9654                 svntrp->tr_next = svntr_hashtab[hash].tr_head;
9655                 svntr_hashtab[hash].tr_head = svntrp;
9656         }
9657         first = 1;
9658 again:
9659         /*
9660          * We want to pick a replica with pages on the lgrp of the main thread
9661          * (t_tid = 1, aka T1).  Currently text replication is only optimized
9662          * for workloads that either have all threads of a process on the same
9663          * lgrp or execute their large text primarily on the main thread.
9664          */
9665         lgrp_id = p->p_t1_lgrpid;
9666         if (lgrp_id == LGRP_NONE) {
9667                 /*
9668                  * In case exec() prefaults text on a non-main thread, use
9669                  * the current thread's lgrpid.  It will become the main
9670                  * thread soon anyway.
9671                  */
9672                 lgrp_id = lgrp_home_id(curthread);
9673         }
9674         /*
9675          * Set p_tr_lgrpid to lgrp_id if it hasn't been set yet.  Otherwise
9676          * just set it to NLGRPS_MAX if it differs from the current process T1
9677          * home lgrp.  p_tr_lgrpid is used to detect whether a process uses
9678          * text replication and whether T1's new home differs from the lgrp
9679          * used for replication.  When that happens the asynchronous segvn
9680          * thread rechecks whether segments should change their replication
9681          * lgrps.  If we fail to set p_tr_lgrpid with atomic_cas_32, set it to
9682          * NLGRPS_MAX without cas if it's not already NLGRPS_MAX and not equal
9683          * to the lgrp_id we want to use.  We don't need cas here because
9684          * another thread racing between our non-atomic check and set can only
9685          * change p_tr_lgrpid to NLGRPS_MAX at this point.
9686          */
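             /*
              * For example: p_tr_lgrpid starts out as LGRP_NONE; the first
              * replicated segment cas's it to its own lgrp_id; once a later
              * binding (or the async update after a T1 home change) sees a
              * different lgrp the value collapses to NLGRPS_MAX, signalling
              * that replica placement may need rechecking.
              */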
9687         ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
9688         olid = p->p_tr_lgrpid;
9689         if (lgrp_id != olid && olid != NLGRPS_MAX) {
9690                 lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX;
9691                 if (atomic_cas_32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) !=
9692                     olid) {
9693                         olid = p->p_tr_lgrpid;
9694                         ASSERT(olid != LGRP_NONE);
9695                         if (olid != lgrp_id && olid != NLGRPS_MAX) {
9696                                 p->p_tr_lgrpid = NLGRPS_MAX;
9697                         }
9698                 }
9699                 ASSERT(p->p_tr_lgrpid != LGRP_NONE);
9700                 membar_producer();
9701                 /*
9702                  * lgrp_move_thread() only schedules an async recheck after a
9703                  * p->p_t1_lgrpid update if p->p_tr_lgrpid is not LGRP_NONE.
9704                  * Recheck p_t1_lgrpid once now that p->p_tr_lgrpid is no
9705                  * longer LGRP_NONE.
9706                  */
9707                 if (first && p->p_t1_lgrpid != LGRP_NONE &&
9708                     p->p_t1_lgrpid != lgrp_id) {
9709                         first = 0;
9710                         goto again;
9711                 }
9712         }
9713         /*
9714          * If no amp has been created yet for lgrp_id, create a new one as
9715          * long as we have enough memory to afford it.
9716          */
9717         if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) {
9718                 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
9719                 if (trmem > segvn_textrepl_max_bytes) {
9720                         SEGVN_TR_ADDSTAT(normem);
9721                         goto fail;
9722                 }
9723                 if (anon_try_resv_zone(size, NULL) == 0) {
9724                         SEGVN_TR_ADDSTAT(noanon);
9725                         goto fail;
9726                 }
9727                 amp = anonmap_alloc(size, size, ANON_NOSLEEP);
9728                 if (amp == NULL) {
9729                         anon_unresv_zone(size, NULL);
9730                         SEGVN_TR_ADDSTAT(nokmem);
9731                         goto fail;
9732                 }
9733                 ASSERT(amp->refcnt == 1);
9734                 amp->a_szc = szc;
9735                 svntrp->tr_amp[lgrp_id] = amp;
9736                 SEGVN_TR_ADDSTAT(newamp);
9737         }
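             /*
              * Take a reference on the svntr entry, link this segment onto
              * its list of segments and start using the per-lgroup amp.
              */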
9738         svntrp->tr_refcnt++;
9739         ASSERT(svd->svn_trnext == NULL);
9740         ASSERT(svd->svn_trprev == NULL);
9741         svd->svn_trnext = svntrp->tr_svnhead;
9742         svd->svn_trprev = NULL;
9743         if (svntrp->tr_svnhead != NULL) {
9744                 svntrp->tr_svnhead->svn_trprev = svd;
9745         }
9746         svntrp->tr_svnhead = svd;
9747         ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size);
9748         ASSERT(amp->refcnt >= 1);
9749         svd->amp = amp;
9750         svd->anon_index = 0;
9751         svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG;
9752         svd->tr_policy_info.mem_lgrpid = lgrp_id;
9753         svd->tr_state = SEGVN_TR_ON;
9754         mutex_exit(&svntr_hashtab[hash].tr_lock);
9755         SEGVN_TR_ADDSTAT(repl);
9756         return;
9757 fail:
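             /*
              * Back out the tentative segvn_textrepl_bytes accounting.  If we
              * created a new svntr entry above and nobody else references it,
              * unlink it from the hash chain and free it.
              */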
9758         ASSERT(segvn_textrepl_bytes >= size);
9759         atomic_add_long(&segvn_textrepl_bytes, -size);
9760         ASSERT(svntrp != NULL);
9761         ASSERT(svntrp->tr_amp[lgrp_id] == NULL);
9762         if (svntrp->tr_refcnt == 0) {
9763                 ASSERT(svntrp == svntr_hashtab[hash].tr_head);
9764                 svntr_hashtab[hash].tr_head = svntrp->tr_next;
9765                 mutex_exit(&svntr_hashtab[hash].tr_lock);
9766                 kmem_cache_free(svntr_cache, svntrp);
9767         } else {
9768                 mutex_exit(&svntr_hashtab[hash].tr_lock);
9769         }
9770         svd->tr_state = SEGVN_TR_OFF;
9771 }
9772 
9773 /*
9774  * Convert seg back to a regular vnode mapping seg by unbinding it from its
9775  * text replication amp.  This routine is typically called when the segment is
9776  * unmapped, but can also be called when the segment no longer qualifies for
9777  * text replication (e.g. due to protection changes).  If unload_unmap is set,
9778  * pass the HAT_UNLOAD_UNMAP flag to hat_unload_callback().  If we are the last
9779  * user of svntr, free all its anon maps and remove it from the hash table.
9780  */
9781 static void
9782 segvn_textunrepl(struct seg *seg, int unload_unmap)
9783 {
9784         struct segvn_data       *svd = (struct segvn_data *)seg->s_data;
9785         vnode_t                 *vp = svd->vp;
9786         u_offset_t              off = svd->offset;
9787         size_t                  size = seg->s_size;
9788         u_offset_t              eoff = off + size;
9789         uint_t                  szc = seg->s_szc;
9790         ulong_t                 hash = SVNTR_HASH_FUNC(vp);
9791         svntr_t                 *svntrp;
9792         svntr_t                 **prv_svntrp;
9793         lgrp_id_t               lgrp_id = svd->tr_policy_info.mem_lgrpid;
9794         lgrp_id_t               i;
9795 
9796         ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
9797         ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
9798             SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
9799         ASSERT(svd->tr_state == SEGVN_TR_ON);
9800         ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
9801         ASSERT(svd->amp != NULL);
9802         ASSERT(svd->amp->refcnt >= 1);
9803         ASSERT(svd->anon_index == 0);
9804         ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
9805         ASSERT(svntr_hashtab != NULL);
9806 
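             /*
              * Find the svntr entry this segment was bound to; it must exist
              * and its per-lgroup amp must match svd->amp.
              */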
9807         mutex_enter(&svntr_hashtab[hash].tr_lock);
9808         prv_svntrp = &svntr_hashtab[hash].tr_head;
9809         for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
9810                 ASSERT(svntrp->tr_refcnt != 0);
9811                 if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
9812                     svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
9813                         break;
9814                 }
9815         }
9816         if (svntrp == NULL) {
9817                 panic("segvn_textunrepl: svntr record not found");
9818         }
9819         if (svntrp->tr_amp[lgrp_id] != svd->amp) {
9820                 panic("segvn_textunrepl: amp mismatch");
9821         }
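             /*
              * Detach the segment from its replication amp and unlink it from
              * the svntr entry's list of segments.
              */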
9822         svd->tr_state = SEGVN_TR_OFF;
9823         svd->amp = NULL;
9824         if (svd->svn_trprev == NULL) {
9825                 ASSERT(svntrp->tr_svnhead == svd);
9826                 svntrp->tr_svnhead = svd->svn_trnext;
9827                 if (svntrp->tr_svnhead != NULL) {
9828                         svntrp->tr_svnhead->svn_trprev = NULL;
9829                 }
9830                 svd->svn_trnext = NULL;
9831         } else {
9832                 svd->svn_trprev->svn_trnext = svd->svn_trnext;
9833                 if (svd->svn_trnext != NULL) {
9834                         svd->svn_trnext->svn_trprev = svd->svn_trprev;
9835                         svd->svn_trnext = NULL;
9836                 }
9837                 svd->svn_trprev = NULL;
9838         }
9839         if (--svntrp->tr_refcnt) {
9840                 mutex_exit(&svntr_hashtab[hash].tr_lock);
9841                 goto done;
9842         }
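             /*
              * This was the last segment using the entry: unlink it from the
              * hash chain and free every per-lgroup anon map it holds.
              */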
9843         *prv_svntrp = svntrp->tr_next;
9844         mutex_exit(&svntr_hashtab[hash].tr_lock);
9845         for (i = 0; i < NLGRPS_MAX; i++) {
9846                 struct anon_map *amp = svntrp->tr_amp[i];
9847                 if (amp == NULL) {
9848                         continue;
9849                 }
9850                 ASSERT(amp->refcnt == 1);
9851                 ASSERT(amp->swresv == size);
9852                 ASSERT(amp->size == size);
9853                 ASSERT(amp->a_szc == szc);
9854                 if (amp->a_szc != 0) {
9855                         anon_free_pages(amp->ahp, 0, size, szc);
9856                 } else {
9857                         anon_free(amp->ahp, 0, size);
9858                 }
9859                 svntrp->tr_amp[i] = NULL;
9860                 ASSERT(segvn_textrepl_bytes >= size);
9861                 atomic_add_long(&segvn_textrepl_bytes, -size);
9862                 anon_unresv_zone(amp->swresv, NULL);
9863                 amp->refcnt = 0;
9864                 anonmap_free(amp);
9865         }
9866         kmem_cache_free(svntr_cache, svntrp);
9867 done:
9868         hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
9869             unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
9870 }
9871 
9872 /*
9873  * This is called when a MAP_SHARED writable mapping is created to a vnode
9874  * that is currently used for execution (VVMEXEC flag is set). In this case we
9875  * need to prevent further use of existing replicas.
9876  */
9877 static void
9878 segvn_inval_trcache(vnode_t *vp)
9879 {
9880         ulong_t                 hash = SVNTR_HASH_FUNC(vp);
9881         svntr_t                 *svntrp;
9882 
9883         ASSERT(vp->v_flag & VVMEXEC);
9884 
9885         if (svntr_hashtab == NULL) {
9886                 return;
9887         }
9888 
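             /*
              * Mark every svntr entry for this vnode invalid so that no new
              * segments bind to its now stale replicas.
              */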
9889         mutex_enter(&svntr_hashtab[hash].tr_lock);
9890         svntrp = svntr_hashtab[hash].tr_head;
9891         for (; svntrp != NULL; svntrp = svntrp->tr_next) {
9892                 ASSERT(svntrp->tr_refcnt != 0);
9893                 if (svntrp->tr_vp == vp && svntrp->tr_valid) {
9894                         svntrp->tr_valid = 0;
9895                 }
9896         }
9897         mutex_exit(&svntr_hashtab[hash].tr_lock);
9898 }
9899 
9900 static void
9901 segvn_trasync_thread(void)
9902 {
9903         callb_cpr_t cpr_info;
9904         kmutex_t cpr_lock;      /* just for CPR stuff */
9905 
9906         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
9907 
9908         CALLB_CPR_INIT(&cpr_info, &cpr_lock,
9909             callb_generic_cpr, "segvn_async");
9910 
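             /*
              * Convert the text replication update interval to clock ticks,
              * defaulting to segvn_update_tr_time seconds if it wasn't tuned.
              */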
9911         if (segvn_update_textrepl_interval == 0) {
9912                 segvn_update_textrepl_interval = segvn_update_tr_time * hz;
9913         } else {
9914                 segvn_update_textrepl_interval *= hz;
9915         }
9916         (void) timeout(segvn_trupdate_wakeup, NULL,
9917             segvn_update_textrepl_interval);
9918 
9919         for (;;) {
9920                 mutex_enter(&cpr_lock);
9921                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
9922                 mutex_exit(&cpr_lock);
9923                 sema_p(&segvn_trasync_sem);
9924                 mutex_enter(&cpr_lock);
9925                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
9926                 mutex_exit(&cpr_lock);
9927                 segvn_trupdate();
9928         }
9929 }
9930 
9931 static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0;
9932 
9933 static void
9934 segvn_trupdate_wakeup(void *dummy)
9935 {
9936         uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations();
9937 
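             /*
              * Only wake the async update thread if threads have migrated
              * across lgroups since the last wakeup.
              */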
9938         if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) {
9939                 segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs;
9940                 sema_v(&segvn_trasync_sem);
9941         }
9942 
9943         if (!segvn_disable_textrepl_update &&
9944             segvn_update_textrepl_interval != 0) {
9945                 (void) timeout(segvn_trupdate_wakeup, dummy,
9946                     segvn_update_textrepl_interval);
9947         }
9948 }
9949 
9950 static void
9951 segvn_trupdate(void)
9952 {
9953         ulong_t         hash;
9954         svntr_t         *svntrp;
9955         segvn_data_t    *svd;
9956 
9957         ASSERT(svntr_hashtab != NULL);
9958 
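             /*
              * Walk every hash bucket and, for each svntr entry, every
              * segment currently bound to it, rechecking replica placement.
              */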
9959         for (hash = 0; hash < svntr_hashtab_sz; hash++) {
9960                 mutex_enter(&svntr_hashtab[hash].tr_lock);
9961                 svntrp = svntr_hashtab[hash].tr_head;
9962                 for (; svntrp != NULL; svntrp = svntrp->tr_next) {
9963                         ASSERT(svntrp->tr_refcnt != 0);
9964                         svd = svntrp->tr_svnhead;
9965                         for (; svd != NULL; svd = svd->svn_trnext) {
9966                                 segvn_trupdate_seg(svd->seg, svd, svntrp,
9967                                     hash);
9968                         }
9969                 }
9970                 mutex_exit(&svntr_hashtab[hash].tr_lock);
9971         }
9972 }
9973 
9974 static void
9975 segvn_trupdate_seg(struct seg *seg,
9976         segvn_data_t *svd,
9977         svntr_t *svntrp,
9978         ulong_t hash)
9979 {
9980         proc_t                  *p;
9981         lgrp_id_t               lgrp_id;
9982         struct as               *as;
9983         size_t                  size;
9984         struct anon_map         *amp;
9985 
9986         ASSERT(svd->vp != NULL);
9987         ASSERT(svd->vp == svntrp->tr_vp);
9988         ASSERT(svd->offset == svntrp->tr_off);
9989         ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff);
9990         ASSERT(seg != NULL);
9991         ASSERT(svd->seg == seg);
9992         ASSERT(seg->s_data == (void *)svd);
9993         ASSERT(seg->s_szc == svntrp->tr_szc);
9994         ASSERT(svd->tr_state == SEGVN_TR_ON);
9995         ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
9996         ASSERT(svd->amp != NULL);
9997         ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
9998         ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE);
9999         ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX);
10000         ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp);
10001         ASSERT(svntrp->tr_refcnt != 0);
10002         ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));
10003 
10004         as = seg->s_as;
10005         ASSERT(as != NULL && as != &kas);
10006         p = as->a_proc;
10007         ASSERT(p != NULL);
10008         ASSERT(p->p_tr_lgrpid != LGRP_NONE);
10009         lgrp_id = p->p_t1_lgrpid;
10010         if (lgrp_id == LGRP_NONE) {
10011                 return;
10012         }
10013         ASSERT(lgrp_id < NLGRPS_MAX);
10014         if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
10015                 return;
10016         }
10017 
10018         /*
10019          * Use tryenter locking since we are taking the as/seg and svntr
10020          * hash locks in the reverse order from the synchronous thread.
10021          */
10022         if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) {
10023                 SEGVN_TR_ADDSTAT(nolock);
10024                 if (segvn_lgrp_trthr_migrs_snpsht) {
10025                         segvn_lgrp_trthr_migrs_snpsht = 0;
10026                 }
10027                 return;
10028         }
10029         if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
10030                 AS_LOCK_EXIT(as, &as->a_lock);
10031                 SEGVN_TR_ADDSTAT(nolock);
10032                 if (segvn_lgrp_trthr_migrs_snpsht) {
10033                         segvn_lgrp_trthr_migrs_snpsht = 0;
10034                 }
10035                 return;
10036         }
10037         size = seg->s_size;
10038         if (svntrp->tr_amp[lgrp_id] == NULL) {
10039                 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
10040                 if (trmem > segvn_textrepl_max_bytes) {
10041                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
10042                         AS_LOCK_EXIT(as, &as->a_lock);
10043                         atomic_add_long(&segvn_textrepl_bytes, -size);
10044                         SEGVN_TR_ADDSTAT(normem);
10045                         return;
10046                 }
10047                 if (anon_try_resv_zone(size, NULL) == 0) {
10048                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
10049                         AS_LOCK_EXIT(as, &as->a_lock);
10050                         atomic_add_long(&segvn_textrepl_bytes, -size);
10051                         SEGVN_TR_ADDSTAT(noanon);
10052                         return;
10053                 }
10054                 amp = anonmap_alloc(size, size, ANON_NOSLEEP);
10055                 if (amp == NULL) {
10056                         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
10057                         AS_LOCK_EXIT(as, &as->a_lock);
10058                         atomic_add_long(&segvn_textrepl_bytes, -size);
10059                         anon_unresv_zone(size, NULL);
10060                         SEGVN_TR_ADDSTAT(nokmem);
10061                         return;
10062                 }
10063                 ASSERT(amp->refcnt == 1);
10064                 amp->a_szc = seg->s_szc;
10065                 svntrp->tr_amp[lgrp_id] = amp;
10066         }
10067         /*
10068          * We don't need to drop the bucket lock, but here we give other
10069          * threads a chance.  svntr and svd can't be unlinked as long as the
10070          * segment lock is held as a writer and the AS lock is held as well.
10071          * After we retake the bucket lock we'll continue from where we left
10072          * off.  We'll be able to reach the end of either list since new
10073          * entries are always added to the beginning of the lists.
10074          */
10075         mutex_exit(&svntr_hashtab[hash].tr_lock);
10076         hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
10077         mutex_enter(&svntr_hashtab[hash].tr_lock);
10078 
10079         ASSERT(svd->tr_state == SEGVN_TR_ON);
10080         ASSERT(svd->amp != NULL);
10081         ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
10082         ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
10083         ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);
10084 
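              /*
               * Switch this segment to the replica amp for T1's new home lgrp;
               * its mappings will be re-established from that amp on
               * subsequent faults.
               */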
10085         svd->tr_policy_info.mem_lgrpid = lgrp_id;
10086         svd->amp = svntrp->tr_amp[lgrp_id];
10087         p->p_tr_lgrpid = NLGRPS_MAX;
10088         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
10089         AS_LOCK_EXIT(as, &as->a_lock);
10090 
10091         ASSERT(svntrp->tr_refcnt != 0);
10092         ASSERT(svd->vp == svntrp->tr_vp);
10093         ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
10094         ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
10095         ASSERT(svd->seg == seg);
10096         ASSERT(svd->tr_state == SEGVN_TR_ON);
10097 
10098         SEGVN_TR_ADDSTAT(asyncrepl);
10099 }