PVN_GETPAGE_{SZ,NUM} are misnamed and unnecessarily complicated
There is really no reason not to allow 8 pages all the time.  With the
current logic, we get the following:
Assuming 4kB pages (x86):
    _SZ  = ptob(8) /* 32kB */
    _NUM = 8
Assuming 8kB pages (sparc):
    _SZ  = ptob(8) /* 64kB */
    _NUM = 8
We'd have to deal with 16kB base pages in order for the _NUM #define not to
be 8 (it'd be 4 in that case).  So, in the spirit of simplicity, let's just
always grab 8 pages as there are no interesting systems with 16kB+ base pages.
Finally, the defines are poorly named.
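The simplified replacement (visible in the updated listing further below)
always grabs 8 pages and collapses to two plainly-named defines:

    #define FAULT_TMP_PAGES_NUM     0x8
    #define FAULT_TMP_PAGES_SZ      ptob(FAULT_TMP_PAGES_NUM)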
const-ify segment ops structures
There is no reason to keep the segment ops structures writable.
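A minimal sketch of the change (segvn shown; the same one-word change applies
to every segment driver's ops structure):

    -struct seg_ops segvn_ops = {
    +const struct seg_ops segvn_ops = {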
use NULL capable segop as a shorthand for no-capabilities
Instead of forcing every segment driver to implement a dummy "return 0"
function, handle a NULL capable segop function pointer as shorthand for "no
capabilities supported".
lower-case segops
use C99 initializers in segment ops structures
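With designated initializers, a driver names only the ops it implements;
omitted members default to NULL, which dovetails with the NULL-capable
shorthand above.  A hypothetical minimal driver:

    const struct seg_ops segfoo_ops = {
            .dup    = segfoo_dup,
            .unmap  = segfoo_unmap,
            .free   = segfoo_free,
            /* .capable omitted => NULL => no capabilities */
    };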
remove whole-process swapping
Long before Unix supported paging, it used process swapping to reclaim
memory.  The code is there and in theory it runs when we get *extremely* low
on memory.  In practice, it never runs since the definition of low-on-memory
is antiquated. (XXX: define what antiquated means)
You can check the number of swapout/swapin events with kstats:
$ kstat -p ::vm:swapin ::vm:swapout
remove xhat
The xhat infrastructure was added to support hardware such as the Zulu
graphics card - hardware which had on-board MMUs.  The VM used the xhat code
to keep the CPU's and Zulu's page tables in sync.  Since the only xhat user
was Zulu (which is gone), we can safely remove it, simplifying the whole VM
subsystem.
Assorted notes:
- the AS_BUSY flag was used solely by xhat


  61 #include <sys/vm.h>
  62 #include <sys/dumphdr.h>
  63 #include <sys/lgrp.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_vn.h>
  69 #include <vm/pvn.h>
  70 #include <vm/anon.h>
  71 #include <vm/page.h>
  72 #include <vm/vpage.h>
  73 #include <sys/proc.h>
  74 #include <sys/task.h>
  75 #include <sys/project.h>
  76 #include <sys/zone.h>
  77 #include <sys/shm_impl.h>
  78 
  79 /*
  80  * segvn_fault needs a temporary page list array.  To avoid calling kmem all
  81  * the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if
  82  * it can.  In the rare case when this page list is not large enough, it
  83  * goes and gets a large enough array from kmem.
  84  *
  85  * This small page list array covers either 8 pages or 64kB worth of pages -
  86  * whichever is smaller.
  87  */
  88 #define PVN_MAX_GETPAGE_SZ      0x10000
  89 #define PVN_MAX_GETPAGE_NUM     0x8
  90 
  91 #if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
  92 #define PVN_GETPAGE_SZ  ptob(PVN_MAX_GETPAGE_NUM)
  93 #define PVN_GETPAGE_NUM PVN_MAX_GETPAGE_NUM
  94 #else
  95 #define PVN_GETPAGE_SZ  PVN_MAX_GETPAGE_SZ
  96 #define PVN_GETPAGE_NUM btop(PVN_MAX_GETPAGE_SZ)
  97 #endif
  98 
  99 /*
 100  * Private seg op routines.
 101  */
 102 static int      segvn_dup(struct seg *seg, struct seg *newseg);
 103 static int      segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
 104 static void     segvn_free(struct seg *seg);
 105 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
 106                     caddr_t addr, size_t len, enum fault_type type,
 107                     enum seg_rw rw);
 108 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
 109 static int      segvn_setprot(struct seg *seg, caddr_t addr,
 110                     size_t len, uint_t prot);
 111 static int      segvn_checkprot(struct seg *seg, caddr_t addr,
 112                     size_t len, uint_t prot);
 113 static int      segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
 114 static size_t   segvn_swapout(struct seg *seg);
 115 static int      segvn_sync(struct seg *seg, caddr_t addr, size_t len,
 116                     int attr, uint_t flags);
 117 static size_t   segvn_incore(struct seg *seg, caddr_t addr, size_t len,
 118                     char *vec);
 119 static int      segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
 120                     int attr, int op, ulong_t *lockmap, size_t pos);
 121 static int      segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
 122                     uint_t *protv);
 123 static u_offset_t       segvn_getoffset(struct seg *seg, caddr_t addr);
 124 static int      segvn_gettype(struct seg *seg, caddr_t addr);
 125 static int      segvn_getvp(struct seg *seg, caddr_t addr,
 126                     struct vnode **vpp);
 127 static int      segvn_advise(struct seg *seg, caddr_t addr, size_t len,
 128                     uint_t behav);
 129 static void     segvn_dump(struct seg *seg);
 130 static int      segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
 131                     struct page ***ppp, enum lock_type type, enum seg_rw rw);
 132 static int      segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
 133                     uint_t szc);
 134 static int      segvn_getmemid(struct seg *seg, caddr_t addr,
 135                     memid_t *memidp);
 136 static lgrp_mem_policy_info_t   *segvn_getpolicy(struct seg *, caddr_t);
 137 static int      segvn_capable(struct seg *seg, segcapability_t capable);
 138 static int      segvn_inherit(struct seg *, caddr_t, size_t, uint_t);
 139 
 140 struct  seg_ops segvn_ops = {
 141         segvn_dup,
 142         segvn_unmap,
 143         segvn_free,
 144         segvn_fault,
 145         segvn_faulta,
 146         segvn_setprot,
 147         segvn_checkprot,
 148         segvn_kluster,
 149         segvn_swapout,
 150         segvn_sync,
 151         segvn_incore,
 152         segvn_lockop,
 153         segvn_getprot,
 154         segvn_getoffset,
 155         segvn_gettype,
 156         segvn_getvp,
 157         segvn_advise,
 158         segvn_dump,
 159         segvn_pagelock,
 160         segvn_setpagesize,
 161         segvn_getmemid,
 162         segvn_getpolicy,
 163         segvn_capable,
 164         segvn_inherit
 165 };
 166 
 167 /*
 168  * Common zfod structures, provided as a shorthand for others to use.
 169  */
 170 static segvn_crargs_t zfod_segvn_crargs =
 171         SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 172 static segvn_crargs_t kzfod_segvn_crargs =
 173         SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
 174         PROT_ALL & ~PROT_USER);
 175 static segvn_crargs_t stack_noexec_crargs =
 176         SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
 177 
 178 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;   /* user zfod argsp */
 179 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */
 180 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;     /* executable stack */
 181 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
 182 
 183 #define vpgtob(n)       ((n) * sizeof (struct vpage))   /* For brevity */
 184 


3839         u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base);
3840         ulong_t aindx = svd->anon_index + seg_page(seg, a);
3841         struct vpage *vpage = (svd->vpage != NULL) ?
3842             &svd->vpage[seg_page(seg, a)] : NULL;
3843         vnode_t *vp = svd->vp;
3844         page_t **ppa;
3845         uint_t  pszc;
3846         size_t  ppgsz;
3847         pgcnt_t ppages;
3848         faultcode_t err = 0;
3849         int ierr;
3850         int vop_size_err = 0;
3851         uint_t protchk, prot, vpprot;
3852         ulong_t i;
3853         int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
3854         anon_sync_obj_t an_cookie;
3855         enum seg_rw arw;
3856         int alloc_failed = 0;
3857         int adjszc_chk;
3858         struct vattr va;
3859         int xhat = 0;
3860         page_t *pplist;
3861         pfn_t pfn;
3862         int physcontig;
3863         int upgrdfail;
3864         int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */
3865         int tron = (svd->tr_state == SEGVN_TR_ON);
3866 
3867         ASSERT(szc != 0);
3868         ASSERT(vp != NULL);
3869         ASSERT(brkcow == 0 || amp != NULL);
3870         ASSERT(tron == 0 || amp != NULL);
3871         ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
3872         ASSERT(!(svd->flags & MAP_NORESERVE));
3873         ASSERT(type != F_SOFTUNLOCK);
3874         ASSERT(IS_P2ALIGNED(a, maxpgsz));
3875         ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages));
3876         ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
3877         ASSERT(seg->s_szc < NBBY * sizeof (int));
3878         ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz);
3879         ASSERT(svd->tr_state != SEGVN_TR_INIT);


3889                 switch (rw) {
3890                 case S_READ:
3891                         protchk = PROT_READ;
3892                         break;
3893                 case S_WRITE:
3894                         protchk = PROT_WRITE;
3895                         break;
3896                 case S_EXEC:
3897                         protchk = PROT_EXEC;
3898                         break;
3899                 case S_OTHER:
3900                 default:
3901                         protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
3902                         break;
3903                 }
3904         } else {
3905                 prot = svd->prot;
3906                 /* caller has already done segment level protection check. */
3907         }
3908 
3909         if (seg->s_as->a_hat != hat) {
3910                 xhat = 1;
3911         }
3912 
3913         if (rw == S_WRITE && segtype == MAP_PRIVATE) {
3914                 SEGVN_VMSTAT_FLTVNPAGES(2);
3915                 arw = S_READ;
3916         } else {
3917                 arw = rw;
3918         }
3919 
3920         ppa = kmem_alloc(ppasize, KM_SLEEP);
3921 
3922         VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]);
3923 
3924         for (;;) {
3925                 adjszc_chk = 0;
3926                 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) {
3927                         if (adjszc_chk) {
3928                                 while (szc < seg->s_szc) {
3929                                         uintptr_t e;
3930                                         uint_t tszc;
3931                                         tszc = segvn_anypgsz_vnode ? szc + 1 :
3932                                             seg->s_szc;


4248                                             off + (i << PAGESHIFT));
4249                                 }
4250 #endif /* DEBUG */
4251                                 /*
4252                                  * All pages are of szc we need and they are
4253                                  * all locked so they can't change szc. load
4254                                  * translations.
4255                                  *
4256                                  * if page got promoted since last check
4257                                  * we don't need pplist.
4258                                  */
4259                                 if (pplist != NULL) {
4260                                         page_free_replacement_page(pplist);
4261                                         page_create_putback(pages);
4262                                 }
4263                                 if (PP_ISMIGRATE(ppa[0])) {
4264                                         page_migrate(seg, a, ppa, pages);
4265                                 }
4266                                 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4267                                     prot, vpprot);
4268                                 if (!xhat) {
4269                                         hat_memload_array_region(hat, a, pgsz,
4270                                             ppa, prot & vpprot, hat_flag,
4271                                             svd->rcookie);
4272                                 } else {
4273                                         /*
4274                                          * avoid large xhat mappings to FS
4275                                          * pages so that hat_page_demote()
4276                                          * doesn't need to check for xhat
4277                                          * large mappings.
4278                                          * Don't use regions with xhats.
4279                                          */
4280                                         for (i = 0; i < pages; i++) {
4281                                                 hat_memload(hat,
4282                                                     a + (i << PAGESHIFT),
4283                                                     ppa[i], prot & vpprot,
4284                                                     hat_flag);
4285                                         }
4286                                 }
4287 
4288                                 if (!(hat_flag & HAT_LOAD_LOCK)) {
4289                                         for (i = 0; i < pages; i++) {
4290                                                 page_unlock(ppa[i]);
4291                                         }
4292                                 }
4293                                 if (amp != NULL) {
4294                                         anon_array_exit(&an_cookie);
4295                                         ANON_LOCK_EXIT(&amp->a_rwlock);
4296                                 }
4297                                 goto next;
4298                         }
4299 
4300                         /*
4301                          * See if upsize is possible.
4302                          */
4303                         if (pszc > szc && szc < seg->s_szc &&
4304                             (segvn_anypgsz_vnode || pszc >= seg->s_szc)) {
4305                                 pgcnt_t aphase;
4306                                 uint_t pszc1 = MIN(pszc, seg->s_szc);


4320                                                 page_free_replacement_page(pl);
4321                                                 page_create_putback(pages);
4322                                         }
4323                                         for (i = 0; i < pages; i++) {
4324                                                 page_unlock(ppa[i]);
4325                                         }
4326                                         if (amp != NULL) {
4327                                                 anon_array_exit(&an_cookie);
4328                                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4329                                         }
4330                                         pszc = pszc1;
4331                                         ierr = -2;
4332                                         break;
4333                                 }
4334                         }
4335 
4336                         /*
4337                          * check if we should use smallest mapping size.
4338                          */
4339                         upgrdfail = 0;
4340                         if (szc == 0 || xhat ||
4341                             (pszc >= szc &&
4342                             !IS_P2ALIGNED(pfn, pages)) ||
4343                             (pszc < szc &&
4344                             !segvn_full_szcpages(ppa, szc, &upgrdfail,
4345                             &pszc))) {
4346 
4347                                 if (upgrdfail && type != F_SOFTLOCK) {
4348                                         /*
4349                                          * segvn_full_szcpages failed to lock
4350                                          * all pages EXCL. Size down.
4351                                          */
4352                                         ASSERT(pszc < szc);
4353 
4354                                         SEGVN_VMSTAT_FLTVNPAGES(33);
4355 
4356                                         if (pplist != NULL) {
4357                                                 page_t *pl = pplist;
4358                                                 page_free_replacement_page(pl);
4359                                                 page_create_putback(pages);
4360                                         }
4361 
4362                                         for (i = 0; i < pages; i++) {
4363                                                 page_unlock(ppa[i]);
4364                                         }
4365                                         if (amp != NULL) {
4366                                                 anon_array_exit(&an_cookie);
4367                                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4368                                         }
4369                                         ierr = -1;
4370                                         break;
4371                                 }
4372                                 if (szc != 0 && !xhat && !upgrdfail) {
4373                                         segvn_faultvnmpss_align_err5++;
4374                                 }
4375                                 SEGVN_VMSTAT_FLTVNPAGES(34);
4376                                 if (pplist != NULL) {
4377                                         page_free_replacement_page(pplist);
4378                                         page_create_putback(pages);
4379                                 }
4380                                 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4381                                     prot, vpprot);
4382                                 if (upgrdfail && segvn_anypgsz_vnode) {
4383                                         /* SOFTLOCK case */
4384                                         hat_memload_array_region(hat, a, pgsz,
4385                                             ppa, prot & vpprot, hat_flag,
4386                                             svd->rcookie);
4387                                 } else {
4388                                         for (i = 0; i < pages; i++) {
4389                                                 hat_memload_region(hat,
4390                                                     a + (i << PAGESHIFT),
4391                                                     ppa[i], prot & vpprot,
4392                                                     hat_flag, svd->rcookie);


4933  *              Call VOP_GETPAGE over the range of non-anonymous pages
4934  *      endif
4935  *      Loop over all addresses requested
4936  *              Call segvn_faultpage passing in page list
4937  *                  to load up translations and handle anonymous pages
4938  *      endloop
4939  *      Load up translation to any additional pages in page list not
4940  *          already handled that fit into this segment
4941  */
4942 static faultcode_t
4943 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
4944     enum fault_type type, enum seg_rw rw)
4945 {
4946         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
4947         page_t **plp, **ppp, *pp;
4948         u_offset_t off;
4949         caddr_t a;
4950         struct vpage *vpage;
4951         uint_t vpprot, prot;
4952         int err;
4953         page_t *pl[PVN_GETPAGE_NUM + 1];
4954         size_t plsz, pl_alloc_sz;
4955         size_t page;
4956         ulong_t anon_index;
4957         struct anon_map *amp;
4958         int dogetpage = 0;
4959         caddr_t lpgaddr, lpgeaddr;
4960         size_t pgsz;
4961         anon_sync_obj_t cookie;
4962         int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
4963 
4964         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
4965         ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE);
4966 
4967         /*
4968          * First handle the easy stuff
4969          */
4970         if (type == F_SOFTUNLOCK) {
4971                 if (rw == S_READ_NOCOW) {
4972                         rw = S_READ;
4973                         ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));


5372                          * Only acquire reader lock to prevent amp->ahp
5373                          * from being changed.  It's ok to miss pages,
5374                          * hence we don't do anon_array_enter
5375                          */
5376                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5377                         ap = anon_get_ptr(amp->ahp, anon_index);
5378 
5379                         if (len <= PAGESIZE)
5380                                 /* inline non_anon() */
5381                                 dogetpage = (ap == NULL);
5382                         else
5383                                 dogetpage = non_anon(amp->ahp, anon_index,
5384                                     &vp_off, &vp_len);
5385                         ANON_LOCK_EXIT(&amp->a_rwlock);
5386                 }
5387 
5388                 if (dogetpage) {
5389                         enum seg_rw arw;
5390                         struct as *as = seg->s_as;
5391 
5392                         if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) {
5393                                 /*
5394                                  * Page list won't fit in local array,
5395                                  * allocate one of the needed size.
5396                                  */
5397                                 pl_alloc_sz =
5398                                     (btop(len) + 1) * sizeof (page_t *);
5399                                 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP);
5400                                 plp[0] = NULL;
5401                                 plsz = len;
5402                         } else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
5403                             svd->tr_state == SEGVN_TR_ON || rw == S_OTHER ||
5404                             (((size_t)(addr + PAGESIZE) <
5405                             (size_t)(seg->s_base + seg->s_size)) &&
5406                             hat_probe(as->a_hat, addr + PAGESIZE))) {
5407                                 /*
5408                                  * Ask VOP_GETPAGE to return the exact number
5409                                  * of pages if
5410                                  * (a) this is a COW fault, or
5411                                  * (b) this is a software fault, or
5412                                  * (c) next page is already mapped.
5413                                  */
5414                                 plsz = len;
5415                         } else {
5416                                 /*
5417                                  * Ask VOP_GETPAGE to return adjacent pages
5418                                  * within the segment.
5419                                  */
5420                                 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t)
5421                                     ((seg->s_base + seg->s_size) - addr));
5422                                 ASSERT((addr + plsz) <=
5423                                     (seg->s_base + seg->s_size));
5424                         }
5425 
5426                         /*
5427                          * Need to get some non-anonymous pages.
5428                          * We need to make only one call to GETPAGE to do
5429                          * this to prevent certain deadlocking conditions
5430                          * when we are doing locking.  In this case
5431                          * non_anon() should have picked up the smallest
5432                          * range which includes all the non-anonymous
5433                          * pages in the requested range.  We have to
5434                          * be careful regarding which rw flag to pass in
5435                          * because on a private mapping, the underlying
5436                          * object is never allowed to be written.
5437                          */
5438                         if (rw == S_WRITE && svd->type == MAP_PRIVATE) {
5439                                 arw = S_READ;
5440                         } else {


6067                  * unload any current translations that might exist).
6068                  */
6069                 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
6070         } else {
6071                 /*
6072                  * A shared mapping or a private mapping in which write
6073                  * protection is going to be denied - just change all the
6074                  * protections over the range of addresses in question.
6075                  * segvn does not support any other attributes other
6076                  * than prot so we can use hat_chgattr.
6077                  */
6078                 hat_chgattr(seg->s_as->a_hat, addr, len, prot);
6079         }
6080 
6081         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6082 
6083         return (0);
6084 }
6085 
6086 /*
6087  * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize,
6088  * to determine if the seg is capable of mapping the requested szc.
6089  */
6090 static int
6091 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
6092 {
6093         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6094         struct segvn_data *nsvd;
6095         struct anon_map *amp = svd->amp;
6096         struct seg *nseg;
6097         caddr_t eaddr = addr + len, a;
6098         size_t pgsz = page_get_pagesize(szc);
6099         pgcnt_t pgcnt = page_get_pagecnt(szc);
6100         int err;
6101         u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base);
6102 
6103         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
6104         ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
6105 
6106         if (seg->s_szc == szc || segvn_lpg_disable != 0) {
6107                 return (0);


7055          * see if they happen to be properly allocated.
7056          */
7057 
7058         /*
7059          * XXX We cheat here and don't lock the anon slots. We can't because
7060          * we may have been called from the anon layer which might already
7061          * have locked them. We are holding a refcnt on the slots so they
7062          * can't disappear. The worst that will happen is we'll get the wrong
7063          * names (vp, off) for the slots and make a poor klustering decision.
7064          */
7065         swap_xlate(ap, &vp1, &off1);
7066         swap_xlate(oap, &vp2, &off2);
7067 
7068 
7069         if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta)
7070                 return (-1);
7071         return (0);
7072 }
7073 
7074 /*
7075  * Swap the pages of seg out to secondary storage, returning the
7076  * number of bytes of storage freed.
7077  *
7078  * The basic idea is first to unload all translations and then to call
7079  * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the
7080  * swap device.  Pages to which other segments have mappings will remain
7081  * mapped and won't be swapped.  Our caller (as_swapout) has already
7082  * performed the unloading step.
7083  *
7084  * The value returned is intended to correlate well with the process's
7085  * memory requirements.  However, there are some caveats:
7086  * 1)   When given a shared segment as argument, this routine will
7087  *      only succeed in swapping out pages for the last sharer of the
7088  *      segment.  (Previous callers will only have decremented mapping
7089  *      reference counts.)
7090  * 2)   We assume that the hat layer maintains a large enough translation
7091  *      cache to capture process reference patterns.
7092  */
7093 static size_t
7094 segvn_swapout(struct seg *seg)
7095 {
7096         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7097         struct anon_map *amp;
7098         pgcnt_t pgcnt = 0;
7099         pgcnt_t npages;
7100         pgcnt_t page;
7101         ulong_t anon_index;
7102 
7103         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7104 
7105         SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
7106         /*
7107          * Find pages unmapped by our caller and force them
7108          * out to the virtual swap device.
7109          */
7110         if ((amp = svd->amp) != NULL)
7111                 anon_index = svd->anon_index;
7112         npages = seg->s_size >> PAGESHIFT;
7113         for (page = 0; page < npages; page++) {
7114                 page_t *pp;
7115                 struct anon *ap;
7116                 struct vnode *vp;
7117                 u_offset_t off;
7118                 anon_sync_obj_t cookie;
7119 
7120                 /*
7121                  * Obtain <vp, off> pair for the page, then look it up.
7122                  *
7123                  * Note that this code is willing to consider regular
7124                  * pages as well as anon pages.  Is this appropriate here?
7125                  */
7126                 ap = NULL;
7127                 if (amp != NULL) {
7128                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7129                         if (anon_array_try_enter(amp, anon_index + page,
7130                             &cookie)) {
7131                                 ANON_LOCK_EXIT(&amp->a_rwlock);
7132                                 continue;
7133                         }
7134                         ap = anon_get_ptr(amp->ahp, anon_index + page);
7135                         if (ap != NULL) {
7136                                 swap_xlate(ap, &vp, &off);
7137                         } else {
7138                                 vp = svd->vp;
7139                                 off = svd->offset + ptob(page);
7140                         }
7141                         anon_array_exit(&cookie);
7142                         ANON_LOCK_EXIT(&amp->a_rwlock);
7143                 } else {
7144                         vp = svd->vp;
7145                         off = svd->offset + ptob(page);
7146                 }
7147                 if (vp == NULL) {               /* untouched zfod page */
7148                         ASSERT(ap == NULL);
7149                         continue;
7150                 }
7151 
7152                 pp = page_lookup_nowait(vp, off, SE_SHARED);
7153                 if (pp == NULL)
7154                         continue;
7155 
7156 
7157                 /*
7158                  * Examine the page to see whether it can be tossed out,
7159                  * keeping track of how many we've found.
7160                  */
7161                 if (!page_tryupgrade(pp)) {
7162                         /*
7163                          * If the page has an i/o lock and no mappings,
7164                          * it's very likely that the page is being
7165                          * written out as a result of klustering.
7166                          * Assume this is so and take credit for it here.
7167                          */
7168                         if (!page_io_trylock(pp)) {
7169                                 if (!hat_page_is_mapped(pp))
7170                                         pgcnt++;
7171                         } else {
7172                                 page_io_unlock(pp);
7173                         }
7174                         page_unlock(pp);
7175                         continue;
7176                 }
7177                 ASSERT(!page_iolock_assert(pp));
7178 
7179 
7180                 /*
7181                  * Skip if page is locked or has mappings.
7182                  * We don't need the page_struct_lock to look at lckcnt
7183                  * and cowcnt because the page is exclusive locked.
7184                  */
7185                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
7186                     hat_page_is_mapped(pp)) {
7187                         page_unlock(pp);
7188                         continue;
7189                 }
7190 
7191                 /*
7192                  * dispose skips large pages so try to demote first.
7193                  */
7194                 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) {
7195                         page_unlock(pp);
7196                         /*
7197                          * XXX should skip the remaining page_t's of this
7198                          * large page.
7199                          */
7200                         continue;
7201                 }
7202 
7203                 ASSERT(pp->p_szc == 0);
7204 
7205                 /*
7206                  * No longer mapped -- we can toss it out.  How
7207                  * we do so depends on whether or not it's dirty.
7208                  */
7209                 if (hat_ismod(pp) && pp->p_vnode) {
7210                         /*
7211                          * We must clean the page before it can be
7212                          * freed.  Setting B_FREE will cause pvn_done
7213                          * to free the page when the i/o completes.
7214                          * XXX: This also causes it to be accounted
7215                          *      as a pageout instead of a swap: need
7216                          *      B_SWAPOUT bit to use instead of B_FREE.
7217                          *
7218                          * Hold the vnode before releasing the page lock
7219                          * to prevent it from being freed and re-used by
7220                          * some other thread.
7221                          */
7222                         VN_HOLD(vp);
7223                         page_unlock(pp);
7224 
7225                         /*
7226                          * Queue all i/o requests for the pageout thread
7227                          * to avoid saturating the pageout devices.
7228                          */
7229                         if (!queue_io_request(vp, off))
7230                                 VN_RELE(vp);
7231                 } else {
7232                         /*
7233                          * The page was clean, free it.
7234                          *
7235                          * XXX: Can we ever encounter modified pages
7236                          *      with no associated vnode here?
7237                          */
7238                         ASSERT(pp->p_vnode != NULL);
7239                         /*LINTED: constant in conditional context*/
7240                         VN_DISPOSE(pp, B_FREE, 0, kcred);
7241                 }
7242 
7243                 /*
7244                  * Credit now even if i/o is in progress.
7245                  */
7246                 pgcnt++;
7247         }
7248         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7249 
7250         /*
7251          * Wakeup pageout to initiate i/o on all queued requests.
7252          */
7253         cv_signal_pageout();
7254         return (ptob(pgcnt));
7255 }
7256 
7257 /*
7258  * Synchronize primary storage cache with real object in virtual memory.
7259  *
7260  * XXX - Anonymous pages should not be sync'ed out at all.
7261  */
7262 static int
7263 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
7264 {
7265         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7266         struct vpage *vpp;
7267         page_t *pp;
7268         u_offset_t offset;
7269         struct vnode *vp;
7270         u_offset_t off;
7271         caddr_t eaddr;
7272         int bflags;
7273         int err = 0;
7274         int segtype;
7275         int pageprot;
7276         int prot;
7277         ulong_t anon_index;


9672         /*
9673          * Get policy info for private or shared memory
9674          */
9675         if (svn_data->type != MAP_SHARED) {
9676                 if (svn_data->tr_state != SEGVN_TR_ON) {
9677                         policy_info = &svn_data->policy_info;
9678                 } else {
9679                         policy_info = &svn_data->tr_policy_info;
9680                         ASSERT(policy_info->mem_policy ==
9681                             LGRP_MEM_POLICY_NEXT_SEG);
9682                 }
9683         } else {
9684                 amp = svn_data->amp;
9685                 anon_index = svn_data->anon_index + seg_page(seg, addr);
9686                 vp = svn_data->vp;
9687                 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
9688                 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
9689         }
9690 
9691         return (policy_info);
9692 }
9693 
9694 /*ARGSUSED*/
9695 static int
9696 segvn_capable(struct seg *seg, segcapability_t capability)
9697 {
9698         return (0);
9699 }
9700 
9701 /*
9702  * Bind text vnode segment to an amp. If we bind successfully mappings will be
9703  * established to per vnode mapping per lgroup amp pages instead of to vnode
9704  * pages. There's one amp per vnode text mapping per lgroup. Many processes
9705  * may share the same text replication amp. If a suitable amp doesn't already
9706  * exist in svntr hash table create a new one.  We may fail to bind to amp if
9707  * segment is not eligible for text replication.  Code below first checks for
9708  * these conditions. If binding is successful segment tr_state is set to on
9709  * and svd->amp points to the amp to use. Otherwise tr_state is set to off and
9710  * svd->amp remains as NULL.
9711  */
9712 static void
9713 segvn_textrepl(struct seg *seg)
9714 {
9715         struct segvn_data       *svd = (struct segvn_data *)seg->s_data;
9716         vnode_t                 *vp = svd->vp;
9717         u_offset_t              off = svd->offset;
9718         size_t                  size = seg->s_size;




  61 #include <sys/vm.h>
  62 #include <sys/dumphdr.h>
  63 #include <sys/lgrp.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/seg_vn.h>
  69 #include <vm/pvn.h>
  70 #include <vm/anon.h>
  71 #include <vm/page.h>
  72 #include <vm/vpage.h>
  73 #include <sys/proc.h>
  74 #include <sys/task.h>
  75 #include <sys/project.h>
  76 #include <sys/zone.h>
  77 #include <sys/shm_impl.h>
  78 
  79 /*
  80  * segvn_fault needs a temporary page list array.  To avoid calling kmem all
  81  * the time, it creates a small (FAULT_TMP_PAGES_NUM entry) array and uses
  82  * it if it can.  In the rare case when this page list is not large enough,
  83  * it goes and gets a large enough array from kmem.
  84  */
  85 #define FAULT_TMP_PAGES_NUM     0x8
  86 #define FAULT_TMP_PAGES_SZ      ptob(FAULT_TMP_PAGES_NUM)











  87 
  88 /*
  89  * Private seg op routines.
  90  */
  91 static int      segvn_dup(struct seg *seg, struct seg *newseg);
  92 static int      segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
  93 static void     segvn_free(struct seg *seg);
  94 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
  95                     caddr_t addr, size_t len, enum fault_type type,
  96                     enum seg_rw rw);
  97 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
  98 static int      segvn_setprot(struct seg *seg, caddr_t addr,
  99                     size_t len, uint_t prot);
 100 static int      segvn_checkprot(struct seg *seg, caddr_t addr,
 101                     size_t len, uint_t prot);
 102 static int      segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);

 103 static int      segvn_sync(struct seg *seg, caddr_t addr, size_t len,
 104                     int attr, uint_t flags);
 105 static size_t   segvn_incore(struct seg *seg, caddr_t addr, size_t len,
 106                     char *vec);
 107 static int      segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
 108                     int attr, int op, ulong_t *lockmap, size_t pos);
 109 static int      segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
 110                     uint_t *protv);
 111 static u_offset_t       segvn_getoffset(struct seg *seg, caddr_t addr);
 112 static int      segvn_gettype(struct seg *seg, caddr_t addr);
 113 static int      segvn_getvp(struct seg *seg, caddr_t addr,
 114                     struct vnode **vpp);
 115 static int      segvn_advise(struct seg *seg, caddr_t addr, size_t len,
 116                     uint_t behav);
 117 static void     segvn_dump(struct seg *seg);
 118 static int      segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
 119                     struct page ***ppp, enum lock_type type, enum seg_rw rw);
 120 static int      segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
 121                     uint_t szc);
 122 static int      segvn_getmemid(struct seg *seg, caddr_t addr,
 123                     memid_t *memidp);
 124 static lgrp_mem_policy_info_t   *segvn_getpolicy(struct seg *, caddr_t);

 125 static int      segvn_inherit(struct seg *, caddr_t, size_t, uint_t);
 126 
 127 const struct seg_ops segvn_ops = {
 128         .dup            = segvn_dup,
 129         .unmap          = segvn_unmap,
 130         .free           = segvn_free,
 131         .fault          = segvn_fault,
 132         .faulta         = segvn_faulta,
 133         .setprot        = segvn_setprot,
 134         .checkprot      = segvn_checkprot,
 135         .kluster        = segvn_kluster,
 136         .sync           = segvn_sync,
 137         .incore         = segvn_incore,
 138         .lockop         = segvn_lockop,
 139         .getprot        = segvn_getprot,
 140         .getoffset      = segvn_getoffset,
 141         .gettype        = segvn_gettype,
 142         .getvp          = segvn_getvp,
 143         .advise         = segvn_advise,
 144         .dump           = segvn_dump,
 145         .pagelock       = segvn_pagelock,
 146         .setpagesize    = segvn_setpagesize,
 147         .getmemid       = segvn_getmemid,
 148         .getpolicy      = segvn_getpolicy,
 149         .inherit        = segvn_inherit,


 150 };
 151 
 152 /*
 153  * Common zfod structures, provided as a shorthand for others to use.
 154  */
 155 static segvn_crargs_t zfod_segvn_crargs =
 156         SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 157 static segvn_crargs_t kzfod_segvn_crargs =
 158         SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
 159         PROT_ALL & ~PROT_USER);
 160 static segvn_crargs_t stack_noexec_crargs =
 161         SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
 162 
 163 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;   /* user zfod argsp */
 164 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */
 165 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;     /* executable stack */
 166 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
 167 
 168 #define vpgtob(n)       ((n) * sizeof (struct vpage))   /* For brevity */
 169 


3824         u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base);
3825         ulong_t aindx = svd->anon_index + seg_page(seg, a);
3826         struct vpage *vpage = (svd->vpage != NULL) ?
3827             &svd->vpage[seg_page(seg, a)] : NULL;
3828         vnode_t *vp = svd->vp;
3829         page_t **ppa;
3830         uint_t  pszc;
3831         size_t  ppgsz;
3832         pgcnt_t ppages;
3833         faultcode_t err = 0;
3834         int ierr;
3835         int vop_size_err = 0;
3836         uint_t protchk, prot, vpprot;
3837         ulong_t i;
3838         int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
3839         anon_sync_obj_t an_cookie;
3840         enum seg_rw arw;
3841         int alloc_failed = 0;
3842         int adjszc_chk;
3843         struct vattr va;

3844         page_t *pplist;
3845         pfn_t pfn;
3846         int physcontig;
3847         int upgrdfail;
3848         int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */
3849         int tron = (svd->tr_state == SEGVN_TR_ON);
3850 
3851         ASSERT(szc != 0);
3852         ASSERT(vp != NULL);
3853         ASSERT(brkcow == 0 || amp != NULL);
3854         ASSERT(tron == 0 || amp != NULL);
3855         ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
3856         ASSERT(!(svd->flags & MAP_NORESERVE));
3857         ASSERT(type != F_SOFTUNLOCK);
3858         ASSERT(IS_P2ALIGNED(a, maxpgsz));
3859         ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages));
3860         ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
3861         ASSERT(seg->s_szc < NBBY * sizeof (int));
3862         ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz);
3863         ASSERT(svd->tr_state != SEGVN_TR_INIT);


3873                 switch (rw) {
3874                 case S_READ:
3875                         protchk = PROT_READ;
3876                         break;
3877                 case S_WRITE:
3878                         protchk = PROT_WRITE;
3879                         break;
3880                 case S_EXEC:
3881                         protchk = PROT_EXEC;
3882                         break;
3883                 case S_OTHER:
3884                 default:
3885                         protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
3886                         break;
3887                 }
3888         } else {
3889                 prot = svd->prot;
3890                 /* caller has already done segment level protection check. */
3891         }
3892 




3893         if (rw == S_WRITE && segtype == MAP_PRIVATE) {
3894                 SEGVN_VMSTAT_FLTVNPAGES(2);
3895                 arw = S_READ;
3896         } else {
3897                 arw = rw;
3898         }
3899 
3900         ppa = kmem_alloc(ppasize, KM_SLEEP);
3901 
3902         VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]);
3903 
3904         for (;;) {
3905                 adjszc_chk = 0;
3906                 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) {
3907                         if (adjszc_chk) {
3908                                 while (szc < seg->s_szc) {
3909                                         uintptr_t e;
3910                                         uint_t tszc;
3911                                         tszc = segvn_anypgsz_vnode ? szc + 1 :
3912                                             seg->s_szc;


4228                                             off + (i << PAGESHIFT));
4229                                 }
4230 #endif /* DEBUG */
4231                                 /*
4232                                  * All pages are of szc we need and they are
4233                                  * all locked so they can't change szc. load
4234                                  * translations.
4235                                  *
4236                                  * if page got promoted since last check
4237                                  * we don't need pplist.
4238                                  */
4239                                 if (pplist != NULL) {
4240                                         page_free_replacement_page(pplist);
4241                                         page_create_putback(pages);
4242                                 }
4243                                 if (PP_ISMIGRATE(ppa[0])) {
4244                                         page_migrate(seg, a, ppa, pages);
4245                                 }
4246                                 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4247                                     prot, vpprot);

4248                                 hat_memload_array_region(hat, a, pgsz,
4249                                     ppa, prot & vpprot, hat_flag,
4250                                     svd->rcookie);















4251 
4252                                 if (!(hat_flag & HAT_LOAD_LOCK)) {
4253                                         for (i = 0; i < pages; i++) {
4254                                                 page_unlock(ppa[i]);
4255                                         }
4256                                 }
4257                                 if (amp != NULL) {
4258                                         anon_array_exit(&an_cookie);
4259                                         ANON_LOCK_EXIT(&amp->a_rwlock);
4260                                 }
4261                                 goto next;
4262                         }
4263 
4264                         /*
4265                          * See if upsize is possible.
4266                          */
4267                         if (pszc > szc && szc < seg->s_szc &&
4268                             (segvn_anypgsz_vnode || pszc >= seg->s_szc)) {
4269                                 pgcnt_t aphase;
4270                                 uint_t pszc1 = MIN(pszc, seg->s_szc);


4284                                                 page_free_replacement_page(pl);
4285                                                 page_create_putback(pages);
4286                                         }
4287                                         for (i = 0; i < pages; i++) {
4288                                                 page_unlock(ppa[i]);
4289                                         }
4290                                         if (amp != NULL) {
4291                                                 anon_array_exit(&an_cookie);
4292                                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4293                                         }
4294                                         pszc = pszc1;
4295                                         ierr = -2;
4296                                         break;
4297                                 }
4298                         }
4299 
4300                         /*
4301                          * check if we should use smallest mapping size.
4302                          */
4303                         upgrdfail = 0;
4304                         if (szc == 0 ||
4305                             (pszc >= szc &&
4306                             !IS_P2ALIGNED(pfn, pages)) ||
4307                             (pszc < szc &&
4308                             !segvn_full_szcpages(ppa, szc, &upgrdfail,
4309                             &pszc))) {
4310 
4311                                 if (upgrdfail && type != F_SOFTLOCK) {
4312                                         /*
4313                                          * segvn_full_szcpages failed to lock
4314                                          * all pages EXCL. Size down.
4315                                          */
4316                                         ASSERT(pszc < szc);
4317 
4318                                         SEGVN_VMSTAT_FLTVNPAGES(33);
4319 
4320                                         if (pplist != NULL) {
4321                                                 page_t *pl = pplist;
4322                                                 page_free_replacement_page(pl);
4323                                                 page_create_putback(pages);
4324                                         }
4325 
4326                                         for (i = 0; i < pages; i++) {
4327                                                 page_unlock(ppa[i]);
4328                                         }
4329                                         if (amp != NULL) {
4330                                                 anon_array_exit(&an_cookie);
4331                                                 ANON_LOCK_EXIT(&amp->a_rwlock);
4332                                         }
4333                                         ierr = -1;
4334                                         break;
4335                                 }
4336                                 if (szc != 0 && !upgrdfail) {
4337                                         segvn_faultvnmpss_align_err5++;
4338                                 }
4339                                 SEGVN_VMSTAT_FLTVNPAGES(34);
4340                                 if (pplist != NULL) {
4341                                         page_free_replacement_page(pplist);
4342                                         page_create_putback(pages);
4343                                 }
4344                                 SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4345                                     prot, vpprot);
4346                                 if (upgrdfail && segvn_anypgsz_vnode) {
4347                                         /* SOFTLOCK case */
4348                                         hat_memload_array_region(hat, a, pgsz,
4349                                             ppa, prot & vpprot, hat_flag,
4350                                             svd->rcookie);
4351                                 } else {
4352                                         for (i = 0; i < pages; i++) {
4353                                                 hat_memload_region(hat,
4354                                                     a + (i << PAGESHIFT),
4355                                                     ppa[i], prot & vpprot,
4356                                                     hat_flag, svd->rcookie);


4897  *              Call VOP_GETPAGE over the range of non-anonymous pages
4898  *      endif
4899  *      Loop over all addresses requested
4900  *              Call segvn_faultpage passing in page list
4901  *                  to load up translations and handle anonymous pages
4902  *      endloop
4903  *      Load up translation to any additional pages in page list not
4904  *          already handled that fit into this segment
4905  */
4906 static faultcode_t
4907 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
4908     enum fault_type type, enum seg_rw rw)
4909 {
4910         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
4911         page_t **plp, **ppp, *pp;
4912         u_offset_t off;
4913         caddr_t a;
4914         struct vpage *vpage;
4915         uint_t vpprot, prot;
4916         int err;
4917         page_t *pl[FAULT_TMP_PAGES_NUM + 1];
4918         size_t plsz, pl_alloc_sz;
4919         size_t page;
4920         ulong_t anon_index;
4921         struct anon_map *amp;
4922         int dogetpage = 0;
4923         caddr_t lpgaddr, lpgeaddr;
4924         size_t pgsz;
4925         anon_sync_obj_t cookie;
4926         int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
4927 
4928         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
4929         ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE);
4930 
4931         /*
4932          * First handle the easy stuff
4933          */
4934         if (type == F_SOFTUNLOCK) {
4935                 if (rw == S_READ_NOCOW) {
4936                         rw = S_READ;
4937                         ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));


5336                          * Only acquire reader lock to prevent amp->ahp
5337                          * from being changed.  It's ok to miss pages,
5338                          * hence we don't do anon_array_enter
5339                          */
5340                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5341                         ap = anon_get_ptr(amp->ahp, anon_index);
5342 
5343                         if (len <= PAGESIZE)
5344                                 /* inline non_anon() */
5345                                 dogetpage = (ap == NULL);
5346                         else
5347                                 dogetpage = non_anon(amp->ahp, anon_index,
5348                                     &vp_off, &vp_len);
5349                         ANON_LOCK_EXIT(&amp->a_rwlock);
5350                 }
5351 
5352                 if (dogetpage) {
5353                         enum seg_rw arw;
5354                         struct as *as = seg->s_as;
5355 
5356                         if (len > FAULT_TMP_PAGES_SZ) {
5357                                 /*
5358                                  * Page list won't fit in local array,
5359                                  * allocate one of the needed size.
5360                                  */
5361                                 pl_alloc_sz =
5362                                     (btop(len) + 1) * sizeof (page_t *);
5363                                 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP);
5364                                 plp[0] = NULL;
5365                                 plsz = len;
5366                         } else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
5367                             svd->tr_state == SEGVN_TR_ON || rw == S_OTHER ||
5368                             (((size_t)(addr + PAGESIZE) <
5369                             (size_t)(seg->s_base + seg->s_size)) &&
5370                             hat_probe(as->a_hat, addr + PAGESIZE))) {
5371                                 /*
5372                                  * Ask VOP_GETPAGE to return the exact number
5373                                  * of pages if
5374                                  * (a) this is a COW fault, or
5375                                  * (b) this is a software fault, or
5376                                  * (c) next page is already mapped.
5377                                  */
5378                                 plsz = len;
5379                         } else {
5380                                 /*
5381                                  * Ask VOP_GETPAGE to return adjacent pages
5382                                  * within the segment.
5383                                  */
5384                                 plsz = MIN((size_t)FAULT_TMP_PAGES_SZ, (size_t)
5385                                     ((seg->s_base + seg->s_size) - addr));
5386                                 ASSERT((addr + plsz) <=
5387                                     (seg->s_base + seg->s_size));
5388                         }
5389 
5390                         /*
5391                          * Need to get some non-anonymous pages.
5392                          * Make only a single GETPAGE call, since
5393                          * multiple calls can deadlock when page
5394                          * locking is involved.  non_anon() should
5395                          * have picked up the smallest range that
5396                          * covers all the non-anonymous pages in the
5397                          * requested range.  Be careful which rw flag
5398                          * is passed in, because on a private mapping
5399                          * the underlying object is never allowed to
5400                          * be written.
5401                          */
5402                         if (rw == S_WRITE && svd->type == MAP_PRIVATE) {
5403                                 arw = S_READ;
5404                         } else {

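The plsz choice above boils down to: ask VOP_GETPAGE for exactly len bytes when
extra pages would be wasted effort (a COW or software fault, or the next page
is already mapped), and otherwise read ahead up to FAULT_TMP_PAGES_SZ, clamped
so the request never runs past the end of the segment.  A minimal userland
model of that decision (choose_plsz and TMP_SZ are invented names):

#include <stddef.h>
#include <stdio.h>

#define TMP_SZ  (8 * 4096)      /* stand-in for FAULT_TMP_PAGES_SZ */

static size_t
choose_plsz(size_t len, int exact, size_t off_in_seg, size_t seg_size)
{
        if (exact)
                return (len);   /* COW/software fault, or next page mapped */

        /* read ahead, but never past the end of the segment */
        size_t left = seg_size - off_in_seg;
        return (TMP_SZ < left ? TMP_SZ : left);
}

int
main(void)
{
        /* 4kB fault with 12kB left in the segment: read ahead 12kB */
        printf("%zu\n", choose_plsz(4096, 0, 0, 3 * 4096));
        return (0);
}

The truncated tail of this hunk picks the rw flag for the getpage call in the
same spirit: a write fault on a MAP_PRIVATE mapping is issued as S_READ, since
the backing object itself is never written on a private mapping - the write
goes to the anon copy.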

6031                  * unload any current translations that might exist).
6032                  */
6033                 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
6034         } else {
6035                 /*
6036                  * A shared mapping, or a private mapping in which
6037                  * write protection is going to be denied: just
6038                  * change the protections over the range in question.
6039                  * segvn does not support any attributes other than
6040                  * prot, so we can use hat_chgattr.
6041                  */
6042                 hat_chgattr(seg->s_as->a_hat, addr, len, prot);
6043         }
6044 
6045         SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6046 
6047         return (0);
6048 }
6049 
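The unload-versus-chgattr split at the end of segvn_setprot is worth spelling
out: when a private mapping may gain write permission (the branch condition is
elided above), the old translations must be thrown away so the next store
refaults and the copy-on-write logic gets a chance to run; when write stays
denied, or the mapping is shared, tightening the attributes in place is enough.
A toy model of the decision (mmu_update is an invented name):

#include <stdbool.h>
#include <stdio.h>

static const char *
mmu_update(bool shared, bool grants_write)
{
        if (!shared && grants_write) {
                /* next store must refault so COW can run */
                return ("hat_unload");
        }
        /* shared mapping, or write still denied: edit in place */
        return ("hat_chgattr");
}

int
main(void)
{
        printf("%s\n", mmu_update(false, true));        /* hat_unload */
        printf("%s\n", mmu_update(true, false));        /* hat_chgattr */
        return (0);
}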
6050 /*
6051  * segvn_setpagesize is called via segop_setpagesize from as_setpagesize,
6052  * to determine if the seg is capable of mapping the requested szc.
6053  */
6054 static int
6055 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
6056 {
6057         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6058         struct segvn_data *nsvd;
6059         struct anon_map *amp = svd->amp;
6060         struct seg *nseg;
6061         caddr_t eaddr = addr + len, a;
6062         size_t pgsz = page_get_pagesize(szc);
6063         pgcnt_t pgcnt = page_get_pagecnt(szc);
6064         int err;
6065         u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base);
6066 
6067         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
6068         ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
6069 
6070         if (seg->s_szc == szc || segvn_lpg_disable != 0) {
6071                 return (0);

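For the szc arithmetic above: szc is a page-size code, pgsz is that size in
bytes, and pgcnt is how many base pages one such large page spans; a range can
only be mapped at szc if it is pgsz-aligned.  A worked userland example (the
size table is illustrative, x86-style; the real values come from
page_get_pagesize()):

#include <stddef.h>
#include <stdio.h>

static const size_t pagesizes[] = {
        4096,                   /* szc 0: base page */
        2 * 1024 * 1024,        /* szc 1: large page */
};

int
main(void)
{
        unsigned int szc = 1;
        size_t pgsz = pagesizes[szc];
        size_t pgcnt = pgsz / pagesizes[0];

        /* szc 1: 2097152 bytes, 512 base pages */
        printf("szc %u: %zu bytes, %zu base pages\n", szc, pgsz, pgcnt);
        return (0);
}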

7019          * see if they happen to be properly allocated.
7020          */
7021 
7022         /*
7023          * XXX We cheat here and don't lock the anon slots. We can't because
7024          * we may have been called from the anon layer which might already
7025          * have locked them. We are holding a refcnt on the slots so they
7026          * can't disappear. The worst that will happen is we'll get the wrong
7027          * names (vp, off) for the slots and make a poor klustering decision.
7028          */
7029         swap_xlate(ap, &vp1, &off1);
7030         swap_xlate(oap, &vp2, &off2);
7031 
7032 
7033         if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta)
7034                 return (-1);
7035         return (0);
7036 }
7037 
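The test above is the whole klustering criterion: the neighboring anon slot is
only worth reading together with the faulting one if both resolve to the same
backing vnode and sit exactly delta bytes apart there, i.e. if they would be
contiguous in a single I/O.  A self-contained model (types simplified, names
invented):

#include <stdio.h>

struct slot {
        int     vnode_id;       /* stand-in for the vnode from swap_xlate() */
        long    off;            /* offset within that vnode */
};

static int
can_kluster(const struct slot *ap, const struct slot *oap, long delta)
{
        if (ap->vnode_id != oap->vnode_id || ap->off - oap->off != delta)
                return (-1);    /* different object, or not contiguous */
        return (0);
}

int
main(void)
{
        struct slot a = { 1, 8192 }, o = { 1, 4096 };

        printf("%d\n", can_kluster(&a, &o, 4096));      /* 0: klusterable */
        return (0);
}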
7038 /*
7039  * Synchronize the in-memory (cached) pages of a segment with the backing object.
7040  *
7041  * XXX - Anonymous pages should not be sync'ed out at all.
7042  */
7043 static int
7044 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
7045 {
7046         struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7047         struct vpage *vpp;
7048         page_t *pp;
7049         u_offset_t offset;
7050         struct vnode *vp;
7051         u_offset_t off;
7052         caddr_t eaddr;
7053         int bflags;
7054         int err = 0;
7055         int segtype;
7056         int pageprot;
7057         int prot;
7058         ulong_t anon_index;

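segvn_sync is the segment-level backend for msync(3C): the request comes in
through the address space layer and ends up here, where dirty pages in the
range are pushed back to the backing vnode (and, with MS_INVALIDATE, the cached
pages are invalidated).  From userland the path is exercised like this (a
minimal example; the file name is arbitrary):

#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        int fd = open("/tmp/msync-demo", O_RDWR | O_CREAT, 0644);

        if (fd < 0 || ftruncate(fd, 4096) != 0)
                return (1);

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
            MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return (1);

        memcpy(p, "hello", 5);

        /* push the dirty page to the backing vnode synchronously */
        if (msync(p, 4096, MS_SYNC) != 0)
                return (1);

        (void) munmap(p, 4096);
        (void) close(fd);
        return (0);
}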

9453         /*
9454          * Get policy info for private or shared memory
9455          */
9456         if (svn_data->type != MAP_SHARED) {
9457                 if (svn_data->tr_state != SEGVN_TR_ON) {
9458                         policy_info = &svn_data->policy_info;
9459                 } else {
9460                         policy_info = &svn_data->tr_policy_info;
9461                         ASSERT(policy_info->mem_policy ==
9462                             LGRP_MEM_POLICY_NEXT_SEG);
9463                 }
9464         } else {
9465                 amp = svn_data->amp;
9466                 anon_index = svn_data->anon_index + seg_page(seg, addr);
9467                 vp = svn_data->vp;
9468                 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
9469                 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
9470         }
9471 
9472         return (policy_info);
9473 }
9474 
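The policy lookup above has a simple shape: a private segment carries its own
policy (or the text replication policy when tr_state is on), while for shared
memory the policy is tracked per object and must be looked up via
lgrp_shm_policy_get() with the amp/vnode identity of the page.  Reduced to a
toy (pick_policy and the struct are invented):

#include <stdio.h>

struct policy { const char *name; };

static const struct policy *
pick_policy(int shared, int tr_on, const struct policy *seg_pol,
    const struct policy *tr_pol, const struct policy *shm_pol)
{
        if (!shared)
                return (tr_on ? tr_pol : seg_pol);      /* private */
        return (shm_pol);       /* shared: policy lives with the object */
}

int
main(void)
{
        struct policy seg = { "seg" }, tr = { "tr" }, shm = { "shm" };

        printf("%s\n", pick_policy(0, 1, &seg, &tr, &shm)->name); /* tr */
        printf("%s\n", pick_policy(1, 0, &seg, &tr, &shm)->name); /* shm */
        return (0);
}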
9475 /*
9476  * Bind a text vnode segment to an amp.  If we bind successfully, mappings
9477  * are established to per-vnode, per-lgroup amp pages instead of to vnode
9478  * pages.  There is one amp per vnode text mapping per lgroup, and many
9479  * processes may share the same text replication amp.  If a suitable amp
9480  * doesn't already exist in the svntr hash table, create a new one.  We may
9481  * fail to bind to the amp if the segment is not eligible for text
9482  * replication; the code below first checks for these conditions.  If
9483  * binding succeeds, the segment's tr_state is set to on and svd->amp points
9484  * to the amp to use.  Otherwise tr_state is set to off and svd->amp is NULL.
9485  */
9486 static void
9487 segvn_textrepl(struct seg *seg)
9488 {
9489         struct segvn_data       *svd = (struct segvn_data *)seg->s_data;
9490         vnode_t                 *vp = svd->vp;
9491         u_offset_t              off = svd->offset;
9492         size_t                  size = seg->s_size;
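
The "svntr hash table" mentioned in the comment above amounts to a cache keyed
by the text mapping's identity - vnode, offset, size - whose entries each hold
one replica amp per lgroup, so every process mapping the same text on the same
lgroup shares one amp.  A conceptual sketch of that lookup (all names invented;
the real structures live in the svntr code):

#include <stddef.h>

#define NLGRPS  4

struct svntr_ent {
        const void      *vnode;         /* identity of the mapped text */
        size_t          off, size;
        void            *amp[NLGRPS];   /* one replica amp per lgroup */
        struct svntr_ent *next;         /* hash chain */
};

static struct svntr_ent *
svntr_lookup(struct svntr_ent *head, const void *vp, size_t off, size_t size)
{
        for (; head != NULL; head = head->next) {
                if (head->vnode == vp && head->off == off &&
                    head->size == size)
                        return (head);  /* share the existing replicas */
        }
        return (NULL);                  /* caller creates a new entry */
}

int
main(void)
{
        int vn;                         /* stand-in for a vnode_t */
        struct svntr_ent e = { &vn, 0, 4096, { NULL }, NULL };

        return (svntr_lookup(&e, &vn, 0, 4096) == &e ? 0 : 1);
}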