1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include <sys/param.h>
  26 #include <sys/user.h>
  27 #include <sys/mman.h>
  28 #include <sys/kmem.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/cmn_err.h>
  31 #include <sys/systm.h>
  32 #include <sys/tuneable.h>
  33 #include <vm/hat.h>
  34 #include <vm/seg.h>
  35 #include <vm/as.h>
  36 #include <vm/anon.h>
  37 #include <vm/page.h>
  38 #include <sys/buf.h>
  39 #include <sys/swap.h>
  40 #include <sys/atomic.h>
  41 #include <vm/seg_spt.h>
  42 #include <sys/debug.h>
  43 #include <sys/vtrace.h>
  44 #include <sys/shm.h>
  45 #include <sys/shm_impl.h>
  46 #include <sys/lgrp.h>
  47 #include <sys/vmsystm.h>
  48 #include <sys/policy.h>
  49 #include <sys/project.h>
  50 #include <sys/tnf_probe.h>
  51 #include <sys/zone.h>
  52 
  53 #define SEGSPTADDR      (caddr_t)0x0
  54 
   55 /*
   56  * Number of pages used for spt (ISM) segments.
   57  */
  58 size_t  spt_used;
  59 
   60 /*
   61  * segspt_minfree is the memory left for the system after ISM
   62  * locks its pages; it is set to 5% of availrmem in sptcreate()
   63  * when ISM is created.  ISM should not use more than ~90% of
   64  * availrmem; if it does, system performance may decrease.
   65  * Machines with large memories may be able to use more memory
   66  * for ISM, so we set the default segspt_minfree to 5% (which
   67  * gives ISM at most 95% of availrmem).  If somebody wants even
   68  * more memory for ISM (risking hanging the system), they can
   69  * patch segspt_minfree to a smaller value.
   70  */
  71 pgcnt_t segspt_minfree = 0;
  72 
  73 static int segspt_create(struct seg *seg, caddr_t argsp);
  74 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
  75 static void segspt_free(struct seg *seg);
  76 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
  77 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
  78 
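      /*
       * Ops vector for the underlying (dummy) spt segment itself.  Only
       * unmap, free and getpolicy are needed here; all other operations
       * are performed through the per-process shm segments (segspt_shmops
       * below) that share mappings to this segment.
       */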
  79 static const struct seg_ops segspt_ops = {
  80         .unmap          = segspt_unmap,
  81         .free           = segspt_free,
  82         .getpolicy      = segspt_getpolicy,
  83 };
  84 
  85 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
  86 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
  87 static void segspt_shmfree(struct seg *seg);
  88 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
  89                 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
  90 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
  91 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
  92                         register size_t len, register uint_t prot);
  93 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
  94                         uint_t prot);
  95 static int      segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
  96 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
  97                         register char *vec);
  98 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
  99                         int attr, uint_t flags);
 100 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
 101                         int attr, int op, ulong_t *lockmap, size_t pos);
 102 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
 103                         uint_t *protv);
 104 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
 105 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
 106 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
 107 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
 108                         uint_t behav);
 109 static void segspt_shmdump(struct seg *seg);
 110 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
 111                         struct page ***, enum lock_type, enum seg_rw);
 112 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
 113 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
 114 
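      /*
       * Ops vector for the per-process shared memory segments that attach
       * to (and forward most of their operations to) the shared spt
       * segment above.
       */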
 115 const struct seg_ops segspt_shmops = {
 116         .dup            = segspt_shmdup,
 117         .unmap          = segspt_shmunmap,
 118         .free           = segspt_shmfree,
 119         .fault          = segspt_shmfault,
 120         .faulta         = segspt_shmfaulta,
 121         .setprot        = segspt_shmsetprot,
 122         .checkprot      = segspt_shmcheckprot,
 123         .kluster        = segspt_shmkluster,
 124         .sync           = segspt_shmsync,
 125         .incore         = segspt_shmincore,
 126         .lockop         = segspt_shmlockop,
 127         .getprot        = segspt_shmgetprot,
 128         .getoffset      = segspt_shmgetoffset,
 129         .gettype        = segspt_shmgettype,
 130         .getvp          = segspt_shmgetvp,
 131         .advise         = segspt_shmadvise,
 132         .dump           = segspt_shmdump,
 133         .pagelock       = segspt_shmpagelock,
 134         .getmemid       = segspt_shmgetmemid,
 135         .getpolicy      = segspt_shmgetpolicy,
 136 };
 137 
 138 static void segspt_purge(struct seg *seg);
 139 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
 140                 enum seg_rw, int);
 141 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
 142                 page_t **ppa);
 143 
 144 
 145 
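      /*
       * Create the dummy address space and spt segment that will hold the
       * shared pages for the given anon_map; on success the new segment is
       * returned through *sptseg.  A hypothetical caller sketch (names as
       * used in this file):
       *
       *      struct seg *sptseg;
       *      if (sptcreate(size, &sptseg, amp, prot, flags, szc) == 0) {
       *              ... attach processes to sptseg ...
       *              sptdestroy(sptseg->s_as, amp);
       *      }
       */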
 146 /*ARGSUSED*/
 147 int
 148 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
 149         uint_t prot, uint_t flags, uint_t share_szc)
 150 {
 151         int     err;
 152         struct  as      *newas;
 153         struct  segspt_crargs sptcargs;
 154 
 155 #ifdef DEBUG
 156         TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
 157                         tnf_ulong, size, size );
 158 #endif
  159         if (segspt_minfree == 0)        /* leave min 5% of availrmem */
  160                 segspt_minfree = availrmem/20;  /* for the system */
 161 
 162         if (!hat_supported(HAT_SHARED_PT, (void *)0))
 163                 return (EINVAL);
 164 
 165         /*
 166          * get a new as for this shared memory segment
 167          */
 168         newas = as_alloc();
 169         newas->a_proc = NULL;
 170         sptcargs.amp = amp;
 171         sptcargs.prot = prot;
 172         sptcargs.flags = flags;
 173         sptcargs.szc = share_szc;
 174         /*
 175          * create a shared page table (spt) segment
 176          */
 177 
 178         if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
 179                 as_free(newas);
 180                 return (err);
 181         }
 182         *sptseg = sptcargs.seg_spt;
 183         return (0);
 184 }
 185 
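      /*
       * Tear down the dummy spt address space created by sptcreate():
       * unmap the entire segment and free the address space.
       */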
 186 void
 187 sptdestroy(struct as *as, struct anon_map *amp)
 188 {
 189 
 190 #ifdef DEBUG
 191         TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
 192 #endif
 193         (void) as_unmap(as, SEGSPTADDR, amp->size);
 194         as_free(as);
 195 }
 196 
  197 /*
  198  * Called from seg_free().
  199  * Free (i.e., unlock, unmap, return to the free list)
  200  * all the pages in the given seg.
 201  */
 202 void
 203 segspt_free(struct seg  *seg)
 204 {
 205         struct spt_data *sptd = (struct spt_data *)seg->s_data;
 206 
 207         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
 208 
 209         if (sptd != NULL) {
 210                 if (sptd->spt_realsize)
 211                         segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
 212 
  213                 if (sptd->spt_ppa_lckcnt)
  214                         kmem_free(sptd->spt_ppa_lckcnt,
  215                             sizeof (*sptd->spt_ppa_lckcnt)
  216                             * btopr(sptd->spt_amp->size));
 217                 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
 218                 cv_destroy(&sptd->spt_cv);
 219                 mutex_destroy(&sptd->spt_lock);
 220                 kmem_free(sptd, sizeof (*sptd));
 221         }
 222 }
 223 
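      /*
       * The sync operation is a no-op for spt-backed (anonymous) segments.
       */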
 224 /*ARGSUSED*/
 225 static int
 226 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
 227         uint_t flags)
 228 {
 229         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 230 
 231         return (0);
 232 }
 233 
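      /*
       * Report in-core residency for the given range.  ISM pages are by
       * definition resident and locked; for DISM each anon slot is
       * inspected and the DISM_PG_LOCKED flag in shm_vpage[] supplies the
       * lock status.
       */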
 234 /*ARGSUSED*/
 235 static size_t
 236 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
 237 {
 238         caddr_t eo_seg;
 239         pgcnt_t npages;
 240         struct shm_data *shmd = (struct shm_data *)seg->s_data;
 241         struct seg      *sptseg;
 242         struct spt_data *sptd;
 243 
 244         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 245 #ifdef lint
 246         seg = seg;
 247 #endif
 248         sptseg = shmd->shm_sptseg;
 249         sptd = sptseg->s_data;
 250 
 251         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 252                 eo_seg = addr + len;
 253                 while (addr < eo_seg) {
 254                         /* page exists, and it's locked. */
 255                         *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
 256                             SEG_PAGE_ANON;
 257                         addr += PAGESIZE;
 258                 }
 259                 return (len);
 260         } else {
 261                 struct  anon_map *amp = shmd->shm_amp;
 262                 struct  anon    *ap;
 263                 page_t          *pp;
 264                 pgcnt_t         anon_index;
 265                 struct vnode    *vp;
 266                 u_offset_t      off;
 267                 ulong_t         i;
 268                 int             ret;
 269                 anon_sync_obj_t cookie;
 270 
 271                 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
 272                 anon_index = seg_page(seg, addr);
 273                 npages = btopr(len);
 274                 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
 275                         return (EINVAL);
 276                 }
 277                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 278                 for (i = 0; i < npages; i++, anon_index++) {
 279                         ret = 0;
 280                         anon_array_enter(amp, anon_index, &cookie);
 281                         ap = anon_get_ptr(amp->ahp, anon_index);
 282                         if (ap != NULL) {
 283                                 swap_xlate(ap, &vp, &off);
 284                                 anon_array_exit(&cookie);
 285                                 pp = page_lookup_nowait(vp, off, SE_SHARED);
 286                                 if (pp != NULL) {
 287                                         ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
 288                                         page_unlock(pp);
 289                                 }
 290                         } else {
 291                                 anon_array_exit(&cookie);
 292                         }
 293                         if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
 294                                 ret |= SEG_PAGE_LOCKED;
 295                         }
 296                         *vec++ = (char)ret;
 297                 }
 298                 ANON_LOCK_EXIT(&amp->a_rwlock);
 299                 return (len);
 300         }
 301 }
 302 
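      /*
       * Unmap the dummy spt segment; only an unmap of the entire segment
       * is supported.
       */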
 303 static int
 304 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
 305 {
 306         size_t share_size;
 307 
 308         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
 309 
 310         /*
 311          * seg.s_size may have been rounded up to the largest page size
 312          * in shmat().
  313          * XXX This should be cleaned up. sptdestroy should take a length
  314          * argument which should be the same as sptcreate. Then
  315          * this rounding would not be needed (or would be done in shm.c).
  316          * Only the check for the full segment would be needed.
 317          *
 318          * XXX -- shouldn't raddr == 0 always? These tests don't seem
 319          * to be useful at all.
 320          */
 321         share_size = page_get_pagesize(seg->s_szc);
 322         ssize = P2ROUNDUP(ssize, share_size);
 323 
 324         if (raddr == seg->s_base && ssize == seg->s_size) {
 325                 seg_free(seg);
 326                 return (0);
 327         } else
 328                 return (EINVAL);
 329 }
 330 
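      /*
       * Create the spt segment proper.  For ISM all pages are allocated,
       * locked and loaded into the HAT up front; for DISM (SHM_PAGEABLE)
       * only the bookkeeping is set up here and pages are allocated and
       * locked on demand.
       */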
 331 int
 332 segspt_create(struct seg *seg, caddr_t argsp)
 333 {
 334         int             err;
 335         caddr_t         addr = seg->s_base;
 336         struct spt_data *sptd;
 337         struct  segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
 338         struct anon_map *amp = sptcargs->amp;
 339         struct kshmid   *sp = amp->a_sp;
 340         struct  cred    *cred = CRED();
 341         ulong_t         i, j, anon_index = 0;
 342         pgcnt_t         npages = btopr(amp->size);
 343         struct vnode    *vp;
 344         page_t          **ppa;
 345         uint_t          hat_flags;
 346         size_t          pgsz;
 347         pgcnt_t         pgcnt;
 348         caddr_t         a;
 349         pgcnt_t         pidx;
 350         size_t          sz;
 351         proc_t          *procp = curproc;
 352         rctl_qty_t      lockedbytes = 0;
 353         kproject_t      *proj;
 354 
 355         /*
 356          * We are holding the a_lock on the underlying dummy as,
 357          * so we can make calls to the HAT layer.
 358          */
 359         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
 360         ASSERT(sp != NULL);
 361 
 362 #ifdef DEBUG
 363         TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
 364             tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
 365 #endif
 366         if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
 367                 if (err = anon_swap_adjust(npages))
 368                         return (err);
 369         }
 370         err = ENOMEM;
 371 
 372         if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
 373                 goto out1;
 374 
 375         if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
 376                 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
 377                     KM_NOSLEEP)) == NULL)
 378                         goto out2;
 379         }
 380 
 381         mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
 382 
 383         if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
 384                 goto out3;
 385 
 386         seg->s_ops = &segspt_ops;
 387         sptd->spt_vp = vp;
 388         sptd->spt_amp = amp;
 389         sptd->spt_prot = sptcargs->prot;
 390         sptd->spt_flags = sptcargs->flags;
 391         seg->s_data = (caddr_t)sptd;
 392         sptd->spt_ppa = NULL;
 393         sptd->spt_ppa_lckcnt = NULL;
 394         seg->s_szc = sptcargs->szc;
 395         cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
 396         sptd->spt_gen = 0;
 397 
 398         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 399         if (seg->s_szc > amp->a_szc) {
 400                 amp->a_szc = seg->s_szc;
 401         }
 402         ANON_LOCK_EXIT(&amp->a_rwlock);
 403 
 404         /*
 405          * Set policy to affect initial allocation of pages in
 406          * anon_map_createpages()
 407          */
 408         (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
 409             NULL, 0, ptob(npages));
 410 
 411         if (sptcargs->flags & SHM_PAGEABLE) {
 412                 size_t  share_sz;
 413                 pgcnt_t new_npgs, more_pgs;
 414                 struct anon_hdr *nahp;
 415                 zone_t *zone;
 416 
 417                 share_sz = page_get_pagesize(seg->s_szc);
 418                 if (!IS_P2ALIGNED(amp->size, share_sz)) {
 419                         /*
  420                          * We are rounding up the size of the anon
  421                          * array to a large-page (4 M) boundary
  422                          * because we always create a full large page
  423                          * when locking and faulting pages, so we don't
  424                          * have to check corner cases such as whether
  425                          * there is enough space for a full large page.
 426                          */
 427                         new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
 428                         more_pgs = new_npgs - npages;
 429 
 430                         /*
 431                          * The zone will never be NULL, as a fully created
 432                          * shm always has an owning zone.
 433                          */
 434                         zone = sp->shm_perm.ipc_zone_ref.zref_zone;
 435                         ASSERT(zone != NULL);
 436                         if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
 437                                 err = ENOMEM;
 438                                 goto out4;
 439                         }
 440 
 441                         nahp = anon_create(new_npgs, ANON_SLEEP);
 442                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 443                         (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
 444                             ANON_SLEEP);
 445                         anon_release(amp->ahp, npages);
 446                         amp->ahp = nahp;
 447                         ASSERT(amp->swresv == ptob(npages));
 448                         amp->swresv = amp->size = ptob(new_npgs);
 449                         ANON_LOCK_EXIT(&amp->a_rwlock);
 450                         npages = new_npgs;
 451                 }
 452 
 453                 sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
 454                     sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
 455                 sptd->spt_pcachecnt = 0;
 456                 sptd->spt_realsize = ptob(npages);
 457                 sptcargs->seg_spt = seg;
 458                 return (0);
 459         }
 460 
 461         /*
 462          * get array of pages for each anon slot in amp
 463          */
 464         if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
 465             seg, addr, S_CREATE, cred)) != 0)
 466                 goto out4;
 467 
 468         mutex_enter(&sp->shm_mlock);
 469 
  470         /* May be partially locked, so count bytes to charge for locking */
 471         for (i = 0; i < npages; i++)
 472                 if (ppa[i]->p_lckcnt == 0)
 473                         lockedbytes += PAGESIZE;
 474 
 475         proj = sp->shm_perm.ipc_proj;
 476 
 477         if (lockedbytes > 0) {
 478                 mutex_enter(&procp->p_lock);
 479                 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
 480                         mutex_exit(&procp->p_lock);
 481                         mutex_exit(&sp->shm_mlock);
 482                         for (i = 0; i < npages; i++)
 483                                 page_unlock(ppa[i]);
 484                         err = ENOMEM;
 485                         goto out4;
 486                 }
 487                 mutex_exit(&procp->p_lock);
 488         }
 489 
 490         /*
  491          * addr is the initial address of the first page in the ppa list
 492          */
 493         for (i = 0; i < npages; i++) {
 494                 /* attempt to lock all pages */
 495                 if (page_pp_lock(ppa[i], 0, 1) == 0) {
 496                         /*
 497                          * if unable to lock any page, unlock all
 498                          * of them and return error
 499                          */
 500                         for (j = 0; j < i; j++)
 501                                 page_pp_unlock(ppa[j], 0, 1);
 502                         for (i = 0; i < npages; i++)
 503                                 page_unlock(ppa[i]);
 504                         rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
 505                         mutex_exit(&sp->shm_mlock);
 506                         err = ENOMEM;
 507                         goto out4;
 508                 }
 509         }
 510         mutex_exit(&sp->shm_mlock);
 511 
 512         /*
  513          * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
  514          * for the entire life of the segment, for example platforms
  515          * that do not support Dynamic Reconfiguration.
 516          */
 517         hat_flags = HAT_LOAD_SHARE;
 518         if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
 519                 hat_flags |= HAT_LOAD_LOCK;
 520 
 521         /*
  522          * Load translations one large page at a time
  523          * to make sure we don't create mappings bigger than
  524          * this segment's size code, in case the underlying pages
  525          * are shared with a segvn segment that uses a bigger
  526          * size code than we do.
 527          */
 528         pgsz = page_get_pagesize(seg->s_szc);
 529         pgcnt = page_get_pagecnt(seg->s_szc);
 530         for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
 531                 sz = MIN(pgsz, ptob(npages - pidx));
 532                 hat_memload_array(seg->s_as->a_hat, a, sz,
 533                     &ppa[pidx], sptd->spt_prot, hat_flags);
 534         }
 535 
 536         /*
 537          * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
 538          * we will leave the pages locked SE_SHARED for the life
 539          * of the ISM segment. This will prevent any calls to
 540          * hat_pageunload() on this ISM segment for those platforms.
 541          */
 542         if (!(hat_flags & HAT_LOAD_LOCK)) {
 543                 /*
 544                  * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
 545                  * we no longer need to hold the SE_SHARED lock on the pages,
 546                  * since L_PAGELOCK and F_SOFTLOCK calls will grab the
 547                  * SE_SHARED lock on the pages as necessary.
 548                  */
 549                 for (i = 0; i < npages; i++)
 550                         page_unlock(ppa[i]);
 551         }
 552         sptd->spt_pcachecnt = 0;
 553         kmem_free(ppa, ((sizeof (page_t *)) * npages));
 554         sptd->spt_realsize = ptob(npages);
 555         atomic_add_long(&spt_used, npages);
 556         sptcargs->seg_spt = seg;
 557         return (0);
 558 
 559 out4:
 560         seg->s_data = NULL;
 561         kmem_free(vp, sizeof (*vp));
 562         cv_destroy(&sptd->spt_cv);
 563 out3:
 564         mutex_destroy(&sptd->spt_lock);
 565         if ((sptcargs->flags & SHM_PAGEABLE) == 0)
 566                 kmem_free(ppa, (sizeof (*ppa) * npages));
 567 out2:
 568         kmem_free(sptd, sizeof (*sptd));
 569 out1:
 570         if ((sptcargs->flags & SHM_PAGEABLE) == 0)
 571                 anon_swap_restore(npages);
 572         return (err);
 573 }
 574 
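      /*
       * Release all pages backing the spt segment: unload the HAT
       * translations, drop the page locks taken in segspt_create(), adjust
       * the locked-memory rctl accounting and destroy the pages.
       */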
 575 /*ARGSUSED*/
 576 void
 577 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
 578 {
 579         struct page     *pp;
 580         struct spt_data *sptd = (struct spt_data *)seg->s_data;
 581         pgcnt_t         npages;
 582         ulong_t         anon_idx;
 583         struct anon_map *amp;
 584         struct anon     *ap;
 585         struct vnode    *vp;
 586         u_offset_t      off;
 587         uint_t          hat_flags;
 588         int             root = 0;
 589         pgcnt_t         pgs, curnpgs = 0;
 590         page_t          *rootpp;
 591         rctl_qty_t      unlocked_bytes = 0;
 592         kproject_t      *proj;
 593         kshmid_t        *sp;
 594 
 595         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
 596 
 597         len = P2ROUNDUP(len, PAGESIZE);
 598 
 599         npages = btop(len);
 600 
 601         hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
 602         if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
 603             (sptd->spt_flags & SHM_PAGEABLE)) {
 604                 hat_flags = HAT_UNLOAD_UNMAP;
 605         }
 606 
 607         hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
 608 
 609         amp = sptd->spt_amp;
 610         if (sptd->spt_flags & SHM_PAGEABLE)
 611                 npages = btop(amp->size);
 612 
 613         ASSERT(amp != NULL);
 614 
 615         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 616                 sp = amp->a_sp;
 617                 proj = sp->shm_perm.ipc_proj;
 618                 mutex_enter(&sp->shm_mlock);
 619         }
 620         for (anon_idx = 0; anon_idx < npages; anon_idx++) {
 621                 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 622                         if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
 623                                 panic("segspt_free_pages: null app");
 624                                 /*NOTREACHED*/
 625                         }
 626                 } else {
 627                         if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
 628                             == NULL)
 629                                 continue;
 630                 }
 631                 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
 632                 swap_xlate(ap, &vp, &off);
 633 
 634                 /*
 635                  * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
  636                  * the pages will not be holding an SE_SHARED lock at
  637                  * this point.
  638                  *
  639                  * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
  640                  * the pages are still held SE_SHARED locked from the
  641                  * original segspt_create().
 642                  *
 643                  * Our goal is to get SE_EXCL lock on each page, remove
 644                  * permanent lock on it and invalidate the page.
 645                  */
 646                 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 647                         if (hat_flags == HAT_UNLOAD_UNMAP)
 648                                 pp = page_lookup(vp, off, SE_EXCL);
 649                         else {
 650                                 if ((pp = page_find(vp, off)) == NULL) {
 651                                         panic("segspt_free_pages: "
 652                                             "page not locked");
 653                                         /*NOTREACHED*/
 654                                 }
 655                                 if (!page_tryupgrade(pp)) {
 656                                         page_unlock(pp);
 657                                         pp = page_lookup(vp, off, SE_EXCL);
 658                                 }
 659                         }
 660                         if (pp == NULL) {
 661                                 panic("segspt_free_pages: "
 662                                     "page not in the system");
 663                                 /*NOTREACHED*/
 664                         }
 665                         ASSERT(pp->p_lckcnt > 0);
 666                         page_pp_unlock(pp, 0, 1);
 667                         if (pp->p_lckcnt == 0)
 668                                 unlocked_bytes += PAGESIZE;
 669                 } else {
 670                         if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
 671                                 continue;
 672                 }
 673                 /*
 674                  * It's logical to invalidate the pages here as in most cases
 675                  * these were created by segspt.
 676                  */
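                      /*
                       * Large pages must be destroyed as a unit: remember
                       * the root (first constituent) page and count down
                       * curnpgs as constituents are seen; when the last one
                       * is reached, free the whole large page with
                       * page_destroy_pages().
                       */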
 677                 if (pp->p_szc != 0) {
 678                         if (root == 0) {
 679                                 ASSERT(curnpgs == 0);
 680                                 root = 1;
 681                                 rootpp = pp;
 682                                 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
 683                                 ASSERT(pgs > 1);
 684                                 ASSERT(IS_P2ALIGNED(pgs, pgs));
 685                                 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
 686                                 curnpgs--;
 687                         } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
 688                                 ASSERT(curnpgs == 1);
 689                                 ASSERT(page_pptonum(pp) ==
 690                                     page_pptonum(rootpp) + (pgs - 1));
 691                                 page_destroy_pages(rootpp);
 692                                 root = 0;
 693                                 curnpgs = 0;
 694                         } else {
 695                                 ASSERT(curnpgs > 1);
 696                                 ASSERT(page_pptonum(pp) ==
 697                                     page_pptonum(rootpp) + (pgs - curnpgs));
 698                                 curnpgs--;
 699                         }
 700                 } else {
 701                         if (root != 0 || curnpgs != 0) {
 702                                 panic("segspt_free_pages: bad large page");
 703                                 /*NOTREACHED*/
 704                         }
 705                         /*
 706                          * Before destroying the pages, we need to take care
 707                          * of the rctl locked memory accounting. For that
  708                          * we need to calculate the unlocked_bytes.
 709                          */
 710                         if (pp->p_lckcnt > 0)
 711                                 unlocked_bytes += PAGESIZE;
 712                         /*LINTED: constant in conditional context */
 713                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 714                 }
 715         }
 716         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 717                 if (unlocked_bytes > 0)
 718                         rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
 719                 mutex_exit(&sp->shm_mlock);
 720         }
 721         if (root != 0 || curnpgs != 0) {
 722                 panic("segspt_free_pages: bad large page");
 723                 /*NOTREACHED*/
 724         }
 725 
 726         /*
 727          * mark that pages have been released
 728          */
 729         sptd->spt_realsize = 0;
 730 
 731         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 732                 atomic_add_long(&spt_used, -npages);
 733                 anon_swap_restore(npages);
 734         }
 735 }
 736 
 737 /*
 738  * Get memory allocation policy info for specified address in given segment
 739  */
 740 static lgrp_mem_policy_info_t *
 741 segspt_getpolicy(struct seg *seg, caddr_t addr)
 742 {
 743         struct anon_map         *amp;
 744         ulong_t                 anon_index;
 745         lgrp_mem_policy_info_t  *policy_info;
 746         struct spt_data         *spt_data;
 747 
 748         ASSERT(seg != NULL);
 749 
 750         /*
 751          * Get anon_map from segspt
 752          *
  753          * Assume that no lock needs to be held on the anon_map, since
  754          * it should be protected by its reference count, which must be
  755          * nonzero for an existing segment.
  756          * We do need to grab the readers lock on the policy tree, though.
 757          */
 758         spt_data = (struct spt_data *)seg->s_data;
 759         if (spt_data == NULL)
 760                 return (NULL);
 761         amp = spt_data->spt_amp;
 762         ASSERT(amp->refcnt != 0);
 763 
 764         /*
 765          * Get policy info
 766          *
 767          * Assume starting anon index of 0
 768          */
 769         anon_index = seg_page(seg, addr);
 770         policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
 771 
 772         return (policy_info);
 773 }
 774 
 775 /*
 776  * DISM only.
 777  * Return locked pages over a given range.
 778  *
 779  * We will cache all DISM locked pages and save the pplist for the
 780  * entire segment in the ppa field of the underlying DISM segment structure.
 781  * Later, during a call to segspt_reclaim() we will use this ppa array
 782  * to page_unlock() all of the pages and then we will free this ppa list.
 783  */
 784 /*ARGSUSED*/
 785 static int
 786 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
 787     struct page ***ppp, enum lock_type type, enum seg_rw rw)
 788 {
 789         struct  shm_data *shmd = (struct shm_data *)seg->s_data;
 790         struct  seg     *sptseg = shmd->shm_sptseg;
 791         struct  spt_data *sptd = sptseg->s_data;
 792         pgcnt_t pg_idx, npages, tot_npages, npgs;
 793         struct  page **pplist, **pl, **ppa, *pp;
 794         struct  anon_map *amp;
 795         spgcnt_t        an_idx;
 796         int     ret = ENOTSUP;
 797         uint_t  pl_built = 0;
 798         struct  anon *ap;
 799         struct  vnode *vp;
 800         u_offset_t off;
 801         pgcnt_t claim_availrmem = 0;
 802         uint_t  szc;
 803 
 804         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 805         ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
 806 
 807         /*
 808          * We want to lock/unlock the entire ISM segment. Therefore,
  809          * we will be using the underlying sptseg and its base address
 810          * and length for the caching arguments.
 811          */
 812         ASSERT(sptseg);
 813         ASSERT(sptd);
 814 
 815         pg_idx = seg_page(seg, addr);
 816         npages = btopr(len);
 817 
 818         /*
  819          * Check if the request is larger than the number of pages
  820          * covered by the amp.
 821          */
 822         if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
 823                 *ppp = NULL;
 824                 return (ENOTSUP);
 825         }
 826 
 827         if (type == L_PAGEUNLOCK) {
 828                 ASSERT(sptd->spt_ppa != NULL);
 829 
 830                 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
 831                     sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
 832 
 833                 /*
 834                  * If someone is blocked while unmapping, we purge
 835                  * segment page cache and thus reclaim pplist synchronously
 836                  * without waiting for seg_pasync_thread. This speeds up
 837                  * unmapping in cases where munmap(2) is called, while
 838                  * raw async i/o is still in progress or where a thread
 839                  * exits on data fault in a multithreaded application.
 840                  */
 841                 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
 842                     (AS_ISUNMAPWAIT(seg->s_as) &&
 843                     shmd->shm_softlockcnt > 0)) {
 844                         segspt_purge(seg);
 845                 }
 846                 return (0);
 847         }
 848 
 849         /* The L_PAGELOCK case ... */
 850 
 851         if (sptd->spt_flags & DISM_PPA_CHANGED) {
 852                 segspt_purge(seg);
 853                 /*
  854                  * For DISM, the ppa[] needs to be rebuilt since the
  855                  * number of locked pages could have changed.
 856                  */
 857                 *ppp = NULL;
 858                 return (ENOTSUP);
 859         }
 860 
 861         /*
 862          * First try to find pages in segment page cache, without
 863          * holding the segment lock.
 864          */
 865         pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
 866             S_WRITE, SEGP_FORCE_WIRED);
 867         if (pplist != NULL) {
 868                 ASSERT(sptd->spt_ppa != NULL);
 869                 ASSERT(sptd->spt_ppa == pplist);
 870                 ppa = sptd->spt_ppa;
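                      /*
                       * Verify that every page in the requested range is
                       * present in the cached ppa[]; a NULL slot means that
                       * page is not locked, so fail the request and let the
                       * caller fall back to the slower F_SOFTLOCK path.
                       */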
 871                 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
 872                         if (ppa[an_idx] == NULL) {
 873                                 seg_pinactive(seg, NULL, seg->s_base,
 874                                     sptd->spt_amp->size, ppa,
 875                                     S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
 876                                 *ppp = NULL;
 877                                 return (ENOTSUP);
 878                         }
 879                         if ((szc = ppa[an_idx]->p_szc) != 0) {
 880                                 npgs = page_get_pagecnt(szc);
 881                                 an_idx = P2ROUNDUP(an_idx + 1, npgs);
 882                         } else {
 883                                 an_idx++;
 884                         }
 885                 }
 886                 /*
 887                  * Since we cache the entire DISM segment, we want to
 888                  * set ppp to point to the first slot that corresponds
 889                  * to the requested addr, i.e. pg_idx.
 890                  */
 891                 *ppp = &(sptd->spt_ppa[pg_idx]);
 892                 return (0);
 893         }
 894 
 895         mutex_enter(&sptd->spt_lock);
 896         /*
 897          * try to find pages in segment page cache with mutex
 898          */
 899         pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
 900             S_WRITE, SEGP_FORCE_WIRED);
 901         if (pplist != NULL) {
 902                 ASSERT(sptd->spt_ppa != NULL);
 903                 ASSERT(sptd->spt_ppa == pplist);
 904                 ppa = sptd->spt_ppa;
 905                 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
 906                         if (ppa[an_idx] == NULL) {
 907                                 mutex_exit(&sptd->spt_lock);
 908                                 seg_pinactive(seg, NULL, seg->s_base,
 909                                     sptd->spt_amp->size, ppa,
 910                                     S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
 911                                 *ppp = NULL;
 912                                 return (ENOTSUP);
 913                         }
 914                         if ((szc = ppa[an_idx]->p_szc) != 0) {
 915                                 npgs = page_get_pagecnt(szc);
 916                                 an_idx = P2ROUNDUP(an_idx + 1, npgs);
 917                         } else {
 918                                 an_idx++;
 919                         }
 920                 }
 921                 /*
 922                  * Since we cache the entire DISM segment, we want to
 923                  * set ppp to point to the first slot that corresponds
 924                  * to the requested addr, i.e. pg_idx.
 925                  */
 926                 mutex_exit(&sptd->spt_lock);
 927                 *ppp = &(sptd->spt_ppa[pg_idx]);
 928                 return (0);
 929         }
 930         if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
 931             SEGP_FORCE_WIRED) == SEGP_FAIL) {
 932                 mutex_exit(&sptd->spt_lock);
 933                 *ppp = NULL;
 934                 return (ENOTSUP);
 935         }
 936 
 937         /*
 938          * No need to worry about protections because DISM pages are always rw.
 939          */
 940         pl = pplist = NULL;
 941         amp = sptd->spt_amp;
 942 
 943         /*
 944          * Do we need to build the ppa array?
 945          */
 946         if (sptd->spt_ppa == NULL) {
 947                 pgcnt_t lpg_cnt = 0;
 948 
 949                 pl_built = 1;
 950                 tot_npages = btopr(sptd->spt_amp->size);
 951 
 952                 ASSERT(sptd->spt_pcachecnt == 0);
 953                 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
 954                 pl = pplist;
 955 
 956                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 957                 for (an_idx = 0; an_idx < tot_npages; ) {
 958                         ap = anon_get_ptr(amp->ahp, an_idx);
 959                         /*
  960                          * Cache only mlocked pages.  For large pages,
  961                          * if one (constituent) page is mlocked, then
  962                          * all pages of that large page are cached as
  963                          * well.  This allows quick lookups in the ppa
  964                          * array.
 965                          */
 966                         if ((ap != NULL) && (lpg_cnt != 0 ||
 967                             (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
 968 
 969                                 swap_xlate(ap, &vp, &off);
 970                                 pp = page_lookup(vp, off, SE_SHARED);
 971                                 ASSERT(pp != NULL);
 972                                 if (lpg_cnt == 0) {
 973                                         lpg_cnt++;
 974                                         /*
 975                                          * For a small page, we are done --
  976                                          * lpg_cnt is reset to 0 below.
 977                                          *
 978                                          * For a large page, we are guaranteed
 979                                          * to find the anon structures of all
 980                                          * constituent pages and a non-zero
 981                                          * lpg_cnt ensures that we don't test
 982                                          * for mlock for these. We are done
  983                                          * when lpg_cnt reaches (npgs + 1).
 984                                          * If we are not the first constituent
 985                                          * page, restart at the first one.
 986                                          */
 987                                         npgs = page_get_pagecnt(pp->p_szc);
 988                                         if (!IS_P2ALIGNED(an_idx, npgs)) {
 989                                                 an_idx = P2ALIGN(an_idx, npgs);
 990                                                 page_unlock(pp);
 991                                                 continue;
 992                                         }
 993                                 }
 994                                 if (++lpg_cnt > npgs)
 995                                         lpg_cnt = 0;
 996 
 997                                 /*
 998                                  * availrmem is decremented only
 999                                  * for unlocked pages
1000                                  */
1001                                 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1002                                         claim_availrmem++;
1003                                 pplist[an_idx] = pp;
1004                         }
1005                         an_idx++;
1006                 }
1007                 ANON_LOCK_EXIT(&amp->a_rwlock);
1008 
1009                 if (claim_availrmem) {
1010                         mutex_enter(&freemem_lock);
1011                         if (availrmem < tune.t_minarmem + claim_availrmem) {
1012                                 mutex_exit(&freemem_lock);
1013                                 ret = ENOTSUP;
1014                                 claim_availrmem = 0;
1015                                 goto insert_fail;
1016                         } else {
1017                                 availrmem -= claim_availrmem;
1018                         }
1019                         mutex_exit(&freemem_lock);
1020                 }
1021 
1022                 sptd->spt_ppa = pl;
1023         } else {
1024                 /*
1025                  * We already have a valid ppa[].
1026                  */
1027                 pl = sptd->spt_ppa;
1028         }
1029 
1030         ASSERT(pl != NULL);
1031 
1032         ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1033             sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1034             segspt_reclaim);
1035         if (ret == SEGP_FAIL) {
1036                 /*
1037                  * seg_pinsert failed. We return
1038                  * ENOTSUP, so that the as_pagelock() code will
1039                  * then try the slower F_SOFTLOCK path.
1040                  */
1041                 if (pl_built) {
1042                         /*
1043                          * No one else has referenced the ppa[].
1044                          * We created it and we need to destroy it.
1045                          */
1046                         sptd->spt_ppa = NULL;
1047                 }
1048                 ret = ENOTSUP;
1049                 goto insert_fail;
1050         }
1051 
1052         /*
1053          * In either case, we increment softlockcnt on the 'real' segment.
1054          */
1055         sptd->spt_pcachecnt++;
1056         atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1057 
1058         ppa = sptd->spt_ppa;
1059         for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1060                 if (ppa[an_idx] == NULL) {
1061                         mutex_exit(&sptd->spt_lock);
1062                         seg_pinactive(seg, NULL, seg->s_base,
1063                             sptd->spt_amp->size,
1064                             pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1065                         *ppp = NULL;
1066                         return (ENOTSUP);
1067                 }
1068                 if ((szc = ppa[an_idx]->p_szc) != 0) {
1069                         npgs = page_get_pagecnt(szc);
1070                         an_idx = P2ROUNDUP(an_idx + 1, npgs);
1071                 } else {
1072                         an_idx++;
1073                 }
1074         }
1075         /*
1076          * We can now drop the sptd->spt_lock since the ppa[]
 1077          * exists and we have incremented pcachecnt.
1078          */
1079         mutex_exit(&sptd->spt_lock);
1080 
1081         /*
1082          * Since we cache the entire segment, we want to
1083          * set ppp to point to the first slot that corresponds
1084          * to the requested addr, i.e. pg_idx.
1085          */
1086         *ppp = &(sptd->spt_ppa[pg_idx]);
1087         return (0);
1088 
1089 insert_fail:
1090         /*
 1091          * We only reach this code if we tried and failed to set up
 1092          * a new ppa[].
 1093          *
 1094          * It is now safe to drop the lock on the dummy seg.
1095          */
1096         mutex_exit(&sptd->spt_lock);
1097 
1098         if (pl_built) {
1099                 if (claim_availrmem) {
1100                         mutex_enter(&freemem_lock);
1101                         availrmem += claim_availrmem;
1102                         mutex_exit(&freemem_lock);
1103                 }
1104 
1105                 /*
1106                  * We created pl and we need to destroy it.
1107                  */
1108                 pplist = pl;
1109                 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1110                         if (pplist[an_idx] != NULL)
1111                                 page_unlock(pplist[an_idx]);
1112                 }
1113                 kmem_free(pl, sizeof (page_t *) * tot_npages);
1114         }
1115 
1116         if (shmd->shm_softlockcnt <= 0) {
1117                 if (AS_ISUNMAPWAIT(seg->s_as)) {
1118                         mutex_enter(&seg->s_as->a_contents);
1119                         if (AS_ISUNMAPWAIT(seg->s_as)) {
1120                                 AS_CLRUNMAPWAIT(seg->s_as);
1121                                 cv_broadcast(&seg->s_as->a_cv);
1122                         }
1123                         mutex_exit(&seg->s_as->a_contents);
1124                 }
1125         }
1126         *ppp = NULL;
1127         return (ret);
1128 }
1129 
1130 
1131 
1132 /*
 1133  * Return locked pages over a given range.
1134  *
1135  * We will cache the entire ISM segment and save the pplist for the
1136  * entire segment in the ppa field of the underlying ISM segment structure.
1137  * Later, during a call to segspt_reclaim() we will use this ppa array
1138  * to page_unlock() all of the pages and then we will free this ppa list.
1139  */
1140 /*ARGSUSED*/
1141 static int
1142 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1143     struct page ***ppp, enum lock_type type, enum seg_rw rw)
1144 {
1145         struct shm_data *shmd = (struct shm_data *)seg->s_data;
1146         struct seg      *sptseg = shmd->shm_sptseg;
1147         struct spt_data *sptd = sptseg->s_data;
1148         pgcnt_t np, page_index, npages;
1149         caddr_t a, spt_base;
1150         struct page **pplist, **pl, *pp;
1151         struct anon_map *amp;
1152         ulong_t anon_index;
1153         int ret = ENOTSUP;
1154         uint_t  pl_built = 0;
1155         struct anon *ap;
1156         struct vnode *vp;
1157         u_offset_t off;
1158 
1159         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1160         ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1161 
1162 
1163         /*
1164          * We want to lock/unlock the entire ISM segment. Therefore,
 1165          * we will be using the underlying sptseg and its base address
1166          * and length for the caching arguments.
1167          */
1168         ASSERT(sptseg);
1169         ASSERT(sptd);
1170 
1171         if (sptd->spt_flags & SHM_PAGEABLE) {
1172                 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1173         }
1174 
1175         page_index = seg_page(seg, addr);
1176         npages = btopr(len);
1177 
1178         /*
 1179          * Check if the request is larger than the number of pages
 1180          * covered by the amp.
1181          */
1182         if (page_index + npages > btopr(sptd->spt_amp->size)) {
1183                 *ppp = NULL;
1184                 return (ENOTSUP);
1185         }
1186 
1187         if (type == L_PAGEUNLOCK) {
1188 
1189                 ASSERT(sptd->spt_ppa != NULL);
1190 
1191                 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1192                     sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1193 
1194                 /*
1195                  * If someone is blocked while unmapping, we purge
1196                  * segment page cache and thus reclaim pplist synchronously
1197                  * without waiting for seg_pasync_thread. This speeds up
1198                  * unmapping in cases where munmap(2) is called, while
1199                  * raw async i/o is still in progress or where a thread
1200                  * exits on data fault in a multithreaded application.
1201                  */
1202                 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1203                         segspt_purge(seg);
1204                 }
1205                 return (0);
1206         }
1207 
1208         /* The L_PAGELOCK case... */
1209 
1210         /*
1211          * First try to find pages in segment page cache, without
1212          * holding the segment lock.
1213          */
1214         pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1215             S_WRITE, SEGP_FORCE_WIRED);
1216         if (pplist != NULL) {
1217                 ASSERT(sptd->spt_ppa == pplist);
1218                 ASSERT(sptd->spt_ppa[page_index]);
1219                 /*
1220                  * Since we cache the entire ISM segment, we want to
1221                  * set ppp to point to the first slot that corresponds
1222                  * to the requested addr, i.e. page_index.
1223                  */
1224                 *ppp = &(sptd->spt_ppa[page_index]);
1225                 return (0);
1226         }
1227 
1228         mutex_enter(&sptd->spt_lock);
1229 
1230         /*
1231          * try to find pages in segment page cache
1232          */
1233         pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1234             S_WRITE, SEGP_FORCE_WIRED);
1235         if (pplist != NULL) {
1236                 ASSERT(sptd->spt_ppa == pplist);
1237                 /*
1238                  * Since we cache the entire segment, we want to
1239                  * set ppp to point to the first slot that corresponds
1240                  * to the requested addr, i.e. page_index.
1241                  */
1242                 mutex_exit(&sptd->spt_lock);
1243                 *ppp = &(sptd->spt_ppa[page_index]);
1244                 return (0);
1245         }
1246 
1247         if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1248             SEGP_FORCE_WIRED) == SEGP_FAIL) {
1249                 mutex_exit(&sptd->spt_lock);
1250                 *ppp = NULL;
1251                 return (ENOTSUP);
1252         }
1253 
1254         /*
1255          * No need to worry about protections because ISM pages
1256          * are always rw.
1257          */
1258         pl = pplist = NULL;
1259 
1260         /*
1261          * Do we need to build the ppa array?
1262          */
1263         if (sptd->spt_ppa == NULL) {
1264                 ASSERT(sptd->spt_ppa == pplist);
1265 
1266                 spt_base = sptseg->s_base;
1267                 pl_built = 1;
1268 
1269                 /*
1270                  * availrmem is decremented once during anon_swap_adjust()
1271                  * and is incremented during the anon_unresv(), which is
1272                  * called from shm_rm_amp() when the segment is destroyed.
1273                  */
1274                 amp = sptd->spt_amp;
1275                 ASSERT(amp != NULL);
1276 
1277                 /* pcachecnt is protected by sptd->spt_lock */
1278                 ASSERT(sptd->spt_pcachecnt == 0);
1279                 pplist = kmem_zalloc(sizeof (page_t *)
1280                     * btopr(sptd->spt_amp->size), KM_SLEEP);
1281                 pl = pplist;
1282 
1283                 anon_index = seg_page(sptseg, spt_base);
1284 
1285                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1286                 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1287                     a += PAGESIZE, anon_index++, pplist++) {
1288                         ap = anon_get_ptr(amp->ahp, anon_index);
1289                         ASSERT(ap != NULL);
1290                         swap_xlate(ap, &vp, &off);
1291                         pp = page_lookup(vp, off, SE_SHARED);
1292                         ASSERT(pp != NULL);
1293                         *pplist = pp;
1294                 }
1295                 ANON_LOCK_EXIT(&amp->a_rwlock);
1296 
1297                 if (a < (spt_base + sptd->spt_amp->size)) {
1298                         ret = ENOTSUP;
1299                         goto insert_fail;
1300                 }
1301                 sptd->spt_ppa = pl;
1302         } else {
1303                 /*
1304                  * We already have a valid ppa[].
1305                  */
1306                 pl = sptd->spt_ppa;
1307         }
1308 
1309         ASSERT(pl != NULL);
1310 
1311         ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1312             sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1313             segspt_reclaim);
1314         if (ret == SEGP_FAIL) {
1315                 /*
1316                  * seg_pinsert failed. We return
1317                  * ENOTSUP, so that the as_pagelock() code will
1318                  * then try the slower F_SOFTLOCK path.
1319                  */
1320                 if (pl_built) {
1321                         /*
1322                          * No one else has referenced the ppa[].
1323                          * We created it and we need to destroy it.
1324                          */
1325                         sptd->spt_ppa = NULL;
1326                 }
1327                 ret = ENOTSUP;
1328                 goto insert_fail;
1329         }
1330 
1331         /*
1332          * In either case, we increment softlockcnt on the 'real' segment.
1333          */
1334         sptd->spt_pcachecnt++;
1335         atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1336 
1337         /*
1338          * We can now drop the sptd->spt_lock since the ppa[]
         * exists and we have incremented pcachecnt.
1340          */
1341         mutex_exit(&sptd->spt_lock);
1342 
1343         /*
1344          * Since we cache the entire segment, we want to
1345          * set ppp to point to the first slot that corresponds
1346          * to the requested addr, i.e. page_index.
1347          */
1348         *ppp = &(sptd->spt_ppa[page_index]);
1349         return (0);
1350 
1351 insert_fail:
1352         /*
         * We only reach this code if we tried to set up a new ppa[] and
         * failed.
         *
         * Drop the lock on the dummy seg now that the attempt is over.
1357          */
1358         mutex_exit(&sptd->spt_lock);
1359 
1360         if (pl_built) {
1361                 /*
1362                  * We created pl and we need to destroy it.
1363                  */
1364                 pplist = pl;
1365                 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1366                 while (np) {
1367                         page_unlock(*pplist);
1368                         np--;
1369                         pplist++;
1370                 }
1371                 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1372         }
1373         if (shmd->shm_softlockcnt <= 0) {
1374                 if (AS_ISUNMAPWAIT(seg->s_as)) {
1375                         mutex_enter(&seg->s_as->a_contents);
1376                         if (AS_ISUNMAPWAIT(seg->s_as)) {
1377                                 AS_CLRUNMAPWAIT(seg->s_as);
1378                                 cv_broadcast(&seg->s_as->a_cv);
1379                         }
1380                         mutex_exit(&seg->s_as->a_contents);
1381                 }
1382         }
1383         *ppp = NULL;
1384         return (ret);
1385 }
1386 
1387 /*
1388  * purge any cached pages in the I/O page cache
1389  */
1390 static void
1391 segspt_purge(struct seg *seg)
1392 {
1393         seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1394 }
1395 
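/*
 * Reclaim callback registered with seg_pinsert().  The seg pcache invokes it
 * when a cached entry for this segment is released or purged; the caller that
 * drops the last pcachecnt reference tears down the cached ppa[], unlocks the
 * pages and wakes up any waiters.
 */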
1396 static int
1397 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1398         enum seg_rw rw, int async)
1399 {
1400         struct seg *seg = (struct seg *)ptag;
1401         struct  shm_data *shmd = (struct shm_data *)seg->s_data;
1402         struct  seg     *sptseg;
1403         struct  spt_data *sptd;
1404         pgcnt_t npages, i, free_availrmem = 0;
1405         int     done = 0;
1406 
1407 #ifdef lint
1408         addr = addr;
1409 #endif
1410         sptseg = shmd->shm_sptseg;
1411         sptd = sptseg->s_data;
1412         npages = (len >> PAGESHIFT);
1413         ASSERT(npages);
1414         ASSERT(sptd->spt_pcachecnt != 0);
1415         ASSERT(sptd->spt_ppa == pplist);
1416         ASSERT(npages == btopr(sptd->spt_amp->size));
1417         ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1418 
1419         /*
1420          * Acquire the lock on the dummy seg and destroy the
1421          * ppa array IF this is the last pcachecnt.
1422          */
1423         mutex_enter(&sptd->spt_lock);
1424         if (--sptd->spt_pcachecnt == 0) {
1425                 for (i = 0; i < npages; i++) {
1426                         if (pplist[i] == NULL) {
1427                                 continue;
1428                         }
1429                         if (rw == S_WRITE) {
1430                                 hat_setrefmod(pplist[i]);
1431                         } else {
1432                                 hat_setref(pplist[i]);
1433                         }
1434                         if ((sptd->spt_flags & SHM_PAGEABLE) &&
1435                             (sptd->spt_ppa_lckcnt[i] == 0))
1436                                 free_availrmem++;
1437                         page_unlock(pplist[i]);
1438                 }
1439                 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1440                         mutex_enter(&freemem_lock);
1441                         availrmem += free_availrmem;
1442                         mutex_exit(&freemem_lock);
1443                 }
1444                 /*
                 * Since we want to cache/uncache the entire ISM segment,
1446                  * we will track the pplist in a segspt specific field
1447                  * ppa, that is initialized at the time we add an entry to
1448                  * the cache.
1449                  */
1450                 ASSERT(sptd->spt_pcachecnt == 0);
1451                 kmem_free(pplist, sizeof (page_t *) * npages);
1452                 sptd->spt_ppa = NULL;
1453                 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1454                 sptd->spt_gen++;
1455                 cv_broadcast(&sptd->spt_cv);
1456                 done = 1;
1457         }
1458         mutex_exit(&sptd->spt_lock);
1459 
1460         /*
         * If we are the pcache async thread, or were called via
         * seg_ppurge_wiredpp(), we may not hold the AS lock (in that case the
         * async argument is not 0). This means that if softlockcnt drops to 0
         * after the decrement below, the address space may get freed. We
         * can't allow that, since after the softlock count drops to 0 we
         * still need to access the as structure for a possible wakeup of
         * unmap waiters. To prevent the as from disappearing we take this
         * segment's shm_segfree_syncmtx. segspt_shmfree() also takes this
         * mutex as a barrier to make sure this routine completes before the
         * segment is freed.
         *
         * The second complication we have to deal with in the async case is
         * the possibility of a missed wakeup of an unmap wait thread. When we
         * don't hold the as lock here, we may take the a_contents lock before
         * an unmap wait thread that was first to see that softlockcnt was
         * still not 0. As a result we would fail to wake it up. To avoid this
         * race, if async is not 0 and we drop softlockcnt to 0, we set the
         * nounmapwait flag in the as structure; an unmap wait thread will not
         * block if this flag is set.
1479          */
1480         if (async)
1481                 mutex_enter(&shmd->shm_segfree_syncmtx);
1482 
1483         /*
1484          * Now decrement softlockcnt.
1485          */
1486         ASSERT(shmd->shm_softlockcnt > 0);
1487         atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1488 
1489         if (shmd->shm_softlockcnt <= 0) {
1490                 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1491                         mutex_enter(&seg->s_as->a_contents);
1492                         if (async)
1493                                 AS_SETNOUNMAPWAIT(seg->s_as);
1494                         if (AS_ISUNMAPWAIT(seg->s_as)) {
1495                                 AS_CLRUNMAPWAIT(seg->s_as);
1496                                 cv_broadcast(&seg->s_as->a_cv);
1497                         }
1498                         mutex_exit(&seg->s_as->a_contents);
1499                 }
1500         }
1501 
1502         if (async)
1503                 mutex_exit(&shmd->shm_segfree_syncmtx);
1504 
1505         return (done);
1506 }
1507 
1508 /*
1509  * Do a F_SOFTUNLOCK call over the range requested.
1510  * The range must have already been F_SOFTLOCK'ed.
1511  *
1512  * The calls to acquire and release the anon map lock mutex were
1513  * removed in order to avoid a deadly embrace during a DR
 * memory delete operation.  (E.g., DR blocks while waiting for an
 * exclusive lock on a page that is being used for kaio; the
1516  * thread that will complete the kaio and call segspt_softunlock
1517  * blocks on the anon map lock; another thread holding the anon
1518  * map lock blocks on another page lock via the segspt_shmfault
1519  * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1520  *
1521  * The appropriateness of the removal is based upon the following:
1522  * 1. If we are holding a segment's reader lock and the page is held
1523  * shared, then the corresponding element in anonmap which points to
1524  * anon struct cannot change and there is no need to acquire the
1525  * anonymous map lock.
1526  * 2. Threads in segspt_softunlock have a reader lock on the segment
1527  * and already have the shared page lock, so we are guaranteed that
1528  * the anon map slot cannot change and therefore can call anon_get_ptr()
1529  * without grabbing the anonymous map lock.
1530  * 3. Threads that softlock a shared page break copy-on-write, even if
 * it is a read.  Thus cow faults can be ignored with respect to soft
1532  * unlocking, since the breaking of cow means that the anon slot(s) will
1533  * not be shared.
1534  */
1535 static void
1536 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1537         size_t len, enum seg_rw rw)
1538 {
1539         struct shm_data *shmd = (struct shm_data *)seg->s_data;
1540         struct seg      *sptseg;
1541         struct spt_data *sptd;
1542         page_t *pp;
1543         caddr_t adr;
1544         struct vnode *vp;
1545         u_offset_t offset;
1546         ulong_t anon_index;
1547         struct anon_map *amp;           /* XXX - for locknest */
1548         struct anon *ap = NULL;
1549         pgcnt_t npages;
1550 
1551         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1552 
1553         sptseg = shmd->shm_sptseg;
1554         sptd = sptseg->s_data;
1555 
1556         /*
1557          * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1558          * and therefore their pages are SE_SHARED locked
1559          * for the entire life of the segment.
1560          */
1561         if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1562             ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1563                 goto softlock_decrement;
1564         }
1565 
1566         /*
1567          * Any thread is free to do a page_find and
1568          * page_unlock() on the pages within this seg.
1569          *
1570          * We are already holding the as->a_lock on the user's
1571          * real segment, but we need to hold the a_lock on the
1572          * underlying dummy as. This is mostly to satisfy the
1573          * underlying HAT layer.
1574          */
1575         AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1576         hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1577         AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1578 
1579         amp = sptd->spt_amp;
1580         ASSERT(amp != NULL);
1581         anon_index = seg_page(sptseg, sptseg_addr);
1582 
1583         for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1584                 ap = anon_get_ptr(amp->ahp, anon_index++);
1585                 ASSERT(ap != NULL);
1586                 swap_xlate(ap, &vp, &offset);
1587 
1588                 /*
1589                  * Use page_find() instead of page_lookup() to
1590                  * find the page since we know that it has a
1591                  * "shared" lock.
1592                  */
1593                 pp = page_find(vp, offset);
1594                 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1595                 if (pp == NULL) {
1596                         panic("segspt_softunlock: "
1597                             "addr %p, ap %p, vp %p, off %llx",
1598                             (void *)adr, (void *)ap, (void *)vp, offset);
1599                         /*NOTREACHED*/
1600                 }
1601 
1602                 if (rw == S_WRITE) {
1603                         hat_setrefmod(pp);
1604                 } else if (rw != S_OTHER) {
1605                         hat_setref(pp);
1606                 }
1607                 page_unlock(pp);
1608         }
1609 
1610 softlock_decrement:
1611         npages = btopr(len);
1612         ASSERT(shmd->shm_softlockcnt >= npages);
1613         atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1614         if (shmd->shm_softlockcnt == 0) {
1615                 /*
                 * All SOFTLOCKS are gone. Wake up any waiting
1617                  * unmappers so they can try again to unmap.
1618                  * Check for waiters first without the mutex
1619                  * held so we don't always grab the mutex on
1620                  * softunlocks.
1621                  */
1622                 if (AS_ISUNMAPWAIT(seg->s_as)) {
1623                         mutex_enter(&seg->s_as->a_contents);
1624                         if (AS_ISUNMAPWAIT(seg->s_as)) {
1625                                 AS_CLRUNMAPWAIT(seg->s_as);
1626                                 cv_broadcast(&seg->s_as->a_cv);
1627                         }
1628                         mutex_exit(&seg->s_as->a_contents);
1629                 }
1630         }
1631 }
1632 
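/*
 * Attach a process to the shared segment: allocate the per-process shm_data,
 * point it at the shared sptseg/sptas, install the default lgrp shared-memory
 * policy and share the dummy segment's page tables via hat_share() (always
 * for ISM; for DISM only when the HAT supports HAT_DYNAMIC_ISM_UNMAP).
 * On success the anon map's refcnt is bumped.
 */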
1633 int
1634 segspt_shmattach(struct seg *seg, caddr_t *argsp)
1635 {
1636         struct shm_data *shmd_arg = (struct shm_data *)argsp;
1637         struct shm_data *shmd;
1638         struct anon_map *shm_amp = shmd_arg->shm_amp;
1639         struct spt_data *sptd;
1640         int error = 0;
1641 
1642         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1643 
1644         shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1645         if (shmd == NULL)
1646                 return (ENOMEM);
1647 
1648         shmd->shm_sptas = shmd_arg->shm_sptas;
1649         shmd->shm_amp = shm_amp;
1650         shmd->shm_sptseg = shmd_arg->shm_sptseg;
1651 
1652         (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1653             NULL, 0, seg->s_size);
1654 
1655         mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1656 
1657         seg->s_data = (void *)shmd;
1658         seg->s_ops = &segspt_shmops;
1659         seg->s_szc = shmd->shm_sptseg->s_szc;
1660         sptd = shmd->shm_sptseg->s_data;
1661 
1662         if (sptd->spt_flags & SHM_PAGEABLE) {
1663                 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1664                     KM_NOSLEEP)) == NULL) {
1665                         seg->s_data = (void *)NULL;
1666                         kmem_free(shmd, (sizeof (*shmd)));
1667                         return (ENOMEM);
1668                 }
1669                 shmd->shm_lckpgs = 0;
1670                 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1671                         if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1672                             shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1673                             seg->s_size, seg->s_szc)) != 0) {
1674                                 kmem_free(shmd->shm_vpage,
1675                                     btopr(shm_amp->size));
1676                         }
1677                 }
1678         } else {
1679                 error = hat_share(seg->s_as->a_hat, seg->s_base,
1680                     shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1681                     seg->s_size, seg->s_szc);
1682         }
1683         if (error) {
1684                 seg->s_szc = 0;
1685                 seg->s_data = (void *)NULL;
1686                 kmem_free(shmd, (sizeof (*shmd)));
1687         } else {
1688                 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1689                 shm_amp->refcnt++;
1690                 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1691         }
1692         return (error);
1693 }
1694 
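/*
 * Detach from the shared segment.  The unmap must cover the whole segment and
 * cannot proceed while softlocks are outstanding; one attempt is made to purge
 * the pcache before giving up with EAGAIN.  Any DISM locks are released, the
 * HAT mappings are unshared and the segment is freed.
 */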
1695 int
1696 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1697 {
1698         struct shm_data *shmd = (struct shm_data *)seg->s_data;
1699         int reclaim = 1;
1700 
1701         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1702 retry:
1703         if (shmd->shm_softlockcnt > 0) {
1704                 if (reclaim == 1) {
1705                         segspt_purge(seg);
1706                         reclaim = 0;
1707                         goto retry;
1708                 }
1709                 return (EAGAIN);
1710         }
1711 
1712         if (ssize != seg->s_size) {
1713 #ifdef DEBUG
1714                 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1715                     ssize, seg->s_size);
1716 #endif
1717                 return (EINVAL);
1718         }
1719 
1720         (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1721             NULL, 0);
1722         hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1723 
1724         seg_free(seg);
1725 
1726         return (0);
1727 }
1728 
1729 void
1730 segspt_shmfree(struct seg *seg)
1731 {
1732         struct shm_data *shmd = (struct shm_data *)seg->s_data;
1733         struct anon_map *shm_amp = shmd->shm_amp;
1734 
1735         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1736 
1737         (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1738             MC_UNLOCK, NULL, 0);
1739 
1740         /*
1741          * Need to increment refcnt when attaching
1742          * and decrement when detaching because of dup().
1743          */
1744         ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1745         shm_amp->refcnt--;
1746         ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1747 
1748         if (shmd->shm_vpage) {       /* only for DISM */
1749                 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1750                 shmd->shm_vpage = NULL;
1751         }
1752 
1753         /*
1754          * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1755          * still working with this segment without holding as lock.
1756          */
1757         ASSERT(shmd->shm_softlockcnt == 0);
1758         mutex_enter(&shmd->shm_segfree_syncmtx);
1759         mutex_destroy(&shmd->shm_segfree_syncmtx);
1760 
1761         kmem_free(shmd, sizeof (*shmd));
1762 }
1763 
1764 /*ARGSUSED*/
1765 int
1766 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1767 {
1768         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1769 
1770         /*
         * A shared page table is more than a shared mapping.  An individual
         * process sharing the page tables can't change protections, because
         * there is only one set of page tables.  This will be allowed once
         * private page tables are supported.
1776          */
1777 /* need to return correct status error? */
1778         return (0);
1779 }
1780 
1781 
1782 faultcode_t
1783 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1784     size_t len, enum fault_type type, enum seg_rw rw)
1785 {
1786         struct  shm_data        *shmd = (struct shm_data *)seg->s_data;
1787         struct  seg             *sptseg = shmd->shm_sptseg;
1788         struct  as              *curspt = shmd->shm_sptas;
1789         struct  spt_data        *sptd = sptseg->s_data;
1790         pgcnt_t npages;
1791         size_t  size;
1792         caddr_t segspt_addr, shm_addr;
1793         page_t  **ppa;
1794         int     i;
1795         ulong_t an_idx = 0;
1796         int     err = 0;
1797         int     dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1798         size_t  pgsz;
1799         pgcnt_t pgcnt;
1800         caddr_t a;
1801         pgcnt_t pidx;
1802 
1803 #ifdef lint
1804         hat = hat;
1805 #endif
1806         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1807 
1808         /*
         * Because of the way spt is implemented, the realsize of the
         * segment does not have to be equal to the segment size itself.
         * The segment size is often a multiple of a page size larger than
         * PAGESIZE. The realsize is rounded up to the nearest PAGESIZE
         * based on what the user requested. This is a bit of ugliness
         * that is historical but not easily fixed without re-designing
         * the higher levels of ISM.
1817          */
1818         ASSERT(addr >= seg->s_base);
1819         if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1820                 return (FC_NOMAP);
1821         /*
1822          * For all of the following cases except F_PROT, we need to
1823          * make any necessary adjustments to addr and len
1824          * and get all of the necessary page_t's into an array called ppa[].
1825          *
1826          * The code in shmat() forces base addr and len of ISM segment
1827          * to be aligned to largest page size supported. Therefore,
1828          * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
1829          * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
1830          * in large pagesize chunks, or else we will screw up the HAT
1831          * layer by calling hat_memload_array() with differing page sizes
1832          * over a given virtual range.
1833          */
1834         pgsz = page_get_pagesize(sptseg->s_szc);
1835         pgcnt = page_get_pagecnt(sptseg->s_szc);
1836         shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
1837         size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
1838         npages = btopr(size);
1839 
1840         /*
1841          * Now we need to convert from addr in segshm to addr in segspt.
1842          */
1843         an_idx = seg_page(seg, shm_addr);
1844         segspt_addr = sptseg->s_base + ptob(an_idx);
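        /*
         * Worked example (illustrative page sizes only): assume the dummy
         * segment uses 4MB large pages (pgsz == 0x400000) over 8KB base
         * pages.  A fault at addr == seg->s_base + 0x401000 with len 0x2000
         * yields shm_addr == seg->s_base + 0x400000, size == 0x400000 and
         * npages == 512; an_idx is then the base-page offset of shm_addr
         * within the segment, and segspt_addr is the matching address in
         * the dummy sptseg.
         */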
1845 
1846         ASSERT((segspt_addr + ptob(npages)) <=
1847             (sptseg->s_base + sptd->spt_realsize));
1848         ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
1849 
1850         switch (type) {
1851 
1852         case F_SOFTLOCK:
1853 
1854                 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
1855                 /*
1856                  * Fall through to the F_INVAL case to load up the hat layer
1857                  * entries with the HAT_LOAD_LOCK flag.
1858                  */
1859                 /* FALLTHRU */
1860         case F_INVAL:
1861 
1862                 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
1863                         return (FC_NOMAP);
1864 
1865                 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1866 
1867                 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
1868                 if (err != 0) {
1869                         if (type == F_SOFTLOCK) {
1870                                 atomic_add_long((ulong_t *)(
1871                                     &(shmd->shm_softlockcnt)), -npages);
1872                         }
1873                         goto dism_err;
1874                 }
1875                 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1876                 a = segspt_addr;
1877                 pidx = 0;
1878                 if (type == F_SOFTLOCK) {
1879 
1880                         /*
1881                          * Load up the translation keeping it
1882                          * locked and don't unlock the page.
1883                          */
1884                         for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1885                                 hat_memload_array(sptseg->s_as->a_hat,
1886                                     a, pgsz, &ppa[pidx], sptd->spt_prot,
1887                                     HAT_LOAD_LOCK | HAT_LOAD_SHARE);
1888                         }
1889                 } else {
1890                         /*
1891                          * Migrate pages marked for migration
1892                          */
1893                         if (lgrp_optimizations())
1894                                 page_migrate(seg, shm_addr, ppa, npages);
1895 
1896                         for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1897                                 hat_memload_array(sptseg->s_as->a_hat,
1898                                     a, pgsz, &ppa[pidx],
1899                                     sptd->spt_prot,
1900                                     HAT_LOAD_SHARE);
1901                         }
1902 
1903                         /*
1904                          * And now drop the SE_SHARED lock(s).
1905                          */
1906                         if (dyn_ism_unmap) {
1907                                 for (i = 0; i < npages; i++) {
1908                                         page_unlock(ppa[i]);
1909                                 }
1910                         }
1911                 }
1912 
1913                 if (!dyn_ism_unmap) {
1914                         if (hat_share(seg->s_as->a_hat, shm_addr,
1915                             curspt->a_hat, segspt_addr, ptob(npages),
1916                             seg->s_szc) != 0) {
1917                                 panic("hat_share err in DISM fault");
1918                                 /* NOTREACHED */
1919                         }
1920                         if (type == F_INVAL) {
1921                                 for (i = 0; i < npages; i++) {
1922                                         page_unlock(ppa[i]);
1923                                 }
1924                         }
1925                 }
1926                 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1927 dism_err:
1928                 kmem_free(ppa, npages * sizeof (page_t *));
1929                 return (err);
1930 
1931         case F_SOFTUNLOCK:
1932 
1933                 /*
                 * This is a bit ugly: we pass in the real seg pointer,
1935                  * but the segspt_addr is the virtual address within the
1936                  * dummy seg.
1937                  */
1938                 segspt_softunlock(seg, segspt_addr, size, rw);
1939                 return (0);
1940 
1941         case F_PROT:
1942 
1943                 /*
1944                  * This takes care of the unusual case where a user
1945                  * allocates a stack in shared memory and a register
1946                  * window overflow is written to that stack page before
1947                  * it is otherwise modified.
1948                  *
1949                  * We can get away with this because ISM segments are
1950                  * always rw. Other than this unusual case, there
1951                  * should be no instances of protection violations.
1952                  */
1953                 return (0);
1954 
1955         default:
1956 #ifdef DEBUG
1957                 panic("segspt_dismfault default type?");
1958 #else
1959                 return (FC_NOMAP);
1960 #endif
1961         }
1962 }
1963 
1964 
1965 faultcode_t
1966 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
1967     size_t len, enum fault_type type, enum seg_rw rw)
1968 {
1969         struct shm_data         *shmd = (struct shm_data *)seg->s_data;
1970         struct seg              *sptseg = shmd->shm_sptseg;
1971         struct as               *curspt = shmd->shm_sptas;
1972         struct spt_data         *sptd   = sptseg->s_data;
1973         pgcnt_t npages;
1974         size_t size;
1975         caddr_t sptseg_addr, shm_addr;
1976         page_t *pp, **ppa;
1977         int     i;
1978         u_offset_t offset;
1979         ulong_t anon_index = 0;
1980         struct vnode *vp;
1981         struct anon_map *amp;           /* XXX - for locknest */
1982         struct anon *ap = NULL;
1983         size_t          pgsz;
1984         pgcnt_t         pgcnt;
1985         caddr_t         a;
1986         pgcnt_t         pidx;
1987         size_t          sz;
1988 
1989 #ifdef lint
1990         hat = hat;
1991 #endif
1992 
1993         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1994 
1995         if (sptd->spt_flags & SHM_PAGEABLE) {
1996                 return (segspt_dismfault(hat, seg, addr, len, type, rw));
1997         }
1998 
1999         /*
         * Because of the way spt is implemented, the realsize of the
         * segment does not have to be equal to the segment size itself.
         * The segment size is often a multiple of a page size larger than
         * PAGESIZE. The realsize is rounded up to the nearest PAGESIZE
         * based on what the user requested. This is a bit of ugliness
         * that is historical but not easily fixed without re-designing
         * the higher levels of ISM.
2008          */
2009         ASSERT(addr >= seg->s_base);
2010         if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2011                 return (FC_NOMAP);
2012         /*
2013          * For all of the following cases except F_PROT, we need to
2014          * make any necessary adjustments to addr and len
2015          * and get all of the necessary page_t's into an array called ppa[].
2016          *
2017          * The code in shmat() forces base addr and len of ISM segment
2018          * to be aligned to largest page size supported. Therefore,
2019          * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2020          * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2021          * in large pagesize chunks, or else we will screw up the HAT
2022          * layer by calling hat_memload_array() with differing page sizes
2023          * over a given virtual range.
2024          */
2025         pgsz = page_get_pagesize(sptseg->s_szc);
2026         pgcnt = page_get_pagecnt(sptseg->s_szc);
2027         shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2028         size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2029         npages = btopr(size);
2030 
2031         /*
2032          * Now we need to convert from addr in segshm to addr in segspt.
2033          */
2034         anon_index = seg_page(seg, shm_addr);
2035         sptseg_addr = sptseg->s_base + ptob(anon_index);
2036 
2037         /*
2038          * And now we may have to adjust npages downward if we have
2039          * exceeded the realsize of the segment or initial anon
2040          * allocations.
2041          */
2042         if ((sptseg_addr + ptob(npages)) >
2043             (sptseg->s_base + sptd->spt_realsize))
2044                 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2045 
2046         npages = btopr(size);
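        /*
         * At this point size (and hence npages) has been clipped so that the
         * range never extends past spt_realsize of the dummy segment; the
         * HAT is only asked to load translations for the real (requested)
         * portion of the segment.
         */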
2047 
2048         ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2049         ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2050 
2051         switch (type) {
2052 
2053         case F_SOFTLOCK:
2054 
2055                 /*
2056                  * availrmem is decremented once during anon_swap_adjust()
2057                  * and is incremented during the anon_unresv(), which is
2058                  * called from shm_rm_amp() when the segment is destroyed.
2059                  */
2060                 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2061                 /*
2062                  * Some platforms assume that ISM pages are SE_SHARED
2063                  * locked for the entire life of the segment.
2064                  */
2065                 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2066                         return (0);
2067                 /*
2068                  * Fall through to the F_INVAL case to load up the hat layer
2069                  * entries with the HAT_LOAD_LOCK flag.
2070                  */
2071 
2072                 /* FALLTHRU */
2073         case F_INVAL:
2074 
2075                 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2076                         return (FC_NOMAP);
2077 
2078                 /*
2079                  * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2080                  * may still rely on this call to hat_share(). That
                 * would imply that those HATs can fault on a
2082                  * HAT_LOAD_LOCK translation, which would seem
2083                  * contradictory.
2084                  */
2085                 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2086                         if (hat_share(seg->s_as->a_hat, seg->s_base,
2087                             curspt->a_hat, sptseg->s_base,
2088                             sptseg->s_size, sptseg->s_szc) != 0) {
2089                                 panic("hat_share error in ISM fault");
2090                                 /*NOTREACHED*/
2091                         }
2092                         return (0);
2093                 }
2094                 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2095 
2096                 /*
                 * There is no need to lock the real seg here, because all of
                 * our work will be on the underlying dummy seg.
2100                  *
2101                  * sptseg_addr and npages now account for large pages.
2102                  */
2103                 amp = sptd->spt_amp;
2104                 ASSERT(amp != NULL);
2105                 anon_index = seg_page(sptseg, sptseg_addr);
2106 
2107                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2108                 for (i = 0; i < npages; i++) {
2109                         ap = anon_get_ptr(amp->ahp, anon_index++);
2110                         ASSERT(ap != NULL);
2111                         swap_xlate(ap, &vp, &offset);
2112                         pp = page_lookup(vp, offset, SE_SHARED);
2113                         ASSERT(pp != NULL);
2114                         ppa[i] = pp;
2115                 }
2116                 ANON_LOCK_EXIT(&amp->a_rwlock);
2117                 ASSERT(i == npages);
2118 
2119                 /*
2120                  * We are already holding the as->a_lock on the user's
2121                  * real segment, but we need to hold the a_lock on the
2122                  * underlying dummy as. This is mostly to satisfy the
2123                  * underlying HAT layer.
2124                  */
2125                 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
2126                 a = sptseg_addr;
2127                 pidx = 0;
2128                 if (type == F_SOFTLOCK) {
2129                         /*
2130                          * Load up the translation keeping it
2131                          * locked and don't unlock the page.
2132                          */
2133                         for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2134                                 sz = MIN(pgsz, ptob(npages - pidx));
2135                                 hat_memload_array(sptseg->s_as->a_hat, a,
2136                                     sz, &ppa[pidx], sptd->spt_prot,
2137                                     HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2138                         }
2139                 } else {
2140                         /*
2141                          * Migrate pages marked for migration.
2142                          */
2143                         if (lgrp_optimizations())
2144                                 page_migrate(seg, shm_addr, ppa, npages);
2145 
2146                         for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2147                                 sz = MIN(pgsz, ptob(npages - pidx));
2148                                 hat_memload_array(sptseg->s_as->a_hat,
2149                                     a, sz, &ppa[pidx],
2150                                     sptd->spt_prot, HAT_LOAD_SHARE);
2151                         }
2152 
2153                         /*
2154                          * And now drop the SE_SHARED lock(s).
2155                          */
2156                         for (i = 0; i < npages; i++)
2157                                 page_unlock(ppa[i]);
2158                 }
2159                 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
2160 
2161                 kmem_free(ppa, sizeof (page_t *) * npages);
2162                 return (0);
2163         case F_SOFTUNLOCK:
2164 
2165                 /*
                 * This is a bit ugly: we pass in the real seg pointer,
2167                  * but the sptseg_addr is the virtual address within the
2168                  * dummy seg.
2169                  */
2170                 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2171                 return (0);
2172 
2173         case F_PROT:
2174 
2175                 /*
2176                  * This takes care of the unusual case where a user
2177                  * allocates a stack in shared memory and a register
2178                  * window overflow is written to that stack page before
2179                  * it is otherwise modified.
2180                  *
2181                  * We can get away with this because ISM segments are
2182                  * always rw. Other than this unusual case, there
2183                  * should be no instances of protection violations.
2184                  */
2185                 return (0);
2186 
2187         default:
2188 #ifdef DEBUG
2189                 cmn_err(CE_WARN, "segspt_shmfault default type?");
2190 #endif
2191                 return (FC_NOMAP);
2192         }
2193 }
2194 
2195 /*ARGSUSED*/
2196 static faultcode_t
2197 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2198 {
2199         return (0);
2200 }
2201 
2202 /*ARGSUSED*/
2203 static int
2204 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2205 {
2206         return (0);
2207 }
2208 
2209 /*
2210  * duplicate the shared page tables
2211  */
2212 int
2213 segspt_shmdup(struct seg *seg, struct seg *newseg)
2214 {
2215         struct shm_data         *shmd = (struct shm_data *)seg->s_data;
2216         struct anon_map         *amp = shmd->shm_amp;
2217         struct shm_data         *shmd_new;
2218         struct seg              *spt_seg = shmd->shm_sptseg;
2219         struct spt_data         *sptd = spt_seg->s_data;
2220         int                     error = 0;
2221 
2222         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
2223 
2224         shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2225         newseg->s_data = (void *)shmd_new;
2226         shmd_new->shm_sptas = shmd->shm_sptas;
2227         shmd_new->shm_amp = amp;
2228         shmd_new->shm_sptseg = shmd->shm_sptseg;
2229         newseg->s_ops = &segspt_shmops;
2230         newseg->s_szc = seg->s_szc;
2231         ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2232 
2233         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2234         amp->refcnt++;
2235         ANON_LOCK_EXIT(&amp->a_rwlock);
2236 
2237         if (sptd->spt_flags & SHM_PAGEABLE) {
2238                 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2239                 shmd_new->shm_lckpgs = 0;
2240                 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2241                         if ((error = hat_share(newseg->s_as->a_hat,
2242                             newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2243                             seg->s_size, seg->s_szc)) != 0) {
2244                                 kmem_free(shmd_new->shm_vpage,
2245                                     btopr(amp->size));
2246                         }
2247                 }
2248                 return (error);
2249         } else {
2250                 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2251                     shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2252                     seg->s_szc));
2253 
2254         }
2255 }
2256 
2257 /*ARGSUSED*/
2258 int
2259 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2260 {
2261         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2262         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2263 
2264         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2265 
2266         /*
2267          * ISM segment is always rw.
2268          */
2269         return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2270 }
2271 
2272 /*
 * Return an array of locked large pages; for empty slots, allocate
 * private zero-filled anon pages.
2275  */
2276 static int
2277 spt_anon_getpages(
2278         struct seg *sptseg,
2279         caddr_t sptaddr,
2280         size_t len,
2281         page_t *ppa[])
2282 {
2283         struct  spt_data *sptd = sptseg->s_data;
2284         struct  anon_map *amp = sptd->spt_amp;
2285         enum    seg_rw rw = sptd->spt_prot;
2286         uint_t  szc = sptseg->s_szc;
2287         size_t  pg_sz, share_sz = page_get_pagesize(szc);
2288         pgcnt_t lp_npgs;
2289         caddr_t lp_addr, e_sptaddr;
2290         uint_t  vpprot, ppa_szc = 0;
2291         struct  vpage *vpage = NULL;
2292         ulong_t j, ppa_idx;
2293         int     err, ierr = 0;
2294         pgcnt_t an_idx;
2295         anon_sync_obj_t cookie;
2296         int anon_locked = 0;
2297         pgcnt_t amp_pgs;
2298 
2299 
2300         ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2301         ASSERT(len != 0);
2302 
2303         pg_sz = share_sz;
2304         lp_npgs = btop(pg_sz);
2305         lp_addr = sptaddr;
2306         e_sptaddr = sptaddr + len;
2307         an_idx = seg_page(sptseg, sptaddr);
2308         ppa_idx = 0;
2309 
2310         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2311 
2312         amp_pgs = page_get_pagecnt(amp->a_szc);
2313 
2314         /*CONSTCOND*/
2315         while (1) {
2316                 for (; lp_addr < e_sptaddr;
2317                     an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2318 
2319                         /*
                         * If we're currently locked and we cross into a new
                         * anon-map large-page chunk, unlock our current anon
                         * chunk.
2322                          */
2323                         if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2324                                 anon_array_exit(&cookie);
2325                                 anon_locked = 0;
2326                         }
2327                         if (!anon_locked) {
2328                                 anon_array_enter(amp, an_idx, &cookie);
2329                                 anon_locked = 1;
2330                         }
2331                         ppa_szc = (uint_t)-1;
2332                         ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2333                             lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2334                             &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2335 
2336                         if (ierr != 0) {
2337                                 if (ierr > 0) {
2338                                         err = FC_MAKE_ERR(ierr);
2339                                         goto lpgs_err;
2340                                 }
2341                                 break;
2342                         }
2343                 }
2344                 if (lp_addr == e_sptaddr) {
2345                         break;
2346                 }
2347                 ASSERT(lp_addr < e_sptaddr);
2348 
2349                 /*
                 * ierr == -1 means we failed to allocate a large page,
                 * so do a size-down operation.
2352                  *
2353                  * ierr == -2 means some other process that privately shares
2354                  * pages with this process has allocated a larger page and we
2355                  * need to retry with larger pages. So do a size up
2356                  * operation. This relies on the fact that large pages are
2357                  * never partially shared i.e. if we share any constituent
2358                  * page of a large page with another process we must share the
2359                  * entire large page. Note this cannot happen for SOFTLOCK
                 * case, unless the current address (lp_addr) is at the beginning
2361                  * of the next page size boundary because the other process
2362                  * couldn't have relocated locked pages.
2363                  */
2364                 ASSERT(ierr == -1 || ierr == -2);
2365                 if (segvn_anypgsz) {
2366                         ASSERT(ierr == -2 || szc != 0);
2367                         ASSERT(ierr == -1 || szc < sptseg->s_szc);
2368                         szc = (ierr == -1) ? szc - 1 : szc + 1;
2369                 } else {
2370                         /*
2371                          * For faults and segvn_anypgsz == 0
2372                          * we need to be careful not to loop forever
2373                          * if existing page is found with szc other
                         * than 0 or sptseg->s_szc. This could be due
2375                          * to page relocations on behalf of DR or
2376                          * more likely large page creation. For this
2377                          * case simply re-size to existing page's szc
2378                          * if returned by anon_map_getpages().
2379                          */
2380                         if (ppa_szc == (uint_t)-1) {
2381                                 szc = (ierr == -1) ? 0 : sptseg->s_szc;
2382                         } else {
2383                                 ASSERT(ppa_szc <= sptseg->s_szc);
2384                                 ASSERT(ierr == -2 || ppa_szc < szc);
2385                                 ASSERT(ierr == -1 || ppa_szc > szc);
2386                                 szc = ppa_szc;
2387                         }
2388                 }
2389                 pg_sz = page_get_pagesize(szc);
2390                 lp_npgs = btop(pg_sz);
2391                 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2392         }
2393         if (anon_locked) {
2394                 anon_array_exit(&cookie);
2395         }
2396         ANON_LOCK_EXIT(&amp->a_rwlock);
2397         return (0);
2398 
2399 lpgs_err:
2400         if (anon_locked) {
2401                 anon_array_exit(&cookie);
2402         }
2403         ANON_LOCK_EXIT(&amp->a_rwlock);
2404         for (j = 0; j < ppa_idx; j++)
2405                 page_unlock(ppa[j]);
2406         return (err);
2407 }
2408 
2409 /*
2410  * count the number of bytes in a set of spt pages that are currently not
2411  * locked
2412  */
2413 static rctl_qty_t
2414 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2415 {
2416         ulong_t i;
2417         rctl_qty_t unlocked = 0;
2418 
2419         for (i = 0; i < npages; i++) {
2420                 if (ppa[i]->p_lckcnt == 0)
2421                         unlocked += PAGESIZE;
2422         }
2423         return (unlocked);
2424 }
2425 
2426 extern  u_longlong_t randtick(void);
2427 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2428 #define NLCK    (NCPU_P2)
2429 /* Random number with a range [0, n-1], n must be power of two */
2430 #define RAND_P2(n)      \
2431         ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
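/*
 * RAND_P2(n) produces a pseudo-random value in [0, n-1]: it XORs bits of the
 * current thread pointer (shifted down by PTR24_LSB) with randtick() and
 * masks the result with n - 1, which is why n must be a power of two.
 */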
2432 
2433 int
2434 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2435     page_t **ppa, ulong_t *lockmap, size_t pos,
2436     rctl_qty_t *locked)
2437 {
2438         struct  shm_data *shmd = seg->s_data;
2439         struct  spt_data *sptd = shmd->shm_sptseg->s_data;
2440         ulong_t i;
2441         int     kernel;
2442         pgcnt_t nlck = 0;
2443         int     rv = 0;
2444         int     use_reserved = 1;
2445 
2446         /* return the number of bytes actually locked */
2447         *locked = 0;
2448 
2449         /*
2450          * To avoid contention on freemem_lock, availrmem and pages_locked
2451          * global counters are updated only every nlck locked pages instead of
2452          * every time.  Reserve nlck locks up front and deduct from this
2453          * reservation for each page that requires a lock.  When the reservation
2454          * is consumed, reserve again.  nlck is randomized, so the competing
2455          * threads do not fall into a cyclic lock contention pattern. When
2456          * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2457          * is used to lock pages.
2458          */
2459         for (i = 0; i < npages; anon_index++, pos++, i++) {
2460                 if (nlck == 0 && use_reserved == 1) {
2461                         nlck = NLCK + RAND_P2(NLCK);
2462                         /* if fewer loops left, decrease nlck */
2463                         nlck = MIN(nlck, npages - i);
2464                         /*
2465                          * Reserve nlck locks up front and deduct from this
2466                          * reservation for each page that requires a lock.  When
2467                          * the reservation is consumed, reserve again.
2468                          */
2469                         mutex_enter(&freemem_lock);
2470                         if ((availrmem - nlck) < pages_pp_maximum) {
2471                                 /* Do not do advance memory reserves */
2472                                 use_reserved = 0;
2473                         } else {
2474                                 availrmem       -= nlck;
2475                                 pages_locked    += nlck;
2476                         }
2477                         mutex_exit(&freemem_lock);
2478                 }
2479                 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2480                         if (sptd->spt_ppa_lckcnt[anon_index] <
2481                             (ushort_t)DISM_LOCK_MAX) {
2482                                 if (++sptd->spt_ppa_lckcnt[anon_index] ==
2483                                     (ushort_t)DISM_LOCK_MAX) {
2484                                         cmn_err(CE_WARN,
2485                                             "DISM page lock limit "
2486                                             "reached on DISM offset 0x%lx\n",
2487                                             anon_index << PAGESHIFT);
2488                                 }
2489                                 kernel = (sptd->spt_ppa &&
2490                                     sptd->spt_ppa[anon_index]);
2491                                 if (!page_pp_lock(ppa[i], 0, kernel ||
2492                                     use_reserved)) {
2493                                         sptd->spt_ppa_lckcnt[anon_index]--;
2494                                         rv = EAGAIN;
2495                                         break;
2496                                 }
2497                                 /* if this is a newly locked page, count it */
2498                                 if (ppa[i]->p_lckcnt == 1) {
2499                                         if (kernel == 0 && use_reserved == 1)
2500                                                 nlck--;
2501                                         *locked += PAGESIZE;
2502                                 }
2503                                 shmd->shm_lckpgs++;
2504                                 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2505                                 if (lockmap != NULL)
2506                                         BT_SET(lockmap, pos);
2507                         }
2508                 }
2509         }
2510         /* Return unused lock reservation */
2511         if (nlck != 0 && use_reserved == 1) {
2512                 mutex_enter(&freemem_lock);
2513                 availrmem       += nlck;
2514                 pages_locked    -= nlck;
2515                 mutex_exit(&freemem_lock);
2516         }
2517 
2518         return (rv);
2519 }
2520 
2521 int
2522 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2523     rctl_qty_t *unlocked)
2524 {
2525         struct shm_data *shmd = seg->s_data;
2526         struct spt_data *sptd = shmd->shm_sptseg->s_data;
2527         struct anon_map *amp = sptd->spt_amp;
2528         struct anon     *ap;
2529         struct vnode    *vp;
2530         u_offset_t      off;
2531         struct page     *pp;
2532         int             kernel;
2533         anon_sync_obj_t cookie;
2534         ulong_t         i;
2535         pgcnt_t         nlck = 0;
2536         pgcnt_t         nlck_limit = NLCK;
2537 
2538         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2539         for (i = 0; i < npages; i++, anon_index++) {
2540                 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2541                         anon_array_enter(amp, anon_index, &cookie);
2542                         ap = anon_get_ptr(amp->ahp, anon_index);
2543                         ASSERT(ap);
2544 
2545                         swap_xlate(ap, &vp, &off);
2546                         anon_array_exit(&cookie);
2547                         pp = page_lookup(vp, off, SE_SHARED);
2548                         ASSERT(pp);
2549                         /*
                         * availrmem is decremented only for pages which are
                         * not in the seg pcache; for pages in the seg pcache,
                         * availrmem was already decremented in _dismpagelock().
2553                          */
2554                         kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2555                         ASSERT(pp->p_lckcnt > 0);
2556 
2557                         /*
                         * Unlock the page, but do not change availrmem here;
                         * we update it ourselves every nlck loops.
2560                          */
2561                         page_pp_unlock(pp, 0, 1);
2562                         if (pp->p_lckcnt == 0) {
2563                                 if (kernel == 0)
2564                                         nlck++;
2565                                 *unlocked += PAGESIZE;
2566                         }
2567                         page_unlock(pp);
2568                         shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2569                         sptd->spt_ppa_lckcnt[anon_index]--;
2570                         shmd->shm_lckpgs--;
2571                 }
2572 
2573                 /*
2574                  * To reduce freemem_lock contention, do not update availrmem
2575                  * until at least NLCK pages have been unlocked.
2576                  * 1. No need to update if nlck is zero
                 * 2. Always update on the last iteration
2578                  */
2579                 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2580                         mutex_enter(&freemem_lock);
2581                         availrmem       += nlck;
2582                         pages_locked    -= nlck;
2583                         mutex_exit(&freemem_lock);
2584                         nlck = 0;
2585                         nlck_limit = NLCK + RAND_P2(NLCK);
2586                 }
2587         }
2588         ANON_LOCK_EXIT(&amp->a_rwlock);
2589 
2590         return (0);
2591 }
2592 
2593 /*ARGSUSED*/
2594 static int
2595 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2596     int attr, int op, ulong_t *lockmap, size_t pos)
2597 {
2598         struct shm_data *shmd = seg->s_data;
2599         struct seg      *sptseg = shmd->shm_sptseg;
2600         struct spt_data *sptd = sptseg->s_data;
2601         struct kshmid   *sp = sptd->spt_amp->a_sp;
2602         pgcnt_t         npages, a_npages;
2603         page_t          **ppa;
2604         pgcnt_t         an_idx, a_an_idx, ppa_idx;
2605         caddr_t         spt_addr, a_addr;       /* spt and aligned address */
2606         size_t          a_len;                  /* aligned len */
2607         size_t          share_sz;
2608         ulong_t         i;
2609         int             sts = 0;
2610         rctl_qty_t      unlocked = 0;
2611         rctl_qty_t      locked = 0;
2612         struct proc     *p = curproc;
2613         kproject_t      *proj;
2614 
2615         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2616         ASSERT(sp != NULL);
2617 
2618         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2619                 return (0);
2620         }
2621 
2622         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2623         an_idx = seg_page(seg, addr);
2624         npages = btopr(len);
2625 
2626         if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2627                 return (ENOMEM);
2628         }
2629 
2630         /*
2631          * A shm's project never changes, so no lock needed.
2632          * The shm has a hold on the project, so it will not go away.
2633          * Since we have a mapping to shm within this zone, we know
2634          * that the zone will not go away.
2635          */
2636         proj = sp->shm_perm.ipc_proj;
2637 
2638         if (op == MC_LOCK) {
2639 
2640                 /*
                 * Need to align the addr and size request if they are not
                 * aligned, so we can always allocate large page(s); however,
                 * we only lock what was requested in the initial request.
2644                  */
2645                 share_sz = page_get_pagesize(sptseg->s_szc);
2646                 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2647                 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2648                     share_sz);
2649                 a_npages = btop(a_len);
2650                 a_an_idx = seg_page(seg, a_addr);
2651                 spt_addr = sptseg->s_base + ptob(a_an_idx);
2652                 ppa_idx = an_idx - a_an_idx;
2653 
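                     /*
                      * Allocate a page array covering the aligned range;
                      * ppa_idx is where the originally requested pages
                      * start within that array.
                      */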
2654                 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2655                     KM_NOSLEEP)) == NULL) {
2656                         return (ENOMEM);
2657                 }
2658 
2659                 /*
2660                  * Don't cache any new pages for IO and
2661                  * flush any cached pages.
2662                  */
2663                 mutex_enter(&sptd->spt_lock);
2664                 if (sptd->spt_ppa != NULL)
2665                         sptd->spt_flags |= DISM_PPA_CHANGED;
2666 
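                     /*
                      * Get the pages backing the aligned range, creating
                      * them if necessary; they are returned held in ppa.
                      */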
2667                 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2668                 if (sts != 0) {
2669                         mutex_exit(&sptd->spt_lock);
2670                         kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2671                         return (sts);
2672                 }
2673 
2674                 mutex_enter(&sp->shm_mlock);
2675                 /* enforce locked memory rctl */
2676                 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2677 
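                     /*
                      * Charge the project's locked-memory rctl for the
                      * bytes about to be newly locked; fail with EAGAIN
                      * if that would exceed the limit.
                      */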
2678                 mutex_enter(&p->p_lock);
2679                 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2680                         mutex_exit(&p->p_lock);
2681                         sts = EAGAIN;
2682                 } else {
2683                         mutex_exit(&p->p_lock);
2684                         sts = spt_lockpages(seg, an_idx, npages,
2685                             &ppa[ppa_idx], lockmap, pos, &locked);
2686 
2687                         /*
2688                          * Return the rctl charge for any pages that could
2689                          * not be locked.
2690                          */
2691                         if ((unlocked - locked) > 0) {
2692                                 rctl_decr_locked_mem(NULL, proj,
2693                                     (unlocked - locked), 0);
2694                         }
2695                 }
2696                 /*
2697                  * Drop the page locks acquired by spt_anon_getpages().
2698                  */
2699                 for (i = 0; i < a_npages; i++)
2700                         page_unlock(ppa[i]);
2701                 if (sptd->spt_ppa != NULL)
2702                         sptd->spt_flags |= DISM_PPA_CHANGED;
2703                 mutex_exit(&sp->shm_mlock);
2704                 mutex_exit(&sptd->spt_lock);
2705 
2706                 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2707 
2708         } else if (op == MC_UNLOCK) { /* unlock */
2709                 page_t          **ppa;
2710 
2711                 mutex_enter(&sptd->spt_lock);
2712                 if (shmd->shm_lckpgs == 0) {
2713                         mutex_exit(&sptd->spt_lock);
2714                         return (0);
2715                 }
2716                 /*
2717                  * Don't cache new IO pages.
2718                  */
2719                 if (sptd->spt_ppa != NULL)
2720                         sptd->spt_flags |= DISM_PPA_CHANGED;
2721 
2722                 mutex_enter(&sp->shm_mlock);
2723                 sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2724                 if ((ppa = sptd->spt_ppa) != NULL)
2725                         sptd->spt_flags |= DISM_PPA_CHANGED;
2726                 mutex_exit(&sptd->spt_lock);
2727 
2728                 rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2729                 mutex_exit(&sp->shm_mlock);
2730 
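                     /*
                      * If a page array was cached for this segment, purge
                      * it from the seg_pcache now that the pages have been
                      * unlocked.
                      */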
2731                 if (ppa != NULL)
2732                         seg_ppurge_wiredpp(ppa);
2733         }
2734         return (sts);
2735 }
2736 
2737 /*ARGSUSED*/
2738 int
2739 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2740 {
2741         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2742         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2743         spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2744 
2745         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2746 
2747         /*
2748          * An ISM segment is always rw; every page shares spt_prot.
2749          */
2750         while (--pgno >= 0)
2751                 *protv++ = sptd->spt_prot;
2752         return (0);
2753 }
2754 
2755 /*ARGSUSED*/
2756 u_offset_t
2757 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2758 {
2759         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2760 
2761         /* Offset does not matter in ISM memory */
2762 
2763         return ((u_offset_t)0);
2764 }
2765 
2766 /* ARGSUSED */
2767 int
2768 segspt_shmgettype(struct seg *seg, caddr_t addr)
2769 {
2770         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2771         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2772 
2773         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2774 
2775         /*
2776          * The shared memory mapping is always MAP_SHARED; swap is only
2777          * reserved for DISM.
2778          */
2779         return (MAP_SHARED |
2780             ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2781 }
2782 
2783 /*ARGSUSED*/
2784 int
2785 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2786 {
2787         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2788         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2789 
2790         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2791 
2792         *vpp = sptd->spt_vp;
2793         return (0);
2794 }
2795 
2796 /*
2797  * We need to wait for pending IO to complete to a DISM segment in order for
2798  * pages to get kicked out of the seg_pcache.  120 seconds should be more
2799  * than enough time to wait.
2800  */
2801 static clock_t spt_pcache_wait = 120;
2802 
2803 /*ARGSUSED*/
2804 static int
2805 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2806 {
2807         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2808         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2809         struct anon_map *amp;
2810         pgcnt_t pg_idx;
2811         ushort_t gen;
2812         clock_t end_lbolt;
2813         int writer;
2814         page_t **ppa;
2815 
2816         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2817 
2818         if (behav == MADV_FREE) {
2819                 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
2820                         return (0);
2821 
2822                 amp = sptd->spt_amp;
2823                 pg_idx = seg_page(seg, addr);
2824 
2825                 mutex_enter(&sptd->spt_lock);
2826                 if ((ppa = sptd->spt_ppa) == NULL) {
2827                         mutex_exit(&sptd->spt_lock);
2828                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2829                         anon_disclaim(amp, pg_idx, len);
2830                         ANON_LOCK_EXIT(&amp->a_rwlock);
2831                         return (0);
2832                 }
2833 
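                     /*
                      * A cached page array exists; mark it stale and record
                      * the current generation so we can tell below when the
                      * purge has completed.
                      */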
2834                 sptd->spt_flags |= DISM_PPA_CHANGED;
2835                 gen = sptd->spt_gen;
2836 
2837                 mutex_exit(&sptd->spt_lock);
2838 
2839                 /*
2840                  * Purge all DISM cached pages
2841                  */
2842                 seg_ppurge_wiredpp(ppa);
2843 
2844                 /*
2845                  * Drop the AS_LOCK so that other threads can grab it
2846                  * in the as_pageunlock path and hopefully get the segment
2847                  * kicked out of the seg_pcache.  We bump the shm_softlockcnt
2848                  * to keep this segment resident.
2849                  */
2850                 writer = AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock);
2851                 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2852                 AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock);
2853 
2854                 mutex_enter(&sptd->spt_lock);
2855 
2856                 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
2857 
2858                 /*
2859                  * Try to wait for pages to get kicked out of the seg_pcache.
2860                  */
2861                 while (sptd->spt_gen == gen &&
2862                     (sptd->spt_flags & DISM_PPA_CHANGED) &&
2863                     ddi_get_lbolt() < end_lbolt) {
2864                         if (!cv_timedwait_sig(&sptd->spt_cv,
2865                             &sptd->spt_lock, end_lbolt)) {
2866                                 break;
2867                         }
2868                 }
2869 
2870                 mutex_exit(&sptd->spt_lock);
2871 
2872                 /* Regrab the AS_LOCK and release our hold on the segment */
2873                 AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock,
2874                     writer ? RW_WRITER : RW_READER);
2875                 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
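                     /*
                      * Now that our hold is dropped, wake up any threads
                      * waiting in as_unmap() for the softlock count to
                      * drain.
                      */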
2876                 if (shmd->shm_softlockcnt <= 0) {
2877                         if (AS_ISUNMAPWAIT(seg->s_as)) {
2878                                 mutex_enter(&seg->s_as->a_contents);
2879                                 if (AS_ISUNMAPWAIT(seg->s_as)) {
2880                                         AS_CLRUNMAPWAIT(seg->s_as);
2881                                         cv_broadcast(&seg->s_as->a_cv);
2882                                 }
2883                                 mutex_exit(&seg->s_as->a_contents);
2884                         }
2885                 }
2886 
2887                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2888                 anon_disclaim(amp, pg_idx, len);
2889                 ANON_LOCK_EXIT(&amp->a_rwlock);
2890         } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
2891             behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
2892                 int                     already_set;
2893                 ulong_t                 anon_index;
2894                 lgrp_mem_policy_t       policy;
2895                 caddr_t                 shm_addr;
2896                 size_t                  share_size;
2897                 size_t                  size;
2898                 struct seg              *sptseg = shmd->shm_sptseg;
2899                 caddr_t                 sptseg_addr;
2900 
2901                 /*
2902                  * Align address and length to page size of underlying segment
2903                  */
2904                 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
2905                 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
2906                 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
2907                     share_size);
2908 
2909                 amp = shmd->shm_amp;
2910                 anon_index = seg_page(seg, shm_addr);
2911 
2912                 /*
2913                  * Now we may have to adjust the size downward if we have
2914                  * exceeded the real size of the segment or the initial
2915                  * anon allocations.
2916                  */
2917                 sptseg_addr = sptseg->s_base + ptob(anon_index);
2918                 if ((sptseg_addr + size) >
2919                     (sptseg->s_base + sptd->spt_realsize))
2920                         size = (sptseg->s_base + sptd->spt_realsize) -
2921                             sptseg_addr;
2922 
2923                 /*
2924                  * Set memory allocation policy for this segment
2925                  */
2926                 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
2927                 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
2928                     NULL, 0, len);
2929 
2930                 /*
2931                  * If this policy is already set and does not need to be
2932                  * reapplied, don't bother reapplying it.
2933                  */
2934                 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
2935                         return (0);
2936 
2937                 /*
2938                  * Mark any existing pages in the given range for
2939                  * migration.  Flush the I/O page cache first, and use
2940                  * the underlying segment to calculate the anon index
2941                  * and to get the anonmap and vnode pointer.
2942                  */
2943                 if (shmd->shm_softlockcnt > 0)
2944                         segspt_purge(seg);
2945 
2946                 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
2947         }
2948 
2949         return (0);
2950 }
2951 
2952 /*ARGSUSED*/
2953 void
2954 segspt_shmdump(struct seg *seg)
2955 {
2956         /* no-op for ISM segment */
2957 }
2958 
2959 /*
2960  * Get a memory ID for an address in the given segment.
2961  */
2962 static int
2963 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2964 {
2965         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2966         struct anon     *ap;
2967         size_t          anon_index;
2968         struct anon_map *amp = shmd->shm_amp;
2969         struct spt_data *sptd = shmd->shm_sptseg->s_data;
2970         struct seg      *sptseg = shmd->shm_sptseg;
2971         anon_sync_obj_t cookie;
2972 
2973         anon_index = seg_page(seg, addr);
2974 
2975         if (addr > (seg->s_base + sptd->spt_realsize)) {
2976                 return (EFAULT);
2977         }
2978 
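             /*
              * Look up the anon slot for this page, allocating a zero-filled
              * page if none exists yet, so the anon pointer can serve as a
              * stable memory ID.
              */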
2979         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2980         anon_array_enter(amp, anon_index, &cookie);
2981         ap = anon_get_ptr(amp->ahp, anon_index);
2982         if (ap == NULL) {
2983                 struct page *pp;
2984                 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
2985 
2986                 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
2987                 if (pp == NULL) {
2988                         anon_array_exit(&cookie);
2989                         ANON_LOCK_EXIT(&amp->a_rwlock);
2990                         return (ENOMEM);
2991                 }
2992                 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
2993                 page_unlock(pp);
2994         }
2995         anon_array_exit(&cookie);
2996         ANON_LOCK_EXIT(&amp->a_rwlock);
2997         memidp->val[0] = (uintptr_t)ap;
2998         memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
2999         return (0);
3000 }
3001 
3002 /*
3003  * Get memory allocation policy info for specified address in given segment
3004  */
3005 static lgrp_mem_policy_info_t *
3006 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3007 {
3008         struct anon_map         *amp;
3009         ulong_t                 anon_index;
3010         lgrp_mem_policy_info_t  *policy_info;
3011         struct shm_data         *shm_data;
3012 
3013         ASSERT(seg != NULL);
3014 
3015         /*
3016          * Get the anon_map from the segshm data.
3017          *
3018          * Assume that no lock needs to be held on the anon_map, since
3019          * it is protected by its reference count, which must be
3020          * nonzero for an existing segment.  The reader's lock on the
3021          * policy tree still needs to be taken, though.
3022          */
3023         shm_data = (struct shm_data *)seg->s_data;
3024         if (shm_data == NULL)
3025                 return (NULL);
3026         amp = shm_data->shm_amp;
3027         ASSERT(amp->refcnt != 0);
3028 
3029         /*
3030          * Get policy info
3031          *
3032          * Assume starting anon index of 0
3033          */
3034         anon_index = seg_page(seg, addr);
3035         policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3036 
3037         return (policy_info);
3038 }