use NULL dump segop as a shorthand for no-op
Instead of forcing every segment driver to implement a dummy function that
does nothing, handle a NULL dump segop function pointer as a no-op shorthand.
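A minimal sketch of the dispatch-side check (the segop_dump() wrapper name and
signature here are assumed for illustration, not taken from this webrev):

static void
segop_dump(struct seg *seg)
{
	/* a NULL dump op simply means there is nothing to dump */
	if (seg->s_ops->dump == NULL)
		return;

	seg->s_ops->dump(seg);
}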
segspt_ops can be static
There is nothing that needs access to this structure outside of the spt
segment driver itself.
const-ify segment ops structures
There is no reason to keep the segment ops structures writable.
use NULL setpagesize segop as a shorthand for ENOTSUP
Instead of forcing every segment driver to implement a dummy function to
return (hopefully) ENOTSUP, handle a NULL setpagesize segop function pointer
as "return ENOTSUP" shorthand.
use NULL capable segop as a shorthand for no-capabilities
Instead of forcing every segment driver to implement a dummy "return 0"
function, handle a NULL capable segop function pointer as "no capabilities
supported" shorthand.
seg_inherit_notsup is redundant since segop_inherit checks for NULL properly
no need for bad-op segment op functions
The segment drivers have a number of bad-op functions that simply panic.
Keeping the function pointer NULL accomplishes the same thing in most cases.
In the remaining cases, a NULL function pointer results in the proper error
code being returned.
use C99 initializers in segment ops structures
remove whole-process swapping
Long before Unix supported paging, it used process swapping to reclaim
memory. The code is still there, and in theory it runs when we get *extremely*
low on memory. In practice, it never runs, since the definition of low-on-memory
is antiquated. (XXX: define what antiquated means)
You can check the number of swapout/swapin events with kstats:
$ kstat -p ::vm:swapin ::vm:swapout
remove xhat
The xhat infrastructure was added to support hardware such as the Zulu
graphics card - hardware which had on-board MMUs. The VM used the xhat code
to keep the CPU's and Zulu's page tables in sync. Since the only xhat user
was Zulu (which is gone), we can safely remove it, simplifying the whole VM
subsystem.
Assorted notes:
- AS_BUSY flag was used solely by xhat
--- old/usr/src/uts/common/vm/seg_spt.c
+++ new/usr/src/uts/common/vm/seg_spt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 #include <sys/param.h>
26 26 #include <sys/user.h>
27 27 #include <sys/mman.h>
28 28 #include <sys/kmem.h>
29 29 #include <sys/sysmacros.h>
30 30 #include <sys/cmn_err.h>
31 31 #include <sys/systm.h>
32 32 #include <sys/tuneable.h>
33 33 #include <vm/hat.h>
34 34 #include <vm/seg.h>
35 35 #include <vm/as.h>
36 36 #include <vm/anon.h>
37 37 #include <vm/page.h>
38 38 #include <sys/buf.h>
39 39 #include <sys/swap.h>
40 40 #include <sys/atomic.h>
41 41 #include <vm/seg_spt.h>
42 42 #include <sys/debug.h>
43 43 #include <sys/vtrace.h>
44 44 #include <sys/shm.h>
45 45 #include <sys/shm_impl.h>
46 46 #include <sys/lgrp.h>
47 47 #include <sys/vmsystm.h>
48 48 #include <sys/policy.h>
49 49 #include <sys/project.h>
50 50 #include <sys/tnf_probe.h>
51 51 #include <sys/zone.h>
52 52
53 53 #define SEGSPTADDR (caddr_t)0x0
54 54
55 55 /*
56 56 * # pages used for spt
57 57 */
58 58 size_t spt_used;
59 59
60 60 /*
61 61 * segspt_minfree is the memory left for system after ISM
62 62 * locked its pages; it is set up to 5% of availrmem in
63 63 * sptcreate when ISM is created. ISM should not use more
64 64 * than ~90% of availrmem; if it does, then the performance
65 65 * of the system may decrease. Machines with large memories may
66 66 * be able to use up more memory for ISM so we set the default
67 67 * segspt_minfree to 5% (which gives ISM max 95% of availrmem.
68 68 * If somebody wants even more memory for ISM (risking hanging
69 69 * the system) they can patch the segspt_minfree to smaller number.
70 70 */
71 71 pgcnt_t segspt_minfree = 0;
72 72
73 73 static int segspt_create(struct seg *seg, caddr_t argsp);
74 74 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
75 75 static void segspt_free(struct seg *seg);
76 76 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
77 77 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
78 78
79 -static void
80 -segspt_badop()
81 -{
82 - panic("segspt_badop called");
83 - /*NOTREACHED*/
84 -}
85 -
86 -#define SEGSPT_BADOP(t) (t(*)())segspt_badop
87 -
88 -struct seg_ops segspt_ops = {
89 - SEGSPT_BADOP(int), /* dup */
90 - segspt_unmap,
91 - segspt_free,
92 - SEGSPT_BADOP(int), /* fault */
93 - SEGSPT_BADOP(faultcode_t), /* faulta */
94 - SEGSPT_BADOP(int), /* setprot */
95 - SEGSPT_BADOP(int), /* checkprot */
96 - SEGSPT_BADOP(int), /* kluster */
97 - SEGSPT_BADOP(size_t), /* swapout */
98 - SEGSPT_BADOP(int), /* sync */
99 - SEGSPT_BADOP(size_t), /* incore */
100 - SEGSPT_BADOP(int), /* lockop */
101 - SEGSPT_BADOP(int), /* getprot */
102 - SEGSPT_BADOP(u_offset_t), /* getoffset */
103 - SEGSPT_BADOP(int), /* gettype */
104 - SEGSPT_BADOP(int), /* getvp */
105 - SEGSPT_BADOP(int), /* advise */
106 - SEGSPT_BADOP(void), /* dump */
107 - SEGSPT_BADOP(int), /* pagelock */
108 - SEGSPT_BADOP(int), /* setpgsz */
109 - SEGSPT_BADOP(int), /* getmemid */
110 - segspt_getpolicy, /* getpolicy */
111 - SEGSPT_BADOP(int), /* capable */
112 - seg_inherit_notsup /* inherit */
79 +static const struct seg_ops segspt_ops = {
80 + .unmap = segspt_unmap,
81 + .free = segspt_free,
82 + .getpolicy = segspt_getpolicy,
113 83 };
114 84
115 85 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
116 86 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
117 87 static void segspt_shmfree(struct seg *seg);
118 88 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
119 89 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
120 90 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
121 91 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
122 92 register size_t len, register uint_t prot);
123 93 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
124 94 uint_t prot);
125 95 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
126 -static size_t segspt_shmswapout(struct seg *seg);
127 96 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
128 97 register char *vec);
129 98 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
130 99 int attr, uint_t flags);
131 100 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
132 101 int attr, int op, ulong_t *lockmap, size_t pos);
133 102 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
134 103 uint_t *protv);
135 104 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
136 105 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
137 106 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
138 107 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
139 108 uint_t behav);
140 -static void segspt_shmdump(struct seg *seg);
141 109 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
142 110 struct page ***, enum lock_type, enum seg_rw);
143 -static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
144 111 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
145 112 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
146 -static int segspt_shmcapable(struct seg *, segcapability_t);
147 113
148 -struct seg_ops segspt_shmops = {
149 - segspt_shmdup,
150 - segspt_shmunmap,
151 - segspt_shmfree,
152 - segspt_shmfault,
153 - segspt_shmfaulta,
154 - segspt_shmsetprot,
155 - segspt_shmcheckprot,
156 - segspt_shmkluster,
157 - segspt_shmswapout,
158 - segspt_shmsync,
159 - segspt_shmincore,
160 - segspt_shmlockop,
161 - segspt_shmgetprot,
162 - segspt_shmgetoffset,
163 - segspt_shmgettype,
164 - segspt_shmgetvp,
165 - segspt_shmadvise, /* advise */
166 - segspt_shmdump,
167 - segspt_shmpagelock,
168 - segspt_shmsetpgsz,
169 - segspt_shmgetmemid,
170 - segspt_shmgetpolicy,
171 - segspt_shmcapable,
172 - seg_inherit_notsup
114 +const struct seg_ops segspt_shmops = {
115 + .dup = segspt_shmdup,
116 + .unmap = segspt_shmunmap,
117 + .free = segspt_shmfree,
118 + .fault = segspt_shmfault,
119 + .faulta = segspt_shmfaulta,
120 + .setprot = segspt_shmsetprot,
121 + .checkprot = segspt_shmcheckprot,
122 + .kluster = segspt_shmkluster,
123 + .sync = segspt_shmsync,
124 + .incore = segspt_shmincore,
125 + .lockop = segspt_shmlockop,
126 + .getprot = segspt_shmgetprot,
127 + .getoffset = segspt_shmgetoffset,
128 + .gettype = segspt_shmgettype,
129 + .getvp = segspt_shmgetvp,
130 + .advise = segspt_shmadvise,
131 + .pagelock = segspt_shmpagelock,
132 + .getmemid = segspt_shmgetmemid,
133 + .getpolicy = segspt_shmgetpolicy,
173 134 };
174 135
175 136 static void segspt_purge(struct seg *seg);
176 137 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
177 138 enum seg_rw, int);
178 139 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
179 140 page_t **ppa);
180 141
181 142
182 143
183 144 /*ARGSUSED*/
184 145 int
185 146 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
186 147 uint_t prot, uint_t flags, uint_t share_szc)
187 148 {
188 149 int err;
189 150 struct as *newas;
190 151 struct segspt_crargs sptcargs;
191 152
192 153 #ifdef DEBUG
193 154 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
194 155 tnf_ulong, size, size );
195 156 #endif
196 157 if (segspt_minfree == 0) /* leave min 5% of availrmem for */
197 158 segspt_minfree = availrmem/20; /* for the system */
198 159
199 160 if (!hat_supported(HAT_SHARED_PT, (void *)0))
200 161 return (EINVAL);
201 162
202 163 /*
203 164 * get a new as for this shared memory segment
204 165 */
205 166 newas = as_alloc();
206 167 newas->a_proc = NULL;
207 168 sptcargs.amp = amp;
208 169 sptcargs.prot = prot;
209 170 sptcargs.flags = flags;
210 171 sptcargs.szc = share_szc;
211 172 /*
212 173 * create a shared page table (spt) segment
213 174 */
214 175
215 176 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
216 177 as_free(newas);
217 178 return (err);
218 179 }
219 180 *sptseg = sptcargs.seg_spt;
220 181 return (0);
221 182 }
222 183
223 184 void
224 185 sptdestroy(struct as *as, struct anon_map *amp)
225 186 {
226 187
227 188 #ifdef DEBUG
228 189 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
229 190 #endif
230 191 (void) as_unmap(as, SEGSPTADDR, amp->size);
231 192 as_free(as);
232 193 }
233 194
234 195 /*
235 196 * called from seg_free().
236 197 * free (i.e., unlock, unmap, return to free list)
237 198 * all the pages in the given seg.
238 199 */
239 200 void
240 201 segspt_free(struct seg *seg)
241 202 {
242 203 struct spt_data *sptd = (struct spt_data *)seg->s_data;
243 204
244 205 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
245 206
246 207 if (sptd != NULL) {
247 208 if (sptd->spt_realsize)
248 209 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
249 210
250 211 if (sptd->spt_ppa_lckcnt)
251 212 kmem_free(sptd->spt_ppa_lckcnt,
252 213 sizeof (*sptd->spt_ppa_lckcnt)
253 214 * btopr(sptd->spt_amp->size));
254 215 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
255 216 cv_destroy(&sptd->spt_cv);
256 217 mutex_destroy(&sptd->spt_lock);
257 218 kmem_free(sptd, sizeof (*sptd));
258 219 }
259 220 }
260 221
261 222 /*ARGSUSED*/
262 223 static int
263 224 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
264 225 uint_t flags)
265 226 {
266 227 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
267 228
268 229 return (0);
269 230 }
270 231
271 232 /*ARGSUSED*/
272 233 static size_t
273 234 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
274 235 {
275 236 caddr_t eo_seg;
276 237 pgcnt_t npages;
277 238 struct shm_data *shmd = (struct shm_data *)seg->s_data;
278 239 struct seg *sptseg;
279 240 struct spt_data *sptd;
280 241
281 242 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
282 243 #ifdef lint
283 244 seg = seg;
284 245 #endif
285 246 sptseg = shmd->shm_sptseg;
286 247 sptd = sptseg->s_data;
287 248
288 249 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
289 250 eo_seg = addr + len;
290 251 while (addr < eo_seg) {
291 252 /* page exists, and it's locked. */
292 253 *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
293 254 SEG_PAGE_ANON;
294 255 addr += PAGESIZE;
295 256 }
296 257 return (len);
297 258 } else {
298 259 struct anon_map *amp = shmd->shm_amp;
299 260 struct anon *ap;
300 261 page_t *pp;
301 262 pgcnt_t anon_index;
302 263 struct vnode *vp;
303 264 u_offset_t off;
304 265 ulong_t i;
305 266 int ret;
306 267 anon_sync_obj_t cookie;
307 268
308 269 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
309 270 anon_index = seg_page(seg, addr);
310 271 npages = btopr(len);
311 272 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
312 273 return (EINVAL);
313 274 }
314 275 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
315 276 for (i = 0; i < npages; i++, anon_index++) {
316 277 ret = 0;
317 278 anon_array_enter(amp, anon_index, &cookie);
318 279 ap = anon_get_ptr(amp->ahp, anon_index);
319 280 if (ap != NULL) {
320 281 swap_xlate(ap, &vp, &off);
321 282 anon_array_exit(&cookie);
322 283 pp = page_lookup_nowait(vp, off, SE_SHARED);
323 284 if (pp != NULL) {
324 285 ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
325 286 page_unlock(pp);
326 287 }
327 288 } else {
328 289 anon_array_exit(&cookie);
329 290 }
330 291 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
331 292 ret |= SEG_PAGE_LOCKED;
332 293 }
333 294 *vec++ = (char)ret;
334 295 }
335 296 		ANON_LOCK_EXIT(&amp->a_rwlock);
336 297 return (len);
337 298 }
338 299 }
339 300
340 301 static int
341 302 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
342 303 {
343 304 size_t share_size;
344 305
345 306 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
346 307
347 308 /*
348 309 * seg.s_size may have been rounded up to the largest page size
349 310 * in shmat().
350 311 * XXX This should be cleanedup. sptdestroy should take a length
351 312 * argument which should be the same as sptcreate. Then
352 313 * this rounding would not be needed (or is done in shm.c)
353 314 * Only the check for full segment will be needed.
354 315 *
355 316 * XXX -- shouldn't raddr == 0 always? These tests don't seem
356 317 * to be useful at all.
357 318 */
358 319 share_size = page_get_pagesize(seg->s_szc);
359 320 ssize = P2ROUNDUP(ssize, share_size);
360 321
361 322 if (raddr == seg->s_base && ssize == seg->s_size) {
362 323 seg_free(seg);
363 324 return (0);
364 325 } else
365 326 return (EINVAL);
366 327 }
367 328
368 329 int
369 330 segspt_create(struct seg *seg, caddr_t argsp)
370 331 {
371 332 int err;
372 333 caddr_t addr = seg->s_base;
373 334 struct spt_data *sptd;
374 335 struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
375 336 struct anon_map *amp = sptcargs->amp;
376 337 struct kshmid *sp = amp->a_sp;
377 338 struct cred *cred = CRED();
378 339 ulong_t i, j, anon_index = 0;
379 340 pgcnt_t npages = btopr(amp->size);
380 341 struct vnode *vp;
381 342 page_t **ppa;
382 343 uint_t hat_flags;
383 344 size_t pgsz;
384 345 pgcnt_t pgcnt;
385 346 caddr_t a;
386 347 pgcnt_t pidx;
387 348 size_t sz;
388 349 proc_t *procp = curproc;
389 350 rctl_qty_t lockedbytes = 0;
390 351 kproject_t *proj;
391 352
392 353 /*
393 354 * We are holding the a_lock on the underlying dummy as,
394 355 * so we can make calls to the HAT layer.
395 356 */
396 357 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
397 358 ASSERT(sp != NULL);
398 359
399 360 #ifdef DEBUG
400 361 TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
401 362 tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
402 363 #endif
403 364 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
404 365 if (err = anon_swap_adjust(npages))
405 366 return (err);
406 367 }
407 368 err = ENOMEM;
408 369
409 370 if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
410 371 goto out1;
411 372
412 373 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
413 374 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
414 375 KM_NOSLEEP)) == NULL)
415 376 goto out2;
416 377 }
417 378
418 379 mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
419 380
420 381 if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
421 382 goto out3;
422 383
423 384 seg->s_ops = &segspt_ops;
424 385 sptd->spt_vp = vp;
425 386 sptd->spt_amp = amp;
426 387 sptd->spt_prot = sptcargs->prot;
427 388 sptd->spt_flags = sptcargs->flags;
428 389 seg->s_data = (caddr_t)sptd;
429 390 sptd->spt_ppa = NULL;
430 391 sptd->spt_ppa_lckcnt = NULL;
431 392 seg->s_szc = sptcargs->szc;
432 393 cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
433 394 sptd->spt_gen = 0;
434 395
435 396 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
436 397 if (seg->s_szc > amp->a_szc) {
437 398 amp->a_szc = seg->s_szc;
438 399 }
439 400 	ANON_LOCK_EXIT(&amp->a_rwlock);
440 401
441 402 /*
442 403 * Set policy to affect initial allocation of pages in
443 404 * anon_map_createpages()
444 405 */
445 406 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
446 407 NULL, 0, ptob(npages));
447 408
448 409 if (sptcargs->flags & SHM_PAGEABLE) {
449 410 size_t share_sz;
450 411 pgcnt_t new_npgs, more_pgs;
451 412 struct anon_hdr *nahp;
452 413 zone_t *zone;
453 414
454 415 share_sz = page_get_pagesize(seg->s_szc);
455 416 if (!IS_P2ALIGNED(amp->size, share_sz)) {
456 417 /*
457 418 * We are rounding up the size of the anon array
458 419 * on 4 M boundary because we always create 4 M
459 420 * of page(s) when locking, faulting pages and we
460 421 * don't have to check for all corner cases e.g.
461 422 * if there is enough space to allocate 4 M
462 423 * page.
463 424 */
464 425 new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
465 426 more_pgs = new_npgs - npages;
466 427
467 428 /*
468 429 * The zone will never be NULL, as a fully created
469 430 * shm always has an owning zone.
470 431 */
471 432 zone = sp->shm_perm.ipc_zone_ref.zref_zone;
472 433 ASSERT(zone != NULL);
473 434 if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
474 435 err = ENOMEM;
475 436 goto out4;
476 437 }
477 438
478 439 nahp = anon_create(new_npgs, ANON_SLEEP);
479 440 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
480 441 (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
481 442 ANON_SLEEP);
482 443 anon_release(amp->ahp, npages);
483 444 amp->ahp = nahp;
484 445 ASSERT(amp->swresv == ptob(npages));
485 446 amp->swresv = amp->size = ptob(new_npgs);
486 447 		ANON_LOCK_EXIT(&amp->a_rwlock);
487 448 npages = new_npgs;
488 449 }
489 450
490 451 sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
491 452 sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
492 453 sptd->spt_pcachecnt = 0;
493 454 sptd->spt_realsize = ptob(npages);
494 455 sptcargs->seg_spt = seg;
495 456 return (0);
496 457 }
497 458
498 459 /*
499 460 * get array of pages for each anon slot in amp
500 461 */
501 462 if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
502 463 seg, addr, S_CREATE, cred)) != 0)
503 464 goto out4;
504 465
505 466 mutex_enter(&sp->shm_mlock);
506 467
507 468 /* May be partially locked, so, count bytes to charge for locking */
508 469 for (i = 0; i < npages; i++)
509 470 if (ppa[i]->p_lckcnt == 0)
510 471 lockedbytes += PAGESIZE;
511 472
512 473 proj = sp->shm_perm.ipc_proj;
513 474
514 475 if (lockedbytes > 0) {
515 476 mutex_enter(&procp->p_lock);
516 477 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
517 478 mutex_exit(&procp->p_lock);
518 479 mutex_exit(&sp->shm_mlock);
519 480 for (i = 0; i < npages; i++)
520 481 page_unlock(ppa[i]);
521 482 err = ENOMEM;
522 483 goto out4;
523 484 }
524 485 mutex_exit(&procp->p_lock);
525 486 }
526 487
527 488 /*
528 489 * addr is initial address corresponding to the first page on ppa list
529 490 */
530 491 for (i = 0; i < npages; i++) {
531 492 /* attempt to lock all pages */
532 493 if (page_pp_lock(ppa[i], 0, 1) == 0) {
533 494 /*
534 495 * if unable to lock any page, unlock all
535 496 * of them and return error
536 497 */
537 498 for (j = 0; j < i; j++)
538 499 page_pp_unlock(ppa[j], 0, 1);
539 500 for (i = 0; i < npages; i++)
540 501 page_unlock(ppa[i]);
541 502 rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
542 503 mutex_exit(&sp->shm_mlock);
543 504 err = ENOMEM;
544 505 goto out4;
545 506 }
546 507 }
547 508 mutex_exit(&sp->shm_mlock);
548 509
549 510 /*
550 511 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
551 512 * for the entire life of the segment. For example platforms
552 513 * that do not support Dynamic Reconfiguration.
553 514 */
554 515 hat_flags = HAT_LOAD_SHARE;
555 516 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
556 517 hat_flags |= HAT_LOAD_LOCK;
557 518
558 519 /*
559 520 * Load translations one lare page at a time
560 521 * to make sure we don't create mappings bigger than
561 522 * segment's size code in case underlying pages
562 523 * are shared with segvn's segment that uses bigger
563 524 * size code than we do.
564 525 */
565 526 pgsz = page_get_pagesize(seg->s_szc);
566 527 pgcnt = page_get_pagecnt(seg->s_szc);
567 528 for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
568 529 sz = MIN(pgsz, ptob(npages - pidx));
569 530 hat_memload_array(seg->s_as->a_hat, a, sz,
570 531 &ppa[pidx], sptd->spt_prot, hat_flags);
571 532 }
572 533
573 534 /*
574 535 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
575 536 * we will leave the pages locked SE_SHARED for the life
576 537 * of the ISM segment. This will prevent any calls to
577 538 * hat_pageunload() on this ISM segment for those platforms.
578 539 */
579 540 if (!(hat_flags & HAT_LOAD_LOCK)) {
580 541 /*
581 542 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
582 543 * we no longer need to hold the SE_SHARED lock on the pages,
583 544 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
584 545 * SE_SHARED lock on the pages as necessary.
585 546 */
586 547 for (i = 0; i < npages; i++)
587 548 page_unlock(ppa[i]);
588 549 }
589 550 sptd->spt_pcachecnt = 0;
590 551 kmem_free(ppa, ((sizeof (page_t *)) * npages));
591 552 sptd->spt_realsize = ptob(npages);
592 553 atomic_add_long(&spt_used, npages);
593 554 sptcargs->seg_spt = seg;
594 555 return (0);
595 556
596 557 out4:
597 558 seg->s_data = NULL;
598 559 kmem_free(vp, sizeof (*vp));
599 560 cv_destroy(&sptd->spt_cv);
600 561 out3:
601 562 mutex_destroy(&sptd->spt_lock);
602 563 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
603 564 kmem_free(ppa, (sizeof (*ppa) * npages));
604 565 out2:
605 566 kmem_free(sptd, sizeof (*sptd));
606 567 out1:
607 568 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
608 569 anon_swap_restore(npages);
609 570 return (err);
610 571 }
611 572
612 573 /*ARGSUSED*/
613 574 void
614 575 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
615 576 {
616 577 struct page *pp;
617 578 struct spt_data *sptd = (struct spt_data *)seg->s_data;
618 579 pgcnt_t npages;
619 580 ulong_t anon_idx;
620 581 struct anon_map *amp;
621 582 struct anon *ap;
622 583 struct vnode *vp;
623 584 u_offset_t off;
624 585 uint_t hat_flags;
625 586 int root = 0;
626 587 pgcnt_t pgs, curnpgs = 0;
627 588 page_t *rootpp;
628 589 rctl_qty_t unlocked_bytes = 0;
629 590 kproject_t *proj;
630 591 kshmid_t *sp;
631 592
632 593 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
633 594
634 595 len = P2ROUNDUP(len, PAGESIZE);
635 596
636 597 npages = btop(len);
637 598
638 599 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
639 600 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
640 601 (sptd->spt_flags & SHM_PAGEABLE)) {
641 602 hat_flags = HAT_UNLOAD_UNMAP;
642 603 }
643 604
644 605 hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
645 606
646 607 amp = sptd->spt_amp;
647 608 if (sptd->spt_flags & SHM_PAGEABLE)
648 609 npages = btop(amp->size);
649 610
650 611 ASSERT(amp != NULL);
651 612
652 613 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
653 614 sp = amp->a_sp;
654 615 proj = sp->shm_perm.ipc_proj;
655 616 mutex_enter(&sp->shm_mlock);
656 617 }
657 618 for (anon_idx = 0; anon_idx < npages; anon_idx++) {
658 619 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
659 620 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
660 621 panic("segspt_free_pages: null app");
661 622 /*NOTREACHED*/
662 623 }
663 624 } else {
664 625 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
665 626 == NULL)
666 627 continue;
667 628 }
668 629 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
669 630 swap_xlate(ap, &vp, &off);
670 631
671 632 /*
672 633 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
673 634 * the pages won't be having SE_SHARED lock at this
674 635 * point.
675 636 *
676 637 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
677 638 * the pages are still held SE_SHARED locked from the
678 639 * original segspt_create()
679 640 *
680 641 * Our goal is to get SE_EXCL lock on each page, remove
681 642 * permanent lock on it and invalidate the page.
682 643 */
683 644 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
684 645 if (hat_flags == HAT_UNLOAD_UNMAP)
685 646 pp = page_lookup(vp, off, SE_EXCL);
686 647 else {
687 648 if ((pp = page_find(vp, off)) == NULL) {
688 649 panic("segspt_free_pages: "
689 650 "page not locked");
690 651 /*NOTREACHED*/
691 652 }
692 653 if (!page_tryupgrade(pp)) {
693 654 page_unlock(pp);
694 655 pp = page_lookup(vp, off, SE_EXCL);
695 656 }
696 657 }
697 658 if (pp == NULL) {
698 659 panic("segspt_free_pages: "
699 660 "page not in the system");
700 661 /*NOTREACHED*/
701 662 }
702 663 ASSERT(pp->p_lckcnt > 0);
703 664 page_pp_unlock(pp, 0, 1);
704 665 if (pp->p_lckcnt == 0)
705 666 unlocked_bytes += PAGESIZE;
706 667 } else {
707 668 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
708 669 continue;
709 670 }
710 671 /*
711 672 * It's logical to invalidate the pages here as in most cases
712 673 * these were created by segspt.
713 674 */
714 675 if (pp->p_szc != 0) {
715 676 if (root == 0) {
716 677 ASSERT(curnpgs == 0);
717 678 root = 1;
718 679 rootpp = pp;
719 680 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
720 681 ASSERT(pgs > 1);
721 682 ASSERT(IS_P2ALIGNED(pgs, pgs));
722 683 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
723 684 curnpgs--;
724 685 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
725 686 ASSERT(curnpgs == 1);
726 687 ASSERT(page_pptonum(pp) ==
727 688 page_pptonum(rootpp) + (pgs - 1));
728 689 page_destroy_pages(rootpp);
729 690 root = 0;
730 691 curnpgs = 0;
731 692 } else {
732 693 ASSERT(curnpgs > 1);
733 694 ASSERT(page_pptonum(pp) ==
734 695 page_pptonum(rootpp) + (pgs - curnpgs));
735 696 curnpgs--;
736 697 }
737 698 } else {
738 699 if (root != 0 || curnpgs != 0) {
739 700 panic("segspt_free_pages: bad large page");
740 701 /*NOTREACHED*/
741 702 }
742 703 /*
743 704 * Before destroying the pages, we need to take care
744 705 * of the rctl locked memory accounting. For that
745 706 * we need to calculte the unlocked_bytes.
746 707 */
747 708 if (pp->p_lckcnt > 0)
748 709 unlocked_bytes += PAGESIZE;
749 710 /*LINTED: constant in conditional context */
750 711 VN_DISPOSE(pp, B_INVAL, 0, kcred);
751 712 }
752 713 }
753 714 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
754 715 if (unlocked_bytes > 0)
755 716 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
756 717 mutex_exit(&sp->shm_mlock);
757 718 }
758 719 if (root != 0 || curnpgs != 0) {
759 720 panic("segspt_free_pages: bad large page");
760 721 /*NOTREACHED*/
761 722 }
762 723
763 724 /*
764 725 * mark that pages have been released
765 726 */
766 727 sptd->spt_realsize = 0;
767 728
768 729 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
769 730 atomic_add_long(&spt_used, -npages);
770 731 anon_swap_restore(npages);
771 732 }
772 733 }
773 734
774 735 /*
775 736 * Get memory allocation policy info for specified address in given segment
776 737 */
777 738 static lgrp_mem_policy_info_t *
778 739 segspt_getpolicy(struct seg *seg, caddr_t addr)
779 740 {
780 741 struct anon_map *amp;
781 742 ulong_t anon_index;
782 743 lgrp_mem_policy_info_t *policy_info;
783 744 struct spt_data *spt_data;
784 745
785 746 ASSERT(seg != NULL);
786 747
787 748 /*
788 749 * Get anon_map from segspt
789 750 *
790 751 * Assume that no lock needs to be held on anon_map, since
791 752 * it should be protected by its reference count which must be
792 753 * nonzero for an existing segment
793 754 * Need to grab readers lock on policy tree though
794 755 */
795 756 spt_data = (struct spt_data *)seg->s_data;
796 757 if (spt_data == NULL)
797 758 return (NULL);
798 759 amp = spt_data->spt_amp;
799 760 ASSERT(amp->refcnt != 0);
800 761
801 762 /*
802 763 * Get policy info
803 764 *
804 765 * Assume starting anon index of 0
805 766 */
806 767 anon_index = seg_page(seg, addr);
807 768 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
808 769
809 770 return (policy_info);
810 771 }
811 772
812 773 /*
813 774 * DISM only.
814 775 * Return locked pages over a given range.
815 776 *
816 777 * We will cache all DISM locked pages and save the pplist for the
817 778 * entire segment in the ppa field of the underlying DISM segment structure.
818 779 * Later, during a call to segspt_reclaim() we will use this ppa array
819 780 * to page_unlock() all of the pages and then we will free this ppa list.
820 781 */
821 782 /*ARGSUSED*/
822 783 static int
823 784 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
824 785 struct page ***ppp, enum lock_type type, enum seg_rw rw)
825 786 {
826 787 struct shm_data *shmd = (struct shm_data *)seg->s_data;
827 788 struct seg *sptseg = shmd->shm_sptseg;
828 789 struct spt_data *sptd = sptseg->s_data;
829 790 pgcnt_t pg_idx, npages, tot_npages, npgs;
830 791 struct page **pplist, **pl, **ppa, *pp;
831 792 struct anon_map *amp;
832 793 spgcnt_t an_idx;
833 794 int ret = ENOTSUP;
834 795 uint_t pl_built = 0;
835 796 struct anon *ap;
836 797 struct vnode *vp;
837 798 u_offset_t off;
838 799 pgcnt_t claim_availrmem = 0;
839 800 uint_t szc;
840 801
841 802 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
842 803 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
843 804
844 805 /*
845 806 * We want to lock/unlock the entire ISM segment. Therefore,
846 807 * we will be using the underlying sptseg and it's base address
847 808 * and length for the caching arguments.
848 809 */
849 810 ASSERT(sptseg);
850 811 ASSERT(sptd);
851 812
852 813 pg_idx = seg_page(seg, addr);
853 814 npages = btopr(len);
854 815
855 816 /*
856 817 * check if the request is larger than number of pages covered
857 818 * by amp
858 819 */
859 820 if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
860 821 *ppp = NULL;
861 822 return (ENOTSUP);
862 823 }
863 824
864 825 if (type == L_PAGEUNLOCK) {
865 826 ASSERT(sptd->spt_ppa != NULL);
866 827
867 828 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
868 829 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
869 830
870 831 /*
871 832 * If someone is blocked while unmapping, we purge
872 833 * segment page cache and thus reclaim pplist synchronously
873 834 * without waiting for seg_pasync_thread. This speeds up
874 835 * unmapping in cases where munmap(2) is called, while
875 836 * raw async i/o is still in progress or where a thread
876 837 * exits on data fault in a multithreaded application.
877 838 */
878 839 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
879 840 (AS_ISUNMAPWAIT(seg->s_as) &&
880 841 shmd->shm_softlockcnt > 0)) {
881 842 segspt_purge(seg);
882 843 }
883 844 return (0);
884 845 }
885 846
886 847 /* The L_PAGELOCK case ... */
887 848
888 849 if (sptd->spt_flags & DISM_PPA_CHANGED) {
889 850 segspt_purge(seg);
890 851 /*
891 852 * for DISM ppa needs to be rebuild since
892 853 * number of locked pages could be changed
893 854 */
894 855 *ppp = NULL;
895 856 return (ENOTSUP);
896 857 }
897 858
898 859 /*
899 860 * First try to find pages in segment page cache, without
900 861 * holding the segment lock.
901 862 */
902 863 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
903 864 S_WRITE, SEGP_FORCE_WIRED);
904 865 if (pplist != NULL) {
905 866 ASSERT(sptd->spt_ppa != NULL);
906 867 ASSERT(sptd->spt_ppa == pplist);
907 868 ppa = sptd->spt_ppa;
908 869 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
909 870 if (ppa[an_idx] == NULL) {
910 871 seg_pinactive(seg, NULL, seg->s_base,
911 872 sptd->spt_amp->size, ppa,
912 873 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
913 874 *ppp = NULL;
914 875 return (ENOTSUP);
915 876 }
916 877 if ((szc = ppa[an_idx]->p_szc) != 0) {
917 878 npgs = page_get_pagecnt(szc);
918 879 an_idx = P2ROUNDUP(an_idx + 1, npgs);
919 880 } else {
920 881 an_idx++;
921 882 }
922 883 }
923 884 /*
924 885 * Since we cache the entire DISM segment, we want to
925 886 * set ppp to point to the first slot that corresponds
926 887 * to the requested addr, i.e. pg_idx.
927 888 */
928 889 *ppp = &(sptd->spt_ppa[pg_idx]);
929 890 return (0);
930 891 }
931 892
932 893 mutex_enter(&sptd->spt_lock);
933 894 /*
934 895 * try to find pages in segment page cache with mutex
935 896 */
936 897 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
937 898 S_WRITE, SEGP_FORCE_WIRED);
938 899 if (pplist != NULL) {
939 900 ASSERT(sptd->spt_ppa != NULL);
940 901 ASSERT(sptd->spt_ppa == pplist);
941 902 ppa = sptd->spt_ppa;
942 903 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
943 904 if (ppa[an_idx] == NULL) {
944 905 mutex_exit(&sptd->spt_lock);
945 906 seg_pinactive(seg, NULL, seg->s_base,
946 907 sptd->spt_amp->size, ppa,
947 908 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
948 909 *ppp = NULL;
949 910 return (ENOTSUP);
950 911 }
951 912 if ((szc = ppa[an_idx]->p_szc) != 0) {
952 913 npgs = page_get_pagecnt(szc);
953 914 an_idx = P2ROUNDUP(an_idx + 1, npgs);
954 915 } else {
955 916 an_idx++;
956 917 }
957 918 }
958 919 /*
959 920 * Since we cache the entire DISM segment, we want to
960 921 * set ppp to point to the first slot that corresponds
961 922 * to the requested addr, i.e. pg_idx.
962 923 */
963 924 mutex_exit(&sptd->spt_lock);
964 925 *ppp = &(sptd->spt_ppa[pg_idx]);
965 926 return (0);
966 927 }
967 928 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
968 929 SEGP_FORCE_WIRED) == SEGP_FAIL) {
969 930 mutex_exit(&sptd->spt_lock);
970 931 *ppp = NULL;
971 932 return (ENOTSUP);
972 933 }
973 934
974 935 /*
975 936 * No need to worry about protections because DISM pages are always rw.
976 937 */
977 938 pl = pplist = NULL;
978 939 amp = sptd->spt_amp;
979 940
980 941 /*
981 942 * Do we need to build the ppa array?
982 943 */
983 944 if (sptd->spt_ppa == NULL) {
984 945 pgcnt_t lpg_cnt = 0;
985 946
986 947 pl_built = 1;
987 948 tot_npages = btopr(sptd->spt_amp->size);
988 949
989 950 ASSERT(sptd->spt_pcachecnt == 0);
990 951 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
991 952 pl = pplist;
992 953
993 954 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
994 955 for (an_idx = 0; an_idx < tot_npages; ) {
995 956 ap = anon_get_ptr(amp->ahp, an_idx);
996 957 /*
997 958 * Cache only mlocked pages. For large pages
998 959 * if one (constituent) page is mlocked
999 960 * all pages for that large page
1000 961 * are cached also. This is for quick
1001 962 * lookups of ppa array;
1002 963 */
1003 964 if ((ap != NULL) && (lpg_cnt != 0 ||
1004 965 (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1005 966
1006 967 swap_xlate(ap, &vp, &off);
1007 968 pp = page_lookup(vp, off, SE_SHARED);
1008 969 ASSERT(pp != NULL);
1009 970 if (lpg_cnt == 0) {
1010 971 lpg_cnt++;
1011 972 /*
1012 973 * For a small page, we are done --
1013 974 * lpg_count is reset to 0 below.
1014 975 *
1015 976 * For a large page, we are guaranteed
1016 977 * to find the anon structures of all
1017 978 * constituent pages and a non-zero
1018 979 * lpg_cnt ensures that we don't test
1019 980 * for mlock for these. We are done
1020 981 * when lpg_count reaches (npgs + 1).
1021 982 * If we are not the first constituent
1022 983 * page, restart at the first one.
1023 984 */
1024 985 npgs = page_get_pagecnt(pp->p_szc);
1025 986 if (!IS_P2ALIGNED(an_idx, npgs)) {
1026 987 an_idx = P2ALIGN(an_idx, npgs);
1027 988 page_unlock(pp);
1028 989 continue;
1029 990 }
1030 991 }
1031 992 if (++lpg_cnt > npgs)
1032 993 lpg_cnt = 0;
1033 994
1034 995 /*
1035 996 * availrmem is decremented only
1036 997 * for unlocked pages
1037 998 */
1038 999 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1039 1000 claim_availrmem++;
1040 1001 pplist[an_idx] = pp;
1041 1002 }
1042 1003 an_idx++;
1043 1004 }
1044 1005 		ANON_LOCK_EXIT(&amp->a_rwlock);
1045 1006
1046 1007 if (claim_availrmem) {
1047 1008 mutex_enter(&freemem_lock);
1048 1009 if (availrmem < tune.t_minarmem + claim_availrmem) {
1049 1010 mutex_exit(&freemem_lock);
1050 1011 ret = ENOTSUP;
1051 1012 claim_availrmem = 0;
1052 1013 goto insert_fail;
1053 1014 } else {
1054 1015 availrmem -= claim_availrmem;
1055 1016 }
1056 1017 mutex_exit(&freemem_lock);
1057 1018 }
1058 1019
1059 1020 sptd->spt_ppa = pl;
1060 1021 } else {
1061 1022 /*
1062 1023 * We already have a valid ppa[].
1063 1024 */
1064 1025 pl = sptd->spt_ppa;
1065 1026 }
1066 1027
1067 1028 ASSERT(pl != NULL);
1068 1029
1069 1030 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1070 1031 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1071 1032 segspt_reclaim);
1072 1033 if (ret == SEGP_FAIL) {
1073 1034 /*
1074 1035 * seg_pinsert failed. We return
1075 1036 * ENOTSUP, so that the as_pagelock() code will
1076 1037 * then try the slower F_SOFTLOCK path.
1077 1038 */
1078 1039 if (pl_built) {
1079 1040 /*
1080 1041 * No one else has referenced the ppa[].
1081 1042 * We created it and we need to destroy it.
1082 1043 */
1083 1044 sptd->spt_ppa = NULL;
1084 1045 }
1085 1046 ret = ENOTSUP;
1086 1047 goto insert_fail;
1087 1048 }
1088 1049
1089 1050 /*
1090 1051 * In either case, we increment softlockcnt on the 'real' segment.
1091 1052 */
1092 1053 sptd->spt_pcachecnt++;
1093 1054 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1094 1055
1095 1056 ppa = sptd->spt_ppa;
1096 1057 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1097 1058 if (ppa[an_idx] == NULL) {
1098 1059 mutex_exit(&sptd->spt_lock);
1099 1060 seg_pinactive(seg, NULL, seg->s_base,
1100 1061 sptd->spt_amp->size,
1101 1062 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1102 1063 *ppp = NULL;
1103 1064 return (ENOTSUP);
1104 1065 }
1105 1066 if ((szc = ppa[an_idx]->p_szc) != 0) {
1106 1067 npgs = page_get_pagecnt(szc);
1107 1068 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1108 1069 } else {
1109 1070 an_idx++;
1110 1071 }
1111 1072 }
1112 1073 /*
1113 1074 * We can now drop the sptd->spt_lock since the ppa[]
1114 1075 * exists and he have incremented pacachecnt.
1115 1076 */
1116 1077 mutex_exit(&sptd->spt_lock);
1117 1078
1118 1079 /*
1119 1080 * Since we cache the entire segment, we want to
1120 1081 * set ppp to point to the first slot that corresponds
1121 1082 * to the requested addr, i.e. pg_idx.
1122 1083 */
1123 1084 *ppp = &(sptd->spt_ppa[pg_idx]);
1124 1085 return (0);
1125 1086
1126 1087 insert_fail:
1127 1088 /*
1128 1089 * We will only reach this code if we tried and failed.
1129 1090 *
1130 1091 * And we can drop the lock on the dummy seg, once we've failed
1131 1092 * to set up a new ppa[].
1132 1093 */
1133 1094 mutex_exit(&sptd->spt_lock);
1134 1095
1135 1096 if (pl_built) {
1136 1097 if (claim_availrmem) {
1137 1098 mutex_enter(&freemem_lock);
1138 1099 availrmem += claim_availrmem;
1139 1100 mutex_exit(&freemem_lock);
1140 1101 }
1141 1102
1142 1103 /*
1143 1104 * We created pl and we need to destroy it.
1144 1105 */
1145 1106 pplist = pl;
1146 1107 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1147 1108 if (pplist[an_idx] != NULL)
1148 1109 page_unlock(pplist[an_idx]);
1149 1110 }
1150 1111 kmem_free(pl, sizeof (page_t *) * tot_npages);
1151 1112 }
1152 1113
1153 1114 if (shmd->shm_softlockcnt <= 0) {
1154 1115 if (AS_ISUNMAPWAIT(seg->s_as)) {
1155 1116 mutex_enter(&seg->s_as->a_contents);
1156 1117 if (AS_ISUNMAPWAIT(seg->s_as)) {
1157 1118 AS_CLRUNMAPWAIT(seg->s_as);
1158 1119 cv_broadcast(&seg->s_as->a_cv);
1159 1120 }
1160 1121 mutex_exit(&seg->s_as->a_contents);
1161 1122 }
1162 1123 }
1163 1124 *ppp = NULL;
1164 1125 return (ret);
1165 1126 }
1166 1127
1167 1128
1168 1129
1169 1130 /*
1170 1131 * return locked pages over a given range.
1171 1132 *
1172 1133 * We will cache the entire ISM segment and save the pplist for the
1173 1134 * entire segment in the ppa field of the underlying ISM segment structure.
1174 1135 * Later, during a call to segspt_reclaim() we will use this ppa array
1175 1136 * to page_unlock() all of the pages and then we will free this ppa list.
1176 1137 */
1177 1138 /*ARGSUSED*/
1178 1139 static int
1179 1140 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1180 1141 struct page ***ppp, enum lock_type type, enum seg_rw rw)
1181 1142 {
1182 1143 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1183 1144 struct seg *sptseg = shmd->shm_sptseg;
1184 1145 struct spt_data *sptd = sptseg->s_data;
1185 1146 pgcnt_t np, page_index, npages;
1186 1147 caddr_t a, spt_base;
1187 1148 struct page **pplist, **pl, *pp;
1188 1149 struct anon_map *amp;
1189 1150 ulong_t anon_index;
1190 1151 int ret = ENOTSUP;
1191 1152 uint_t pl_built = 0;
1192 1153 struct anon *ap;
1193 1154 struct vnode *vp;
1194 1155 u_offset_t off;
1195 1156
1196 1157 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1197 1158 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1198 1159
1199 1160
1200 1161 /*
1201 1162 * We want to lock/unlock the entire ISM segment. Therefore,
1202 1163 * we will be using the underlying sptseg and it's base address
1203 1164 * and length for the caching arguments.
1204 1165 */
1205 1166 ASSERT(sptseg);
1206 1167 ASSERT(sptd);
1207 1168
1208 1169 if (sptd->spt_flags & SHM_PAGEABLE) {
1209 1170 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1210 1171 }
1211 1172
1212 1173 page_index = seg_page(seg, addr);
1213 1174 npages = btopr(len);
1214 1175
1215 1176 /*
1216 1177 * check if the request is larger than number of pages covered
1217 1178 * by amp
1218 1179 */
1219 1180 if (page_index + npages > btopr(sptd->spt_amp->size)) {
1220 1181 *ppp = NULL;
1221 1182 return (ENOTSUP);
1222 1183 }
1223 1184
1224 1185 if (type == L_PAGEUNLOCK) {
1225 1186
1226 1187 ASSERT(sptd->spt_ppa != NULL);
1227 1188
1228 1189 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1229 1190 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1230 1191
1231 1192 /*
1232 1193 * If someone is blocked while unmapping, we purge
1233 1194 * segment page cache and thus reclaim pplist synchronously
1234 1195 * without waiting for seg_pasync_thread. This speeds up
1235 1196 * unmapping in cases where munmap(2) is called, while
1236 1197 * raw async i/o is still in progress or where a thread
1237 1198 * exits on data fault in a multithreaded application.
1238 1199 */
1239 1200 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1240 1201 segspt_purge(seg);
1241 1202 }
1242 1203 return (0);
1243 1204 }
1244 1205
1245 1206 /* The L_PAGELOCK case... */
1246 1207
1247 1208 /*
1248 1209 * First try to find pages in segment page cache, without
1249 1210 * holding the segment lock.
1250 1211 */
1251 1212 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1252 1213 S_WRITE, SEGP_FORCE_WIRED);
1253 1214 if (pplist != NULL) {
1254 1215 ASSERT(sptd->spt_ppa == pplist);
1255 1216 ASSERT(sptd->spt_ppa[page_index]);
1256 1217 /*
1257 1218 * Since we cache the entire ISM segment, we want to
1258 1219 * set ppp to point to the first slot that corresponds
1259 1220 * to the requested addr, i.e. page_index.
1260 1221 */
1261 1222 *ppp = &(sptd->spt_ppa[page_index]);
1262 1223 return (0);
1263 1224 }
1264 1225
1265 1226 mutex_enter(&sptd->spt_lock);
1266 1227
1267 1228 /*
1268 1229 * try to find pages in segment page cache
1269 1230 */
1270 1231 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1271 1232 S_WRITE, SEGP_FORCE_WIRED);
1272 1233 if (pplist != NULL) {
1273 1234 ASSERT(sptd->spt_ppa == pplist);
1274 1235 /*
1275 1236 * Since we cache the entire segment, we want to
1276 1237 * set ppp to point to the first slot that corresponds
1277 1238 * to the requested addr, i.e. page_index.
1278 1239 */
1279 1240 mutex_exit(&sptd->spt_lock);
1280 1241 *ppp = &(sptd->spt_ppa[page_index]);
1281 1242 return (0);
1282 1243 }
1283 1244
1284 1245 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1285 1246 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1286 1247 mutex_exit(&sptd->spt_lock);
1287 1248 *ppp = NULL;
1288 1249 return (ENOTSUP);
1289 1250 }
1290 1251
1291 1252 /*
1292 1253 * No need to worry about protections because ISM pages
1293 1254 * are always rw.
1294 1255 */
1295 1256 pl = pplist = NULL;
1296 1257
1297 1258 /*
1298 1259 * Do we need to build the ppa array?
1299 1260 */
1300 1261 if (sptd->spt_ppa == NULL) {
1301 1262 ASSERT(sptd->spt_ppa == pplist);
1302 1263
1303 1264 spt_base = sptseg->s_base;
1304 1265 pl_built = 1;
1305 1266
1306 1267 /*
1307 1268 * availrmem is decremented once during anon_swap_adjust()
1308 1269 * and is incremented during the anon_unresv(), which is
1309 1270 * called from shm_rm_amp() when the segment is destroyed.
1310 1271 */
1311 1272 amp = sptd->spt_amp;
1312 1273 ASSERT(amp != NULL);
1313 1274
1314 1275 /* pcachecnt is protected by sptd->spt_lock */
1315 1276 ASSERT(sptd->spt_pcachecnt == 0);
1316 1277 pplist = kmem_zalloc(sizeof (page_t *)
1317 1278 * btopr(sptd->spt_amp->size), KM_SLEEP);
1318 1279 pl = pplist;
1319 1280
1320 1281 anon_index = seg_page(sptseg, spt_base);
1321 1282
1322 1283 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1323 1284 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1324 1285 a += PAGESIZE, anon_index++, pplist++) {
1325 1286 ap = anon_get_ptr(amp->ahp, anon_index);
1326 1287 ASSERT(ap != NULL);
1327 1288 swap_xlate(ap, &vp, &off);
1328 1289 pp = page_lookup(vp, off, SE_SHARED);
1329 1290 ASSERT(pp != NULL);
1330 1291 *pplist = pp;
1331 1292 }
1332 1293 		ANON_LOCK_EXIT(&amp->a_rwlock);
1333 1294
1334 1295 if (a < (spt_base + sptd->spt_amp->size)) {
1335 1296 ret = ENOTSUP;
1336 1297 goto insert_fail;
1337 1298 }
1338 1299 sptd->spt_ppa = pl;
1339 1300 } else {
1340 1301 /*
1341 1302 * We already have a valid ppa[].
1342 1303 */
1343 1304 pl = sptd->spt_ppa;
1344 1305 }
1345 1306
1346 1307 ASSERT(pl != NULL);
1347 1308
1348 1309 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1349 1310 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1350 1311 segspt_reclaim);
1351 1312 if (ret == SEGP_FAIL) {
1352 1313 /*
1353 1314 * seg_pinsert failed. We return
1354 1315 * ENOTSUP, so that the as_pagelock() code will
1355 1316 * then try the slower F_SOFTLOCK path.
1356 1317 */
1357 1318 if (pl_built) {
1358 1319 /*
1359 1320 * No one else has referenced the ppa[].
1360 1321 * We created it and we need to destroy it.
1361 1322 */
1362 1323 sptd->spt_ppa = NULL;
1363 1324 }
1364 1325 ret = ENOTSUP;
1365 1326 goto insert_fail;
1366 1327 }
1367 1328
1368 1329 /*
1369 1330 * In either case, we increment softlockcnt on the 'real' segment.
1370 1331 */
1371 1332 sptd->spt_pcachecnt++;
1372 1333 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1373 1334
1374 1335 /*
1375 1336 * We can now drop the sptd->spt_lock since the ppa[]
1376 1337 * exists and he have incremented pacachecnt.
1377 1338 */
1378 1339 mutex_exit(&sptd->spt_lock);
1379 1340
1380 1341 /*
1381 1342 * Since we cache the entire segment, we want to
1382 1343 * set ppp to point to the first slot that corresponds
1383 1344 * to the requested addr, i.e. page_index.
1384 1345 */
1385 1346 *ppp = &(sptd->spt_ppa[page_index]);
1386 1347 return (0);
1387 1348
1388 1349 insert_fail:
1389 1350 /*
1390 1351 * We will only reach this code if we tried and failed.
1391 1352 *
1392 1353 * And we can drop the lock on the dummy seg, once we've failed
1393 1354 * to set up a new ppa[].
1394 1355 */
1395 1356 mutex_exit(&sptd->spt_lock);
1396 1357
1397 1358 if (pl_built) {
1398 1359 /*
1399 1360 * We created pl and we need to destroy it.
1400 1361 */
1401 1362 pplist = pl;
1402 1363 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1403 1364 while (np) {
1404 1365 page_unlock(*pplist);
1405 1366 np--;
1406 1367 pplist++;
1407 1368 }
1408 1369 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1409 1370 }
1410 1371 if (shmd->shm_softlockcnt <= 0) {
1411 1372 if (AS_ISUNMAPWAIT(seg->s_as)) {
1412 1373 mutex_enter(&seg->s_as->a_contents);
1413 1374 if (AS_ISUNMAPWAIT(seg->s_as)) {
1414 1375 AS_CLRUNMAPWAIT(seg->s_as);
1415 1376 cv_broadcast(&seg->s_as->a_cv);
1416 1377 }
1417 1378 mutex_exit(&seg->s_as->a_contents);
1418 1379 }
1419 1380 }
1420 1381 *ppp = NULL;
1421 1382 return (ret);
1422 1383 }
1423 1384
1424 1385 /*
1425 1386 * purge any cached pages in the I/O page cache
1426 1387 */
1427 1388 static void
1428 1389 segspt_purge(struct seg *seg)
1429 1390 {
1430 1391 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1431 1392 }
1432 1393
1433 1394 static int
1434 1395 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1435 1396 enum seg_rw rw, int async)
1436 1397 {
1437 1398 struct seg *seg = (struct seg *)ptag;
1438 1399 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1439 1400 struct seg *sptseg;
1440 1401 struct spt_data *sptd;
1441 1402 pgcnt_t npages, i, free_availrmem = 0;
1442 1403 int done = 0;
1443 1404
1444 1405 #ifdef lint
1445 1406 addr = addr;
1446 1407 #endif
1447 1408 sptseg = shmd->shm_sptseg;
1448 1409 sptd = sptseg->s_data;
1449 1410 npages = (len >> PAGESHIFT);
1450 1411 ASSERT(npages);
1451 1412 ASSERT(sptd->spt_pcachecnt != 0);
1452 1413 ASSERT(sptd->spt_ppa == pplist);
1453 1414 ASSERT(npages == btopr(sptd->spt_amp->size));
1454 1415 ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1455 1416
1456 1417 /*
1457 1418 * Acquire the lock on the dummy seg and destroy the
1458 1419 * ppa array IF this is the last pcachecnt.
1459 1420 */
1460 1421 mutex_enter(&sptd->spt_lock);
1461 1422 if (--sptd->spt_pcachecnt == 0) {
1462 1423 for (i = 0; i < npages; i++) {
1463 1424 if (pplist[i] == NULL) {
1464 1425 continue;
1465 1426 }
1466 1427 if (rw == S_WRITE) {
1467 1428 hat_setrefmod(pplist[i]);
1468 1429 } else {
1469 1430 hat_setref(pplist[i]);
1470 1431 }
1471 1432 if ((sptd->spt_flags & SHM_PAGEABLE) &&
1472 1433 (sptd->spt_ppa_lckcnt[i] == 0))
1473 1434 free_availrmem++;
1474 1435 page_unlock(pplist[i]);
1475 1436 }
1476 1437 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1477 1438 mutex_enter(&freemem_lock);
1478 1439 availrmem += free_availrmem;
1479 1440 mutex_exit(&freemem_lock);
1480 1441 }
1481 1442 /*
1482 1443 * Since we want to cach/uncache the entire ISM segment,
1483 1444 * we will track the pplist in a segspt specific field
1484 1445 * ppa, that is initialized at the time we add an entry to
1485 1446 * the cache.
1486 1447 */
1487 1448 ASSERT(sptd->spt_pcachecnt == 0);
1488 1449 kmem_free(pplist, sizeof (page_t *) * npages);
1489 1450 sptd->spt_ppa = NULL;
1490 1451 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1491 1452 sptd->spt_gen++;
1492 1453 cv_broadcast(&sptd->spt_cv);
1493 1454 done = 1;
1494 1455 }
1495 1456 mutex_exit(&sptd->spt_lock);
1496 1457
1497 1458 /*
1498 1459 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1499 1460 * may not hold AS lock (in this case async argument is not 0). This
1500 1461 * means if softlockcnt drops to 0 after the decrement below address
1501 1462 * space may get freed. We can't allow it since after softlock
1502 1463 * derement to 0 we still need to access as structure for possible
1503 1464 * wakeup of unmap waiters. To prevent the disappearance of as we take
1504 1465 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1505 1466 * this mutex as a barrier to make sure this routine completes before
1506 1467 * segment is freed.
1507 1468 *
1508 1469 * The second complication we have to deal with in async case is a
1509 1470 * possibility of missed wake up of unmap wait thread. When we don't
1510 1471 * hold as lock here we may take a_contents lock before unmap wait
1511 1472 * thread that was first to see softlockcnt was still not 0. As a
1512 1473 * result we'll fail to wake up an unmap wait thread. To avoid this
1513 1474 * race we set nounmapwait flag in as structure if we drop softlockcnt
1514 1475 * to 0 if async is not 0. unmapwait thread
1515 1476 * will not block if this flag is set.
1516 1477 */
1517 1478 if (async)
1518 1479 mutex_enter(&shmd->shm_segfree_syncmtx);
1519 1480
1520 1481 /*
1521 1482 * Now decrement softlockcnt.
1522 1483 */
1523 1484 ASSERT(shmd->shm_softlockcnt > 0);
1524 1485 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1525 1486
1526 1487 if (shmd->shm_softlockcnt <= 0) {
1527 1488 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1528 1489 mutex_enter(&seg->s_as->a_contents);
1529 1490 if (async)
1530 1491 AS_SETNOUNMAPWAIT(seg->s_as);
1531 1492 if (AS_ISUNMAPWAIT(seg->s_as)) {
1532 1493 AS_CLRUNMAPWAIT(seg->s_as);
1533 1494 cv_broadcast(&seg->s_as->a_cv);
1534 1495 }
1535 1496 mutex_exit(&seg->s_as->a_contents);
1536 1497 }
1537 1498 }
1538 1499
1539 1500 if (async)
1540 1501 mutex_exit(&shmd->shm_segfree_syncmtx);
1541 1502
1542 1503 return (done);
1543 1504 }
1544 1505
1545 1506 /*
1546 1507 * Do a F_SOFTUNLOCK call over the range requested.
1547 1508 * The range must have already been F_SOFTLOCK'ed.
1548 1509 *
1549 1510 * The calls to acquire and release the anon map lock mutex were
1550 1511 * removed in order to avoid a deadly embrace during a DR
1551 1512 * memory delete operation. (Eg. DR blocks while waiting for a
1552 1513 * exclusive lock on a page that is being used for kaio; the
1553 1514 * thread that will complete the kaio and call segspt_softunlock
1554 1515 * blocks on the anon map lock; another thread holding the anon
1555 1516 * map lock blocks on another page lock via the segspt_shmfault
1556 1517 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1557 1518 *
1558 1519 * The appropriateness of the removal is based upon the following:
1559 1520 * 1. If we are holding a segment's reader lock and the page is held
1560 1521 * shared, then the corresponding element in anonmap which points to
1561 1522 * anon struct cannot change and there is no need to acquire the
1562 1523 * anonymous map lock.
1563 1524 * 2. Threads in segspt_softunlock have a reader lock on the segment
1564 1525 * and already have the shared page lock, so we are guaranteed that
1565 1526 * the anon map slot cannot change and therefore can call anon_get_ptr()
1566 1527 * without grabbing the anonymous map lock.
1567 1528 * 3. Threads that softlock a shared page break copy-on-write, even if
1568 1529 * its a read. Thus cow faults can be ignored with respect to soft
1569 1530 * unlocking, since the breaking of cow means that the anon slot(s) will
1570 1531 * not be shared.
1571 1532 */
1572 1533 static void
1573 1534 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1574 1535 size_t len, enum seg_rw rw)
1575 1536 {
1576 1537 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1577 1538 struct seg *sptseg;
1578 1539 struct spt_data *sptd;
1579 1540 page_t *pp;
1580 1541 caddr_t adr;
1581 1542 struct vnode *vp;
1582 1543 u_offset_t offset;
1583 1544 ulong_t anon_index;
1584 1545 struct anon_map *amp; /* XXX - for locknest */
1585 1546 struct anon *ap = NULL;
1586 1547 pgcnt_t npages;
1587 1548
1588 1549 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1589 1550
1590 1551 sptseg = shmd->shm_sptseg;
1591 1552 sptd = sptseg->s_data;
1592 1553
1593 1554 /*
1594 1555 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1595 1556 * and therefore their pages are SE_SHARED locked
1596 1557 * for the entire life of the segment.
1597 1558 */
1598 1559 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1599 1560 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1600 1561 goto softlock_decrement;
1601 1562 }
1602 1563
1603 1564 /*
1604 1565 * Any thread is free to do a page_find and
1605 1566 * page_unlock() on the pages within this seg.
1606 1567 *
1607 1568 * We are already holding the as->a_lock on the user's
1608 1569 * real segment, but we need to hold the a_lock on the
1609 1570 * underlying dummy as. This is mostly to satisfy the
1610 1571 * underlying HAT layer.
1611 1572 */
1612 1573 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1613 1574 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1614 1575 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1615 1576
1616 1577 amp = sptd->spt_amp;
1617 1578 ASSERT(amp != NULL);
1618 1579 anon_index = seg_page(sptseg, sptseg_addr);
1619 1580
1620 1581 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1621 1582 ap = anon_get_ptr(amp->ahp, anon_index++);
1622 1583 ASSERT(ap != NULL);
1623 1584 swap_xlate(ap, &vp, &offset);
1624 1585
1625 1586 /*
1626 1587 * Use page_find() instead of page_lookup() to
1627 1588 * find the page since we know that it has a
1628 1589 * "shared" lock.
1629 1590 */
1630 1591 pp = page_find(vp, offset);
1631 1592 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1632 1593 if (pp == NULL) {
1633 1594 panic("segspt_softunlock: "
1634 1595 "addr %p, ap %p, vp %p, off %llx",
1635 1596 (void *)adr, (void *)ap, (void *)vp, offset);
1636 1597 /*NOTREACHED*/
1637 1598 }
1638 1599
1639 1600 if (rw == S_WRITE) {
1640 1601 hat_setrefmod(pp);
1641 1602 } else if (rw != S_OTHER) {
1642 1603 hat_setref(pp);
1643 1604 }
1644 1605 page_unlock(pp);
1645 1606 }
1646 1607
1647 1608 softlock_decrement:
1648 1609 npages = btopr(len);
1649 1610 ASSERT(shmd->shm_softlockcnt >= npages);
1650 1611 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1651 1612 if (shmd->shm_softlockcnt == 0) {
1652 1613 /*
1653 1614 * All SOFTLOCKS are gone. Wakeup any waiting
1654 1615 * unmappers so they can try again to unmap.
1655 1616 * Check for waiters first without the mutex
1656 1617 * held so we don't always grab the mutex on
1657 1618 * softunlocks.
1658 1619 */
1659 1620 if (AS_ISUNMAPWAIT(seg->s_as)) {
1660 1621 mutex_enter(&seg->s_as->a_contents);
1661 1622 if (AS_ISUNMAPWAIT(seg->s_as)) {
1662 1623 AS_CLRUNMAPWAIT(seg->s_as);
1663 1624 cv_broadcast(&seg->s_as->a_cv);
1664 1625 }
1665 1626 mutex_exit(&seg->s_as->a_contents);
1666 1627 }
1667 1628 }
1668 1629 }
1669 1630
1670 1631 int
1671 1632 segspt_shmattach(struct seg *seg, caddr_t *argsp)
1672 1633 {
1673 1634 struct shm_data *shmd_arg = (struct shm_data *)argsp;
1674 1635 struct shm_data *shmd;
1675 1636 struct anon_map *shm_amp = shmd_arg->shm_amp;
1676 1637 struct spt_data *sptd;
1677 1638 int error = 0;
1678 1639
1679 1640 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1680 1641
1681 1642 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1682 1643 if (shmd == NULL)
1683 1644 return (ENOMEM);
1684 1645
1685 1646 shmd->shm_sptas = shmd_arg->shm_sptas;
1686 1647 shmd->shm_amp = shm_amp;
1687 1648 shmd->shm_sptseg = shmd_arg->shm_sptseg;
1688 1649
1689 1650 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1690 1651 NULL, 0, seg->s_size);
1691 1652
1692 1653 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1693 1654
1694 1655 seg->s_data = (void *)shmd;
1695 1656 seg->s_ops = &segspt_shmops;
1696 1657 seg->s_szc = shmd->shm_sptseg->s_szc;
1697 1658 sptd = shmd->shm_sptseg->s_data;
1698 1659
1699 1660 if (sptd->spt_flags & SHM_PAGEABLE) {
1700 1661 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1701 1662 KM_NOSLEEP)) == NULL) {
1702 1663 seg->s_data = (void *)NULL;
1703 1664 kmem_free(shmd, (sizeof (*shmd)));
1704 1665 return (ENOMEM);
1705 1666 }
1706 1667 shmd->shm_lckpgs = 0;
1707 1668 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1708 1669 if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1709 1670 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1710 1671 seg->s_size, seg->s_szc)) != 0) {
1711 1672 kmem_free(shmd->shm_vpage,
1712 1673 btopr(shm_amp->size));
1713 1674 }
1714 1675 }
1715 1676 } else {
1716 1677 error = hat_share(seg->s_as->a_hat, seg->s_base,
1717 1678 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1718 1679 seg->s_size, seg->s_szc);
1719 1680 }
1720 1681 if (error) {
1721 1682 seg->s_szc = 0;
1722 1683 seg->s_data = (void *)NULL;
1723 1684 kmem_free(shmd, (sizeof (*shmd)));
1724 1685 } else {
1725 1686 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1726 1687 shm_amp->refcnt++;
1727 1688 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1728 1689 }
1729 1690 return (error);
1730 1691 }
1731 1692
1732 1693 int
1733 1694 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1734 1695 {
1735 1696 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1736 1697 int reclaim = 1;
1737 1698
1738 1699 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1739 1700 retry:
1740 1701 if (shmd->shm_softlockcnt > 0) {
1741 1702 if (reclaim == 1) {
1742 1703 segspt_purge(seg);
1743 1704 reclaim = 0;
1744 1705 goto retry;
1745 1706 }
1746 1707 return (EAGAIN);
1747 1708 }
1748 1709
1749 1710 if (ssize != seg->s_size) {
1750 1711 #ifdef DEBUG
1751 1712 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1752 1713 ssize, seg->s_size);
1753 1714 #endif
1754 1715 return (EINVAL);
1755 1716 }
1756 1717
1757 1718 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1758 1719 NULL, 0);
1759 1720 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1760 1721
1761 1722 seg_free(seg);
1762 1723
1763 1724 return (0);
1764 1725 }
1765 1726
1766 1727 void
1767 1728 segspt_shmfree(struct seg *seg)
1768 1729 {
1769 1730 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1770 1731 struct anon_map *shm_amp = shmd->shm_amp;
1771 1732
1772 1733 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1773 1734
1774 1735 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1775 1736 MC_UNLOCK, NULL, 0);
1776 1737
1777 1738 /*
1778 1739 * Need to increment refcnt when attaching
1779 1740 * and decrement when detaching because of dup().
1780 1741 */
1781 1742 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1782 1743 shm_amp->refcnt--;
1783 1744 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1784 1745
1785 1746 if (shmd->shm_vpage) { /* only for DISM */
1786 1747 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1787 1748 shmd->shm_vpage = NULL;
1788 1749 }
1789 1750
1790 1751 /*
1791 1752 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1792 1753 * still working with this segment without holding as lock.
1793 1754 */
1794 1755 ASSERT(shmd->shm_softlockcnt == 0);
1795 1756 mutex_enter(&shmd->shm_segfree_syncmtx);
1796 1757 mutex_destroy(&shmd->shm_segfree_syncmtx);
1797 1758
1798 1759 kmem_free(shmd, sizeof (*shmd));
1799 1760 }
1800 1761
1801 1762 /*ARGSUSED*/
1802 1763 int
1803 1764 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1804 1765 {
1805 1766 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1806 1767
1807 1768 /*
1808 1769 * Shared page table is more than shared mapping.
1809 1770 * Individual process sharing page tables can't change prot
1810 1771 * because there is only one set of page tables.
1811 1772 * This will be allowed after private page table is
1812 1773 * supported.
1813 1774 */
1814 1775 /* need to return correct status error? */
1815 1776 return (0);
1816 1777 }
1817 1778
1818 1779
1819 1780 faultcode_t
1820 1781 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1821 1782 size_t len, enum fault_type type, enum seg_rw rw)
1822 1783 {
1823 1784 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1824 1785 struct seg *sptseg = shmd->shm_sptseg;
1825 1786 struct as *curspt = shmd->shm_sptas;
1826 1787 struct spt_data *sptd = sptseg->s_data;
1827 1788 pgcnt_t npages;
1828 1789 size_t size;
1829 1790 caddr_t segspt_addr, shm_addr;
1830 1791 page_t **ppa;
1831 1792 int i;
1832 1793 ulong_t an_idx = 0;
1833 1794 int err = 0;
1834 1795 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1835 1796 size_t pgsz;
1836 1797 pgcnt_t pgcnt;
1837 1798 caddr_t a;
1838 1799 pgcnt_t pidx;
1839 1800
1840 1801 #ifdef lint
1841 1802 hat = hat;
1842 1803 #endif
1843 1804 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1844 1805
1845 1806 /*
1846 1807 * Because of the way spt is implemented
1847 1808 * the realsize of the segment does not have to be
1848 1809 * equal to the segment size itself. The segment size is
1849 1810 * often in multiples of a page size larger than PAGESIZE.
1850 1811 * The realsize is rounded up to the nearest PAGESIZE
1851 1812 * based on what the user requested. This is a bit of
1852 1813 	 * ugliness that is historical but not easily fixed
1853 1814 * without re-designing the higher levels of ISM.
1854 1815 */
1855 1816 ASSERT(addr >= seg->s_base);
1856 1817 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1857 1818 return (FC_NOMAP);
1858 1819 /*
1859 1820 * For all of the following cases except F_PROT, we need to
1860 1821 * make any necessary adjustments to addr and len
1861 1822 * and get all of the necessary page_t's into an array called ppa[].
1862 1823 *
1863 1824 * The code in shmat() forces base addr and len of ISM segment
1864 1825 * to be aligned to largest page size supported. Therefore,
1865 1826 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
1866 1827 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
1867 1828 * in large pagesize chunks, or else we will screw up the HAT
1868 1829 * layer by calling hat_memload_array() with differing page sizes
1869 1830 * over a given virtual range.
1870 1831 */
1871 1832 pgsz = page_get_pagesize(sptseg->s_szc);
1872 1833 pgcnt = page_get_pagecnt(sptseg->s_szc);
1873 1834 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
1874 1835 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
1875 1836 npages = btopr(size);
1876 1837
1877 1838 /*
1878 1839 * Now we need to convert from addr in segshm to addr in segspt.
1879 1840 */
1880 1841 an_idx = seg_page(seg, shm_addr);
1881 1842 segspt_addr = sptseg->s_base + ptob(an_idx);
1882 1843
1883 1844 ASSERT((segspt_addr + ptob(npages)) <=
1884 1845 (sptseg->s_base + sptd->spt_realsize));
1885 1846 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
1886 1847
1887 1848 switch (type) {
1888 1849
1889 1850 case F_SOFTLOCK:
1890 1851
1891 1852 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
1892 1853 /*
1893 1854 * Fall through to the F_INVAL case to load up the hat layer
1894 1855 * entries with the HAT_LOAD_LOCK flag.
1895 1856 */
1896 1857 /* FALLTHRU */
1897 1858 case F_INVAL:
1898 1859
1899 1860 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
1900 1861 return (FC_NOMAP);
1901 1862
1902 1863 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1903 1864
1904 1865 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
1905 1866 if (err != 0) {
1906 1867 if (type == F_SOFTLOCK) {
1907 1868 atomic_add_long((ulong_t *)(
1908 1869 &(shmd->shm_softlockcnt)), -npages);
1909 1870 }
1910 1871 goto dism_err;
1911 1872 }
1912 1873 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1913 1874 a = segspt_addr;
1914 1875 pidx = 0;
1915 1876 if (type == F_SOFTLOCK) {
1916 1877
1917 1878 /*
1918 1879 * Load up the translation keeping it
1919 1880 * locked and don't unlock the page.
1920 1881 */
1921 1882 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1922 1883 hat_memload_array(sptseg->s_as->a_hat,
1923 1884 a, pgsz, &ppa[pidx], sptd->spt_prot,
1924 1885 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
1925 1886 }
1926 1887 } else {
1927 - if (hat == seg->s_as->a_hat) {
1888 + /*
1889 + * Migrate pages marked for migration
1890 + */
1891 + if (lgrp_optimizations())
1892 + page_migrate(seg, shm_addr, ppa, npages);
1928 1893
1929 - /*
1930 - * Migrate pages marked for migration
1931 - */
1932 - if (lgrp_optimizations())
1933 - page_migrate(seg, shm_addr, ppa,
1934 - npages);
1935 -
1936 - /* CPU HAT */
1937 - for (; pidx < npages;
1938 - a += pgsz, pidx += pgcnt) {
1939 - hat_memload_array(sptseg->s_as->a_hat,
1940 - a, pgsz, &ppa[pidx],
1941 - sptd->spt_prot,
1942 - HAT_LOAD_SHARE);
1943 - }
1944 - } else {
1945 - /* XHAT. Pass real address */
1946 - hat_memload_array(hat, shm_addr,
1947 - size, ppa, sptd->spt_prot, HAT_LOAD_SHARE);
1894 + for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1895 + hat_memload_array(sptseg->s_as->a_hat,
1896 + a, pgsz, &ppa[pidx],
1897 + sptd->spt_prot,
1898 + HAT_LOAD_SHARE);
1948 1899 }
1949 1900
1950 1901 /*
1951 1902 * And now drop the SE_SHARED lock(s).
1952 1903 */
1953 1904 if (dyn_ism_unmap) {
1954 1905 for (i = 0; i < npages; i++) {
1955 1906 page_unlock(ppa[i]);
1956 1907 }
1957 1908 }
1958 1909 }
1959 1910
1960 1911 if (!dyn_ism_unmap) {
1961 1912 if (hat_share(seg->s_as->a_hat, shm_addr,
1962 1913 curspt->a_hat, segspt_addr, ptob(npages),
1963 1914 seg->s_szc) != 0) {
1964 1915 panic("hat_share err in DISM fault");
1965 1916 /* NOTREACHED */
1966 1917 }
1967 1918 if (type == F_INVAL) {
1968 1919 for (i = 0; i < npages; i++) {
1969 1920 page_unlock(ppa[i]);
1970 1921 }
1971 1922 }
1972 1923 }
1973 1924 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1974 1925 dism_err:
1975 1926 kmem_free(ppa, npages * sizeof (page_t *));
1976 1927 return (err);
1977 1928
1978 1929 case F_SOFTUNLOCK:
1979 1930
1980 1931 /*
1981 1932 * This is a bit ugly, we pass in the real seg pointer,
1982 1933 * but the segspt_addr is the virtual address within the
1983 1934 * dummy seg.
1984 1935 */
1985 1936 segspt_softunlock(seg, segspt_addr, size, rw);
1986 1937 return (0);
1987 1938
1988 1939 case F_PROT:
1989 1940
1990 1941 /*
1991 1942 * This takes care of the unusual case where a user
1992 1943 * allocates a stack in shared memory and a register
1993 1944 * window overflow is written to that stack page before
1994 1945 * it is otherwise modified.
1995 1946 *
1996 1947 * We can get away with this because ISM segments are
1997 1948 * always rw. Other than this unusual case, there
1998 1949 * should be no instances of protection violations.
1999 1950 */
2000 1951 return (0);
2001 1952
2002 1953 default:
2003 1954 #ifdef DEBUG
2004 1955 panic("segspt_dismfault default type?");
2005 1956 #else
2006 1957 return (FC_NOMAP);
2007 1958 #endif
2008 1959 }
2009 1960 }
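
Both fault handlers begin by rounding the faulting range out to the spt segment's large page size so that hat_memload_array() always sees whole large-page chunks. A standalone sketch of just that arithmetic; the macro bodies follow the usual power-of-two sysmacros.h shape, and the address, length, and 4 MB page size are invented example values:

#include <stdio.h>
#include <stdint.h>

/* power-of-two alignment macros in their usual sysmacros.h shape */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uintptr_t addr = 0x40123000;	/* faulting address (example) */
	size_t len = 0x2000;		/* faulting length (example) */
	size_t pgsz = 0x400000;		/* 4 MB large page (example szc) */

	/* round the range out to large-page boundaries, as the fault path does */
	uintptr_t shm_addr = P2ALIGN(addr, pgsz);
	size_t size = P2ROUNDUP(addr + len - shm_addr, pgsz);

	printf("shm_addr 0x%lx, size 0x%zx (%zu large page(s))\n",
	    (unsigned long)shm_addr, size, size / pgsz);
	return (0);
}

Here a two-page fault at 0x40123000 expands to the single 4 MB chunk starting at 0x40000000; the non-DISM handler below additionally trims the rounded size against spt_realsize before doing any HAT work.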
2010 1961
2011 1962
2012 1963 faultcode_t
2013 1964 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2014 1965 size_t len, enum fault_type type, enum seg_rw rw)
2015 1966 {
2016 1967 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2017 1968 struct seg *sptseg = shmd->shm_sptseg;
2018 1969 struct as *curspt = shmd->shm_sptas;
2019 1970 struct spt_data *sptd = sptseg->s_data;
2020 1971 pgcnt_t npages;
2021 1972 size_t size;
2022 1973 caddr_t sptseg_addr, shm_addr;
2023 1974 page_t *pp, **ppa;
2024 1975 int i;
2025 1976 u_offset_t offset;
2026 1977 ulong_t anon_index = 0;
2027 1978 struct vnode *vp;
2028 1979 struct anon_map *amp; /* XXX - for locknest */
2029 1980 struct anon *ap = NULL;
2030 1981 size_t pgsz;
2031 1982 pgcnt_t pgcnt;
2032 1983 caddr_t a;
2033 1984 pgcnt_t pidx;
2034 1985 size_t sz;
2035 1986
2036 1987 #ifdef lint
2037 1988 hat = hat;
2038 1989 #endif
2039 1990
2040 1991 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2041 1992
2042 1993 if (sptd->spt_flags & SHM_PAGEABLE) {
2043 1994 return (segspt_dismfault(hat, seg, addr, len, type, rw));
2044 1995 }
2045 1996
2046 1997 /*
2047 1998 * Because of the way spt is implemented
2048 1999 * the realsize of the segment does not have to be
2049 2000 * equal to the segment size itself. The segment size is
2050 2001 * often in multiples of a page size larger than PAGESIZE.
2051 2002 * The realsize is rounded up to the nearest PAGESIZE
2052 2003 * based on what the user requested. This is a bit of
2053 2004 	 * ugliness that is historical but not easily fixed
2054 2005 * without re-designing the higher levels of ISM.
2055 2006 */
2056 2007 ASSERT(addr >= seg->s_base);
2057 2008 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2058 2009 return (FC_NOMAP);
2059 2010 /*
2060 2011 * For all of the following cases except F_PROT, we need to
2061 2012 * make any necessary adjustments to addr and len
2062 2013 * and get all of the necessary page_t's into an array called ppa[].
2063 2014 *
2064 2015 * The code in shmat() forces base addr and len of ISM segment
2065 2016 * to be aligned to largest page size supported. Therefore,
2066 2017 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2067 2018 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2068 2019 * in large pagesize chunks, or else we will screw up the HAT
2069 2020 * layer by calling hat_memload_array() with differing page sizes
2070 2021 * over a given virtual range.
2071 2022 */
2072 2023 pgsz = page_get_pagesize(sptseg->s_szc);
2073 2024 pgcnt = page_get_pagecnt(sptseg->s_szc);
2074 2025 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2075 2026 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2076 2027 npages = btopr(size);
2077 2028
2078 2029 /*
2079 2030 * Now we need to convert from addr in segshm to addr in segspt.
2080 2031 */
2081 2032 anon_index = seg_page(seg, shm_addr);
2082 2033 sptseg_addr = sptseg->s_base + ptob(anon_index);
2083 2034
2084 2035 /*
2085 2036 * And now we may have to adjust npages downward if we have
2086 2037 * exceeded the realsize of the segment or initial anon
2087 2038 * allocations.
2088 2039 */
2089 2040 if ((sptseg_addr + ptob(npages)) >
2090 2041 (sptseg->s_base + sptd->spt_realsize))
2091 2042 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2092 2043
2093 2044 npages = btopr(size);
2094 2045
2095 2046 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2096 2047 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2097 2048
2098 2049 switch (type) {
2099 2050
2100 2051 case F_SOFTLOCK:
2101 2052
2102 2053 /*
2103 2054 * availrmem is decremented once during anon_swap_adjust()
2104 2055 * and is incremented during the anon_unresv(), which is
2105 2056 * called from shm_rm_amp() when the segment is destroyed.
2106 2057 */
2107 2058 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2108 2059 /*
2109 2060 * Some platforms assume that ISM pages are SE_SHARED
2110 2061 * locked for the entire life of the segment.
2111 2062 */
2112 2063 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2113 2064 return (0);
2114 2065 /*
2115 2066 * Fall through to the F_INVAL case to load up the hat layer
2116 2067 * entries with the HAT_LOAD_LOCK flag.
2117 2068 */
2118 2069
2119 2070 /* FALLTHRU */
2120 2071 case F_INVAL:
2121 2072
2122 2073 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2123 2074 return (FC_NOMAP);
2124 2075
2125 2076 /*
2126 2077 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2127 2078 * may still rely on this call to hat_share(). That
2128 2079 * would imply that those hat's can fault on a
2129 2080 * HAT_LOAD_LOCK translation, which would seem
2130 2081 * contradictory.
2131 2082 */
2132 2083 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2133 2084 if (hat_share(seg->s_as->a_hat, seg->s_base,
2134 2085 curspt->a_hat, sptseg->s_base,
2135 2086 sptseg->s_size, sptseg->s_szc) != 0) {
2136 2087 panic("hat_share error in ISM fault");
2137 2088 /*NOTREACHED*/
2138 2089 }
2139 2090 return (0);
2140 2091 }
2141 2092 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2142 2093
2143 2094 /*
2144 2095 * I see no need to lock the real seg,
2145 2096 * here, because all of our work will be on the underlying
2146 2097 * dummy seg.
2147 2098 *
2148 2099 * sptseg_addr and npages now account for large pages.
2149 2100 */
2150 2101 amp = sptd->spt_amp;
2151 2102 ASSERT(amp != NULL);
2152 2103 anon_index = seg_page(sptseg, sptseg_addr);
2153 2104
2154 2105 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2155 2106 for (i = 0; i < npages; i++) {
2156 2107 ap = anon_get_ptr(amp->ahp, anon_index++);
2157 2108 ASSERT(ap != NULL);
2158 2109 swap_xlate(ap, &vp, &offset);
2159 2110 pp = page_lookup(vp, offset, SE_SHARED);
2160 2111 ASSERT(pp != NULL);
2161 2112 ppa[i] = pp;
2162 2113 }
2163 2114 	ANON_LOCK_EXIT(&amp->a_rwlock);
2164 2115 ASSERT(i == npages);
2165 2116
2166 2117 /*
2167 2118 * We are already holding the as->a_lock on the user's
2168 2119 * real segment, but we need to hold the a_lock on the
2169 2120 * underlying dummy as. This is mostly to satisfy the
2170 2121 * underlying HAT layer.
2171 2122 */
2172 2123 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
2173 2124 a = sptseg_addr;
2174 2125 pidx = 0;
2175 2126 if (type == F_SOFTLOCK) {
2176 2127 /*
2177 2128 * Load up the translation keeping it
2178 2129 * locked and don't unlock the page.
2179 2130 */
2180 2131 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2181 2132 sz = MIN(pgsz, ptob(npages - pidx));
2182 2133 hat_memload_array(sptseg->s_as->a_hat, a,
2183 2134 sz, &ppa[pidx], sptd->spt_prot,
2184 2135 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2185 2136 }
2186 2137 } else {
2187 - if (hat == seg->s_as->a_hat) {
2138 + /*
2139 + * Migrate pages marked for migration.
2140 + */
2141 + if (lgrp_optimizations())
2142 + page_migrate(seg, shm_addr, ppa, npages);
2188 2143
2189 - /*
2190 - * Migrate pages marked for migration.
2191 - */
2192 - if (lgrp_optimizations())
2193 - page_migrate(seg, shm_addr, ppa,
2194 - npages);
2195 -
2196 - /* CPU HAT */
2197 - for (; pidx < npages;
2198 - a += pgsz, pidx += pgcnt) {
2199 - sz = MIN(pgsz, ptob(npages - pidx));
2200 - hat_memload_array(sptseg->s_as->a_hat,
2201 - a, sz, &ppa[pidx],
2202 - sptd->spt_prot, HAT_LOAD_SHARE);
2203 - }
2204 - } else {
2205 - /* XHAT. Pass real address */
2206 - hat_memload_array(hat, shm_addr,
2207 - ptob(npages), ppa, sptd->spt_prot,
2208 - HAT_LOAD_SHARE);
2144 + for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2145 + sz = MIN(pgsz, ptob(npages - pidx));
2146 + hat_memload_array(sptseg->s_as->a_hat,
2147 + a, sz, &ppa[pidx],
2148 + sptd->spt_prot, HAT_LOAD_SHARE);
2209 2149 }
2210 2150
2211 2151 /*
2212 2152 * And now drop the SE_SHARED lock(s).
2213 2153 */
2214 2154 for (i = 0; i < npages; i++)
2215 2155 page_unlock(ppa[i]);
2216 2156 }
2217 2157 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
2218 2158
2219 2159 kmem_free(ppa, sizeof (page_t *) * npages);
2220 2160 return (0);
2221 2161 case F_SOFTUNLOCK:
2222 2162
2223 2163 /*
2224 2164 * This is a bit ugly, we pass in the real seg pointer,
2225 2165 * but the sptseg_addr is the virtual address within the
2226 2166 * dummy seg.
2227 2167 */
2228 2168 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2229 2169 return (0);
2230 2170
2231 2171 case F_PROT:
2232 2172
2233 2173 /*
2234 2174 * This takes care of the unusual case where a user
2235 2175 * allocates a stack in shared memory and a register
2236 2176 * window overflow is written to that stack page before
2237 2177 * it is otherwise modified.
2238 2178 *
2239 2179 * We can get away with this because ISM segments are
2240 2180 * always rw. Other than this unusual case, there
2241 2181 * should be no instances of protection violations.
2242 2182 */
2243 2183 return (0);
2244 2184
2245 2185 default:
2246 2186 #ifdef DEBUG
2247 2187 cmn_err(CE_WARN, "segspt_shmfault default type?");
2248 2188 #endif
2249 2189 return (FC_NOMAP);
2250 2190 }
2251 2191 }
2252 2192
2253 2193 /*ARGSUSED*/
2254 2194 static faultcode_t
2255 2195 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2256 2196 {
2257 2197 return (0);
2258 2198 }
2259 2199
2260 2200 /*ARGSUSED*/
2261 2201 static int
2262 2202 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2263 2203 {
2264 2204 return (0);
2265 2205 }
2266 2206
2267 -/*ARGSUSED*/
2268 -static size_t
2269 -segspt_shmswapout(struct seg *seg)
2270 -{
2271 - return (0);
2272 -}
2273 -
2274 2207 /*
2275 2208 * duplicate the shared page tables
2276 2209 */
2277 2210 int
2278 2211 segspt_shmdup(struct seg *seg, struct seg *newseg)
2279 2212 {
2280 2213 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2281 2214 struct anon_map *amp = shmd->shm_amp;
2282 2215 struct shm_data *shmd_new;
2283 2216 struct seg *spt_seg = shmd->shm_sptseg;
2284 2217 struct spt_data *sptd = spt_seg->s_data;
2285 2218 int error = 0;
2286 2219
2287 2220 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
2288 2221
2289 2222 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2290 2223 newseg->s_data = (void *)shmd_new;
2291 2224 shmd_new->shm_sptas = shmd->shm_sptas;
2292 2225 shmd_new->shm_amp = amp;
2293 2226 shmd_new->shm_sptseg = shmd->shm_sptseg;
2294 2227 newseg->s_ops = &segspt_shmops;
2295 2228 newseg->s_szc = seg->s_szc;
2296 2229 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2297 2230
2298 2231 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2299 2232 amp->refcnt++;
2300 2233 	ANON_LOCK_EXIT(&amp->a_rwlock);
2301 2234
2302 2235 if (sptd->spt_flags & SHM_PAGEABLE) {
2303 2236 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2304 2237 shmd_new->shm_lckpgs = 0;
2305 2238 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2306 2239 if ((error = hat_share(newseg->s_as->a_hat,
2307 2240 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2308 2241 seg->s_size, seg->s_szc)) != 0) {
2309 2242 kmem_free(shmd_new->shm_vpage,
2310 2243 btopr(amp->size));
2311 2244 }
2312 2245 }
2313 2246 return (error);
2314 2247 } else {
2315 2248 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2316 2249 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2317 2250 seg->s_szc));
2318 2251
2319 2252 }
2320 2253 }
2321 2254
2322 2255 /*ARGSUSED*/
2323 2256 int
2324 2257 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2325 2258 {
2326 2259 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2327 2260 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2328 2261
2329 2262 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2330 2263
2331 2264 /*
2332 2265 * ISM segment is always rw.
2333 2266 */
2334 2267 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2335 2268 }
2336 2269
2337 2270 /*
2338 2271 * Return an array of locked large pages, for empty slots allocate
2339 2272 * private zero-filled anon pages.
2340 2273 */
2341 2274 static int
2342 2275 spt_anon_getpages(
2343 2276 struct seg *sptseg,
2344 2277 caddr_t sptaddr,
2345 2278 size_t len,
2346 2279 page_t *ppa[])
2347 2280 {
2348 2281 struct spt_data *sptd = sptseg->s_data;
2349 2282 struct anon_map *amp = sptd->spt_amp;
2350 2283 enum seg_rw rw = sptd->spt_prot;
2351 2284 uint_t szc = sptseg->s_szc;
2352 2285 size_t pg_sz, share_sz = page_get_pagesize(szc);
2353 2286 pgcnt_t lp_npgs;
2354 2287 caddr_t lp_addr, e_sptaddr;
2355 2288 uint_t vpprot, ppa_szc = 0;
2356 2289 struct vpage *vpage = NULL;
2357 2290 ulong_t j, ppa_idx;
2358 2291 int err, ierr = 0;
2359 2292 pgcnt_t an_idx;
2360 2293 anon_sync_obj_t cookie;
2361 2294 int anon_locked = 0;
2362 2295 pgcnt_t amp_pgs;
2363 2296
2364 2297
2365 2298 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2366 2299 ASSERT(len != 0);
2367 2300
2368 2301 pg_sz = share_sz;
2369 2302 lp_npgs = btop(pg_sz);
2370 2303 lp_addr = sptaddr;
2371 2304 e_sptaddr = sptaddr + len;
2372 2305 an_idx = seg_page(sptseg, sptaddr);
2373 2306 ppa_idx = 0;
2374 2307
2375 2308 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2376 2309
2377 2310 amp_pgs = page_get_pagecnt(amp->a_szc);
2378 2311
2379 2312 /*CONSTCOND*/
2380 2313 while (1) {
2381 2314 for (; lp_addr < e_sptaddr;
2382 2315 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2383 2316
2384 2317 /*
2385 2318 * If we're currently locked, and we get to a new
2386 2319 * page, unlock our current anon chunk.
2387 2320 */
2388 2321 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2389 2322 anon_array_exit(&cookie);
2390 2323 anon_locked = 0;
2391 2324 }
2392 2325 if (!anon_locked) {
2393 2326 anon_array_enter(amp, an_idx, &cookie);
2394 2327 anon_locked = 1;
2395 2328 }
2396 2329 ppa_szc = (uint_t)-1;
2397 2330 ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2398 2331 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2399 2332 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2400 2333
2401 2334 if (ierr != 0) {
2402 2335 if (ierr > 0) {
2403 2336 err = FC_MAKE_ERR(ierr);
2404 2337 goto lpgs_err;
2405 2338 }
2406 2339 break;
2407 2340 }
2408 2341 }
2409 2342 if (lp_addr == e_sptaddr) {
2410 2343 break;
2411 2344 }
2412 2345 ASSERT(lp_addr < e_sptaddr);
2413 2346
2414 2347 /*
2415 2348 * ierr == -1 means we failed to allocate a large page.
2416 2349 * so do a size down operation.
2417 2350 *
2418 2351 * ierr == -2 means some other process that privately shares
2419 2352 * pages with this process has allocated a larger page and we
2420 2353 * need to retry with larger pages. So do a size up
2421 2354 * operation. This relies on the fact that large pages are
2422 2355 * never partially shared i.e. if we share any constituent
2423 2356 * page of a large page with another process we must share the
2424 2357 * entire large page. Note this cannot happen for SOFTLOCK
2425 2358 * case, unless current address (lpaddr) is at the beginning
2426 2359 * of the next page size boundary because the other process
2427 2360 * couldn't have relocated locked pages.
2428 2361 */
2429 2362 ASSERT(ierr == -1 || ierr == -2);
2430 2363 if (segvn_anypgsz) {
2431 2364 ASSERT(ierr == -2 || szc != 0);
2432 2365 ASSERT(ierr == -1 || szc < sptseg->s_szc);
2433 2366 szc = (ierr == -1) ? szc - 1 : szc + 1;
2434 2367 } else {
2435 2368 /*
2436 2369 * For faults and segvn_anypgsz == 0
2437 2370 * we need to be careful not to loop forever
2438 2371 * if existing page is found with szc other
2439 2372 * than 0 or seg->s_szc. This could be due
2440 2373 * to page relocations on behalf of DR or
2441 2374 * more likely large page creation. For this
2442 2375 * case simply re-size to existing page's szc
2443 2376 * if returned by anon_map_getpages().
2444 2377 */
2445 2378 if (ppa_szc == (uint_t)-1) {
2446 2379 szc = (ierr == -1) ? 0 : sptseg->s_szc;
2447 2380 } else {
2448 2381 ASSERT(ppa_szc <= sptseg->s_szc);
2449 2382 ASSERT(ierr == -2 || ppa_szc < szc);
2450 2383 ASSERT(ierr == -1 || ppa_szc > szc);
2451 2384 szc = ppa_szc;
2452 2385 }
2453 2386 }
2454 2387 pg_sz = page_get_pagesize(szc);
2455 2388 lp_npgs = btop(pg_sz);
2456 2389 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2457 2390 }
2458 2391 if (anon_locked) {
2459 2392 anon_array_exit(&cookie);
2460 2393 }
2461 2394 	ANON_LOCK_EXIT(&amp->a_rwlock);
2462 2395 return (0);
2463 2396
2464 2397 lpgs_err:
2465 2398 if (anon_locked) {
2466 2399 anon_array_exit(&cookie);
2467 2400 }
2468 2401 	ANON_LOCK_EXIT(&amp->a_rwlock);
2469 2402 for (j = 0; j < ppa_idx; j++)
2470 2403 page_unlock(ppa[j]);
2471 2404 return (err);
2472 2405 }
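
The retry loop above adjusts the requested page size code whenever anon_map_getpages() reports -1 (a page of that size could not be built) or -2 (a sharer already has a larger page), either one step at a time when segvn_anypgsz is set or by jumping straight to 0 / the segment's szc otherwise. A standalone sketch of just that size-up/size-down policy, with a stub standing in for anon_map_getpages() and with the ppa_szc refinement left out:

#include <stdio.h>

#define	SEG_SZC		3	/* example: segment built from szc-3 pages */

/*
 * Illustrative stand-in for anon_map_getpages(): pretend only szc-1 (or
 * smaller) pages can be created right now, so larger requests fail with -1.
 */
static int
fake_getpages(unsigned int szc)
{
	return (szc > 1 ? -1 : 0);
}

int
main(void)
{
	int segvn_anypgsz = 1;		/* mirrors the kernel tunable */
	unsigned int szc = SEG_SZC;
	int ierr;

	while ((ierr = fake_getpages(szc)) != 0) {
		if (segvn_anypgsz) {
			/* -1: size down one step, -2: size up one step */
			szc = (ierr == -1) ? szc - 1 : szc + 1;
		} else {
			/* without anypgsz, fall back to szc 0 or the seg szc */
			szc = (ierr == -1) ? 0 : SEG_SZC;
		}
		printf("retrying with szc %u\n", szc);
	}
	printf("succeeded with szc %u\n", szc);
	return (0);
}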
2473 2406
2474 2407 /*
2475 2408 * count the number of bytes in a set of spt pages that are currently not
2476 2409 * locked
2477 2410 */
2478 2411 static rctl_qty_t
2479 2412 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2480 2413 {
2481 2414 ulong_t i;
2482 2415 rctl_qty_t unlocked = 0;
2483 2416
2484 2417 for (i = 0; i < npages; i++) {
2485 2418 if (ppa[i]->p_lckcnt == 0)
2486 2419 unlocked += PAGESIZE;
2487 2420 }
2488 2421 return (unlocked);
2489 2422 }
2490 2423
2491 2424 extern u_longlong_t randtick(void);
2492 2425 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2493 2426 #define NLCK (NCPU_P2)
2494 2427 /* Random number with a range [0, n-1], n must be power of two */
2495 2428 #define RAND_P2(n) \
2496 2429 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
2497 2430
2498 2431 int
2499 2432 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2500 2433 page_t **ppa, ulong_t *lockmap, size_t pos,
2501 2434 rctl_qty_t *locked)
2502 2435 {
2503 2436 struct shm_data *shmd = seg->s_data;
2504 2437 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2505 2438 ulong_t i;
2506 2439 int kernel;
2507 2440 pgcnt_t nlck = 0;
2508 2441 int rv = 0;
2509 2442 int use_reserved = 1;
2510 2443
2511 2444 /* return the number of bytes actually locked */
2512 2445 *locked = 0;
2513 2446
2514 2447 /*
2515 2448 * To avoid contention on freemem_lock, availrmem and pages_locked
2516 2449 * global counters are updated only every nlck locked pages instead of
2517 2450 * every time. Reserve nlck locks up front and deduct from this
2518 2451 * reservation for each page that requires a lock. When the reservation
2519 2452 * is consumed, reserve again. nlck is randomized, so the competing
2520 2453 * threads do not fall into a cyclic lock contention pattern. When
2521 2454 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2522 2455 * is used to lock pages.
2523 2456 */
2524 2457 for (i = 0; i < npages; anon_index++, pos++, i++) {
2525 2458 if (nlck == 0 && use_reserved == 1) {
2526 2459 nlck = NLCK + RAND_P2(NLCK);
2527 2460 /* if fewer loops left, decrease nlck */
2528 2461 nlck = MIN(nlck, npages - i);
2529 2462 /*
2530 2463 * Reserve nlck locks up front and deduct from this
2531 2464 * reservation for each page that requires a lock. When
2532 2465 * the reservation is consumed, reserve again.
2533 2466 */
2534 2467 mutex_enter(&freemem_lock);
2535 2468 if ((availrmem - nlck) < pages_pp_maximum) {
2536 2469 /* Do not do advance memory reserves */
2537 2470 use_reserved = 0;
2538 2471 } else {
2539 2472 availrmem -= nlck;
2540 2473 pages_locked += nlck;
2541 2474 }
2542 2475 mutex_exit(&freemem_lock);
2543 2476 }
2544 2477 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2545 2478 if (sptd->spt_ppa_lckcnt[anon_index] <
2546 2479 (ushort_t)DISM_LOCK_MAX) {
2547 2480 if (++sptd->spt_ppa_lckcnt[anon_index] ==
2548 2481 (ushort_t)DISM_LOCK_MAX) {
2549 2482 cmn_err(CE_WARN,
2550 2483 "DISM page lock limit "
2551 2484 "reached on DISM offset 0x%lx\n",
2552 2485 anon_index << PAGESHIFT);
2553 2486 }
2554 2487 kernel = (sptd->spt_ppa &&
2555 2488 sptd->spt_ppa[anon_index]);
2556 2489 if (!page_pp_lock(ppa[i], 0, kernel ||
2557 2490 use_reserved)) {
2558 2491 sptd->spt_ppa_lckcnt[anon_index]--;
2559 2492 rv = EAGAIN;
2560 2493 break;
2561 2494 }
2562 2495 /* if this is a newly locked page, count it */
2563 2496 if (ppa[i]->p_lckcnt == 1) {
2564 2497 if (kernel == 0 && use_reserved == 1)
2565 2498 nlck--;
2566 2499 *locked += PAGESIZE;
2567 2500 }
2568 2501 shmd->shm_lckpgs++;
2569 2502 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2570 2503 if (lockmap != NULL)
2571 2504 BT_SET(lockmap, pos);
2572 2505 }
2573 2506 }
2574 2507 }
2575 2508 /* Return unused lock reservation */
2576 2509 if (nlck != 0 && use_reserved == 1) {
2577 2510 mutex_enter(&freemem_lock);
2578 2511 availrmem += nlck;
2579 2512 pages_locked -= nlck;
2580 2513 mutex_exit(&freemem_lock);
2581 2514 }
2582 2515
2583 2516 return (rv);
2584 2517 }
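
spt_lockpages() amortizes freemem_lock traffic by reserving a randomized batch of NLCK to 2*NLCK-1 locks at a time, consuming one reservation per newly locked page and returning whatever is left at the end; spt_unlockpages() below batches its availrmem updates the same way in the other direction. A minimal userland sketch of the batching idea, with invented counters and rand() standing in for RAND_P2 (and without the low-memory fallback to page_pp_lock()):

#include <stdio.h>
#include <stdlib.h>

#define	NLCK	8	/* example batch size; the kernel derives it from NCPU */

static long availrmem = 1000;	/* stand-ins for the global counters */
static long pages_locked = 0;

int
main(void)
{
	long npages = 50;	/* pages we intend to lock (example) */
	long nlck = 0;		/* reservations left from the last batch */
	long i;

	for (i = 0; i < npages; i++) {
		if (nlck == 0) {
			/* randomize the batch so threads don't collide in lockstep */
			nlck = NLCK + (rand() & (NLCK - 1));
			if (nlck > npages - i)
				nlck = npages - i;
			/* one counter update covers the whole batch */
			availrmem -= nlck;
			pages_locked += nlck;
		}
		nlck--;		/* consume one reservation for this page */
	}
	/* return whatever is left of the final batch */
	availrmem += nlck;
	pages_locked -= nlck;

	printf("availrmem %ld, pages_locked %ld\n", availrmem, pages_locked);
	return (0);
}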
2585 2518
2586 2519 int
2587 2520 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2588 2521 rctl_qty_t *unlocked)
2589 2522 {
2590 2523 struct shm_data *shmd = seg->s_data;
2591 2524 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2592 2525 struct anon_map *amp = sptd->spt_amp;
2593 2526 struct anon *ap;
2594 2527 struct vnode *vp;
2595 2528 u_offset_t off;
2596 2529 struct page *pp;
2597 2530 int kernel;
2598 2531 anon_sync_obj_t cookie;
2599 2532 ulong_t i;
2600 2533 pgcnt_t nlck = 0;
2601 2534 pgcnt_t nlck_limit = NLCK;
2602 2535
2603 2536 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2604 2537 for (i = 0; i < npages; i++, anon_index++) {
2605 2538 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2606 2539 anon_array_enter(amp, anon_index, &cookie);
2607 2540 ap = anon_get_ptr(amp->ahp, anon_index);
2608 2541 ASSERT(ap);
2609 2542
2610 2543 swap_xlate(ap, &vp, &off);
2611 2544 anon_array_exit(&cookie);
2612 2545 pp = page_lookup(vp, off, SE_SHARED);
2613 2546 ASSERT(pp);
2614 2547 /*
2615 2548 * availrmem is decremented only for pages which are not
2616 2549 * in seg pcache, for pages in seg pcache availrmem was
2617 2550 * decremented in _dismpagelock()
2618 2551 */
2619 2552 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2620 2553 ASSERT(pp->p_lckcnt > 0);
2621 2554
2622 2555 /*
2623 2556 * lock page but do not change availrmem, we do it
2624 2557 * ourselves every nlck loops.
2625 2558 */
2626 2559 page_pp_unlock(pp, 0, 1);
2627 2560 if (pp->p_lckcnt == 0) {
2628 2561 if (kernel == 0)
2629 2562 nlck++;
2630 2563 *unlocked += PAGESIZE;
2631 2564 }
2632 2565 page_unlock(pp);
2633 2566 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2634 2567 sptd->spt_ppa_lckcnt[anon_index]--;
2635 2568 shmd->shm_lckpgs--;
2636 2569 }
2637 2570
2638 2571 /*
2639 2572 * To reduce freemem_lock contention, do not update availrmem
2640 2573 * until at least NLCK pages have been unlocked.
2641 2574 * 1. No need to update if nlck is zero
2642 2575 * 2. Always update if the last iteration
2643 2576 */
2644 2577 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2645 2578 mutex_enter(&freemem_lock);
2646 2579 availrmem += nlck;
2647 2580 pages_locked -= nlck;
2648 2581 mutex_exit(&freemem_lock);
2649 2582 nlck = 0;
2650 2583 nlck_limit = NLCK + RAND_P2(NLCK);
2651 2584 }
2652 2585 }
2653 2586 	ANON_LOCK_EXIT(&amp->a_rwlock);
2654 2587
2655 2588 return (0);
2656 2589 }
2657 2590
2658 2591 /*ARGSUSED*/
2659 2592 static int
2660 2593 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2661 2594 int attr, int op, ulong_t *lockmap, size_t pos)
2662 2595 {
2663 2596 struct shm_data *shmd = seg->s_data;
2664 2597 struct seg *sptseg = shmd->shm_sptseg;
2665 2598 struct spt_data *sptd = sptseg->s_data;
2666 2599 struct kshmid *sp = sptd->spt_amp->a_sp;
2667 2600 pgcnt_t npages, a_npages;
2668 2601 page_t **ppa;
2669 2602 pgcnt_t an_idx, a_an_idx, ppa_idx;
2670 2603 caddr_t spt_addr, a_addr; /* spt and aligned address */
2671 2604 size_t a_len; /* aligned len */
2672 2605 size_t share_sz;
2673 2606 ulong_t i;
2674 2607 int sts = 0;
2675 2608 rctl_qty_t unlocked = 0;
2676 2609 rctl_qty_t locked = 0;
2677 2610 struct proc *p = curproc;
2678 2611 kproject_t *proj;
2679 2612
2680 2613 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2681 2614 ASSERT(sp != NULL);
2682 2615
2683 2616 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2684 2617 return (0);
2685 2618 }
2686 2619
2687 2620 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2688 2621 an_idx = seg_page(seg, addr);
2689 2622 npages = btopr(len);
2690 2623
2691 2624 if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2692 2625 return (ENOMEM);
2693 2626 }
2694 2627
2695 2628 /*
2696 2629 * A shm's project never changes, so no lock needed.
2697 2630 * The shm has a hold on the project, so it will not go away.
2698 2631 * Since we have a mapping to shm within this zone, we know
2699 2632 * that the zone will not go away.
2700 2633 */
2701 2634 proj = sp->shm_perm.ipc_proj;
2702 2635
2703 2636 if (op == MC_LOCK) {
2704 2637
2705 2638 /*
2706 2639 * Need to align addr and size request if they are not
2707 2640 * aligned so we can always allocate large page(s) however
2708 2641 * we only lock what was requested in initial request.
2709 2642 */
2710 2643 share_sz = page_get_pagesize(sptseg->s_szc);
2711 2644 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2712 2645 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2713 2646 share_sz);
2714 2647 a_npages = btop(a_len);
2715 2648 a_an_idx = seg_page(seg, a_addr);
2716 2649 spt_addr = sptseg->s_base + ptob(a_an_idx);
2717 2650 ppa_idx = an_idx - a_an_idx;
2718 2651
2719 2652 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2720 2653 KM_NOSLEEP)) == NULL) {
2721 2654 return (ENOMEM);
2722 2655 }
2723 2656
2724 2657 /*
2725 2658 * Don't cache any new pages for IO and
2726 2659 * flush any cached pages.
2727 2660 */
2728 2661 mutex_enter(&sptd->spt_lock);
2729 2662 if (sptd->spt_ppa != NULL)
2730 2663 sptd->spt_flags |= DISM_PPA_CHANGED;
2731 2664
2732 2665 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2733 2666 if (sts != 0) {
2734 2667 mutex_exit(&sptd->spt_lock);
2735 2668 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2736 2669 return (sts);
2737 2670 }
2738 2671
2739 2672 mutex_enter(&sp->shm_mlock);
2740 2673 /* enforce locked memory rctl */
2741 2674 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2742 2675
2743 2676 mutex_enter(&p->p_lock);
2744 2677 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2745 2678 mutex_exit(&p->p_lock);
2746 2679 sts = EAGAIN;
2747 2680 } else {
2748 2681 mutex_exit(&p->p_lock);
2749 2682 sts = spt_lockpages(seg, an_idx, npages,
2750 2683 &ppa[ppa_idx], lockmap, pos, &locked);
2751 2684
2752 2685 /*
2753 2686 * correct locked count if not all pages could be
2754 2687 * locked
2755 2688 */
2756 2689 if ((unlocked - locked) > 0) {
2757 2690 rctl_decr_locked_mem(NULL, proj,
2758 2691 (unlocked - locked), 0);
2759 2692 }
2760 2693 }
2761 2694 /*
2762 2695 * unlock pages
2763 2696 */
2764 2697 for (i = 0; i < a_npages; i++)
2765 2698 page_unlock(ppa[i]);
2766 2699 if (sptd->spt_ppa != NULL)
2767 2700 sptd->spt_flags |= DISM_PPA_CHANGED;
2768 2701 mutex_exit(&sp->shm_mlock);
2769 2702 mutex_exit(&sptd->spt_lock);
2770 2703
2771 2704 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2772 2705
2773 2706 } else if (op == MC_UNLOCK) { /* unlock */
2774 2707 page_t **ppa;
2775 2708
2776 2709 mutex_enter(&sptd->spt_lock);
2777 2710 if (shmd->shm_lckpgs == 0) {
2778 2711 mutex_exit(&sptd->spt_lock);
2779 2712 return (0);
2780 2713 }
2781 2714 /*
2782 2715 * Don't cache new IO pages.
2783 2716 */
2784 2717 if (sptd->spt_ppa != NULL)
2785 2718 sptd->spt_flags |= DISM_PPA_CHANGED;
2786 2719
2787 2720 mutex_enter(&sp->shm_mlock);
2788 2721 sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2789 2722 if ((ppa = sptd->spt_ppa) != NULL)
2790 2723 sptd->spt_flags |= DISM_PPA_CHANGED;
2791 2724 mutex_exit(&sptd->spt_lock);
2792 2725
2793 2726 rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2794 2727 mutex_exit(&sp->shm_mlock);
2795 2728
2796 2729 if (ppa != NULL)
2797 2730 seg_ppurge_wiredpp(ppa);
2798 2731 }
2799 2732 return (sts);
2800 2733 }
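
In the MC_LOCK path above, the project's locked-memory rctl is charged up front for every byte in the range that is not yet locked, and the charge is reduced afterwards by however much spt_lockpages() failed to lock. A toy sketch of that charge-then-refund accounting (the counter and all numbers are invented):

#include <stdio.h>

int
main(void)
{
	long locked_mem = 0;		/* stand-in for the project rctl counter */
	long unlocked = 16 * 4096;	/* bytes not yet locked in the range */
	long locked = 10 * 4096;	/* bytes spt_lockpages() managed to lock */

	locked_mem += unlocked;			/* rctl_incr_locked_mem() */
	if (unlocked - locked > 0)
		locked_mem -= unlocked - locked;	/* refund the shortfall */

	printf("project charged for %ld bytes\n", locked_mem);
	return (0);
}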
2801 2734
2802 2735 /*ARGSUSED*/
2803 2736 int
2804 2737 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2805 2738 {
2806 2739 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2807 2740 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2808 2741 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2809 2742
2810 2743 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2811 2744
2812 2745 /*
2813 2746 * ISM segment is always rw.
2814 2747 */
2815 2748 while (--pgno >= 0)
2816 2749 *protv++ = sptd->spt_prot;
2817 2750 return (0);
2818 2751 }
2819 2752
2820 2753 /*ARGSUSED*/
2821 2754 u_offset_t
2822 2755 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2823 2756 {
2824 2757 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2825 2758
2826 2759 /* Offset does not matter in ISM memory */
2827 2760
2828 2761 return ((u_offset_t)0);
2829 2762 }
2830 2763
2831 2764 /* ARGSUSED */
2832 2765 int
2833 2766 segspt_shmgettype(struct seg *seg, caddr_t addr)
2834 2767 {
2835 2768 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2836 2769 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2837 2770
2838 2771 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2839 2772
2840 2773 /*
2841 2774 * The shared memory mapping is always MAP_SHARED, SWAP is only
2842 2775 * reserved for DISM
2843 2776 */
2844 2777 return (MAP_SHARED |
2845 2778 ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2846 2779 }
2847 2780
2848 2781 /*ARGSUSED*/
2849 2782 int
2850 2783 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2851 2784 {
2852 2785 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2853 2786 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2854 2787
2855 2788 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2856 2789
2857 2790 *vpp = sptd->spt_vp;
2858 2791 return (0);
2859 2792 }
2860 2793
2861 2794 /*
2862 2795 * We need to wait for pending IO to complete to a DISM segment in order for
2863 2796 * pages to get kicked out of the seg_pcache. 120 seconds should be more
2864 2797 * than enough time to wait.
2865 2798 */
2866 2799 static clock_t spt_pcache_wait = 120;
2867 2800
2868 2801 /*ARGSUSED*/
2869 2802 static int
2870 2803 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2871 2804 {
2872 2805 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2873 2806 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2874 2807 struct anon_map *amp;
2875 2808 pgcnt_t pg_idx;
2876 2809 ushort_t gen;
2877 2810 clock_t end_lbolt;
2878 2811 int writer;
2879 2812 page_t **ppa;
2880 2813
2881 2814 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2882 2815
2883 2816 if (behav == MADV_FREE) {
2884 2817 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
2885 2818 return (0);
2886 2819
2887 2820 amp = sptd->spt_amp;
2888 2821 pg_idx = seg_page(seg, addr);
2889 2822
2890 2823 mutex_enter(&sptd->spt_lock);
2891 2824 if ((ppa = sptd->spt_ppa) == NULL) {
2892 2825 mutex_exit(&sptd->spt_lock);
2893 2826 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2894 2827 anon_disclaim(amp, pg_idx, len);
2895 2828 			ANON_LOCK_EXIT(&amp->a_rwlock);
2896 2829 return (0);
2897 2830 }
2898 2831
2899 2832 sptd->spt_flags |= DISM_PPA_CHANGED;
2900 2833 gen = sptd->spt_gen;
2901 2834
2902 2835 mutex_exit(&sptd->spt_lock);
2903 2836
2904 2837 /*
2905 2838 * Purge all DISM cached pages
2906 2839 */
2907 2840 seg_ppurge_wiredpp(ppa);
2908 2841
2909 2842 /*
2910 2843 * Drop the AS_LOCK so that other threads can grab it
2911 2844 * in the as_pageunlock path and hopefully get the segment
2912 2845 * kicked out of the seg_pcache. We bump the shm_softlockcnt
2913 2846 * to keep this segment resident.
2914 2847 */
2915 2848 writer = AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock);
2916 2849 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2917 2850 AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock);
2918 2851
2919 2852 mutex_enter(&sptd->spt_lock);
2920 2853
2921 2854 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
2922 2855
2923 2856 /*
2924 2857 * Try to wait for pages to get kicked out of the seg_pcache.
2925 2858 */
2926 2859 while (sptd->spt_gen == gen &&
2927 2860 (sptd->spt_flags & DISM_PPA_CHANGED) &&
2928 2861 ddi_get_lbolt() < end_lbolt) {
2929 2862 if (!cv_timedwait_sig(&sptd->spt_cv,
2930 2863 &sptd->spt_lock, end_lbolt)) {
2931 2864 break;
2932 2865 }
2933 2866 }
2934 2867
2935 2868 mutex_exit(&sptd->spt_lock);
2936 2869
2937 2870 /* Regrab the AS_LOCK and release our hold on the segment */
2938 2871 AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock,
2939 2872 writer ? RW_WRITER : RW_READER);
2940 2873 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2941 2874 if (shmd->shm_softlockcnt <= 0) {
2942 2875 if (AS_ISUNMAPWAIT(seg->s_as)) {
2943 2876 mutex_enter(&seg->s_as->a_contents);
2944 2877 if (AS_ISUNMAPWAIT(seg->s_as)) {
2945 2878 AS_CLRUNMAPWAIT(seg->s_as);
2946 2879 cv_broadcast(&seg->s_as->a_cv);
2947 2880 }
2948 2881 mutex_exit(&seg->s_as->a_contents);
2949 2882 }
2950 2883 }
2951 2884
2952 2885 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2953 2886 anon_disclaim(amp, pg_idx, len);
2954 2887 		ANON_LOCK_EXIT(&amp->a_rwlock);
2955 2888 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
2956 2889 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
2957 2890 int already_set;
2958 2891 ulong_t anon_index;
2959 2892 lgrp_mem_policy_t policy;
2960 2893 caddr_t shm_addr;
2961 2894 size_t share_size;
2962 2895 size_t size;
2963 2896 struct seg *sptseg = shmd->shm_sptseg;
2964 2897 caddr_t sptseg_addr;
2965 2898
2966 2899 /*
2967 2900 * Align address and length to page size of underlying segment
2968 2901 */
2969 2902 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
2970 2903 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
2971 2904 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
2972 2905 share_size);
2973 2906
2974 2907 amp = shmd->shm_amp;
2975 2908 anon_index = seg_page(seg, shm_addr);
2976 2909
2977 2910 /*
2978 2911 * And now we may have to adjust size downward if we have
2979 2912 * exceeded the realsize of the segment or initial anon
2980 2913 * allocations.
2981 2914 */
2982 2915 sptseg_addr = sptseg->s_base + ptob(anon_index);
2983 2916 if ((sptseg_addr + size) >
2984 2917 (sptseg->s_base + sptd->spt_realsize))
2985 2918 size = (sptseg->s_base + sptd->spt_realsize) -
2986 2919 sptseg_addr;
2987 2920
2988 2921 /*
2989 2922 * Set memory allocation policy for this segment
2990 2923 */
2991 2924 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
2992 2925 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
2993 2926 NULL, 0, len);
2994 2927
2995 2928 /*
2996 2929 * If random memory allocation policy set already,
2997 2930 * don't bother reapplying it.
2998 2931 */
2999 2932 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
3000 2933 return (0);
3001 2934
3002 2935 /*
3003 2936 * Mark any existing pages in the given range for
3004 2937 * migration, flushing the I/O page cache, and using
3005 2938 * underlying segment to calculate anon index and get
3006 2939 * anonmap and vnode pointer from
3007 2940 */
3008 2941 if (shmd->shm_softlockcnt > 0)
3009 2942 segspt_purge(seg);
3010 2943
3011 2944 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
3012 2945 }
3013 2946
3014 2947 return (0);
3015 2948 }
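
For MADV_FREE the code above purges the wired-page cache and then waits, bounded by spt_pcache_wait, for spt_gen to move (or DISM_PPA_CHANGED to clear) before disclaiming the anon pages. A rough sketch of that bounded generation wait, assuming POSIX threads in place of the kernel mutex/cv and a purely illustrative thread standing in for the pcache side:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t spt_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t spt_cv = PTHREAD_COND_INITIALIZER;
static unsigned int spt_gen;	/* bumped when the cached page list is rebuilt */

/* illustrative stand-in for the pcache side dropping the old page list */
static void *
pcache_side(void *arg)
{
	pthread_mutex_lock(&spt_lock);
	spt_gen++;
	pthread_cond_broadcast(&spt_cv);
	pthread_mutex_unlock(&spt_lock);
	return (arg);
}

int
main(void)
{
	pthread_t t;
	struct timespec deadline;
	unsigned int gen;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 120;		/* same bound as spt_pcache_wait */

	pthread_mutex_lock(&spt_lock);
	gen = spt_gen;
	pthread_create(&t, NULL, pcache_side, NULL);

	/* wait for the generation to move, but never past the deadline */
	while (spt_gen == gen) {
		if (pthread_cond_timedwait(&spt_cv, &spt_lock, &deadline) != 0)
			break;
	}
	pthread_mutex_unlock(&spt_lock);
	pthread_join(t, NULL);

	printf("spt_gen now %u\n", spt_gen);
	return (0);
}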
3016 2949
3017 -/*ARGSUSED*/
3018 -void
3019 -segspt_shmdump(struct seg *seg)
3020 -{
3021 - /* no-op for ISM segment */
3022 -}
3023 -
3024 -/*ARGSUSED*/
3025 -static faultcode_t
3026 -segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3027 -{
3028 - return (ENOTSUP);
3029 -}
3030 -
3031 2950 /*
3032 2951 * get a memory ID for an addr in a given segment
3033 2952 */
3034 2953 static int
3035 2954 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3036 2955 {
3037 2956 struct shm_data *shmd = (struct shm_data *)seg->s_data;
3038 2957 struct anon *ap;
3039 2958 size_t anon_index;
3040 2959 struct anon_map *amp = shmd->shm_amp;
3041 2960 struct spt_data *sptd = shmd->shm_sptseg->s_data;
3042 2961 struct seg *sptseg = shmd->shm_sptseg;
3043 2962 anon_sync_obj_t cookie;
3044 2963
3045 2964 anon_index = seg_page(seg, addr);
3046 2965
3047 2966 if (addr > (seg->s_base + sptd->spt_realsize)) {
3048 2967 return (EFAULT);
3049 2968 }
3050 2969
3051 2970 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3052 2971 anon_array_enter(amp, anon_index, &cookie);
3053 2972 ap = anon_get_ptr(amp->ahp, anon_index);
3054 2973 if (ap == NULL) {
3055 2974 struct page *pp;
3056 2975 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3057 2976
3058 2977 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3059 2978 if (pp == NULL) {
3060 2979 anon_array_exit(&cookie);
3061 2980 			ANON_LOCK_EXIT(&amp->a_rwlock);
3062 2981 return (ENOMEM);
3063 2982 }
3064 2983 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3065 2984 page_unlock(pp);
3066 2985 }
3067 2986 anon_array_exit(&cookie);
3068 2987 	ANON_LOCK_EXIT(&amp->a_rwlock);
3069 2988 memidp->val[0] = (uintptr_t)ap;
3070 2989 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3071 2990 return (0);
3072 2991 }
3073 2992
3074 2993 /*
3075 2994 * Get memory allocation policy info for specified address in given segment
3076 2995 */
3077 2996 static lgrp_mem_policy_info_t *
3078 2997 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3079 2998 {
3080 2999 struct anon_map *amp;
3081 3000 ulong_t anon_index;
3082 3001 lgrp_mem_policy_info_t *policy_info;
3083 3002 struct shm_data *shm_data;
3084 3003
3085 3004 ASSERT(seg != NULL);
3086 3005
3087 3006 /*
3088 3007 * Get anon_map from segshm
3089 3008 *
3090 3009 * Assume that no lock needs to be held on anon_map, since
3091 3010 * it should be protected by its reference count which must be
3092 3011 * nonzero for an existing segment
3093 3012 * Need to grab readers lock on policy tree though
3094 3013 */
3095 3014 shm_data = (struct shm_data *)seg->s_data;
3096 3015 if (shm_data == NULL)
3097 3016 return (NULL);
3098 3017 amp = shm_data->shm_amp;
3099 3018 ASSERT(amp->refcnt != 0);
3100 3019
3101 3020 /*
3102 3021 * Get policy info
3103 3022 *
3104 3023 * Assume starting anon index of 0
3105 3024 */
3106 3025 anon_index = seg_page(seg, addr);
3107 3026 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3108 3027
3109 3028 return (policy_info);
3110 -}
3111 -
3112 -/*ARGSUSED*/
3113 -static int
3114 -segspt_shmcapable(struct seg *seg, segcapability_t capability)
3115 -{
3116 - return (0);
3117 3029 }