/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/param.h>
#include <sys/user.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/tuneable.h>
#include <vm/hat.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <sys/buf.h>
#include <sys/swap.h>
#include <sys/atomic.h>
#include <vm/seg_spt.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/shm.h>
#include <sys/shm_impl.h>
#include <sys/lgrp.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>
#include <sys/project.h>
#include <sys/tnf_probe.h>
#include <sys/zone.h>

#define	SEGSPTADDR	(caddr_t)0x0

/*
 * # pages used for spt
 */
size_t	spt_used;

/*
 * segspt_minfree is the memory left for the system after ISM has
 * locked its pages; it is set to 5% of availrmem in sptcreate()
 * when ISM is created. ISM should not use more than ~90% of
 * availrmem; if it does, the performance of the system may
 * decrease. Machines with large memories may be able to use more
 * memory for ISM, so we set the default segspt_minfree to 5%
 * (which gives ISM a maximum of 95% of availrmem).
 * If somebody wants even more memory for ISM (risking hanging
 * the system) they can patch segspt_minfree to a smaller number.
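 *
 * Illustrative arithmetic only (the numbers are hypothetical): with
 * availrmem at 1,000,000 pages, sptcreate() sets
 * segspt_minfree = availrmem / 20 = 50,000 pages, leaving ISM able
 * to lock at most 950,000 pages (95% of availrmem).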
70 */ 71 pgcnt_t segspt_minfree = 0; 72 73 static int segspt_create(struct seg *seg, caddr_t argsp); 74 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize); 75 static void segspt_free(struct seg *seg); 76 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len); 77 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr); 78 79 static void 80 segspt_badop() 81 { 82 panic("segspt_badop called"); 83 /*NOTREACHED*/ 84 } 85 86 #define SEGSPT_BADOP(t) (t(*)())segspt_badop 87 88 struct seg_ops segspt_ops = { 89 .dup = SEGSPT_BADOP(int), 90 .unmap = segspt_unmap, 91 .free = segspt_free, 92 .fault = SEGSPT_BADOP(int), 93 .faulta = SEGSPT_BADOP(faultcode_t), 94 .setprot = SEGSPT_BADOP(int), 95 .checkprot = SEGSPT_BADOP(int), 96 .kluster = SEGSPT_BADOP(int), 97 .swapout = SEGSPT_BADOP(size_t), 98 .sync = SEGSPT_BADOP(int), 99 .incore = SEGSPT_BADOP(size_t), 100 .lockop = SEGSPT_BADOP(int), 101 .getprot = SEGSPT_BADOP(int), 102 .getoffset = SEGSPT_BADOP(u_offset_t), 103 .gettype = SEGSPT_BADOP(int), 104 .getvp = SEGSPT_BADOP(int), 105 .advise = SEGSPT_BADOP(int), 106 .dump = SEGSPT_BADOP(void), 107 .pagelock = SEGSPT_BADOP(int), 108 .setpagesize = SEGSPT_BADOP(int), 109 .getmemid = SEGSPT_BADOP(int), 110 .getpolicy = segspt_getpolicy, 111 .capable = SEGSPT_BADOP(int), 112 }; 113 114 static int segspt_shmdup(struct seg *seg, struct seg *newseg); 115 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize); 116 static void segspt_shmfree(struct seg *seg); 117 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg, 118 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw); 119 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr); 120 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr, 121 register size_t len, register uint_t prot); 122 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, 123 uint_t prot); 124 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta); 125 static size_t segspt_shmswapout(struct seg *seg); 126 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, 127 register char *vec); 128 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len, 129 int attr, uint_t flags); 130 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, 131 int attr, int op, ulong_t *lockmap, size_t pos); 132 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, 133 uint_t *protv); 134 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr); 135 static int segspt_shmgettype(struct seg *seg, caddr_t addr); 136 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp); 137 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, 138 uint_t behav); 139 static void segspt_shmdump(struct seg *seg); 140 static int segspt_shmpagelock(struct seg *, caddr_t, size_t, 141 struct page ***, enum lock_type, enum seg_rw); 142 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t); 143 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *); 144 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t); 145 146 struct seg_ops segspt_shmops = { 147 .dup = segspt_shmdup, 148 .unmap = segspt_shmunmap, 149 .free = segspt_shmfree, 150 .fault = segspt_shmfault, 151 .faulta = segspt_shmfaulta, 152 .setprot = segspt_shmsetprot, 153 .checkprot = segspt_shmcheckprot, 154 .kluster = segspt_shmkluster, 155 
.swapout = segspt_shmswapout, 156 .sync = segspt_shmsync, 157 .incore = segspt_shmincore, 158 .lockop = segspt_shmlockop, 159 .getprot = segspt_shmgetprot, 160 .getoffset = segspt_shmgetoffset, 161 .gettype = segspt_shmgettype, 162 .getvp = segspt_shmgetvp, 163 .advise = segspt_shmadvise, 164 .dump = segspt_shmdump, 165 .pagelock = segspt_shmpagelock, 166 .setpagesize = segspt_shmsetpgsz, 167 .getmemid = segspt_shmgetmemid, 168 .getpolicy = segspt_shmgetpolicy, 169 }; 170 171 static void segspt_purge(struct seg *seg); 172 static int segspt_reclaim(void *, caddr_t, size_t, struct page **, 173 enum seg_rw, int); 174 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, 175 page_t **ppa); 176 177 178 179 /*ARGSUSED*/ 180 int 181 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp, 182 uint_t prot, uint_t flags, uint_t share_szc) 183 { 184 int err; 185 struct as *newas; 186 struct segspt_crargs sptcargs; 187 188 #ifdef DEBUG 189 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */, 190 tnf_ulong, size, size ); 191 #endif 192 if (segspt_minfree == 0) /* leave min 5% of availrmem for */ 193 segspt_minfree = availrmem/20; /* for the system */ 194 195 if (!hat_supported(HAT_SHARED_PT, (void *)0)) 196 return (EINVAL); 197 198 /* 199 * get a new as for this shared memory segment 200 */ 201 newas = as_alloc(); 202 newas->a_proc = NULL; 203 sptcargs.amp = amp; 204 sptcargs.prot = prot; 205 sptcargs.flags = flags; 206 sptcargs.szc = share_szc; 207 /* 208 * create a shared page table (spt) segment 209 */ 210 211 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) { 212 as_free(newas); 213 return (err); 214 } 215 *sptseg = sptcargs.seg_spt; 216 return (0); 217 } 218 219 void 220 sptdestroy(struct as *as, struct anon_map *amp) 221 { 222 223 #ifdef DEBUG 224 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */); 225 #endif 226 (void) as_unmap(as, SEGSPTADDR, amp->size); 227 as_free(as); 228 } 229 230 /* 231 * called from seg_free(). 232 * free (i.e., unlock, unmap, return to free list) 233 * all the pages in the given seg. 234 */ 235 void 236 segspt_free(struct seg *seg) 237 { 238 struct spt_data *sptd = (struct spt_data *)seg->s_data; 239 240 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 241 242 if (sptd != NULL) { 243 if (sptd->spt_realsize) 244 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize); 245 246 if (sptd->spt_ppa_lckcnt) 247 kmem_free(sptd->spt_ppa_lckcnt, 248 sizeof (*sptd->spt_ppa_lckcnt) 249 * btopr(sptd->spt_amp->size)); 250 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp)); 251 cv_destroy(&sptd->spt_cv); 252 mutex_destroy(&sptd->spt_lock); 253 kmem_free(sptd, sizeof (*sptd)); 254 } 255 } 256 257 /*ARGSUSED*/ 258 static int 259 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr, 260 uint_t flags) 261 { 262 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 263 264 return (0); 265 } 266 267 /*ARGSUSED*/ 268 static size_t 269 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec) 270 { 271 caddr_t eo_seg; 272 pgcnt_t npages; 273 struct shm_data *shmd = (struct shm_data *)seg->s_data; 274 struct seg *sptseg; 275 struct spt_data *sptd; 276 277 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 278 #ifdef lint 279 seg = seg; 280 #endif 281 sptseg = shmd->shm_sptseg; 282 sptd = sptseg->s_data; 283 284 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 285 eo_seg = addr + len; 286 while (addr < eo_seg) { 287 /* page exists, and it's locked. 
 */
			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
			    SEG_PAGE_ANON;
			addr += PAGESIZE;
		}
		return (len);
	} else {
		struct anon_map *amp = shmd->shm_amp;
		struct anon	*ap;
		page_t		*pp;
		pgcnt_t		anon_index;
		struct vnode	*vp;
		u_offset_t	off;
		ulong_t		i;
		int		ret;
		anon_sync_obj_t	cookie;

		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
		anon_index = seg_page(seg, addr);
		npages = btopr(len);
		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
			return (EINVAL);
		}
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		for (i = 0; i < npages; i++, anon_index++) {
			ret = 0;
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap != NULL) {
				swap_xlate(ap, &vp, &off);
				anon_array_exit(&cookie);
				pp = page_lookup_nowait(vp, off, SE_SHARED);
				if (pp != NULL) {
					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
					page_unlock(pp);
				}
			} else {
				anon_array_exit(&cookie);
			}
			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
				ret |= SEG_PAGE_LOCKED;
			}
			*vec++ = (char)ret;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return (len);
	}
}

static int
segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
	size_t share_size;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * seg.s_size may have been rounded up to the largest page size
	 * in shmat().
	 * XXX This should be cleaned up. sptdestroy should take a length
	 * argument which should be the same as sptcreate. Then
	 * this rounding would not be needed (or is done in shm.c).
	 * Only the check for the full segment would be needed.
	 *
	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
	 * to be useful at all.
	 */
	share_size = page_get_pagesize(seg->s_szc);
	ssize = P2ROUNDUP(ssize, share_size);

	if (raddr == seg->s_base && ssize == seg->s_size) {
		seg_free(seg);
		return (0);
	} else
		return (EINVAL);
}

int
segspt_create(struct seg *seg, caddr_t argsp)
{
	int		err;
	caddr_t		addr = seg->s_base;
	struct spt_data *sptd;
	struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
	struct anon_map *amp = sptcargs->amp;
	struct kshmid	*sp = amp->a_sp;
	struct cred	*cred = CRED();
	ulong_t		i, j, anon_index = 0;
	pgcnt_t		npages = btopr(amp->size);
	struct vnode	*vp;
	page_t		**ppa;
	uint_t		hat_flags;
	size_t		pgsz;
	pgcnt_t		pgcnt;
	caddr_t		a;
	pgcnt_t		pidx;
	size_t		sz;
	proc_t		*procp = curproc;
	rctl_qty_t	lockedbytes = 0;
	kproject_t	*proj;

	/*
	 * We are holding the a_lock on the underlying dummy as,
	 * so we can make calls to the HAT layer.
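	 *
	 * A rough sketch of how we get here (drawn from sptcreate() above,
	 * not an exhaustive call graph): sptcreate() allocates the dummy
	 * address space with as_alloc() and then calls
	 * as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs),
	 * so as_map() invokes this routine with that dummy as's a_lock
	 * held as writer, which is what the ASSERT below checks.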
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(sp != NULL);

#ifdef DEBUG
	TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
	    tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
#endif
	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if (err = anon_swap_adjust(npages))
			return (err);
	}
	err = ENOMEM;

	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
		goto out1;

	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
		    KM_NOSLEEP)) == NULL)
			goto out2;
	}

	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);

	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
		goto out3;

	seg->s_ops = &segspt_ops;
	sptd->spt_vp = vp;
	sptd->spt_amp = amp;
	sptd->spt_prot = sptcargs->prot;
	sptd->spt_flags = sptcargs->flags;
	seg->s_data = (caddr_t)sptd;
	sptd->spt_ppa = NULL;
	sptd->spt_ppa_lckcnt = NULL;
	seg->s_szc = sptcargs->szc;
	cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
	sptd->spt_gen = 0;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	if (seg->s_szc > amp->a_szc) {
		amp->a_szc = seg->s_szc;
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);

	/*
	 * Set policy to affect initial allocation of pages in
	 * anon_map_createpages()
	 */
	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
	    NULL, 0, ptob(npages));

	if (sptcargs->flags & SHM_PAGEABLE) {
		size_t  share_sz;
		pgcnt_t new_npgs, more_pgs;
		struct anon_hdr *nahp;
		zone_t *zone;

		share_sz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(amp->size, share_sz)) {
			/*
			 * We round the size of the anon array up to a
			 * large-page (share_sz, historically 4 M) boundary
			 * because we always create, lock and fault pages in
			 * full large-page chunks, so we don't have to check
			 * all the corner cases, e.g. whether there is enough
			 * space to allocate a full large page.
			 */
			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
			more_pgs = new_npgs - npages;

			/*
			 * The zone will never be NULL, as a fully created
			 * shm always has an owning zone.
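			 *
			 * Illustrative arithmetic (sizes are hypothetical):
			 * with a 4M share_sz and a 6M DISM segment,
			 * amp->size is rounded up to 8M, so new_npgs covers
			 * 8M worth of pages and more_pgs (the extra 2M of
			 * anon slots) is what must be reserved against this
			 * zone before the anon array is grown.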
			 */
			zone = sp->shm_perm.ipc_zone_ref.zref_zone;
			ASSERT(zone != NULL);
			if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
				err = ENOMEM;
				goto out4;
			}

			nahp = anon_create(new_npgs, ANON_SLEEP);
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
			    ANON_SLEEP);
			anon_release(amp->ahp, npages);
			amp->ahp = nahp;
			ASSERT(amp->swresv == ptob(npages));
			amp->swresv = amp->size = ptob(new_npgs);
			ANON_LOCK_EXIT(&amp->a_rwlock);
			npages = new_npgs;
		}

		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
		sptd->spt_pcachecnt = 0;
		sptd->spt_realsize = ptob(npages);
		sptcargs->seg_spt = seg;
		return (0);
	}

	/*
	 * get array of pages for each anon slot in amp
	 */
	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
	    seg, addr, S_CREATE, cred)) != 0)
		goto out4;

	mutex_enter(&sp->shm_mlock);

	/* May be partially locked, so, count bytes to charge for locking */
	for (i = 0; i < npages; i++)
		if (ppa[i]->p_lckcnt == 0)
			lockedbytes += PAGESIZE;

	proj = sp->shm_perm.ipc_proj;

	if (lockedbytes > 0) {
		mutex_enter(&procp->p_lock);
		if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
			mutex_exit(&procp->p_lock);
			mutex_exit(&sp->shm_mlock);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			err = ENOMEM;
			goto out4;
		}
		mutex_exit(&procp->p_lock);
	}

	/*
	 * addr is initial address corresponding to the first page on ppa list
	 */
	for (i = 0; i < npages; i++) {
		/* attempt to lock all pages */
		if (page_pp_lock(ppa[i], 0, 1) == 0) {
			/*
			 * if unable to lock any page, unlock all
			 * of them and return error
			 */
			for (j = 0; j < i; j++)
				page_pp_unlock(ppa[j], 0, 1);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
			mutex_exit(&sp->shm_mlock);
			err = ENOMEM;
			goto out4;
		}
	}
	mutex_exit(&sp->shm_mlock);

	/*
	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
	 * for the entire life of the segment. For example, platforms
	 * that do not support Dynamic Reconfiguration.
	 */
	hat_flags = HAT_LOAD_SHARE;
	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
		hat_flags |= HAT_LOAD_LOCK;

	/*
	 * Load translations one large page at a time to make sure we
	 * don't create mappings bigger than the segment's size code,
	 * in case the underlying pages are shared with a segvn segment
	 * that uses a bigger size code than we do.
	 */
	pgsz = page_get_pagesize(seg->s_szc);
	pgcnt = page_get_pagecnt(seg->s_szc);
	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
		sz = MIN(pgsz, ptob(npages - pidx));
		hat_memload_array(seg->s_as->a_hat, a, sz,
		    &ppa[pidx], sptd->spt_prot, hat_flags);
	}

	/*
	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
	 * we will leave the pages locked SE_SHARED for the life
	 * of the ISM segment. This will prevent any calls to
	 * hat_pageunload() on this ISM segment for those platforms.
	 */
	if (!(hat_flags & HAT_LOAD_LOCK)) {
		/*
		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
		 * we no longer need to hold the SE_SHARED lock on the pages,
		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
		 * SE_SHARED lock on the pages as necessary.
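		 *
		 * To summarize the two regimes (a paraphrase of the logic
		 * above, not new behavior): without HAT_DYNAMIC_ISM_UNMAP
		 * the translations were loaded with HAT_LOAD_LOCK and the
		 * pages stay SE_SHARED locked until segspt_free_pages();
		 * with it, the pages are unlocked right below and re-locked
		 * on demand by later L_PAGELOCK / F_SOFTLOCK calls.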
581 */ 582 for (i = 0; i < npages; i++) 583 page_unlock(ppa[i]); 584 } 585 sptd->spt_pcachecnt = 0; 586 kmem_free(ppa, ((sizeof (page_t *)) * npages)); 587 sptd->spt_realsize = ptob(npages); 588 atomic_add_long(&spt_used, npages); 589 sptcargs->seg_spt = seg; 590 return (0); 591 592 out4: 593 seg->s_data = NULL; 594 kmem_free(vp, sizeof (*vp)); 595 cv_destroy(&sptd->spt_cv); 596 out3: 597 mutex_destroy(&sptd->spt_lock); 598 if ((sptcargs->flags & SHM_PAGEABLE) == 0) 599 kmem_free(ppa, (sizeof (*ppa) * npages)); 600 out2: 601 kmem_free(sptd, sizeof (*sptd)); 602 out1: 603 if ((sptcargs->flags & SHM_PAGEABLE) == 0) 604 anon_swap_restore(npages); 605 return (err); 606 } 607 608 /*ARGSUSED*/ 609 void 610 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len) 611 { 612 struct page *pp; 613 struct spt_data *sptd = (struct spt_data *)seg->s_data; 614 pgcnt_t npages; 615 ulong_t anon_idx; 616 struct anon_map *amp; 617 struct anon *ap; 618 struct vnode *vp; 619 u_offset_t off; 620 uint_t hat_flags; 621 int root = 0; 622 pgcnt_t pgs, curnpgs = 0; 623 page_t *rootpp; 624 rctl_qty_t unlocked_bytes = 0; 625 kproject_t *proj; 626 kshmid_t *sp; 627 628 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 629 630 len = P2ROUNDUP(len, PAGESIZE); 631 632 npages = btop(len); 633 634 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP; 635 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) || 636 (sptd->spt_flags & SHM_PAGEABLE)) { 637 hat_flags = HAT_UNLOAD_UNMAP; 638 } 639 640 hat_unload(seg->s_as->a_hat, addr, len, hat_flags); 641 642 amp = sptd->spt_amp; 643 if (sptd->spt_flags & SHM_PAGEABLE) 644 npages = btop(amp->size); 645 646 ASSERT(amp != NULL); 647 648 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 649 sp = amp->a_sp; 650 proj = sp->shm_perm.ipc_proj; 651 mutex_enter(&sp->shm_mlock); 652 } 653 for (anon_idx = 0; anon_idx < npages; anon_idx++) { 654 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 655 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { 656 panic("segspt_free_pages: null app"); 657 /*NOTREACHED*/ 658 } 659 } else { 660 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx)) 661 == NULL) 662 continue; 663 } 664 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0); 665 swap_xlate(ap, &vp, &off); 666 667 /* 668 * If this platform supports HAT_DYNAMIC_ISM_UNMAP, 669 * the pages won't be having SE_SHARED lock at this 670 * point. 671 * 672 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, 673 * the pages are still held SE_SHARED locked from the 674 * original segspt_create() 675 * 676 * Our goal is to get SE_EXCL lock on each page, remove 677 * permanent lock on it and invalidate the page. 678 */ 679 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 680 if (hat_flags == HAT_UNLOAD_UNMAP) 681 pp = page_lookup(vp, off, SE_EXCL); 682 else { 683 if ((pp = page_find(vp, off)) == NULL) { 684 panic("segspt_free_pages: " 685 "page not locked"); 686 /*NOTREACHED*/ 687 } 688 if (!page_tryupgrade(pp)) { 689 page_unlock(pp); 690 pp = page_lookup(vp, off, SE_EXCL); 691 } 692 } 693 if (pp == NULL) { 694 panic("segspt_free_pages: " 695 "page not in the system"); 696 /*NOTREACHED*/ 697 } 698 ASSERT(pp->p_lckcnt > 0); 699 page_pp_unlock(pp, 0, 1); 700 if (pp->p_lckcnt == 0) 701 unlocked_bytes += PAGESIZE; 702 } else { 703 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL) 704 continue; 705 } 706 /* 707 * It's logical to invalidate the pages here as in most cases 708 * these were created by segspt. 
709 */ 710 if (pp->p_szc != 0) { 711 if (root == 0) { 712 ASSERT(curnpgs == 0); 713 root = 1; 714 rootpp = pp; 715 pgs = curnpgs = page_get_pagecnt(pp->p_szc); 716 ASSERT(pgs > 1); 717 ASSERT(IS_P2ALIGNED(pgs, pgs)); 718 ASSERT(!(page_pptonum(pp) & (pgs - 1))); 719 curnpgs--; 720 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) { 721 ASSERT(curnpgs == 1); 722 ASSERT(page_pptonum(pp) == 723 page_pptonum(rootpp) + (pgs - 1)); 724 page_destroy_pages(rootpp); 725 root = 0; 726 curnpgs = 0; 727 } else { 728 ASSERT(curnpgs > 1); 729 ASSERT(page_pptonum(pp) == 730 page_pptonum(rootpp) + (pgs - curnpgs)); 731 curnpgs--; 732 } 733 } else { 734 if (root != 0 || curnpgs != 0) { 735 panic("segspt_free_pages: bad large page"); 736 /*NOTREACHED*/ 737 } 738 /* 739 * Before destroying the pages, we need to take care 740 * of the rctl locked memory accounting. For that 741 * we need to calculte the unlocked_bytes. 742 */ 743 if (pp->p_lckcnt > 0) 744 unlocked_bytes += PAGESIZE; 745 /*LINTED: constant in conditional context */ 746 VN_DISPOSE(pp, B_INVAL, 0, kcred); 747 } 748 } 749 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 750 if (unlocked_bytes > 0) 751 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0); 752 mutex_exit(&sp->shm_mlock); 753 } 754 if (root != 0 || curnpgs != 0) { 755 panic("segspt_free_pages: bad large page"); 756 /*NOTREACHED*/ 757 } 758 759 /* 760 * mark that pages have been released 761 */ 762 sptd->spt_realsize = 0; 763 764 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 765 atomic_add_long(&spt_used, -npages); 766 anon_swap_restore(npages); 767 } 768 } 769 770 /* 771 * Get memory allocation policy info for specified address in given segment 772 */ 773 static lgrp_mem_policy_info_t * 774 segspt_getpolicy(struct seg *seg, caddr_t addr) 775 { 776 struct anon_map *amp; 777 ulong_t anon_index; 778 lgrp_mem_policy_info_t *policy_info; 779 struct spt_data *spt_data; 780 781 ASSERT(seg != NULL); 782 783 /* 784 * Get anon_map from segspt 785 * 786 * Assume that no lock needs to be held on anon_map, since 787 * it should be protected by its reference count which must be 788 * nonzero for an existing segment 789 * Need to grab readers lock on policy tree though 790 */ 791 spt_data = (struct spt_data *)seg->s_data; 792 if (spt_data == NULL) 793 return (NULL); 794 amp = spt_data->spt_amp; 795 ASSERT(amp->refcnt != 0); 796 797 /* 798 * Get policy info 799 * 800 * Assume starting anon index of 0 801 */ 802 anon_index = seg_page(seg, addr); 803 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); 804 805 return (policy_info); 806 } 807 808 /* 809 * DISM only. 810 * Return locked pages over a given range. 811 * 812 * We will cache all DISM locked pages and save the pplist for the 813 * entire segment in the ppa field of the underlying DISM segment structure. 814 * Later, during a call to segspt_reclaim() we will use this ppa array 815 * to page_unlock() all of the pages and then we will free this ppa list. 
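 *
 * A sketch of the expected call pattern (assumed from the seg_p* cache
 * interfaces used below, not a formal spec):
 *
 *	as_pagelock()   -> segspt_shmpagelock() -> segspt_dismpagelock()
 *			   with type == L_PAGELOCK; returns slots of
 *			   sptd->spt_ppa
 *	as_pageunlock() -> segspt_dismpagelock() with type == L_PAGEUNLOCK,
 *			   which calls seg_pinactive(); the cached pplist is
 *			   eventually released via segspt_reclaim()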
816 */ 817 /*ARGSUSED*/ 818 static int 819 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len, 820 struct page ***ppp, enum lock_type type, enum seg_rw rw) 821 { 822 struct shm_data *shmd = (struct shm_data *)seg->s_data; 823 struct seg *sptseg = shmd->shm_sptseg; 824 struct spt_data *sptd = sptseg->s_data; 825 pgcnt_t pg_idx, npages, tot_npages, npgs; 826 struct page **pplist, **pl, **ppa, *pp; 827 struct anon_map *amp; 828 spgcnt_t an_idx; 829 int ret = ENOTSUP; 830 uint_t pl_built = 0; 831 struct anon *ap; 832 struct vnode *vp; 833 u_offset_t off; 834 pgcnt_t claim_availrmem = 0; 835 uint_t szc; 836 837 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 838 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); 839 840 /* 841 * We want to lock/unlock the entire ISM segment. Therefore, 842 * we will be using the underlying sptseg and it's base address 843 * and length for the caching arguments. 844 */ 845 ASSERT(sptseg); 846 ASSERT(sptd); 847 848 pg_idx = seg_page(seg, addr); 849 npages = btopr(len); 850 851 /* 852 * check if the request is larger than number of pages covered 853 * by amp 854 */ 855 if (pg_idx + npages > btopr(sptd->spt_amp->size)) { 856 *ppp = NULL; 857 return (ENOTSUP); 858 } 859 860 if (type == L_PAGEUNLOCK) { 861 ASSERT(sptd->spt_ppa != NULL); 862 863 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, 864 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 865 866 /* 867 * If someone is blocked while unmapping, we purge 868 * segment page cache and thus reclaim pplist synchronously 869 * without waiting for seg_pasync_thread. This speeds up 870 * unmapping in cases where munmap(2) is called, while 871 * raw async i/o is still in progress or where a thread 872 * exits on data fault in a multithreaded application. 873 */ 874 if ((sptd->spt_flags & DISM_PPA_CHANGED) || 875 (AS_ISUNMAPWAIT(seg->s_as) && 876 shmd->shm_softlockcnt > 0)) { 877 segspt_purge(seg); 878 } 879 return (0); 880 } 881 882 /* The L_PAGELOCK case ... */ 883 884 if (sptd->spt_flags & DISM_PPA_CHANGED) { 885 segspt_purge(seg); 886 /* 887 * for DISM ppa needs to be rebuild since 888 * number of locked pages could be changed 889 */ 890 *ppp = NULL; 891 return (ENOTSUP); 892 } 893 894 /* 895 * First try to find pages in segment page cache, without 896 * holding the segment lock. 897 */ 898 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, 899 S_WRITE, SEGP_FORCE_WIRED); 900 if (pplist != NULL) { 901 ASSERT(sptd->spt_ppa != NULL); 902 ASSERT(sptd->spt_ppa == pplist); 903 ppa = sptd->spt_ppa; 904 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { 905 if (ppa[an_idx] == NULL) { 906 seg_pinactive(seg, NULL, seg->s_base, 907 sptd->spt_amp->size, ppa, 908 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 909 *ppp = NULL; 910 return (ENOTSUP); 911 } 912 if ((szc = ppa[an_idx]->p_szc) != 0) { 913 npgs = page_get_pagecnt(szc); 914 an_idx = P2ROUNDUP(an_idx + 1, npgs); 915 } else { 916 an_idx++; 917 } 918 } 919 /* 920 * Since we cache the entire DISM segment, we want to 921 * set ppp to point to the first slot that corresponds 922 * to the requested addr, i.e. pg_idx. 
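	 *
	 * For example (illustrative numbers): if the caller's addr is
	 * 3 pages past seg->s_base, then pg_idx == seg_page(seg, addr) == 3
	 * and *ppp ends up as &sptd->spt_ppa[3], even though the lookup
	 * above was done over the whole segment.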
923 */ 924 *ppp = &(sptd->spt_ppa[pg_idx]); 925 return (0); 926 } 927 928 mutex_enter(&sptd->spt_lock); 929 /* 930 * try to find pages in segment page cache with mutex 931 */ 932 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, 933 S_WRITE, SEGP_FORCE_WIRED); 934 if (pplist != NULL) { 935 ASSERT(sptd->spt_ppa != NULL); 936 ASSERT(sptd->spt_ppa == pplist); 937 ppa = sptd->spt_ppa; 938 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { 939 if (ppa[an_idx] == NULL) { 940 mutex_exit(&sptd->spt_lock); 941 seg_pinactive(seg, NULL, seg->s_base, 942 sptd->spt_amp->size, ppa, 943 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 944 *ppp = NULL; 945 return (ENOTSUP); 946 } 947 if ((szc = ppa[an_idx]->p_szc) != 0) { 948 npgs = page_get_pagecnt(szc); 949 an_idx = P2ROUNDUP(an_idx + 1, npgs); 950 } else { 951 an_idx++; 952 } 953 } 954 /* 955 * Since we cache the entire DISM segment, we want to 956 * set ppp to point to the first slot that corresponds 957 * to the requested addr, i.e. pg_idx. 958 */ 959 mutex_exit(&sptd->spt_lock); 960 *ppp = &(sptd->spt_ppa[pg_idx]); 961 return (0); 962 } 963 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, 964 SEGP_FORCE_WIRED) == SEGP_FAIL) { 965 mutex_exit(&sptd->spt_lock); 966 *ppp = NULL; 967 return (ENOTSUP); 968 } 969 970 /* 971 * No need to worry about protections because DISM pages are always rw. 972 */ 973 pl = pplist = NULL; 974 amp = sptd->spt_amp; 975 976 /* 977 * Do we need to build the ppa array? 978 */ 979 if (sptd->spt_ppa == NULL) { 980 pgcnt_t lpg_cnt = 0; 981 982 pl_built = 1; 983 tot_npages = btopr(sptd->spt_amp->size); 984 985 ASSERT(sptd->spt_pcachecnt == 0); 986 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP); 987 pl = pplist; 988 989 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 990 for (an_idx = 0; an_idx < tot_npages; ) { 991 ap = anon_get_ptr(amp->ahp, an_idx); 992 /* 993 * Cache only mlocked pages. For large pages 994 * if one (constituent) page is mlocked 995 * all pages for that large page 996 * are cached also. This is for quick 997 * lookups of ppa array; 998 */ 999 if ((ap != NULL) && (lpg_cnt != 0 || 1000 (sptd->spt_ppa_lckcnt[an_idx] != 0))) { 1001 1002 swap_xlate(ap, &vp, &off); 1003 pp = page_lookup(vp, off, SE_SHARED); 1004 ASSERT(pp != NULL); 1005 if (lpg_cnt == 0) { 1006 lpg_cnt++; 1007 /* 1008 * For a small page, we are done -- 1009 * lpg_count is reset to 0 below. 1010 * 1011 * For a large page, we are guaranteed 1012 * to find the anon structures of all 1013 * constituent pages and a non-zero 1014 * lpg_cnt ensures that we don't test 1015 * for mlock for these. We are done 1016 * when lpg_count reaches (npgs + 1). 1017 * If we are not the first constituent 1018 * page, restart at the first one. 
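					 *
					 * Small worked example (counts are
					 * hypothetical): for a large page of
					 * npgs == 8 constituents, finding
					 * an_idx == 5 locked means we realign
					 * with P2ALIGN(5, 8) == 0 and restart
					 * at the first constituent, so all 8
					 * slots of that large page get cached
					 * together.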
1019 */ 1020 npgs = page_get_pagecnt(pp->p_szc); 1021 if (!IS_P2ALIGNED(an_idx, npgs)) { 1022 an_idx = P2ALIGN(an_idx, npgs); 1023 page_unlock(pp); 1024 continue; 1025 } 1026 } 1027 if (++lpg_cnt > npgs) 1028 lpg_cnt = 0; 1029 1030 /* 1031 * availrmem is decremented only 1032 * for unlocked pages 1033 */ 1034 if (sptd->spt_ppa_lckcnt[an_idx] == 0) 1035 claim_availrmem++; 1036 pplist[an_idx] = pp; 1037 } 1038 an_idx++; 1039 } 1040 ANON_LOCK_EXIT(&->a_rwlock); 1041 1042 if (claim_availrmem) { 1043 mutex_enter(&freemem_lock); 1044 if (availrmem < tune.t_minarmem + claim_availrmem) { 1045 mutex_exit(&freemem_lock); 1046 ret = ENOTSUP; 1047 claim_availrmem = 0; 1048 goto insert_fail; 1049 } else { 1050 availrmem -= claim_availrmem; 1051 } 1052 mutex_exit(&freemem_lock); 1053 } 1054 1055 sptd->spt_ppa = pl; 1056 } else { 1057 /* 1058 * We already have a valid ppa[]. 1059 */ 1060 pl = sptd->spt_ppa; 1061 } 1062 1063 ASSERT(pl != NULL); 1064 1065 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, 1066 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, 1067 segspt_reclaim); 1068 if (ret == SEGP_FAIL) { 1069 /* 1070 * seg_pinsert failed. We return 1071 * ENOTSUP, so that the as_pagelock() code will 1072 * then try the slower F_SOFTLOCK path. 1073 */ 1074 if (pl_built) { 1075 /* 1076 * No one else has referenced the ppa[]. 1077 * We created it and we need to destroy it. 1078 */ 1079 sptd->spt_ppa = NULL; 1080 } 1081 ret = ENOTSUP; 1082 goto insert_fail; 1083 } 1084 1085 /* 1086 * In either case, we increment softlockcnt on the 'real' segment. 1087 */ 1088 sptd->spt_pcachecnt++; 1089 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 1090 1091 ppa = sptd->spt_ppa; 1092 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { 1093 if (ppa[an_idx] == NULL) { 1094 mutex_exit(&sptd->spt_lock); 1095 seg_pinactive(seg, NULL, seg->s_base, 1096 sptd->spt_amp->size, 1097 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 1098 *ppp = NULL; 1099 return (ENOTSUP); 1100 } 1101 if ((szc = ppa[an_idx]->p_szc) != 0) { 1102 npgs = page_get_pagecnt(szc); 1103 an_idx = P2ROUNDUP(an_idx + 1, npgs); 1104 } else { 1105 an_idx++; 1106 } 1107 } 1108 /* 1109 * We can now drop the sptd->spt_lock since the ppa[] 1110 * exists and he have incremented pacachecnt. 1111 */ 1112 mutex_exit(&sptd->spt_lock); 1113 1114 /* 1115 * Since we cache the entire segment, we want to 1116 * set ppp to point to the first slot that corresponds 1117 * to the requested addr, i.e. pg_idx. 1118 */ 1119 *ppp = &(sptd->spt_ppa[pg_idx]); 1120 return (0); 1121 1122 insert_fail: 1123 /* 1124 * We will only reach this code if we tried and failed. 1125 * 1126 * And we can drop the lock on the dummy seg, once we've failed 1127 * to set up a new ppa[]. 1128 */ 1129 mutex_exit(&sptd->spt_lock); 1130 1131 if (pl_built) { 1132 if (claim_availrmem) { 1133 mutex_enter(&freemem_lock); 1134 availrmem += claim_availrmem; 1135 mutex_exit(&freemem_lock); 1136 } 1137 1138 /* 1139 * We created pl and we need to destroy it. 
1140 */ 1141 pplist = pl; 1142 for (an_idx = 0; an_idx < tot_npages; an_idx++) { 1143 if (pplist[an_idx] != NULL) 1144 page_unlock(pplist[an_idx]); 1145 } 1146 kmem_free(pl, sizeof (page_t *) * tot_npages); 1147 } 1148 1149 if (shmd->shm_softlockcnt <= 0) { 1150 if (AS_ISUNMAPWAIT(seg->s_as)) { 1151 mutex_enter(&seg->s_as->a_contents); 1152 if (AS_ISUNMAPWAIT(seg->s_as)) { 1153 AS_CLRUNMAPWAIT(seg->s_as); 1154 cv_broadcast(&seg->s_as->a_cv); 1155 } 1156 mutex_exit(&seg->s_as->a_contents); 1157 } 1158 } 1159 *ppp = NULL; 1160 return (ret); 1161 } 1162 1163 1164 1165 /* 1166 * return locked pages over a given range. 1167 * 1168 * We will cache the entire ISM segment and save the pplist for the 1169 * entire segment in the ppa field of the underlying ISM segment structure. 1170 * Later, during a call to segspt_reclaim() we will use this ppa array 1171 * to page_unlock() all of the pages and then we will free this ppa list. 1172 */ 1173 /*ARGSUSED*/ 1174 static int 1175 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len, 1176 struct page ***ppp, enum lock_type type, enum seg_rw rw) 1177 { 1178 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1179 struct seg *sptseg = shmd->shm_sptseg; 1180 struct spt_data *sptd = sptseg->s_data; 1181 pgcnt_t np, page_index, npages; 1182 caddr_t a, spt_base; 1183 struct page **pplist, **pl, *pp; 1184 struct anon_map *amp; 1185 ulong_t anon_index; 1186 int ret = ENOTSUP; 1187 uint_t pl_built = 0; 1188 struct anon *ap; 1189 struct vnode *vp; 1190 u_offset_t off; 1191 1192 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1193 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); 1194 1195 1196 /* 1197 * We want to lock/unlock the entire ISM segment. Therefore, 1198 * we will be using the underlying sptseg and it's base address 1199 * and length for the caching arguments. 1200 */ 1201 ASSERT(sptseg); 1202 ASSERT(sptd); 1203 1204 if (sptd->spt_flags & SHM_PAGEABLE) { 1205 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw)); 1206 } 1207 1208 page_index = seg_page(seg, addr); 1209 npages = btopr(len); 1210 1211 /* 1212 * check if the request is larger than number of pages covered 1213 * by amp 1214 */ 1215 if (page_index + npages > btopr(sptd->spt_amp->size)) { 1216 *ppp = NULL; 1217 return (ENOTSUP); 1218 } 1219 1220 if (type == L_PAGEUNLOCK) { 1221 1222 ASSERT(sptd->spt_ppa != NULL); 1223 1224 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size, 1225 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim); 1226 1227 /* 1228 * If someone is blocked while unmapping, we purge 1229 * segment page cache and thus reclaim pplist synchronously 1230 * without waiting for seg_pasync_thread. This speeds up 1231 * unmapping in cases where munmap(2) is called, while 1232 * raw async i/o is still in progress or where a thread 1233 * exits on data fault in a multithreaded application. 1234 */ 1235 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { 1236 segspt_purge(seg); 1237 } 1238 return (0); 1239 } 1240 1241 /* The L_PAGELOCK case... */ 1242 1243 /* 1244 * First try to find pages in segment page cache, without 1245 * holding the segment lock. 1246 */ 1247 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, 1248 S_WRITE, SEGP_FORCE_WIRED); 1249 if (pplist != NULL) { 1250 ASSERT(sptd->spt_ppa == pplist); 1251 ASSERT(sptd->spt_ppa[page_index]); 1252 /* 1253 * Since we cache the entire ISM segment, we want to 1254 * set ppp to point to the first slot that corresponds 1255 * to the requested addr, i.e. 
page_index. 1256 */ 1257 *ppp = &(sptd->spt_ppa[page_index]); 1258 return (0); 1259 } 1260 1261 mutex_enter(&sptd->spt_lock); 1262 1263 /* 1264 * try to find pages in segment page cache 1265 */ 1266 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size, 1267 S_WRITE, SEGP_FORCE_WIRED); 1268 if (pplist != NULL) { 1269 ASSERT(sptd->spt_ppa == pplist); 1270 /* 1271 * Since we cache the entire segment, we want to 1272 * set ppp to point to the first slot that corresponds 1273 * to the requested addr, i.e. page_index. 1274 */ 1275 mutex_exit(&sptd->spt_lock); 1276 *ppp = &(sptd->spt_ppa[page_index]); 1277 return (0); 1278 } 1279 1280 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size, 1281 SEGP_FORCE_WIRED) == SEGP_FAIL) { 1282 mutex_exit(&sptd->spt_lock); 1283 *ppp = NULL; 1284 return (ENOTSUP); 1285 } 1286 1287 /* 1288 * No need to worry about protections because ISM pages 1289 * are always rw. 1290 */ 1291 pl = pplist = NULL; 1292 1293 /* 1294 * Do we need to build the ppa array? 1295 */ 1296 if (sptd->spt_ppa == NULL) { 1297 ASSERT(sptd->spt_ppa == pplist); 1298 1299 spt_base = sptseg->s_base; 1300 pl_built = 1; 1301 1302 /* 1303 * availrmem is decremented once during anon_swap_adjust() 1304 * and is incremented during the anon_unresv(), which is 1305 * called from shm_rm_amp() when the segment is destroyed. 1306 */ 1307 amp = sptd->spt_amp; 1308 ASSERT(amp != NULL); 1309 1310 /* pcachecnt is protected by sptd->spt_lock */ 1311 ASSERT(sptd->spt_pcachecnt == 0); 1312 pplist = kmem_zalloc(sizeof (page_t *) 1313 * btopr(sptd->spt_amp->size), KM_SLEEP); 1314 pl = pplist; 1315 1316 anon_index = seg_page(sptseg, spt_base); 1317 1318 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1319 for (a = spt_base; a < (spt_base + sptd->spt_amp->size); 1320 a += PAGESIZE, anon_index++, pplist++) { 1321 ap = anon_get_ptr(amp->ahp, anon_index); 1322 ASSERT(ap != NULL); 1323 swap_xlate(ap, &vp, &off); 1324 pp = page_lookup(vp, off, SE_SHARED); 1325 ASSERT(pp != NULL); 1326 *pplist = pp; 1327 } 1328 ANON_LOCK_EXIT(&->a_rwlock); 1329 1330 if (a < (spt_base + sptd->spt_amp->size)) { 1331 ret = ENOTSUP; 1332 goto insert_fail; 1333 } 1334 sptd->spt_ppa = pl; 1335 } else { 1336 /* 1337 * We already have a valid ppa[]. 1338 */ 1339 pl = sptd->spt_ppa; 1340 } 1341 1342 ASSERT(pl != NULL); 1343 1344 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size, 1345 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED, 1346 segspt_reclaim); 1347 if (ret == SEGP_FAIL) { 1348 /* 1349 * seg_pinsert failed. We return 1350 * ENOTSUP, so that the as_pagelock() code will 1351 * then try the slower F_SOFTLOCK path. 1352 */ 1353 if (pl_built) { 1354 /* 1355 * No one else has referenced the ppa[]. 1356 * We created it and we need to destroy it. 1357 */ 1358 sptd->spt_ppa = NULL; 1359 } 1360 ret = ENOTSUP; 1361 goto insert_fail; 1362 } 1363 1364 /* 1365 * In either case, we increment softlockcnt on the 'real' segment. 1366 */ 1367 sptd->spt_pcachecnt++; 1368 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 1369 1370 /* 1371 * We can now drop the sptd->spt_lock since the ppa[] 1372 * exists and he have incremented pacachecnt. 1373 */ 1374 mutex_exit(&sptd->spt_lock); 1375 1376 /* 1377 * Since we cache the entire segment, we want to 1378 * set ppp to point to the first slot that corresponds 1379 * to the requested addr, i.e. page_index. 1380 */ 1381 *ppp = &(sptd->spt_ppa[page_index]); 1382 return (0); 1383 1384 insert_fail: 1385 /* 1386 * We will only reach this code if we tried and failed. 
1387 * 1388 * And we can drop the lock on the dummy seg, once we've failed 1389 * to set up a new ppa[]. 1390 */ 1391 mutex_exit(&sptd->spt_lock); 1392 1393 if (pl_built) { 1394 /* 1395 * We created pl and we need to destroy it. 1396 */ 1397 pplist = pl; 1398 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT); 1399 while (np) { 1400 page_unlock(*pplist); 1401 np--; 1402 pplist++; 1403 } 1404 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size)); 1405 } 1406 if (shmd->shm_softlockcnt <= 0) { 1407 if (AS_ISUNMAPWAIT(seg->s_as)) { 1408 mutex_enter(&seg->s_as->a_contents); 1409 if (AS_ISUNMAPWAIT(seg->s_as)) { 1410 AS_CLRUNMAPWAIT(seg->s_as); 1411 cv_broadcast(&seg->s_as->a_cv); 1412 } 1413 mutex_exit(&seg->s_as->a_contents); 1414 } 1415 } 1416 *ppp = NULL; 1417 return (ret); 1418 } 1419 1420 /* 1421 * purge any cached pages in the I/O page cache 1422 */ 1423 static void 1424 segspt_purge(struct seg *seg) 1425 { 1426 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED); 1427 } 1428 1429 static int 1430 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, 1431 enum seg_rw rw, int async) 1432 { 1433 struct seg *seg = (struct seg *)ptag; 1434 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1435 struct seg *sptseg; 1436 struct spt_data *sptd; 1437 pgcnt_t npages, i, free_availrmem = 0; 1438 int done = 0; 1439 1440 #ifdef lint 1441 addr = addr; 1442 #endif 1443 sptseg = shmd->shm_sptseg; 1444 sptd = sptseg->s_data; 1445 npages = (len >> PAGESHIFT); 1446 ASSERT(npages); 1447 ASSERT(sptd->spt_pcachecnt != 0); 1448 ASSERT(sptd->spt_ppa == pplist); 1449 ASSERT(npages == btopr(sptd->spt_amp->size)); 1450 ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1451 1452 /* 1453 * Acquire the lock on the dummy seg and destroy the 1454 * ppa array IF this is the last pcachecnt. 1455 */ 1456 mutex_enter(&sptd->spt_lock); 1457 if (--sptd->spt_pcachecnt == 0) { 1458 for (i = 0; i < npages; i++) { 1459 if (pplist[i] == NULL) { 1460 continue; 1461 } 1462 if (rw == S_WRITE) { 1463 hat_setrefmod(pplist[i]); 1464 } else { 1465 hat_setref(pplist[i]); 1466 } 1467 if ((sptd->spt_flags & SHM_PAGEABLE) && 1468 (sptd->spt_ppa_lckcnt[i] == 0)) 1469 free_availrmem++; 1470 page_unlock(pplist[i]); 1471 } 1472 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) { 1473 mutex_enter(&freemem_lock); 1474 availrmem += free_availrmem; 1475 mutex_exit(&freemem_lock); 1476 } 1477 /* 1478 * Since we want to cach/uncache the entire ISM segment, 1479 * we will track the pplist in a segspt specific field 1480 * ppa, that is initialized at the time we add an entry to 1481 * the cache. 1482 */ 1483 ASSERT(sptd->spt_pcachecnt == 0); 1484 kmem_free(pplist, sizeof (page_t *) * npages); 1485 sptd->spt_ppa = NULL; 1486 sptd->spt_flags &= ~DISM_PPA_CHANGED; 1487 sptd->spt_gen++; 1488 cv_broadcast(&sptd->spt_cv); 1489 done = 1; 1490 } 1491 mutex_exit(&sptd->spt_lock); 1492 1493 /* 1494 * If we are pcache async thread or called via seg_ppurge_wiredpp() we 1495 * may not hold AS lock (in this case async argument is not 0). This 1496 * means if softlockcnt drops to 0 after the decrement below address 1497 * space may get freed. We can't allow it since after softlock 1498 * derement to 0 we still need to access as structure for possible 1499 * wakeup of unmap waiters. To prevent the disappearance of as we take 1500 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes 1501 * this mutex as a barrier to make sure this routine completes before 1502 * segment is freed. 
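	 *
	 * (A hypothetical interleaving this guards against: the async
	 * reclaim drops softlockcnt to 0, segspt_shmfree() then frees the
	 * shm_data, and only afterwards would this routine have touched
	 * seg->s_as; holding shm_segfree_syncmtx across both paths rules
	 * that ordering out.)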
1503 * 1504 * The second complication we have to deal with in async case is a 1505 * possibility of missed wake up of unmap wait thread. When we don't 1506 * hold as lock here we may take a_contents lock before unmap wait 1507 * thread that was first to see softlockcnt was still not 0. As a 1508 * result we'll fail to wake up an unmap wait thread. To avoid this 1509 * race we set nounmapwait flag in as structure if we drop softlockcnt 1510 * to 0 if async is not 0. unmapwait thread 1511 * will not block if this flag is set. 1512 */ 1513 if (async) 1514 mutex_enter(&shmd->shm_segfree_syncmtx); 1515 1516 /* 1517 * Now decrement softlockcnt. 1518 */ 1519 ASSERT(shmd->shm_softlockcnt > 0); 1520 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 1521 1522 if (shmd->shm_softlockcnt <= 0) { 1523 if (async || AS_ISUNMAPWAIT(seg->s_as)) { 1524 mutex_enter(&seg->s_as->a_contents); 1525 if (async) 1526 AS_SETNOUNMAPWAIT(seg->s_as); 1527 if (AS_ISUNMAPWAIT(seg->s_as)) { 1528 AS_CLRUNMAPWAIT(seg->s_as); 1529 cv_broadcast(&seg->s_as->a_cv); 1530 } 1531 mutex_exit(&seg->s_as->a_contents); 1532 } 1533 } 1534 1535 if (async) 1536 mutex_exit(&shmd->shm_segfree_syncmtx); 1537 1538 return (done); 1539 } 1540 1541 /* 1542 * Do a F_SOFTUNLOCK call over the range requested. 1543 * The range must have already been F_SOFTLOCK'ed. 1544 * 1545 * The calls to acquire and release the anon map lock mutex were 1546 * removed in order to avoid a deadly embrace during a DR 1547 * memory delete operation. (Eg. DR blocks while waiting for a 1548 * exclusive lock on a page that is being used for kaio; the 1549 * thread that will complete the kaio and call segspt_softunlock 1550 * blocks on the anon map lock; another thread holding the anon 1551 * map lock blocks on another page lock via the segspt_shmfault 1552 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.) 1553 * 1554 * The appropriateness of the removal is based upon the following: 1555 * 1. If we are holding a segment's reader lock and the page is held 1556 * shared, then the corresponding element in anonmap which points to 1557 * anon struct cannot change and there is no need to acquire the 1558 * anonymous map lock. 1559 * 2. Threads in segspt_softunlock have a reader lock on the segment 1560 * and already have the shared page lock, so we are guaranteed that 1561 * the anon map slot cannot change and therefore can call anon_get_ptr() 1562 * without grabbing the anonymous map lock. 1563 * 3. Threads that softlock a shared page break copy-on-write, even if 1564 * its a read. Thus cow faults can be ignored with respect to soft 1565 * unlocking, since the breaking of cow means that the anon slot(s) will 1566 * not be shared. 1567 */ 1568 static void 1569 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr, 1570 size_t len, enum seg_rw rw) 1571 { 1572 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1573 struct seg *sptseg; 1574 struct spt_data *sptd; 1575 page_t *pp; 1576 caddr_t adr; 1577 struct vnode *vp; 1578 u_offset_t offset; 1579 ulong_t anon_index; 1580 struct anon_map *amp; /* XXX - for locknest */ 1581 struct anon *ap = NULL; 1582 pgcnt_t npages; 1583 1584 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1585 1586 sptseg = shmd->shm_sptseg; 1587 sptd = sptseg->s_data; 1588 1589 /* 1590 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK 1591 * and therefore their pages are SE_SHARED locked 1592 * for the entire life of the segment. 
1593 */ 1594 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) && 1595 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) { 1596 goto softlock_decrement; 1597 } 1598 1599 /* 1600 * Any thread is free to do a page_find and 1601 * page_unlock() on the pages within this seg. 1602 * 1603 * We are already holding the as->a_lock on the user's 1604 * real segment, but we need to hold the a_lock on the 1605 * underlying dummy as. This is mostly to satisfy the 1606 * underlying HAT layer. 1607 */ 1608 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); 1609 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len); 1610 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); 1611 1612 amp = sptd->spt_amp; 1613 ASSERT(amp != NULL); 1614 anon_index = seg_page(sptseg, sptseg_addr); 1615 1616 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) { 1617 ap = anon_get_ptr(amp->ahp, anon_index++); 1618 ASSERT(ap != NULL); 1619 swap_xlate(ap, &vp, &offset); 1620 1621 /* 1622 * Use page_find() instead of page_lookup() to 1623 * find the page since we know that it has a 1624 * "shared" lock. 1625 */ 1626 pp = page_find(vp, offset); 1627 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1)); 1628 if (pp == NULL) { 1629 panic("segspt_softunlock: " 1630 "addr %p, ap %p, vp %p, off %llx", 1631 (void *)adr, (void *)ap, (void *)vp, offset); 1632 /*NOTREACHED*/ 1633 } 1634 1635 if (rw == S_WRITE) { 1636 hat_setrefmod(pp); 1637 } else if (rw != S_OTHER) { 1638 hat_setref(pp); 1639 } 1640 page_unlock(pp); 1641 } 1642 1643 softlock_decrement: 1644 npages = btopr(len); 1645 ASSERT(shmd->shm_softlockcnt >= npages); 1646 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages); 1647 if (shmd->shm_softlockcnt == 0) { 1648 /* 1649 * All SOFTLOCKS are gone. Wakeup any waiting 1650 * unmappers so they can try again to unmap. 1651 * Check for waiters first without the mutex 1652 * held so we don't always grab the mutex on 1653 * softunlocks. 
1654 */ 1655 if (AS_ISUNMAPWAIT(seg->s_as)) { 1656 mutex_enter(&seg->s_as->a_contents); 1657 if (AS_ISUNMAPWAIT(seg->s_as)) { 1658 AS_CLRUNMAPWAIT(seg->s_as); 1659 cv_broadcast(&seg->s_as->a_cv); 1660 } 1661 mutex_exit(&seg->s_as->a_contents); 1662 } 1663 } 1664 } 1665 1666 int 1667 segspt_shmattach(struct seg *seg, caddr_t *argsp) 1668 { 1669 struct shm_data *shmd_arg = (struct shm_data *)argsp; 1670 struct shm_data *shmd; 1671 struct anon_map *shm_amp = shmd_arg->shm_amp; 1672 struct spt_data *sptd; 1673 int error = 0; 1674 1675 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1676 1677 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP); 1678 if (shmd == NULL) 1679 return (ENOMEM); 1680 1681 shmd->shm_sptas = shmd_arg->shm_sptas; 1682 shmd->shm_amp = shm_amp; 1683 shmd->shm_sptseg = shmd_arg->shm_sptseg; 1684 1685 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0, 1686 NULL, 0, seg->s_size); 1687 1688 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); 1689 1690 seg->s_data = (void *)shmd; 1691 seg->s_ops = &segspt_shmops; 1692 seg->s_szc = shmd->shm_sptseg->s_szc; 1693 sptd = shmd->shm_sptseg->s_data; 1694 1695 if (sptd->spt_flags & SHM_PAGEABLE) { 1696 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size), 1697 KM_NOSLEEP)) == NULL) { 1698 seg->s_data = (void *)NULL; 1699 kmem_free(shmd, (sizeof (*shmd))); 1700 return (ENOMEM); 1701 } 1702 shmd->shm_lckpgs = 0; 1703 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 1704 if ((error = hat_share(seg->s_as->a_hat, seg->s_base, 1705 shmd_arg->shm_sptas->a_hat, SEGSPTADDR, 1706 seg->s_size, seg->s_szc)) != 0) { 1707 kmem_free(shmd->shm_vpage, 1708 btopr(shm_amp->size)); 1709 } 1710 } 1711 } else { 1712 error = hat_share(seg->s_as->a_hat, seg->s_base, 1713 shmd_arg->shm_sptas->a_hat, SEGSPTADDR, 1714 seg->s_size, seg->s_szc); 1715 } 1716 if (error) { 1717 seg->s_szc = 0; 1718 seg->s_data = (void *)NULL; 1719 kmem_free(shmd, (sizeof (*shmd))); 1720 } else { 1721 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); 1722 shm_amp->refcnt++; 1723 ANON_LOCK_EXIT(&shm_amp->a_rwlock); 1724 } 1725 return (error); 1726 } 1727 1728 int 1729 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize) 1730 { 1731 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1732 int reclaim = 1; 1733 1734 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1735 retry: 1736 if (shmd->shm_softlockcnt > 0) { 1737 if (reclaim == 1) { 1738 segspt_purge(seg); 1739 reclaim = 0; 1740 goto retry; 1741 } 1742 return (EAGAIN); 1743 } 1744 1745 if (ssize != seg->s_size) { 1746 #ifdef DEBUG 1747 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n", 1748 ssize, seg->s_size); 1749 #endif 1750 return (EINVAL); 1751 } 1752 1753 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK, 1754 NULL, 0); 1755 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc); 1756 1757 seg_free(seg); 1758 1759 return (0); 1760 } 1761 1762 void 1763 segspt_shmfree(struct seg *seg) 1764 { 1765 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1766 struct anon_map *shm_amp = shmd->shm_amp; 1767 1768 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1769 1770 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0, 1771 MC_UNLOCK, NULL, 0); 1772 1773 /* 1774 * Need to increment refcnt when attaching 1775 * and decrement when detaching because of dup(). 
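	 *
	 * Illustrative lifetime (the counts are an example, not asserted
	 * here): the first shmat() attach bumps shm_amp->refcnt in
	 * segspt_shmattach(), a fork()/dup of the mapping attaches and
	 * bumps it again, and each corresponding detach ends up here and
	 * drops it by one.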
1776 */ 1777 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER); 1778 shm_amp->refcnt--; 1779 ANON_LOCK_EXIT(&shm_amp->a_rwlock); 1780 1781 if (shmd->shm_vpage) { /* only for DISM */ 1782 kmem_free(shmd->shm_vpage, btopr(shm_amp->size)); 1783 shmd->shm_vpage = NULL; 1784 } 1785 1786 /* 1787 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's 1788 * still working with this segment without holding as lock. 1789 */ 1790 ASSERT(shmd->shm_softlockcnt == 0); 1791 mutex_enter(&shmd->shm_segfree_syncmtx); 1792 mutex_destroy(&shmd->shm_segfree_syncmtx); 1793 1794 kmem_free(shmd, sizeof (*shmd)); 1795 } 1796 1797 /*ARGSUSED*/ 1798 int 1799 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 1800 { 1801 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1802 1803 /* 1804 * Shared page table is more than shared mapping. 1805 * Individual process sharing page tables can't change prot 1806 * because there is only one set of page tables. 1807 * This will be allowed after private page table is 1808 * supported. 1809 */ 1810 /* need to return correct status error? */ 1811 return (0); 1812 } 1813 1814 1815 faultcode_t 1816 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr, 1817 size_t len, enum fault_type type, enum seg_rw rw) 1818 { 1819 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1820 struct seg *sptseg = shmd->shm_sptseg; 1821 struct as *curspt = shmd->shm_sptas; 1822 struct spt_data *sptd = sptseg->s_data; 1823 pgcnt_t npages; 1824 size_t size; 1825 caddr_t segspt_addr, shm_addr; 1826 page_t **ppa; 1827 int i; 1828 ulong_t an_idx = 0; 1829 int err = 0; 1830 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0); 1831 size_t pgsz; 1832 pgcnt_t pgcnt; 1833 caddr_t a; 1834 pgcnt_t pidx; 1835 1836 #ifdef lint 1837 hat = hat; 1838 #endif 1839 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1840 1841 /* 1842 * Because of the way spt is implemented 1843 * the realsize of the segment does not have to be 1844 * equal to the segment size itself. The segment size is 1845 * often in multiples of a page size larger than PAGESIZE. 1846 * The realsize is rounded up to the nearest PAGESIZE 1847 * based on what the user requested. This is a bit of 1848 * ungliness that is historical but not easily fixed 1849 * without re-designing the higher levels of ISM. 1850 */ 1851 ASSERT(addr >= seg->s_base); 1852 if (((addr + len) - seg->s_base) > sptd->spt_realsize) 1853 return (FC_NOMAP); 1854 /* 1855 * For all of the following cases except F_PROT, we need to 1856 * make any necessary adjustments to addr and len 1857 * and get all of the necessary page_t's into an array called ppa[]. 1858 * 1859 * The code in shmat() forces base addr and len of ISM segment 1860 * to be aligned to largest page size supported. Therefore, 1861 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large 1862 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK 1863 * in large pagesize chunks, or else we will screw up the HAT 1864 * layer by calling hat_memload_array() with differing page sizes 1865 * over a given virtual range. 1866 */ 1867 pgsz = page_get_pagesize(sptseg->s_szc); 1868 pgcnt = page_get_pagecnt(sptseg->s_szc); 1869 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); 1870 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz); 1871 npages = btopr(size); 1872 1873 /* 1874 * Now we need to convert from addr in segshm to addr in segspt. 
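	 *
	 * e.g. (made-up offsets): if shm_addr sits 3 pages past
	 * seg->s_base, then an_idx == 3 and segspt_addr becomes
	 * sptseg->s_base + ptob(3), i.e. the same page offset replayed
	 * against the dummy spt segment's base.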
1875 */ 1876 an_idx = seg_page(seg, shm_addr); 1877 segspt_addr = sptseg->s_base + ptob(an_idx); 1878 1879 ASSERT((segspt_addr + ptob(npages)) <= 1880 (sptseg->s_base + sptd->spt_realsize)); 1881 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size)); 1882 1883 switch (type) { 1884 1885 case F_SOFTLOCK: 1886 1887 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); 1888 /* 1889 * Fall through to the F_INVAL case to load up the hat layer 1890 * entries with the HAT_LOAD_LOCK flag. 1891 */ 1892 /* FALLTHRU */ 1893 case F_INVAL: 1894 1895 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) 1896 return (FC_NOMAP); 1897 1898 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP); 1899 1900 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa); 1901 if (err != 0) { 1902 if (type == F_SOFTLOCK) { 1903 atomic_add_long((ulong_t *)( 1904 &(shmd->shm_softlockcnt)), -npages); 1905 } 1906 goto dism_err; 1907 } 1908 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); 1909 a = segspt_addr; 1910 pidx = 0; 1911 if (type == F_SOFTLOCK) { 1912 1913 /* 1914 * Load up the translation keeping it 1915 * locked and don't unlock the page. 1916 */ 1917 for (; pidx < npages; a += pgsz, pidx += pgcnt) { 1918 hat_memload_array(sptseg->s_as->a_hat, 1919 a, pgsz, &ppa[pidx], sptd->spt_prot, 1920 HAT_LOAD_LOCK | HAT_LOAD_SHARE); 1921 } 1922 } else { 1923 if (hat == seg->s_as->a_hat) { 1924 1925 /* 1926 * Migrate pages marked for migration 1927 */ 1928 if (lgrp_optimizations()) 1929 page_migrate(seg, shm_addr, ppa, 1930 npages); 1931 1932 /* CPU HAT */ 1933 for (; pidx < npages; 1934 a += pgsz, pidx += pgcnt) { 1935 hat_memload_array(sptseg->s_as->a_hat, 1936 a, pgsz, &ppa[pidx], 1937 sptd->spt_prot, 1938 HAT_LOAD_SHARE); 1939 } 1940 } else { 1941 /* XHAT. Pass real address */ 1942 hat_memload_array(hat, shm_addr, 1943 size, ppa, sptd->spt_prot, HAT_LOAD_SHARE); 1944 } 1945 1946 /* 1947 * And now drop the SE_SHARED lock(s). 1948 */ 1949 if (dyn_ism_unmap) { 1950 for (i = 0; i < npages; i++) { 1951 page_unlock(ppa[i]); 1952 } 1953 } 1954 } 1955 1956 if (!dyn_ism_unmap) { 1957 if (hat_share(seg->s_as->a_hat, shm_addr, 1958 curspt->a_hat, segspt_addr, ptob(npages), 1959 seg->s_szc) != 0) { 1960 panic("hat_share err in DISM fault"); 1961 /* NOTREACHED */ 1962 } 1963 if (type == F_INVAL) { 1964 for (i = 0; i < npages; i++) { 1965 page_unlock(ppa[i]); 1966 } 1967 } 1968 } 1969 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); 1970 dism_err: 1971 kmem_free(ppa, npages * sizeof (page_t *)); 1972 return (err); 1973 1974 case F_SOFTUNLOCK: 1975 1976 /* 1977 * This is a bit ugly, we pass in the real seg pointer, 1978 * but the segspt_addr is the virtual address within the 1979 * dummy seg. 1980 */ 1981 segspt_softunlock(seg, segspt_addr, size, rw); 1982 return (0); 1983 1984 case F_PROT: 1985 1986 /* 1987 * This takes care of the unusual case where a user 1988 * allocates a stack in shared memory and a register 1989 * window overflow is written to that stack page before 1990 * it is otherwise modified. 1991 * 1992 * We can get away with this because ISM segments are 1993 * always rw. Other than this unusual case, there 1994 * should be no instances of protection violations. 
1995 */ 1996 return (0); 1997 1998 default: 1999 #ifdef DEBUG 2000 panic("segspt_dismfault default type?"); 2001 #else 2002 return (FC_NOMAP); 2003 #endif 2004 } 2005 } 2006 2007 2008 faultcode_t 2009 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr, 2010 size_t len, enum fault_type type, enum seg_rw rw) 2011 { 2012 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2013 struct seg *sptseg = shmd->shm_sptseg; 2014 struct as *curspt = shmd->shm_sptas; 2015 struct spt_data *sptd = sptseg->s_data; 2016 pgcnt_t npages; 2017 size_t size; 2018 caddr_t sptseg_addr, shm_addr; 2019 page_t *pp, **ppa; 2020 int i; 2021 u_offset_t offset; 2022 ulong_t anon_index = 0; 2023 struct vnode *vp; 2024 struct anon_map *amp; /* XXX - for locknest */ 2025 struct anon *ap = NULL; 2026 size_t pgsz; 2027 pgcnt_t pgcnt; 2028 caddr_t a; 2029 pgcnt_t pidx; 2030 size_t sz; 2031 2032 #ifdef lint 2033 hat = hat; 2034 #endif 2035 2036 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2037 2038 if (sptd->spt_flags & SHM_PAGEABLE) { 2039 return (segspt_dismfault(hat, seg, addr, len, type, rw)); 2040 } 2041 2042 /* 2043 * Because of the way spt is implemented 2044 * the realsize of the segment does not have to be 2045 * equal to the segment size itself. The segment size is 2046 * often in multiples of a page size larger than PAGESIZE. 2047 * The realsize is rounded up to the nearest PAGESIZE 2048 * based on what the user requested. This is a bit of 2049 * ugliness that is historical but not easily fixed 2050 * without re-designing the higher levels of ISM. 2051 */ 2052 ASSERT(addr >= seg->s_base); 2053 if (((addr + len) - seg->s_base) > sptd->spt_realsize) 2054 return (FC_NOMAP); 2055 /* 2056 * For all of the following cases except F_PROT, we need to 2057 * make any necessary adjustments to addr and len 2058 * and get all of the necessary page_t's into an array called ppa[]. 2059 * 2060 * The code in shmat() forces base addr and len of ISM segment 2061 * to be aligned to largest page size supported. Therefore, 2062 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large 2063 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK 2064 * in large pagesize chunks, or else we will screw up the HAT 2065 * layer by calling hat_memload_array() with differing page sizes 2066 * over a given virtual range. 2067 */ 2068 pgsz = page_get_pagesize(sptseg->s_szc); 2069 pgcnt = page_get_pagecnt(sptseg->s_szc); 2070 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); 2071 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz); 2072 npages = btopr(size); 2073 2074 /* 2075 * Now we need to convert from addr in segshm to addr in segspt. 2076 */ 2077 anon_index = seg_page(seg, shm_addr); 2078 sptseg_addr = sptseg->s_base + ptob(anon_index); 2079 2080 /* 2081 * And now we may have to adjust npages downward if we have 2082 * exceeded the realsize of the segment or initial anon 2083 * allocations. 2084 */ 2085 if ((sptseg_addr + ptob(npages)) > 2086 (sptseg->s_base + sptd->spt_realsize)) 2087 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr; 2088 2089 npages = btopr(size); 2090 2091 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size)); 2092 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0); 2093 2094 switch (type) { 2095 2096 case F_SOFTLOCK: 2097 2098 /* 2099 * availrmem is decremented once during anon_swap_adjust() 2100 * and is incremented during the anon_unresv(), which is 2101 * called from shm_rm_amp() when the segment is destroyed.
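 * In other words, the pages of a non-pageable ISM segment are
 * effectively locked for the life of the segment, so the only
 * per-fault bookkeeping needed here is the softlock count bumped
 * below.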
2102 */ 2103 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); 2104 /* 2105 * Some platforms assume that ISM pages are SE_SHARED 2106 * locked for the entire life of the segment. 2107 */ 2108 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) 2109 return (0); 2110 /* 2111 * Fall through to the F_INVAL case to load up the hat layer 2112 * entries with the HAT_LOAD_LOCK flag. 2113 */ 2114 2115 /* FALLTHRU */ 2116 case F_INVAL: 2117 2118 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) 2119 return (FC_NOMAP); 2120 2121 /* 2122 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP 2123 * may still rely on this call to hat_share(). That 2124 * would imply that those hat's can fault on a 2125 * HAT_LOAD_LOCK translation, which would seem 2126 * contradictory. 2127 */ 2128 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 2129 if (hat_share(seg->s_as->a_hat, seg->s_base, 2130 curspt->a_hat, sptseg->s_base, 2131 sptseg->s_size, sptseg->s_szc) != 0) { 2132 panic("hat_share error in ISM fault"); 2133 /*NOTREACHED*/ 2134 } 2135 return (0); 2136 } 2137 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP); 2138 2139 /* 2140 * I see no need to lock the real seg, 2141 * here, because all of our work will be on the underlying 2142 * dummy seg. 2143 * 2144 * sptseg_addr and npages now account for large pages. 2145 */ 2146 amp = sptd->spt_amp; 2147 ASSERT(amp != NULL); 2148 anon_index = seg_page(sptseg, sptseg_addr); 2149 2150 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2151 for (i = 0; i < npages; i++) { 2152 ap = anon_get_ptr(amp->ahp, anon_index++); 2153 ASSERT(ap != NULL); 2154 swap_xlate(ap, &vp, &offset); 2155 pp = page_lookup(vp, offset, SE_SHARED); 2156 ASSERT(pp != NULL); 2157 ppa[i] = pp; 2158 } 2159 ANON_LOCK_EXIT(&amp->a_rwlock); 2160 ASSERT(i == npages); 2161 2162 /* 2163 * We are already holding the as->a_lock on the user's 2164 * real segment, but we need to hold the a_lock on the 2165 * underlying dummy as. This is mostly to satisfy the 2166 * underlying HAT layer. 2167 */ 2168 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); 2169 a = sptseg_addr; 2170 pidx = 0; 2171 if (type == F_SOFTLOCK) { 2172 /* 2173 * Load up the translation keeping it 2174 * locked and don't unlock the page. 2175 */ 2176 for (; pidx < npages; a += pgsz, pidx += pgcnt) { 2177 sz = MIN(pgsz, ptob(npages - pidx)); 2178 hat_memload_array(sptseg->s_as->a_hat, a, 2179 sz, &ppa[pidx], sptd->spt_prot, 2180 HAT_LOAD_LOCK | HAT_LOAD_SHARE); 2181 } 2182 } else { 2183 if (hat == seg->s_as->a_hat) { 2184 2185 /* 2186 * Migrate pages marked for migration. 2187 */ 2188 if (lgrp_optimizations()) 2189 page_migrate(seg, shm_addr, ppa, 2190 npages); 2191 2192 /* CPU HAT */ 2193 for (; pidx < npages; 2194 a += pgsz, pidx += pgcnt) { 2195 sz = MIN(pgsz, ptob(npages - pidx)); 2196 hat_memload_array(sptseg->s_as->a_hat, 2197 a, sz, &ppa[pidx], 2198 sptd->spt_prot, HAT_LOAD_SHARE); 2199 } 2200 } else { 2201 /* XHAT. Pass real address */ 2202 hat_memload_array(hat, shm_addr, 2203 ptob(npages), ppa, sptd->spt_prot, 2204 HAT_LOAD_SHARE); 2205 } 2206 2207 /* 2208 * And now drop the SE_SHARED lock(s). 2209 */ 2210 for (i = 0; i < npages; i++) 2211 page_unlock(ppa[i]); 2212 } 2213 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); 2214 2215 kmem_free(ppa, sizeof (page_t *) * npages); 2216 return (0); 2217 case F_SOFTUNLOCK: 2218 2219 /* 2220 * This is a bit ugly, we pass in the real seg pointer, 2221 * but the sptseg_addr is the virtual address within the 2222 * dummy seg.
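 * (sptseg_addr was computed above as sptseg->s_base + ptob(anon_index),
 * i.e. the faulting address re-based into the dummy spt segment.)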
2223 */ 2224 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw); 2225 return (0); 2226 2227 case F_PROT: 2228 2229 /* 2230 * This takes care of the unusual case where a user 2231 * allocates a stack in shared memory and a register 2232 * window overflow is written to that stack page before 2233 * it is otherwise modified. 2234 * 2235 * We can get away with this because ISM segments are 2236 * always rw. Other than this unusual case, there 2237 * should be no instances of protection violations. 2238 */ 2239 return (0); 2240 2241 default: 2242 #ifdef DEBUG 2243 cmn_err(CE_WARN, "segspt_shmfault default type?"); 2244 #endif 2245 return (FC_NOMAP); 2246 } 2247 } 2248 2249 /*ARGSUSED*/ 2250 static faultcode_t 2251 segspt_shmfaulta(struct seg *seg, caddr_t addr) 2252 { 2253 return (0); 2254 } 2255 2256 /*ARGSUSED*/ 2257 static int 2258 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta) 2259 { 2260 return (0); 2261 } 2262 2263 /*ARGSUSED*/ 2264 static size_t 2265 segspt_shmswapout(struct seg *seg) 2266 { 2267 return (0); 2268 } 2269 2270 /* 2271 * duplicate the shared page tables 2272 */ 2273 int 2274 segspt_shmdup(struct seg *seg, struct seg *newseg) 2275 { 2276 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2277 struct anon_map *amp = shmd->shm_amp; 2278 struct shm_data *shmd_new; 2279 struct seg *spt_seg = shmd->shm_sptseg; 2280 struct spt_data *sptd = spt_seg->s_data; 2281 int error = 0; 2282 2283 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 2284 2285 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP); 2286 newseg->s_data = (void *)shmd_new; 2287 shmd_new->shm_sptas = shmd->shm_sptas; 2288 shmd_new->shm_amp = amp; 2289 shmd_new->shm_sptseg = shmd->shm_sptseg; 2290 newseg->s_ops = &segspt_shmops; 2291 newseg->s_szc = seg->s_szc; 2292 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc); 2293 2294 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2295 amp->refcnt++; 2296 ANON_LOCK_EXIT(&amp->a_rwlock); 2297 2298 if (sptd->spt_flags & SHM_PAGEABLE) { 2299 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP); 2300 shmd_new->shm_lckpgs = 0; 2301 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 2302 if ((error = hat_share(newseg->s_as->a_hat, 2303 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR, 2304 seg->s_size, seg->s_szc)) != 0) { 2305 kmem_free(shmd_new->shm_vpage, 2306 btopr(amp->size)); 2307 } 2308 } 2309 return (error); 2310 } else { 2311 return (hat_share(newseg->s_as->a_hat, newseg->s_base, 2312 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, 2313 seg->s_szc)); 2314 2315 } 2316 } 2317 2318 /*ARGSUSED*/ 2319 int 2320 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) 2321 { 2322 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2323 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2324 2325 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2326 2327 /* 2328 * ISM segment is always rw. 2329 */ 2330 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0); 2331 } 2332 2333 /* 2334 * Return an array of locked large pages, for empty slots allocate 2335 * private zero-filled anon pages.
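 *
 * For illustration: both callers in this file, segspt_dismfault() and
 * segspt_shmlockop(), size ppa[] at one entry per PAGESIZE page of
 * 'len', and once they have loaded or locked the translations they
 * drop the SE_SHARED page locks returned here with page_unlock().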
2336 */ 2337 static int 2338 spt_anon_getpages( 2339 struct seg *sptseg, 2340 caddr_t sptaddr, 2341 size_t len, 2342 page_t *ppa[]) 2343 { 2344 struct spt_data *sptd = sptseg->s_data; 2345 struct anon_map *amp = sptd->spt_amp; 2346 enum seg_rw rw = sptd->spt_prot; 2347 uint_t szc = sptseg->s_szc; 2348 size_t pg_sz, share_sz = page_get_pagesize(szc); 2349 pgcnt_t lp_npgs; 2350 caddr_t lp_addr, e_sptaddr; 2351 uint_t vpprot, ppa_szc = 0; 2352 struct vpage *vpage = NULL; 2353 ulong_t j, ppa_idx; 2354 int err, ierr = 0; 2355 pgcnt_t an_idx; 2356 anon_sync_obj_t cookie; 2357 int anon_locked = 0; 2358 pgcnt_t amp_pgs; 2359 2360 2361 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz)); 2362 ASSERT(len != 0); 2363 2364 pg_sz = share_sz; 2365 lp_npgs = btop(pg_sz); 2366 lp_addr = sptaddr; 2367 e_sptaddr = sptaddr + len; 2368 an_idx = seg_page(sptseg, sptaddr); 2369 ppa_idx = 0; 2370 2371 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2372 2373 amp_pgs = page_get_pagecnt(amp->a_szc); 2374 2375 /*CONSTCOND*/ 2376 while (1) { 2377 for (; lp_addr < e_sptaddr; 2378 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) { 2379 2380 /* 2381 * If we're currently locked, and we get to a new 2382 * page, unlock our current anon chunk. 2383 */ 2384 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) { 2385 anon_array_exit(&cookie); 2386 anon_locked = 0; 2387 } 2388 if (!anon_locked) { 2389 anon_array_enter(amp, an_idx, &cookie); 2390 anon_locked = 1; 2391 } 2392 ppa_szc = (uint_t)-1; 2393 ierr = anon_map_getpages(amp, an_idx, szc, sptseg, 2394 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx], 2395 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred); 2396 2397 if (ierr != 0) { 2398 if (ierr > 0) { 2399 err = FC_MAKE_ERR(ierr); 2400 goto lpgs_err; 2401 } 2402 break; 2403 } 2404 } 2405 if (lp_addr == e_sptaddr) { 2406 break; 2407 } 2408 ASSERT(lp_addr < e_sptaddr); 2409 2410 /* 2411 * ierr == -1 means we failed to allocate a large page. 2412 * so do a size down operation. 2413 * 2414 * ierr == -2 means some other process that privately shares 2415 * pages with this process has allocated a larger page and we 2416 * need to retry with larger pages. So do a size up 2417 * operation. This relies on the fact that large pages are 2418 * never partially shared i.e. if we share any constituent 2419 * page of a large page with another process we must share the 2420 * entire large page. Note this cannot happen for SOFTLOCK 2421 * case, unless current address (lpaddr) is at the beginning 2422 * of the next page size boundary because the other process 2423 * couldn't have relocated locked pages. 2424 */ 2425 ASSERT(ierr == -1 || ierr == -2); 2426 if (segvn_anypgsz) { 2427 ASSERT(ierr == -2 || szc != 0); 2428 ASSERT(ierr == -1 || szc < sptseg->s_szc); 2429 szc = (ierr == -1) ? szc - 1 : szc + 1; 2430 } else { 2431 /* 2432 * For faults and segvn_anypgsz == 0 2433 * we need to be careful not to loop forever 2434 * if existing page is found with szc other 2435 * than 0 or seg->s_szc. This could be due 2436 * to page relocations on behalf of DR or 2437 * more likely large page creation. For this 2438 * case simply re-size to existing page's szc 2439 * if returned by anon_map_getpages(). 2440 */ 2441 if (ppa_szc == (uint_t)-1) { 2442 szc = (ierr == -1) ?
0 : sptseg->s_szc; 2443 } else { 2444 ASSERT(ppa_szc <= sptseg->s_szc); 2445 ASSERT(ierr == -2 || ppa_szc < szc); 2446 ASSERT(ierr == -1 || ppa_szc > szc); 2447 szc = ppa_szc; 2448 } 2449 } 2450 pg_sz = page_get_pagesize(szc); 2451 lp_npgs = btop(pg_sz); 2452 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz)); 2453 } 2454 if (anon_locked) { 2455 anon_array_exit(&cookie); 2456 } 2457 ANON_LOCK_EXIT(&amp->a_rwlock); 2458 return (0); 2459 2460 lpgs_err: 2461 if (anon_locked) { 2462 anon_array_exit(&cookie); 2463 } 2464 ANON_LOCK_EXIT(&amp->a_rwlock); 2465 for (j = 0; j < ppa_idx; j++) 2466 page_unlock(ppa[j]); 2467 return (err); 2468 } 2469 2470 /* 2471 * count the number of bytes in a set of spt pages that are currently not 2472 * locked 2473 */ 2474 static rctl_qty_t 2475 spt_unlockedbytes(pgcnt_t npages, page_t **ppa) 2476 { 2477 ulong_t i; 2478 rctl_qty_t unlocked = 0; 2479 2480 for (i = 0; i < npages; i++) { 2481 if (ppa[i]->p_lckcnt == 0) 2482 unlocked += PAGESIZE; 2483 } 2484 return (unlocked); 2485 } 2486 2487 extern u_longlong_t randtick(void); 2488 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */ 2489 #define NLCK (NCPU_P2) 2490 /* Random number with a range [0, n-1], n must be power of two */ 2491 #define RAND_P2(n) \ 2492 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1)) 2493 2494 int 2495 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, 2496 page_t **ppa, ulong_t *lockmap, size_t pos, 2497 rctl_qty_t *locked) 2498 { 2499 struct shm_data *shmd = seg->s_data; 2500 struct spt_data *sptd = shmd->shm_sptseg->s_data; 2501 ulong_t i; 2502 int kernel; 2503 pgcnt_t nlck = 0; 2504 int rv = 0; 2505 int use_reserved = 1; 2506 2507 /* return the number of bytes actually locked */ 2508 *locked = 0; 2509 2510 /* 2511 * To avoid contention on freemem_lock, availrmem and pages_locked 2512 * global counters are updated only every nlck locked pages instead of 2513 * every time. Reserve nlck locks up front and deduct from this 2514 * reservation for each page that requires a lock. When the reservation 2515 * is consumed, reserve again. nlck is randomized, so the competing 2516 * threads do not fall into a cyclic lock contention pattern. When 2517 * memory is low, the lock ahead is disabled, and instead page_pp_lock() 2518 * is used to lock pages. 2519 */ 2520 for (i = 0; i < npages; anon_index++, pos++, i++) { 2521 if (nlck == 0 && use_reserved == 1) { 2522 nlck = NLCK + RAND_P2(NLCK); 2523 /* if fewer loops left, decrease nlck */ 2524 nlck = MIN(nlck, npages - i); 2525 /* 2526 * Reserve nlck locks up front and deduct from this 2527 * reservation for each page that requires a lock. When 2528 * the reservation is consumed, reserve again.
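 *
 * For illustration, on a machine where NCPU_P2 is 8, nlck is drawn
 * from [8, 15] (NLCK + RAND_P2(NLCK)), so freemem_lock is taken
 * roughly once per 8-15 pages rather than once per page, and the
 * random spread keeps concurrent lockers from contending for the
 * mutex in lock-step.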
2529 */ 2530 mutex_enter(&freemem_lock); 2531 if ((availrmem - nlck) < pages_pp_maximum) { 2532 /* Do not do advance memory reserves */ 2533 use_reserved = 0; 2534 } else { 2535 availrmem -= nlck; 2536 pages_locked += nlck; 2537 } 2538 mutex_exit(&freemem_lock); 2539 } 2540 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) { 2541 if (sptd->spt_ppa_lckcnt[anon_index] < 2542 (ushort_t)DISM_LOCK_MAX) { 2543 if (++sptd->spt_ppa_lckcnt[anon_index] == 2544 (ushort_t)DISM_LOCK_MAX) { 2545 cmn_err(CE_WARN, 2546 "DISM page lock limit " 2547 "reached on DISM offset 0x%lx\n", 2548 anon_index << PAGESHIFT); 2549 } 2550 kernel = (sptd->spt_ppa && 2551 sptd->spt_ppa[anon_index]); 2552 if (!page_pp_lock(ppa[i], 0, kernel || 2553 use_reserved)) { 2554 sptd->spt_ppa_lckcnt[anon_index]--; 2555 rv = EAGAIN; 2556 break; 2557 } 2558 /* if this is a newly locked page, count it */ 2559 if (ppa[i]->p_lckcnt == 1) { 2560 if (kernel == 0 && use_reserved == 1) 2561 nlck--; 2562 *locked += PAGESIZE; 2563 } 2564 shmd->shm_lckpgs++; 2565 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED; 2566 if (lockmap != NULL) 2567 BT_SET(lockmap, pos); 2568 } 2569 } 2570 } 2571 /* Return unused lock reservation */ 2572 if (nlck != 0 && use_reserved == 1) { 2573 mutex_enter(&freemem_lock); 2574 availrmem += nlck; 2575 pages_locked -= nlck; 2576 mutex_exit(&freemem_lock); 2577 } 2578 2579 return (rv); 2580 } 2581 2582 int 2583 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, 2584 rctl_qty_t *unlocked) 2585 { 2586 struct shm_data *shmd = seg->s_data; 2587 struct spt_data *sptd = shmd->shm_sptseg->s_data; 2588 struct anon_map *amp = sptd->spt_amp; 2589 struct anon *ap; 2590 struct vnode *vp; 2591 u_offset_t off; 2592 struct page *pp; 2593 int kernel; 2594 anon_sync_obj_t cookie; 2595 ulong_t i; 2596 pgcnt_t nlck = 0; 2597 pgcnt_t nlck_limit = NLCK; 2598 2599 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2600 for (i = 0; i < npages; i++, anon_index++) { 2601 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) { 2602 anon_array_enter(amp, anon_index, &cookie); 2603 ap = anon_get_ptr(amp->ahp, anon_index); 2604 ASSERT(ap); 2605 2606 swap_xlate(ap, &vp, &off); 2607 anon_array_exit(&cookie); 2608 pp = page_lookup(vp, off, SE_SHARED); 2609 ASSERT(pp); 2610 /* 2611 * availrmem is decremented only for pages which are not 2612 * in seg pcache, for pages in seg pcache availrmem was 2613 * decremented in _dismpagelock() 2614 */ 2615 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]); 2616 ASSERT(pp->p_lckcnt > 0); 2617 2618 /* 2619 * unlock page but do not change availrmem, we do it 2620 * ourselves every nlck loops. 2621 */ 2622 page_pp_unlock(pp, 0, 1); 2623 if (pp->p_lckcnt == 0) { 2624 if (kernel == 0) 2625 nlck++; 2626 *unlocked += PAGESIZE; 2627 } 2628 page_unlock(pp); 2629 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED; 2630 sptd->spt_ppa_lckcnt[anon_index]--; 2631 shmd->shm_lckpgs--; 2632 } 2633 2634 /* 2635 * To reduce freemem_lock contention, do not update availrmem 2636 * until at least NLCK pages have been unlocked. 2637 * 1. No need to update if nlck is zero 2638 * 2.
Always update if the last iteration 2639 */ 2640 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) { 2641 mutex_enter(&freemem_lock); 2642 availrmem += nlck; 2643 pages_locked -= nlck; 2644 mutex_exit(&freemem_lock); 2645 nlck = 0; 2646 nlck_limit = NLCK + RAND_P2(NLCK); 2647 } 2648 } 2649 ANON_LOCK_EXIT(&amp->a_rwlock); 2650 2651 return (0); 2652 } 2653 2654 /*ARGSUSED*/ 2655 static int 2656 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, 2657 int attr, int op, ulong_t *lockmap, size_t pos) 2658 { 2659 struct shm_data *shmd = seg->s_data; 2660 struct seg *sptseg = shmd->shm_sptseg; 2661 struct spt_data *sptd = sptseg->s_data; 2662 struct kshmid *sp = sptd->spt_amp->a_sp; 2663 pgcnt_t npages, a_npages; 2664 page_t **ppa; 2665 pgcnt_t an_idx, a_an_idx, ppa_idx; 2666 caddr_t spt_addr, a_addr; /* spt and aligned address */ 2667 size_t a_len; /* aligned len */ 2668 size_t share_sz; 2669 ulong_t i; 2670 int sts = 0; 2671 rctl_qty_t unlocked = 0; 2672 rctl_qty_t locked = 0; 2673 struct proc *p = curproc; 2674 kproject_t *proj; 2675 2676 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2677 ASSERT(sp != NULL); 2678 2679 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 2680 return (0); 2681 } 2682 2683 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2684 an_idx = seg_page(seg, addr); 2685 npages = btopr(len); 2686 2687 if (an_idx + npages > btopr(shmd->shm_amp->size)) { 2688 return (ENOMEM); 2689 } 2690 2691 /* 2692 * A shm's project never changes, so no lock needed. 2693 * The shm has a hold on the project, so it will not go away. 2694 * Since we have a mapping to shm within this zone, we know 2695 * that the zone will not go away. 2696 */ 2697 proj = sp->shm_perm.ipc_proj; 2698 2699 if (op == MC_LOCK) { 2700 2701 /* 2702 * Need to align addr and size request if they are not 2703 * aligned so we can always allocate large page(s) however 2704 * we only lock what was requested in initial request. 2705 */ 2706 share_sz = page_get_pagesize(sptseg->s_szc); 2707 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz); 2708 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)), 2709 share_sz); 2710 a_npages = btop(a_len); 2711 a_an_idx = seg_page(seg, a_addr); 2712 spt_addr = sptseg->s_base + ptob(a_an_idx); 2713 ppa_idx = an_idx - a_an_idx; 2714 2715 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages), 2716 KM_NOSLEEP)) == NULL) { 2717 return (ENOMEM); 2718 } 2719 2720 /* 2721 * Don't cache any new pages for IO and 2722 * flush any cached pages.
2723 */ 2724 mutex_enter(&sptd->spt_lock); 2725 if (sptd->spt_ppa != NULL) 2726 sptd->spt_flags |= DISM_PPA_CHANGED; 2727 2728 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa); 2729 if (sts != 0) { 2730 mutex_exit(&sptd->spt_lock); 2731 kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); 2732 return (sts); 2733 } 2734 2735 mutex_enter(&sp->shm_mlock); 2736 /* enforce locked memory rctl */ 2737 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]); 2738 2739 mutex_enter(&p->p_lock); 2740 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) { 2741 mutex_exit(&p->p_lock); 2742 sts = EAGAIN; 2743 } else { 2744 mutex_exit(&p->p_lock); 2745 sts = spt_lockpages(seg, an_idx, npages, 2746 &ppa[ppa_idx], lockmap, pos, &locked); 2747 2748 /* 2749 * correct locked count if not all pages could be 2750 * locked 2751 */ 2752 if ((unlocked - locked) > 0) { 2753 rctl_decr_locked_mem(NULL, proj, 2754 (unlocked - locked), 0); 2755 } 2756 } 2757 /* 2758 * unlock pages 2759 */ 2760 for (i = 0; i < a_npages; i++) 2761 page_unlock(ppa[i]); 2762 if (sptd->spt_ppa != NULL) 2763 sptd->spt_flags |= DISM_PPA_CHANGED; 2764 mutex_exit(&sp->shm_mlock); 2765 mutex_exit(&sptd->spt_lock); 2766 2767 kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); 2768 2769 } else if (op == MC_UNLOCK) { /* unlock */ 2770 page_t **ppa; 2771 2772 mutex_enter(&sptd->spt_lock); 2773 if (shmd->shm_lckpgs == 0) { 2774 mutex_exit(&sptd->spt_lock); 2775 return (0); 2776 } 2777 /* 2778 * Don't cache new IO pages. 2779 */ 2780 if (sptd->spt_ppa != NULL) 2781 sptd->spt_flags |= DISM_PPA_CHANGED; 2782 2783 mutex_enter(&sp->shm_mlock); 2784 sts = spt_unlockpages(seg, an_idx, npages, &unlocked); 2785 if ((ppa = sptd->spt_ppa) != NULL) 2786 sptd->spt_flags |= DISM_PPA_CHANGED; 2787 mutex_exit(&sptd->spt_lock); 2788 2789 rctl_decr_locked_mem(NULL, proj, unlocked, 0); 2790 mutex_exit(&sp->shm_mlock); 2791 2792 if (ppa != NULL) 2793 seg_ppurge_wiredpp(ppa); 2794 } 2795 return (sts); 2796 } 2797 2798 /*ARGSUSED*/ 2799 int 2800 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 2801 { 2802 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2803 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2804 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1; 2805 2806 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2807 2808 /* 2809 * ISM segment is always rw. 2810 */ 2811 while (--pgno >= 0) 2812 *protv++ = sptd->spt_prot; 2813 return (0); 2814 } 2815 2816 /*ARGSUSED*/ 2817 u_offset_t 2818 segspt_shmgetoffset(struct seg *seg, caddr_t addr) 2819 { 2820 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2821 2822 /* Offset does not matter in ISM memory */ 2823 2824 return ((u_offset_t)0); 2825 } 2826 2827 /* ARGSUSED */ 2828 int 2829 segspt_shmgettype(struct seg *seg, caddr_t addr) 2830 { 2831 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2832 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2833 2834 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2835 2836 /* 2837 * The shared memory mapping is always MAP_SHARED, SWAP is only 2838 * reserved for DISM 2839 */ 2840 return (MAP_SHARED | 2841 ((sptd->spt_flags & SHM_PAGEABLE) ? 
0 : MAP_NORESERVE)); 2842 } 2843 2844 /*ARGSUSED*/ 2845 int 2846 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 2847 { 2848 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2849 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2850 2851 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2852 2853 *vpp = sptd->spt_vp; 2854 return (0); 2855 } 2856 2857 /* 2858 * We need to wait for pending IO to complete to a DISM segment in order for 2859 * pages to get kicked out of the seg_pcache. 120 seconds should be more 2860 * than enough time to wait. 2861 */ 2862 static clock_t spt_pcache_wait = 120; 2863 2864 /*ARGSUSED*/ 2865 static int 2866 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 2867 { 2868 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2869 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2870 struct anon_map *amp; 2871 pgcnt_t pg_idx; 2872 ushort_t gen; 2873 clock_t end_lbolt; 2874 int writer; 2875 page_t **ppa; 2876 2877 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2878 2879 if (behav == MADV_FREE) { 2880 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) 2881 return (0); 2882 2883 amp = sptd->spt_amp; 2884 pg_idx = seg_page(seg, addr); 2885 2886 mutex_enter(&sptd->spt_lock); 2887 if ((ppa = sptd->spt_ppa) == NULL) { 2888 mutex_exit(&sptd->spt_lock); 2889 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2890 anon_disclaim(amp, pg_idx, len); 2891 ANON_LOCK_EXIT(&amp->a_rwlock); 2892 return (0); 2893 } 2894 2895 sptd->spt_flags |= DISM_PPA_CHANGED; 2896 gen = sptd->spt_gen; 2897 2898 mutex_exit(&sptd->spt_lock); 2899 2900 /* 2901 * Purge all DISM cached pages 2902 */ 2903 seg_ppurge_wiredpp(ppa); 2904 2905 /* 2906 * Drop the AS_LOCK so that other threads can grab it 2907 * in the as_pageunlock path and hopefully get the segment 2908 * kicked out of the seg_pcache. We bump the shm_softlockcnt 2909 * to keep this segment resident. 2910 */ 2911 writer = AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock); 2912 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 2913 AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock); 2914 2915 mutex_enter(&sptd->spt_lock); 2916 2917 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait); 2918 2919 /* 2920 * Try to wait for pages to get kicked out of the seg_pcache. 2921 */ 2922 while (sptd->spt_gen == gen && 2923 (sptd->spt_flags & DISM_PPA_CHANGED) && 2924 ddi_get_lbolt() < end_lbolt) { 2925 if (!cv_timedwait_sig(&sptd->spt_cv, 2926 &sptd->spt_lock, end_lbolt)) { 2927 break; 2928 } 2929 } 2930 2931 mutex_exit(&sptd->spt_lock); 2932 2933 /* Regrab the AS_LOCK and release our hold on the segment */ 2934 AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock, 2935 writer ?
RW_WRITER : RW_READER); 2936 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 2937 if (shmd->shm_softlockcnt <= 0) { 2938 if (AS_ISUNMAPWAIT(seg->s_as)) { 2939 mutex_enter(&seg->s_as->a_contents); 2940 if (AS_ISUNMAPWAIT(seg->s_as)) { 2941 AS_CLRUNMAPWAIT(seg->s_as); 2942 cv_broadcast(&seg->s_as->a_cv); 2943 } 2944 mutex_exit(&seg->s_as->a_contents); 2945 } 2946 } 2947 2948 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2949 anon_disclaim(amp, pg_idx, len); 2950 ANON_LOCK_EXIT(&amp->a_rwlock); 2951 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP || 2952 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) { 2953 int already_set; 2954 ulong_t anon_index; 2955 lgrp_mem_policy_t policy; 2956 caddr_t shm_addr; 2957 size_t share_size; 2958 size_t size; 2959 struct seg *sptseg = shmd->shm_sptseg; 2960 caddr_t sptseg_addr; 2961 2962 /* 2963 * Align address and length to page size of underlying segment 2964 */ 2965 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc); 2966 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size); 2967 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), 2968 share_size); 2969 2970 amp = shmd->shm_amp; 2971 anon_index = seg_page(seg, shm_addr); 2972 2973 /* 2974 * And now we may have to adjust size downward if we have 2975 * exceeded the realsize of the segment or initial anon 2976 * allocations. 2977 */ 2978 sptseg_addr = sptseg->s_base + ptob(anon_index); 2979 if ((sptseg_addr + size) > 2980 (sptseg->s_base + sptd->spt_realsize)) 2981 size = (sptseg->s_base + sptd->spt_realsize) - 2982 sptseg_addr; 2983 2984 /* 2985 * Set memory allocation policy for this segment 2986 */ 2987 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED); 2988 already_set = lgrp_shm_policy_set(policy, amp, anon_index, 2989 NULL, 0, len); 2990 2991 /* 2992 * If random memory allocation policy set already, 2993 * don't bother reapplying it.
2994 */ 2995 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 2996 return (0); 2997 2998 /* 2999 * Mark any existing pages in the given range for 3000 * migration, flushing the I/O page cache, and using 3001 * underlying segment to calculate anon index and get 3002 * anonmap and vnode pointer from 3003 */ 3004 if (shmd->shm_softlockcnt > 0) 3005 segspt_purge(seg); 3006 3007 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0); 3008 } 3009 3010 return (0); 3011 } 3012 3013 /*ARGSUSED*/ 3014 void 3015 segspt_shmdump(struct seg *seg) 3016 { 3017 /* no-op for ISM segment */ 3018 } 3019 3020 /*ARGSUSED*/ 3021 static faultcode_t 3022 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 3023 { 3024 return (ENOTSUP); 3025 } 3026 3027 /* 3028 * get a memory ID for an addr in a given segment 3029 */ 3030 static int 3031 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 3032 { 3033 struct shm_data *shmd = (struct shm_data *)seg->s_data; 3034 struct anon *ap; 3035 size_t anon_index; 3036 struct anon_map *amp = shmd->shm_amp; 3037 struct spt_data *sptd = shmd->shm_sptseg->s_data; 3038 struct seg *sptseg = shmd->shm_sptseg; 3039 anon_sync_obj_t cookie; 3040 3041 anon_index = seg_page(seg, addr); 3042 3043 if (addr > (seg->s_base + sptd->spt_realsize)) { 3044 return (EFAULT); 3045 } 3046 3047 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 3048 anon_array_enter(amp, anon_index, &cookie); 3049 ap = anon_get_ptr(amp->ahp, anon_index); 3050 if (ap == NULL) { 3051 struct page *pp; 3052 caddr_t spt_addr = sptseg->s_base + ptob(anon_index); 3053 3054 pp = anon_zero(sptseg, spt_addr, &ap, kcred); 3055 if (pp == NULL) { 3056 anon_array_exit(&cookie); 3057 ANON_LOCK_EXIT(&amp->a_rwlock); 3058 return (ENOMEM); 3059 } 3060 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 3061 page_unlock(pp); 3062 } 3063 anon_array_exit(&cookie); 3064 ANON_LOCK_EXIT(&amp->a_rwlock); 3065 memidp->val[0] = (uintptr_t)ap; 3066 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 3067 return (0); 3068 } 3069 3070 /* 3071 * Get memory allocation policy info for specified address in given segment 3072 */ 3073 static lgrp_mem_policy_info_t * 3074 segspt_shmgetpolicy(struct seg *seg, caddr_t addr) 3075 { 3076 struct anon_map *amp; 3077 ulong_t anon_index; 3078 lgrp_mem_policy_info_t *policy_info; 3079 struct shm_data *shm_data; 3080 3081 ASSERT(seg != NULL); 3082 3083 /* 3084 * Get anon_map from segshm 3085 * 3086 * Assume that no lock needs to be held on anon_map, since 3087 * it should be protected by its reference count which must be 3088 * nonzero for an existing segment 3089 * Need to grab readers lock on policy tree though 3090 */ 3091 shm_data = (struct shm_data *)seg->s_data; 3092 if (shm_data == NULL) 3093 return (NULL); 3094 amp = shm_data->shm_amp; 3095 ASSERT(amp->refcnt != 0); 3096 3097 /* 3098 * Get policy info 3099 * 3100 * Assume starting anon index of 0 3101 */ 3102 anon_index = seg_page(seg, addr); 3103 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); 3104 3105 return (policy_info); 3106 }
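
/*
 * Rough user-level sketch (hypothetical example, not part of this
 * module) of how the paths above are typically reached.  shmat(2)
 * with SHM_PAGEABLE creates a DISM segment, so touching its pages
 * goes through segspt_shmfault()/segspt_dismfault(), and
 * mlock()/munlock() reach segspt_shmlockop() via MC_LOCK/MC_UNLOCK;
 * SHM_SHARE_MMU would create a classic (non-pageable) ISM segment
 * instead.  Error handling is omitted and the 64 MB size is
 * arbitrary.
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *	#include <sys/mman.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		size_t	sz = 64 * 1024 * 1024;
 *		int	id = shmget(IPC_PRIVATE, sz, IPC_CREAT | 0600);
 *		char	*p = shmat(id, NULL, SHM_PAGEABLE);	(DISM attach)
 *
 *		(void) memset(p, 0, sz);	(faults -> segspt_dismfault())
 *		(void) mlock(p, sz);		(MC_LOCK -> segspt_shmlockop())
 *		(void) munlock(p, sz);		(MC_UNLOCK -> segspt_shmlockop())
 *		(void) shmdt(p);		(unmap -> segspt_shmunmap())
 *		return (0);
 *	}
 */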