/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/param.h>
#include <sys/user.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/tuneable.h>
#include <vm/hat.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <sys/buf.h>
#include <sys/swap.h>
#include <sys/atomic.h>
#include <vm/seg_spt.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/shm.h>
#include <sys/shm_impl.h>
#include <sys/lgrp.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>
#include <sys/project.h>
#include <sys/tnf_probe.h>
#include <sys/zone.h>

#define	SEGSPTADDR	(caddr_t)0x0

/*
 * # pages used for spt
 */
size_t	spt_used;

/*
 * segspt_minfree is the memory left for the system after ISM
 * locked its pages; it is set to 5% of availrmem in
 * sptcreate when ISM is created. ISM should not use more
 * than ~90% of availrmem; if it does, then the performance
 * of the system may decrease. Machines with large memories may
 * be able to use up more memory for ISM, so we set the default
 * segspt_minfree to 5% (which gives ISM a maximum of 95% of availrmem).
 * If somebody wants even more memory for ISM (risking hanging
 * the system) they can patch segspt_minfree to a smaller number.
 */
pgcnt_t segspt_minfree = 0;

static int segspt_create(struct seg *seg, caddr_t argsp);
static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_free(struct seg *seg);
static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);

static const struct seg_ops segspt_ops = {
	.unmap		= segspt_unmap,
	.free		= segspt_free,
	.getpolicy	= segspt_getpolicy,
};

static int segspt_shmdup(struct seg *seg, struct seg *newseg);
static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_shmfree(struct seg *seg);
static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
    caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
    register size_t len, register uint_t prot);
static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
    uint_t prot);
static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
    register char *vec);
static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
    int attr, uint_t flags);
static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos);
static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
    uint_t *protv);
static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
static int segspt_shmgettype(struct seg *seg, caddr_t addr);
static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
    uint_t behav);
static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
    struct page ***, enum lock_type, enum seg_rw);
static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);

const struct seg_ops segspt_shmops = {
	.dup		= segspt_shmdup,
	.unmap		= segspt_shmunmap,
	.free		= segspt_shmfree,
	.fault		= segspt_shmfault,
	.faulta		= segspt_shmfaulta,
	.setprot	= segspt_shmsetprot,
	.checkprot	= segspt_shmcheckprot,
	.kluster	= segspt_shmkluster,
	.sync		= segspt_shmsync,
	.incore		= segspt_shmincore,
	.lockop		= segspt_shmlockop,
	.getprot	= segspt_shmgetprot,
	.getoffset	= segspt_shmgetoffset,
	.gettype	= segspt_shmgettype,
	.getvp		= segspt_shmgetvp,
	.advise		= segspt_shmadvise,
	.pagelock	= segspt_shmpagelock,
	.getmemid	= segspt_shmgetmemid,
	.getpolicy	= segspt_shmgetpolicy,
};

static void segspt_purge(struct seg *seg);
static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);
static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
    page_t **ppa);



/*ARGSUSED*/
int
sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
    uint_t prot, uint_t flags, uint_t share_szc)
{
	int err;
	struct as *newas;
	struct segspt_crargs sptcargs;

#ifdef DEBUG
	TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
	    tnf_ulong, size, size);
#endif
	if (segspt_minfree == 0)	/* leave min 5% of availrmem */
		segspt_minfree = availrmem/20;	/* for the system */

	if (!hat_supported(HAT_SHARED_PT, (void *)0))
		return (EINVAL);

	/*
	 * get a new as for this shared memory segment
	 */
	newas = as_alloc();
	newas->a_proc = NULL;
	sptcargs.amp = amp;
	sptcargs.prot = prot;
	sptcargs.flags = flags;
	sptcargs.szc = share_szc;
	/*
	 * create a shared page table (spt) segment
	 */

	if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
		as_free(newas);
		return (err);
	}
	*sptseg = sptcargs.seg_spt;
	return (0);
}

void
sptdestroy(struct as *as, struct anon_map *amp)
{

#ifdef DEBUG
	TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
#endif
	(void) as_unmap(as, SEGSPTADDR, amp->size);
	as_free(as);
}

/*
 * called from seg_free().
 * free (i.e., unlock, unmap, return to free list)
 * all the pages in the given seg.
 */
void
segspt_free(struct seg *seg)
{
	struct spt_data *sptd = (struct spt_data *)seg->s_data;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	if (sptd != NULL) {
		if (sptd->spt_realsize)
			segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);

		if (sptd->spt_ppa_lckcnt)
			kmem_free(sptd->spt_ppa_lckcnt,
			    sizeof (*sptd->spt_ppa_lckcnt)
			    * btopr(sptd->spt_amp->size));
		kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
		cv_destroy(&sptd->spt_cv);
		mutex_destroy(&sptd->spt_lock);
		kmem_free(sptd, sizeof (*sptd));
	}
}

/*ARGSUSED*/
static int
segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
    uint_t flags)
{
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	return (0);
}

/*ARGSUSED*/
static size_t
segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
{
	caddr_t eo_seg;
	pgcnt_t npages;
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg *sptseg;
	struct spt_data *sptd;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
#ifdef lint
	seg = seg;
#endif
	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		eo_seg = addr + len;
		while (addr < eo_seg) {
			/* page exists, and it's locked. */
			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
			    SEG_PAGE_ANON;
			addr += PAGESIZE;
		}
		return (len);
	} else {
		struct anon_map *amp = shmd->shm_amp;
		struct anon *ap;
		page_t *pp;
		pgcnt_t anon_index;
		struct vnode *vp;
		u_offset_t off;
		ulong_t i;
		int ret;
		anon_sync_obj_t cookie;

		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
		anon_index = seg_page(seg, addr);
		npages = btopr(len);
		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
			return (EINVAL);
		}
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		for (i = 0; i < npages; i++, anon_index++) {
			ret = 0;
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap != NULL) {
				swap_xlate(ap, &vp, &off);
				anon_array_exit(&cookie);
				pp = page_lookup_nowait(vp, off, SE_SHARED);
				if (pp != NULL) {
					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
					page_unlock(pp);
				}
			} else {
				anon_array_exit(&cookie);
			}
			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
				ret |= SEG_PAGE_LOCKED;
			}
			*vec++ = (char)ret;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return (len);
	}
}

static int
segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
	size_t share_size;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * seg.s_size may have been rounded up to the largest page size
	 * in shmat().
	 * XXX This should be cleaned up. sptdestroy should take a length
	 * argument which should be the same as sptcreate. Then
	 * this rounding would not be needed (or is done in shm.c)
	 * Only the check for full segment will be needed.
	 *
	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
	 * to be useful at all.
	 */
	share_size = page_get_pagesize(seg->s_szc);
	ssize = P2ROUNDUP(ssize, share_size);

	if (raddr == seg->s_base && ssize == seg->s_size) {
		seg_free(seg);
		return (0);
	} else
		return (EINVAL);
}

int
segspt_create(struct seg *seg, caddr_t argsp)
{
	int err;
	caddr_t addr = seg->s_base;
	struct spt_data *sptd;
	struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
	struct anon_map *amp = sptcargs->amp;
	struct kshmid *sp = amp->a_sp;
	struct cred *cred = CRED();
	ulong_t i, j, anon_index = 0;
	pgcnt_t npages = btopr(amp->size);
	struct vnode *vp;
	page_t **ppa;
	uint_t hat_flags;
	size_t pgsz;
	pgcnt_t pgcnt;
	caddr_t a;
	pgcnt_t pidx;
	size_t sz;
	proc_t *procp = curproc;
	rctl_qty_t lockedbytes = 0;
	kproject_t *proj;

	/*
	 * We are holding the a_lock on the underlying dummy as,
	 * so we can make calls to the HAT layer.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(sp != NULL);

#ifdef DEBUG
	TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
	    tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
#endif
	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if (err = anon_swap_adjust(npages))
			return (err);
	}
	err = ENOMEM;

	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
		goto out1;

	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
		    KM_NOSLEEP)) == NULL)
			goto out2;
	}

	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);

	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
		goto out3;

	seg->s_ops = &segspt_ops;
	sptd->spt_vp = vp;
	sptd->spt_amp = amp;
	sptd->spt_prot = sptcargs->prot;
	sptd->spt_flags = sptcargs->flags;
	seg->s_data = (caddr_t)sptd;
	sptd->spt_ppa = NULL;
	sptd->spt_ppa_lckcnt = NULL;
	seg->s_szc = sptcargs->szc;
	cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
	sptd->spt_gen = 0;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	if (seg->s_szc > amp->a_szc) {
		amp->a_szc = seg->s_szc;
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);

	/*
	 * Set policy to affect initial allocation of pages in
	 * anon_map_createpages()
	 */
	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
	    NULL, 0, ptob(npages));

	if (sptcargs->flags & SHM_PAGEABLE) {
		size_t share_sz;
		pgcnt_t new_npgs, more_pgs;
		struct anon_hdr *nahp;
		zone_t *zone;

		share_sz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(amp->size, share_sz)) {
			/*
			 * We are rounding up the size of the anon array
			 * to a 4M boundary because we always create 4M of
			 * pages when locking or faulting pages, and then we
			 * don't have to check for all the corner cases, e.g.
			 * whether there is enough space to allocate a 4M
			 * page.
			 */
			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
			more_pgs = new_npgs - npages;

			/*
			 * The zone will never be NULL, as a fully created
			 * shm always has an owning zone.
			 */
			zone = sp->shm_perm.ipc_zone_ref.zref_zone;
			ASSERT(zone != NULL);
			if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
				err = ENOMEM;
				goto out4;
			}

			nahp = anon_create(new_npgs, ANON_SLEEP);
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
			    ANON_SLEEP);
			anon_release(amp->ahp, npages);
			amp->ahp = nahp;
			ASSERT(amp->swresv == ptob(npages));
			amp->swresv = amp->size = ptob(new_npgs);
			ANON_LOCK_EXIT(&amp->a_rwlock);
			npages = new_npgs;
		}

		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
		sptd->spt_pcachecnt = 0;
		sptd->spt_realsize = ptob(npages);
		sptcargs->seg_spt = seg;
		return (0);
	}

	/*
	 * get array of pages for each anon slot in amp
	 */
	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
	    seg, addr, S_CREATE, cred)) != 0)
		goto out4;

	mutex_enter(&sp->shm_mlock);

	/* May be partially locked, so, count bytes to charge for locking */
	for (i = 0; i < npages; i++)
		if (ppa[i]->p_lckcnt == 0)
			lockedbytes += PAGESIZE;

	proj = sp->shm_perm.ipc_proj;

	if (lockedbytes > 0) {
		mutex_enter(&procp->p_lock);
		if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
			mutex_exit(&procp->p_lock);
			mutex_exit(&sp->shm_mlock);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			err = ENOMEM;
			goto out4;
		}
		mutex_exit(&procp->p_lock);
	}

	/*
	 * addr is initial address corresponding to the first page on ppa list
	 */
	for (i = 0; i < npages; i++) {
		/* attempt to lock all pages */
		if (page_pp_lock(ppa[i], 0, 1) == 0) {
			/*
			 * if unable to lock any page, unlock all
			 * of them and return error
			 */
			for (j = 0; j < i; j++)
				page_pp_unlock(ppa[j], 0, 1);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
			mutex_exit(&sp->shm_mlock);
			err = ENOMEM;
			goto out4;
		}
	}
	mutex_exit(&sp->shm_mlock);

	/*
	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
	 * for the entire life of the segment. For example platforms
	 * that do not support Dynamic Reconfiguration.
	 */
	hat_flags = HAT_LOAD_SHARE;
	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
		hat_flags |= HAT_LOAD_LOCK;

	/*
	 * Load translations one large page at a time
	 * to make sure we don't create mappings bigger than
	 * segment's size code in case underlying pages
	 * are shared with segvn's segment that uses bigger
	 * size code than we do.
	 */
	pgsz = page_get_pagesize(seg->s_szc);
	pgcnt = page_get_pagecnt(seg->s_szc);
	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
		sz = MIN(pgsz, ptob(npages - pidx));
		hat_memload_array(seg->s_as->a_hat, a, sz,
		    &ppa[pidx], sptd->spt_prot, hat_flags);
	}

	/*
	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
	 * we will leave the pages locked SE_SHARED for the life
	 * of the ISM segment. This will prevent any calls to
	 * hat_pageunload() on this ISM segment for those platforms.
	 */
	if (!(hat_flags & HAT_LOAD_LOCK)) {
		/*
		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
		 * we no longer need to hold the SE_SHARED lock on the pages,
		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
		 * SE_SHARED lock on the pages as necessary.
		 */
		for (i = 0; i < npages; i++)
			page_unlock(ppa[i]);
	}
	sptd->spt_pcachecnt = 0;
	kmem_free(ppa, ((sizeof (page_t *)) * npages));
	sptd->spt_realsize = ptob(npages);
	atomic_add_long(&spt_used, npages);
	sptcargs->seg_spt = seg;
	return (0);

out4:
	seg->s_data = NULL;
	kmem_free(vp, sizeof (*vp));
	cv_destroy(&sptd->spt_cv);
out3:
	mutex_destroy(&sptd->spt_lock);
	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
		kmem_free(ppa, (sizeof (*ppa) * npages));
out2:
	kmem_free(sptd, sizeof (*sptd));
out1:
	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
		anon_swap_restore(npages);
	return (err);
}

/*ARGSUSED*/
void
segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
{
	struct page *pp;
	struct spt_data *sptd = (struct spt_data *)seg->s_data;
	pgcnt_t npages;
	ulong_t anon_idx;
	struct anon_map *amp;
	struct anon *ap;
	struct vnode *vp;
	u_offset_t off;
	uint_t hat_flags;
	int root = 0;
	pgcnt_t pgs, curnpgs = 0;
	page_t *rootpp;
	rctl_qty_t unlocked_bytes = 0;
	kproject_t *proj;
	kshmid_t *sp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	len = P2ROUNDUP(len, PAGESIZE);

	npages = btop(len);

	hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
	if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
	    (sptd->spt_flags & SHM_PAGEABLE)) {
		hat_flags = HAT_UNLOAD_UNMAP;
	}

	hat_unload(seg->s_as->a_hat, addr, len, hat_flags);

	amp = sptd->spt_amp;
	if (sptd->spt_flags & SHM_PAGEABLE)
		npages = btop(amp->size);

	ASSERT(amp != NULL);

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		sp = amp->a_sp;
		proj = sp->shm_perm.ipc_proj;
		mutex_enter(&sp->shm_mlock);
	}
	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
			if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
				panic("segspt_free_pages: null app");
				/*NOTREACHED*/
			}
		} else {
			if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
			    == NULL)
				continue;
		}
		ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
		swap_xlate(ap, &vp, &off);

		/*
		 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
		 * the pages won't be having SE_SHARED lock at this
		 * point.
		 *
		 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
		 * the pages are still held SE_SHARED locked from the
		 * original segspt_create()
		 *
		 * Our goal is to get SE_EXCL lock on each page, remove
		 * permanent lock on it and invalidate the page.
		 */
		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
			if (hat_flags == HAT_UNLOAD_UNMAP)
				pp = page_lookup(vp, off, SE_EXCL);
			else {
				if ((pp = page_find(vp, off)) == NULL) {
					panic("segspt_free_pages: "
					    "page not locked");
					/*NOTREACHED*/
				}
				if (!page_tryupgrade(pp)) {
					page_unlock(pp);
					pp = page_lookup(vp, off, SE_EXCL);
				}
			}
			if (pp == NULL) {
				panic("segspt_free_pages: "
				    "page not in the system");
				/*NOTREACHED*/
			}
			ASSERT(pp->p_lckcnt > 0);
			page_pp_unlock(pp, 0, 1);
			if (pp->p_lckcnt == 0)
				unlocked_bytes += PAGESIZE;
		} else {
			if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
				continue;
		}
		/*
		 * It's logical to invalidate the pages here as in most cases
		 * these were created by segspt.
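		 *
		 * Large pages must be torn down as a unit: below we defer
		 * page_destroy_pages() until the last constituent page of
		 * each large page has been processed, while small pages are
		 * disposed of immediately via VN_DISPOSE().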
		 */
		if (pp->p_szc != 0) {
			if (root == 0) {
				ASSERT(curnpgs == 0);
				root = 1;
				rootpp = pp;
				pgs = curnpgs = page_get_pagecnt(pp->p_szc);
				ASSERT(pgs > 1);
				ASSERT(IS_P2ALIGNED(pgs, pgs));
				ASSERT(!(page_pptonum(pp) & (pgs - 1)));
				curnpgs--;
			} else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
				ASSERT(curnpgs == 1);
				ASSERT(page_pptonum(pp) ==
				    page_pptonum(rootpp) + (pgs - 1));
				page_destroy_pages(rootpp);
				root = 0;
				curnpgs = 0;
			} else {
				ASSERT(curnpgs > 1);
				ASSERT(page_pptonum(pp) ==
				    page_pptonum(rootpp) + (pgs - curnpgs));
				curnpgs--;
			}
		} else {
			if (root != 0 || curnpgs != 0) {
				panic("segspt_free_pages: bad large page");
				/*NOTREACHED*/
			}
			/*
			 * Before destroying the pages, we need to take care
			 * of the rctl locked memory accounting. For that
			 * we need to calculate the unlocked_bytes.
			 */
			if (pp->p_lckcnt > 0)
				unlocked_bytes += PAGESIZE;
			/*LINTED: constant in conditional context */
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		}
	}
	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		if (unlocked_bytes > 0)
			rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
		mutex_exit(&sp->shm_mlock);
	}
	if (root != 0 || curnpgs != 0) {
		panic("segspt_free_pages: bad large page");
		/*NOTREACHED*/
	}

	/*
	 * mark that pages have been released
	 */
	sptd->spt_realsize = 0;

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		atomic_add_long(&spt_used, -npages);
		anon_swap_restore(npages);
	}
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segspt_getpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map *amp;
	ulong_t anon_index;
	lgrp_mem_policy_info_t *policy_info;
	struct spt_data *spt_data;

	ASSERT(seg != NULL);

	/*
	 * Get anon_map from segspt
	 *
	 * Assume that no lock needs to be held on anon_map, since
	 * it should be protected by its reference count which must be
	 * nonzero for an existing segment
	 * Need to grab readers lock on policy tree though
	 */
	spt_data = (struct spt_data *)seg->s_data;
	if (spt_data == NULL)
		return (NULL);
	amp = spt_data->spt_amp;
	ASSERT(amp->refcnt != 0);

	/*
	 * Get policy info
	 *
	 * Assume starting anon index of 0
	 */
	anon_index = seg_page(seg, addr);
	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);

	return (policy_info);
}

/*
 * DISM only.
 * Return locked pages over a given range.
 *
 * We will cache all DISM locked pages and save the pplist for the
 * entire segment in the ppa field of the underlying DISM segment structure.
 * Later, during a call to segspt_reclaim() we will use this ppa array
 * to page_unlock() all of the pages and then we will free this ppa list.
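 *
 * Only pages that are currently mlocked (or constituent pages of a locked
 * large page) are entered into the ppa array; slots for unlocked pages are
 * left NULL.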
 */
/*ARGSUSED*/
static int
segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg *sptseg = shmd->shm_sptseg;
	struct spt_data *sptd = sptseg->s_data;
	pgcnt_t pg_idx, npages, tot_npages, npgs;
	struct page **pplist, **pl, **ppa, *pp;
	struct anon_map *amp;
	spgcnt_t an_idx;
	int ret = ENOTSUP;
	uint_t pl_built = 0;
	struct anon *ap;
	struct vnode *vp;
	u_offset_t off;
	pgcnt_t claim_availrmem = 0;
	uint_t szc;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);

	/*
	 * We want to lock/unlock the entire ISM segment. Therefore,
	 * we will be using the underlying sptseg and its base address
	 * and length for the caching arguments.
	 */
	ASSERT(sptseg);
	ASSERT(sptd);

	pg_idx = seg_page(seg, addr);
	npages = btopr(len);

	/*
	 * check if the request is larger than number of pages covered
	 * by amp
	 */
	if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
		*ppp = NULL;
		return (ENOTSUP);
	}

	if (type == L_PAGEUNLOCK) {
		ASSERT(sptd->spt_ppa != NULL);

		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);

		/*
		 * If someone is blocked while unmapping, we purge
		 * segment page cache and thus reclaim pplist synchronously
		 * without waiting for seg_pasync_thread. This speeds up
		 * unmapping in cases where munmap(2) is called, while
		 * raw async i/o is still in progress or where a thread
		 * exits on data fault in a multithreaded application.
		 */
		if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
		    (AS_ISUNMAPWAIT(seg->s_as) &&
		    shmd->shm_softlockcnt > 0)) {
			segspt_purge(seg);
		}
		return (0);
	}

	/* The L_PAGELOCK case ... */

	if (sptd->spt_flags & DISM_PPA_CHANGED) {
		segspt_purge(seg);
		/*
		 * for DISM the ppa array needs to be rebuilt since
		 * the number of locked pages could have changed
		 */
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * First try to find pages in segment page cache, without
	 * holding the segment lock.
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa != NULL);
		ASSERT(sptd->spt_ppa == pplist);
		ppa = sptd->spt_ppa;
		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
			if (ppa[an_idx] == NULL) {
				seg_pinactive(seg, NULL, seg->s_base,
				    sptd->spt_amp->size, ppa,
				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
				*ppp = NULL;
				return (ENOTSUP);
			}
			if ((szc = ppa[an_idx]->p_szc) != 0) {
				npgs = page_get_pagecnt(szc);
				an_idx = P2ROUNDUP(an_idx + 1, npgs);
			} else {
				an_idx++;
			}
		}
		/*
		 * Since we cache the entire DISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. pg_idx.
		 */
		*ppp = &(sptd->spt_ppa[pg_idx]);
		return (0);
	}

	mutex_enter(&sptd->spt_lock);
	/*
	 * try to find pages in segment page cache with mutex
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa != NULL);
		ASSERT(sptd->spt_ppa == pplist);
		ppa = sptd->spt_ppa;
		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
			if (ppa[an_idx] == NULL) {
				mutex_exit(&sptd->spt_lock);
				seg_pinactive(seg, NULL, seg->s_base,
				    sptd->spt_amp->size, ppa,
				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
				*ppp = NULL;
				return (ENOTSUP);
			}
			if ((szc = ppa[an_idx]->p_szc) != 0) {
				npgs = page_get_pagecnt(szc);
				an_idx = P2ROUNDUP(an_idx + 1, npgs);
			} else {
				an_idx++;
			}
		}
		/*
		 * Since we cache the entire DISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. pg_idx.
		 */
		mutex_exit(&sptd->spt_lock);
		*ppp = &(sptd->spt_ppa[pg_idx]);
		return (0);
	}
	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
		mutex_exit(&sptd->spt_lock);
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * No need to worry about protections because DISM pages are always rw.
	 */
	pl = pplist = NULL;
	amp = sptd->spt_amp;

	/*
	 * Do we need to build the ppa array?
	 */
	if (sptd->spt_ppa == NULL) {
		pgcnt_t lpg_cnt = 0;

		pl_built = 1;
		tot_npages = btopr(sptd->spt_amp->size);

		ASSERT(sptd->spt_pcachecnt == 0);
		pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
		pl = pplist;

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		for (an_idx = 0; an_idx < tot_npages; ) {
			ap = anon_get_ptr(amp->ahp, an_idx);
			/*
			 * Cache only mlocked pages. For large pages
			 * if one (constituent) page is mlocked
			 * all pages for that large page
			 * are cached also. This is for quick
			 * lookups of ppa array.
			 */
			if ((ap != NULL) && (lpg_cnt != 0 ||
			    (sptd->spt_ppa_lckcnt[an_idx] != 0))) {

				swap_xlate(ap, &vp, &off);
				pp = page_lookup(vp, off, SE_SHARED);
				ASSERT(pp != NULL);
				if (lpg_cnt == 0) {
					lpg_cnt++;
					/*
					 * For a small page, we are done --
					 * lpg_cnt is reset to 0 below.
					 *
					 * For a large page, we are guaranteed
					 * to find the anon structures of all
					 * constituent pages and a non-zero
					 * lpg_cnt ensures that we don't test
					 * for mlock for these. We are done
					 * when lpg_cnt reaches (npgs + 1).
					 * If we are not the first constituent
					 * page, restart at the first one.
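					 *
					 * (The P2ALIGN() below backs an_idx
					 * up to the first constituent page of
					 * the large page before continuing.)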
					 */
					npgs = page_get_pagecnt(pp->p_szc);
					if (!IS_P2ALIGNED(an_idx, npgs)) {
						an_idx = P2ALIGN(an_idx, npgs);
						page_unlock(pp);
						continue;
					}
				}
				if (++lpg_cnt > npgs)
					lpg_cnt = 0;

				/*
				 * availrmem is decremented only
				 * for unlocked pages
				 */
				if (sptd->spt_ppa_lckcnt[an_idx] == 0)
					claim_availrmem++;
				pplist[an_idx] = pp;
			}
			an_idx++;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);

		if (claim_availrmem) {
			mutex_enter(&freemem_lock);
			if (availrmem < tune.t_minarmem + claim_availrmem) {
				mutex_exit(&freemem_lock);
				ret = ENOTSUP;
				claim_availrmem = 0;
				goto insert_fail;
			} else {
				availrmem -= claim_availrmem;
			}
			mutex_exit(&freemem_lock);
		}

		sptd->spt_ppa = pl;
	} else {
		/*
		 * We already have a valid ppa[].
		 */
		pl = sptd->spt_ppa;
	}

	ASSERT(pl != NULL);

	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
	    segspt_reclaim);
	if (ret == SEGP_FAIL) {
		/*
		 * seg_pinsert failed. We return
		 * ENOTSUP, so that the as_pagelock() code will
		 * then try the slower F_SOFTLOCK path.
		 */
		if (pl_built) {
			/*
			 * No one else has referenced the ppa[].
			 * We created it and we need to destroy it.
			 */
			sptd->spt_ppa = NULL;
		}
		ret = ENOTSUP;
		goto insert_fail;
	}

	/*
	 * In either case, we increment softlockcnt on the 'real' segment.
	 */
	sptd->spt_pcachecnt++;
	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	ppa = sptd->spt_ppa;
	for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
		if (ppa[an_idx] == NULL) {
			mutex_exit(&sptd->spt_lock);
			seg_pinactive(seg, NULL, seg->s_base,
			    sptd->spt_amp->size,
			    pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
			*ppp = NULL;
			return (ENOTSUP);
		}
		if ((szc = ppa[an_idx]->p_szc) != 0) {
			npgs = page_get_pagecnt(szc);
			an_idx = P2ROUNDUP(an_idx + 1, npgs);
		} else {
			an_idx++;
		}
	}
	/*
	 * We can now drop the sptd->spt_lock since the ppa[]
	 * exists and we have incremented pcachecnt.
	 */
	mutex_exit(&sptd->spt_lock);

	/*
	 * Since we cache the entire segment, we want to
	 * set ppp to point to the first slot that corresponds
	 * to the requested addr, i.e. pg_idx.
	 */
	*ppp = &(sptd->spt_ppa[pg_idx]);
	return (0);

insert_fail:
	/*
	 * We will only reach this code if we tried and failed.
	 *
	 * And we can drop the lock on the dummy seg, once we've failed
	 * to set up a new ppa[].
	 */
	mutex_exit(&sptd->spt_lock);

	if (pl_built) {
		if (claim_availrmem) {
			mutex_enter(&freemem_lock);
			availrmem += claim_availrmem;
			mutex_exit(&freemem_lock);
		}

		/*
		 * We created pl and we need to destroy it.
		 */
		pplist = pl;
		for (an_idx = 0; an_idx < tot_npages; an_idx++) {
			if (pplist[an_idx] != NULL)
				page_unlock(pplist[an_idx]);
		}
		kmem_free(pl, sizeof (page_t *) * tot_npages);
	}

	if (shmd->shm_softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	*ppp = NULL;
	return (ret);
}



/*
 * return locked pages over a given range.
 *
 * We will cache the entire ISM segment and save the pplist for the
 * entire segment in the ppa field of the underlying ISM segment structure.
 * Later, during a call to segspt_reclaim() we will use this ppa array
 * to page_unlock() all of the pages and then we will free this ppa list.
 */
/*ARGSUSED*/
static int
segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg *sptseg = shmd->shm_sptseg;
	struct spt_data *sptd = sptseg->s_data;
	pgcnt_t np, page_index, npages;
	caddr_t a, spt_base;
	struct page **pplist, **pl, *pp;
	struct anon_map *amp;
	ulong_t anon_index;
	int ret = ENOTSUP;
	uint_t pl_built = 0;
	struct anon *ap;
	struct vnode *vp;
	u_offset_t off;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);


	/*
	 * We want to lock/unlock the entire ISM segment. Therefore,
	 * we will be using the underlying sptseg and its base address
	 * and length for the caching arguments.
	 */
	ASSERT(sptseg);
	ASSERT(sptd);

	if (sptd->spt_flags & SHM_PAGEABLE) {
		return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
	}

	page_index = seg_page(seg, addr);
	npages = btopr(len);

	/*
	 * check if the request is larger than number of pages covered
	 * by amp
	 */
	if (page_index + npages > btopr(sptd->spt_amp->size)) {
		*ppp = NULL;
		return (ENOTSUP);
	}

	if (type == L_PAGEUNLOCK) {

		ASSERT(sptd->spt_ppa != NULL);

		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);

		/*
		 * If someone is blocked while unmapping, we purge
		 * segment page cache and thus reclaim pplist synchronously
		 * without waiting for seg_pasync_thread. This speeds up
		 * unmapping in cases where munmap(2) is called, while
		 * raw async i/o is still in progress or where a thread
		 * exits on data fault in a multithreaded application.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
			segspt_purge(seg);
		}
		return (0);
	}

	/* The L_PAGELOCK case... */

	/*
	 * First try to find pages in segment page cache, without
	 * holding the segment lock.
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa == pplist);
		ASSERT(sptd->spt_ppa[page_index]);
		/*
		 * Since we cache the entire ISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. page_index.
		 */
		*ppp = &(sptd->spt_ppa[page_index]);
		return (0);
	}

	mutex_enter(&sptd->spt_lock);

	/*
	 * try to find pages in segment page cache
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa == pplist);
		/*
		 * Since we cache the entire segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. page_index.
		 */
		mutex_exit(&sptd->spt_lock);
		*ppp = &(sptd->spt_ppa[page_index]);
		return (0);
	}

	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
		mutex_exit(&sptd->spt_lock);
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * No need to worry about protections because ISM pages
	 * are always rw.
	 */
	pl = pplist = NULL;

	/*
	 * Do we need to build the ppa array?
	 */
	if (sptd->spt_ppa == NULL) {
		ASSERT(sptd->spt_ppa == pplist);

		spt_base = sptseg->s_base;
		pl_built = 1;

		/*
		 * availrmem is decremented once during anon_swap_adjust()
		 * and is incremented during the anon_unresv(), which is
		 * called from shm_rm_amp() when the segment is destroyed.
		 */
		amp = sptd->spt_amp;
		ASSERT(amp != NULL);

		/* pcachecnt is protected by sptd->spt_lock */
		ASSERT(sptd->spt_pcachecnt == 0);
		pplist = kmem_zalloc(sizeof (page_t *)
		    * btopr(sptd->spt_amp->size), KM_SLEEP);
		pl = pplist;

		anon_index = seg_page(sptseg, spt_base);

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
		    a += PAGESIZE, anon_index++, pplist++) {
			ap = anon_get_ptr(amp->ahp, anon_index);
			ASSERT(ap != NULL);
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, off, SE_SHARED);
			ASSERT(pp != NULL);
			*pplist = pp;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);

		if (a < (spt_base + sptd->spt_amp->size)) {
			ret = ENOTSUP;
			goto insert_fail;
		}
		sptd->spt_ppa = pl;
	} else {
		/*
		 * We already have a valid ppa[].
		 */
		pl = sptd->spt_ppa;
	}

	ASSERT(pl != NULL);

	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
	    segspt_reclaim);
	if (ret == SEGP_FAIL) {
		/*
		 * seg_pinsert failed. We return
		 * ENOTSUP, so that the as_pagelock() code will
		 * then try the slower F_SOFTLOCK path.
		 */
		if (pl_built) {
			/*
			 * No one else has referenced the ppa[].
			 * We created it and we need to destroy it.
			 */
			sptd->spt_ppa = NULL;
		}
		ret = ENOTSUP;
		goto insert_fail;
	}

	/*
	 * In either case, we increment softlockcnt on the 'real' segment.
	 */
	sptd->spt_pcachecnt++;
	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	/*
	 * We can now drop the sptd->spt_lock since the ppa[]
	 * exists and we have incremented pcachecnt.
	 */
	mutex_exit(&sptd->spt_lock);

	/*
	 * Since we cache the entire segment, we want to
	 * set ppp to point to the first slot that corresponds
	 * to the requested addr, i.e. page_index.
	 */
	*ppp = &(sptd->spt_ppa[page_index]);
	return (0);

insert_fail:
	/*
	 * We will only reach this code if we tried and failed.
	 *
	 * And we can drop the lock on the dummy seg, once we've failed
	 * to set up a new ppa[].
	 */
	mutex_exit(&sptd->spt_lock);

	if (pl_built) {
		/*
		 * We created pl and we need to destroy it.
		 */
		pplist = pl;
		np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
		while (np) {
			page_unlock(*pplist);
			np--;
			pplist++;
		}
		kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
	}
	if (shmd->shm_softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	*ppp = NULL;
	return (ret);
}

/*
 * purge any cached pages in the I/O page cache
 */
static void
segspt_purge(struct seg *seg)
{
	seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
}

static int
segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw, int async)
{
	struct seg *seg = (struct seg *)ptag;
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg *sptseg;
	struct spt_data *sptd;
	pgcnt_t npages, i, free_availrmem = 0;
	int done = 0;

#ifdef lint
	addr = addr;
#endif
	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;
	npages = (len >> PAGESHIFT);
	ASSERT(npages);
	ASSERT(sptd->spt_pcachecnt != 0);
	ASSERT(sptd->spt_ppa == pplist);
	ASSERT(npages == btopr(sptd->spt_amp->size));
	ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Acquire the lock on the dummy seg and destroy the
	 * ppa array IF this is the last pcachecnt.
	 */
	mutex_enter(&sptd->spt_lock);
	if (--sptd->spt_pcachecnt == 0) {
		for (i = 0; i < npages; i++) {
			if (pplist[i] == NULL) {
				continue;
			}
			if (rw == S_WRITE) {
				hat_setrefmod(pplist[i]);
			} else {
				hat_setref(pplist[i]);
			}
			if ((sptd->spt_flags & SHM_PAGEABLE) &&
			    (sptd->spt_ppa_lckcnt[i] == 0))
				free_availrmem++;
			page_unlock(pplist[i]);
		}
		if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
			mutex_enter(&freemem_lock);
			availrmem += free_availrmem;
			mutex_exit(&freemem_lock);
		}
		/*
		 * Since we want to cache/uncache the entire ISM segment,
		 * we will track the pplist in a segspt specific field
		 * ppa, that is initialized at the time we add an entry to
		 * the cache.
		 */
		ASSERT(sptd->spt_pcachecnt == 0);
		kmem_free(pplist, sizeof (page_t *) * npages);
		sptd->spt_ppa = NULL;
		sptd->spt_flags &= ~DISM_PPA_CHANGED;
		sptd->spt_gen++;
		cv_broadcast(&sptd->spt_cv);
		done = 1;
	}
	mutex_exit(&sptd->spt_lock);

	/*
	 * If we are the pcache async thread or were called via
	 * seg_ppurge_wiredpp(), we may not hold the AS lock (in this case the
	 * async argument is not 0). This means that if softlockcnt drops to 0
	 * after the decrement below, the address space may get freed. We
	 * can't allow that, since after the softlock decrement to 0 we still
	 * need to access the as structure for a possible wakeup of unmap
	 * waiters. To prevent the disappearance of the as we take this
	 * segment's shm_segfree_syncmtx. segspt_shmfree() also takes
	 * this mutex as a barrier to make sure this routine completes before
	 * the segment is freed.
	 *
	 * The second complication we have to deal with in the async case is
	 * the possibility of a missed wake up of the unmap wait thread. When
	 * we don't hold the as lock here, we may take the a_contents lock
	 * before the unmap wait thread that was first to see that softlockcnt
	 * was still not 0. As a result we'll fail to wake up the unmap wait
	 * thread. To avoid this race we set the nounmapwait flag in the as
	 * structure if we drop softlockcnt to 0 while async is not 0. The
	 * unmapwait thread will not block if this flag is set.
	 */
	if (async)
		mutex_enter(&shmd->shm_segfree_syncmtx);

	/*
	 * Now decrement softlockcnt.
	 */
	ASSERT(shmd->shm_softlockcnt > 0);
	atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	if (shmd->shm_softlockcnt <= 0) {
		if (async || AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (async)
				AS_SETNOUNMAPWAIT(seg->s_as);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}

	if (async)
		mutex_exit(&shmd->shm_segfree_syncmtx);

	return (done);
}

/*
 * Do a F_SOFTUNLOCK call over the range requested.
 * The range must have already been F_SOFTLOCK'ed.
 *
 * The calls to acquire and release the anon map lock mutex were
 * removed in order to avoid a deadly embrace during a DR
 * memory delete operation. (Eg. DR blocks while waiting for an
 * exclusive lock on a page that is being used for kaio; the
 * thread that will complete the kaio and call segspt_softunlock
 * blocks on the anon map lock; another thread holding the anon
 * map lock blocks on another page lock via the segspt_shmfault
 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
 *
 * The appropriateness of the removal is based upon the following:
 * 1. If we are holding a segment's reader lock and the page is held
 * shared, then the corresponding element in anonmap which points to
 * anon struct cannot change and there is no need to acquire the
 * anonymous map lock.
 * 2. Threads in segspt_softunlock have a reader lock on the segment
 * and already have the shared page lock, so we are guaranteed that
 * the anon map slot cannot change and therefore can call anon_get_ptr()
 * without grabbing the anonymous map lock.
 * 3. Threads that softlock a shared page break copy-on-write, even if
 * it's a read. Thus cow faults can be ignored with respect to soft
 * unlocking, since the breaking of cow means that the anon slot(s) will
 * not be shared.
 */
static void
segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
    size_t len, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg *sptseg;
	struct spt_data *sptd;
	page_t *pp;
	caddr_t adr;
	struct vnode *vp;
	u_offset_t offset;
	ulong_t anon_index;
	struct anon_map *amp;		/* XXX - for locknest */
	struct anon *ap = NULL;
	pgcnt_t npages;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;

	/*
	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
	 * and therefore their pages are SE_SHARED locked
	 * for the entire life of the segment.
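	 * In that case there is nothing for us to unlock here; we only
	 * need to fall through and decrement the softlock count.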
	 */
	if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
	    ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
		goto softlock_decrement;
	}

	/*
	 * Any thread is free to do a page_find and
	 * page_unlock() on the pages within this seg.
	 *
	 * We are already holding the as->a_lock on the user's
	 * real segment, but we need to hold the a_lock on the
	 * underlying dummy as. This is mostly to satisfy the
	 * underlying HAT layer.
	 */
	AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
	hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
	AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);

	amp = sptd->spt_amp;
	ASSERT(amp != NULL);
	anon_index = seg_page(sptseg, sptseg_addr);

	for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
		ap = anon_get_ptr(amp->ahp, anon_index++);
		ASSERT(ap != NULL);
		swap_xlate(ap, &vp, &offset);

		/*
		 * Use page_find() instead of page_lookup() to
		 * find the page since we know that it has a
		 * "shared" lock.
		 */
		pp = page_find(vp, offset);
		ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
		if (pp == NULL) {
			panic("segspt_softunlock: "
			    "addr %p, ap %p, vp %p, off %llx",
			    (void *)adr, (void *)ap, (void *)vp, offset);
			/*NOTREACHED*/
		}

		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else if (rw != S_OTHER) {
			hat_setref(pp);
		}
		page_unlock(pp);
	}

softlock_decrement:
	npages = btopr(len);
	ASSERT(shmd->shm_softlockcnt >= npages);
	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
	if (shmd->shm_softlockcnt == 0) {
		/*
		 * All SOFTLOCKS are gone. Wakeup any waiting
		 * unmappers so they can try again to unmap.
		 * Check for waiters first without the mutex
		 * held so we don't always grab the mutex on
		 * softunlocks.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
}

int
segspt_shmattach(struct seg *seg, caddr_t *argsp)
{
	struct shm_data *shmd_arg = (struct shm_data *)argsp;
	struct shm_data *shmd;
	struct anon_map *shm_amp = shmd_arg->shm_amp;
	struct spt_data *sptd;
	int error = 0;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
	if (shmd == NULL)
		return (ENOMEM);

	shmd->shm_sptas = shmd_arg->shm_sptas;
	shmd->shm_amp = shm_amp;
	shmd->shm_sptseg = shmd_arg->shm_sptseg;

	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
	    NULL, 0, seg->s_size);

	mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);

	seg->s_data = (void *)shmd;
	seg->s_ops = &segspt_shmops;
	seg->s_szc = shmd->shm_sptseg->s_szc;
	sptd = shmd->shm_sptseg->s_data;

	if (sptd->spt_flags & SHM_PAGEABLE) {
		if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
		    KM_NOSLEEP)) == NULL) {
			seg->s_data = (void *)NULL;
			kmem_free(shmd, (sizeof (*shmd)));
			return (ENOMEM);
		}
		shmd->shm_lckpgs = 0;
		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
			if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
			    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
			    seg->s_size, seg->s_szc)) != 0) {
				kmem_free(shmd->shm_vpage,
				    btopr(shm_amp->size));
			}
		}
	} else {
		error = hat_share(seg->s_as->a_hat, seg->s_base,
		    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
		    seg->s_size, seg->s_szc);
	}
	if (error) {
		seg->s_szc = 0;
		seg->s_data = (void *)NULL;
		kmem_free(shmd, (sizeof (*shmd)));
	} else {
		ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
		shm_amp->refcnt++;
		ANON_LOCK_EXIT(&shm_amp->a_rwlock);
	}
	return (error);
}

int
segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	int reclaim = 1;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
retry:
	if (shmd->shm_softlockcnt > 0) {
		if (reclaim == 1) {
			segspt_purge(seg);
			reclaim = 0;
			goto retry;
		}
		return (EAGAIN);
	}

	if (ssize != seg->s_size) {
#ifdef DEBUG
		cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
		    ssize, seg->s_size);
#endif
		return (EINVAL);
	}

	(void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
	    NULL, 0);
	hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);

	seg_free(seg);

	return (0);
}

void
segspt_shmfree(struct seg *seg)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct anon_map *shm_amp = shmd->shm_amp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	(void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
	    MC_UNLOCK, NULL, 0);

	/*
	 * Need to increment refcnt when attaching
	 * and decrement when detaching because of dup().
	 */
	ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
	shm_amp->refcnt--;
	ANON_LOCK_EXIT(&shm_amp->a_rwlock);

	if (shmd->shm_vpage) {	/* only for DISM */
		kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
		shmd->shm_vpage = NULL;
	}

	/*
	 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
	 * still working with this segment without holding as lock.
	 */
	ASSERT(shmd->shm_softlockcnt == 0);
	mutex_enter(&shmd->shm_segfree_syncmtx);
	mutex_destroy(&shmd->shm_segfree_syncmtx);

	kmem_free(shmd, sizeof (*shmd));
}

/*ARGSUSED*/
int
segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Shared page table is more than shared mapping.
	 * Individual process sharing page tables can't change prot
	 * because there is only one set of page tables.
	 * This will be allowed after private page table is
	 * supported.
	 */
	/* need to return correct status error? */
	return (0);
}


faultcode_t
segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
    size_t len, enum fault_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg *sptseg = shmd->shm_sptseg;
	struct as *curspt = shmd->shm_sptas;
	struct spt_data *sptd = sptseg->s_data;
	pgcnt_t npages;
	size_t size;
	caddr_t segspt_addr, shm_addr;
	page_t **ppa;
	int i;
	ulong_t an_idx = 0;
	int err = 0;
	int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
	size_t pgsz;
	pgcnt_t pgcnt;
	caddr_t a;
	pgcnt_t pidx;

#ifdef lint
	hat = hat;
#endif
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Because of the way spt is implemented
	 * the realsize of the segment does not have to be
	 * equal to the segment size itself. The segment size is
	 * often in multiples of a page size larger than PAGESIZE.
	 * The realsize is rounded up to the nearest PAGESIZE
	 * based on what the user requested. This is a bit of
	 * ugliness that is historical but not easily fixed
	 * without re-designing the higher levels of ISM.
	 */
	ASSERT(addr >= seg->s_base);
	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
		return (FC_NOMAP);
	/*
	 * For all of the following cases except F_PROT, we need to
	 * make any necessary adjustments to addr and len
	 * and get all of the necessary page_t's into an array called ppa[].
	 *
	 * The code in shmat() forces base addr and len of ISM segment
	 * to be aligned to largest page size supported. Therefore,
	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
	 * in large pagesize chunks, or else we will screw up the HAT
	 * layer by calling hat_memload_array() with differing page sizes
	 * over a given virtual range.
	 */
	pgsz = page_get_pagesize(sptseg->s_szc);
	pgcnt = page_get_pagecnt(sptseg->s_szc);
	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
	npages = btopr(size);

	/*
	 * Now we need to convert from addr in segshm to addr in segspt.
	 */
	an_idx = seg_page(seg, shm_addr);
	segspt_addr = sptseg->s_base + ptob(an_idx);

	ASSERT((segspt_addr + ptob(npages)) <=
	    (sptseg->s_base + sptd->spt_realsize));
	ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));

	switch (type) {

	case F_SOFTLOCK:

		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
		/*
		 * Fall through to the F_INVAL case to load up the hat layer
		 * entries with the HAT_LOAD_LOCK flag.
		 */
		/* FALLTHRU */
	case F_INVAL:

		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
			return (FC_NOMAP);

		ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);

		err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
		if (err != 0) {
			if (type == F_SOFTLOCK) {
				atomic_add_long((ulong_t *)(
				    &(shmd->shm_softlockcnt)), -npages);
			}
			goto dism_err;
		}
		AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
		a = segspt_addr;
		pidx = 0;
		if (type == F_SOFTLOCK) {

			/*
			 * Load up the translation keeping it
			 * locked and don't unlock the page.
			 */
			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
				hat_memload_array(sptseg->s_as->a_hat,
				    a, pgsz, &ppa[pidx], sptd->spt_prot,
				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
			}
		} else {
			/*
			 * Migrate pages marked for migration
			 */
			if (lgrp_optimizations())
				page_migrate(seg, shm_addr, ppa, npages);

			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
				hat_memload_array(sptseg->s_as->a_hat,
				    a, pgsz, &ppa[pidx],
				    sptd->spt_prot,
				    HAT_LOAD_SHARE);
			}

			/*
			 * And now drop the SE_SHARED lock(s).
			 */
			if (dyn_ism_unmap) {
				for (i = 0; i < npages; i++) {
					page_unlock(ppa[i]);
				}
			}
		}

		if (!dyn_ism_unmap) {
			if (hat_share(seg->s_as->a_hat, shm_addr,
			    curspt->a_hat, segspt_addr, ptob(npages),
			    seg->s_szc) != 0) {
				panic("hat_share err in DISM fault");
				/* NOTREACHED */
			}
			if (type == F_INVAL) {
				for (i = 0; i < npages; i++) {
					page_unlock(ppa[i]);
				}
			}
		}
		AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
dism_err:
		kmem_free(ppa, npages * sizeof (page_t *));
		return (err);

	case F_SOFTUNLOCK:

		/*
		 * This is a bit ugly: we pass in the real seg pointer,
		 * but the segspt_addr is the virtual address within the
		 * dummy seg.
		 */
		segspt_softunlock(seg, segspt_addr, size, rw);
		return (0);

	case F_PROT:

		/*
		 * This takes care of the unusual case where a user
		 * allocates a stack in shared memory and a register
		 * window overflow is written to that stack page before
		 * it is otherwise modified.
		 *
		 * We can get away with this because ISM segments are
		 * always rw. Other than this unusual case, there
		 * should be no instances of protection violations.
1950 */ 1951 return (0); 1952 1953 default: 1954 #ifdef DEBUG 1955 panic("segspt_dismfault default type?"); 1956 #else 1957 return (FC_NOMAP); 1958 #endif 1959 } 1960 } 1961 1962 1963 faultcode_t 1964 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr, 1965 size_t len, enum fault_type type, enum seg_rw rw) 1966 { 1967 struct shm_data *shmd = (struct shm_data *)seg->s_data; 1968 struct seg *sptseg = shmd->shm_sptseg; 1969 struct as *curspt = shmd->shm_sptas; 1970 struct spt_data *sptd = sptseg->s_data; 1971 pgcnt_t npages; 1972 size_t size; 1973 caddr_t sptseg_addr, shm_addr; 1974 page_t *pp, **ppa; 1975 int i; 1976 u_offset_t offset; 1977 ulong_t anon_index = 0; 1978 struct vnode *vp; 1979 struct anon_map *amp; /* XXX - for locknest */ 1980 struct anon *ap = NULL; 1981 size_t pgsz; 1982 pgcnt_t pgcnt; 1983 caddr_t a; 1984 pgcnt_t pidx; 1985 size_t sz; 1986 1987 #ifdef lint 1988 hat = hat; 1989 #endif 1990 1991 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 1992 1993 if (sptd->spt_flags & SHM_PAGEABLE) { 1994 return (segspt_dismfault(hat, seg, addr, len, type, rw)); 1995 } 1996 1997 /* 1998 * Because of the way spt is implemented 1999 * the realsize of the segment does not have to be 2000 * equal to the segment size itself. The segment size is 2001 * often in multiples of a page size larger than PAGESIZE. 2002 * The realsize is rounded up to the nearest PAGESIZE 2003 * based on what the user requested. This is a bit of 2004 * ugliness that is historical but not easily fixed 2005 * without re-designing the higher levels of ISM. 2006 */ 2007 ASSERT(addr >= seg->s_base); 2008 if (((addr + len) - seg->s_base) > sptd->spt_realsize) 2009 return (FC_NOMAP); 2010 /* 2011 * For all of the following cases except F_PROT, we need to 2012 * make any necessary adjustments to addr and len 2013 * and get all of the necessary page_t's into an array called ppa[]. 2014 * 2015 * The code in shmat() forces base addr and len of ISM segment 2016 * to be aligned to largest page size supported. Therefore, 2017 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large 2018 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK 2019 * in large pagesize chunks, or else we will screw up the HAT 2020 * layer by calling hat_memload_array() with differing page sizes 2021 * over a given virtual range. 2022 */ 2023 pgsz = page_get_pagesize(sptseg->s_szc); 2024 pgcnt = page_get_pagecnt(sptseg->s_szc); 2025 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); 2026 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz); 2027 npages = btopr(size); 2028 2029 /* 2030 * Now we need to convert from addr in segshm to addr in segspt. 2031 */ 2032 anon_index = seg_page(seg, shm_addr); 2033 sptseg_addr = sptseg->s_base + ptob(anon_index); 2034 2035 /* 2036 * And now we may have to adjust npages downward if we have 2037 * exceeded the realsize of the segment or initial anon 2038 * allocations. 2039 */ 2040 if ((sptseg_addr + ptob(npages)) > 2041 (sptseg->s_base + sptd->spt_realsize)) 2042 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr; 2043 2044 npages = btopr(size); 2045 2046 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size)); 2047 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0); 2048 2049 switch (type) { 2050 2051 case F_SOFTLOCK: 2052 2053 /* 2054 * availrmem is decremented once during anon_swap_adjust() 2055 * and is incremented during the anon_unresv(), which is 2056 * called from shm_rm_amp() when the segment is destroyed.
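 *
 * So, unlike a fault on an ordinary anonymous segment, the F_SOFTLOCK
 * case below does no per-page availrmem accounting; it only bumps
 * shm_softlockcnt so the segment is not torn down while pages remain
 * softlocked.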
2057 */ 2058 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages); 2059 /* 2060 * Some platforms assume that ISM pages are SE_SHARED 2061 * locked for the entire life of the segment. 2062 */ 2063 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) 2064 return (0); 2065 /* 2066 * Fall through to the F_INVAL case to load up the hat layer 2067 * entries with the HAT_LOAD_LOCK flag. 2068 */ 2069 2070 /* FALLTHRU */ 2071 case F_INVAL: 2072 2073 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC)) 2074 return (FC_NOMAP); 2075 2076 /* 2077 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP 2078 * may still rely on this call to hat_share(). That 2079 * would imply that those hat's can fault on a 2080 * HAT_LOAD_LOCK translation, which would seem 2081 * contradictory. 2082 */ 2083 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 2084 if (hat_share(seg->s_as->a_hat, seg->s_base, 2085 curspt->a_hat, sptseg->s_base, 2086 sptseg->s_size, sptseg->s_szc) != 0) { 2087 panic("hat_share error in ISM fault"); 2088 /*NOTREACHED*/ 2089 } 2090 return (0); 2091 } 2092 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP); 2093 2094 /* 2095 * I see no need to lock the real seg, 2096 * here, because all of our work will be on the underlying 2097 * dummy seg. 2098 * 2099 * sptseg_addr and npages now account for large pages. 2100 */ 2101 amp = sptd->spt_amp; 2102 ASSERT(amp != NULL); 2103 anon_index = seg_page(sptseg, sptseg_addr); 2104 2105 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2106 for (i = 0; i < npages; i++) { 2107 ap = anon_get_ptr(amp->ahp, anon_index++); 2108 ASSERT(ap != NULL); 2109 swap_xlate(ap, &vp, &offset); 2110 pp = page_lookup(vp, offset, SE_SHARED); 2111 ASSERT(pp != NULL); 2112 ppa[i] = pp; 2113 } 2114 ANON_LOCK_EXIT(&amp->a_rwlock); 2115 ASSERT(i == npages); 2116 2117 /* 2118 * We are already holding the as->a_lock on the user's 2119 * real segment, but we need to hold the a_lock on the 2120 * underlying dummy as. This is mostly to satisfy the 2121 * underlying HAT layer. 2122 */ 2123 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER); 2124 a = sptseg_addr; 2125 pidx = 0; 2126 if (type == F_SOFTLOCK) { 2127 /* 2128 * Load up the translation keeping it 2129 * locked and don't unlock the page. 2130 */ 2131 for (; pidx < npages; a += pgsz, pidx += pgcnt) { 2132 sz = MIN(pgsz, ptob(npages - pidx)); 2133 hat_memload_array(sptseg->s_as->a_hat, a, 2134 sz, &ppa[pidx], sptd->spt_prot, 2135 HAT_LOAD_LOCK | HAT_LOAD_SHARE); 2136 } 2137 } else { 2138 /* 2139 * Migrate pages marked for migration. 2140 */ 2141 if (lgrp_optimizations()) 2142 page_migrate(seg, shm_addr, ppa, npages); 2143 2144 for (; pidx < npages; a += pgsz, pidx += pgcnt) { 2145 sz = MIN(pgsz, ptob(npages - pidx)); 2146 hat_memload_array(sptseg->s_as->a_hat, 2147 a, sz, &ppa[pidx], 2148 sptd->spt_prot, HAT_LOAD_SHARE); 2149 } 2150 2151 /* 2152 * And now drop the SE_SHARED lock(s). 2153 */ 2154 for (i = 0; i < npages; i++) 2155 page_unlock(ppa[i]); 2156 } 2157 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock); 2158 2159 kmem_free(ppa, sizeof (page_t *) * npages); 2160 return (0); 2161 case F_SOFTUNLOCK: 2162 2163 /* 2164 * This is a bit ugly, we pass in the real seg pointer, 2165 * but the sptseg_addr is the virtual address within the 2166 * dummy seg.
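 *
 * (The per-process shm_data, including shm_softlockcnt, hangs off the
 * real seg, while the range being unlocked lies in the dummy sptseg;
 * segspt_softunlock() needs both, hence the mixed arguments.)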
2167 */ 2168 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw); 2169 return (0); 2170 2171 case F_PROT: 2172 2173 /* 2174 * This takes care of the unusual case where a user 2175 * allocates a stack in shared memory and a register 2176 * window overflow is written to that stack page before 2177 * it is otherwise modified. 2178 * 2179 * We can get away with this because ISM segments are 2180 * always rw. Other than this unusual case, there 2181 * should be no instances of protection violations. 2182 */ 2183 return (0); 2184 2185 default: 2186 #ifdef DEBUG 2187 cmn_err(CE_WARN, "segspt_shmfault default type?"); 2188 #endif 2189 return (FC_NOMAP); 2190 } 2191 } 2192 2193 /*ARGSUSED*/ 2194 static faultcode_t 2195 segspt_shmfaulta(struct seg *seg, caddr_t addr) 2196 { 2197 return (0); 2198 } 2199 2200 /*ARGSUSED*/ 2201 static int 2202 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta) 2203 { 2204 return (0); 2205 } 2206 2207 /* 2208 * duplicate the shared page tables 2209 */ 2210 int 2211 segspt_shmdup(struct seg *seg, struct seg *newseg) 2212 { 2213 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2214 struct anon_map *amp = shmd->shm_amp; 2215 struct shm_data *shmd_new; 2216 struct seg *spt_seg = shmd->shm_sptseg; 2217 struct spt_data *sptd = spt_seg->s_data; 2218 int error = 0; 2219 2220 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 2221 2222 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP); 2223 newseg->s_data = (void *)shmd_new; 2224 shmd_new->shm_sptas = shmd->shm_sptas; 2225 shmd_new->shm_amp = amp; 2226 shmd_new->shm_sptseg = shmd->shm_sptseg; 2227 newseg->s_ops = &segspt_shmops; 2228 newseg->s_szc = seg->s_szc; 2229 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc); 2230 2231 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2232 amp->refcnt++; 2233 ANON_LOCK_EXIT(&amp->a_rwlock); 2234 2235 if (sptd->spt_flags & SHM_PAGEABLE) { 2236 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP); 2237 shmd_new->shm_lckpgs = 0; 2238 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 2239 if ((error = hat_share(newseg->s_as->a_hat, 2240 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR, 2241 seg->s_size, seg->s_szc)) != 0) { 2242 kmem_free(shmd_new->shm_vpage, 2243 btopr(amp->size)); 2244 } 2245 } 2246 return (error); 2247 } else { 2248 return (hat_share(newseg->s_as->a_hat, newseg->s_base, 2249 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size, 2250 seg->s_szc)); 2251 2252 } 2253 } 2254 2255 /*ARGSUSED*/ 2256 int 2257 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot) 2258 { 2259 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2260 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2261 2262 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2263 2264 /* 2265 * ISM segment is always rw. 2266 */ 2267 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0); 2268 } 2269 2270 /* 2271 * Return an array of locked large pages, for empty slots allocate 2272 * private zero-filled anon pages.
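 *
 * Callers (segspt_dismfault() above, segspt_shmlockop() below) size
 * ppa[] for btopr(len) entries; on success every entry is a locked
 * page that the caller must eventually page_unlock().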
2273 */ 2274 static int 2275 spt_anon_getpages( 2276 struct seg *sptseg, 2277 caddr_t sptaddr, 2278 size_t len, 2279 page_t *ppa[]) 2280 { 2281 struct spt_data *sptd = sptseg->s_data; 2282 struct anon_map *amp = sptd->spt_amp; 2283 enum seg_rw rw = sptd->spt_prot; 2284 uint_t szc = sptseg->s_szc; 2285 size_t pg_sz, share_sz = page_get_pagesize(szc); 2286 pgcnt_t lp_npgs; 2287 caddr_t lp_addr, e_sptaddr; 2288 uint_t vpprot, ppa_szc = 0; 2289 struct vpage *vpage = NULL; 2290 ulong_t j, ppa_idx; 2291 int err, ierr = 0; 2292 pgcnt_t an_idx; 2293 anon_sync_obj_t cookie; 2294 int anon_locked = 0; 2295 pgcnt_t amp_pgs; 2296 2297 2298 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz)); 2299 ASSERT(len != 0); 2300 2301 pg_sz = share_sz; 2302 lp_npgs = btop(pg_sz); 2303 lp_addr = sptaddr; 2304 e_sptaddr = sptaddr + len; 2305 an_idx = seg_page(sptseg, sptaddr); 2306 ppa_idx = 0; 2307 2308 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2309 2310 amp_pgs = page_get_pagecnt(amp->a_szc); 2311 2312 /*CONSTCOND*/ 2313 while (1) { 2314 for (; lp_addr < e_sptaddr; 2315 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) { 2316 2317 /* 2318 * If we're currently locked, and we get to a new 2319 * page, unlock our current anon chunk. 2320 */ 2321 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) { 2322 anon_array_exit(&cookie); 2323 anon_locked = 0; 2324 } 2325 if (!anon_locked) { 2326 anon_array_enter(amp, an_idx, &cookie); 2327 anon_locked = 1; 2328 } 2329 ppa_szc = (uint_t)-1; 2330 ierr = anon_map_getpages(amp, an_idx, szc, sptseg, 2331 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx], 2332 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred); 2333 2334 if (ierr != 0) { 2335 if (ierr > 0) { 2336 err = FC_MAKE_ERR(ierr); 2337 goto lpgs_err; 2338 } 2339 break; 2340 } 2341 } 2342 if (lp_addr == e_sptaddr) { 2343 break; 2344 } 2345 ASSERT(lp_addr < e_sptaddr); 2346 2347 /* 2348 * ierr == -1 means we failed to allocate a large page. 2349 * so do a size down operation. 2350 * 2351 * ierr == -2 means some other process that privately shares 2352 * pages with this process has allocated a larger page and we 2353 * need to retry with larger pages. So do a size up 2354 * operation. This relies on the fact that large pages are 2355 * never partially shared i.e. if we share any constituent 2356 * page of a large page with another process we must share the 2357 * entire large page. Note this cannot happen for SOFTLOCK 2358 * case, unless current address (lpaddr) is at the beginning 2359 * of the next page size boundary because the other process 2360 * couldn't have relocated locked pages. 2361 */ 2362 ASSERT(ierr == -1 || ierr == -2); 2363 if (segvn_anypgsz) { 2364 ASSERT(ierr == -2 || szc != 0); 2365 ASSERT(ierr == -1 || szc < sptseg->s_szc); 2366 szc = (ierr == -1) ? szc - 1 : szc + 1; 2367 } else { 2368 /* 2369 * For faults and segvn_anypgsz == 0 2370 * we need to be careful not to loop forever 2371 * if existing page is found with szc other 2372 * than 0 or seg->s_szc. This could be due 2373 * to page relocations on behalf of DR or 2374 * more likely large page creation. For this 2375 * case simply re-size to existing page's szc 2376 * if returned by anon_map_getpages(). 2377 */ 2378 if (ppa_szc == (uint_t)-1) { 2379 szc = (ierr == -1) ?
0 : sptseg->s_szc; 2380 } else { 2381 ASSERT(ppa_szc <= sptseg->s_szc); 2382 ASSERT(ierr == -2 || ppa_szc < szc); 2383 ASSERT(ierr == -1 || ppa_szc > szc); 2384 szc = ppa_szc; 2385 } 2386 } 2387 pg_sz = page_get_pagesize(szc); 2388 lp_npgs = btop(pg_sz); 2389 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz)); 2390 } 2391 if (anon_locked) { 2392 anon_array_exit(&cookie); 2393 } 2394 ANON_LOCK_EXIT(&amp->a_rwlock); 2395 return (0); 2396 2397 lpgs_err: 2398 if (anon_locked) { 2399 anon_array_exit(&cookie); 2400 } 2401 ANON_LOCK_EXIT(&amp->a_rwlock); 2402 for (j = 0; j < ppa_idx; j++) 2403 page_unlock(ppa[j]); 2404 return (err); 2405 } 2406 2407 /* 2408 * count the number of bytes in a set of spt pages that are currently not 2409 * locked 2410 */ 2411 static rctl_qty_t 2412 spt_unlockedbytes(pgcnt_t npages, page_t **ppa) 2413 { 2414 ulong_t i; 2415 rctl_qty_t unlocked = 0; 2416 2417 for (i = 0; i < npages; i++) { 2418 if (ppa[i]->p_lckcnt == 0) 2419 unlocked += PAGESIZE; 2420 } 2421 return (unlocked); 2422 } 2423 2424 extern u_longlong_t randtick(void); 2425 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */ 2426 #define NLCK (NCPU_P2) 2427 /* Random number with a range [0, n-1], n must be power of two */ 2428 #define RAND_P2(n) \ 2429 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1)) 2430 2431 int 2432 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, 2433 page_t **ppa, ulong_t *lockmap, size_t pos, 2434 rctl_qty_t *locked) 2435 { 2436 struct shm_data *shmd = seg->s_data; 2437 struct spt_data *sptd = shmd->shm_sptseg->s_data; 2438 ulong_t i; 2439 int kernel; 2440 pgcnt_t nlck = 0; 2441 int rv = 0; 2442 int use_reserved = 1; 2443 2444 /* return the number of bytes actually locked */ 2445 *locked = 0; 2446 2447 /* 2448 * To avoid contention on freemem_lock, availrmem and pages_locked 2449 * global counters are updated only every nlck locked pages instead of 2450 * every time. Reserve nlck locks up front and deduct from this 2451 * reservation for each page that requires a lock. When the reservation 2452 * is consumed, reserve again. nlck is randomized, so the competing 2453 * threads do not fall into a cyclic lock contention pattern. When 2454 * memory is low, the lock ahead is disabled, and instead page_pp_lock() 2455 * is used to lock pages. 2456 */ 2457 for (i = 0; i < npages; anon_index++, pos++, i++) { 2458 if (nlck == 0 && use_reserved == 1) { 2459 nlck = NLCK + RAND_P2(NLCK); 2460 /* if fewer loops left, decrease nlck */ 2461 nlck = MIN(nlck, npages - i); 2462 /* 2463 * Reserve nlck locks up front and deduct from this 2464 * reservation for each page that requires a lock. When 2465 * the reservation is consumed, reserve again.
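 *
 * Illustrative numbers (assuming, say, NCPU_P2 == 64): nlck is drawn
 * from [NLCK, 2 * NLCK - 1], i.e. [64, 127], so freemem_lock is taken
 * roughly once per hundred pages rather than once per page, and the
 * per-thread randomization keeps competing threads' reservation
 * points from lining up.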
2466 */ 2467 mutex_enter(&freemem_lock); 2468 if ((availrmem - nlck) < pages_pp_maximum) { 2469 /* Do not do advance memory reserves */ 2470 use_reserved = 0; 2471 } else { 2472 availrmem -= nlck; 2473 pages_locked += nlck; 2474 } 2475 mutex_exit(&freemem_lock); 2476 } 2477 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) { 2478 if (sptd->spt_ppa_lckcnt[anon_index] < 2479 (ushort_t)DISM_LOCK_MAX) { 2480 if (++sptd->spt_ppa_lckcnt[anon_index] == 2481 (ushort_t)DISM_LOCK_MAX) { 2482 cmn_err(CE_WARN, 2483 "DISM page lock limit " 2484 "reached on DISM offset 0x%lx\n", 2485 anon_index << PAGESHIFT); 2486 } 2487 kernel = (sptd->spt_ppa && 2488 sptd->spt_ppa[anon_index]); 2489 if (!page_pp_lock(ppa[i], 0, kernel || 2490 use_reserved)) { 2491 sptd->spt_ppa_lckcnt[anon_index]--; 2492 rv = EAGAIN; 2493 break; 2494 } 2495 /* if this is a newly locked page, count it */ 2496 if (ppa[i]->p_lckcnt == 1) { 2497 if (kernel == 0 && use_reserved == 1) 2498 nlck--; 2499 *locked += PAGESIZE; 2500 } 2501 shmd->shm_lckpgs++; 2502 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED; 2503 if (lockmap != NULL) 2504 BT_SET(lockmap, pos); 2505 } 2506 } 2507 } 2508 /* Return unused lock reservation */ 2509 if (nlck != 0 && use_reserved == 1) { 2510 mutex_enter(&freemem_lock); 2511 availrmem += nlck; 2512 pages_locked -= nlck; 2513 mutex_exit(&freemem_lock); 2514 } 2515 2516 return (rv); 2517 } 2518 2519 int 2520 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages, 2521 rctl_qty_t *unlocked) 2522 { 2523 struct shm_data *shmd = seg->s_data; 2524 struct spt_data *sptd = shmd->shm_sptseg->s_data; 2525 struct anon_map *amp = sptd->spt_amp; 2526 struct anon *ap; 2527 struct vnode *vp; 2528 u_offset_t off; 2529 struct page *pp; 2530 int kernel; 2531 anon_sync_obj_t cookie; 2532 ulong_t i; 2533 pgcnt_t nlck = 0; 2534 pgcnt_t nlck_limit = NLCK; 2535 2536 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2537 for (i = 0; i < npages; i++, anon_index++) { 2538 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) { 2539 anon_array_enter(amp, anon_index, &cookie); 2540 ap = anon_get_ptr(amp->ahp, anon_index); 2541 ASSERT(ap); 2542 2543 swap_xlate(ap, &vp, &off); 2544 anon_array_exit(&cookie); 2545 pp = page_lookup(vp, off, SE_SHARED); 2546 ASSERT(pp); 2547 /* 2548 * availrmem is decremented only for pages which are not 2549 * in seg pcache; for pages in seg pcache availrmem was 2550 * decremented in _dismpagelock() 2551 */ 2552 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]); 2553 ASSERT(pp->p_lckcnt > 0); 2554 2555 /* 2556 * Unlock the page but do not change availrmem; we do it 2557 * ourselves every nlck loops. 2558 */ 2559 page_pp_unlock(pp, 0, 1); 2560 if (pp->p_lckcnt == 0) { 2561 if (kernel == 0) 2562 nlck++; 2563 *unlocked += PAGESIZE; 2564 } 2565 page_unlock(pp); 2566 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED; 2567 sptd->spt_ppa_lckcnt[anon_index]--; 2568 shmd->shm_lckpgs--; 2569 } 2570 2571 /* 2572 * To reduce freemem_lock contention, do not update availrmem 2573 * until at least NLCK pages have been unlocked. 2574 * 1. No need to update if nlck is zero 2575 * 2.
Always update on the last iteration 2576 */ 2577 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) { 2578 mutex_enter(&freemem_lock); 2579 availrmem += nlck; 2580 pages_locked -= nlck; 2581 mutex_exit(&freemem_lock); 2582 nlck = 0; 2583 nlck_limit = NLCK + RAND_P2(NLCK); 2584 } 2585 } 2586 ANON_LOCK_EXIT(&amp->a_rwlock); 2587 2588 return (0); 2589 } 2590 2591 /*ARGSUSED*/ 2592 static int 2593 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, 2594 int attr, int op, ulong_t *lockmap, size_t pos) 2595 { 2596 struct shm_data *shmd = seg->s_data; 2597 struct seg *sptseg = shmd->shm_sptseg; 2598 struct spt_data *sptd = sptseg->s_data; 2599 struct kshmid *sp = sptd->spt_amp->a_sp; 2600 pgcnt_t npages, a_npages; 2601 page_t **ppa; 2602 pgcnt_t an_idx, a_an_idx, ppa_idx; 2603 caddr_t spt_addr, a_addr; /* spt and aligned address */ 2604 size_t a_len; /* aligned len */ 2605 size_t share_sz; 2606 ulong_t i; 2607 int sts = 0; 2608 rctl_qty_t unlocked = 0; 2609 rctl_qty_t locked = 0; 2610 struct proc *p = curproc; 2611 kproject_t *proj; 2612 2613 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2614 ASSERT(sp != NULL); 2615 2616 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 2617 return (0); 2618 } 2619 2620 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2621 an_idx = seg_page(seg, addr); 2622 npages = btopr(len); 2623 2624 if (an_idx + npages > btopr(shmd->shm_amp->size)) { 2625 return (ENOMEM); 2626 } 2627 2628 /* 2629 * A shm's project never changes, so no lock needed. 2630 * The shm has a hold on the project, so it will not go away. 2631 * Since we have a mapping to shm within this zone, we know 2632 * that the zone will not go away. 2633 */ 2634 proj = sp->shm_perm.ipc_proj; 2635 2636 if (op == MC_LOCK) { 2637 2638 /* 2639 * Need to align addr and size request if they are not 2640 * aligned so we can always allocate large page(s); however, 2641 * we only lock what was requested in the initial request. 2642 */ 2643 share_sz = page_get_pagesize(sptseg->s_szc); 2644 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz); 2645 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)), 2646 share_sz); 2647 a_npages = btop(a_len); 2648 a_an_idx = seg_page(seg, a_addr); 2649 spt_addr = sptseg->s_base + ptob(a_an_idx); 2650 ppa_idx = an_idx - a_an_idx; 2651 2652 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages), 2653 KM_NOSLEEP)) == NULL) { 2654 return (ENOMEM); 2655 } 2656 2657 /* 2658 * Don't cache any new pages for IO and 2659 * flush any cached pages.
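 *
 * (Setting DISM_PPA_CHANGED below records that the cached spt_ppa page
 * list is out of date; segspt_shmadvise() further down waits on the
 * same flag after purging the pcache.)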
2660 */ 2661 mutex_enter(&sptd->spt_lock); 2662 if (sptd->spt_ppa != NULL) 2663 sptd->spt_flags |= DISM_PPA_CHANGED; 2664 2665 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa); 2666 if (sts != 0) { 2667 mutex_exit(&sptd->spt_lock); 2668 kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); 2669 return (sts); 2670 } 2671 2672 mutex_enter(&sp->shm_mlock); 2673 /* enforce locked memory rctl */ 2674 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]); 2675 2676 mutex_enter(&p->p_lock); 2677 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) { 2678 mutex_exit(&p->p_lock); 2679 sts = EAGAIN; 2680 } else { 2681 mutex_exit(&p->p_lock); 2682 sts = spt_lockpages(seg, an_idx, npages, 2683 &ppa[ppa_idx], lockmap, pos, &locked); 2684 2685 /* 2686 * correct locked count if not all pages could be 2687 * locked 2688 */ 2689 if ((unlocked - locked) > 0) { 2690 rctl_decr_locked_mem(NULL, proj, 2691 (unlocked - locked), 0); 2692 } 2693 } 2694 /* 2695 * unlock pages 2696 */ 2697 for (i = 0; i < a_npages; i++) 2698 page_unlock(ppa[i]); 2699 if (sptd->spt_ppa != NULL) 2700 sptd->spt_flags |= DISM_PPA_CHANGED; 2701 mutex_exit(&sp->shm_mlock); 2702 mutex_exit(&sptd->spt_lock); 2703 2704 kmem_free(ppa, ((sizeof (page_t *)) * a_npages)); 2705 2706 } else if (op == MC_UNLOCK) { /* unlock */ 2707 page_t **ppa; 2708 2709 mutex_enter(&sptd->spt_lock); 2710 if (shmd->shm_lckpgs == 0) { 2711 mutex_exit(&sptd->spt_lock); 2712 return (0); 2713 } 2714 /* 2715 * Don't cache new IO pages. 2716 */ 2717 if (sptd->spt_ppa != NULL) 2718 sptd->spt_flags |= DISM_PPA_CHANGED; 2719 2720 mutex_enter(&sp->shm_mlock); 2721 sts = spt_unlockpages(seg, an_idx, npages, &unlocked); 2722 if ((ppa = sptd->spt_ppa) != NULL) 2723 sptd->spt_flags |= DISM_PPA_CHANGED; 2724 mutex_exit(&sptd->spt_lock); 2725 2726 rctl_decr_locked_mem(NULL, proj, unlocked, 0); 2727 mutex_exit(&sp->shm_mlock); 2728 2729 if (ppa != NULL) 2730 seg_ppurge_wiredpp(ppa); 2731 } 2732 return (sts); 2733 } 2734 2735 /*ARGSUSED*/ 2736 int 2737 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 2738 { 2739 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2740 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2741 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1; 2742 2743 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2744 2745 /* 2746 * ISM segment is always rw. 2747 */ 2748 while (--pgno >= 0) 2749 *protv++ = sptd->spt_prot; 2750 return (0); 2751 } 2752 2753 /*ARGSUSED*/ 2754 u_offset_t 2755 segspt_shmgetoffset(struct seg *seg, caddr_t addr) 2756 { 2757 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2758 2759 /* Offset does not matter in ISM memory */ 2760 2761 return ((u_offset_t)0); 2762 } 2763 2764 /* ARGSUSED */ 2765 int 2766 segspt_shmgettype(struct seg *seg, caddr_t addr) 2767 { 2768 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2769 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2770 2771 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2772 2773 /* 2774 * The shared memory mapping is always MAP_SHARED, SWAP is only 2775 * reserved for DISM 2776 */ 2777 return (MAP_SHARED | 2778 ((sptd->spt_flags & SHM_PAGEABLE) ? 
0 : MAP_NORESERVE)); 2779 } 2780 2781 /*ARGSUSED*/ 2782 int 2783 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 2784 { 2785 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2786 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2787 2788 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2789 2790 *vpp = sptd->spt_vp; 2791 return (0); 2792 } 2793 2794 /* 2795 * We need to wait for pending IO to complete to a DISM segment in order for 2796 * pages to get kicked out of the seg_pcache. 120 seconds should be more 2797 * than enough time to wait. 2798 */ 2799 static clock_t spt_pcache_wait = 120; 2800 2801 /*ARGSUSED*/ 2802 static int 2803 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 2804 { 2805 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2806 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data; 2807 struct anon_map *amp; 2808 pgcnt_t pg_idx; 2809 ushort_t gen; 2810 clock_t end_lbolt; 2811 int writer; 2812 page_t **ppa; 2813 2814 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2815 2816 if (behav == MADV_FREE) { 2817 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) 2818 return (0); 2819 2820 amp = sptd->spt_amp; 2821 pg_idx = seg_page(seg, addr); 2822 2823 mutex_enter(&sptd->spt_lock); 2824 if ((ppa = sptd->spt_ppa) == NULL) { 2825 mutex_exit(&sptd->spt_lock); 2826 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2827 anon_disclaim(amp, pg_idx, len); 2828 ANON_LOCK_EXIT(&amp->a_rwlock); 2829 return (0); 2830 } 2831 2832 sptd->spt_flags |= DISM_PPA_CHANGED; 2833 gen = sptd->spt_gen; 2834 2835 mutex_exit(&sptd->spt_lock); 2836 2837 /* 2838 * Purge all DISM cached pages 2839 */ 2840 seg_ppurge_wiredpp(ppa); 2841 2842 /* 2843 * Drop the AS_LOCK so that other threads can grab it 2844 * in the as_pageunlock path and hopefully get the segment 2845 * kicked out of the seg_pcache. We bump the shm_softlockcnt 2846 * to keep this segment resident. 2847 */ 2848 writer = AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock); 2849 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 2850 AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock); 2851 2852 mutex_enter(&sptd->spt_lock); 2853 2854 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait); 2855 2856 /* 2857 * Try to wait for pages to get kicked out of the seg_pcache. 2858 */ 2859 while (sptd->spt_gen == gen && 2860 (sptd->spt_flags & DISM_PPA_CHANGED) && 2861 ddi_get_lbolt() < end_lbolt) { 2862 if (!cv_timedwait_sig(&sptd->spt_cv, 2863 &sptd->spt_lock, end_lbolt)) { 2864 break; 2865 } 2866 } 2867 2868 mutex_exit(&sptd->spt_lock); 2869 2870 /* Regrab the AS_LOCK and release our hold on the segment */ 2871 AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock, 2872 writer ?
RW_WRITER : RW_READER); 2873 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt))); 2874 if (shmd->shm_softlockcnt <= 0) { 2875 if (AS_ISUNMAPWAIT(seg->s_as)) { 2876 mutex_enter(&seg->s_as->a_contents); 2877 if (AS_ISUNMAPWAIT(seg->s_as)) { 2878 AS_CLRUNMAPWAIT(seg->s_as); 2879 cv_broadcast(&seg->s_as->a_cv); 2880 } 2881 mutex_exit(&seg->s_as->a_contents); 2882 } 2883 } 2884 2885 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2886 anon_disclaim(amp, pg_idx, len); 2887 ANON_LOCK_EXIT(&amp->a_rwlock); 2888 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP || 2889 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) { 2890 int already_set; 2891 ulong_t anon_index; 2892 lgrp_mem_policy_t policy; 2893 caddr_t shm_addr; 2894 size_t share_size; 2895 size_t size; 2896 struct seg *sptseg = shmd->shm_sptseg; 2897 caddr_t sptseg_addr; 2898 2899 /* 2900 * Align address and length to page size of underlying segment 2901 */ 2902 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc); 2903 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size); 2904 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), 2905 share_size); 2906 2907 amp = shmd->shm_amp; 2908 anon_index = seg_page(seg, shm_addr); 2909 2910 /* 2911 * And now we may have to adjust size downward if we have 2912 * exceeded the realsize of the segment or initial anon 2913 * allocations. 2914 */ 2915 sptseg_addr = sptseg->s_base + ptob(anon_index); 2916 if ((sptseg_addr + size) > 2917 (sptseg->s_base + sptd->spt_realsize)) 2918 size = (sptseg->s_base + sptd->spt_realsize) - 2919 sptseg_addr; 2920 2921 /* 2922 * Set memory allocation policy for this segment 2923 */ 2924 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED); 2925 already_set = lgrp_shm_policy_set(policy, amp, anon_index, 2926 NULL, 0, len); 2927 2928 /* 2929 * If random memory allocation policy set already, 2930 * don't bother reapplying it.
2931 */ 2932 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 2933 return (0); 2934 2935 /* 2936 * Mark any existing pages in the given range for 2937 * migration, flushing the I/O page cache, and using 2938 * underlying segment to calculate anon index and get 2939 * anonmap and vnode pointer from 2940 */ 2941 if (shmd->shm_softlockcnt > 0) 2942 segspt_purge(seg); 2943 2944 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0); 2945 } 2946 2947 return (0); 2948 } 2949 2950 /* 2951 * get a memory ID for an addr in a given segment 2952 */ 2953 static int 2954 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 2955 { 2956 struct shm_data *shmd = (struct shm_data *)seg->s_data; 2957 struct anon *ap; 2958 size_t anon_index; 2959 struct anon_map *amp = shmd->shm_amp; 2960 struct spt_data *sptd = shmd->shm_sptseg->s_data; 2961 struct seg *sptseg = shmd->shm_sptseg; 2962 anon_sync_obj_t cookie; 2963 2964 anon_index = seg_page(seg, addr); 2965 2966 if (addr > (seg->s_base + sptd->spt_realsize)) { 2967 return (EFAULT); 2968 } 2969 2970 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); 2971 anon_array_enter(amp, anon_index, &cookie); 2972 ap = anon_get_ptr(amp->ahp, anon_index); 2973 if (ap == NULL) { 2974 struct page *pp; 2975 caddr_t spt_addr = sptseg->s_base + ptob(anon_index); 2976 2977 pp = anon_zero(sptseg, spt_addr, &ap, kcred); 2978 if (pp == NULL) { 2979 anon_array_exit(&cookie); 2980 ANON_LOCK_EXIT(&amp->a_rwlock); 2981 return (ENOMEM); 2982 } 2983 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2984 page_unlock(pp); 2985 } 2986 anon_array_exit(&cookie); 2987 ANON_LOCK_EXIT(&amp->a_rwlock); 2988 memidp->val[0] = (uintptr_t)ap; 2989 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 2990 return (0); 2991 } 2992 2993 /* 2994 * Get memory allocation policy info for specified address in given segment 2995 */ 2996 static lgrp_mem_policy_info_t * 2997 segspt_shmgetpolicy(struct seg *seg, caddr_t addr) 2998 { 2999 struct anon_map *amp; 3000 ulong_t anon_index; 3001 lgrp_mem_policy_info_t *policy_info; 3002 struct shm_data *shm_data; 3003 3004 ASSERT(seg != NULL); 3005 3006 /* 3007 * Get anon_map from segshm 3008 * 3009 * Assume that no lock needs to be held on anon_map, since 3010 * it should be protected by its reference count which must be 3011 * nonzero for an existing segment 3012 * Need to grab readers lock on policy tree though 3013 */ 3014 shm_data = (struct shm_data *)seg->s_data; 3015 if (shm_data == NULL) 3016 return (NULL); 3017 amp = shm_data->shm_amp; 3018 ASSERT(amp->refcnt != 0); 3019 3020 /* 3021 * Get policy info 3022 * 3023 * Assume starting anon index of 0 3024 */ 3025 anon_index = seg_page(seg, addr); 3026 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); 3027 3028 return (policy_info); 3029 }
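
/*
 * For reference, an illustrative user-level sequence that exercises this
 * segment driver (a simplified sketch, no error handling; attaching with
 * SHM_SHARE_MMU requests ISM, attaching with SHM_PAGEABLE requests DISM):
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *
 *	int id = shmget(IPC_PRIVATE, 64 * 1024 * 1024, IPC_CREAT | 0600);
 *	void *p = shmat(id, NULL, SHM_SHARE_MMU);
 *	... use the mapping ...
 *	(void) shmdt(p);
 *	(void) shmctl(id, IPC_RMID, NULL);
 */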