/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/param.h>
#include <sys/user.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/tuneable.h>
#include <vm/hat.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <sys/buf.h>
#include <sys/swap.h>
#include <sys/atomic.h>
#include <vm/seg_spt.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/shm.h>
#include <sys/shm_impl.h>
#include <sys/lgrp.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>
#include <sys/project.h>
#include <sys/tnf_probe.h>
#include <sys/zone.h>

#define	SEGSPTADDR	(caddr_t)0x0

/*
 * # pages used for spt
 */
size_t	spt_used;

/*
 * segspt_minfree is the memory left for the system after ISM
 * has locked its pages; it is set to 5% of availrmem in
 * sptcreate() when ISM is created. ISM should not use more
 * than ~90% of availrmem; if it does, the performance of
 * the system may decrease. Machines with large memories may
 * be able to use more memory for ISM, so we set the default
 * segspt_minfree to 5% (which gives ISM a maximum of 95% of
 * availrmem). If somebody wants even more memory for ISM
 * (risking hanging the system) they can patch segspt_minfree
 * to a smaller number.
 */
pgcnt_t segspt_minfree = 0;
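/*
 * Worked example (illustrative, not code from this file): with
 * availrmem at 1,000,000 pages, sptcreate() sets segspt_minfree to
 * availrmem / 20 = 50,000 pages, holding 5% back from ISM locking.
 * The variable has traditionally been tuned by patching it, e.g.
 * from /etc/system (assumed syntax):
 *
 *	set segspt_minfree = 0x10000
 */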
static int segspt_create(struct seg *seg, caddr_t argsp);
static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_free(struct seg *seg);
static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);

static const struct seg_ops segspt_ops = {
	.unmap		= segspt_unmap,
	.free		= segspt_free,
	.getpolicy	= segspt_getpolicy,
};

static int segspt_shmdup(struct seg *seg, struct seg *newseg);
static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_shmfree(struct seg *seg);
static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
    caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
    register size_t len, register uint_t prot);
static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
    uint_t prot);
static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
    register char *vec);
static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
    int attr, uint_t flags);
static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos);
static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
    uint_t *protv);
static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
static int segspt_shmgettype(struct seg *seg, caddr_t addr);
static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
    uint_t behav);
static void segspt_shmdump(struct seg *seg);
static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
    struct page ***, enum lock_type, enum seg_rw);
static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);

const struct seg_ops segspt_shmops = {
	.dup		= segspt_shmdup,
	.unmap		= segspt_shmunmap,
	.free		= segspt_shmfree,
	.fault		= segspt_shmfault,
	.faulta		= segspt_shmfaulta,
	.setprot	= segspt_shmsetprot,
	.checkprot	= segspt_shmcheckprot,
	.kluster	= segspt_shmkluster,
	.sync		= segspt_shmsync,
	.incore		= segspt_shmincore,
	.lockop		= segspt_shmlockop,
	.getprot	= segspt_shmgetprot,
	.getoffset	= segspt_shmgetoffset,
	.gettype	= segspt_shmgettype,
	.getvp		= segspt_shmgetvp,
	.advise		= segspt_shmadvise,
	.dump		= segspt_shmdump,
	.pagelock	= segspt_shmpagelock,
	.getmemid	= segspt_shmgetmemid,
	.getpolicy	= segspt_shmgetpolicy,
};

static void segspt_purge(struct seg *seg);
static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);
static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
    page_t **ppa);
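/*
 * Two seg_ops vectors exist because ISM/DISM uses two kinds of
 * segments.  segspt_ops backs the single "dummy" spt segment, which
 * lives in a dummy address space of its own and owns the shared
 * pages; segspt_shmops backs the per-process segments created at
 * shmat() time, which share the dummy segment's page tables and
 * forward most operations to it through their shm_data -> spt_data
 * linkage.
 */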
/*ARGSUSED*/
int
sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
    uint_t prot, uint_t flags, uint_t share_szc)
{
	int	err;
	struct as	*newas;
	struct segspt_crargs sptcargs;

#ifdef DEBUG
	TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
	    tnf_ulong, size, size);
#endif
	if (segspt_minfree == 0)	/* leave min 5% of availrmem */
		segspt_minfree = availrmem/20;	/* for the system */

	if (!hat_supported(HAT_SHARED_PT, (void *)0))
		return (EINVAL);

	/*
	 * get a new as for this shared memory segment
	 */
	newas = as_alloc();
	newas->a_proc = NULL;
	sptcargs.amp = amp;
	sptcargs.prot = prot;
	sptcargs.flags = flags;
	sptcargs.szc = share_szc;
	/*
	 * create a shared page table (spt) segment
	 */

	if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
		as_free(newas);
		return (err);
	}
	*sptseg = sptcargs.seg_spt;
	return (0);
}

void
sptdestroy(struct as *as, struct anon_map *amp)
{

#ifdef DEBUG
	TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
#endif
	(void) as_unmap(as, SEGSPTADDR, amp->size);
	as_free(as);
}
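/*
 * Lifecycle sketch (hypothetical caller, not code from this file):
 * the sysv shm code is assumed to pair the two entry points above
 * roughly as follows.
 *
 *	struct seg *sptseg;
 *
 *	err = sptcreate(amp->size, &sptseg, amp, prot, flags, szc);
 *	if (err == 0) {
 *		...	attach processes via segspt_shmattach() ...
 *		sptdestroy(sptseg->s_as, amp);	unmaps, frees the as
 *	}
 */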
/*
 * called from seg_free().
 * free (i.e., unlock, unmap, return to free list)
 * all the pages in the given seg.
 */
void
segspt_free(struct seg *seg)
{
	struct spt_data *sptd = (struct spt_data *)seg->s_data;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	if (sptd != NULL) {
		if (sptd->spt_realsize)
			segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);

		if (sptd->spt_ppa_lckcnt)
			kmem_free(sptd->spt_ppa_lckcnt,
			    sizeof (*sptd->spt_ppa_lckcnt)
			    * btopr(sptd->spt_amp->size));
		kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
		cv_destroy(&sptd->spt_cv);
		mutex_destroy(&sptd->spt_lock);
		kmem_free(sptd, sizeof (*sptd));
	}
}

/*ARGSUSED*/
static int
segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
    uint_t flags)
{
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	return (0);
}

/*ARGSUSED*/
static size_t
segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
{
	caddr_t	eo_seg;
	pgcnt_t	npages;
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg;
	struct spt_data *sptd;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
#ifdef lint
	seg = seg;
#endif
	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		eo_seg = addr + len;
		while (addr < eo_seg) {
			/* page exists, and it's locked. */
			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
			    SEG_PAGE_ANON;
			addr += PAGESIZE;
		}
		return (len);
	} else {
		struct anon_map *amp = shmd->shm_amp;
		struct anon	*ap;
		page_t		*pp;
		pgcnt_t		anon_index;
		struct vnode	*vp;
		u_offset_t	off;
		ulong_t		i;
		int		ret;
		anon_sync_obj_t	cookie;

		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
		anon_index = seg_page(seg, addr);
		npages = btopr(len);
		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
			return (EINVAL);
		}
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		for (i = 0; i < npages; i++, anon_index++) {
			ret = 0;
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap != NULL) {
				swap_xlate(ap, &vp, &off);
				anon_array_exit(&cookie);
				pp = page_lookup_nowait(vp, off, SE_SHARED);
				if (pp != NULL) {
					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
					page_unlock(pp);
				}
			} else {
				anon_array_exit(&cookie);
			}
			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
				ret |= SEG_PAGE_LOCKED;
			}
			*vec++ = (char)ret;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return (len);
	}
}
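/*
 * The incore vector filled in above is one byte per page.  For ISM
 * (SHM_PAGEABLE clear) every page is resident and locked by
 * construction, so each byte is simply SEG_PAGE_INCORE |
 * SEG_PAGE_LOCKED | SEG_PAGE_ANON.  For DISM, residency is derived
 * from the anon map (an allocated anon slot whose page can be looked
 * up) while the locked bit comes from DISM_PG_LOCKED in shm_vpage[],
 * so the two bits are reported independently of each other.
 */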
static int
segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
	size_t share_size;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * seg.s_size may have been rounded up to the largest page size
	 * in shmat().
	 * XXX This should be cleaned up. sptdestroy should take a length
	 * argument which should be the same as sptcreate. Then
	 * this rounding would not be needed (or is done in shm.c)
	 * Only the check for full segment will be needed.
	 *
	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
	 * to be useful at all.
	 */
	share_size = page_get_pagesize(seg->s_szc);
	ssize = P2ROUNDUP(ssize, share_size);

	if (raddr == seg->s_base && ssize == seg->s_size) {
		seg_free(seg);
		return (0);
	} else
		return (EINVAL);
}

int
segspt_create(struct seg *seg, caddr_t argsp)
{
	int		err;
	caddr_t		addr = seg->s_base;
	struct spt_data *sptd;
	struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
	struct anon_map *amp = sptcargs->amp;
	struct kshmid	*sp = amp->a_sp;
	struct cred	*cred = CRED();
	ulong_t		i, j, anon_index = 0;
	pgcnt_t		npages = btopr(amp->size);
	struct vnode	*vp;
	page_t		**ppa;
	uint_t		hat_flags;
	size_t		pgsz;
	pgcnt_t		pgcnt;
	caddr_t		a;
	pgcnt_t		pidx;
	size_t		sz;
	proc_t		*procp = curproc;
	rctl_qty_t	lockedbytes = 0;
	kproject_t	*proj;

	/*
	 * We are holding the a_lock on the underlying dummy as,
	 * so we can make calls to the HAT layer.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(sp != NULL);

#ifdef DEBUG
	TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
	    tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
#endif
	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if (err = anon_swap_adjust(npages))
			return (err);
	}
	err = ENOMEM;

	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
		goto out1;

	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
		    KM_NOSLEEP)) == NULL)
			goto out2;
	}

	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);

	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
		goto out3;

	seg->s_ops = &segspt_ops;
	sptd->spt_vp = vp;
	sptd->spt_amp = amp;
	sptd->spt_prot = sptcargs->prot;
	sptd->spt_flags = sptcargs->flags;
	seg->s_data = (caddr_t)sptd;
	sptd->spt_ppa = NULL;
	sptd->spt_ppa_lckcnt = NULL;
	seg->s_szc = sptcargs->szc;
	cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
	sptd->spt_gen = 0;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	if (seg->s_szc > amp->a_szc) {
		amp->a_szc = seg->s_szc;
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);

	/*
	 * Set policy to affect initial allocation of pages in
	 * anon_map_createpages()
	 */
	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
	    NULL, 0, ptob(npages));

	if (sptcargs->flags & SHM_PAGEABLE) {
		size_t  share_sz;
		pgcnt_t new_npgs, more_pgs;
		struct anon_hdr *nahp;
		zone_t *zone;

		share_sz = page_get_pagesize(seg->s_szc);
		if (!IS_P2ALIGNED(amp->size, share_sz)) {
			/*
			 * We round the size of the anon array up to a
			 * share_sz (e.g. 4M) boundary because we always
			 * create share_sz worth of pages when locking or
			 * faulting, and this way we don't have to check
			 * all the corner cases, e.g. whether there is
			 * enough space left to allocate a full large page.
			 */
			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
			more_pgs = new_npgs - npages;

			/*
			 * The zone will never be NULL, as a fully created
			 * shm always has an owning zone.
			 */
			zone = sp->shm_perm.ipc_zone_ref.zref_zone;
			ASSERT(zone != NULL);
			if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
				err = ENOMEM;
				goto out4;
			}

			nahp = anon_create(new_npgs, ANON_SLEEP);
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
			    ANON_SLEEP);
			anon_release(amp->ahp, npages);
			amp->ahp = nahp;
			ASSERT(amp->swresv == ptob(npages));
			amp->swresv = amp->size = ptob(new_npgs);
			ANON_LOCK_EXIT(&amp->a_rwlock);
			npages = new_npgs;
		}

		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
		sptd->spt_pcachecnt = 0;
		sptd->spt_realsize = ptob(npages);
		sptcargs->seg_spt = seg;
		return (0);
	}

	/*
	 * get array of pages for each anon slot in amp
	 */
	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
	    seg, addr, S_CREATE, cred)) != 0)
		goto out4;

	mutex_enter(&sp->shm_mlock);

	/* May be partially locked, so, count bytes to charge for locking */
	for (i = 0; i < npages; i++)
		if (ppa[i]->p_lckcnt == 0)
			lockedbytes += PAGESIZE;

	proj = sp->shm_perm.ipc_proj;

	if (lockedbytes > 0) {
		mutex_enter(&procp->p_lock);
		if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
			mutex_exit(&procp->p_lock);
			mutex_exit(&sp->shm_mlock);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			err = ENOMEM;
			goto out4;
		}
		mutex_exit(&procp->p_lock);
	}

	/*
	 * addr is initial address corresponding to the first page on ppa list
	 */
	for (i = 0; i < npages; i++) {
		/* attempt to lock all pages */
		if (page_pp_lock(ppa[i], 0, 1) == 0) {
			/*
			 * if unable to lock any page, unlock all
			 * of them and return error
			 */
			for (j = 0; j < i; j++)
				page_pp_unlock(ppa[j], 0, 1);
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
			rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
			mutex_exit(&sp->shm_mlock);
			err = ENOMEM;
			goto out4;
		}
	}
	mutex_exit(&sp->shm_mlock);

	/*
	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
	 * for the entire life of the segment. For example platforms
	 * that do not support Dynamic Reconfiguration.
	 */
	hat_flags = HAT_LOAD_SHARE;
	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
		hat_flags |= HAT_LOAD_LOCK;

	/*
	 * Load translations one large page at a time
	 * to make sure we don't create mappings bigger than
	 * segment's size code in case underlying pages
	 * are shared with segvn's segment that uses bigger
	 * size code than we do.
	 */
	pgsz = page_get_pagesize(seg->s_szc);
	pgcnt = page_get_pagecnt(seg->s_szc);
	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
		sz = MIN(pgsz, ptob(npages - pidx));
		hat_memload_array(seg->s_as->a_hat, a, sz,
		    &ppa[pidx], sptd->spt_prot, hat_flags);
	}
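	/*
	 * Worked example of the chunking above (illustrative numbers):
	 * with 8K PAGESIZE and a size code selecting 4M large pages,
	 * pgsz is 4M and pgcnt is 512, so each hat_memload_array() call
	 * maps at most 512 constituent pages.  The MIN() handles the
	 * trailing chunk: for npages == 1000 the second call loads only
	 * the remaining 488 pages rather than a full 4M.
	 */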
	/*
	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
	 * we will leave the pages locked SE_SHARED for the life
	 * of the ISM segment. This will prevent any calls to
	 * hat_pageunload() on this ISM segment for those platforms.
	 */
	if (!(hat_flags & HAT_LOAD_LOCK)) {
		/*
		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
		 * we no longer need to hold the SE_SHARED lock on the pages,
		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
		 * SE_SHARED lock on the pages as necessary.
		 */
		for (i = 0; i < npages; i++)
			page_unlock(ppa[i]);
	}
	sptd->spt_pcachecnt = 0;
	kmem_free(ppa, ((sizeof (page_t *)) * npages));
	sptd->spt_realsize = ptob(npages);
	atomic_add_long(&spt_used, npages);
	sptcargs->seg_spt = seg;
	return (0);

out4:
	seg->s_data = NULL;
	kmem_free(vp, sizeof (*vp));
	cv_destroy(&sptd->spt_cv);
out3:
	mutex_destroy(&sptd->spt_lock);
	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
		kmem_free(ppa, (sizeof (*ppa) * npages));
out2:
	kmem_free(sptd, sizeof (*sptd));
out1:
	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
		anon_swap_restore(npages);
	return (err);
}

/*ARGSUSED*/
void
segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
{
	struct page	*pp;
	struct spt_data *sptd = (struct spt_data *)seg->s_data;
	pgcnt_t		npages;
	ulong_t		anon_idx;
	struct anon_map *amp;
	struct anon	*ap;
	struct vnode	*vp;
	u_offset_t	off;
	uint_t		hat_flags;
	int		root = 0;
	pgcnt_t		pgs, curnpgs = 0;
	page_t		*rootpp;
	rctl_qty_t	unlocked_bytes = 0;
	kproject_t	*proj;
	kshmid_t	*sp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	len = P2ROUNDUP(len, PAGESIZE);

	npages = btop(len);

	hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
	if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
	    (sptd->spt_flags & SHM_PAGEABLE)) {
		hat_flags = HAT_UNLOAD_UNMAP;
	}

	hat_unload(seg->s_as->a_hat, addr, len, hat_flags);

	amp = sptd->spt_amp;
	if (sptd->spt_flags & SHM_PAGEABLE)
		npages = btop(amp->size);

	ASSERT(amp != NULL);

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		sp = amp->a_sp;
		proj = sp->shm_perm.ipc_proj;
		mutex_enter(&sp->shm_mlock);
	}
	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
			if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
				panic("segspt_free_pages: null app");
				/*NOTREACHED*/
			}
		} else {
			if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
			    == NULL)
				continue;
		}
		ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
		swap_xlate(ap, &vp, &off);

		/*
		 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
		 * the pages won't be held SE_SHARED locked at this
		 * point.
		 *
		 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
		 * the pages are still held SE_SHARED locked from the
		 * original segspt_create()
		 *
		 * Our goal is to get SE_EXCL lock on each page, remove
		 * permanent lock on it and invalidate the page.
		 */
		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
			if (hat_flags == HAT_UNLOAD_UNMAP)
				pp = page_lookup(vp, off, SE_EXCL);
			else {
				if ((pp = page_find(vp, off)) == NULL) {
					panic("segspt_free_pages: "
					    "page not locked");
					/*NOTREACHED*/
				}
				if (!page_tryupgrade(pp)) {
					page_unlock(pp);
					pp = page_lookup(vp, off, SE_EXCL);
				}
			}
			if (pp == NULL) {
				panic("segspt_free_pages: "
				    "page not in the system");
				/*NOTREACHED*/
			}
			ASSERT(pp->p_lckcnt > 0);
			page_pp_unlock(pp, 0, 1);
			if (pp->p_lckcnt == 0)
				unlocked_bytes += PAGESIZE;
		} else {
			if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
				continue;
		}
		/*
		 * It's logical to invalidate the pages here as in most cases
		 * these were created by segspt.
		 */
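		/*
		 * Large-page bookkeeping below: the first constituent
		 * page of a large page (root == 0) is remembered as
		 * rootpp and curnpgs is set to the constituent count;
		 * each later constituent decrements curnpgs, and when
		 * the last one is reached (curnpgs == 1) the whole
		 * large page is destroyed with a single
		 * page_destroy_pages(rootpp) call.  Small pages
		 * (p_szc == 0) take the else branch and are disposed
		 * of individually.
		 */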
		if (pp->p_szc != 0) {
			if (root == 0) {
				ASSERT(curnpgs == 0);
				root = 1;
				rootpp = pp;
				pgs = curnpgs = page_get_pagecnt(pp->p_szc);
				ASSERT(pgs > 1);
				ASSERT(IS_P2ALIGNED(pgs, pgs));
				ASSERT(!(page_pptonum(pp) & (pgs - 1)));
				curnpgs--;
			} else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
				ASSERT(curnpgs == 1);
				ASSERT(page_pptonum(pp) ==
				    page_pptonum(rootpp) + (pgs - 1));
				page_destroy_pages(rootpp);
				root = 0;
				curnpgs = 0;
			} else {
				ASSERT(curnpgs > 1);
				ASSERT(page_pptonum(pp) ==
				    page_pptonum(rootpp) + (pgs - curnpgs));
				curnpgs--;
			}
		} else {
			if (root != 0 || curnpgs != 0) {
				panic("segspt_free_pages: bad large page");
				/*NOTREACHED*/
			}
			/*
			 * Before destroying the pages, we need to take care
			 * of the rctl locked memory accounting. For that
			 * we need to calculate the unlocked_bytes.
			 */
			if (pp->p_lckcnt > 0)
				unlocked_bytes += PAGESIZE;
			/*LINTED: constant in conditional context */
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		}
	}
	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		if (unlocked_bytes > 0)
			rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
		mutex_exit(&sp->shm_mlock);
	}
	if (root != 0 || curnpgs != 0) {
		panic("segspt_free_pages: bad large page");
		/*NOTREACHED*/
	}

	/*
	 * mark that pages have been released
	 */
	sptd->spt_realsize = 0;

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		atomic_add_long(&spt_used, -npages);
		anon_swap_restore(npages);
	}
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segspt_getpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map		*amp;
	ulong_t			anon_index;
	lgrp_mem_policy_info_t	*policy_info;
	struct spt_data		*spt_data;

	ASSERT(seg != NULL);

	/*
	 * Get anon_map from segspt
	 *
	 * Assume that no lock needs to be held on anon_map, since
	 * it should be protected by its reference count which must be
	 * nonzero for an existing segment
	 * Need to grab readers lock on policy tree though
	 */
	spt_data = (struct spt_data *)seg->s_data;
	if (spt_data == NULL)
		return (NULL);
	amp = spt_data->spt_amp;
	ASSERT(amp->refcnt != 0);

	/*
	 * Get policy info
	 *
	 * Assume starting anon index of 0
	 */
	anon_index = seg_page(seg, addr);
	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);

	return (policy_info);
}

/*
 * DISM only.
 * Return locked pages over a given range.
 *
 * We will cache all DISM locked pages and save the pplist for the
 * entire segment in the ppa field of the underlying DISM segment structure.
 * Later, during a call to segspt_reclaim() we will use this ppa array
 * to page_unlock() all of the pages and then we will free this ppa list.
 */
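/*
 * Pagelock protocol sketch (hypothetical caller, not code from this
 * file): as_pagelock() is assumed to drive this entry point roughly
 * as follows, falling back to the slower F_SOFTLOCK path whenever
 * ENOTSUP comes back.
 *
 *	struct page **pl;
 *
 *	if (SEGOP_PAGELOCK(seg, addr, len, &pl, L_PAGELOCK, rw) == 0) {
 *		...	do I/O against pl[] ...
 *		(void) SEGOP_PAGELOCK(seg, addr, len, &pl,
 *		    L_PAGEUNLOCK, rw);
 *	}
 */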
/*ARGSUSED*/
static int
segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg = shmd->shm_sptseg;
	struct spt_data *sptd = sptseg->s_data;
	pgcnt_t pg_idx, npages, tot_npages, npgs;
	struct page **pplist, **pl, **ppa, *pp;
	struct anon_map *amp;
	spgcnt_t	an_idx;
	int	ret = ENOTSUP;
	uint_t	pl_built = 0;
	struct anon *ap;
	struct vnode *vp;
	u_offset_t off;
	pgcnt_t claim_availrmem = 0;
	uint_t	szc;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);

	/*
	 * We want to lock/unlock the entire ISM segment. Therefore,
	 * we will be using the underlying sptseg and its base address
	 * and length for the caching arguments.
	 */
	ASSERT(sptseg);
	ASSERT(sptd);

	pg_idx = seg_page(seg, addr);
	npages = btopr(len);

	/*
	 * check if the request is larger than number of pages covered
	 * by amp
	 */
	if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
		*ppp = NULL;
		return (ENOTSUP);
	}

	if (type == L_PAGEUNLOCK) {
		ASSERT(sptd->spt_ppa != NULL);

		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);

		/*
		 * If someone is blocked while unmapping, we purge
		 * segment page cache and thus reclaim pplist synchronously
		 * without waiting for seg_pasync_thread. This speeds up
		 * unmapping in cases where munmap(2) is called, while
		 * raw async i/o is still in progress or where a thread
		 * exits on data fault in a multithreaded application.
		 */
		if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
		    (AS_ISUNMAPWAIT(seg->s_as) &&
		    shmd->shm_softlockcnt > 0)) {
			segspt_purge(seg);
		}
		return (0);
	}

	/* The L_PAGELOCK case ... */

	if (sptd->spt_flags & DISM_PPA_CHANGED) {
		segspt_purge(seg);
		/*
		 * for DISM the ppa needs to be rebuilt since
		 * the number of locked pages could have changed
		 */
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * First try to find pages in segment page cache, without
	 * holding the segment lock.
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa != NULL);
		ASSERT(sptd->spt_ppa == pplist);
		ppa = sptd->spt_ppa;
		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
			if (ppa[an_idx] == NULL) {
				seg_pinactive(seg, NULL, seg->s_base,
				    sptd->spt_amp->size, ppa,
				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
				*ppp = NULL;
				return (ENOTSUP);
			}
			if ((szc = ppa[an_idx]->p_szc) != 0) {
				npgs = page_get_pagecnt(szc);
				an_idx = P2ROUNDUP(an_idx + 1, npgs);
			} else {
				an_idx++;
			}
		}
		/*
		 * Since we cache the entire DISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. pg_idx.
		 */
		*ppp = &(sptd->spt_ppa[pg_idx]);
		return (0);
	}
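	/*
	 * What follows is a double-checked lookup: the seg_plookup()
	 * above ran without sptd->spt_lock to keep the common hit case
	 * cheap; we now retry the lookup under the mutex so that a
	 * concurrent builder of spt_ppa is either observed completely
	 * or excluded while we build the array ourselves.
	 */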
	mutex_enter(&sptd->spt_lock);
	/*
	 * try to find pages in segment page cache with mutex
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa != NULL);
		ASSERT(sptd->spt_ppa == pplist);
		ppa = sptd->spt_ppa;
		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
			if (ppa[an_idx] == NULL) {
				mutex_exit(&sptd->spt_lock);
				seg_pinactive(seg, NULL, seg->s_base,
				    sptd->spt_amp->size, ppa,
				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
				*ppp = NULL;
				return (ENOTSUP);
			}
			if ((szc = ppa[an_idx]->p_szc) != 0) {
				npgs = page_get_pagecnt(szc);
				an_idx = P2ROUNDUP(an_idx + 1, npgs);
			} else {
				an_idx++;
			}
		}
		/*
		 * Since we cache the entire DISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. pg_idx.
		 */
		mutex_exit(&sptd->spt_lock);
		*ppp = &(sptd->spt_ppa[pg_idx]);
		return (0);
	}
	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
		mutex_exit(&sptd->spt_lock);
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * No need to worry about protections because DISM pages are always rw.
	 */
	pl = pplist = NULL;
	amp = sptd->spt_amp;

	/*
	 * Do we need to build the ppa array?
	 */
	if (sptd->spt_ppa == NULL) {
		pgcnt_t lpg_cnt = 0;

		pl_built = 1;
		tot_npages = btopr(sptd->spt_amp->size);

		ASSERT(sptd->spt_pcachecnt == 0);
		pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
		pl = pplist;

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		for (an_idx = 0; an_idx < tot_npages; ) {
			ap = anon_get_ptr(amp->ahp, an_idx);
			/*
			 * Cache only mlocked pages. For large pages,
			 * if one (constituent) page is mlocked,
			 * all pages for that large page
			 * are cached also. This is for quick
			 * lookups of the ppa array.
			 */
			if ((ap != NULL) && (lpg_cnt != 0 ||
			    (sptd->spt_ppa_lckcnt[an_idx] != 0))) {

				swap_xlate(ap, &vp, &off);
				pp = page_lookup(vp, off, SE_SHARED);
				ASSERT(pp != NULL);
				if (lpg_cnt == 0) {
					lpg_cnt++;
					/*
					 * For a small page, we are done --
					 * lpg_cnt is reset to 0 below.
					 *
					 * For a large page, we are guaranteed
					 * to find the anon structures of all
					 * constituent pages and a non-zero
					 * lpg_cnt ensures that we don't test
					 * for mlock for these. We are done
					 * when lpg_cnt reaches (npgs + 1).
					 * If we are not the first constituent
					 * page, restart at the first one.
					 */
					npgs = page_get_pagecnt(pp->p_szc);
					if (!IS_P2ALIGNED(an_idx, npgs)) {
						an_idx = P2ALIGN(an_idx, npgs);
						page_unlock(pp);
						continue;
					}
				}
				if (++lpg_cnt > npgs)
					lpg_cnt = 0;

				/*
				 * availrmem is decremented only
				 * for unlocked pages
				 */
				if (sptd->spt_ppa_lckcnt[an_idx] == 0)
					claim_availrmem++;
				pplist[an_idx] = pp;
			}
			an_idx++;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);

		if (claim_availrmem) {
			mutex_enter(&freemem_lock);
			if (availrmem < tune.t_minarmem + claim_availrmem) {
				mutex_exit(&freemem_lock);
				ret = ENOTSUP;
				claim_availrmem = 0;
				goto insert_fail;
			} else {
				availrmem -= claim_availrmem;
			}
			mutex_exit(&freemem_lock);
		}

		sptd->spt_ppa = pl;
	} else {
		/*
		 * We already have a valid ppa[].
		 */
		pl = sptd->spt_ppa;
	}

	ASSERT(pl != NULL);

	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
	    segspt_reclaim);
	if (ret == SEGP_FAIL) {
		/*
		 * seg_pinsert failed. We return
		 * ENOTSUP, so that the as_pagelock() code will
		 * then try the slower F_SOFTLOCK path.
		 */
		if (pl_built) {
			/*
			 * No one else has referenced the ppa[].
			 * We created it and we need to destroy it.
			 */
			sptd->spt_ppa = NULL;
		}
		ret = ENOTSUP;
		goto insert_fail;
	}

	/*
	 * In either case, we increment softlockcnt on the 'real' segment.
	 */
	sptd->spt_pcachecnt++;
	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	ppa = sptd->spt_ppa;
	for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
		if (ppa[an_idx] == NULL) {
			mutex_exit(&sptd->spt_lock);
			seg_pinactive(seg, NULL, seg->s_base,
			    sptd->spt_amp->size,
			    pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
			*ppp = NULL;
			return (ENOTSUP);
		}
		if ((szc = ppa[an_idx]->p_szc) != 0) {
			npgs = page_get_pagecnt(szc);
			an_idx = P2ROUNDUP(an_idx + 1, npgs);
		} else {
			an_idx++;
		}
	}
	/*
	 * We can now drop the sptd->spt_lock since the ppa[]
	 * exists and we have incremented pcachecnt.
	 */
	mutex_exit(&sptd->spt_lock);

	/*
	 * Since we cache the entire segment, we want to
	 * set ppp to point to the first slot that corresponds
	 * to the requested addr, i.e. pg_idx.
	 */
	*ppp = &(sptd->spt_ppa[pg_idx]);
	return (0);

insert_fail:
	/*
	 * We will only reach this code if we tried and failed.
	 *
	 * And we can drop the lock on the dummy seg, once we've failed
	 * to set up a new ppa[].
	 */
	mutex_exit(&sptd->spt_lock);

	if (pl_built) {
		if (claim_availrmem) {
			mutex_enter(&freemem_lock);
			availrmem += claim_availrmem;
			mutex_exit(&freemem_lock);
		}

		/*
		 * We created pl and we need to destroy it.
		 */
		pplist = pl;
		for (an_idx = 0; an_idx < tot_npages; an_idx++) {
			if (pplist[an_idx] != NULL)
				page_unlock(pplist[an_idx]);
		}
		kmem_free(pl, sizeof (page_t *) * tot_npages);
	}

	if (shmd->shm_softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	*ppp = NULL;
	return (ret);
}

/*
 * return locked pages over a given range.
 *
 * We will cache the entire ISM segment and save the pplist for the
 * entire segment in the ppa field of the underlying ISM segment structure.
 * Later, during a call to segspt_reclaim() we will use this ppa array
 * to page_unlock() all of the pages and then we will free this ppa list.
 */
/*ARGSUSED*/
static int
segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg = shmd->shm_sptseg;
	struct spt_data *sptd = sptseg->s_data;
	pgcnt_t np, page_index, npages;
	caddr_t a, spt_base;
	struct page **pplist, **pl, *pp;
	struct anon_map *amp;
	ulong_t anon_index;
	int ret = ENOTSUP;
	uint_t	pl_built = 0;
	struct anon *ap;
	struct vnode *vp;
	u_offset_t off;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);


	/*
	 * We want to lock/unlock the entire ISM segment. Therefore,
	 * we will be using the underlying sptseg and its base address
	 * and length for the caching arguments.
	 */
	ASSERT(sptseg);
	ASSERT(sptd);

	if (sptd->spt_flags & SHM_PAGEABLE) {
		return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
	}

	page_index = seg_page(seg, addr);
	npages = btopr(len);

	/*
	 * check if the request is larger than number of pages covered
	 * by amp
	 */
	if (page_index + npages > btopr(sptd->spt_amp->size)) {
		*ppp = NULL;
		return (ENOTSUP);
	}

	if (type == L_PAGEUNLOCK) {

		ASSERT(sptd->spt_ppa != NULL);

		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);

		/*
		 * If someone is blocked while unmapping, we purge
		 * segment page cache and thus reclaim pplist synchronously
		 * without waiting for seg_pasync_thread. This speeds up
		 * unmapping in cases where munmap(2) is called, while
		 * raw async i/o is still in progress or where a thread
		 * exits on data fault in a multithreaded application.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
			segspt_purge(seg);
		}
		return (0);
	}

	/* The L_PAGELOCK case... */

	/*
	 * First try to find pages in segment page cache, without
	 * holding the segment lock.
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa == pplist);
		ASSERT(sptd->spt_ppa[page_index]);
		/*
		 * Since we cache the entire ISM segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. page_index.
		 */
		*ppp = &(sptd->spt_ppa[page_index]);
		return (0);
	}
	mutex_enter(&sptd->spt_lock);

	/*
	 * try to find pages in segment page cache
	 */
	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    S_WRITE, SEGP_FORCE_WIRED);
	if (pplist != NULL) {
		ASSERT(sptd->spt_ppa == pplist);
		/*
		 * Since we cache the entire segment, we want to
		 * set ppp to point to the first slot that corresponds
		 * to the requested addr, i.e. page_index.
		 */
		mutex_exit(&sptd->spt_lock);
		*ppp = &(sptd->spt_ppa[page_index]);
		return (0);
	}

	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
		mutex_exit(&sptd->spt_lock);
		*ppp = NULL;
		return (ENOTSUP);
	}

	/*
	 * No need to worry about protections because ISM pages
	 * are always rw.
	 */
	pl = pplist = NULL;

	/*
	 * Do we need to build the ppa array?
	 */
	if (sptd->spt_ppa == NULL) {
		ASSERT(sptd->spt_ppa == pplist);

		spt_base = sptseg->s_base;
		pl_built = 1;

		/*
		 * availrmem is decremented once during anon_swap_adjust()
		 * and is incremented during the anon_unresv(), which is
		 * called from shm_rm_amp() when the segment is destroyed.
		 */
		amp = sptd->spt_amp;
		ASSERT(amp != NULL);

		/* pcachecnt is protected by sptd->spt_lock */
		ASSERT(sptd->spt_pcachecnt == 0);
		pplist = kmem_zalloc(sizeof (page_t *)
		    * btopr(sptd->spt_amp->size), KM_SLEEP);
		pl = pplist;

		anon_index = seg_page(sptseg, spt_base);

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
		    a += PAGESIZE, anon_index++, pplist++) {
			ap = anon_get_ptr(amp->ahp, anon_index);
			ASSERT(ap != NULL);
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, off, SE_SHARED);
			ASSERT(pp != NULL);
			*pplist = pp;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);

		if (a < (spt_base + sptd->spt_amp->size)) {
			ret = ENOTSUP;
			goto insert_fail;
		}
		sptd->spt_ppa = pl;
	} else {
		/*
		 * We already have a valid ppa[].
		 */
		pl = sptd->spt_ppa;
	}

	ASSERT(pl != NULL);

	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
	    segspt_reclaim);
	if (ret == SEGP_FAIL) {
		/*
		 * seg_pinsert failed. We return
		 * ENOTSUP, so that the as_pagelock() code will
		 * then try the slower F_SOFTLOCK path.
		 */
		if (pl_built) {
			/*
			 * No one else has referenced the ppa[].
			 * We created it and we need to destroy it.
			 */
			sptd->spt_ppa = NULL;
		}
		ret = ENOTSUP;
		goto insert_fail;
	}

	/*
	 * In either case, we increment softlockcnt on the 'real' segment.
	 */
	sptd->spt_pcachecnt++;
	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	/*
	 * We can now drop the sptd->spt_lock since the ppa[]
	 * exists and we have incremented pcachecnt.
	 */
	mutex_exit(&sptd->spt_lock);

	/*
	 * Since we cache the entire segment, we want to
	 * set ppp to point to the first slot that corresponds
	 * to the requested addr, i.e. page_index.
	 */
	*ppp = &(sptd->spt_ppa[page_index]);
	return (0);

insert_fail:
	/*
	 * We will only reach this code if we tried and failed.
	 *
	 * And we can drop the lock on the dummy seg, once we've failed
	 * to set up a new ppa[].
	 */
	mutex_exit(&sptd->spt_lock);

	if (pl_built) {
		/*
		 * We created pl and we need to destroy it.
		 */
		pplist = pl;
		np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
		while (np) {
			page_unlock(*pplist);
			np--;
			pplist++;
		}
		kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
	}
	if (shmd->shm_softlockcnt <= 0) {
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
	*ppp = NULL;
	return (ret);
}

/*
 * purge any cached pages in the I/O page cache
 */
static void
segspt_purge(struct seg *seg)
{
	seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
}

static int
segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw, int async)
{
	struct seg *seg = (struct seg *)ptag;
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg;
	struct spt_data *sptd;
	pgcnt_t npages, i, free_availrmem = 0;
	int	done = 0;

#ifdef lint
	addr = addr;
#endif
	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;
	npages = (len >> PAGESHIFT);
	ASSERT(npages);
	ASSERT(sptd->spt_pcachecnt != 0);
	ASSERT(sptd->spt_ppa == pplist);
	ASSERT(npages == btopr(sptd->spt_amp->size));
	ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Acquire the lock on the dummy seg and destroy the
	 * ppa array IF this is the last pcachecnt.
	 */
	mutex_enter(&sptd->spt_lock);
	if (--sptd->spt_pcachecnt == 0) {
		for (i = 0; i < npages; i++) {
			if (pplist[i] == NULL) {
				continue;
			}
			if (rw == S_WRITE) {
				hat_setrefmod(pplist[i]);
			} else {
				hat_setref(pplist[i]);
			}
			if ((sptd->spt_flags & SHM_PAGEABLE) &&
			    (sptd->spt_ppa_lckcnt[i] == 0))
				free_availrmem++;
			page_unlock(pplist[i]);
		}
		if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
			mutex_enter(&freemem_lock);
			availrmem += free_availrmem;
			mutex_exit(&freemem_lock);
		}
		/*
		 * Since we want to cache/uncache the entire ISM segment,
		 * we will track the pplist in a segspt specific field
		 * ppa, that is initialized at the time we add an entry to
		 * the cache.
		 */
		ASSERT(sptd->spt_pcachecnt == 0);
		kmem_free(pplist, sizeof (page_t *) * npages);
		sptd->spt_ppa = NULL;
		sptd->spt_flags &= ~DISM_PPA_CHANGED;
		sptd->spt_gen++;
		cv_broadcast(&sptd->spt_cv);
		done = 1;
	}
	mutex_exit(&sptd->spt_lock);

	/*
	 * If we are the pcache async thread or were called via
	 * seg_ppurge_wiredpp() we may not hold the AS lock (in this case
	 * the async argument is not 0). This means that if softlockcnt
	 * drops to 0 after the decrement below, the address space may get
	 * freed. We can't allow that, since after the softlock count
	 * decrements to 0 we still need to access the as structure for a
	 * possible wakeup of unmap waiters. To prevent the disappearance
	 * of the as we take this segment's shm_segfree_syncmtx.
	 * segspt_shmfree() also takes this mutex as a barrier to make
	 * sure this routine completes before the segment is freed.
	 *
	 * The second complication we have to deal with in the async case
	 * is the possibility of a missed wakeup of an unmap wait thread.
	 * When we don't hold the as lock here, we may take the a_contents
	 * lock before an unmap wait thread that was first to see that
	 * softlockcnt was still not 0. As a result we'll fail to wake it
	 * up. To avoid this race we set the nounmapwait flag in the as
	 * structure if we drop softlockcnt to 0 while async is not 0. An
	 * unmapwait thread will not block if this flag is set.
	 */
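	/*
	 * Sketch of the missed-wakeup window closed by the nounmapwait
	 * flag (illustrative interleaving):
	 *
	 *	unmap thread			async reclaim thread
	 *	-----------			--------------------
	 *	sees softlockcnt != 0
	 *					decrements softlockcnt to 0
	 *					a_contents: UNMAPWAIT not yet
	 *					    set, nothing to wake
	 *	a_contents: sets UNMAPWAIT,
	 *	    sleeps with no wakeup due
	 *
	 * Setting AS_SETNOUNMAPWAIT below prevents the unmap thread
	 * from blocking once the flag is set, which closes the window.
	 */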
	if (async)
		mutex_enter(&shmd->shm_segfree_syncmtx);

	/*
	 * Now decrement softlockcnt.
	 */
	ASSERT(shmd->shm_softlockcnt > 0);
	atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));

	if (shmd->shm_softlockcnt <= 0) {
		if (async || AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (async)
				AS_SETNOUNMAPWAIT(seg->s_as);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}

	if (async)
		mutex_exit(&shmd->shm_segfree_syncmtx);

	return (done);
}

/*
 * Do a F_SOFTUNLOCK call over the range requested.
 * The range must have already been F_SOFTLOCK'ed.
 *
 * The calls to acquire and release the anon map lock mutex were
 * removed in order to avoid a deadly embrace during a DR
 * memory delete operation. (E.g. DR blocks while waiting for an
 * exclusive lock on a page that is being used for kaio; the
 * thread that will complete the kaio and call segspt_softunlock
 * blocks on the anon map lock; another thread holding the anon
 * map lock blocks on another page lock via the segspt_shmfault
 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
 *
 * The appropriateness of the removal is based upon the following:
 * 1. If we are holding a segment's reader lock and the page is held
 * shared, then the corresponding element in anonmap which points to
 * anon struct cannot change and there is no need to acquire the
 * anonymous map lock.
 * 2. Threads in segspt_softunlock have a reader lock on the segment
 * and already have the shared page lock, so we are guaranteed that
 * the anon map slot cannot change and therefore can call anon_get_ptr()
 * without grabbing the anonymous map lock.
 * 3. Threads that softlock a shared page break copy-on-write, even if
 * it's a read. Thus cow faults can be ignored with respect to soft
 * unlocking, since the breaking of cow means that the anon slot(s) will
 * not be shared.
 */
static void
segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
    size_t len, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg;
	struct spt_data *sptd;
	page_t *pp;
	caddr_t adr;
	struct vnode *vp;
	u_offset_t offset;
	ulong_t anon_index;
	struct anon_map *amp;		/* XXX - for locknest */
	struct anon *ap = NULL;
	pgcnt_t npages;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	sptseg = shmd->shm_sptseg;
	sptd = sptseg->s_data;

	/*
	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
	 * and therefore their pages are SE_SHARED locked
	 * for the entire life of the segment.
	 */
	if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
	    ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
		goto softlock_decrement;
	}

	/*
	 * Any thread is free to do a page_find and
	 * page_unlock() on the pages within this seg.
	 *
	 * We are already holding the as->a_lock on the user's
	 * real segment, but we need to hold the a_lock on the
	 * underlying dummy as. This is mostly to satisfy the
	 * underlying HAT layer.
	 */
	AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
	hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
	AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);

	amp = sptd->spt_amp;
	ASSERT(amp != NULL);
	anon_index = seg_page(sptseg, sptseg_addr);

	for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
		ap = anon_get_ptr(amp->ahp, anon_index++);
		ASSERT(ap != NULL);
		swap_xlate(ap, &vp, &offset);

		/*
		 * Use page_find() instead of page_lookup() to
		 * find the page since we know that it has a
		 * "shared" lock.
		 */
		pp = page_find(vp, offset);
		ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
		if (pp == NULL) {
			panic("segspt_softunlock: "
			    "addr %p, ap %p, vp %p, off %llx",
			    (void *)adr, (void *)ap, (void *)vp, offset);
			/*NOTREACHED*/
		}

		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else if (rw != S_OTHER) {
			hat_setref(pp);
		}
		page_unlock(pp);
	}

softlock_decrement:
	npages = btopr(len);
	ASSERT(shmd->shm_softlockcnt >= npages);
	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
	if (shmd->shm_softlockcnt == 0) {
		/*
		 * All SOFTLOCKS are gone. Wakeup any waiting
		 * unmappers so they can try again to unmap.
		 * Check for waiters first without the mutex
		 * held so we don't always grab the mutex on
		 * softunlocks.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
}

int
segspt_shmattach(struct seg *seg, caddr_t *argsp)
{
	struct shm_data *shmd_arg = (struct shm_data *)argsp;
	struct shm_data *shmd;
	struct anon_map *shm_amp = shmd_arg->shm_amp;
	struct spt_data *sptd;
	int error = 0;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
	if (shmd == NULL)
		return (ENOMEM);

	shmd->shm_sptas = shmd_arg->shm_sptas;
	shmd->shm_amp = shm_amp;
	shmd->shm_sptseg = shmd_arg->shm_sptseg;

	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
	    NULL, 0, seg->s_size);

	mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);

	seg->s_data = (void *)shmd;
	seg->s_ops = &segspt_shmops;
	seg->s_szc = shmd->shm_sptseg->s_szc;
	sptd = shmd->shm_sptseg->s_data;

	if (sptd->spt_flags & SHM_PAGEABLE) {
		if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
		    KM_NOSLEEP)) == NULL) {
			seg->s_data = (void *)NULL;
			kmem_free(shmd, (sizeof (*shmd)));
			return (ENOMEM);
		}
		shmd->shm_lckpgs = 0;
		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
			if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
			    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
			    seg->s_size, seg->s_szc)) != 0) {
				kmem_free(shmd->shm_vpage,
				    btopr(shm_amp->size));
			}
		}
	} else {
		error = hat_share(seg->s_as->a_hat, seg->s_base,
		    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
		    seg->s_size, seg->s_szc);
	}
	if (error) {
		seg->s_szc = 0;
		seg->s_data = (void *)NULL;
		kmem_free(shmd, (sizeof (*shmd)));
	} else {
		ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
		shm_amp->refcnt++;
		ANON_LOCK_EXIT(&shm_amp->a_rwlock);
	}
	return (error);
}

int
segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	int reclaim = 1;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
retry:
	if (shmd->shm_softlockcnt > 0) {
		if (reclaim == 1) {
			segspt_purge(seg);
			reclaim = 0;
			goto retry;
		}
		return (EAGAIN);
	}

	if (ssize != seg->s_size) {
#ifdef DEBUG
		cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
		    ssize, seg->s_size);
#endif
		return (EINVAL);
	}

	(void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
	    NULL, 0);
	hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);

	seg_free(seg);

	return (0);
}

void
segspt_shmfree(struct seg *seg)
{
	struct shm_data	*shmd = (struct shm_data *)seg->s_data;
	struct anon_map *shm_amp = shmd->shm_amp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	(void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
	    MC_UNLOCK, NULL, 0);

	/*
	 * Need to increment refcnt when attaching
	 * and decrement when detaching because of dup().
	 */
	ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
	shm_amp->refcnt--;
	ANON_LOCK_EXIT(&shm_amp->a_rwlock);

	if (shmd->shm_vpage) {	/* only for DISM */
		kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
		shmd->shm_vpage = NULL;
	}

	/*
	 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
	 * still working with this segment without holding as lock.
	 */
	ASSERT(shmd->shm_softlockcnt == 0);
	mutex_enter(&shmd->shm_segfree_syncmtx);
	mutex_destroy(&shmd->shm_segfree_syncmtx);

	kmem_free(shmd, sizeof (*shmd));
}

/*ARGSUSED*/
int
segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Shared page table is more than shared mapping.
	 * Individual process sharing page tables can't change prot
	 * because there is only one set of page tables.
	 * This will be allowed after private page table is
	 * supported.
	 */
	/* need to return correct status error? */
	return (0);
}


faultcode_t
segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
    size_t len, enum fault_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg	*sptseg = shmd->shm_sptseg;
	struct as	*curspt = shmd->shm_sptas;
	struct spt_data *sptd = sptseg->s_data;
	pgcnt_t npages;
	size_t size;
	caddr_t segspt_addr, shm_addr;
	page_t **ppa;
	int	i;
	ulong_t an_idx = 0;
	int	err = 0;
	int	dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
	size_t	pgsz;
	pgcnt_t	pgcnt;
	caddr_t	a;
	pgcnt_t	pidx;

#ifdef lint
	hat = hat;
#endif
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * Because of the way spt is implemented,
	 * the realsize of the segment does not have to be
	 * equal to the segment size itself. The segment size is
	 * often in multiples of a page size larger than PAGESIZE.
	 * The realsize is rounded up to the nearest PAGESIZE
	 * based on what the user requested. This is a bit of
	 * ugliness that is historical but not easily fixed
	 * without re-designing the higher levels of ISM.
	 */
	ASSERT(addr >= seg->s_base);
	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
		return (FC_NOMAP);
	/*
	 * For all of the following cases except F_PROT, we need to
	 * make any necessary adjustments to addr and len
	 * and get all of the necessary page_t's into an array called ppa[].
	 *
	 * The code in shmat() forces base addr and len of ISM segment
	 * to be aligned to largest page size supported. Therefore,
	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
	 * in large pagesize chunks, or else we will screw up the HAT
	 * layer by calling hat_memload_array() with differing page sizes
	 * over a given virtual range.
	 */
	pgsz = page_get_pagesize(sptseg->s_szc);
	pgcnt = page_get_pagecnt(sptseg->s_szc);
	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
	npages = btopr(size);
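	/*
	 * Worked example of the alignment above (illustrative numbers):
	 * with 4M large pages (pgsz == 0x400000), a fault at
	 * addr == seg->s_base + 0x401000 for len == 0x2000 yields
	 * shm_addr == seg->s_base + 0x400000 and
	 * size == P2ROUNDUP(0x3000, 0x400000) == 0x400000, so the whole
	 * surrounding large page is faulted in as one chunk.
	 */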
	/*
	 * Now we need to convert from addr in segshm to addr in segspt.
	 */
	an_idx = seg_page(seg, shm_addr);
	segspt_addr = sptseg->s_base + ptob(an_idx);

	ASSERT((segspt_addr + ptob(npages)) <=
	    (sptseg->s_base + sptd->spt_realsize));
	ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));

	switch (type) {

	case F_SOFTLOCK:

		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
		/*
		 * Fall through to the F_INVAL case to load up the hat layer
		 * entries with the HAT_LOAD_LOCK flag.
		 */
		/* FALLTHRU */
	case F_INVAL:

		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
			return (FC_NOMAP);

		ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);

		err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
		if (err != 0) {
			if (type == F_SOFTLOCK) {
				atomic_add_long((ulong_t *)(
				    &(shmd->shm_softlockcnt)), -npages);
			}
			goto dism_err;
		}
		AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
		a = segspt_addr;
		pidx = 0;
		if (type == F_SOFTLOCK) {

			/*
			 * Load up the translation keeping it
			 * locked and don't unlock the page.
			 */
			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
				hat_memload_array(sptseg->s_as->a_hat,
				    a, pgsz, &ppa[pidx], sptd->spt_prot,
				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
			}
		} else {
			/*
			 * Migrate pages marked for migration
			 */
			if (lgrp_optimizations())
				page_migrate(seg, shm_addr, ppa, npages);

			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
				hat_memload_array(sptseg->s_as->a_hat,
				    a, pgsz, &ppa[pidx],
				    sptd->spt_prot,
				    HAT_LOAD_SHARE);
			}

			/*
			 * And now drop the SE_SHARED lock(s).
			 */
			if (dyn_ism_unmap) {
				for (i = 0; i < npages; i++) {
					page_unlock(ppa[i]);
				}
			}
		}

		if (!dyn_ism_unmap) {
			if (hat_share(seg->s_as->a_hat, shm_addr,
			    curspt->a_hat, segspt_addr, ptob(npages),
			    seg->s_szc) != 0) {
				panic("hat_share err in DISM fault");
				/* NOTREACHED */
			}
			if (type == F_INVAL) {
				for (i = 0; i < npages; i++) {
					page_unlock(ppa[i]);
				}
			}
		}
		AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
dism_err:
		kmem_free(ppa, npages * sizeof (page_t *));
		return (err);

	case F_SOFTUNLOCK:

		/*
		 * This is a bit ugly, we pass in the real seg pointer,
		 * but the segspt_addr is the virtual address within the
		 * dummy seg.
		 */
		segspt_softunlock(seg, segspt_addr, size, rw);
		return (0);

	case F_PROT:

		/*
		 * This takes care of the unusual case where a user
		 * allocates a stack in shared memory and a register
		 * window overflow is written to that stack page before
		 * it is otherwise modified.
		 *
		 * We can get away with this because ISM segments are
		 * always rw. Other than this unusual case, there
		 * should be no instances of protection violations.
		 */
faultcode_t
segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
    size_t len, enum fault_type type, enum seg_rw rw)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct seg *sptseg = shmd->shm_sptseg;
	struct as *curspt = shmd->shm_sptas;
	struct spt_data *sptd = sptseg->s_data;
	pgcnt_t npages;
	size_t size;
	caddr_t sptseg_addr, shm_addr;
	page_t *pp, **ppa;
	int i;
	u_offset_t offset;
	ulong_t anon_index = 0;
	struct vnode *vp;
	struct anon_map *amp;		/* XXX - for locknest */
	struct anon *ap = NULL;
	size_t pgsz;
	pgcnt_t pgcnt;
	caddr_t a;
	pgcnt_t pidx;
	size_t sz;

#ifdef lint
	hat = hat;
#endif

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	if (sptd->spt_flags & SHM_PAGEABLE) {
		return (segspt_dismfault(hat, seg, addr, len, type, rw));
	}

	/*
	 * Because of the way spt is implemented, the realsize of the segment
	 * does not have to be equal to the segment size itself. The segment
	 * size is often in multiples of a page size larger than PAGESIZE.
	 * The realsize is rounded up to the nearest PAGESIZE based on what
	 * the user requested. This is a bit of ugliness that is historical
	 * but not easily fixed without re-designing the higher levels of ISM.
	 */
	ASSERT(addr >= seg->s_base);
	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
		return (FC_NOMAP);
	/*
	 * For all of the following cases except F_PROT, we need to
	 * make any necessary adjustments to addr and len
	 * and get all of the necessary page_t's into an array called ppa[].
	 *
	 * The code in shmat() forces base addr and len of an ISM segment
	 * to be aligned to the largest page size supported. Therefore,
	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
	 * in large pagesize chunks, or else we will screw up the HAT
	 * layer by calling hat_memload_array() with differing page sizes
	 * over a given virtual range.
	 */
	pgsz = page_get_pagesize(sptseg->s_szc);
	pgcnt = page_get_pagecnt(sptseg->s_szc);
	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
	npages = btopr(size);

	/*
	 * Now we need to convert from addr in segshm to addr in segspt.
	 */
	anon_index = seg_page(seg, shm_addr);
	sptseg_addr = sptseg->s_base + ptob(anon_index);

	/*
	 * And now we may have to adjust npages downward if we have
	 * exceeded the realsize of the segment or initial anon
	 * allocations.
	 */
	if ((sptseg_addr + ptob(npages)) >
	    (sptseg->s_base + sptd->spt_realsize))
		size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;

	npages = btopr(size);

	ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
	ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
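
	/*
	 * A worked example of the clamp above (hypothetical numbers): with
	 * 4MB large pages and spt_realsize of 6MB, a fault near the end of
	 * the segment is first rounded out to a full large-page chunk that
	 * would extend to 8MB. Since only 6MB is backed, size is clamped to
	 * the distance from sptseg_addr to the end of the real allocation
	 * and npages is recomputed, so we never touch pages beyond
	 * spt_realsize.
	 */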

	switch (type) {

	case F_SOFTLOCK:

		/*
		 * availrmem is decremented once during anon_swap_adjust()
		 * and is incremented during the anon_unresv(), which is
		 * called from shm_rm_amp() when the segment is destroyed.
		 */
		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
		/*
		 * Some platforms assume that ISM pages are SE_SHARED
		 * locked for the entire life of the segment.
		 */
		if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
			return (0);
		/*
		 * Fall through to the F_INVAL case to load up the hat layer
		 * entries with the HAT_LOAD_LOCK flag.
		 */

		/* FALLTHRU */
	case F_INVAL:

		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
			return (FC_NOMAP);

		/*
		 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
		 * may still rely on this call to hat_share(). That
		 * would imply that those HATs can fault on a
		 * HAT_LOAD_LOCK translation, which would seem
		 * contradictory.
		 */
		if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
			if (hat_share(seg->s_as->a_hat, seg->s_base,
			    curspt->a_hat, sptseg->s_base,
			    sptseg->s_size, sptseg->s_szc) != 0) {
				panic("hat_share error in ISM fault");
				/*NOTREACHED*/
			}
			return (0);
		}
		ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);

		/*
		 * I see no need to lock the real seg here, because all of
		 * our work will be on the underlying dummy seg.
		 *
		 * sptseg_addr and npages now account for large pages.
		 */
		amp = sptd->spt_amp;
		ASSERT(amp != NULL);
		anon_index = seg_page(sptseg, sptseg_addr);

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		for (i = 0; i < npages; i++) {
			ap = anon_get_ptr(amp->ahp, anon_index++);
			ASSERT(ap != NULL);
			swap_xlate(ap, &vp, &offset);
			pp = page_lookup(vp, offset, SE_SHARED);
			ASSERT(pp != NULL);
			ppa[i] = pp;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		ASSERT(i == npages);

		/*
		 * We are already holding the as->a_lock on the user's
		 * real segment, but we need to hold the a_lock on the
		 * underlying dummy as. This is mostly to satisfy the
		 * underlying HAT layer.
		 */
		AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
		a = sptseg_addr;
		pidx = 0;
		if (type == F_SOFTLOCK) {
			/*
			 * Load up the translation keeping it
			 * locked and don't unlock the page.
			 */
			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
				sz = MIN(pgsz, ptob(npages - pidx));
				hat_memload_array(sptseg->s_as->a_hat, a,
				    sz, &ppa[pidx], sptd->spt_prot,
				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
			}
		} else {
			/*
			 * Migrate pages marked for migration.
			 */
			if (lgrp_optimizations())
				page_migrate(seg, shm_addr, ppa, npages);

			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
				sz = MIN(pgsz, ptob(npages - pidx));
				hat_memload_array(sptseg->s_as->a_hat,
				    a, sz, &ppa[pidx],
				    sptd->spt_prot, HAT_LOAD_SHARE);
			}

			/*
			 * And now drop the SE_SHARED lock(s).
			 */
			for (i = 0; i < npages; i++)
				page_unlock(ppa[i]);
		}
		AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);

		kmem_free(ppa, sizeof (page_t *) * npages);
		return (0);
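
	/*
	 * Note on the F_SOFTLOCK/F_INVAL loops above: because npages may
	 * have been clamped to spt_realsize, the final chunk can be smaller
	 * than a full large page, which is why sz is computed as
	 * MIN(pgsz, ptob(npages - pidx)). A hypothetical trace with 4MB
	 * large pages on an 8K-page machine (pgcnt == 512) and npages ==
	 * 768: the first hat_memload_array() call covers 512 pages, the
	 * second covers the remaining 256 (sz == 2MB).
	 */
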
	case F_SOFTUNLOCK:

		/*
		 * This is a bit ugly: we pass in the real seg pointer,
		 * but sptseg_addr is the virtual address within the
		 * dummy seg.
		 */
		segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
		return (0);

	case F_PROT:

		/*
		 * This takes care of the unusual case where a user
		 * allocates a stack in shared memory and a register
		 * window overflow is written to that stack page before
		 * it is otherwise modified.
		 *
		 * We can get away with this because ISM segments are
		 * always rw. Other than this unusual case, there
		 * should be no instances of protection violations.
		 */
		return (0);

	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "segspt_shmfault default type?");
#endif
		return (FC_NOMAP);
	}
}

/*ARGSUSED*/
static faultcode_t
segspt_shmfaulta(struct seg *seg, caddr_t addr)
{
	return (0);
}

/*ARGSUSED*/
static int
segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
	return (0);
}

/*
 * duplicate the shared page tables
 */
int
segspt_shmdup(struct seg *seg, struct seg *newseg)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct anon_map *amp = shmd->shm_amp;
	struct shm_data *shmd_new;
	struct seg *spt_seg = shmd->shm_sptseg;
	struct spt_data *sptd = spt_seg->s_data;
	int error = 0;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
	newseg->s_data = (void *)shmd_new;
	shmd_new->shm_sptas = shmd->shm_sptas;
	shmd_new->shm_amp = amp;
	shmd_new->shm_sptseg = shmd->shm_sptseg;
	newseg->s_ops = &segspt_shmops;
	newseg->s_szc = seg->s_szc;
	ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	amp->refcnt++;
	ANON_LOCK_EXIT(&amp->a_rwlock);

	if (sptd->spt_flags & SHM_PAGEABLE) {
		shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
		shmd_new->shm_lckpgs = 0;
		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
			if ((error = hat_share(newseg->s_as->a_hat,
			    newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
			    seg->s_size, seg->s_szc)) != 0) {
				kmem_free(shmd_new->shm_vpage,
				    btopr(amp->size));
			}
		}
		return (error);
	} else {
		return (hat_share(newseg->s_as->a_hat, newseg->s_base,
		    shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
		    seg->s_szc));
	}
}
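
/*
 * Orientation note (a sketch of the caller context, not new behavior):
 * segspt_shmdup() is the .dup entry for these segments, so it runs when
 * an address space is duplicated, e.g. on fork(). The child gets its own
 * shm_data, the anon_map reference count is bumped under the write lock,
 * and the child's HAT is pointed at the same underlying shared page
 * tables via hat_share(). Only DISM (SHM_PAGEABLE) segments carry the
 * shm_vpage locking bitmap, so only that path allocates one.
 */
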
/*ARGSUSED*/
int
segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * ISM segment is always rw.
	 */
	return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
}

/*
 * Return an array of locked large pages; for empty slots, allocate
 * private zero-filled anon pages.
 */
static int
spt_anon_getpages(
	struct seg *sptseg,
	caddr_t sptaddr,
	size_t len,
	page_t *ppa[])
{
	struct spt_data *sptd = sptseg->s_data;
	struct anon_map *amp = sptd->spt_amp;
	enum seg_rw rw = sptd->spt_prot;
	uint_t szc = sptseg->s_szc;
	size_t pg_sz, share_sz = page_get_pagesize(szc);
	pgcnt_t lp_npgs;
	caddr_t lp_addr, e_sptaddr;
	uint_t vpprot, ppa_szc = 0;
	struct vpage *vpage = NULL;
	ulong_t j, ppa_idx;
	int err, ierr = 0;
	pgcnt_t an_idx;
	anon_sync_obj_t cookie;
	int anon_locked = 0;
	pgcnt_t amp_pgs;


	ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
	ASSERT(len != 0);

	pg_sz = share_sz;
	lp_npgs = btop(pg_sz);
	lp_addr = sptaddr;
	e_sptaddr = sptaddr + len;
	an_idx = seg_page(sptseg, sptaddr);
	ppa_idx = 0;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);

	amp_pgs = page_get_pagecnt(amp->a_szc);
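
	/*
	 * The loop below retries at a different page size whenever
	 * anon_map_getpages() returns -1 or -2 (see the comment on ierr
	 * further down). A hypothetical trace with segvn_anypgsz set and
	 * an initial szc of 2: a -1 return steps szc down to 1, another -1
	 * steps it down to 0 (PAGESIZE), where allocation can no longer
	 * fail for want of a large page.
	 */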
	/*CONSTCOND*/
	while (1) {
		for (; lp_addr < e_sptaddr;
		    an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {

			/*
			 * If we're currently locked, and we get to a new
			 * page, unlock our current anon chunk.
			 */
			if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
				anon_array_exit(&cookie);
				anon_locked = 0;
			}
			if (!anon_locked) {
				anon_array_enter(amp, an_idx, &cookie);
				anon_locked = 1;
			}
			ppa_szc = (uint_t)-1;
			ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
			    lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
			    &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);

			if (ierr != 0) {
				if (ierr > 0) {
					err = FC_MAKE_ERR(ierr);
					goto lpgs_err;
				}
				break;
			}
		}
		if (lp_addr == e_sptaddr) {
			break;
		}
		ASSERT(lp_addr < e_sptaddr);

		/*
		 * ierr == -1 means we failed to allocate a large page,
		 * so do a size-down operation.
		 *
		 * ierr == -2 means some other process that privately shares
		 * pages with this process has allocated a larger page and we
		 * need to retry with larger pages. So do a size-up
		 * operation. This relies on the fact that large pages are
		 * never partially shared, i.e. if we share any constituent
		 * page of a large page with another process we must share
		 * the entire large page. Note this cannot happen for the
		 * SOFTLOCK case, unless the current address (lp_addr) is at
		 * the beginning of the next page size boundary, because the
		 * other process couldn't have relocated locked pages.
		 */
		ASSERT(ierr == -1 || ierr == -2);
		if (segvn_anypgsz) {
			ASSERT(ierr == -2 || szc != 0);
			ASSERT(ierr == -1 || szc < sptseg->s_szc);
			szc = (ierr == -1) ? szc - 1 : szc + 1;
		} else {
			/*
			 * For faults and segvn_anypgsz == 0
			 * we need to be careful not to loop forever
			 * if existing page is found with szc other
			 * than 0 or seg->s_szc. This could be due
			 * to page relocations on behalf of DR or
			 * more likely large page creation. For this
			 * case simply re-size to existing page's szc
			 * if returned by anon_map_getpages().
			 */
			if (ppa_szc == (uint_t)-1) {
				szc = (ierr == -1) ? 0 : sptseg->s_szc;
			} else {
				ASSERT(ppa_szc <= sptseg->s_szc);
				ASSERT(ierr == -2 || ppa_szc < szc);
				ASSERT(ierr == -1 || ppa_szc > szc);
				szc = ppa_szc;
			}
		}
		pg_sz = page_get_pagesize(szc);
		lp_npgs = btop(pg_sz);
		ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
	}
	if (anon_locked) {
		anon_array_exit(&cookie);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	return (0);

lpgs_err:
	if (anon_locked) {
		anon_array_exit(&cookie);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	for (j = 0; j < ppa_idx; j++)
		page_unlock(ppa[j]);
	return (err);
}

/*
 * count the number of bytes in a set of spt pages that are currently not
 * locked
 */
static rctl_qty_t
spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
{
	ulong_t i;
	rctl_qty_t unlocked = 0;

	for (i = 0; i < npages; i++) {
		if (ppa[i]->p_lckcnt == 0)
			unlocked += PAGESIZE;
	}
	return (unlocked);
}

extern u_longlong_t randtick(void);
/* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
#define	NLCK	(NCPU_P2)
/* Random number with a range [0, n-1], n must be power of two */
#define	RAND_P2(n)	\
	((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))

int
spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
    page_t **ppa, ulong_t *lockmap, size_t pos,
    rctl_qty_t *locked)
{
	struct shm_data *shmd = seg->s_data;
	struct spt_data *sptd = shmd->shm_sptseg->s_data;
	ulong_t i;
	int kernel;
	pgcnt_t nlck = 0;
	int rv = 0;
	int use_reserved = 1;

	/* return the number of bytes actually locked */
	*locked = 0;

	/*
	 * To avoid contention on freemem_lock, the availrmem and
	 * pages_locked global counters are updated only every nlck locked
	 * pages instead of every time. Reserve nlck locks up front and
	 * deduct from this reservation for each page that requires a lock.
	 * When the reservation is consumed, reserve again. nlck is
	 * randomized, so that competing threads do not fall into a cyclic
	 * lock contention pattern. When memory is low, the advance
	 * reservation is disabled and page_pp_lock() updates the counters
	 * for each page instead.
	 */
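
	/*
	 * A worked example of the reservation scheme (hypothetical
	 * numbers): with NCPU_P2 == 8, NLCK is 8 and RAND_P2(NLCK) is in
	 * [0, 7], so each refill reserves between 8 and 15 locks, further
	 * clamped to the iterations remaining. Randomizing the batch size
	 * keeps competing threads from refilling in lock-step on
	 * freemem_lock.
	 */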
	for (i = 0; i < npages; anon_index++, pos++, i++) {
		if (nlck == 0 && use_reserved == 1) {
			nlck = NLCK + RAND_P2(NLCK);
			/* if fewer loops left, decrease nlck */
			nlck = MIN(nlck, npages - i);
			/*
			 * Reserve nlck locks up front and deduct from this
			 * reservation for each page that requires a lock.
			 * When the reservation is consumed, reserve again.
			 */
			mutex_enter(&freemem_lock);
			if ((availrmem - nlck) < pages_pp_maximum) {
				/* Do not do advance memory reserves */
				use_reserved = 0;
			} else {
				availrmem -= nlck;
				pages_locked += nlck;
			}
			mutex_exit(&freemem_lock);
		}
		if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
			if (sptd->spt_ppa_lckcnt[anon_index] <
			    (ushort_t)DISM_LOCK_MAX) {
				if (++sptd->spt_ppa_lckcnt[anon_index] ==
				    (ushort_t)DISM_LOCK_MAX) {
					cmn_err(CE_WARN,
					    "DISM page lock limit "
					    "reached on DISM offset 0x%lx\n",
					    anon_index << PAGESHIFT);
				}
				kernel = (sptd->spt_ppa &&
				    sptd->spt_ppa[anon_index]);
				if (!page_pp_lock(ppa[i], 0, kernel ||
				    use_reserved)) {
					sptd->spt_ppa_lckcnt[anon_index]--;
					rv = EAGAIN;
					break;
				}
				/* if this is a newly locked page, count it */
				if (ppa[i]->p_lckcnt == 1) {
					if (kernel == 0 && use_reserved == 1)
						nlck--;
					*locked += PAGESIZE;
				}
				shmd->shm_lckpgs++;
				shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
				if (lockmap != NULL)
					BT_SET(lockmap, pos);
			}
		}
	}
	/* Return unused lock reservation */
	if (nlck != 0 && use_reserved == 1) {
		mutex_enter(&freemem_lock);
		availrmem += nlck;
		pages_locked -= nlck;
		mutex_exit(&freemem_lock);
	}

	return (rv);
}

int
spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
    rctl_qty_t *unlocked)
{
	struct shm_data *shmd = seg->s_data;
	struct spt_data *sptd = shmd->shm_sptseg->s_data;
	struct anon_map *amp = sptd->spt_amp;
	struct anon *ap;
	struct vnode *vp;
	u_offset_t off;
	struct page *pp;
	int kernel;
	anon_sync_obj_t cookie;
	ulong_t i;
	pgcnt_t nlck = 0;
	pgcnt_t nlck_limit = NLCK;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (i = 0; i < npages; i++, anon_index++) {
		if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			ASSERT(ap);

			swap_xlate(ap, &vp, &off);
			anon_array_exit(&cookie);
			pp = page_lookup(vp, off, SE_SHARED);
			ASSERT(pp);
			/*
			 * availrmem is decremented only for pages which are
			 * not in the seg pcache; for pages in the seg pcache
			 * availrmem was decremented in _dismpagelock().
			 */
			kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
			ASSERT(pp->p_lckcnt > 0);

			/*
			 * Unlock the page, but do not change availrmem
			 * here; we batch those updates every nlck loops.
			 */
			page_pp_unlock(pp, 0, 1);
			if (pp->p_lckcnt == 0) {
				if (kernel == 0)
					nlck++;
				*unlocked += PAGESIZE;
			}
			page_unlock(pp);
			shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
			sptd->spt_ppa_lckcnt[anon_index]--;
			shmd->shm_lckpgs--;
		}

		/*
		 * To reduce freemem_lock contention, do not update availrmem
		 * until at least NLCK pages have been unlocked:
		 * 1. there is no need to update if nlck is zero;
		 * 2. always update on the last iteration.
		 */
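
		/*
		 * E.g. (hypothetical): with NLCK == 8, nlck accumulates
		 * across iterations and is flushed to availrmem and
		 * pages_locked once it reaches nlck_limit (8 to 15,
		 * re-randomized after each flush) or when the final page
		 * has been processed, so freemem_lock is taken roughly
		 * once per batch instead of once per page.
		 */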
		if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
			mutex_enter(&freemem_lock);
			availrmem += nlck;
			pages_locked -= nlck;
			mutex_exit(&freemem_lock);
			nlck = 0;
			nlck_limit = NLCK + RAND_P2(NLCK);
		}
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);

	return (0);
}

/*ARGSUSED*/
static int
segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos)
{
	struct shm_data *shmd = seg->s_data;
	struct seg *sptseg = shmd->shm_sptseg;
	struct spt_data *sptd = sptseg->s_data;
	struct kshmid *sp = sptd->spt_amp->a_sp;
	pgcnt_t npages, a_npages;
	page_t **ppa;
	pgcnt_t an_idx, a_an_idx, ppa_idx;
	caddr_t spt_addr, a_addr;	/* spt and aligned address */
	size_t a_len;			/* aligned len */
	size_t share_sz;
	ulong_t i;
	int sts = 0;
	rctl_qty_t unlocked = 0;
	rctl_qty_t locked = 0;
	struct proc *p = curproc;
	kproject_t *proj;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(sp != NULL);

	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
		return (0);
	}

	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	an_idx = seg_page(seg, addr);
	npages = btopr(len);

	if (an_idx + npages > btopr(shmd->shm_amp->size)) {
		return (ENOMEM);
	}

	/*
	 * A shm's project never changes, so no lock needed.
	 * The shm has a hold on the project, so it will not go away.
	 * Since we have a mapping to shm within this zone, we know
	 * that the zone will not go away.
	 */
	proj = sp->shm_perm.ipc_proj;

	if (op == MC_LOCK) {

		/*
		 * Need to align addr and the size request if they are not
		 * aligned, so we can always allocate large page(s); however,
		 * we only lock what was requested in the initial request.
		 */
		share_sz = page_get_pagesize(sptseg->s_szc);
		a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
		a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
		    share_sz);
		a_npages = btop(a_len);
		a_an_idx = seg_page(seg, a_addr);
		spt_addr = sptseg->s_base + ptob(a_an_idx);
		ppa_idx = an_idx - a_an_idx;

		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
		    KM_NOSLEEP)) == NULL) {
			return (ENOMEM);
		}
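
		/*
		 * A worked example of the alignment above (hypothetical
		 * numbers): with 4MB large pages on an 8K-page machine, an
		 * MC_LOCK request for a single page at offset 0x402000 is
		 * widened to the enclosing large page: a_addr at offset
		 * 0x400000, a_len == 0x400000, a_npages == 512, and
		 * ppa_idx == an_idx - a_an_idx == 1. spt_anon_getpages()
		 * can then operate on whole large pages, while only the
		 * npages actually requested are locked and charged to the
		 * project's locked-memory rctl below.
		 */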

		/*
		 * Don't cache any new pages for IO and
		 * flush any cached pages.
		 */
		mutex_enter(&sptd->spt_lock);
		if (sptd->spt_ppa != NULL)
			sptd->spt_flags |= DISM_PPA_CHANGED;

		sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
		if (sts != 0) {
			mutex_exit(&sptd->spt_lock);
			kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
			return (sts);
		}

		mutex_enter(&sp->shm_mlock);
		/* enforce locked memory rctl */
		unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);

		mutex_enter(&p->p_lock);
		if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
			mutex_exit(&p->p_lock);
			sts = EAGAIN;
		} else {
			mutex_exit(&p->p_lock);
			sts = spt_lockpages(seg, an_idx, npages,
			    &ppa[ppa_idx], lockmap, pos, &locked);

			/*
			 * correct locked count if not all pages could be
			 * locked
			 */
			if ((unlocked - locked) > 0) {
				rctl_decr_locked_mem(NULL, proj,
				    (unlocked - locked), 0);
			}
		}
		/*
		 * unlock pages
		 */
		for (i = 0; i < a_npages; i++)
			page_unlock(ppa[i]);
		if (sptd->spt_ppa != NULL)
			sptd->spt_flags |= DISM_PPA_CHANGED;
		mutex_exit(&sp->shm_mlock);
		mutex_exit(&sptd->spt_lock);

		kmem_free(ppa, ((sizeof (page_t *)) * a_npages));

	} else if (op == MC_UNLOCK) {	/* unlock */
		page_t **ppa;

		mutex_enter(&sptd->spt_lock);
		if (shmd->shm_lckpgs == 0) {
			mutex_exit(&sptd->spt_lock);
			return (0);
		}
		/*
		 * Don't cache new IO pages.
		 */
		if (sptd->spt_ppa != NULL)
			sptd->spt_flags |= DISM_PPA_CHANGED;

		mutex_enter(&sp->shm_mlock);
		sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
		if ((ppa = sptd->spt_ppa) != NULL)
			sptd->spt_flags |= DISM_PPA_CHANGED;
		mutex_exit(&sptd->spt_lock);

		rctl_decr_locked_mem(NULL, proj, unlocked, 0);
		mutex_exit(&sp->shm_mlock);

		if (ppa != NULL)
			seg_ppurge_wiredpp(ppa);
	}
	return (sts);
}
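
/*
 * A note on DISM_PPA_CHANGED (descriptive, based on the uses in this
 * file): the flag marks the cached page array sptd->spt_ppa as stale,
 * and seg_ppurge_wiredpp() asks the seg_pcache to drop its wired
 * references to that array. The MADV_FREE path in segspt_shmadvise()
 * below waits on spt_cv for such a purge to take effect.
 */
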
/*ARGSUSED*/
int
segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
	spgcnt_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * ISM segment is always rw.
	 */
	while (--pgno >= 0)
		*protv++ = sptd->spt_prot;
	return (0);
}

/*ARGSUSED*/
u_offset_t
segspt_shmgetoffset(struct seg *seg, caddr_t addr)
{
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/* Offset does not matter in ISM memory */

	return ((u_offset_t)0);
}

/* ARGSUSED */
int
segspt_shmgettype(struct seg *seg, caddr_t addr)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * The shared memory mapping is always MAP_SHARED; swap is only
	 * reserved for DISM.
	 */
	return (MAP_SHARED |
	    ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
}

/*ARGSUSED*/
int
segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	*vpp = sptd->spt_vp;
	return (0);
}

/*
 * We need to wait for pending IO to complete to a DISM segment in order for
 * pages to get kicked out of the seg_pcache. 120 seconds should be more
 * than enough time to wait.
 */
static clock_t spt_pcache_wait = 120;

/*ARGSUSED*/
static int
segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
	struct anon_map *amp;
	pgcnt_t pg_idx;
	ushort_t gen;
	clock_t end_lbolt;
	int writer;
	page_t **ppa;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	if (behav == MADV_FREE) {
		if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
			return (0);

		amp = sptd->spt_amp;
		pg_idx = seg_page(seg, addr);

		mutex_enter(&sptd->spt_lock);
		if ((ppa = sptd->spt_ppa) == NULL) {
			mutex_exit(&sptd->spt_lock);
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			anon_disclaim(amp, pg_idx, len);
			ANON_LOCK_EXIT(&amp->a_rwlock);
			return (0);
		}

		sptd->spt_flags |= DISM_PPA_CHANGED;
		gen = sptd->spt_gen;

		mutex_exit(&sptd->spt_lock);

		/*
		 * Purge all DISM cached pages
		 */
		seg_ppurge_wiredpp(ppa);

		/*
		 * Drop the AS_LOCK so that other threads can grab it
		 * in the as_pageunlock path and hopefully get the segment
		 * kicked out of the seg_pcache. We bump the shm_softlockcnt
		 * to keep this segment resident.
		 */
		writer = AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock);
		atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
		AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock);

		mutex_enter(&sptd->spt_lock);

		end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);

		/*
		 * Try to wait for pages to get kicked out of the seg_pcache.
		 */
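
		/*
		 * Summarizing the loop conditions below: the wait ends as
		 * soon as another thread installs a new page array
		 * generation (spt_gen changes), the DISM_PPA_CHANGED flag
		 * is cleared, the spt_pcache_wait timeout expires, or the
		 * wait is interrupted by a signal.
		 */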
		while (sptd->spt_gen == gen &&
		    (sptd->spt_flags & DISM_PPA_CHANGED) &&
		    ddi_get_lbolt() < end_lbolt) {
			if (!cv_timedwait_sig(&sptd->spt_cv,
			    &sptd->spt_lock, end_lbolt)) {
				break;
			}
		}

		mutex_exit(&sptd->spt_lock);

		/* Regrab the AS_LOCK and release our hold on the segment */
		AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock,
		    writer ? RW_WRITER : RW_READER);
		atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
		if (shmd->shm_softlockcnt <= 0) {
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				mutex_enter(&seg->s_as->a_contents);
				if (AS_ISUNMAPWAIT(seg->s_as)) {
					AS_CLRUNMAPWAIT(seg->s_as);
					cv_broadcast(&seg->s_as->a_cv);
				}
				mutex_exit(&seg->s_as->a_contents);
			}
		}

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		anon_disclaim(amp, pg_idx, len);
		ANON_LOCK_EXIT(&amp->a_rwlock);
	} else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
	    behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
		int already_set;
		ulong_t anon_index;
		lgrp_mem_policy_t policy;
		caddr_t shm_addr;
		size_t share_size;
		size_t size;
		struct seg *sptseg = shmd->shm_sptseg;
		caddr_t sptseg_addr;

		/*
		 * Align address and length to page size of underlying segment
		 */
		share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
		shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
		size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
		    share_size);

		amp = shmd->shm_amp;
		anon_index = seg_page(seg, shm_addr);

		/*
		 * And now we may have to adjust size downward if we have
		 * exceeded the realsize of the segment or initial anon
		 * allocations.
		 */
		sptseg_addr = sptseg->s_base + ptob(anon_index);
		if ((sptseg_addr + size) >
		    (sptseg->s_base + sptd->spt_realsize))
			size = (sptseg->s_base + sptd->spt_realsize) -
			    sptseg_addr;

		/*
		 * Set memory allocation policy for this segment
		 */
		policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
		already_set = lgrp_shm_policy_set(policy, amp, anon_index,
		    NULL, 0, len);

		/*
		 * If random memory allocation policy set already,
		 * don't bother reapplying it.
		 */
		if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
			return (0);

		/*
		 * Mark any existing pages in the given range for migration,
		 * flushing the I/O page cache, and use the underlying
		 * segment to calculate the anon index and to get the
		 * anonmap and vnode pointers.
		 */
		if (shmd->shm_softlockcnt > 0)
			segspt_purge(seg);

		page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
	}

	return (0);
}
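
/*
 * Usage sketch (hedged; the exact entry path is outside this file): an
 * application calling madvise(3C) with MADV_ACCESS_LWP, MADV_ACCESS_MANY
 * or MADV_ACCESS_DEFAULT on a mapped shared-memory range ends up in
 * segspt_shmadvise() above, which translates the advice into an lgroup
 * memory-allocation policy and marks existing pages for migration.
 */
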
/*ARGSUSED*/
void
segspt_shmdump(struct seg *seg)
{
	/* no-op for ISM segment */
}

/*
 * get a memory ID for an addr in a given segment
 */
static int
segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
	struct shm_data *shmd = (struct shm_data *)seg->s_data;
	struct anon *ap;
	size_t anon_index;
	struct anon_map *amp = shmd->shm_amp;
	struct spt_data *sptd = shmd->shm_sptseg->s_data;
	struct seg *sptseg = shmd->shm_sptseg;
	anon_sync_obj_t cookie;

	anon_index = seg_page(seg, addr);

	if (addr > (seg->s_base + sptd->spt_realsize)) {
		return (EFAULT);
	}

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	anon_array_enter(amp, anon_index, &cookie);
	ap = anon_get_ptr(amp->ahp, anon_index);
	if (ap == NULL) {
		struct page *pp;
		caddr_t spt_addr = sptseg->s_base + ptob(anon_index);

		pp = anon_zero(sptseg, spt_addr, &ap, kcred);
		if (pp == NULL) {
			anon_array_exit(&cookie);
			ANON_LOCK_EXIT(&amp->a_rwlock);
			return (ENOMEM);
		}
		(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
		page_unlock(pp);
	}
	anon_array_exit(&cookie);
	ANON_LOCK_EXIT(&amp->a_rwlock);
	memidp->val[0] = (uintptr_t)ap;
	memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
	return (0);
}

/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map *amp;
	ulong_t anon_index;
	lgrp_mem_policy_info_t *policy_info;
	struct shm_data *shm_data;

	ASSERT(seg != NULL);

	/*
	 * Get anon_map from segshm
	 *
	 * Assume that no lock needs to be held on anon_map, since
	 * it should be protected by its reference count, which must be
	 * nonzero for an existing segment.
	 * Need to grab readers lock on policy tree though.
	 */
	shm_data = (struct shm_data *)seg->s_data;
	if (shm_data == NULL)
		return (NULL);
	amp = shm_data->shm_amp;
	ASSERT(amp->refcnt != 0);

	/*
	 * Get policy info
	 *
	 * Assume starting anon index of 0
	 */
	anon_index = seg_page(seg, addr);
	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);

	return (policy_info);
}