no need for bad-op segment op functions
The segment drivers have a number of bad-op functions that simply panic.
Leaving the function pointer NULL accomplishes the same thing in most
cases; in the remaining cases, a NULL function pointer results in the
proper error code being returned.
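A minimal userland sketch of the dispatch pattern this relies on (not the
kernel code itself): the call site tests the op pointer for NULL and returns
an error code instead of calling through a panicking stub. The names below
(my_ops_t, demo_unmap, call_dup) are hypothetical, chosen only to illustrate.

    #include <stdio.h>
    #include <errno.h>

    typedef struct my_ops {
            int (*dup)(void *arg);          /* optional op, may be left NULL */
            int (*unmap)(void *arg);        /* implemented op */
    } my_ops_t;

    static int
    demo_unmap(void *arg)
    {
            (void) arg;
            return (0);
    }

    /* The .dup slot is left NULL rather than pointing at a panic stub. */
    static my_ops_t demo_ops = {
            .unmap = demo_unmap,
    };

    static int
    call_dup(my_ops_t *ops, void *arg)
    {
            /* A NULL op yields a proper error code; no bad-op stub needed. */
            if (ops->dup == NULL)
                    return (ENOTSUP);
            return (ops->dup(arg));
    }

    int
    main(void)
    {
            printf("dup -> %d\n", call_dup(&demo_ops, NULL));
            return (0);
    }

Call sites that don't check for NULL fault on the call, much as the explicit
panic did, so the stubs buy nothing there either.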
--- old/usr/src/uts/common/vm/seg_spt.c
+++ new/usr/src/uts/common/vm/seg_spt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 #include <sys/param.h>
26 26 #include <sys/user.h>
27 27 #include <sys/mman.h>
28 28 #include <sys/kmem.h>
29 29 #include <sys/sysmacros.h>
30 30 #include <sys/cmn_err.h>
31 31 #include <sys/systm.h>
32 32 #include <sys/tuneable.h>
33 33 #include <vm/hat.h>
34 34 #include <vm/seg.h>
35 35 #include <vm/as.h>
36 36 #include <vm/anon.h>
37 37 #include <vm/page.h>
38 38 #include <sys/buf.h>
39 39 #include <sys/swap.h>
40 40 #include <sys/atomic.h>
41 41 #include <vm/seg_spt.h>
42 42 #include <sys/debug.h>
43 43 #include <sys/vtrace.h>
44 44 #include <sys/shm.h>
45 45 #include <sys/shm_impl.h>
46 46 #include <sys/lgrp.h>
47 47 #include <sys/vmsystm.h>
48 48 #include <sys/policy.h>
49 49 #include <sys/project.h>
50 50 #include <sys/tnf_probe.h>
51 51 #include <sys/zone.h>
52 52
53 53 #define SEGSPTADDR (caddr_t)0x0
54 54
55 55 /*
56 56 * # pages used for spt
57 57 */
58 58 size_t spt_used;
59 59
60 60 /*
61 61 * segspt_minfree is the memory left for system after ISM
62 62 * locked its pages; it is set to 5% of availrmem in
63 63 * sptcreate when ISM is created. ISM should not use more
64 64 * than ~90% of availrmem; if it does, then the performance
65 65 * of the system may decrease. Machines with large memories may
66 66 * be able to use up more memory for ISM so we set the default
67 67 * segspt_minfree to 5% (which gives ISM max 95% of availrmem).
68 68 * If somebody wants even more memory for ISM (risking hanging
69 69 * the system) they can patch segspt_minfree to a smaller number.
70 70 */
71 71 pgcnt_t segspt_minfree = 0;
72 72
73 73 static int segspt_create(struct seg *seg, caddr_t argsp);
74 74 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
75 75 static void segspt_free(struct seg *seg);
76 76 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
77 77 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
78 78
79 -static void
80 -segspt_badop()
81 -{
82 - panic("segspt_badop called");
83 - /*NOTREACHED*/
84 -}
85 -
86 -#define SEGSPT_BADOP(t) (t(*)())segspt_badop
87 -
88 79 struct seg_ops segspt_ops = {
89 - .dup = SEGSPT_BADOP(int),
90 80 .unmap = segspt_unmap,
91 81 .free = segspt_free,
92 - .fault = SEGSPT_BADOP(int),
93 - .faulta = SEGSPT_BADOP(faultcode_t),
94 - .setprot = SEGSPT_BADOP(int),
95 - .checkprot = SEGSPT_BADOP(int),
96 - .kluster = SEGSPT_BADOP(int),
97 - .sync = SEGSPT_BADOP(int),
98 - .incore = SEGSPT_BADOP(size_t),
99 - .lockop = SEGSPT_BADOP(int),
100 - .getprot = SEGSPT_BADOP(int),
101 - .getoffset = SEGSPT_BADOP(u_offset_t),
102 - .gettype = SEGSPT_BADOP(int),
103 - .getvp = SEGSPT_BADOP(int),
104 - .advise = SEGSPT_BADOP(int),
105 - .dump = SEGSPT_BADOP(void),
106 - .pagelock = SEGSPT_BADOP(int),
107 - .setpagesize = SEGSPT_BADOP(int),
108 - .getmemid = SEGSPT_BADOP(int),
109 82 .getpolicy = segspt_getpolicy,
110 - .capable = SEGSPT_BADOP(int),
111 83 .inherit = seg_inherit_notsup,
112 84 };
113 85
114 86 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
115 87 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
116 88 static void segspt_shmfree(struct seg *seg);
117 89 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
118 90 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
119 91 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
120 92 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
121 93 register size_t len, register uint_t prot);
122 94 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
123 95 uint_t prot);
124 96 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
125 97 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
126 98 register char *vec);
127 99 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
128 100 int attr, uint_t flags);
129 101 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
130 102 int attr, int op, ulong_t *lockmap, size_t pos);
131 103 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
132 104 uint_t *protv);
133 105 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
134 106 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
135 107 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
136 108 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
137 109 uint_t behav);
138 110 static void segspt_shmdump(struct seg *seg);
139 111 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
140 112 struct page ***, enum lock_type, enum seg_rw);
141 113 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
142 114 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
143 115 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
144 116 static int segspt_shmcapable(struct seg *, segcapability_t);
145 117
146 118 struct seg_ops segspt_shmops = {
147 119 .dup = segspt_shmdup,
148 120 .unmap = segspt_shmunmap,
149 121 .free = segspt_shmfree,
150 122 .fault = segspt_shmfault,
151 123 .faulta = segspt_shmfaulta,
152 124 .setprot = segspt_shmsetprot,
153 125 .checkprot = segspt_shmcheckprot,
154 126 .kluster = segspt_shmkluster,
155 127 .sync = segspt_shmsync,
156 128 .incore = segspt_shmincore,
157 129 .lockop = segspt_shmlockop,
158 130 .getprot = segspt_shmgetprot,
159 131 .getoffset = segspt_shmgetoffset,
160 132 .gettype = segspt_shmgettype,
161 133 .getvp = segspt_shmgetvp,
162 134 .advise = segspt_shmadvise,
163 135 .dump = segspt_shmdump,
164 136 .pagelock = segspt_shmpagelock,
165 137 .setpagesize = segspt_shmsetpgsz,
166 138 .getmemid = segspt_shmgetmemid,
167 139 .getpolicy = segspt_shmgetpolicy,
168 140 .capable = segspt_shmcapable,
169 141 .inherit = seg_inherit_notsup,
170 142 };
171 143
172 144 static void segspt_purge(struct seg *seg);
173 145 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
174 146 enum seg_rw, int);
175 147 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
176 148 page_t **ppa);
177 149
178 150
179 151
180 152 /*ARGSUSED*/
181 153 int
182 154 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
183 155 uint_t prot, uint_t flags, uint_t share_szc)
184 156 {
185 157 int err;
186 158 struct as *newas;
187 159 struct segspt_crargs sptcargs;
188 160
189 161 #ifdef DEBUG
190 162 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
191 163 tnf_ulong, size, size );
192 164 #endif
193 165 if (segspt_minfree == 0) /* leave min 5% of availrmem */
194 166 segspt_minfree = availrmem/20; /* for the system */
195 167
196 168 if (!hat_supported(HAT_SHARED_PT, (void *)0))
197 169 return (EINVAL);
198 170
199 171 /*
200 172 * get a new as for this shared memory segment
201 173 */
202 174 newas = as_alloc();
203 175 newas->a_proc = NULL;
204 176 sptcargs.amp = amp;
205 177 sptcargs.prot = prot;
206 178 sptcargs.flags = flags;
207 179 sptcargs.szc = share_szc;
208 180 /*
209 181 * create a shared page table (spt) segment
210 182 */
211 183
212 184 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
213 185 as_free(newas);
214 186 return (err);
215 187 }
216 188 *sptseg = sptcargs.seg_spt;
217 189 return (0);
218 190 }
219 191
220 192 void
221 193 sptdestroy(struct as *as, struct anon_map *amp)
222 194 {
223 195
224 196 #ifdef DEBUG
225 197 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
226 198 #endif
227 199 (void) as_unmap(as, SEGSPTADDR, amp->size);
228 200 as_free(as);
229 201 }
230 202
231 203 /*
232 204 * called from seg_free().
233 205 * free (i.e., unlock, unmap, return to free list)
234 206 * all the pages in the given seg.
235 207 */
236 208 void
237 209 segspt_free(struct seg *seg)
238 210 {
239 211 struct spt_data *sptd = (struct spt_data *)seg->s_data;
240 212
241 213 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
242 214
243 215 if (sptd != NULL) {
244 216 if (sptd->spt_realsize)
245 217 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
246 218
247 219 if (sptd->spt_ppa_lckcnt)
248 220 kmem_free(sptd->spt_ppa_lckcnt,
249 221 sizeof (*sptd->spt_ppa_lckcnt)
250 222 * btopr(sptd->spt_amp->size));
251 223 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
252 224 cv_destroy(&sptd->spt_cv);
253 225 mutex_destroy(&sptd->spt_lock);
254 226 kmem_free(sptd, sizeof (*sptd));
255 227 }
256 228 }
257 229
258 230 /*ARGSUSED*/
259 231 static int
260 232 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
261 233 uint_t flags)
262 234 {
263 235 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
264 236
265 237 return (0);
266 238 }
267 239
268 240 /*ARGSUSED*/
269 241 static size_t
270 242 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
271 243 {
272 244 caddr_t eo_seg;
273 245 pgcnt_t npages;
274 246 struct shm_data *shmd = (struct shm_data *)seg->s_data;
275 247 struct seg *sptseg;
276 248 struct spt_data *sptd;
277 249
278 250 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
279 251 #ifdef lint
280 252 seg = seg;
281 253 #endif
282 254 sptseg = shmd->shm_sptseg;
283 255 sptd = sptseg->s_data;
284 256
285 257 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
286 258 eo_seg = addr + len;
287 259 while (addr < eo_seg) {
288 260 /* page exists, and it's locked. */
289 261 *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
290 262 SEG_PAGE_ANON;
291 263 addr += PAGESIZE;
292 264 }
293 265 return (len);
294 266 } else {
295 267 struct anon_map *amp = shmd->shm_amp;
296 268 struct anon *ap;
297 269 page_t *pp;
298 270 pgcnt_t anon_index;
299 271 struct vnode *vp;
300 272 u_offset_t off;
301 273 ulong_t i;
302 274 int ret;
303 275 anon_sync_obj_t cookie;
304 276
305 277 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
306 278 anon_index = seg_page(seg, addr);
307 279 npages = btopr(len);
308 280 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
309 281 return (EINVAL);
310 282 }
311 283 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
312 284 for (i = 0; i < npages; i++, anon_index++) {
313 285 ret = 0;
314 286 anon_array_enter(amp, anon_index, &cookie);
315 287 ap = anon_get_ptr(amp->ahp, anon_index);
316 288 if (ap != NULL) {
317 289 swap_xlate(ap, &vp, &off);
318 290 anon_array_exit(&cookie);
319 291 pp = page_lookup_nowait(vp, off, SE_SHARED);
320 292 if (pp != NULL) {
321 293 ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
322 294 page_unlock(pp);
323 295 }
324 296 } else {
325 297 anon_array_exit(&cookie);
326 298 }
327 299 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
328 300 ret |= SEG_PAGE_LOCKED;
329 301 }
330 302 *vec++ = (char)ret;
331 303 }
332 304 ANON_LOCK_EXIT(&amp->a_rwlock);
333 305 return (len);
334 306 }
335 307 }
336 308
337 309 static int
338 310 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
339 311 {
340 312 size_t share_size;
341 313
342 314 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
343 315
344 316 /*
345 317 * seg.s_size may have been rounded up to the largest page size
346 318 * in shmat().
347 319 * XXX This should be cleaned up. sptdestroy should take a length
348 320 * argument which should be the same as sptcreate. Then
349 321 * this rounding would not be needed (or could be done in shm.c).
350 322 * Only the check for the full segment would be needed.
351 323 *
352 324 * XXX -- shouldn't raddr == 0 always? These tests don't seem
353 325 * to be useful at all.
354 326 */
355 327 share_size = page_get_pagesize(seg->s_szc);
356 328 ssize = P2ROUNDUP(ssize, share_size);
357 329
358 330 if (raddr == seg->s_base && ssize == seg->s_size) {
359 331 seg_free(seg);
360 332 return (0);
361 333 } else
362 334 return (EINVAL);
363 335 }
364 336
365 337 int
366 338 segspt_create(struct seg *seg, caddr_t argsp)
367 339 {
368 340 int err;
369 341 caddr_t addr = seg->s_base;
370 342 struct spt_data *sptd;
371 343 struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
372 344 struct anon_map *amp = sptcargs->amp;
373 345 struct kshmid *sp = amp->a_sp;
374 346 struct cred *cred = CRED();
375 347 ulong_t i, j, anon_index = 0;
376 348 pgcnt_t npages = btopr(amp->size);
377 349 struct vnode *vp;
378 350 page_t **ppa;
379 351 uint_t hat_flags;
380 352 size_t pgsz;
381 353 pgcnt_t pgcnt;
382 354 caddr_t a;
383 355 pgcnt_t pidx;
384 356 size_t sz;
385 357 proc_t *procp = curproc;
386 358 rctl_qty_t lockedbytes = 0;
387 359 kproject_t *proj;
388 360
389 361 /*
390 362 * We are holding the a_lock on the underlying dummy as,
391 363 * so we can make calls to the HAT layer.
392 364 */
393 365 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
394 366 ASSERT(sp != NULL);
395 367
396 368 #ifdef DEBUG
397 369 TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
398 370 tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
399 371 #endif
400 372 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
401 373 if (err = anon_swap_adjust(npages))
402 374 return (err);
403 375 }
404 376 err = ENOMEM;
405 377
406 378 if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
407 379 goto out1;
408 380
409 381 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
410 382 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
411 383 KM_NOSLEEP)) == NULL)
412 384 goto out2;
413 385 }
414 386
415 387 mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
416 388
417 389 if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
418 390 goto out3;
419 391
420 392 seg->s_ops = &segspt_ops;
421 393 sptd->spt_vp = vp;
422 394 sptd->spt_amp = amp;
423 395 sptd->spt_prot = sptcargs->prot;
424 396 sptd->spt_flags = sptcargs->flags;
425 397 seg->s_data = (caddr_t)sptd;
426 398 sptd->spt_ppa = NULL;
427 399 sptd->spt_ppa_lckcnt = NULL;
428 400 seg->s_szc = sptcargs->szc;
429 401 cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
430 402 sptd->spt_gen = 0;
431 403
432 404 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
433 405 if (seg->s_szc > amp->a_szc) {
434 406 amp->a_szc = seg->s_szc;
435 407 }
436 408 ANON_LOCK_EXIT(&amp->a_rwlock);
437 409
438 410 /*
439 411 * Set policy to affect initial allocation of pages in
440 412 * anon_map_createpages()
441 413 */
442 414 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
443 415 NULL, 0, ptob(npages));
444 416
445 417 if (sptcargs->flags & SHM_PAGEABLE) {
446 418 size_t share_sz;
447 419 pgcnt_t new_npgs, more_pgs;
448 420 struct anon_hdr *nahp;
449 421 zone_t *zone;
450 422
451 423 share_sz = page_get_pagesize(seg->s_szc);
452 424 if (!IS_P2ALIGNED(amp->size, share_sz)) {
453 425 /*
454 426 * We round up the size of the anon array
455 427 * to a 4 M boundary because we always create 4 M
456 428 * of page(s) when locking and faulting pages, so we
457 429 * don't have to check for all corner cases, e.g.
458 430 * whether there is enough space to allocate a 4 M
459 431 * page.
460 432 */
461 433 new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
462 434 more_pgs = new_npgs - npages;
463 435
464 436 /*
465 437 * The zone will never be NULL, as a fully created
466 438 * shm always has an owning zone.
467 439 */
468 440 zone = sp->shm_perm.ipc_zone_ref.zref_zone;
469 441 ASSERT(zone != NULL);
470 442 if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
471 443 err = ENOMEM;
472 444 goto out4;
473 445 }
474 446
475 447 nahp = anon_create(new_npgs, ANON_SLEEP);
476 448 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
477 449 (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
478 450 ANON_SLEEP);
479 451 anon_release(amp->ahp, npages);
480 452 amp->ahp = nahp;
481 453 ASSERT(amp->swresv == ptob(npages));
482 454 amp->swresv = amp->size = ptob(new_npgs);
483 455 ANON_LOCK_EXIT(&amp->a_rwlock);
484 456 npages = new_npgs;
485 457 }
486 458
487 459 sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
488 460 sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
489 461 sptd->spt_pcachecnt = 0;
490 462 sptd->spt_realsize = ptob(npages);
491 463 sptcargs->seg_spt = seg;
492 464 return (0);
493 465 }
494 466
495 467 /*
496 468 * get array of pages for each anon slot in amp
497 469 */
498 470 if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
499 471 seg, addr, S_CREATE, cred)) != 0)
500 472 goto out4;
501 473
502 474 mutex_enter(&sp->shm_mlock);
503 475
504 476 /* May be partially locked, so count bytes to charge for locking */
505 477 for (i = 0; i < npages; i++)
506 478 if (ppa[i]->p_lckcnt == 0)
507 479 lockedbytes += PAGESIZE;
508 480
509 481 proj = sp->shm_perm.ipc_proj;
510 482
511 483 if (lockedbytes > 0) {
512 484 mutex_enter(&procp->p_lock);
513 485 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
514 486 mutex_exit(&procp->p_lock);
515 487 mutex_exit(&sp->shm_mlock);
516 488 for (i = 0; i < npages; i++)
517 489 page_unlock(ppa[i]);
518 490 err = ENOMEM;
519 491 goto out4;
520 492 }
521 493 mutex_exit(&procp->p_lock);
522 494 }
523 495
524 496 /*
525 497 * addr is the initial address corresponding to the first page on the ppa list
526 498 */
527 499 for (i = 0; i < npages; i++) {
528 500 /* attempt to lock all pages */
529 501 if (page_pp_lock(ppa[i], 0, 1) == 0) {
530 502 /*
531 503 * if unable to lock any page, unlock all
532 504 * of them and return error
533 505 */
534 506 for (j = 0; j < i; j++)
535 507 page_pp_unlock(ppa[j], 0, 1);
536 508 for (i = 0; i < npages; i++)
537 509 page_unlock(ppa[i]);
538 510 rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
539 511 mutex_exit(&sp->shm_mlock);
540 512 err = ENOMEM;
541 513 goto out4;
542 514 }
543 515 }
544 516 mutex_exit(&sp->shm_mlock);
545 517
546 518 /*
547 519 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
548 520 * for the entire life of the segment. For example, platforms
549 521 * that do not support Dynamic Reconfiguration.
550 522 */
551 523 hat_flags = HAT_LOAD_SHARE;
552 524 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
553 525 hat_flags |= HAT_LOAD_LOCK;
554 526
555 527 /*
556 528 * Load translations one large page at a time
557 529 * to make sure we don't create mappings bigger than
558 530 * segment's size code in case underlying pages
559 531 * are shared with segvn's segment that uses bigger
560 532 * size code than we do.
561 533 */
562 534 pgsz = page_get_pagesize(seg->s_szc);
563 535 pgcnt = page_get_pagecnt(seg->s_szc);
564 536 for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
565 537 sz = MIN(pgsz, ptob(npages - pidx));
566 538 hat_memload_array(seg->s_as->a_hat, a, sz,
567 539 &ppa[pidx], sptd->spt_prot, hat_flags);
568 540 }
569 541
570 542 /*
571 543 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
572 544 * we will leave the pages locked SE_SHARED for the life
573 545 * of the ISM segment. This will prevent any calls to
574 546 * hat_pageunload() on this ISM segment for those platforms.
575 547 */
576 548 if (!(hat_flags & HAT_LOAD_LOCK)) {
577 549 /*
578 550 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
579 551 * we no longer need to hold the SE_SHARED lock on the pages,
580 552 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
581 553 * SE_SHARED lock on the pages as necessary.
582 554 */
583 555 for (i = 0; i < npages; i++)
584 556 page_unlock(ppa[i]);
585 557 }
586 558 sptd->spt_pcachecnt = 0;
587 559 kmem_free(ppa, ((sizeof (page_t *)) * npages));
588 560 sptd->spt_realsize = ptob(npages);
589 561 atomic_add_long(&spt_used, npages);
590 562 sptcargs->seg_spt = seg;
591 563 return (0);
592 564
593 565 out4:
594 566 seg->s_data = NULL;
595 567 kmem_free(vp, sizeof (*vp));
596 568 cv_destroy(&sptd->spt_cv);
597 569 out3:
598 570 mutex_destroy(&sptd->spt_lock);
599 571 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
600 572 kmem_free(ppa, (sizeof (*ppa) * npages));
601 573 out2:
602 574 kmem_free(sptd, sizeof (*sptd));
603 575 out1:
604 576 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
605 577 anon_swap_restore(npages);
606 578 return (err);
607 579 }
608 580
609 581 /*ARGSUSED*/
610 582 void
611 583 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
612 584 {
613 585 struct page *pp;
614 586 struct spt_data *sptd = (struct spt_data *)seg->s_data;
615 587 pgcnt_t npages;
616 588 ulong_t anon_idx;
617 589 struct anon_map *amp;
618 590 struct anon *ap;
619 591 struct vnode *vp;
620 592 u_offset_t off;
621 593 uint_t hat_flags;
622 594 int root = 0;
623 595 pgcnt_t pgs, curnpgs = 0;
624 596 page_t *rootpp;
625 597 rctl_qty_t unlocked_bytes = 0;
626 598 kproject_t *proj;
627 599 kshmid_t *sp;
628 600
629 601 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
630 602
631 603 len = P2ROUNDUP(len, PAGESIZE);
632 604
633 605 npages = btop(len);
634 606
635 607 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
636 608 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
637 609 (sptd->spt_flags & SHM_PAGEABLE)) {
638 610 hat_flags = HAT_UNLOAD_UNMAP;
639 611 }
640 612
641 613 hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
642 614
643 615 amp = sptd->spt_amp;
644 616 if (sptd->spt_flags & SHM_PAGEABLE)
645 617 npages = btop(amp->size);
646 618
647 619 ASSERT(amp != NULL);
648 620
649 621 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
650 622 sp = amp->a_sp;
651 623 proj = sp->shm_perm.ipc_proj;
652 624 mutex_enter(&sp->shm_mlock);
653 625 }
654 626 for (anon_idx = 0; anon_idx < npages; anon_idx++) {
655 627 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
656 628 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
657 629 panic("segspt_free_pages: null app");
658 630 /*NOTREACHED*/
659 631 }
660 632 } else {
661 633 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
662 634 == NULL)
663 635 continue;
664 636 }
665 637 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
666 638 swap_xlate(ap, &vp, &off);
667 639
668 640 /*
669 641 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
670 642 * the pages won't be having SE_SHARED lock at this
671 643 * point.
672 644 *
673 645 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
674 646 * the pages are still held SE_SHARED locked from the
675 647 * original segspt_create()
676 648 *
677 649 * Our goal is to get SE_EXCL lock on each page, remove
678 650 * permanent lock on it and invalidate the page.
679 651 */
680 652 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
681 653 if (hat_flags == HAT_UNLOAD_UNMAP)
682 654 pp = page_lookup(vp, off, SE_EXCL);
683 655 else {
684 656 if ((pp = page_find(vp, off)) == NULL) {
685 657 panic("segspt_free_pages: "
686 658 "page not locked");
687 659 /*NOTREACHED*/
688 660 }
689 661 if (!page_tryupgrade(pp)) {
690 662 page_unlock(pp);
691 663 pp = page_lookup(vp, off, SE_EXCL);
692 664 }
693 665 }
694 666 if (pp == NULL) {
695 667 panic("segspt_free_pages: "
696 668 "page not in the system");
697 669 /*NOTREACHED*/
698 670 }
699 671 ASSERT(pp->p_lckcnt > 0);
700 672 page_pp_unlock(pp, 0, 1);
701 673 if (pp->p_lckcnt == 0)
702 674 unlocked_bytes += PAGESIZE;
703 675 } else {
704 676 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
705 677 continue;
706 678 }
707 679 /*
708 680 * It's logical to invalidate the pages here as in most cases
709 681 * these were created by segspt.
710 682 */
711 683 if (pp->p_szc != 0) {
712 684 if (root == 0) {
713 685 ASSERT(curnpgs == 0);
714 686 root = 1;
715 687 rootpp = pp;
716 688 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
717 689 ASSERT(pgs > 1);
718 690 ASSERT(IS_P2ALIGNED(pgs, pgs));
719 691 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
720 692 curnpgs--;
721 693 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
722 694 ASSERT(curnpgs == 1);
723 695 ASSERT(page_pptonum(pp) ==
724 696 page_pptonum(rootpp) + (pgs - 1));
725 697 page_destroy_pages(rootpp);
726 698 root = 0;
727 699 curnpgs = 0;
728 700 } else {
729 701 ASSERT(curnpgs > 1);
730 702 ASSERT(page_pptonum(pp) ==
731 703 page_pptonum(rootpp) + (pgs - curnpgs));
732 704 curnpgs--;
733 705 }
734 706 } else {
735 707 if (root != 0 || curnpgs != 0) {
736 708 panic("segspt_free_pages: bad large page");
737 709 /*NOTREACHED*/
738 710 }
739 711 /*
740 712 * Before destroying the pages, we need to take care
741 713 * of the rctl locked memory accounting. For that
742 714 * we need to calculate the unlocked_bytes.
743 715 */
744 716 if (pp->p_lckcnt > 0)
745 717 unlocked_bytes += PAGESIZE;
746 718 /*LINTED: constant in conditional context */
747 719 VN_DISPOSE(pp, B_INVAL, 0, kcred);
748 720 }
749 721 }
750 722 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
751 723 if (unlocked_bytes > 0)
752 724 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
753 725 mutex_exit(&sp->shm_mlock);
754 726 }
755 727 if (root != 0 || curnpgs != 0) {
756 728 panic("segspt_free_pages: bad large page");
757 729 /*NOTREACHED*/
758 730 }
759 731
760 732 /*
761 733 * mark that pages have been released
762 734 */
763 735 sptd->spt_realsize = 0;
764 736
765 737 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
766 738 atomic_add_long(&spt_used, -npages);
767 739 anon_swap_restore(npages);
768 740 }
769 741 }
770 742
771 743 /*
772 744 * Get memory allocation policy info for specified address in given segment
773 745 */
774 746 static lgrp_mem_policy_info_t *
775 747 segspt_getpolicy(struct seg *seg, caddr_t addr)
776 748 {
777 749 struct anon_map *amp;
778 750 ulong_t anon_index;
779 751 lgrp_mem_policy_info_t *policy_info;
780 752 struct spt_data *spt_data;
781 753
782 754 ASSERT(seg != NULL);
783 755
784 756 /*
785 757 * Get anon_map from segspt
786 758 *
787 759 * Assume that no lock needs to be held on anon_map, since
788 760 * it should be protected by its reference count which must be
789 761 * nonzero for an existing segment
790 762 * Need to grab readers lock on policy tree though
791 763 */
792 764 spt_data = (struct spt_data *)seg->s_data;
793 765 if (spt_data == NULL)
794 766 return (NULL);
795 767 amp = spt_data->spt_amp;
796 768 ASSERT(amp->refcnt != 0);
797 769
798 770 /*
799 771 * Get policy info
800 772 *
801 773 * Assume starting anon index of 0
802 774 */
803 775 anon_index = seg_page(seg, addr);
804 776 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
805 777
806 778 return (policy_info);
807 779 }
808 780
809 781 /*
810 782 * DISM only.
811 783 * Return locked pages over a given range.
812 784 *
813 785 * We will cache all DISM locked pages and save the pplist for the
814 786 * entire segment in the ppa field of the underlying DISM segment structure.
815 787 * Later, during a call to segspt_reclaim() we will use this ppa array
816 788 * to page_unlock() all of the pages and then we will free this ppa list.
817 789 */
818 790 /*ARGSUSED*/
819 791 static int
820 792 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
821 793 struct page ***ppp, enum lock_type type, enum seg_rw rw)
822 794 {
823 795 struct shm_data *shmd = (struct shm_data *)seg->s_data;
824 796 struct seg *sptseg = shmd->shm_sptseg;
825 797 struct spt_data *sptd = sptseg->s_data;
826 798 pgcnt_t pg_idx, npages, tot_npages, npgs;
827 799 struct page **pplist, **pl, **ppa, *pp;
828 800 struct anon_map *amp;
829 801 spgcnt_t an_idx;
830 802 int ret = ENOTSUP;
831 803 uint_t pl_built = 0;
832 804 struct anon *ap;
833 805 struct vnode *vp;
834 806 u_offset_t off;
835 807 pgcnt_t claim_availrmem = 0;
836 808 uint_t szc;
837 809
838 810 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
839 811 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
840 812
841 813 /*
842 814 * We want to lock/unlock the entire ISM segment. Therefore,
843 815 * we will be using the underlying sptseg and its base address
844 816 * and length for the caching arguments.
845 817 */
846 818 ASSERT(sptseg);
847 819 ASSERT(sptd);
848 820
849 821 pg_idx = seg_page(seg, addr);
850 822 npages = btopr(len);
851 823
852 824 /*
853 825 * check if the request is larger than number of pages covered
854 826 * by amp
855 827 */
856 828 if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
857 829 *ppp = NULL;
858 830 return (ENOTSUP);
859 831 }
860 832
861 833 if (type == L_PAGEUNLOCK) {
862 834 ASSERT(sptd->spt_ppa != NULL);
863 835
864 836 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
865 837 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
866 838
867 839 /*
868 840 * If someone is blocked while unmapping, we purge
869 841 * segment page cache and thus reclaim pplist synchronously
870 842 * without waiting for seg_pasync_thread. This speeds up
871 843 * unmapping in cases where munmap(2) is called, while
872 844 * raw async i/o is still in progress or where a thread
873 845 * exits on data fault in a multithreaded application.
874 846 */
875 847 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
876 848 (AS_ISUNMAPWAIT(seg->s_as) &&
877 849 shmd->shm_softlockcnt > 0)) {
878 850 segspt_purge(seg);
879 851 }
880 852 return (0);
881 853 }
882 854
883 855 /* The L_PAGELOCK case ... */
884 856
885 857 if (sptd->spt_flags & DISM_PPA_CHANGED) {
886 858 segspt_purge(seg);
887 859 /*
888 860 * for DISM the ppa needs to be rebuilt since
889 861 * the number of locked pages could have changed
890 862 */
891 863 *ppp = NULL;
892 864 return (ENOTSUP);
893 865 }
894 866
895 867 /*
896 868 * First try to find pages in segment page cache, without
897 869 * holding the segment lock.
898 870 */
899 871 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
900 872 S_WRITE, SEGP_FORCE_WIRED);
901 873 if (pplist != NULL) {
902 874 ASSERT(sptd->spt_ppa != NULL);
903 875 ASSERT(sptd->spt_ppa == pplist);
904 876 ppa = sptd->spt_ppa;
905 877 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
906 878 if (ppa[an_idx] == NULL) {
907 879 seg_pinactive(seg, NULL, seg->s_base,
908 880 sptd->spt_amp->size, ppa,
909 881 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
910 882 *ppp = NULL;
911 883 return (ENOTSUP);
912 884 }
913 885 if ((szc = ppa[an_idx]->p_szc) != 0) {
914 886 npgs = page_get_pagecnt(szc);
915 887 an_idx = P2ROUNDUP(an_idx + 1, npgs);
916 888 } else {
917 889 an_idx++;
918 890 }
919 891 }
920 892 /*
921 893 * Since we cache the entire DISM segment, we want to
922 894 * set ppp to point to the first slot that corresponds
923 895 * to the requested addr, i.e. pg_idx.
924 896 */
925 897 *ppp = &(sptd->spt_ppa[pg_idx]);
926 898 return (0);
927 899 }
928 900
929 901 mutex_enter(&sptd->spt_lock);
930 902 /*
931 903 * try to find pages in segment page cache with mutex
932 904 */
933 905 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
934 906 S_WRITE, SEGP_FORCE_WIRED);
935 907 if (pplist != NULL) {
936 908 ASSERT(sptd->spt_ppa != NULL);
937 909 ASSERT(sptd->spt_ppa == pplist);
938 910 ppa = sptd->spt_ppa;
939 911 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
940 912 if (ppa[an_idx] == NULL) {
941 913 mutex_exit(&sptd->spt_lock);
942 914 seg_pinactive(seg, NULL, seg->s_base,
943 915 sptd->spt_amp->size, ppa,
944 916 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
945 917 *ppp = NULL;
946 918 return (ENOTSUP);
947 919 }
948 920 if ((szc = ppa[an_idx]->p_szc) != 0) {
949 921 npgs = page_get_pagecnt(szc);
950 922 an_idx = P2ROUNDUP(an_idx + 1, npgs);
951 923 } else {
952 924 an_idx++;
953 925 }
954 926 }
955 927 /*
956 928 * Since we cache the entire DISM segment, we want to
957 929 * set ppp to point to the first slot that corresponds
958 930 * to the requested addr, i.e. pg_idx.
959 931 */
960 932 mutex_exit(&sptd->spt_lock);
961 933 *ppp = &(sptd->spt_ppa[pg_idx]);
962 934 return (0);
963 935 }
964 936 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
965 937 SEGP_FORCE_WIRED) == SEGP_FAIL) {
966 938 mutex_exit(&sptd->spt_lock);
967 939 *ppp = NULL;
968 940 return (ENOTSUP);
969 941 }
970 942
971 943 /*
972 944 * No need to worry about protections because DISM pages are always rw.
973 945 */
974 946 pl = pplist = NULL;
975 947 amp = sptd->spt_amp;
976 948
977 949 /*
978 950 * Do we need to build the ppa array?
979 951 */
980 952 if (sptd->spt_ppa == NULL) {
981 953 pgcnt_t lpg_cnt = 0;
982 954
983 955 pl_built = 1;
984 956 tot_npages = btopr(sptd->spt_amp->size);
985 957
986 958 ASSERT(sptd->spt_pcachecnt == 0);
987 959 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
988 960 pl = pplist;
989 961
990 962 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
991 963 for (an_idx = 0; an_idx < tot_npages; ) {
992 964 ap = anon_get_ptr(amp->ahp, an_idx);
993 965 /*
994 966 * Cache only mlocked pages. For large pages,
995 967 * if one (constituent) page is mlocked,
996 968 * all pages for that large page
997 969 * are cached also. This allows quick
998 970 * lookups of the ppa array.
999 971 */
1000 972 if ((ap != NULL) && (lpg_cnt != 0 ||
1001 973 (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1002 974
1003 975 swap_xlate(ap, &vp, &off);
1004 976 pp = page_lookup(vp, off, SE_SHARED);
1005 977 ASSERT(pp != NULL);
1006 978 if (lpg_cnt == 0) {
1007 979 lpg_cnt++;
1008 980 /*
1009 981 * For a small page, we are done --
1010 982 * lpg_count is reset to 0 below.
1011 983 *
1012 984 * For a large page, we are guaranteed
1013 985 * to find the anon structures of all
1014 986 * constituent pages and a non-zero
1015 987 * lpg_cnt ensures that we don't test
1016 988 * for mlock for these. We are done
1017 989 * when lpg_count reaches (npgs + 1).
1018 990 * If we are not the first constituent
1019 991 * page, restart at the first one.
1020 992 */
1021 993 npgs = page_get_pagecnt(pp->p_szc);
1022 994 if (!IS_P2ALIGNED(an_idx, npgs)) {
1023 995 an_idx = P2ALIGN(an_idx, npgs);
1024 996 page_unlock(pp);
1025 997 continue;
1026 998 }
1027 999 }
1028 1000 if (++lpg_cnt > npgs)
1029 1001 lpg_cnt = 0;
1030 1002
1031 1003 /*
1032 1004 * availrmem is decremented only
1033 1005 * for unlocked pages
1034 1006 */
1035 1007 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1036 1008 claim_availrmem++;
1037 1009 pplist[an_idx] = pp;
1038 1010 }
1039 1011 an_idx++;
1040 1012 }
1041 1013 ANON_LOCK_EXIT(&amp->a_rwlock);
1042 1014
1043 1015 if (claim_availrmem) {
1044 1016 mutex_enter(&freemem_lock);
1045 1017 if (availrmem < tune.t_minarmem + claim_availrmem) {
1046 1018 mutex_exit(&freemem_lock);
1047 1019 ret = ENOTSUP;
1048 1020 claim_availrmem = 0;
1049 1021 goto insert_fail;
1050 1022 } else {
1051 1023 availrmem -= claim_availrmem;
1052 1024 }
1053 1025 mutex_exit(&freemem_lock);
1054 1026 }
1055 1027
1056 1028 sptd->spt_ppa = pl;
1057 1029 } else {
1058 1030 /*
1059 1031 * We already have a valid ppa[].
1060 1032 */
1061 1033 pl = sptd->spt_ppa;
1062 1034 }
1063 1035
1064 1036 ASSERT(pl != NULL);
1065 1037
1066 1038 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1067 1039 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1068 1040 segspt_reclaim);
1069 1041 if (ret == SEGP_FAIL) {
1070 1042 /*
1071 1043 * seg_pinsert failed. We return
1072 1044 * ENOTSUP, so that the as_pagelock() code will
1073 1045 * then try the slower F_SOFTLOCK path.
1074 1046 */
1075 1047 if (pl_built) {
1076 1048 /*
1077 1049 * No one else has referenced the ppa[].
1078 1050 * We created it and we need to destroy it.
1079 1051 */
1080 1052 sptd->spt_ppa = NULL;
1081 1053 }
1082 1054 ret = ENOTSUP;
1083 1055 goto insert_fail;
1084 1056 }
1085 1057
1086 1058 /*
1087 1059 * In either case, we increment softlockcnt on the 'real' segment.
1088 1060 */
1089 1061 sptd->spt_pcachecnt++;
1090 1062 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1091 1063
1092 1064 ppa = sptd->spt_ppa;
1093 1065 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1094 1066 if (ppa[an_idx] == NULL) {
1095 1067 mutex_exit(&sptd->spt_lock);
1096 1068 seg_pinactive(seg, NULL, seg->s_base,
1097 1069 sptd->spt_amp->size,
1098 1070 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1099 1071 *ppp = NULL;
1100 1072 return (ENOTSUP);
1101 1073 }
1102 1074 if ((szc = ppa[an_idx]->p_szc) != 0) {
1103 1075 npgs = page_get_pagecnt(szc);
1104 1076 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1105 1077 } else {
1106 1078 an_idx++;
1107 1079 }
1108 1080 }
1109 1081 /*
1110 1082 * We can now drop the sptd->spt_lock since the ppa[]
1111 1083 * exists and we have incremented pcachecnt.
1112 1084 */
1113 1085 mutex_exit(&sptd->spt_lock);
1114 1086
1115 1087 /*
1116 1088 * Since we cache the entire segment, we want to
1117 1089 * set ppp to point to the first slot that corresponds
1118 1090 * to the requested addr, i.e. pg_idx.
1119 1091 */
1120 1092 *ppp = &(sptd->spt_ppa[pg_idx]);
1121 1093 return (0);
1122 1094
1123 1095 insert_fail:
1124 1096 /*
1125 1097 * We will only reach this code if we tried and failed.
1126 1098 *
1127 1099 * And we can drop the lock on the dummy seg, once we've failed
1128 1100 * to set up a new ppa[].
1129 1101 */
1130 1102 mutex_exit(&sptd->spt_lock);
1131 1103
1132 1104 if (pl_built) {
1133 1105 if (claim_availrmem) {
1134 1106 mutex_enter(&freemem_lock);
1135 1107 availrmem += claim_availrmem;
1136 1108 mutex_exit(&freemem_lock);
1137 1109 }
1138 1110
1139 1111 /*
1140 1112 * We created pl and we need to destroy it.
1141 1113 */
1142 1114 pplist = pl;
1143 1115 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1144 1116 if (pplist[an_idx] != NULL)
1145 1117 page_unlock(pplist[an_idx]);
1146 1118 }
1147 1119 kmem_free(pl, sizeof (page_t *) * tot_npages);
1148 1120 }
1149 1121
1150 1122 if (shmd->shm_softlockcnt <= 0) {
1151 1123 if (AS_ISUNMAPWAIT(seg->s_as)) {
1152 1124 mutex_enter(&seg->s_as->a_contents);
1153 1125 if (AS_ISUNMAPWAIT(seg->s_as)) {
1154 1126 AS_CLRUNMAPWAIT(seg->s_as);
1155 1127 cv_broadcast(&seg->s_as->a_cv);
1156 1128 }
1157 1129 mutex_exit(&seg->s_as->a_contents);
1158 1130 }
1159 1131 }
1160 1132 *ppp = NULL;
1161 1133 return (ret);
1162 1134 }
1163 1135
1164 1136
1165 1137
1166 1138 /*
1167 1139 * return locked pages over a given range.
1168 1140 *
1169 1141 * We will cache the entire ISM segment and save the pplist for the
1170 1142 * entire segment in the ppa field of the underlying ISM segment structure.
1171 1143 * Later, during a call to segspt_reclaim() we will use this ppa array
1172 1144 * to page_unlock() all of the pages and then we will free this ppa list.
1173 1145 */
1174 1146 /*ARGSUSED*/
1175 1147 static int
1176 1148 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1177 1149 struct page ***ppp, enum lock_type type, enum seg_rw rw)
1178 1150 {
1179 1151 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1180 1152 struct seg *sptseg = shmd->shm_sptseg;
1181 1153 struct spt_data *sptd = sptseg->s_data;
1182 1154 pgcnt_t np, page_index, npages;
1183 1155 caddr_t a, spt_base;
1184 1156 struct page **pplist, **pl, *pp;
1185 1157 struct anon_map *amp;
1186 1158 ulong_t anon_index;
1187 1159 int ret = ENOTSUP;
1188 1160 uint_t pl_built = 0;
1189 1161 struct anon *ap;
1190 1162 struct vnode *vp;
1191 1163 u_offset_t off;
1192 1164
1193 1165 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1194 1166 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1195 1167
1196 1168
1197 1169 /*
1198 1170 * We want to lock/unlock the entire ISM segment. Therefore,
1199 1171 * we will be using the underlying sptseg and its base address
1200 1172 * and length for the caching arguments.
1201 1173 */
1202 1174 ASSERT(sptseg);
1203 1175 ASSERT(sptd);
1204 1176
1205 1177 if (sptd->spt_flags & SHM_PAGEABLE) {
1206 1178 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1207 1179 }
1208 1180
1209 1181 page_index = seg_page(seg, addr);
1210 1182 npages = btopr(len);
1211 1183
1212 1184 /*
1213 1185 * check if the request is larger than number of pages covered
1214 1186 * by amp
1215 1187 */
1216 1188 if (page_index + npages > btopr(sptd->spt_amp->size)) {
1217 1189 *ppp = NULL;
1218 1190 return (ENOTSUP);
1219 1191 }
1220 1192
1221 1193 if (type == L_PAGEUNLOCK) {
1222 1194
1223 1195 ASSERT(sptd->spt_ppa != NULL);
1224 1196
1225 1197 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1226 1198 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1227 1199
1228 1200 /*
1229 1201 * If someone is blocked while unmapping, we purge
1230 1202 * segment page cache and thus reclaim pplist synchronously
1231 1203 * without waiting for seg_pasync_thread. This speeds up
1232 1204 * unmapping in cases where munmap(2) is called, while
1233 1205 * raw async i/o is still in progress or where a thread
1234 1206 * exits on data fault in a multithreaded application.
1235 1207 */
1236 1208 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1237 1209 segspt_purge(seg);
1238 1210 }
1239 1211 return (0);
1240 1212 }
1241 1213
1242 1214 /* The L_PAGELOCK case... */
1243 1215
1244 1216 /*
1245 1217 * First try to find pages in segment page cache, without
1246 1218 * holding the segment lock.
1247 1219 */
1248 1220 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1249 1221 S_WRITE, SEGP_FORCE_WIRED);
1250 1222 if (pplist != NULL) {
1251 1223 ASSERT(sptd->spt_ppa == pplist);
1252 1224 ASSERT(sptd->spt_ppa[page_index]);
1253 1225 /*
1254 1226 * Since we cache the entire ISM segment, we want to
1255 1227 * set ppp to point to the first slot that corresponds
1256 1228 * to the requested addr, i.e. page_index.
1257 1229 */
1258 1230 *ppp = &(sptd->spt_ppa[page_index]);
1259 1231 return (0);
1260 1232 }
1261 1233
1262 1234 mutex_enter(&sptd->spt_lock);
1263 1235
1264 1236 /*
1265 1237 * try to find pages in segment page cache
1266 1238 */
1267 1239 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1268 1240 S_WRITE, SEGP_FORCE_WIRED);
1269 1241 if (pplist != NULL) {
1270 1242 ASSERT(sptd->spt_ppa == pplist);
1271 1243 /*
1272 1244 * Since we cache the entire segment, we want to
1273 1245 * set ppp to point to the first slot that corresponds
1274 1246 * to the requested addr, i.e. page_index.
1275 1247 */
1276 1248 mutex_exit(&sptd->spt_lock);
1277 1249 *ppp = &(sptd->spt_ppa[page_index]);
1278 1250 return (0);
1279 1251 }
1280 1252
1281 1253 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1282 1254 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1283 1255 mutex_exit(&sptd->spt_lock);
1284 1256 *ppp = NULL;
1285 1257 return (ENOTSUP);
1286 1258 }
1287 1259
1288 1260 /*
1289 1261 * No need to worry about protections because ISM pages
1290 1262 * are always rw.
1291 1263 */
1292 1264 pl = pplist = NULL;
1293 1265
1294 1266 /*
1295 1267 * Do we need to build the ppa array?
1296 1268 */
1297 1269 if (sptd->spt_ppa == NULL) {
1298 1270 ASSERT(sptd->spt_ppa == pplist);
1299 1271
1300 1272 spt_base = sptseg->s_base;
1301 1273 pl_built = 1;
1302 1274
1303 1275 /*
1304 1276 * availrmem is decremented once during anon_swap_adjust()
1305 1277 * and is incremented during the anon_unresv(), which is
1306 1278 * called from shm_rm_amp() when the segment is destroyed.
1307 1279 */
1308 1280 amp = sptd->spt_amp;
1309 1281 ASSERT(amp != NULL);
1310 1282
1311 1283 /* pcachecnt is protected by sptd->spt_lock */
1312 1284 ASSERT(sptd->spt_pcachecnt == 0);
1313 1285 pplist = kmem_zalloc(sizeof (page_t *)
1314 1286 * btopr(sptd->spt_amp->size), KM_SLEEP);
1315 1287 pl = pplist;
1316 1288
1317 1289 anon_index = seg_page(sptseg, spt_base);
1318 1290
1319 1291 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1320 1292 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1321 1293 a += PAGESIZE, anon_index++, pplist++) {
1322 1294 ap = anon_get_ptr(amp->ahp, anon_index);
1323 1295 ASSERT(ap != NULL);
1324 1296 swap_xlate(ap, &vp, &off);
1325 1297 pp = page_lookup(vp, off, SE_SHARED);
1326 1298 ASSERT(pp != NULL);
1327 1299 *pplist = pp;
1328 1300 }
1329 1301 ANON_LOCK_EXIT(&amp->a_rwlock);
1330 1302
1331 1303 if (a < (spt_base + sptd->spt_amp->size)) {
1332 1304 ret = ENOTSUP;
1333 1305 goto insert_fail;
1334 1306 }
1335 1307 sptd->spt_ppa = pl;
1336 1308 } else {
1337 1309 /*
1338 1310 * We already have a valid ppa[].
1339 1311 */
1340 1312 pl = sptd->spt_ppa;
1341 1313 }
1342 1314
1343 1315 ASSERT(pl != NULL);
1344 1316
1345 1317 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1346 1318 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1347 1319 segspt_reclaim);
1348 1320 if (ret == SEGP_FAIL) {
1349 1321 /*
1350 1322 * seg_pinsert failed. We return
1351 1323 * ENOTSUP, so that the as_pagelock() code will
1352 1324 * then try the slower F_SOFTLOCK path.
1353 1325 */
1354 1326 if (pl_built) {
1355 1327 /*
1356 1328 * No one else has referenced the ppa[].
1357 1329 * We created it and we need to destroy it.
1358 1330 */
1359 1331 sptd->spt_ppa = NULL;
1360 1332 }
1361 1333 ret = ENOTSUP;
1362 1334 goto insert_fail;
1363 1335 }
1364 1336
1365 1337 /*
1366 1338 * In either case, we increment softlockcnt on the 'real' segment.
1367 1339 */
1368 1340 sptd->spt_pcachecnt++;
1369 1341 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1370 1342
1371 1343 /*
1372 1344 * We can now drop the sptd->spt_lock since the ppa[]
1373 1345 * exists and we have incremented pcachecnt.
1374 1346 */
1375 1347 mutex_exit(&sptd->spt_lock);
1376 1348
1377 1349 /*
1378 1350 * Since we cache the entire segment, we want to
1379 1351 * set ppp to point to the first slot that corresponds
1380 1352 * to the requested addr, i.e. page_index.
1381 1353 */
1382 1354 *ppp = &(sptd->spt_ppa[page_index]);
1383 1355 return (0);
1384 1356
1385 1357 insert_fail:
1386 1358 /*
1387 1359 * We will only reach this code if we tried and failed.
1388 1360 *
1389 1361 * And we can drop the lock on the dummy seg, once we've failed
1390 1362 * to set up a new ppa[].
1391 1363 */
1392 1364 mutex_exit(&sptd->spt_lock);
1393 1365
1394 1366 if (pl_built) {
1395 1367 /*
1396 1368 * We created pl and we need to destroy it.
1397 1369 */
1398 1370 pplist = pl;
1399 1371 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1400 1372 while (np) {
1401 1373 page_unlock(*pplist);
1402 1374 np--;
1403 1375 pplist++;
1404 1376 }
1405 1377 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1406 1378 }
1407 1379 if (shmd->shm_softlockcnt <= 0) {
1408 1380 if (AS_ISUNMAPWAIT(seg->s_as)) {
1409 1381 mutex_enter(&seg->s_as->a_contents);
1410 1382 if (AS_ISUNMAPWAIT(seg->s_as)) {
1411 1383 AS_CLRUNMAPWAIT(seg->s_as);
1412 1384 cv_broadcast(&seg->s_as->a_cv);
1413 1385 }
1414 1386 mutex_exit(&seg->s_as->a_contents);
1415 1387 }
1416 1388 }
1417 1389 *ppp = NULL;
1418 1390 return (ret);
1419 1391 }
1420 1392
1421 1393 /*
1422 1394 * purge any cached pages in the I/O page cache
1423 1395 */
1424 1396 static void
1425 1397 segspt_purge(struct seg *seg)
1426 1398 {
1427 1399 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1428 1400 }
1429 1401
1430 1402 static int
1431 1403 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1432 1404 enum seg_rw rw, int async)
1433 1405 {
1434 1406 struct seg *seg = (struct seg *)ptag;
1435 1407 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1436 1408 struct seg *sptseg;
1437 1409 struct spt_data *sptd;
1438 1410 pgcnt_t npages, i, free_availrmem = 0;
1439 1411 int done = 0;
1440 1412
1441 1413 #ifdef lint
1442 1414 addr = addr;
1443 1415 #endif
1444 1416 sptseg = shmd->shm_sptseg;
1445 1417 sptd = sptseg->s_data;
1446 1418 npages = (len >> PAGESHIFT);
1447 1419 ASSERT(npages);
1448 1420 ASSERT(sptd->spt_pcachecnt != 0);
1449 1421 ASSERT(sptd->spt_ppa == pplist);
1450 1422 ASSERT(npages == btopr(sptd->spt_amp->size));
1451 1423 ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1452 1424
1453 1425 /*
1454 1426 * Acquire the lock on the dummy seg and destroy the
1455 1427 * ppa array IF this is the last pcachecnt.
1456 1428 */
1457 1429 mutex_enter(&sptd->spt_lock);
1458 1430 if (--sptd->spt_pcachecnt == 0) {
1459 1431 for (i = 0; i < npages; i++) {
1460 1432 if (pplist[i] == NULL) {
1461 1433 continue;
1462 1434 }
1463 1435 if (rw == S_WRITE) {
1464 1436 hat_setrefmod(pplist[i]);
1465 1437 } else {
1466 1438 hat_setref(pplist[i]);
1467 1439 }
1468 1440 if ((sptd->spt_flags & SHM_PAGEABLE) &&
1469 1441 (sptd->spt_ppa_lckcnt[i] == 0))
1470 1442 free_availrmem++;
1471 1443 page_unlock(pplist[i]);
1472 1444 }
1473 1445 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1474 1446 mutex_enter(&freemem_lock);
1475 1447 availrmem += free_availrmem;
1476 1448 mutex_exit(&freemem_lock);
1477 1449 }
1478 1450 /*
1479 1451 * Since we want to cache/uncache the entire ISM segment,
1480 1452 * we will track the pplist in a segspt-specific field,
1481 1453 * ppa, which is initialized at the time we add an entry to
1482 1454 * the cache.
1483 1455 */
1484 1456 ASSERT(sptd->spt_pcachecnt == 0);
1485 1457 kmem_free(pplist, sizeof (page_t *) * npages);
1486 1458 sptd->spt_ppa = NULL;
1487 1459 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1488 1460 sptd->spt_gen++;
1489 1461 cv_broadcast(&sptd->spt_cv);
1490 1462 done = 1;
1491 1463 }
1492 1464 mutex_exit(&sptd->spt_lock);
1493 1465
1494 1466 /*
1495 1467 * If we are the pcache async thread or called via seg_ppurge_wiredpp()
1496 1468 * we may not hold the AS lock (in this case the async argument is not 0).
1497 1469 * This means that if softlockcnt drops to 0 after the decrement below,
1498 1470 * the address space may get freed. We can't allow that, since after the
1499 1471 * softlock decrement to 0 we still need to access the as structure for a
1500 1472 * possible wakeup of unmap waiters. To prevent the disappearance of as we take
1501 1473 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1502 1474 * this mutex as a barrier to make sure this routine completes before
1503 1475 * segment is freed.
1504 1476 *
1505 1477 * The second complication we have to deal with in the async case is the
1506 1478 * possibility of a missed wakeup of the unmap wait thread. When we don't
1507 1479 * hold the as lock here, we may take the a_contents lock before the unmap
1508 1480 * wait thread that was first to see that softlockcnt was still not 0. As
1509 1481 * a result we'll fail to wake up the unmap wait thread. To avoid this
1510 1482 * race we set the nounmapwait flag in the as structure if we drop
1511 1483 * softlockcnt to 0 while async is not 0. The unmapwait thread
1512 1484 * will not block if this flag is set.
1513 1485 */
1514 1486 if (async)
1515 1487 mutex_enter(&shmd->shm_segfree_syncmtx);
1516 1488
1517 1489 /*
1518 1490 * Now decrement softlockcnt.
1519 1491 */
1520 1492 ASSERT(shmd->shm_softlockcnt > 0);
1521 1493 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1522 1494
1523 1495 if (shmd->shm_softlockcnt <= 0) {
1524 1496 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1525 1497 mutex_enter(&seg->s_as->a_contents);
1526 1498 if (async)
1527 1499 AS_SETNOUNMAPWAIT(seg->s_as);
1528 1500 if (AS_ISUNMAPWAIT(seg->s_as)) {
1529 1501 AS_CLRUNMAPWAIT(seg->s_as);
1530 1502 cv_broadcast(&seg->s_as->a_cv);
1531 1503 }
1532 1504 mutex_exit(&seg->s_as->a_contents);
1533 1505 }
1534 1506 }
1535 1507
1536 1508 if (async)
1537 1509 mutex_exit(&shmd->shm_segfree_syncmtx);
1538 1510
1539 1511 return (done);
1540 1512 }
1541 1513
1542 1514 /*
1543 1515 * Do a F_SOFTUNLOCK call over the range requested.
1544 1516 * The range must have already been F_SOFTLOCK'ed.
1545 1517 *
1546 1518 * The calls to acquire and release the anon map lock mutex were
1547 1519 * removed in order to avoid a deadly embrace during a DR
1548 1520 * memory delete operation. (E.g. DR blocks while waiting for an
1549 1521 * exclusive lock on a page that is being used for kaio; the
1550 1522 * thread that will complete the kaio and call segspt_softunlock
1551 1523 * blocks on the anon map lock; another thread holding the anon
1552 1524 * map lock blocks on another page lock via the segspt_shmfault
1553 1525 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1554 1526 *
1555 1527 * The appropriateness of the removal is based upon the following:
1556 1528 * 1. If we are holding a segment's reader lock and the page is held
1557 1529 * shared, then the corresponding element in anonmap which points to
1558 1530 * anon struct cannot change and there is no need to acquire the
1559 1531 * anonymous map lock.
1560 1532 * 2. Threads in segspt_softunlock have a reader lock on the segment
1561 1533 * and already have the shared page lock, so we are guaranteed that
1562 1534 * the anon map slot cannot change and therefore can call anon_get_ptr()
1563 1535 * without grabbing the anonymous map lock.
1564 1536 * 3. Threads that softlock a shared page break copy-on-write, even if
1565 1537 * it's a read. Thus cow faults can be ignored with respect to soft
1566 1538 * unlocking, since the breaking of cow means that the anon slot(s) will
1567 1539 * not be shared.
1568 1540 */
1569 1541 static void
1570 1542 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1571 1543 size_t len, enum seg_rw rw)
1572 1544 {
1573 1545 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1574 1546 struct seg *sptseg;
1575 1547 struct spt_data *sptd;
1576 1548 page_t *pp;
1577 1549 caddr_t adr;
1578 1550 struct vnode *vp;
1579 1551 u_offset_t offset;
1580 1552 ulong_t anon_index;
1581 1553 struct anon_map *amp; /* XXX - for locknest */
1582 1554 struct anon *ap = NULL;
1583 1555 pgcnt_t npages;
1584 1556
1585 1557 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1586 1558
1587 1559 sptseg = shmd->shm_sptseg;
1588 1560 sptd = sptseg->s_data;
1589 1561
1590 1562 /*
1591 1563 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1592 1564 * and therefore their pages are SE_SHARED locked
1593 1565 * for the entire life of the segment.
1594 1566 */
1595 1567 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1596 1568 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1597 1569 goto softlock_decrement;
1598 1570 }
1599 1571
1600 1572 /*
1601 1573 * Any thread is free to do a page_find and
1602 1574 * page_unlock() on the pages within this seg.
1603 1575 *
1604 1576 * We are already holding the as->a_lock on the user's
1605 1577 * real segment, but we need to hold the a_lock on the
1606 1578 * underlying dummy as. This is mostly to satisfy the
1607 1579 * underlying HAT layer.
1608 1580 */
1609 1581 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1610 1582 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1611 1583 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1612 1584
1613 1585 amp = sptd->spt_amp;
1614 1586 ASSERT(amp != NULL);
1615 1587 anon_index = seg_page(sptseg, sptseg_addr);
1616 1588
1617 1589 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1618 1590 ap = anon_get_ptr(amp->ahp, anon_index++);
1619 1591 ASSERT(ap != NULL);
1620 1592 swap_xlate(ap, &vp, &offset);
1621 1593
1622 1594 /*
1623 1595 * Use page_find() instead of page_lookup() to
1624 1596 * find the page since we know that it has a
1625 1597 * "shared" lock.
1626 1598 */
1627 1599 pp = page_find(vp, offset);
1628 1600 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1629 1601 if (pp == NULL) {
1630 1602 panic("segspt_softunlock: "
1631 1603 "addr %p, ap %p, vp %p, off %llx",
1632 1604 (void *)adr, (void *)ap, (void *)vp, offset);
1633 1605 /*NOTREACHED*/
1634 1606 }
1635 1607
1636 1608 if (rw == S_WRITE) {
1637 1609 hat_setrefmod(pp);
1638 1610 } else if (rw != S_OTHER) {
1639 1611 hat_setref(pp);
1640 1612 }
1641 1613 page_unlock(pp);
1642 1614 }
1643 1615
1644 1616 softlock_decrement:
1645 1617 npages = btopr(len);
1646 1618 ASSERT(shmd->shm_softlockcnt >= npages);
1647 1619 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1648 1620 if (shmd->shm_softlockcnt == 0) {
1649 1621 /*
1650 1622 * All SOFTLOCKS are gone. Wakeup any waiting
1651 1623 * unmappers so they can try again to unmap.
1652 1624 * Check for waiters first without the mutex
1653 1625 * held so we don't always grab the mutex on
1654 1626 * softunlocks.
1655 1627 */
1656 1628 if (AS_ISUNMAPWAIT(seg->s_as)) {
1657 1629 mutex_enter(&seg->s_as->a_contents);
1658 1630 if (AS_ISUNMAPWAIT(seg->s_as)) {
1659 1631 AS_CLRUNMAPWAIT(seg->s_as);
1660 1632 cv_broadcast(&seg->s_as->a_cv);
1661 1633 }
1662 1634 mutex_exit(&seg->s_as->a_contents);
1663 1635 }
1664 1636 }
1665 1637 }
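
The softlock_decrement path above relies on a double-checked wakeup: it peeks at the unmap-wait flag without the mutex so the common unlock path stays cheap, then re-checks under a_contents before broadcasting, so a waiter cannot slip in between the unlocked check and the wakeup. A minimal userland sketch of the same pattern, assuming pthreads and purely illustrative names (wake_unmappers and unmap_wait are not in the kernel source):

#include <pthread.h>

static pthread_mutex_t a_contents = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t a_cv = PTHREAD_COND_INITIALIZER;
static volatile int unmap_wait;		/* stands in for AS_ISUNMAPWAIT() */

static void
wake_unmappers(void)
{
	if (unmap_wait) {			/* cheap unlocked peek */
		pthread_mutex_lock(&a_contents);
		if (unmap_wait) {		/* re-check under the lock */
			unmap_wait = 0;		/* AS_CLRUNMAPWAIT() */
			pthread_cond_broadcast(&a_cv);
		}
		pthread_mutex_unlock(&a_contents);
	}
}
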
1666 1638
1667 1639 int
1668 1640 segspt_shmattach(struct seg *seg, caddr_t *argsp)
1669 1641 {
1670 1642 struct shm_data *shmd_arg = (struct shm_data *)argsp;
1671 1643 struct shm_data *shmd;
1672 1644 struct anon_map *shm_amp = shmd_arg->shm_amp;
1673 1645 struct spt_data *sptd;
1674 1646 int error = 0;
1675 1647
1676 1648 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1677 1649
1678 1650 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1679 1651 if (shmd == NULL)
1680 1652 return (ENOMEM);
1681 1653
1682 1654 shmd->shm_sptas = shmd_arg->shm_sptas;
1683 1655 shmd->shm_amp = shm_amp;
1684 1656 shmd->shm_sptseg = shmd_arg->shm_sptseg;
1685 1657
1686 1658 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1687 1659 NULL, 0, seg->s_size);
1688 1660
1689 1661 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1690 1662
1691 1663 seg->s_data = (void *)shmd;
1692 1664 seg->s_ops = &segspt_shmops;
1693 1665 seg->s_szc = shmd->shm_sptseg->s_szc;
1694 1666 sptd = shmd->shm_sptseg->s_data;
1695 1667
1696 1668 if (sptd->spt_flags & SHM_PAGEABLE) {
1697 1669 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1698 1670 KM_NOSLEEP)) == NULL) {
1699 1671 seg->s_data = (void *)NULL;
1700 1672 kmem_free(shmd, (sizeof (*shmd)));
1701 1673 return (ENOMEM);
1702 1674 }
1703 1675 shmd->shm_lckpgs = 0;
1704 1676 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1705 1677 if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1706 1678 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1707 1679 seg->s_size, seg->s_szc)) != 0) {
1708 1680 kmem_free(shmd->shm_vpage,
1709 1681 btopr(shm_amp->size));
1710 1682 }
1711 1683 }
1712 1684 } else {
1713 1685 error = hat_share(seg->s_as->a_hat, seg->s_base,
1714 1686 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1715 1687 seg->s_size, seg->s_szc);
1716 1688 }
1717 1689 if (error) {
1718 1690 seg->s_szc = 0;
1719 1691 seg->s_data = (void *)NULL;
1720 1692 kmem_free(shmd, (sizeof (*shmd)));
1721 1693 } else {
1722 1694 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1723 1695 shm_amp->refcnt++;
1724 1696 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1725 1697 }
1726 1698 return (error);
1727 1699 }
1728 1700
1729 1701 int
1730 1702 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1731 1703 {
1732 1704 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1733 1705 int reclaim = 1;
1734 1706
1735 1707 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1736 1708 retry:
1737 1709 if (shmd->shm_softlockcnt > 0) {
1738 1710 if (reclaim == 1) {
1739 1711 segspt_purge(seg);
1740 1712 reclaim = 0;
1741 1713 goto retry;
1742 1714 }
1743 1715 return (EAGAIN);
1744 1716 }
1745 1717
1746 1718 if (ssize != seg->s_size) {
1747 1719 #ifdef DEBUG
1748 1720 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1749 1721 ssize, seg->s_size);
1750 1722 #endif
1751 1723 return (EINVAL);
1752 1724 }
1753 1725
1754 1726 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1755 1727 NULL, 0);
1756 1728 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1757 1729
1758 1730 seg_free(seg);
1759 1731
1760 1732 return (0);
1761 1733 }
1762 1734
1763 1735 void
1764 1736 segspt_shmfree(struct seg *seg)
1765 1737 {
1766 1738 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1767 1739 struct anon_map *shm_amp = shmd->shm_amp;
1768 1740
1769 1741 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1770 1742
1771 1743 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1772 1744 MC_UNLOCK, NULL, 0);
1773 1745
1774 1746 /*
1775 1747 * Need to increment refcnt when attaching
1776 1748 * and decrement when detaching because of dup().
1777 1749 */
1778 1750 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1779 1751 shm_amp->refcnt--;
1780 1752 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1781 1753
1782 1754 if (shmd->shm_vpage) { /* only for DISM */
1783 1755 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1784 1756 shmd->shm_vpage = NULL;
1785 1757 }
1786 1758
1787 1759 /*
1788 1760 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1789 1761 * still working with this segment without holding as lock.
1790 1762 */
1791 1763 ASSERT(shmd->shm_softlockcnt == 0);
1792 1764 mutex_enter(&shmd->shm_segfree_syncmtx);
1793 1765 mutex_destroy(&shmd->shm_segfree_syncmtx);
1794 1766
1795 1767 kmem_free(shmd, sizeof (*shmd));
1796 1768 }
1797 1769
1798 1770 /*ARGSUSED*/
1799 1771 int
1800 1772 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1801 1773 {
1802 1774 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1803 1775
1804 1776 /*
1805 1777 * Shared page table is more than shared mapping.
1806 1778 * Individual process sharing page tables can't change prot
1807 1779 * because there is only one set of page tables.
1808 1780 * This will be allowed after private page table is
1809 1781 * supported.
1810 1782 */
1811 1783 /* need to return correct status error? */
1812 1784 return (0);
1813 1785 }
1814 1786
1815 1787
1816 1788 faultcode_t
1817 1789 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1818 1790 size_t len, enum fault_type type, enum seg_rw rw)
1819 1791 {
1820 1792 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1821 1793 struct seg *sptseg = shmd->shm_sptseg;
1822 1794 struct as *curspt = shmd->shm_sptas;
1823 1795 struct spt_data *sptd = sptseg->s_data;
1824 1796 pgcnt_t npages;
1825 1797 size_t size;
1826 1798 caddr_t segspt_addr, shm_addr;
1827 1799 page_t **ppa;
1828 1800 int i;
1829 1801 ulong_t an_idx = 0;
1830 1802 int err = 0;
1831 1803 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1832 1804 size_t pgsz;
1833 1805 pgcnt_t pgcnt;
1834 1806 caddr_t a;
1835 1807 pgcnt_t pidx;
1836 1808
1837 1809 #ifdef lint
1838 1810 hat = hat;
1839 1811 #endif
1840 1812 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1841 1813
1842 1814 /*
1843 1815 	 * Because of the way spt is implemented,
1844 1816 	 * the realsize of the segment does not have to be
1845 1817 	 * equal to the segment size itself. The segment size is
1846 1818 	 * often in multiples of a page size larger than PAGESIZE.
1847 1819 	 * The realsize is rounded up to the nearest PAGESIZE
1848 1820 	 * based on what the user requested. This is a bit of
1849 1821 	 * ugliness that is historical but not easily fixed
1850 1822 * without re-designing the higher levels of ISM.
1851 1823 */
1852 1824 ASSERT(addr >= seg->s_base);
1853 1825 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1854 1826 return (FC_NOMAP);
1855 1827 /*
1856 1828 * For all of the following cases except F_PROT, we need to
1857 1829 * make any necessary adjustments to addr and len
1858 1830 * and get all of the necessary page_t's into an array called ppa[].
1859 1831 *
1860 1832 * The code in shmat() forces base addr and len of ISM segment
1861 1833 * to be aligned to largest page size supported. Therefore,
1862 1834 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
1863 1835 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
1864 1836 * in large pagesize chunks, or else we will screw up the HAT
1865 1837 * layer by calling hat_memload_array() with differing page sizes
1866 1838 * over a given virtual range.
1867 1839 */
1868 1840 pgsz = page_get_pagesize(sptseg->s_szc);
1869 1841 pgcnt = page_get_pagecnt(sptseg->s_szc);
1870 1842 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
1871 1843 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
1872 1844 npages = btopr(size);
1873 1845
1874 1846 /*
1875 1847 * Now we need to convert from addr in segshm to addr in segspt.
1876 1848 */
1877 1849 an_idx = seg_page(seg, shm_addr);
1878 1850 segspt_addr = sptseg->s_base + ptob(an_idx);
1879 1851
1880 1852 ASSERT((segspt_addr + ptob(npages)) <=
1881 1853 (sptseg->s_base + sptd->spt_realsize));
1882 1854 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
1883 1855
1884 1856 switch (type) {
1885 1857
1886 1858 case F_SOFTLOCK:
1887 1859
1888 1860 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
1889 1861 /*
1890 1862 * Fall through to the F_INVAL case to load up the hat layer
1891 1863 * entries with the HAT_LOAD_LOCK flag.
1892 1864 */
1893 1865 /* FALLTHRU */
1894 1866 case F_INVAL:
1895 1867
1896 1868 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
1897 1869 return (FC_NOMAP);
1898 1870
1899 1871 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1900 1872
1901 1873 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
1902 1874 if (err != 0) {
1903 1875 if (type == F_SOFTLOCK) {
1904 1876 atomic_add_long((ulong_t *)(
1905 1877 &(shmd->shm_softlockcnt)), -npages);
1906 1878 }
1907 1879 goto dism_err;
1908 1880 }
1909 1881 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1910 1882 a = segspt_addr;
1911 1883 pidx = 0;
1912 1884 if (type == F_SOFTLOCK) {
1913 1885
1914 1886 /*
1915 1887 * Load up the translation keeping it
1916 1888 * locked and don't unlock the page.
1917 1889 */
1918 1890 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1919 1891 hat_memload_array(sptseg->s_as->a_hat,
1920 1892 a, pgsz, &ppa[pidx], sptd->spt_prot,
1921 1893 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
1922 1894 }
1923 1895 } else {
1924 1896 /*
1925 1897 * Migrate pages marked for migration
1926 1898 */
1927 1899 if (lgrp_optimizations())
1928 1900 page_migrate(seg, shm_addr, ppa, npages);
1929 1901
1930 1902 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1931 1903 hat_memload_array(sptseg->s_as->a_hat,
1932 1904 a, pgsz, &ppa[pidx],
1933 1905 sptd->spt_prot,
1934 1906 HAT_LOAD_SHARE);
1935 1907 }
1936 1908
1937 1909 /*
1938 1910 * And now drop the SE_SHARED lock(s).
1939 1911 */
1940 1912 if (dyn_ism_unmap) {
1941 1913 for (i = 0; i < npages; i++) {
1942 1914 page_unlock(ppa[i]);
1943 1915 }
1944 1916 }
1945 1917 }
1946 1918
1947 1919 if (!dyn_ism_unmap) {
1948 1920 if (hat_share(seg->s_as->a_hat, shm_addr,
1949 1921 curspt->a_hat, segspt_addr, ptob(npages),
1950 1922 seg->s_szc) != 0) {
1951 1923 panic("hat_share err in DISM fault");
1952 1924 /* NOTREACHED */
1953 1925 }
1954 1926 if (type == F_INVAL) {
1955 1927 for (i = 0; i < npages; i++) {
1956 1928 page_unlock(ppa[i]);
1957 1929 }
1958 1930 }
1959 1931 }
1960 1932 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1961 1933 dism_err:
1962 1934 kmem_free(ppa, npages * sizeof (page_t *));
1963 1935 return (err);
1964 1936
1965 1937 case F_SOFTUNLOCK:
1966 1938
1967 1939 /*
1968 1940 	 * This is a bit ugly: we pass in the real seg pointer,
1969 1941 * but the segspt_addr is the virtual address within the
1970 1942 * dummy seg.
1971 1943 */
1972 1944 segspt_softunlock(seg, segspt_addr, size, rw);
1973 1945 return (0);
1974 1946
1975 1947 case F_PROT:
1976 1948
1977 1949 /*
1978 1950 * This takes care of the unusual case where a user
1979 1951 * allocates a stack in shared memory and a register
1980 1952 * window overflow is written to that stack page before
1981 1953 * it is otherwise modified.
1982 1954 *
1983 1955 * We can get away with this because ISM segments are
1984 1956 * always rw. Other than this unusual case, there
1985 1957 * should be no instances of protection violations.
1986 1958 */
1987 1959 return (0);
1988 1960
1989 1961 default:
1990 1962 #ifdef DEBUG
1991 1963 panic("segspt_dismfault default type?");
1992 1964 #else
1993 1965 return (FC_NOMAP);
1994 1966 #endif
1995 1967 }
1996 1968 }
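
To make the alignment arithmetic in the F_SOFTLOCK/F_INVAL path concrete, here is a small standalone sketch of how shm_addr and size are derived, assuming a 4 MB large page and made-up addresses; the macro definitions mirror the usual sys/sysmacros.h forms:

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, a)	((x) & -(a))
#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))

int
main(void)
{
	uintptr_t pgsz = 4UL * 1024 * 1024;	/* page_get_pagesize(szc) */
	uintptr_t addr = 0x40123000;		/* faulting address */
	uintptr_t len = 0x2000;			/* fault length */

	uintptr_t shm_addr = P2ALIGN(addr, pgsz);
	uintptr_t size = P2ROUNDUP(addr + len - shm_addr, pgsz);

	/* prints shm_addr 0x40000000 size 0x400000: one 4 MB chunk */
	printf("shm_addr %#lx size %#lx\n",
	    (unsigned long)shm_addr, (unsigned long)size);
	return (0);
}
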
1997 1969
1998 1970
1999 1971 faultcode_t
2000 1972 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2001 1973 size_t len, enum fault_type type, enum seg_rw rw)
2002 1974 {
2003 1975 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2004 1976 struct seg *sptseg = shmd->shm_sptseg;
2005 1977 struct as *curspt = shmd->shm_sptas;
2006 1978 struct spt_data *sptd = sptseg->s_data;
2007 1979 pgcnt_t npages;
2008 1980 size_t size;
2009 1981 caddr_t sptseg_addr, shm_addr;
2010 1982 page_t *pp, **ppa;
2011 1983 int i;
2012 1984 u_offset_t offset;
2013 1985 ulong_t anon_index = 0;
2014 1986 struct vnode *vp;
2015 1987 struct anon_map *amp; /* XXX - for locknest */
2016 1988 struct anon *ap = NULL;
2017 1989 size_t pgsz;
2018 1990 pgcnt_t pgcnt;
2019 1991 caddr_t a;
2020 1992 pgcnt_t pidx;
2021 1993 size_t sz;
2022 1994
2023 1995 #ifdef lint
2024 1996 hat = hat;
2025 1997 #endif
2026 1998
2027 1999 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2028 2000
2029 2001 if (sptd->spt_flags & SHM_PAGEABLE) {
2030 2002 return (segspt_dismfault(hat, seg, addr, len, type, rw));
2031 2003 }
2032 2004
2033 2005 /*
2034 2006 	 * Because of the way spt is implemented,
2035 2007 	 * the realsize of the segment does not have to be
2036 2008 	 * equal to the segment size itself. The segment size is
2037 2009 	 * often in multiples of a page size larger than PAGESIZE.
2038 2010 	 * The realsize is rounded up to the nearest PAGESIZE
2039 2011 	 * based on what the user requested. This is a bit of
2040 2012 	 * ugliness that is historical but not easily fixed
2041 2013 * without re-designing the higher levels of ISM.
2042 2014 */
2043 2015 ASSERT(addr >= seg->s_base);
2044 2016 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2045 2017 return (FC_NOMAP);
2046 2018 /*
2047 2019 * For all of the following cases except F_PROT, we need to
2048 2020 * make any necessary adjustments to addr and len
2049 2021 * and get all of the necessary page_t's into an array called ppa[].
2050 2022 *
2051 2023 * The code in shmat() forces base addr and len of ISM segment
2052 2024 * to be aligned to largest page size supported. Therefore,
2053 2025 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2054 2026 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2055 2027 * in large pagesize chunks, or else we will screw up the HAT
2056 2028 * layer by calling hat_memload_array() with differing page sizes
2057 2029 * over a given virtual range.
2058 2030 */
2059 2031 pgsz = page_get_pagesize(sptseg->s_szc);
2060 2032 pgcnt = page_get_pagecnt(sptseg->s_szc);
2061 2033 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2062 2034 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2063 2035 npages = btopr(size);
2064 2036
2065 2037 /*
2066 2038 * Now we need to convert from addr in segshm to addr in segspt.
2067 2039 */
2068 2040 anon_index = seg_page(seg, shm_addr);
2069 2041 sptseg_addr = sptseg->s_base + ptob(anon_index);
2070 2042
2071 2043 /*
2072 2044 * And now we may have to adjust npages downward if we have
2073 2045 * exceeded the realsize of the segment or initial anon
2074 2046 * allocations.
2075 2047 */
2076 2048 if ((sptseg_addr + ptob(npages)) >
2077 2049 (sptseg->s_base + sptd->spt_realsize))
2078 2050 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2079 2051
2080 2052 npages = btopr(size);
2081 2053
2082 2054 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2083 2055 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2084 2056
2085 2057 switch (type) {
2086 2058
2087 2059 case F_SOFTLOCK:
2088 2060
2089 2061 /*
2090 2062 * availrmem is decremented once during anon_swap_adjust()
2091 2063 * and is incremented during the anon_unresv(), which is
2092 2064 * called from shm_rm_amp() when the segment is destroyed.
2093 2065 */
2094 2066 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2095 2067 /*
2096 2068 * Some platforms assume that ISM pages are SE_SHARED
2097 2069 * locked for the entire life of the segment.
2098 2070 */
2099 2071 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2100 2072 return (0);
2101 2073 /*
2102 2074 * Fall through to the F_INVAL case to load up the hat layer
2103 2075 * entries with the HAT_LOAD_LOCK flag.
2104 2076 */
2105 2077
2106 2078 /* FALLTHRU */
2107 2079 case F_INVAL:
2108 2080
2109 2081 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2110 2082 return (FC_NOMAP);
2111 2083
2112 2084 /*
2113 2085 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2114 2086 * may still rely on this call to hat_share(). That
2115 2087 		 * would imply that those hats can fault on a
2116 2088 * HAT_LOAD_LOCK translation, which would seem
2117 2089 * contradictory.
2118 2090 */
2119 2091 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2120 2092 if (hat_share(seg->s_as->a_hat, seg->s_base,
2121 2093 curspt->a_hat, sptseg->s_base,
2122 2094 sptseg->s_size, sptseg->s_szc) != 0) {
2123 2095 panic("hat_share error in ISM fault");
2124 2096 /*NOTREACHED*/
2125 2097 }
2126 2098 return (0);
2127 2099 }
2128 2100 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2129 2101
2130 2102 /*
2131 2103 	 * I see no need to lock the real seg
2132 2104 * here, because all of our work will be on the underlying
2133 2105 * dummy seg.
2134 2106 *
2135 2107 * sptseg_addr and npages now account for large pages.
2136 2108 */
2137 2109 amp = sptd->spt_amp;
2138 2110 ASSERT(amp != NULL);
2139 2111 anon_index = seg_page(sptseg, sptseg_addr);
2140 2112
2141 2113 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2142 2114 for (i = 0; i < npages; i++) {
2143 2115 ap = anon_get_ptr(amp->ahp, anon_index++);
2144 2116 ASSERT(ap != NULL);
2145 2117 swap_xlate(ap, &vp, &offset);
2146 2118 pp = page_lookup(vp, offset, SE_SHARED);
2147 2119 ASSERT(pp != NULL);
2148 2120 ppa[i] = pp;
2149 2121 }
2150 2122 		ANON_LOCK_EXIT(&amp->a_rwlock);
2151 2123 ASSERT(i == npages);
2152 2124
2153 2125 /*
2154 2126 * We are already holding the as->a_lock on the user's
2155 2127 * real segment, but we need to hold the a_lock on the
2156 2128 * underlying dummy as. This is mostly to satisfy the
2157 2129 * underlying HAT layer.
2158 2130 */
2159 2131 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
2160 2132 a = sptseg_addr;
2161 2133 pidx = 0;
2162 2134 if (type == F_SOFTLOCK) {
2163 2135 /*
2164 2136 * Load up the translation keeping it
2165 2137 * locked and don't unlock the page.
2166 2138 */
2167 2139 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2168 2140 sz = MIN(pgsz, ptob(npages - pidx));
2169 2141 hat_memload_array(sptseg->s_as->a_hat, a,
2170 2142 sz, &ppa[pidx], sptd->spt_prot,
2171 2143 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2172 2144 }
2173 2145 } else {
2174 2146 /*
2175 2147 * Migrate pages marked for migration.
2176 2148 */
2177 2149 if (lgrp_optimizations())
2178 2150 page_migrate(seg, shm_addr, ppa, npages);
2179 2151
2180 2152 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2181 2153 sz = MIN(pgsz, ptob(npages - pidx));
2182 2154 hat_memload_array(sptseg->s_as->a_hat,
2183 2155 a, sz, &ppa[pidx],
2184 2156 sptd->spt_prot, HAT_LOAD_SHARE);
2185 2157 }
2186 2158
2187 2159 /*
2188 2160 * And now drop the SE_SHARED lock(s).
2189 2161 */
2190 2162 for (i = 0; i < npages; i++)
2191 2163 page_unlock(ppa[i]);
2192 2164 }
2193 2165 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
2194 2166
2195 2167 kmem_free(ppa, sizeof (page_t *) * npages);
2196 2168 return (0);
2197 2169 case F_SOFTUNLOCK:
2198 2170
2199 2171 		 * This is a bit ugly: we pass in the real seg pointer,
2200 2172 * This is a bit ugly, we pass in the real seg pointer,
2201 2173 * but the sptseg_addr is the virtual address within the
2202 2174 * dummy seg.
2203 2175 */
2204 2176 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2205 2177 return (0);
2206 2178
2207 2179 case F_PROT:
2208 2180
2209 2181 /*
2210 2182 * This takes care of the unusual case where a user
2211 2183 * allocates a stack in shared memory and a register
2212 2184 * window overflow is written to that stack page before
2213 2185 * it is otherwise modified.
2214 2186 *
2215 2187 * We can get away with this because ISM segments are
2216 2188 * always rw. Other than this unusual case, there
2217 2189 * should be no instances of protection violations.
2218 2190 */
2219 2191 return (0);
2220 2192
2221 2193 default:
2222 2194 #ifdef DEBUG
2223 2195 cmn_err(CE_WARN, "segspt_shmfault default type?");
2224 2196 #endif
2225 2197 return (FC_NOMAP);
2226 2198 }
2227 2199 }
2228 2200
2229 2201 /*ARGSUSED*/
2230 2202 static faultcode_t
2231 2203 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2232 2204 {
2233 2205 return (0);
2234 2206 }
2235 2207
2236 2208 /*ARGSUSED*/
2237 2209 static int
2238 2210 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2239 2211 {
2240 2212 return (0);
2241 2213 }
2242 2214
2243 2215 /*
2244 2216 * duplicate the shared page tables
2245 2217 */
2246 2218 int
2247 2219 segspt_shmdup(struct seg *seg, struct seg *newseg)
2248 2220 {
2249 2221 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2250 2222 struct anon_map *amp = shmd->shm_amp;
2251 2223 struct shm_data *shmd_new;
2252 2224 struct seg *spt_seg = shmd->shm_sptseg;
2253 2225 struct spt_data *sptd = spt_seg->s_data;
2254 2226 int error = 0;
2255 2227
2256 2228 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
2257 2229
2258 2230 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2259 2231 newseg->s_data = (void *)shmd_new;
2260 2232 shmd_new->shm_sptas = shmd->shm_sptas;
2261 2233 shmd_new->shm_amp = amp;
2262 2234 shmd_new->shm_sptseg = shmd->shm_sptseg;
2263 2235 newseg->s_ops = &segspt_shmops;
2264 2236 newseg->s_szc = seg->s_szc;
2265 2237 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2266 2238
2267 2239 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2268 2240 	amp->refcnt++;
2269 2241 	ANON_LOCK_EXIT(&amp->a_rwlock);
2270 2242
2271 2243 if (sptd->spt_flags & SHM_PAGEABLE) {
2272 2244 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2273 2245 shmd_new->shm_lckpgs = 0;
2274 2246 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2275 2247 if ((error = hat_share(newseg->s_as->a_hat,
2276 2248 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2277 2249 seg->s_size, seg->s_szc)) != 0) {
2278 2250 kmem_free(shmd_new->shm_vpage,
2279 2251 btopr(amp->size));
2280 2252 }
2281 2253 }
2282 2254 return (error);
2283 2255 } else {
2284 2256 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2285 2257 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2286 2258 seg->s_szc));
2287 2259
2288 2260 }
2289 2261 }
2290 2262
2291 2263 /*ARGSUSED*/
2292 2264 int
2293 2265 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2294 2266 {
2295 2267 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2296 2268 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2297 2269
2298 2270 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2299 2271
2300 2272 /*
2301 2273 * ISM segment is always rw.
2302 2274 */
2303 2275 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2304 2276 }
2305 2277
2306 2278 /*
2307 2279 * Return an array of locked large pages, for empty slots allocate
2308 2280 * private zero-filled anon pages.
2309 2281 */
2310 2282 static int
2311 2283 spt_anon_getpages(
2312 2284 struct seg *sptseg,
2313 2285 caddr_t sptaddr,
2314 2286 size_t len,
2315 2287 page_t *ppa[])
2316 2288 {
2317 2289 struct spt_data *sptd = sptseg->s_data;
2318 2290 struct anon_map *amp = sptd->spt_amp;
2319 2291 enum seg_rw rw = sptd->spt_prot;
2320 2292 uint_t szc = sptseg->s_szc;
2321 2293 size_t pg_sz, share_sz = page_get_pagesize(szc);
2322 2294 pgcnt_t lp_npgs;
2323 2295 caddr_t lp_addr, e_sptaddr;
2324 2296 uint_t vpprot, ppa_szc = 0;
2325 2297 struct vpage *vpage = NULL;
2326 2298 ulong_t j, ppa_idx;
2327 2299 int err, ierr = 0;
2328 2300 pgcnt_t an_idx;
2329 2301 anon_sync_obj_t cookie;
2330 2302 int anon_locked = 0;
2331 2303 pgcnt_t amp_pgs;
2332 2304
2333 2305
2334 2306 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2335 2307 ASSERT(len != 0);
2336 2308
2337 2309 pg_sz = share_sz;
2338 2310 lp_npgs = btop(pg_sz);
2339 2311 lp_addr = sptaddr;
2340 2312 e_sptaddr = sptaddr + len;
2341 2313 an_idx = seg_page(sptseg, sptaddr);
2342 2314 ppa_idx = 0;
2343 2315
2344 2316 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2345 2317
2346 2318 amp_pgs = page_get_pagecnt(amp->a_szc);
2347 2319
2348 2320 /*CONSTCOND*/
2349 2321 while (1) {
2350 2322 for (; lp_addr < e_sptaddr;
2351 2323 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2352 2324
2353 2325 /*
2354 2326 * If we're currently locked, and we get to a new
2355 2327 * page, unlock our current anon chunk.
2356 2328 */
2357 2329 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2358 2330 anon_array_exit(&cookie);
2359 2331 anon_locked = 0;
2360 2332 }
2361 2333 if (!anon_locked) {
2362 2334 anon_array_enter(amp, an_idx, &cookie);
2363 2335 anon_locked = 1;
2364 2336 }
2365 2337 ppa_szc = (uint_t)-1;
2366 2338 ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2367 2339 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2368 2340 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2369 2341
2370 2342 if (ierr != 0) {
2371 2343 if (ierr > 0) {
2372 2344 err = FC_MAKE_ERR(ierr);
2373 2345 goto lpgs_err;
2374 2346 }
2375 2347 break;
2376 2348 }
2377 2349 }
2378 2350 if (lp_addr == e_sptaddr) {
2379 2351 break;
2380 2352 }
2381 2353 ASSERT(lp_addr < e_sptaddr);
2382 2354
2383 2355 /*
2384 2356 * ierr == -1 means we failed to allocate a large page.
2385 2357 * so do a size down operation.
2386 2358 *
2387 2359 * ierr == -2 means some other process that privately shares
2388 2360 * pages with this process has allocated a larger page and we
2389 2361 * need to retry with larger pages. So do a size up
2390 2362 * operation. This relies on the fact that large pages are
2391 2363 * never partially shared i.e. if we share any constituent
2392 2364 * page of a large page with another process we must share the
2393 2365 * entire large page. Note this cannot happen for SOFTLOCK
2394 2366 		 * case, unless current address (lp_addr) is at the beginning
2395 2367 * of the next page size boundary because the other process
2396 2368 * couldn't have relocated locked pages.
2397 2369 */
2398 2370 ASSERT(ierr == -1 || ierr == -2);
2399 2371 if (segvn_anypgsz) {
2400 2372 ASSERT(ierr == -2 || szc != 0);
2401 2373 ASSERT(ierr == -1 || szc < sptseg->s_szc);
2402 2374 szc = (ierr == -1) ? szc - 1 : szc + 1;
2403 2375 } else {
2404 2376 /*
2405 2377 * For faults and segvn_anypgsz == 0
2406 2378 * we need to be careful not to loop forever
2407 2379 * if existing page is found with szc other
2408 2380 * than 0 or seg->s_szc. This could be due
2409 2381 * to page relocations on behalf of DR or
2410 2382 * more likely large page creation. For this
2411 2383 * case simply re-size to existing page's szc
2412 2384 * if returned by anon_map_getpages().
2413 2385 */
2414 2386 if (ppa_szc == (uint_t)-1) {
2415 2387 szc = (ierr == -1) ? 0 : sptseg->s_szc;
2416 2388 } else {
2417 2389 ASSERT(ppa_szc <= sptseg->s_szc);
2418 2390 ASSERT(ierr == -2 || ppa_szc < szc);
2419 2391 ASSERT(ierr == -1 || ppa_szc > szc);
2420 2392 szc = ppa_szc;
2421 2393 }
2422 2394 }
2423 2395 pg_sz = page_get_pagesize(szc);
2424 2396 lp_npgs = btop(pg_sz);
2425 2397 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2426 2398 }
2427 2399 if (anon_locked) {
2428 2400 anon_array_exit(&cookie);
2429 2401 }
2430 2402 	ANON_LOCK_EXIT(&amp->a_rwlock);
2431 2403 return (0);
2432 2404
2433 2405 lpgs_err:
2434 2406 if (anon_locked) {
2435 2407 anon_array_exit(&cookie);
2436 2408 }
2437 2409 	ANON_LOCK_EXIT(&amp->a_rwlock);
2438 2410 for (j = 0; j < ppa_idx; j++)
2439 2411 page_unlock(ppa[j]);
2440 2412 return (err);
2441 2413 }
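
The size-up/size-down retry logic above can be condensed into a short sketch; next_szc is a hypothetical helper, not in the source, with anypgsz standing in for segvn_anypgsz. An ierr of -1 (large-page allocation failed) steps the size class down, -2 (another process already shares a larger page) steps it up, and when anypgsz is clear the size instead snaps to the existing page's class reported in ppa_szc:

static unsigned int
next_szc(int ierr, unsigned int szc, unsigned int ppa_szc,
    unsigned int seg_szc, int anypgsz)
{
	if (anypgsz)
		return ((ierr == -1) ? szc - 1 : szc + 1);
	if (ppa_szc == (unsigned int)-1)	/* no existing page found */
		return ((ierr == -1) ? 0 : seg_szc);
	return (ppa_szc);			/* match the existing page */
}
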
2442 2414
2443 2415 /*
2444 2416 * count the number of bytes in a set of spt pages that are currently not
2445 2417 * locked
2446 2418 */
2447 2419 static rctl_qty_t
2448 2420 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2449 2421 {
2450 2422 ulong_t i;
2451 2423 rctl_qty_t unlocked = 0;
2452 2424
2453 2425 for (i = 0; i < npages; i++) {
2454 2426 if (ppa[i]->p_lckcnt == 0)
2455 2427 unlocked += PAGESIZE;
2456 2428 }
2457 2429 return (unlocked);
2458 2430 }
2459 2431
2460 2432 extern u_longlong_t randtick(void);
2461 2433 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2462 2434 #define NLCK (NCPU_P2)
2463 2435 /* Random number with a range [0, n-1], n must be power of two */
2464 2436 #define RAND_P2(n) \
2465 2437 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
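
As spt_lockpages() below uses them, these two macros produce a per-refill batch size in the range [NLCK, 2*NLCK), randomized per thread so that competing threads exhaust and refill their freemem_lock reservations at staggered times (NCPU_P2 keeps NLCK a power of two, which RAND_P2 requires). A sketch of the computation as a hypothetical helper, not part of the source:

static pgcnt_t
lock_batch_size(pgcnt_t pages_left)
{
	pgcnt_t nlck = NLCK + RAND_P2(NLCK);	/* NLCK <= nlck < 2*NLCK */

	return (MIN(nlck, pages_left));		/* never over-reserve */
}
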
2466 2438
2467 2439 int
2468 2440 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2469 2441 page_t **ppa, ulong_t *lockmap, size_t pos,
2470 2442 rctl_qty_t *locked)
2471 2443 {
2472 2444 struct shm_data *shmd = seg->s_data;
2473 2445 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2474 2446 ulong_t i;
2475 2447 int kernel;
2476 2448 pgcnt_t nlck = 0;
2477 2449 int rv = 0;
2478 2450 int use_reserved = 1;
2479 2451
2480 2452 /* return the number of bytes actually locked */
2481 2453 *locked = 0;
2482 2454
2483 2455 /*
2484 2456 * To avoid contention on freemem_lock, availrmem and pages_locked
2485 2457 * global counters are updated only every nlck locked pages instead of
2486 2458 * every time. Reserve nlck locks up front and deduct from this
2487 2459 * reservation for each page that requires a lock. When the reservation
2488 2460 * is consumed, reserve again. nlck is randomized, so the competing
2489 2461 * threads do not fall into a cyclic lock contention pattern. When
2490 2462 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2491 2463 * is used to lock pages.
2492 2464 */
2493 2465 for (i = 0; i < npages; anon_index++, pos++, i++) {
2494 2466 if (nlck == 0 && use_reserved == 1) {
2495 2467 nlck = NLCK + RAND_P2(NLCK);
2496 2468 /* if fewer loops left, decrease nlck */
2497 2469 nlck = MIN(nlck, npages - i);
2498 2470 /*
2499 2471 * Reserve nlck locks up front and deduct from this
2500 2472 * reservation for each page that requires a lock. When
2501 2473 * the reservation is consumed, reserve again.
2502 2474 */
2503 2475 mutex_enter(&freemem_lock);
2504 2476 if ((availrmem - nlck) < pages_pp_maximum) {
2505 2477 /* Do not do advance memory reserves */
2506 2478 use_reserved = 0;
2507 2479 } else {
2508 2480 availrmem -= nlck;
2509 2481 pages_locked += nlck;
2510 2482 }
2511 2483 mutex_exit(&freemem_lock);
2512 2484 }
2513 2485 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2514 2486 if (sptd->spt_ppa_lckcnt[anon_index] <
2515 2487 (ushort_t)DISM_LOCK_MAX) {
2516 2488 if (++sptd->spt_ppa_lckcnt[anon_index] ==
2517 2489 (ushort_t)DISM_LOCK_MAX) {
2518 2490 cmn_err(CE_WARN,
2519 2491 "DISM page lock limit "
2520 2492 "reached on DISM offset 0x%lx\n",
2521 2493 anon_index << PAGESHIFT);
2522 2494 }
2523 2495 kernel = (sptd->spt_ppa &&
2524 2496 sptd->spt_ppa[anon_index]);
2525 2497 if (!page_pp_lock(ppa[i], 0, kernel ||
2526 2498 use_reserved)) {
2527 2499 sptd->spt_ppa_lckcnt[anon_index]--;
2528 2500 rv = EAGAIN;
2529 2501 break;
2530 2502 }
2531 2503 /* if this is a newly locked page, count it */
2532 2504 if (ppa[i]->p_lckcnt == 1) {
2533 2505 if (kernel == 0 && use_reserved == 1)
2534 2506 nlck--;
2535 2507 *locked += PAGESIZE;
2536 2508 }
2537 2509 shmd->shm_lckpgs++;
2538 2510 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2539 2511 if (lockmap != NULL)
2540 2512 BT_SET(lockmap, pos);
2541 2513 }
2542 2514 }
2543 2515 }
2544 2516 /* Return unused lock reservation */
2545 2517 if (nlck != 0 && use_reserved == 1) {
2546 2518 mutex_enter(&freemem_lock);
2547 2519 availrmem += nlck;
2548 2520 pages_locked -= nlck;
2549 2521 mutex_exit(&freemem_lock);
2550 2522 }
2551 2523
2552 2524 return (rv);
2553 2525 }
2554 2526
2555 2527 int
2556 2528 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2557 2529 rctl_qty_t *unlocked)
2558 2530 {
2559 2531 struct shm_data *shmd = seg->s_data;
2560 2532 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2561 2533 struct anon_map *amp = sptd->spt_amp;
2562 2534 struct anon *ap;
2563 2535 struct vnode *vp;
2564 2536 u_offset_t off;
2565 2537 struct page *pp;
2566 2538 int kernel;
2567 2539 anon_sync_obj_t cookie;
2568 2540 ulong_t i;
2569 2541 pgcnt_t nlck = 0;
2570 2542 pgcnt_t nlck_limit = NLCK;
2571 2543
2572 2544 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2573 2545 for (i = 0; i < npages; i++, anon_index++) {
2574 2546 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2575 2547 anon_array_enter(amp, anon_index, &cookie);
2576 2548 ap = anon_get_ptr(amp->ahp, anon_index);
2577 2549 ASSERT(ap);
2578 2550
2579 2551 swap_xlate(ap, &vp, &off);
2580 2552 anon_array_exit(&cookie);
2581 2553 pp = page_lookup(vp, off, SE_SHARED);
2582 2554 ASSERT(pp);
2583 2555 /*
2584 2556 			 * availrmem is decremented only for pages which are not
2585 2557 			 * in seg pcache; for pages in seg pcache, availrmem was
2586 2558 			 * decremented in _dismpagelock()
2587 2559 */
2588 2560 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2589 2561 ASSERT(pp->p_lckcnt > 0);
2590 2562
2591 2563 /*
2592 2564 			 * unlock page but do not change availrmem; we do it
2593 2565 * ourselves every nlck loops.
2594 2566 */
2595 2567 page_pp_unlock(pp, 0, 1);
2596 2568 if (pp->p_lckcnt == 0) {
2597 2569 if (kernel == 0)
2598 2570 nlck++;
2599 2571 *unlocked += PAGESIZE;
2600 2572 }
2601 2573 page_unlock(pp);
2602 2574 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2603 2575 sptd->spt_ppa_lckcnt[anon_index]--;
2604 2576 shmd->shm_lckpgs--;
2605 2577 }
2606 2578
2607 2579 /*
2608 2580 * To reduce freemem_lock contention, do not update availrmem
2609 2581 * until at least NLCK pages have been unlocked.
2610 2582 * 1. No need to update if nlck is zero
2611 2583 * 2. Always update if the last iteration
2612 2584 */
2613 2585 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2614 2586 mutex_enter(&freemem_lock);
2615 2587 availrmem += nlck;
2616 2588 pages_locked -= nlck;
2617 2589 mutex_exit(&freemem_lock);
2618 2590 nlck = 0;
2619 2591 nlck_limit = NLCK + RAND_P2(NLCK);
2620 2592 }
2621 2593 }
2622 2594 	ANON_LOCK_EXIT(&amp->a_rwlock);
2623 2595
2624 2596 return (0);
2625 2597 }
2626 2598
2627 2599 /*ARGSUSED*/
2628 2600 static int
2629 2601 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2630 2602 int attr, int op, ulong_t *lockmap, size_t pos)
2631 2603 {
2632 2604 struct shm_data *shmd = seg->s_data;
2633 2605 struct seg *sptseg = shmd->shm_sptseg;
2634 2606 struct spt_data *sptd = sptseg->s_data;
2635 2607 struct kshmid *sp = sptd->spt_amp->a_sp;
2636 2608 pgcnt_t npages, a_npages;
2637 2609 page_t **ppa;
2638 2610 pgcnt_t an_idx, a_an_idx, ppa_idx;
2639 2611 caddr_t spt_addr, a_addr; /* spt and aligned address */
2640 2612 size_t a_len; /* aligned len */
2641 2613 size_t share_sz;
2642 2614 ulong_t i;
2643 2615 int sts = 0;
2644 2616 rctl_qty_t unlocked = 0;
2645 2617 rctl_qty_t locked = 0;
2646 2618 struct proc *p = curproc;
2647 2619 kproject_t *proj;
2648 2620
2649 2621 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2650 2622 ASSERT(sp != NULL);
2651 2623
2652 2624 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2653 2625 return (0);
2654 2626 }
2655 2627
2656 2628 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2657 2629 an_idx = seg_page(seg, addr);
2658 2630 npages = btopr(len);
2659 2631
2660 2632 if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2661 2633 return (ENOMEM);
2662 2634 }
2663 2635
2664 2636 /*
2665 2637 * A shm's project never changes, so no lock needed.
2666 2638 * The shm has a hold on the project, so it will not go away.
2667 2639 * Since we have a mapping to shm within this zone, we know
2668 2640 * that the zone will not go away.
2669 2641 */
2670 2642 proj = sp->shm_perm.ipc_proj;
2671 2643
2672 2644 if (op == MC_LOCK) {
2673 2645
2674 2646 /*
2675 2647 * Need to align addr and size request if they are not
2676 2648 		 * aligned so we can always allocate large page(s); however,
2677 2649 		 * we only lock what was requested in the initial request.
2678 2650 */
2679 2651 share_sz = page_get_pagesize(sptseg->s_szc);
2680 2652 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2681 2653 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2682 2654 share_sz);
2683 2655 a_npages = btop(a_len);
2684 2656 a_an_idx = seg_page(seg, a_addr);
2685 2657 spt_addr = sptseg->s_base + ptob(a_an_idx);
2686 2658 ppa_idx = an_idx - a_an_idx;
2687 2659
2688 2660 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2689 2661 KM_NOSLEEP)) == NULL) {
2690 2662 return (ENOMEM);
2691 2663 }
2692 2664
2693 2665 /*
2694 2666 * Don't cache any new pages for IO and
2695 2667 * flush any cached pages.
2696 2668 */
2697 2669 mutex_enter(&sptd->spt_lock);
2698 2670 if (sptd->spt_ppa != NULL)
2699 2671 sptd->spt_flags |= DISM_PPA_CHANGED;
2700 2672
2701 2673 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2702 2674 if (sts != 0) {
2703 2675 mutex_exit(&sptd->spt_lock);
2704 2676 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2705 2677 return (sts);
2706 2678 }
2707 2679
2708 2680 mutex_enter(&sp->shm_mlock);
2709 2681 /* enforce locked memory rctl */
2710 2682 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2711 2683
2712 2684 mutex_enter(&p->p_lock);
2713 2685 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2714 2686 mutex_exit(&p->p_lock);
2715 2687 sts = EAGAIN;
2716 2688 } else {
2717 2689 mutex_exit(&p->p_lock);
2718 2690 sts = spt_lockpages(seg, an_idx, npages,
2719 2691 &ppa[ppa_idx], lockmap, pos, &locked);
2720 2692
2721 2693 /*
2722 2694 * correct locked count if not all pages could be
2723 2695 * locked
2724 2696 */
2725 2697 if ((unlocked - locked) > 0) {
2726 2698 rctl_decr_locked_mem(NULL, proj,
2727 2699 (unlocked - locked), 0);
2728 2700 }
2729 2701 }
2730 2702 /*
2731 2703 * unlock pages
2732 2704 */
2733 2705 for (i = 0; i < a_npages; i++)
2734 2706 page_unlock(ppa[i]);
2735 2707 if (sptd->spt_ppa != NULL)
2736 2708 sptd->spt_flags |= DISM_PPA_CHANGED;
2737 2709 mutex_exit(&sp->shm_mlock);
2738 2710 mutex_exit(&sptd->spt_lock);
2739 2711
2740 2712 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2741 2713
2742 2714 } else if (op == MC_UNLOCK) { /* unlock */
2743 2715 page_t **ppa;
2744 2716
2745 2717 mutex_enter(&sptd->spt_lock);
2746 2718 if (shmd->shm_lckpgs == 0) {
2747 2719 mutex_exit(&sptd->spt_lock);
2748 2720 return (0);
2749 2721 }
2750 2722 /*
2751 2723 * Don't cache new IO pages.
2752 2724 */
2753 2725 if (sptd->spt_ppa != NULL)
2754 2726 sptd->spt_flags |= DISM_PPA_CHANGED;
2755 2727
2756 2728 mutex_enter(&sp->shm_mlock);
2757 2729 sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2758 2730 if ((ppa = sptd->spt_ppa) != NULL)
2759 2731 sptd->spt_flags |= DISM_PPA_CHANGED;
2760 2732 mutex_exit(&sptd->spt_lock);
2761 2733
2762 2734 rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2763 2735 mutex_exit(&sp->shm_mlock);
2764 2736
2765 2737 if (ppa != NULL)
2766 2738 seg_ppurge_wiredpp(ppa);
2767 2739 }
2768 2740 return (sts);
2769 2741 }
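
For context, segspt_shmlockop() is normally reached from userland when a process locks a DISM segment. A minimal sketch of that path, assuming a Solaris-style sys/shm.h where SHM_PAGEABLE is the shmat() flag that requests DISM; error handling is abbreviated:

#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
	size_t len = 8 * 1024 * 1024;
	int id = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);

	if (id == -1) {
		perror("shmget");
		return (1);
	}
	/* SHM_PAGEABLE asks for DISM rather than classic ISM */
	void *p = shmat(id, NULL, SHM_PAGEABLE);
	if (p == (void *)-1) {
		perror("shmat");
		return (1);
	}
	/* drives as_ctl(MC_LOCK) and ends up in segspt_shmlockop() */
	if (mlock(p, len) != 0)
		perror("mlock");
	(void) shmctl(id, IPC_RMID, NULL);
	return (0);
}
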
2770 2742
2771 2743 /*ARGSUSED*/
2772 2744 int
2773 2745 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2774 2746 {
2775 2747 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2776 2748 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2777 2749 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2778 2750
2779 2751 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2780 2752
2781 2753 /*
2782 2754 * ISM segment is always rw.
2783 2755 */
2784 2756 while (--pgno >= 0)
2785 2757 *protv++ = sptd->spt_prot;
2786 2758 return (0);
2787 2759 }
2788 2760
2789 2761 /*ARGSUSED*/
2790 2762 u_offset_t
2791 2763 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2792 2764 {
2793 2765 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2794 2766
2795 2767 /* Offset does not matter in ISM memory */
2796 2768
2797 2769 return ((u_offset_t)0);
2798 2770 }
2799 2771
2800 2772 /* ARGSUSED */
2801 2773 int
2802 2774 segspt_shmgettype(struct seg *seg, caddr_t addr)
2803 2775 {
2804 2776 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2805 2777 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2806 2778
2807 2779 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2808 2780
2809 2781 /*
2810 2782 	 * The shared memory mapping is always MAP_SHARED; swap is only
2811 2783 	 * reserved for DISM.
2812 2784 */
2813 2785 return (MAP_SHARED |
2814 2786 ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2815 2787 }
2816 2788
2817 2789 /*ARGSUSED*/
2818 2790 int
2819 2791 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2820 2792 {
2821 2793 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2822 2794 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2823 2795
2824 2796 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2825 2797
2826 2798 *vpp = sptd->spt_vp;
2827 2799 return (0);
2828 2800 }
2829 2801
2830 2802 /*
2831 2803 * We need to wait for pending IO to complete to a DISM segment in order for
2832 2804 * pages to get kicked out of the seg_pcache. 120 seconds should be more
2833 2805 * than enough time to wait.
2834 2806 */
2835 2807 static clock_t spt_pcache_wait = 120;
2836 2808
2837 2809 /*ARGSUSED*/
2838 2810 static int
2839 2811 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2840 2812 {
2841 2813 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2842 2814 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2843 2815 struct anon_map *amp;
2844 2816 pgcnt_t pg_idx;
2845 2817 ushort_t gen;
2846 2818 clock_t end_lbolt;
2847 2819 int writer;
2848 2820 page_t **ppa;
2849 2821
2850 2822 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2851 2823
2852 2824 if (behav == MADV_FREE) {
2853 2825 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
2854 2826 return (0);
2855 2827
2856 2828 amp = sptd->spt_amp;
2857 2829 pg_idx = seg_page(seg, addr);
2858 2830
2859 2831 mutex_enter(&sptd->spt_lock);
2860 2832 if ((ppa = sptd->spt_ppa) == NULL) {
2861 2833 mutex_exit(&sptd->spt_lock);
2862 2834 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2863 2835 			anon_disclaim(amp, pg_idx, len);
2864 2836 			ANON_LOCK_EXIT(&amp->a_rwlock);
2865 2837 return (0);
2866 2838 }
2867 2839
2868 2840 sptd->spt_flags |= DISM_PPA_CHANGED;
2869 2841 gen = sptd->spt_gen;
2870 2842
2871 2843 mutex_exit(&sptd->spt_lock);
2872 2844
2873 2845 /*
2874 2846 * Purge all DISM cached pages
2875 2847 */
2876 2848 seg_ppurge_wiredpp(ppa);
2877 2849
2878 2850 /*
2879 2851 * Drop the AS_LOCK so that other threads can grab it
2880 2852 * in the as_pageunlock path and hopefully get the segment
2881 2853 * kicked out of the seg_pcache. We bump the shm_softlockcnt
2882 2854 * to keep this segment resident.
2883 2855 */
2884 2856 writer = AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock);
2885 2857 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2886 2858 AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock);
2887 2859
2888 2860 mutex_enter(&sptd->spt_lock);
2889 2861
2890 2862 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
2891 2863
2892 2864 /*
2893 2865 * Try to wait for pages to get kicked out of the seg_pcache.
2894 2866 */
2895 2867 while (sptd->spt_gen == gen &&
2896 2868 (sptd->spt_flags & DISM_PPA_CHANGED) &&
2897 2869 ddi_get_lbolt() < end_lbolt) {
2898 2870 if (!cv_timedwait_sig(&sptd->spt_cv,
2899 2871 &sptd->spt_lock, end_lbolt)) {
2900 2872 break;
2901 2873 }
2902 2874 }
2903 2875
2904 2876 mutex_exit(&sptd->spt_lock);
2905 2877
2906 2878 /* Regrab the AS_LOCK and release our hold on the segment */
2907 2879 AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock,
2908 2880 writer ? RW_WRITER : RW_READER);
2909 2881 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2910 2882 if (shmd->shm_softlockcnt <= 0) {
2911 2883 if (AS_ISUNMAPWAIT(seg->s_as)) {
2912 2884 mutex_enter(&seg->s_as->a_contents);
2913 2885 if (AS_ISUNMAPWAIT(seg->s_as)) {
2914 2886 AS_CLRUNMAPWAIT(seg->s_as);
2915 2887 cv_broadcast(&seg->s_as->a_cv);
2916 2888 }
2917 2889 mutex_exit(&seg->s_as->a_contents);
2918 2890 }
2919 2891 }
2920 2892
2921 2893 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2922 2894 		anon_disclaim(amp, pg_idx, len);
2923 2895 		ANON_LOCK_EXIT(&amp->a_rwlock);
2924 2896 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
2925 2897 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
2926 2898 int already_set;
2927 2899 ulong_t anon_index;
2928 2900 lgrp_mem_policy_t policy;
2929 2901 caddr_t shm_addr;
2930 2902 size_t share_size;
2931 2903 size_t size;
2932 2904 struct seg *sptseg = shmd->shm_sptseg;
2933 2905 caddr_t sptseg_addr;
2934 2906
2935 2907 /*
2936 2908 * Align address and length to page size of underlying segment
2937 2909 */
2938 2910 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
2939 2911 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
2940 2912 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
2941 2913 share_size);
2942 2914
2943 2915 amp = shmd->shm_amp;
2944 2916 anon_index = seg_page(seg, shm_addr);
2945 2917
2946 2918 /*
2947 2919 * And now we may have to adjust size downward if we have
2948 2920 * exceeded the realsize of the segment or initial anon
2949 2921 * allocations.
2950 2922 */
2951 2923 sptseg_addr = sptseg->s_base + ptob(anon_index);
2952 2924 if ((sptseg_addr + size) >
2953 2925 (sptseg->s_base + sptd->spt_realsize))
2954 2926 size = (sptseg->s_base + sptd->spt_realsize) -
2955 2927 sptseg_addr;
2956 2928
2957 2929 /*
2958 2930 * Set memory allocation policy for this segment
2959 2931 */
2960 2932 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
2961 2933 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
2962 2934 NULL, 0, len);
2963 2935
2964 2936 /*
2965 2937 		 * If a random memory allocation policy is set already,
2966 2938 		 * don't bother reapplying it.
2967 2939 */
2968 2940 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
2969 2941 return (0);
2970 2942
2971 2943 /*
2972 2944 		 * Mark any existing pages in the given range for
2973 2945 		 * migration, flush the I/O page cache, and use the
2974 2946 		 * underlying segment to calculate the anon index and
2975 2947 		 * to get the anonmap and vnode pointer.
2976 2948 */
2977 2949 if (shmd->shm_softlockcnt > 0)
2978 2950 segspt_purge(seg);
2979 2951
2980 2952 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
2981 2953 }
2982 2954
2983 2955 return (0);
2984 2956 }
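
The MADV_FREE path above bounds its wait with end_lbolt and re-evaluates DISM_PPA_CHANGED after every wakeup, giving up early on a signal or once spt_pcache_wait seconds have elapsed. A userland sketch of the same bounded-wait shape, with pthreads standing in for kernel condition variables and illustrative names only:

#include <pthread.h>
#include <time.h>

static pthread_mutex_t spt_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t spt_cv = PTHREAD_COND_INITIALIZER;
static int ppa_changed;		/* stands in for DISM_PPA_CHANGED */

static void
wait_for_purge(time_t timeout_sec)
{
	struct timespec end;

	/* absolute deadline, analogous to end_lbolt above */
	clock_gettime(CLOCK_REALTIME, &end);
	end.tv_sec += timeout_sec;

	pthread_mutex_lock(&spt_lock);
	while (ppa_changed &&
	    pthread_cond_timedwait(&spt_cv, &spt_lock, &end) == 0)
		;	/* woken but flag still set: keep waiting */
	pthread_mutex_unlock(&spt_lock);
}
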
2985 2957
2986 2958 /*ARGSUSED*/
2987 2959 void
2988 2960 segspt_shmdump(struct seg *seg)
2989 2961 {
2990 2962 /* no-op for ISM segment */
2991 2963 }
2992 2964
2993 2965 /*ARGSUSED*/
2994 2966 static faultcode_t
2995 2967 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
2996 2968 {
2997 2969 return (ENOTSUP);
2998 2970 }
2999 2971
3000 2972 /*
3001 2973 * get a memory ID for an addr in a given segment
3002 2974 */
3003 2975 static int
3004 2976 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3005 2977 {
3006 2978 struct shm_data *shmd = (struct shm_data *)seg->s_data;
3007 2979 struct anon *ap;
3008 2980 size_t anon_index;
3009 2981 struct anon_map *amp = shmd->shm_amp;
3010 2982 struct spt_data *sptd = shmd->shm_sptseg->s_data;
3011 2983 struct seg *sptseg = shmd->shm_sptseg;
3012 2984 anon_sync_obj_t cookie;
3013 2985
3014 2986 anon_index = seg_page(seg, addr);
3015 2987
3016 2988 if (addr > (seg->s_base + sptd->spt_realsize)) {
3017 2989 return (EFAULT);
3018 2990 }
3019 2991
3020 2992 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3021 2993 anon_array_enter(amp, anon_index, &cookie);
3022 2994 ap = anon_get_ptr(amp->ahp, anon_index);
3023 2995 if (ap == NULL) {
3024 2996 struct page *pp;
3025 2997 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3026 2998
3027 2999 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3028 3000 if (pp == NULL) {
3029 3001 anon_array_exit(&cookie);
3030 3002 			ANON_LOCK_EXIT(&amp->a_rwlock);
3031 3003 return (ENOMEM);
3032 3004 }
3033 3005 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3034 3006 page_unlock(pp);
3035 3007 }
3036 3008 anon_array_exit(&cookie);
3037 3009 	ANON_LOCK_EXIT(&amp->a_rwlock);
3038 3010 memidp->val[0] = (uintptr_t)ap;
3039 3011 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3040 3012 return (0);
3041 3013 }
3042 3014
3043 3015 /*
3044 3016 * Get memory allocation policy info for specified address in given segment
3045 3017 */
3046 3018 static lgrp_mem_policy_info_t *
3047 3019 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3048 3020 {
3049 3021 struct anon_map *amp;
3050 3022 ulong_t anon_index;
3051 3023 lgrp_mem_policy_info_t *policy_info;
3052 3024 struct shm_data *shm_data;
3053 3025
3054 3026 ASSERT(seg != NULL);
3055 3027
3056 3028 /*
3057 3029 * Get anon_map from segshm
3058 3030 *
3059 3031 	 * Assume that no lock needs to be held on the anon_map, since
3060 3032 	 * it should be protected by its reference count, which must be
3061 3033 	 * nonzero for an existing segment.
3062 3034 	 * Need to grab the readers lock on the policy tree, though.
3063 3035 */
3064 3036 shm_data = (struct shm_data *)seg->s_data;
3065 3037 if (shm_data == NULL)
3066 3038 return (NULL);
3067 3039 amp = shm_data->shm_amp;
3068 3040 ASSERT(amp->refcnt != 0);
3069 3041
3070 3042 /*
3071 3043 * Get policy info
3072 3044 *
3073 3045 * Assume starting anon index of 0
3074 3046 */
3075 3047 anon_index = seg_page(seg, addr);
3076 3048 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3077 3049
3078 3050 return (policy_info);
3079 3051 }
3080 3052
3081 3053 /*ARGSUSED*/
3082 3054 static int
3083 3055 segspt_shmcapable(struct seg *seg, segcapability_t capability)
3084 3056 {
3085 3057 return (0);
3086 3058 }