use NULL dump segop as a shorthand for no-op
Instead of forcing every segment driver to implement a dummy function that
does nothing, handle a NULL dump segop function pointer as a no-op shorthand.
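A minimal sketch of the dispatch-side check (the segop_dump() wrapper name and
signature here are assumed for illustration, not taken from this webrev):

static void
segop_dump(struct seg *seg)
{
	/* a NULL dump op simply means there is nothing to dump */
	if (seg->s_ops->dump == NULL)
		return;

	seg->s_ops->dump(seg);
}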
segspt_ops can be static
There is nothing that needs access to this structure outside of the spt
segment driver itself.
const-ify segment ops structures
There is no reason to keep the segment ops structures writable.
use NULL setpagesize segop as a shorthand for ENOTSUP
Instead of forcing every segment driver to implement a dummy function to
return (hopefully) ENOTSUP, handle a NULL setpagesize segop function pointer
as "return ENOTSUP" shorthand.
use NULL capable segop as a shorthand for no-capabilities
Instead of forcing every segment driver to implement a dummy "return 0"
function, handle a NULL capable segop function pointer as "no capabilities
supported" shorthand.
seg_inherit_notsup is redundant since segop_inherit checks for NULL properly
no need for bad-op segment op functions
The segment drivers have a number of bad-op functions that simply panic.
Keeping the function pointer NULL accomplishes the same thing in most cases.
In the remaining cases, a NULL function pointer results in the proper error
code being returned.
use C99 initializers in segment ops structures
remove whole-process swapping
Long before Unix supported paging, it used process swapping to reclaim
memory. The code is still there, and in theory it runs when we get *extremely*
low on memory. In practice, it never runs, since the definition of low-on-memory
is antiquated. (XXX: define what antiquated means)
You can check the number of swapout/swapin events with kstats:
$ kstat -p ::vm:swapin ::vm:swapout
remove xhat
The xhat infrastructure was added to support hardware such as the Zulu
graphics card - hardware which had on-board MMUs. The VM used the xhat code
to keep the CPU's and Zulu's page tables in sync. Since the only xhat user
was Zulu (which is gone), we can safely remove it, simplifying the whole VM
subsystem.
Assorted notes:
- AS_BUSY flag was used solely by xhat
--- old/usr/src/uts/common/vm/seg_spt.c
+++ new/usr/src/uts/common/vm/seg_spt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 #include <sys/param.h>
26 26 #include <sys/user.h>
27 27 #include <sys/mman.h>
28 28 #include <sys/kmem.h>
29 29 #include <sys/sysmacros.h>
30 30 #include <sys/cmn_err.h>
31 31 #include <sys/systm.h>
32 32 #include <sys/tuneable.h>
33 33 #include <vm/hat.h>
34 34 #include <vm/seg.h>
35 35 #include <vm/as.h>
36 36 #include <vm/anon.h>
37 37 #include <vm/page.h>
38 38 #include <sys/buf.h>
39 39 #include <sys/swap.h>
40 40 #include <sys/atomic.h>
41 41 #include <vm/seg_spt.h>
42 42 #include <sys/debug.h>
43 43 #include <sys/vtrace.h>
44 44 #include <sys/shm.h>
45 45 #include <sys/shm_impl.h>
46 46 #include <sys/lgrp.h>
47 47 #include <sys/vmsystm.h>
48 48 #include <sys/policy.h>
49 49 #include <sys/project.h>
50 50 #include <sys/tnf_probe.h>
51 51 #include <sys/zone.h>
52 52
53 53 #define SEGSPTADDR (caddr_t)0x0
54 54
55 55 /*
56 56 * # pages used for spt
57 57 */
58 58 size_t spt_used;
59 59
60 60 /*
61 61 * segspt_minfree is the memory left for system after ISM
62 62 * locked its pages; it is set up to 5% of availrmem in
63 63 * sptcreate when ISM is created. ISM should not use more
64 64 * than ~90% of availrmem; if it does, then the performance
65 65 * of the system may decrease. Machines with large memories may
66 66 * be able to use up more memory for ISM so we set the default
67 67 * segspt_minfree to 5% (which gives ISM max 95% of availrmem.
68 68 * If somebody wants even more memory for ISM (risking hanging
69 69 * the system) they can patch the segspt_minfree to smaller number.
70 70 */
71 71 pgcnt_t segspt_minfree = 0;
72 72
73 73 static int segspt_create(struct seg *seg, caddr_t argsp);
74 74 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
75 75 static void segspt_free(struct seg *seg);
76 76 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
77 77 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
78 78
79 -static void
80 -segspt_badop()
81 -{
82 - panic("segspt_badop called");
83 - /*NOTREACHED*/
84 -}
85 -
86 -#define SEGSPT_BADOP(t) (t(*)())segspt_badop
87 -
88 -struct seg_ops segspt_ops = {
89 - SEGSPT_BADOP(int), /* dup */
90 - segspt_unmap,
91 - segspt_free,
92 - SEGSPT_BADOP(int), /* fault */
93 - SEGSPT_BADOP(faultcode_t), /* faulta */
94 - SEGSPT_BADOP(int), /* setprot */
95 - SEGSPT_BADOP(int), /* checkprot */
96 - SEGSPT_BADOP(int), /* kluster */
97 - SEGSPT_BADOP(size_t), /* swapout */
98 - SEGSPT_BADOP(int), /* sync */
99 - SEGSPT_BADOP(size_t), /* incore */
100 - SEGSPT_BADOP(int), /* lockop */
101 - SEGSPT_BADOP(int), /* getprot */
102 - SEGSPT_BADOP(u_offset_t), /* getoffset */
103 - SEGSPT_BADOP(int), /* gettype */
104 - SEGSPT_BADOP(int), /* getvp */
105 - SEGSPT_BADOP(int), /* advise */
106 - SEGSPT_BADOP(void), /* dump */
107 - SEGSPT_BADOP(int), /* pagelock */
108 - SEGSPT_BADOP(int), /* setpgsz */
109 - SEGSPT_BADOP(int), /* getmemid */
110 - segspt_getpolicy, /* getpolicy */
111 - SEGSPT_BADOP(int), /* capable */
112 - seg_inherit_notsup /* inherit */
79 +static const struct seg_ops segspt_ops = {
80 + .unmap = segspt_unmap,
81 + .free = segspt_free,
82 + .getpolicy = segspt_getpolicy,
113 83 };
114 84
115 85 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
116 86 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
117 87 static void segspt_shmfree(struct seg *seg);
118 88 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
119 89 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
120 90 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
121 91 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
122 92 register size_t len, register uint_t prot);
123 93 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
124 94 uint_t prot);
125 95 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
126 -static size_t segspt_shmswapout(struct seg *seg);
127 96 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
128 97 register char *vec);
129 98 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
130 99 int attr, uint_t flags);
131 100 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
132 101 int attr, int op, ulong_t *lockmap, size_t pos);
133 102 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
134 103 uint_t *protv);
135 104 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
136 105 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
137 106 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
138 107 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
139 108 uint_t behav);
140 -static void segspt_shmdump(struct seg *seg);
141 109 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
142 110 struct page ***, enum lock_type, enum seg_rw);
143 -static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
144 111 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
145 112 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
146 -static int segspt_shmcapable(struct seg *, segcapability_t);
147 113
148 -struct seg_ops segspt_shmops = {
149 - segspt_shmdup,
150 - segspt_shmunmap,
151 - segspt_shmfree,
152 - segspt_shmfault,
153 - segspt_shmfaulta,
154 - segspt_shmsetprot,
155 - segspt_shmcheckprot,
156 - segspt_shmkluster,
157 - segspt_shmswapout,
158 - segspt_shmsync,
159 - segspt_shmincore,
160 - segspt_shmlockop,
161 - segspt_shmgetprot,
162 - segspt_shmgetoffset,
163 - segspt_shmgettype,
164 - segspt_shmgetvp,
165 - segspt_shmadvise, /* advise */
166 - segspt_shmdump,
167 - segspt_shmpagelock,
168 - segspt_shmsetpgsz,
169 - segspt_shmgetmemid,
170 - segspt_shmgetpolicy,
171 - segspt_shmcapable,
172 - seg_inherit_notsup
114 +const struct seg_ops segspt_shmops = {
115 + .dup = segspt_shmdup,
116 + .unmap = segspt_shmunmap,
117 + .free = segspt_shmfree,
118 + .fault = segspt_shmfault,
119 + .faulta = segspt_shmfaulta,
120 + .setprot = segspt_shmsetprot,
121 + .checkprot = segspt_shmcheckprot,
122 + .kluster = segspt_shmkluster,
123 + .sync = segspt_shmsync,
124 + .incore = segspt_shmincore,
125 + .lockop = segspt_shmlockop,
126 + .getprot = segspt_shmgetprot,
127 + .getoffset = segspt_shmgetoffset,
128 + .gettype = segspt_shmgettype,
129 + .getvp = segspt_shmgetvp,
130 + .advise = segspt_shmadvise,
131 + .pagelock = segspt_shmpagelock,
132 + .getmemid = segspt_shmgetmemid,
133 + .getpolicy = segspt_shmgetpolicy,
173 134 };
174 135
175 136 static void segspt_purge(struct seg *seg);
176 137 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
177 138 enum seg_rw, int);
178 139 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
179 140 page_t **ppa);
180 141
181 142
182 143
183 144 /*ARGSUSED*/
184 145 int
185 146 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
186 147 uint_t prot, uint_t flags, uint_t share_szc)
187 148 {
188 149 int err;
189 150 struct as *newas;
190 151 struct segspt_crargs sptcargs;
191 152
192 153 #ifdef DEBUG
193 154 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
194 155 tnf_ulong, size, size );
195 156 #endif
196 157 if (segspt_minfree == 0) /* leave min 5% of availrmem for */
197 158 segspt_minfree = availrmem/20; /* for the system */
198 159
199 160 if (!hat_supported(HAT_SHARED_PT, (void *)0))
200 161 return (EINVAL);
201 162
202 163 /*
203 164 * get a new as for this shared memory segment
204 165 */
205 166 newas = as_alloc();
206 167 newas->a_proc = NULL;
207 168 sptcargs.amp = amp;
208 169 sptcargs.prot = prot;
209 170 sptcargs.flags = flags;
210 171 sptcargs.szc = share_szc;
211 172 /*
212 173 * create a shared page table (spt) segment
213 174 */
214 175
215 176 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
216 177 as_free(newas);
217 178 return (err);
218 179 }
219 180 *sptseg = sptcargs.seg_spt;
220 181 return (0);
221 182 }
222 183
223 184 void
224 185 sptdestroy(struct as *as, struct anon_map *amp)
225 186 {
226 187
227 188 #ifdef DEBUG
228 189 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
229 190 #endif
230 191 (void) as_unmap(as, SEGSPTADDR, amp->size);
231 192 as_free(as);
232 193 }
233 194
234 195 /*
235 196 * called from seg_free().
236 197 * free (i.e., unlock, unmap, return to free list)
237 198 * all the pages in the given seg.
238 199 */
239 200 void
240 201 segspt_free(struct seg *seg)
241 202 {
242 203 struct spt_data *sptd = (struct spt_data *)seg->s_data;
243 204
244 205 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
245 206
246 207 if (sptd != NULL) {
247 208 if (sptd->spt_realsize)
248 209 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
249 210
250 211 if (sptd->spt_ppa_lckcnt)
251 212 kmem_free(sptd->spt_ppa_lckcnt,
252 213 sizeof (*sptd->spt_ppa_lckcnt)
253 214 * btopr(sptd->spt_amp->size));
254 215 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
255 216 cv_destroy(&sptd->spt_cv);
256 217 mutex_destroy(&sptd->spt_lock);
257 218 kmem_free(sptd, sizeof (*sptd));
258 219 }
259 220 }
260 221
261 222 /*ARGSUSED*/
262 223 static int
263 224 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
264 225 uint_t flags)
265 226 {
266 227 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
267 228
268 229 return (0);
269 230 }
270 231
271 232 /*ARGSUSED*/
272 233 static size_t
273 234 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
274 235 {
275 236 caddr_t eo_seg;
276 237 pgcnt_t npages;
277 238 struct shm_data *shmd = (struct shm_data *)seg->s_data;
278 239 struct seg *sptseg;
279 240 struct spt_data *sptd;
280 241
281 242 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
282 243 #ifdef lint
283 244 seg = seg;
284 245 #endif
285 246 sptseg = shmd->shm_sptseg;
286 247 sptd = sptseg->s_data;
287 248
288 249 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
289 250 eo_seg = addr + len;
290 251 while (addr < eo_seg) {
291 252 /* page exists, and it's locked. */
292 253 *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
293 254 SEG_PAGE_ANON;
294 255 addr += PAGESIZE;
295 256 }
296 257 return (len);
297 258 } else {
298 259 struct anon_map *amp = shmd->shm_amp;
299 260 struct anon *ap;
300 261 page_t *pp;
301 262 pgcnt_t anon_index;
302 263 struct vnode *vp;
303 264 u_offset_t off;
304 265 ulong_t i;
305 266 int ret;
306 267 anon_sync_obj_t cookie;
307 268
308 269 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
309 270 anon_index = seg_page(seg, addr);
310 271 npages = btopr(len);
311 272 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
312 273 return (EINVAL);
313 274 }
314 275 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
315 276 for (i = 0; i < npages; i++, anon_index++) {
316 277 ret = 0;
317 278 anon_array_enter(amp, anon_index, &cookie);
318 279 ap = anon_get_ptr(amp->ahp, anon_index);
319 280 if (ap != NULL) {
320 281 swap_xlate(ap, &vp, &off);
321 282 anon_array_exit(&cookie);
322 283 pp = page_lookup_nowait(vp, off, SE_SHARED);
323 284 if (pp != NULL) {
324 285 ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
325 286 page_unlock(pp);
326 287 }
327 288 } else {
328 289 anon_array_exit(&cookie);
329 290 }
330 291 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
331 292 ret |= SEG_PAGE_LOCKED;
332 293 }
333 294 *vec++ = (char)ret;
334 295 }
335 296 		ANON_LOCK_EXIT(&amp->a_rwlock);
336 297 return (len);
337 298 }
338 299 }
339 300
340 301 static int
341 302 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
342 303 {
343 304 size_t share_size;
344 305
345 306 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
346 307
347 308 /*
348 309 * seg.s_size may have been rounded up to the largest page size
349 310 * in shmat().
350 311 * XXX This should be cleanedup. sptdestroy should take a length
351 312 * argument which should be the same as sptcreate. Then
352 313 * this rounding would not be needed (or is done in shm.c)
353 314 * Only the check for full segment will be needed.
354 315 *
355 316 * XXX -- shouldn't raddr == 0 always? These tests don't seem
356 317 * to be useful at all.
357 318 */
358 319 share_size = page_get_pagesize(seg->s_szc);
359 320 ssize = P2ROUNDUP(ssize, share_size);
360 321
361 322 if (raddr == seg->s_base && ssize == seg->s_size) {
362 323 seg_free(seg);
363 324 return (0);
364 325 } else
365 326 return (EINVAL);
366 327 }
367 328
368 329 int
369 330 segspt_create(struct seg *seg, caddr_t argsp)
370 331 {
371 332 int err;
372 333 caddr_t addr = seg->s_base;
373 334 struct spt_data *sptd;
374 335 struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
375 336 struct anon_map *amp = sptcargs->amp;
376 337 struct kshmid *sp = amp->a_sp;
377 338 struct cred *cred = CRED();
378 339 ulong_t i, j, anon_index = 0;
379 340 pgcnt_t npages = btopr(amp->size);
380 341 struct vnode *vp;
381 342 page_t **ppa;
382 343 uint_t hat_flags;
383 344 size_t pgsz;
384 345 pgcnt_t pgcnt;
385 346 caddr_t a;
386 347 pgcnt_t pidx;
387 348 size_t sz;
388 349 proc_t *procp = curproc;
389 350 rctl_qty_t lockedbytes = 0;
390 351 kproject_t *proj;
391 352
392 353 /*
393 354 * We are holding the a_lock on the underlying dummy as,
394 355 * so we can make calls to the HAT layer.
395 356 */
396 357 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
397 358 ASSERT(sp != NULL);
398 359
399 360 #ifdef DEBUG
400 361 TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
401 362 tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
402 363 #endif
403 364 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
404 365 if (err = anon_swap_adjust(npages))
405 366 return (err);
406 367 }
407 368 err = ENOMEM;
408 369
409 370 if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
410 371 goto out1;
411 372
412 373 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
413 374 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
414 375 KM_NOSLEEP)) == NULL)
415 376 goto out2;
416 377 }
417 378
418 379 mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
419 380
420 381 if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
421 382 goto out3;
422 383
423 384 seg->s_ops = &segspt_ops;
424 385 sptd->spt_vp = vp;
425 386 sptd->spt_amp = amp;
426 387 sptd->spt_prot = sptcargs->prot;
427 388 sptd->spt_flags = sptcargs->flags;
428 389 seg->s_data = (caddr_t)sptd;
429 390 sptd->spt_ppa = NULL;
430 391 sptd->spt_ppa_lckcnt = NULL;
431 392 seg->s_szc = sptcargs->szc;
432 393 cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
433 394 sptd->spt_gen = 0;
434 395
435 396 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
436 397 if (seg->s_szc > amp->a_szc) {
437 398 amp->a_szc = seg->s_szc;
438 399 }
439 400 	ANON_LOCK_EXIT(&amp->a_rwlock);
440 401
441 402 /*
442 403 * Set policy to affect initial allocation of pages in
443 404 * anon_map_createpages()
444 405 */
445 406 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
446 407 NULL, 0, ptob(npages));
447 408
448 409 if (sptcargs->flags & SHM_PAGEABLE) {
449 410 size_t share_sz;
450 411 pgcnt_t new_npgs, more_pgs;
451 412 struct anon_hdr *nahp;
452 413 zone_t *zone;
453 414
454 415 share_sz = page_get_pagesize(seg->s_szc);
455 416 if (!IS_P2ALIGNED(amp->size, share_sz)) {
456 417 /*
457 418 * We are rounding up the size of the anon array
458 419 * on 4 M boundary because we always create 4 M
459 420 * of page(s) when locking, faulting pages and we
460 421 * don't have to check for all corner cases e.g.
461 422 * if there is enough space to allocate 4 M
462 423 * page.
463 424 */
464 425 new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
465 426 more_pgs = new_npgs - npages;
466 427
467 428 /*
468 429 * The zone will never be NULL, as a fully created
469 430 * shm always has an owning zone.
470 431 */
471 432 zone = sp->shm_perm.ipc_zone_ref.zref_zone;
472 433 ASSERT(zone != NULL);
473 434 if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
474 435 err = ENOMEM;
475 436 goto out4;
476 437 }
477 438
478 439 nahp = anon_create(new_npgs, ANON_SLEEP);
479 440 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
480 441 (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
481 442 ANON_SLEEP);
482 443 anon_release(amp->ahp, npages);
483 444 amp->ahp = nahp;
484 445 ASSERT(amp->swresv == ptob(npages));
485 446 amp->swresv = amp->size = ptob(new_npgs);
486 447 		ANON_LOCK_EXIT(&amp->a_rwlock);
487 448 npages = new_npgs;
488 449 }
489 450
490 451 sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
491 452 sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
492 453 sptd->spt_pcachecnt = 0;
493 454 sptd->spt_realsize = ptob(npages);
494 455 sptcargs->seg_spt = seg;
495 456 return (0);
496 457 }
497 458
498 459 /*
499 460 * get array of pages for each anon slot in amp
500 461 */
501 462 if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
502 463 seg, addr, S_CREATE, cred)) != 0)
503 464 goto out4;
504 465
505 466 mutex_enter(&sp->shm_mlock);
506 467
507 468 /* May be partially locked, so, count bytes to charge for locking */
508 469 for (i = 0; i < npages; i++)
509 470 if (ppa[i]->p_lckcnt == 0)
510 471 lockedbytes += PAGESIZE;
511 472
512 473 proj = sp->shm_perm.ipc_proj;
513 474
514 475 if (lockedbytes > 0) {
515 476 mutex_enter(&procp->p_lock);
516 477 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
517 478 mutex_exit(&procp->p_lock);
518 479 mutex_exit(&sp->shm_mlock);
519 480 for (i = 0; i < npages; i++)
520 481 page_unlock(ppa[i]);
521 482 err = ENOMEM;
522 483 goto out4;
523 484 }
524 485 mutex_exit(&procp->p_lock);
525 486 }
526 487
527 488 /*
528 489 * addr is initial address corresponding to the first page on ppa list
529 490 */
530 491 for (i = 0; i < npages; i++) {
531 492 /* attempt to lock all pages */
532 493 if (page_pp_lock(ppa[i], 0, 1) == 0) {
533 494 /*
534 495 * if unable to lock any page, unlock all
535 496 * of them and return error
536 497 */
537 498 for (j = 0; j < i; j++)
538 499 page_pp_unlock(ppa[j], 0, 1);
539 500 for (i = 0; i < npages; i++)
540 501 page_unlock(ppa[i]);
541 502 rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
542 503 mutex_exit(&sp->shm_mlock);
543 504 err = ENOMEM;
544 505 goto out4;
545 506 }
546 507 }
547 508 mutex_exit(&sp->shm_mlock);
548 509
549 510 /*
550 511 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
551 512 * for the entire life of the segment. For example platforms
552 513 * that do not support Dynamic Reconfiguration.
553 514 */
554 515 hat_flags = HAT_LOAD_SHARE;
555 516 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
556 517 hat_flags |= HAT_LOAD_LOCK;
557 518
558 519 /*
559 520 * Load translations one lare page at a time
560 521 * to make sure we don't create mappings bigger than
561 522 * segment's size code in case underlying pages
562 523 * are shared with segvn's segment that uses bigger
563 524 * size code than we do.
564 525 */
565 526 pgsz = page_get_pagesize(seg->s_szc);
566 527 pgcnt = page_get_pagecnt(seg->s_szc);
567 528 for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
568 529 sz = MIN(pgsz, ptob(npages - pidx));
569 530 hat_memload_array(seg->s_as->a_hat, a, sz,
570 531 &ppa[pidx], sptd->spt_prot, hat_flags);
571 532 }
572 533
573 534 /*
574 535 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
575 536 * we will leave the pages locked SE_SHARED for the life
576 537 * of the ISM segment. This will prevent any calls to
577 538 * hat_pageunload() on this ISM segment for those platforms.
578 539 */
579 540 if (!(hat_flags & HAT_LOAD_LOCK)) {
580 541 /*
581 542 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
582 543 * we no longer need to hold the SE_SHARED lock on the pages,
583 544 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
584 545 * SE_SHARED lock on the pages as necessary.
585 546 */
586 547 for (i = 0; i < npages; i++)
587 548 page_unlock(ppa[i]);
588 549 }
589 550 sptd->spt_pcachecnt = 0;
590 551 kmem_free(ppa, ((sizeof (page_t *)) * npages));
591 552 sptd->spt_realsize = ptob(npages);
592 553 atomic_add_long(&spt_used, npages);
593 554 sptcargs->seg_spt = seg;
594 555 return (0);
595 556
596 557 out4:
597 558 seg->s_data = NULL;
598 559 kmem_free(vp, sizeof (*vp));
599 560 cv_destroy(&sptd->spt_cv);
600 561 out3:
601 562 mutex_destroy(&sptd->spt_lock);
602 563 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
603 564 kmem_free(ppa, (sizeof (*ppa) * npages));
604 565 out2:
605 566 kmem_free(sptd, sizeof (*sptd));
606 567 out1:
607 568 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
608 569 anon_swap_restore(npages);
609 570 return (err);
610 571 }
611 572
612 573 /*ARGSUSED*/
613 574 void
614 575 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
615 576 {
616 577 struct page *pp;
617 578 struct spt_data *sptd = (struct spt_data *)seg->s_data;
618 579 pgcnt_t npages;
619 580 ulong_t anon_idx;
620 581 struct anon_map *amp;
621 582 struct anon *ap;
622 583 struct vnode *vp;
623 584 u_offset_t off;
624 585 uint_t hat_flags;
625 586 int root = 0;
626 587 pgcnt_t pgs, curnpgs = 0;
627 588 page_t *rootpp;
628 589 rctl_qty_t unlocked_bytes = 0;
629 590 kproject_t *proj;
630 591 kshmid_t *sp;
631 592
632 593 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
633 594
634 595 len = P2ROUNDUP(len, PAGESIZE);
635 596
636 597 npages = btop(len);
637 598
638 599 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
639 600 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
640 601 (sptd->spt_flags & SHM_PAGEABLE)) {
641 602 hat_flags = HAT_UNLOAD_UNMAP;
642 603 }
643 604
644 605 hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
645 606
646 607 amp = sptd->spt_amp;
647 608 if (sptd->spt_flags & SHM_PAGEABLE)
648 609 npages = btop(amp->size);
649 610
650 611 ASSERT(amp != NULL);
651 612
652 613 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
653 614 sp = amp->a_sp;
654 615 proj = sp->shm_perm.ipc_proj;
655 616 mutex_enter(&sp->shm_mlock);
656 617 }
657 618 for (anon_idx = 0; anon_idx < npages; anon_idx++) {
658 619 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
659 620 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
660 621 panic("segspt_free_pages: null app");
661 622 /*NOTREACHED*/
662 623 }
663 624 } else {
664 625 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
665 626 == NULL)
666 627 continue;
667 628 }
668 629 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
669 630 swap_xlate(ap, &vp, &off);
670 631
671 632 /*
672 633 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
673 634 * the pages won't be having SE_SHARED lock at this
674 635 * point.
675 636 *
676 637 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
677 638 * the pages are still held SE_SHARED locked from the
678 639 * original segspt_create()
679 640 *
680 641 * Our goal is to get SE_EXCL lock on each page, remove
681 642 * permanent lock on it and invalidate the page.
682 643 */
683 644 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
684 645 if (hat_flags == HAT_UNLOAD_UNMAP)
685 646 pp = page_lookup(vp, off, SE_EXCL);
686 647 else {
687 648 if ((pp = page_find(vp, off)) == NULL) {
688 649 panic("segspt_free_pages: "
689 650 "page not locked");
690 651 /*NOTREACHED*/
691 652 }
692 653 if (!page_tryupgrade(pp)) {
693 654 page_unlock(pp);
694 655 pp = page_lookup(vp, off, SE_EXCL);
695 656 }
696 657 }
697 658 if (pp == NULL) {
698 659 panic("segspt_free_pages: "
699 660 "page not in the system");
700 661 /*NOTREACHED*/
701 662 }
702 663 ASSERT(pp->p_lckcnt > 0);
703 664 page_pp_unlock(pp, 0, 1);
704 665 if (pp->p_lckcnt == 0)
705 666 unlocked_bytes += PAGESIZE;
706 667 } else {
707 668 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
708 669 continue;
709 670 }
710 671 /*
711 672 * It's logical to invalidate the pages here as in most cases
712 673 * these were created by segspt.
713 674 */
714 675 if (pp->p_szc != 0) {
715 676 if (root == 0) {
716 677 ASSERT(curnpgs == 0);
717 678 root = 1;
718 679 rootpp = pp;
719 680 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
720 681 ASSERT(pgs > 1);
721 682 ASSERT(IS_P2ALIGNED(pgs, pgs));
722 683 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
723 684 curnpgs--;
724 685 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
725 686 ASSERT(curnpgs == 1);
726 687 ASSERT(page_pptonum(pp) ==
727 688 page_pptonum(rootpp) + (pgs - 1));
728 689 page_destroy_pages(rootpp);
729 690 root = 0;
730 691 curnpgs = 0;
731 692 } else {
732 693 ASSERT(curnpgs > 1);
733 694 ASSERT(page_pptonum(pp) ==
734 695 page_pptonum(rootpp) + (pgs - curnpgs));
735 696 curnpgs--;
736 697 }
737 698 } else {
738 699 if (root != 0 || curnpgs != 0) {
739 700 panic("segspt_free_pages: bad large page");
740 701 /*NOTREACHED*/
741 702 }
742 703 /*
743 704 * Before destroying the pages, we need to take care
744 705 * of the rctl locked memory accounting. For that
745 706 * we need to calculte the unlocked_bytes.
746 707 */
747 708 if (pp->p_lckcnt > 0)
748 709 unlocked_bytes += PAGESIZE;
749 710 /*LINTED: constant in conditional context */
750 711 VN_DISPOSE(pp, B_INVAL, 0, kcred);
751 712 }
752 713 }
753 714 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
754 715 if (unlocked_bytes > 0)
755 716 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
756 717 mutex_exit(&sp->shm_mlock);
757 718 }
758 719 if (root != 0 || curnpgs != 0) {
759 720 panic("segspt_free_pages: bad large page");
760 721 /*NOTREACHED*/
761 722 }
762 723
763 724 /*
764 725 * mark that pages have been released
765 726 */
766 727 sptd->spt_realsize = 0;
767 728
768 729 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
769 730 atomic_add_long(&spt_used, -npages);
770 731 anon_swap_restore(npages);
771 732 }
772 733 }
773 734
774 735 /*
775 736 * Get memory allocation policy info for specified address in given segment
776 737 */
777 738 static lgrp_mem_policy_info_t *
778 739 segspt_getpolicy(struct seg *seg, caddr_t addr)
779 740 {
780 741 struct anon_map *amp;
781 742 ulong_t anon_index;
782 743 lgrp_mem_policy_info_t *policy_info;
783 744 struct spt_data *spt_data;
784 745
785 746 ASSERT(seg != NULL);
786 747
787 748 /*
788 749 * Get anon_map from segspt
789 750 *
790 751 * Assume that no lock needs to be held on anon_map, since
791 752 * it should be protected by its reference count which must be
792 753 * nonzero for an existing segment
793 754 * Need to grab readers lock on policy tree though
794 755 */
795 756 spt_data = (struct spt_data *)seg->s_data;
796 757 if (spt_data == NULL)
797 758 return (NULL);
798 759 amp = spt_data->spt_amp;
799 760 ASSERT(amp->refcnt != 0);
800 761
801 762 /*
802 763 * Get policy info
803 764 *
804 765 * Assume starting anon index of 0
805 766 */
806 767 anon_index = seg_page(seg, addr);
807 768 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
808 769
809 770 return (policy_info);
810 771 }
811 772
812 773 /*
813 774 * DISM only.
814 775 * Return locked pages over a given range.
815 776 *
816 777 * We will cache all DISM locked pages and save the pplist for the
817 778 * entire segment in the ppa field of the underlying DISM segment structure.
818 779 * Later, during a call to segspt_reclaim() we will use this ppa array
819 780 * to page_unlock() all of the pages and then we will free this ppa list.
820 781 */
821 782 /*ARGSUSED*/
822 783 static int
823 784 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
824 785 struct page ***ppp, enum lock_type type, enum seg_rw rw)
825 786 {
826 787 struct shm_data *shmd = (struct shm_data *)seg->s_data;
827 788 struct seg *sptseg = shmd->shm_sptseg;
828 789 struct spt_data *sptd = sptseg->s_data;
829 790 pgcnt_t pg_idx, npages, tot_npages, npgs;
830 791 struct page **pplist, **pl, **ppa, *pp;
831 792 struct anon_map *amp;
832 793 spgcnt_t an_idx;
833 794 int ret = ENOTSUP;
834 795 uint_t pl_built = 0;
835 796 struct anon *ap;
836 797 struct vnode *vp;
837 798 u_offset_t off;
838 799 pgcnt_t claim_availrmem = 0;
839 800 uint_t szc;
840 801
841 802 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
842 803 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
843 804
844 805 /*
845 806 * We want to lock/unlock the entire ISM segment. Therefore,
846 807 * we will be using the underlying sptseg and it's base address
847 808 * and length for the caching arguments.
848 809 */
849 810 ASSERT(sptseg);
850 811 ASSERT(sptd);
851 812
852 813 pg_idx = seg_page(seg, addr);
853 814 npages = btopr(len);
854 815
855 816 /*
856 817 * check if the request is larger than number of pages covered
857 818 * by amp
858 819 */
859 820 if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
860 821 *ppp = NULL;
861 822 return (ENOTSUP);
862 823 }
863 824
864 825 if (type == L_PAGEUNLOCK) {
865 826 ASSERT(sptd->spt_ppa != NULL);
866 827
867 828 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
868 829 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
869 830
870 831 /*
871 832 * If someone is blocked while unmapping, we purge
872 833 * segment page cache and thus reclaim pplist synchronously
873 834 * without waiting for seg_pasync_thread. This speeds up
874 835 * unmapping in cases where munmap(2) is called, while
875 836 * raw async i/o is still in progress or where a thread
876 837 * exits on data fault in a multithreaded application.
877 838 */
878 839 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
879 840 (AS_ISUNMAPWAIT(seg->s_as) &&
880 841 shmd->shm_softlockcnt > 0)) {
881 842 segspt_purge(seg);
882 843 }
883 844 return (0);
884 845 }
885 846
886 847 /* The L_PAGELOCK case ... */
887 848
888 849 if (sptd->spt_flags & DISM_PPA_CHANGED) {
889 850 segspt_purge(seg);
890 851 /*
891 852 * for DISM ppa needs to be rebuild since
892 853 * number of locked pages could be changed
893 854 */
894 855 *ppp = NULL;
895 856 return (ENOTSUP);
896 857 }
897 858
898 859 /*
899 860 * First try to find pages in segment page cache, without
900 861 * holding the segment lock.
901 862 */
902 863 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
903 864 S_WRITE, SEGP_FORCE_WIRED);
904 865 if (pplist != NULL) {
905 866 ASSERT(sptd->spt_ppa != NULL);
906 867 ASSERT(sptd->spt_ppa == pplist);
907 868 ppa = sptd->spt_ppa;
908 869 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
909 870 if (ppa[an_idx] == NULL) {
910 871 seg_pinactive(seg, NULL, seg->s_base,
911 872 sptd->spt_amp->size, ppa,
912 873 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
913 874 *ppp = NULL;
914 875 return (ENOTSUP);
915 876 }
916 877 if ((szc = ppa[an_idx]->p_szc) != 0) {
917 878 npgs = page_get_pagecnt(szc);
918 879 an_idx = P2ROUNDUP(an_idx + 1, npgs);
919 880 } else {
920 881 an_idx++;
921 882 }
922 883 }
923 884 /*
924 885 * Since we cache the entire DISM segment, we want to
925 886 * set ppp to point to the first slot that corresponds
926 887 * to the requested addr, i.e. pg_idx.
927 888 */
928 889 *ppp = &(sptd->spt_ppa[pg_idx]);
929 890 return (0);
930 891 }
931 892
932 893 mutex_enter(&sptd->spt_lock);
933 894 /*
934 895 * try to find pages in segment page cache with mutex
935 896 */
936 897 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
937 898 S_WRITE, SEGP_FORCE_WIRED);
938 899 if (pplist != NULL) {
939 900 ASSERT(sptd->spt_ppa != NULL);
940 901 ASSERT(sptd->spt_ppa == pplist);
941 902 ppa = sptd->spt_ppa;
942 903 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
943 904 if (ppa[an_idx] == NULL) {
944 905 mutex_exit(&sptd->spt_lock);
945 906 seg_pinactive(seg, NULL, seg->s_base,
946 907 sptd->spt_amp->size, ppa,
947 908 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
948 909 *ppp = NULL;
949 910 return (ENOTSUP);
950 911 }
951 912 if ((szc = ppa[an_idx]->p_szc) != 0) {
952 913 npgs = page_get_pagecnt(szc);
953 914 an_idx = P2ROUNDUP(an_idx + 1, npgs);
954 915 } else {
955 916 an_idx++;
956 917 }
957 918 }
958 919 /*
959 920 * Since we cache the entire DISM segment, we want to
960 921 * set ppp to point to the first slot that corresponds
961 922 * to the requested addr, i.e. pg_idx.
962 923 */
963 924 mutex_exit(&sptd->spt_lock);
964 925 *ppp = &(sptd->spt_ppa[pg_idx]);
965 926 return (0);
966 927 }
967 928 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
968 929 SEGP_FORCE_WIRED) == SEGP_FAIL) {
969 930 mutex_exit(&sptd->spt_lock);
970 931 *ppp = NULL;
971 932 return (ENOTSUP);
972 933 }
973 934
974 935 /*
975 936 * No need to worry about protections because DISM pages are always rw.
976 937 */
977 938 pl = pplist = NULL;
978 939 amp = sptd->spt_amp;
979 940
980 941 /*
981 942 * Do we need to build the ppa array?
982 943 */
983 944 if (sptd->spt_ppa == NULL) {
984 945 pgcnt_t lpg_cnt = 0;
985 946
986 947 pl_built = 1;
987 948 tot_npages = btopr(sptd->spt_amp->size);
988 949
989 950 ASSERT(sptd->spt_pcachecnt == 0);
990 951 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
991 952 pl = pplist;
992 953
993 954 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
994 955 for (an_idx = 0; an_idx < tot_npages; ) {
995 956 ap = anon_get_ptr(amp->ahp, an_idx);
996 957 /*
997 958 * Cache only mlocked pages. For large pages
998 959 * if one (constituent) page is mlocked
999 960 * all pages for that large page
1000 961 * are cached also. This is for quick
1001 962 * lookups of ppa array;
1002 963 */
1003 964 if ((ap != NULL) && (lpg_cnt != 0 ||
1004 965 (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1005 966
1006 967 swap_xlate(ap, &vp, &off);
1007 968 pp = page_lookup(vp, off, SE_SHARED);
1008 969 ASSERT(pp != NULL);
1009 970 if (lpg_cnt == 0) {
1010 971 lpg_cnt++;
1011 972 /*
1012 973 * For a small page, we are done --
1013 974 * lpg_count is reset to 0 below.
1014 975 *
1015 976 * For a large page, we are guaranteed
1016 977 * to find the anon structures of all
1017 978 * constituent pages and a non-zero
1018 979 * lpg_cnt ensures that we don't test
1019 980 * for mlock for these. We are done
1020 981 * when lpg_count reaches (npgs + 1).
1021 982 * If we are not the first constituent
1022 983 * page, restart at the first one.
1023 984 */
1024 985 npgs = page_get_pagecnt(pp->p_szc);
1025 986 if (!IS_P2ALIGNED(an_idx, npgs)) {
1026 987 an_idx = P2ALIGN(an_idx, npgs);
1027 988 page_unlock(pp);
1028 989 continue;
1029 990 }
1030 991 }
1031 992 if (++lpg_cnt > npgs)
1032 993 lpg_cnt = 0;
1033 994
1034 995 /*
1035 996 * availrmem is decremented only
1036 997 * for unlocked pages
1037 998 */
1038 999 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1039 1000 claim_availrmem++;
1040 1001 pplist[an_idx] = pp;
1041 1002 }
1042 1003 an_idx++;
1043 1004 }
1044 1005 		ANON_LOCK_EXIT(&amp->a_rwlock);
1045 1006
1046 1007 if (claim_availrmem) {
1047 1008 mutex_enter(&freemem_lock);
1048 1009 if (availrmem < tune.t_minarmem + claim_availrmem) {
1049 1010 mutex_exit(&freemem_lock);
1050 1011 ret = ENOTSUP;
1051 1012 claim_availrmem = 0;
1052 1013 goto insert_fail;
1053 1014 } else {
1054 1015 availrmem -= claim_availrmem;
1055 1016 }
1056 1017 mutex_exit(&freemem_lock);
1057 1018 }
1058 1019
1059 1020 sptd->spt_ppa = pl;
1060 1021 } else {
1061 1022 /*
1062 1023 * We already have a valid ppa[].
1063 1024 */
1064 1025 pl = sptd->spt_ppa;
1065 1026 }
1066 1027
1067 1028 ASSERT(pl != NULL);
1068 1029
1069 1030 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1070 1031 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1071 1032 segspt_reclaim);
1072 1033 if (ret == SEGP_FAIL) {
1073 1034 /*
1074 1035 * seg_pinsert failed. We return
1075 1036 * ENOTSUP, so that the as_pagelock() code will
1076 1037 * then try the slower F_SOFTLOCK path.
1077 1038 */
1078 1039 if (pl_built) {
1079 1040 /*
1080 1041 * No one else has referenced the ppa[].
1081 1042 * We created it and we need to destroy it.
1082 1043 */
1083 1044 sptd->spt_ppa = NULL;
1084 1045 }
1085 1046 ret = ENOTSUP;
1086 1047 goto insert_fail;
1087 1048 }
1088 1049
1089 1050 /*
1090 1051 * In either case, we increment softlockcnt on the 'real' segment.
1091 1052 */
1092 1053 sptd->spt_pcachecnt++;
1093 1054 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1094 1055
1095 1056 ppa = sptd->spt_ppa;
1096 1057 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1097 1058 if (ppa[an_idx] == NULL) {
1098 1059 mutex_exit(&sptd->spt_lock);
1099 1060 seg_pinactive(seg, NULL, seg->s_base,
1100 1061 sptd->spt_amp->size,
1101 1062 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1102 1063 *ppp = NULL;
1103 1064 return (ENOTSUP);
1104 1065 }
1105 1066 if ((szc = ppa[an_idx]->p_szc) != 0) {
1106 1067 npgs = page_get_pagecnt(szc);
1107 1068 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1108 1069 } else {
1109 1070 an_idx++;
1110 1071 }
1111 1072 }
1112 1073 /*
1113 1074 * We can now drop the sptd->spt_lock since the ppa[]
1114 1075 * exists and he have incremented pacachecnt.
1115 1076 */
1116 1077 mutex_exit(&sptd->spt_lock);
1117 1078
1118 1079 /*
1119 1080 * Since we cache the entire segment, we want to
1120 1081 * set ppp to point to the first slot that corresponds
1121 1082 * to the requested addr, i.e. pg_idx.
1122 1083 */
1123 1084 *ppp = &(sptd->spt_ppa[pg_idx]);
1124 1085 return (0);
1125 1086
1126 1087 insert_fail:
1127 1088 /*
1128 1089 * We will only reach this code if we tried and failed.
1129 1090 *
1130 1091 * And we can drop the lock on the dummy seg, once we've failed
1131 1092 * to set up a new ppa[].
1132 1093 */
1133 1094 mutex_exit(&sptd->spt_lock);
1134 1095
1135 1096 if (pl_built) {
1136 1097 if (claim_availrmem) {
1137 1098 mutex_enter(&freemem_lock);
1138 1099 availrmem += claim_availrmem;
1139 1100 mutex_exit(&freemem_lock);
1140 1101 }
1141 1102
1142 1103 /*
1143 1104 * We created pl and we need to destroy it.
1144 1105 */
1145 1106 pplist = pl;
1146 1107 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1147 1108 if (pplist[an_idx] != NULL)
1148 1109 page_unlock(pplist[an_idx]);
1149 1110 }
1150 1111 kmem_free(pl, sizeof (page_t *) * tot_npages);
1151 1112 }
1152 1113
1153 1114 if (shmd->shm_softlockcnt <= 0) {
1154 1115 if (AS_ISUNMAPWAIT(seg->s_as)) {
1155 1116 mutex_enter(&seg->s_as->a_contents);
1156 1117 if (AS_ISUNMAPWAIT(seg->s_as)) {
1157 1118 AS_CLRUNMAPWAIT(seg->s_as);
1158 1119 cv_broadcast(&seg->s_as->a_cv);
1159 1120 }
1160 1121 mutex_exit(&seg->s_as->a_contents);
1161 1122 }
1162 1123 }
1163 1124 *ppp = NULL;
1164 1125 return (ret);
1165 1126 }
1166 1127
1167 1128
1168 1129
1169 1130 /*
1170 1131 * return locked pages over a given range.
1171 1132 *
1172 1133 * We will cache the entire ISM segment and save the pplist for the
1173 1134 * entire segment in the ppa field of the underlying ISM segment structure.
1174 1135 * Later, during a call to segspt_reclaim() we will use this ppa array
1175 1136 * to page_unlock() all of the pages and then we will free this ppa list.
1176 1137 */
1177 1138 /*ARGSUSED*/
1178 1139 static int
1179 1140 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1180 1141 struct page ***ppp, enum lock_type type, enum seg_rw rw)
1181 1142 {
1182 1143 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1183 1144 struct seg *sptseg = shmd->shm_sptseg;
1184 1145 struct spt_data *sptd = sptseg->s_data;
1185 1146 pgcnt_t np, page_index, npages;
1186 1147 caddr_t a, spt_base;
1187 1148 struct page **pplist, **pl, *pp;
1188 1149 struct anon_map *amp;
1189 1150 ulong_t anon_index;
1190 1151 int ret = ENOTSUP;
1191 1152 uint_t pl_built = 0;
1192 1153 struct anon *ap;
1193 1154 struct vnode *vp;
1194 1155 u_offset_t off;
1195 1156
1196 1157 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1197 1158 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1198 1159
1199 1160
1200 1161 /*
1201 1162 * We want to lock/unlock the entire ISM segment. Therefore,
1202 1163 * we will be using the underlying sptseg and it's base address
1203 1164 * and length for the caching arguments.
1204 1165 */
1205 1166 ASSERT(sptseg);
1206 1167 ASSERT(sptd);
1207 1168
1208 1169 if (sptd->spt_flags & SHM_PAGEABLE) {
1209 1170 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1210 1171 }
1211 1172
1212 1173 page_index = seg_page(seg, addr);
1213 1174 npages = btopr(len);
1214 1175
1215 1176 /*
1216 1177 * check if the request is larger than number of pages covered
1217 1178 * by amp
1218 1179 */
1219 1180 if (page_index + npages > btopr(sptd->spt_amp->size)) {
1220 1181 *ppp = NULL;
1221 1182 return (ENOTSUP);
1222 1183 }
1223 1184
1224 1185 if (type == L_PAGEUNLOCK) {
1225 1186
1226 1187 ASSERT(sptd->spt_ppa != NULL);
1227 1188
1228 1189 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1229 1190 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1230 1191
1231 1192 /*
1232 1193 * If someone is blocked while unmapping, we purge
1233 1194 * segment page cache and thus reclaim pplist synchronously
1234 1195 * without waiting for seg_pasync_thread. This speeds up
1235 1196 * unmapping in cases where munmap(2) is called, while
1236 1197 * raw async i/o is still in progress or where a thread
1237 1198 * exits on data fault in a multithreaded application.
1238 1199 */
1239 1200 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1240 1201 segspt_purge(seg);
1241 1202 }
1242 1203 return (0);
1243 1204 }
1244 1205
1245 1206 /* The L_PAGELOCK case... */
1246 1207
1247 1208 /*
1248 1209 * First try to find pages in segment page cache, without
1249 1210 * holding the segment lock.
1250 1211 */
1251 1212 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1252 1213 S_WRITE, SEGP_FORCE_WIRED);
1253 1214 if (pplist != NULL) {
1254 1215 ASSERT(sptd->spt_ppa == pplist);
1255 1216 ASSERT(sptd->spt_ppa[page_index]);
1256 1217 /*
1257 1218 * Since we cache the entire ISM segment, we want to
1258 1219 * set ppp to point to the first slot that corresponds
1259 1220 * to the requested addr, i.e. page_index.
1260 1221 */
1261 1222 *ppp = &(sptd->spt_ppa[page_index]);
1262 1223 return (0);
1263 1224 }
1264 1225
1265 1226 mutex_enter(&sptd->spt_lock);
1266 1227
1267 1228 /*
1268 1229 * try to find pages in segment page cache
1269 1230 */
1270 1231 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1271 1232 S_WRITE, SEGP_FORCE_WIRED);
1272 1233 if (pplist != NULL) {
1273 1234 ASSERT(sptd->spt_ppa == pplist);
1274 1235 /*
1275 1236 * Since we cache the entire segment, we want to
1276 1237 * set ppp to point to the first slot that corresponds
1277 1238 * to the requested addr, i.e. page_index.
1278 1239 */
1279 1240 mutex_exit(&sptd->spt_lock);
1280 1241 *ppp = &(sptd->spt_ppa[page_index]);
1281 1242 return (0);
1282 1243 }
1283 1244
1284 1245 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1285 1246 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1286 1247 mutex_exit(&sptd->spt_lock);
1287 1248 *ppp = NULL;
1288 1249 return (ENOTSUP);
1289 1250 }
1290 1251
1291 1252 /*
1292 1253 * No need to worry about protections because ISM pages
1293 1254 * are always rw.
1294 1255 */
1295 1256 pl = pplist = NULL;
1296 1257
1297 1258 /*
1298 1259 * Do we need to build the ppa array?
1299 1260 */
1300 1261 if (sptd->spt_ppa == NULL) {
1301 1262 ASSERT(sptd->spt_ppa == pplist);
1302 1263
1303 1264 spt_base = sptseg->s_base;
1304 1265 pl_built = 1;
1305 1266
1306 1267 /*
1307 1268 * availrmem is decremented once during anon_swap_adjust()
1308 1269 * and is incremented during the anon_unresv(), which is
1309 1270 * called from shm_rm_amp() when the segment is destroyed.
1310 1271 */
1311 1272 amp = sptd->spt_amp;
1312 1273 ASSERT(amp != NULL);
1313 1274
1314 1275 /* pcachecnt is protected by sptd->spt_lock */
1315 1276 ASSERT(sptd->spt_pcachecnt == 0);
1316 1277 pplist = kmem_zalloc(sizeof (page_t *)
1317 1278 * btopr(sptd->spt_amp->size), KM_SLEEP);
1318 1279 pl = pplist;
1319 1280
1320 1281 anon_index = seg_page(sptseg, spt_base);
1321 1282
1322 1283 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1323 1284 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1324 1285 a += PAGESIZE, anon_index++, pplist++) {
1325 1286 ap = anon_get_ptr(amp->ahp, anon_index);
1326 1287 ASSERT(ap != NULL);
1327 1288 swap_xlate(ap, &vp, &off);
1328 1289 pp = page_lookup(vp, off, SE_SHARED);
1329 1290 ASSERT(pp != NULL);
1330 1291 *pplist = pp;
1331 1292 }
1332 1293 		ANON_LOCK_EXIT(&amp->a_rwlock);
1333 1294
1334 1295 if (a < (spt_base + sptd->spt_amp->size)) {
1335 1296 ret = ENOTSUP;
1336 1297 goto insert_fail;
1337 1298 }
1338 1299 sptd->spt_ppa = pl;
1339 1300 } else {
1340 1301 /*
1341 1302 * We already have a valid ppa[].
1342 1303 */
1343 1304 pl = sptd->spt_ppa;
1344 1305 }
1345 1306
1346 1307 ASSERT(pl != NULL);
1347 1308
1348 1309 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1349 1310 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1350 1311 segspt_reclaim);
1351 1312 if (ret == SEGP_FAIL) {
1352 1313 /*
1353 1314 * seg_pinsert failed. We return
1354 1315 * ENOTSUP, so that the as_pagelock() code will
1355 1316 * then try the slower F_SOFTLOCK path.
1356 1317 */
1357 1318 if (pl_built) {
1358 1319 /*
1359 1320 * No one else has referenced the ppa[].
1360 1321 * We created it and we need to destroy it.
1361 1322 */
1362 1323 sptd->spt_ppa = NULL;
1363 1324 }
1364 1325 ret = ENOTSUP;
1365 1326 goto insert_fail;
1366 1327 }
1367 1328
1368 1329 /*
1369 1330 * In either case, we increment softlockcnt on the 'real' segment.
1370 1331 */
1371 1332 sptd->spt_pcachecnt++;
1372 1333 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1373 1334
1374 1335 /*
1375 1336 * We can now drop the sptd->spt_lock since the ppa[]
1376 1337 * exists and he have incremented pacachecnt.
1377 1338 */
1378 1339 mutex_exit(&sptd->spt_lock);
1379 1340
1380 1341 /*
1381 1342 * Since we cache the entire segment, we want to
1382 1343 * set ppp to point to the first slot that corresponds
1383 1344 * to the requested addr, i.e. page_index.
1384 1345 */
1385 1346 *ppp = &(sptd->spt_ppa[page_index]);
1386 1347 return (0);
1387 1348
1388 1349 insert_fail:
1389 1350 /*
1390 1351 * We will only reach this code if we tried and failed.
1391 1352 *
1392 1353 * And we can drop the lock on the dummy seg, once we've failed
1393 1354 * to set up a new ppa[].
1394 1355 */
1395 1356 mutex_exit(&sptd->spt_lock);
1396 1357
1397 1358 if (pl_built) {
1398 1359 /*
1399 1360 * We created pl and we need to destroy it.
1400 1361 */
1401 1362 pplist = pl;
1402 1363 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1403 1364 while (np) {
1404 1365 page_unlock(*pplist);
1405 1366 np--;
1406 1367 pplist++;
1407 1368 }
1408 1369 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1409 1370 }
1410 1371 if (shmd->shm_softlockcnt <= 0) {
1411 1372 if (AS_ISUNMAPWAIT(seg->s_as)) {
1412 1373 mutex_enter(&seg->s_as->a_contents);
1413 1374 if (AS_ISUNMAPWAIT(seg->s_as)) {
1414 1375 AS_CLRUNMAPWAIT(seg->s_as);
1415 1376 cv_broadcast(&seg->s_as->a_cv);
1416 1377 }
1417 1378 mutex_exit(&seg->s_as->a_contents);
1418 1379 }
1419 1380 }
1420 1381 *ppp = NULL;
1421 1382 return (ret);
1422 1383 }
1423 1384
1424 1385 /*
1425 1386 * purge any cached pages in the I/O page cache
1426 1387 */
1427 1388 static void
1428 1389 segspt_purge(struct seg *seg)
1429 1390 {
1430 1391 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1431 1392 }
1432 1393
1433 1394 static int
1434 1395 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1435 1396 enum seg_rw rw, int async)
1436 1397 {
1437 1398 struct seg *seg = (struct seg *)ptag;
1438 1399 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1439 1400 struct seg *sptseg;
1440 1401 struct spt_data *sptd;
1441 1402 pgcnt_t npages, i, free_availrmem = 0;
1442 1403 int done = 0;
1443 1404
1444 1405 #ifdef lint
1445 1406 addr = addr;
1446 1407 #endif
1447 1408 sptseg = shmd->shm_sptseg;
1448 1409 sptd = sptseg->s_data;
1449 1410 npages = (len >> PAGESHIFT);
1450 1411 ASSERT(npages);
1451 1412 ASSERT(sptd->spt_pcachecnt != 0);
1452 1413 ASSERT(sptd->spt_ppa == pplist);
1453 1414 ASSERT(npages == btopr(sptd->spt_amp->size));
1454 1415 ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1455 1416
1456 1417 /*
1457 1418 * Acquire the lock on the dummy seg and destroy the
1458 1419 * ppa array IF this is the last pcachecnt.
1459 1420 */
1460 1421 mutex_enter(&sptd->spt_lock);
1461 1422 if (--sptd->spt_pcachecnt == 0) {
1462 1423 for (i = 0; i < npages; i++) {
1463 1424 if (pplist[i] == NULL) {
1464 1425 continue;
1465 1426 }
1466 1427 if (rw == S_WRITE) {
1467 1428 hat_setrefmod(pplist[i]);
1468 1429 } else {
1469 1430 hat_setref(pplist[i]);
1470 1431 }
1471 1432 if ((sptd->spt_flags & SHM_PAGEABLE) &&
1472 1433 (sptd->spt_ppa_lckcnt[i] == 0))
1473 1434 free_availrmem++;
1474 1435 page_unlock(pplist[i]);
1475 1436 }
1476 1437 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1477 1438 mutex_enter(&freemem_lock);
1478 1439 availrmem += free_availrmem;
1479 1440 mutex_exit(&freemem_lock);
1480 1441 }
1481 1442 /*
1482 1443 * Since we want to cach/uncache the entire ISM segment,
1483 1444 * we will track the pplist in a segspt specific field
1484 1445 * ppa, that is initialized at the time we add an entry to
1485 1446 * the cache.
1486 1447 */
1487 1448 ASSERT(sptd->spt_pcachecnt == 0);
1488 1449 kmem_free(pplist, sizeof (page_t *) * npages);
1489 1450 sptd->spt_ppa = NULL;
1490 1451 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1491 1452 sptd->spt_gen++;
1492 1453 cv_broadcast(&sptd->spt_cv);
1493 1454 done = 1;
1494 1455 }
1495 1456 mutex_exit(&sptd->spt_lock);
1496 1457
1497 1458 /*
1498 1459 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1499 1460 * may not hold AS lock (in this case async argument is not 0). This
1500 1461 * means if softlockcnt drops to 0 after the decrement below address
1501 1462 * space may get freed. We can't allow it since after softlock
1502 1463 * derement to 0 we still need to access as structure for possible
1503 1464 * wakeup of unmap waiters. To prevent the disappearance of as we take
1504 1465 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1505 1466 * this mutex as a barrier to make sure this routine completes before
1506 1467 * segment is freed.
1507 1468 *
1508 1469 * The second complication we have to deal with in async case is a
1509 1470 * possibility of missed wake up of unmap wait thread. When we don't
1510 1471 * hold as lock here we may take a_contents lock before unmap wait
1511 1472 * thread that was first to see softlockcnt was still not 0. As a
1512 1473 * result we'll fail to wake up an unmap wait thread. To avoid this
1513 1474 * race we set nounmapwait flag in as structure if we drop softlockcnt
1514 1475 * to 0 if async is not 0. unmapwait thread
1515 1476 * will not block if this flag is set.
1516 1477 */
1517 1478 if (async)
1518 1479 mutex_enter(&shmd->shm_segfree_syncmtx);
1519 1480
1520 1481 /*
1521 1482 * Now decrement softlockcnt.
1522 1483 */
1523 1484 ASSERT(shmd->shm_softlockcnt > 0);
1524 1485 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1525 1486
1526 1487 if (shmd->shm_softlockcnt <= 0) {
1527 1488 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1528 1489 mutex_enter(&seg->s_as->a_contents);
1529 1490 if (async)
1530 1491 AS_SETNOUNMAPWAIT(seg->s_as);
1531 1492 if (AS_ISUNMAPWAIT(seg->s_as)) {
1532 1493 AS_CLRUNMAPWAIT(seg->s_as);
1533 1494 cv_broadcast(&seg->s_as->a_cv);
1534 1495 }
1535 1496 mutex_exit(&seg->s_as->a_contents);
1536 1497 }
1537 1498 }
1538 1499
1539 1500 if (async)
1540 1501 mutex_exit(&shmd->shm_segfree_syncmtx);
1541 1502
1542 1503 return (done);
1543 1504 }
1544 1505
1545 1506 /*
1546 1507 * Do a F_SOFTUNLOCK call over the range requested.
1547 1508 * The range must have already been F_SOFTLOCK'ed.
1548 1509 *
1549 1510 * The calls to acquire and release the anon map lock mutex were
1550 1511 * removed in order to avoid a deadly embrace during a DR
1551 1512 * memory delete operation. (Eg. DR blocks while waiting for a
1552 1513 * exclusive lock on a page that is being used for kaio; the
1553 1514 * thread that will complete the kaio and call segspt_softunlock
1554 1515 * blocks on the anon map lock; another thread holding the anon
1555 1516 * map lock blocks on another page lock via the segspt_shmfault
1556 1517 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1557 1518 *
1558 1519 * The appropriateness of the removal is based upon the following:
1559 1520 * 1. If we are holding a segment's reader lock and the page is held
1560 1521 * shared, then the corresponding element in anonmap which points to
1561 1522 * anon struct cannot change and there is no need to acquire the
1562 1523 * anonymous map lock.
1563 1524 * 2. Threads in segspt_softunlock have a reader lock on the segment
1564 1525 * and already have the shared page lock, so we are guaranteed that
1565 1526 * the anon map slot cannot change and therefore can call anon_get_ptr()
1566 1527 * without grabbing the anonymous map lock.
1567 1528 * 3. Threads that softlock a shared page break copy-on-write, even if
1568 1529 * its a read. Thus cow faults can be ignored with respect to soft
1569 1530 * unlocking, since the breaking of cow means that the anon slot(s) will
1570 1531 * not be shared.
1571 1532 */
1572 1533 static void
1573 1534 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1574 1535 size_t len, enum seg_rw rw)
1575 1536 {
1576 1537 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1577 1538 struct seg *sptseg;
1578 1539 struct spt_data *sptd;
1579 1540 page_t *pp;
1580 1541 caddr_t adr;
1581 1542 struct vnode *vp;
1582 1543 u_offset_t offset;
1583 1544 ulong_t anon_index;
1584 1545 struct anon_map *amp; /* XXX - for locknest */
1585 1546 struct anon *ap = NULL;
1586 1547 pgcnt_t npages;
1587 1548
1588 1549 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1589 1550
1590 1551 sptseg = shmd->shm_sptseg;
1591 1552 sptd = sptseg->s_data;
1592 1553
1593 1554 /*
1594 1555 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1595 1556 * and therefore their pages are SE_SHARED locked
1596 1557 * for the entire life of the segment.
1597 1558 */
1598 1559 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1599 1560 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1600 1561 goto softlock_decrement;
1601 1562 }
1602 1563
1603 1564 /*
1604 1565 * Any thread is free to do a page_find and
1605 1566 * page_unlock() on the pages within this seg.
1606 1567 *
1607 1568 * We are already holding the as->a_lock on the user's
1608 1569 * real segment, but we need to hold the a_lock on the
1609 1570 * underlying dummy as. This is mostly to satisfy the
1610 1571 * underlying HAT layer.
1611 1572 */
1612 1573 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1613 1574 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1614 1575 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1615 1576
1616 1577 amp = sptd->spt_amp;
1617 1578 ASSERT(amp != NULL);
1618 1579 anon_index = seg_page(sptseg, sptseg_addr);
1619 1580
1620 1581 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1621 1582 ap = anon_get_ptr(amp->ahp, anon_index++);
1622 1583 ASSERT(ap != NULL);
1623 1584 swap_xlate(ap, &vp, &offset);
1624 1585
1625 1586 /*
1626 1587 * Use page_find() instead of page_lookup() to
1627 1588 * find the page since we know that it has a
1628 1589 * "shared" lock.
1629 1590 */
1630 1591 pp = page_find(vp, offset);
1631 1592 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1632 1593 if (pp == NULL) {
1633 1594 panic("segspt_softunlock: "
1634 1595 "addr %p, ap %p, vp %p, off %llx",
1635 1596 (void *)adr, (void *)ap, (void *)vp, offset);
1636 1597 /*NOTREACHED*/
1637 1598 }
1638 1599
1639 1600 if (rw == S_WRITE) {
1640 1601 hat_setrefmod(pp);
1641 1602 } else if (rw != S_OTHER) {
1642 1603 hat_setref(pp);
1643 1604 }
1644 1605 page_unlock(pp);
1645 1606 }
1646 1607
1647 1608 softlock_decrement:
1648 1609 npages = btopr(len);
1649 1610 ASSERT(shmd->shm_softlockcnt >= npages);
1650 1611 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1651 1612 if (shmd->shm_softlockcnt == 0) {
1652 1613 /*
1653 1614 * All SOFTLOCKS are gone. Wakeup any waiting
1654 1615 * unmappers so they can try again to unmap.
1655 1616 * Check for waiters first without the mutex
1656 1617 * held so we don't always grab the mutex on
1657 1618 * softunlocks.
1658 1619 */
1659 1620 if (AS_ISUNMAPWAIT(seg->s_as)) {
1660 1621 mutex_enter(&seg->s_as->a_contents);
1661 1622 if (AS_ISUNMAPWAIT(seg->s_as)) {
1662 1623 AS_CLRUNMAPWAIT(seg->s_as);
1663 1624 cv_broadcast(&seg->s_as->a_cv);
1664 1625 }
1665 1626 mutex_exit(&seg->s_as->a_contents);
1666 1627 }
1667 1628 }
1668 1629 }
1669 1630
1670 1631 int
1671 1632 segspt_shmattach(struct seg *seg, caddr_t *argsp)
1672 1633 {
1673 1634 struct shm_data *shmd_arg = (struct shm_data *)argsp;
1674 1635 struct shm_data *shmd;
1675 1636 struct anon_map *shm_amp = shmd_arg->shm_amp;
1676 1637 struct spt_data *sptd;
1677 1638 int error = 0;
1678 1639
1679 1640 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1680 1641
1681 1642 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1682 1643 if (shmd == NULL)
1683 1644 return (ENOMEM);
1684 1645
1685 1646 shmd->shm_sptas = shmd_arg->shm_sptas;
1686 1647 shmd->shm_amp = shm_amp;
1687 1648 shmd->shm_sptseg = shmd_arg->shm_sptseg;
1688 1649
1689 1650 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1690 1651 NULL, 0, seg->s_size);
1691 1652
1692 1653 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1693 1654
1694 1655 seg->s_data = (void *)shmd;
1695 1656 seg->s_ops = &segspt_shmops;
1696 1657 seg->s_szc = shmd->shm_sptseg->s_szc;
1697 1658 sptd = shmd->shm_sptseg->s_data;
1698 1659
1699 1660 if (sptd->spt_flags & SHM_PAGEABLE) {
1700 1661 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1701 1662 KM_NOSLEEP)) == NULL) {
1702 1663 seg->s_data = (void *)NULL;
1703 1664 kmem_free(shmd, (sizeof (*shmd)));
1704 1665 return (ENOMEM);
1705 1666 }
1706 1667 shmd->shm_lckpgs = 0;
1707 1668 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1708 1669 if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1709 1670 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1710 1671 seg->s_size, seg->s_szc)) != 0) {
1711 1672 kmem_free(shmd->shm_vpage,
1712 1673 btopr(shm_amp->size));
1713 1674 }
1714 1675 }
1715 1676 } else {
1716 1677 error = hat_share(seg->s_as->a_hat, seg->s_base,
1717 1678 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1718 1679 seg->s_size, seg->s_szc);
1719 1680 }
1720 1681 if (error) {
1721 1682 seg->s_szc = 0;
1722 1683 seg->s_data = (void *)NULL;
1723 1684 kmem_free(shmd, (sizeof (*shmd)));
1724 1685 } else {
1725 1686 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1726 1687 shm_amp->refcnt++;
1727 1688 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1728 1689 }
1729 1690 return (error);
1730 1691 }
1731 1692
1732 1693 int
1733 1694 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1734 1695 {
1735 1696 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1736 1697 int reclaim = 1;
1737 1698
1738 1699 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1739 1700 retry:
1740 1701 if (shmd->shm_softlockcnt > 0) {
1741 1702 if (reclaim == 1) {
1742 1703 segspt_purge(seg);
1743 1704 reclaim = 0;
1744 1705 goto retry;
1745 1706 }
1746 1707 return (EAGAIN);
1747 1708 }
1748 1709
1749 1710 if (ssize != seg->s_size) {
1750 1711 #ifdef DEBUG
1751 1712 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1752 1713 ssize, seg->s_size);
1753 1714 #endif
1754 1715 return (EINVAL);
1755 1716 }
1756 1717
1757 1718 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1758 1719 NULL, 0);
1759 1720 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1760 1721
1761 1722 seg_free(seg);
1762 1723
1763 1724 return (0);
1764 1725 }
1765 1726
1766 1727 void
1767 1728 segspt_shmfree(struct seg *seg)
1768 1729 {
1769 1730 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1770 1731 struct anon_map *shm_amp = shmd->shm_amp;
1771 1732
1772 1733 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1773 1734
1774 1735 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1775 1736 MC_UNLOCK, NULL, 0);
1776 1737
1777 1738 /*
1778 1739 * Need to increment refcnt when attaching
1779 1740 * and decrement when detaching because of dup().
1780 1741 */
1781 1742 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1782 1743 shm_amp->refcnt--;
1783 1744 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1784 1745
1785 1746 if (shmd->shm_vpage) { /* only for DISM */
1786 1747 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1787 1748 shmd->shm_vpage = NULL;
1788 1749 }
1789 1750
1790 1751 /*
1791 1752 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1792 1753 * still working with this segment without holding as lock.
1793 1754 */
1794 1755 ASSERT(shmd->shm_softlockcnt == 0);
1795 1756 mutex_enter(&shmd->shm_segfree_syncmtx);
1796 1757 mutex_destroy(&shmd->shm_segfree_syncmtx);
1797 1758
1798 1759 kmem_free(shmd, sizeof (*shmd));
1799 1760 }
1800 1761
1801 1762 /*ARGSUSED*/
1802 1763 int
1803 1764 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1804 1765 {
1805 1766 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1806 1767
1807 1768 /*
1808 1769 * Shared page table is more than shared mapping.
1809 1770 * Individual process sharing page tables can't change prot
1810 1771 * because there is only one set of page tables.
1811 1772 * This will be allowed after private page table is
1812 1773 * supported.
1813 1774 */
1814 1775 /* need to return correct status error? */
1815 1776 return (0);
1816 1777 }
1817 1778
1818 1779
1819 1780 faultcode_t
1820 1781 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1821 1782 size_t len, enum fault_type type, enum seg_rw rw)
1822 1783 {
1823 1784 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1824 1785 struct seg *sptseg = shmd->shm_sptseg;
1825 1786 struct as *curspt = shmd->shm_sptas;
1826 1787 struct spt_data *sptd = sptseg->s_data;
1827 1788 pgcnt_t npages;
1828 1789 size_t size;
1829 1790 caddr_t segspt_addr, shm_addr;
1830 1791 page_t **ppa;
1831 1792 int i;
1832 1793 ulong_t an_idx = 0;
1833 1794 int err = 0;
1834 1795 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1835 1796 size_t pgsz;
1836 1797 pgcnt_t pgcnt;
1837 1798 caddr_t a;
1838 1799 pgcnt_t pidx;
1839 1800
1840 1801 #ifdef lint
1841 1802 hat = hat;
1842 1803 #endif
1843 1804 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1844 1805
1845 1806 /*
1846 1807 * Because of the way spt is implemented
1847 1808 * the realsize of the segment does not have to be
1848 1809 * equal to the segment size itself. The segment size is
1849 1810 * often in multiples of a page size larger than PAGESIZE.
1850 1811 * The realsize is rounded up to the nearest PAGESIZE
1851 1812 * based on what the user requested. This is a bit of
1852 1813 	 * ugliness that is historical but not easily fixed
1853 1814 * without re-designing the higher levels of ISM.
1854 1815 */
1855 1816 ASSERT(addr >= seg->s_base);
1856 1817 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1857 1818 return (FC_NOMAP);
1858 1819 /*
1859 1820 * For all of the following cases except F_PROT, we need to
1860 1821 * make any necessary adjustments to addr and len
1861 1822 * and get all of the necessary page_t's into an array called ppa[].
1862 1823 *
1863 1824 * The code in shmat() forces base addr and len of ISM segment
1864 1825 * to be aligned to largest page size supported. Therefore,
1865 1826 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
1866 1827 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
1867 1828 * in large pagesize chunks, or else we will screw up the HAT
1868 1829 * layer by calling hat_memload_array() with differing page sizes
1869 1830 * over a given virtual range.
1870 1831 */
1871 1832 pgsz = page_get_pagesize(sptseg->s_szc);
1872 1833 pgcnt = page_get_pagecnt(sptseg->s_szc);
1873 1834 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
1874 1835 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
1875 1836 npages = btopr(size);
1876 1837
1877 1838 /*
1878 1839 * Now we need to convert from addr in segshm to addr in segspt.
1879 1840 */
1880 1841 an_idx = seg_page(seg, shm_addr);
1881 1842 segspt_addr = sptseg->s_base + ptob(an_idx);
1882 1843
1883 1844 ASSERT((segspt_addr + ptob(npages)) <=
1884 1845 (sptseg->s_base + sptd->spt_realsize));
1885 1846 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
1886 1847
1887 1848 switch (type) {
1888 1849
1889 1850 case F_SOFTLOCK:
1890 1851
1891 1852 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
1892 1853 /*
1893 1854 * Fall through to the F_INVAL case to load up the hat layer
1894 1855 * entries with the HAT_LOAD_LOCK flag.
1895 1856 */
1896 1857 /* FALLTHRU */
1897 1858 case F_INVAL:
1898 1859
1899 1860 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
1900 1861 return (FC_NOMAP);
1901 1862
1902 1863 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1903 1864
1904 1865 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
1905 1866 if (err != 0) {
1906 1867 if (type == F_SOFTLOCK) {
1907 1868 atomic_add_long((ulong_t *)(
1908 1869 &(shmd->shm_softlockcnt)), -npages);
1909 1870 }
1910 1871 goto dism_err;
1911 1872 }
1912 1873 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1913 1874 a = segspt_addr;
1914 1875 pidx = 0;
1915 1876 if (type == F_SOFTLOCK) {
1916 1877
1917 1878 /*
1918 1879 * Load up the translation keeping it
1919 1880 * locked and don't unlock the page.
1920 1881 */
1921 1882 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1922 1883 hat_memload_array(sptseg->s_as->a_hat,
1923 1884 a, pgsz, &ppa[pidx], sptd->spt_prot,
1924 1885 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
1925 1886 }
1926 1887 } else {
1927 - if (hat == seg->s_as->a_hat) {
1888 + /*
1889 + * Migrate pages marked for migration
1890 + */
1891 + if (lgrp_optimizations())
1892 + page_migrate(seg, shm_addr, ppa, npages);
1928 1893
1929 - /*
1930 - * Migrate pages marked for migration
1931 - */
1932 - if (lgrp_optimizations())
1933 - page_migrate(seg, shm_addr, ppa,
1934 - npages);
1935 -
1936 - /* CPU HAT */
1937 - for (; pidx < npages;
1938 - a += pgsz, pidx += pgcnt) {
1939 - hat_memload_array(sptseg->s_as->a_hat,
1940 - a, pgsz, &ppa[pidx],
1941 - sptd->spt_prot,
1942 - HAT_LOAD_SHARE);
1943 - }
1944 - } else {
1945 - /* XHAT. Pass real address */
1946 - hat_memload_array(hat, shm_addr,
1947 - size, ppa, sptd->spt_prot, HAT_LOAD_SHARE);
1894 + for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1895 + hat_memload_array(sptseg->s_as->a_hat,
1896 + a, pgsz, &ppa[pidx],
1897 + sptd->spt_prot,
1898 + HAT_LOAD_SHARE);
1948 1899 }
1949 1900
1950 1901 /*
1951 1902 * And now drop the SE_SHARED lock(s).
1952 1903 */
1953 1904 if (dyn_ism_unmap) {
1954 1905 for (i = 0; i < npages; i++) {
1955 1906 page_unlock(ppa[i]);
1956 1907 }
1957 1908 }
1958 1909 }
1959 1910
1960 1911 if (!dyn_ism_unmap) {
1961 1912 if (hat_share(seg->s_as->a_hat, shm_addr,
1962 1913 curspt->a_hat, segspt_addr, ptob(npages),
1963 1914 seg->s_szc) != 0) {
1964 1915 panic("hat_share err in DISM fault");
1965 1916 /* NOTREACHED */
1966 1917 }
1967 1918 if (type == F_INVAL) {
1968 1919 for (i = 0; i < npages; i++) {
1969 1920 page_unlock(ppa[i]);
1970 1921 }
1971 1922 }
1972 1923 }
1973 1924 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1974 1925 dism_err:
1975 1926 kmem_free(ppa, npages * sizeof (page_t *));
1976 1927 return (err);
1977 1928
1978 1929 case F_SOFTUNLOCK:
1979 1930
1980 1931 /*
1981 1932 * This is a bit ugly, we pass in the real seg pointer,
1982 1933 * but the segspt_addr is the virtual address within the
1983 1934 * dummy seg.
1984 1935 */
1985 1936 segspt_softunlock(seg, segspt_addr, size, rw);
1986 1937 return (0);
1987 1938
1988 1939 case F_PROT:
1989 1940
1990 1941 /*
1991 1942 * This takes care of the unusual case where a user
1992 1943 * allocates a stack in shared memory and a register
1993 1944 * window overflow is written to that stack page before
1994 1945 * it is otherwise modified.
1995 1946 *
1996 1947 * We can get away with this because ISM segments are
1997 1948 * always rw. Other than this unusual case, there
1998 1949 * should be no instances of protection violations.
1999 1950 */
2000 1951 return (0);
2001 1952
2002 1953 default:
2003 1954 #ifdef DEBUG
2004 1955 panic("segspt_dismfault default type?");
2005 1956 #else
2006 1957 return (FC_NOMAP);
2007 1958 #endif
2008 1959 }
2009 1960 }
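
Both fault handlers begin by rounding the faulting range out to the spt segment's large page size so that hat_memload_array() always sees whole large-page chunks. A standalone sketch of just that arithmetic; the macro bodies follow the usual power-of-two sysmacros.h shape, and the address, length, and 4 MB page size are invented example values:

#include <stdio.h>
#include <stdint.h>

/* power-of-two alignment macros in their usual sysmacros.h shape */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uintptr_t addr = 0x40123000;	/* faulting address (example) */
	size_t len = 0x2000;		/* faulting length (example) */
	size_t pgsz = 0x400000;		/* 4 MB large page (example szc) */

	/* round the range out to large-page boundaries, as the fault path does */
	uintptr_t shm_addr = P2ALIGN(addr, pgsz);
	size_t size = P2ROUNDUP(addr + len - shm_addr, pgsz);

	printf("shm_addr 0x%lx, size 0x%zx (%zu large page(s))\n",
	    (unsigned long)shm_addr, size, size / pgsz);
	return (0);
}

Here a two-page fault at 0x40123000 expands to the single 4 MB chunk starting at 0x40000000; the non-DISM handler below additionally trims the rounded size against spt_realsize before doing any HAT work.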
2010 1961
2011 1962
2012 1963 faultcode_t
2013 1964 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2014 1965 size_t len, enum fault_type type, enum seg_rw rw)
2015 1966 {
2016 1967 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2017 1968 struct seg *sptseg = shmd->shm_sptseg;
2018 1969 struct as *curspt = shmd->shm_sptas;
2019 1970 struct spt_data *sptd = sptseg->s_data;
2020 1971 pgcnt_t npages;
2021 1972 size_t size;
2022 1973 caddr_t sptseg_addr, shm_addr;
2023 1974 page_t *pp, **ppa;
2024 1975 int i;
2025 1976 u_offset_t offset;
2026 1977 ulong_t anon_index = 0;
2027 1978 struct vnode *vp;
2028 1979 struct anon_map *amp; /* XXX - for locknest */
2029 1980 struct anon *ap = NULL;
2030 1981 size_t pgsz;
2031 1982 pgcnt_t pgcnt;
2032 1983 caddr_t a;
2033 1984 pgcnt_t pidx;
2034 1985 size_t sz;
2035 1986
2036 1987 #ifdef lint
2037 1988 hat = hat;
2038 1989 #endif
2039 1990
2040 1991 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2041 1992
2042 1993 if (sptd->spt_flags & SHM_PAGEABLE) {
2043 1994 return (segspt_dismfault(hat, seg, addr, len, type, rw));
2044 1995 }
2045 1996
2046 1997 /*
2047 1998 * Because of the way spt is implemented
2048 1999 * the realsize of the segment does not have to be
2049 2000 * equal to the segment size itself. The segment size is
2050 2001 * often in multiples of a page size larger than PAGESIZE.
2051 2002 * The realsize is rounded up to the nearest PAGESIZE
2052 2003 * based on what the user requested. This is a bit of
2053 2004 	 * ugliness that is historical but not easily fixed
2054 2005 * without re-designing the higher levels of ISM.
2055 2006 */
2056 2007 ASSERT(addr >= seg->s_base);
2057 2008 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2058 2009 return (FC_NOMAP);
2059 2010 /*
2060 2011 * For all of the following cases except F_PROT, we need to
2061 2012 * make any necessary adjustments to addr and len
2062 2013 * and get all of the necessary page_t's into an array called ppa[].
2063 2014 *
2064 2015 * The code in shmat() forces base addr and len of ISM segment
2065 2016 * to be aligned to largest page size supported. Therefore,
2066 2017 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2067 2018 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2068 2019 * in large pagesize chunks, or else we will screw up the HAT
2069 2020 * layer by calling hat_memload_array() with differing page sizes
2070 2021 * over a given virtual range.
2071 2022 */
2072 2023 pgsz = page_get_pagesize(sptseg->s_szc);
2073 2024 pgcnt = page_get_pagecnt(sptseg->s_szc);
2074 2025 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2075 2026 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2076 2027 npages = btopr(size);
2077 2028
2078 2029 /*
2079 2030 * Now we need to convert from addr in segshm to addr in segspt.
2080 2031 */
2081 2032 anon_index = seg_page(seg, shm_addr);
2082 2033 sptseg_addr = sptseg->s_base + ptob(anon_index);
2083 2034
2084 2035 /*
2085 2036 * And now we may have to adjust npages downward if we have
2086 2037 * exceeded the realsize of the segment or initial anon
2087 2038 * allocations.
2088 2039 */
2089 2040 if ((sptseg_addr + ptob(npages)) >
2090 2041 (sptseg->s_base + sptd->spt_realsize))
2091 2042 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2092 2043
2093 2044 npages = btopr(size);
2094 2045
2095 2046 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2096 2047 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2097 2048
2098 2049 switch (type) {
2099 2050
2100 2051 case F_SOFTLOCK:
2101 2052
2102 2053 /*
2103 2054 * availrmem is decremented once during anon_swap_adjust()
2104 2055 * and is incremented during the anon_unresv(), which is
2105 2056 * called from shm_rm_amp() when the segment is destroyed.
2106 2057 */
2107 2058 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2108 2059 /*
2109 2060 * Some platforms assume that ISM pages are SE_SHARED
2110 2061 * locked for the entire life of the segment.
2111 2062 */
2112 2063 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2113 2064 return (0);
2114 2065 /*
2115 2066 * Fall through to the F_INVAL case to load up the hat layer
2116 2067 * entries with the HAT_LOAD_LOCK flag.
2117 2068 */
2118 2069
2119 2070 /* FALLTHRU */
2120 2071 case F_INVAL:
2121 2072
2122 2073 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2123 2074 return (FC_NOMAP);
2124 2075
2125 2076 /*
2126 2077 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2127 2078 * may still rely on this call to hat_share(). That
2128 2079 * would imply that those hat's can fault on a
2129 2080 * HAT_LOAD_LOCK translation, which would seem
2130 2081 * contradictory.
2131 2082 */
2132 2083 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2133 2084 if (hat_share(seg->s_as->a_hat, seg->s_base,
2134 2085 curspt->a_hat, sptseg->s_base,
2135 2086 sptseg->s_size, sptseg->s_szc) != 0) {
2136 2087 panic("hat_share error in ISM fault");
2137 2088 /*NOTREACHED*/
2138 2089 }
2139 2090 return (0);
2140 2091 }
2141 2092 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2142 2093
2143 2094 /*
2144 2095 * I see no need to lock the real seg,
2145 2096 * here, because all of our work will be on the underlying
2146 2097 * dummy seg.
2147 2098 *
2148 2099 * sptseg_addr and npages now account for large pages.
2149 2100 */
2150 2101 amp = sptd->spt_amp;
2151 2102 ASSERT(amp != NULL);
2152 2103 anon_index = seg_page(sptseg, sptseg_addr);
2153 2104
2154 2105 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2155 2106 for (i = 0; i < npages; i++) {
2156 2107 ap = anon_get_ptr(amp->ahp, anon_index++);
2157 2108 ASSERT(ap != NULL);
2158 2109 swap_xlate(ap, &vp, &offset);
2159 2110 pp = page_lookup(vp, offset, SE_SHARED);
2160 2111 ASSERT(pp != NULL);
2161 2112 ppa[i] = pp;
2162 2113 }
2163 2114 	ANON_LOCK_EXIT(&amp->a_rwlock);
2164 2115 ASSERT(i == npages);
2165 2116
2166 2117 /*
2167 2118 * We are already holding the as->a_lock on the user's
2168 2119 * real segment, but we need to hold the a_lock on the
2169 2120 * underlying dummy as. This is mostly to satisfy the
2170 2121 * underlying HAT layer.
2171 2122 */
2172 2123 AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
2173 2124 a = sptseg_addr;
2174 2125 pidx = 0;
2175 2126 if (type == F_SOFTLOCK) {
2176 2127 /*
2177 2128 * Load up the translation keeping it
2178 2129 * locked and don't unlock the page.
2179 2130 */
2180 2131 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2181 2132 sz = MIN(pgsz, ptob(npages - pidx));
2182 2133 hat_memload_array(sptseg->s_as->a_hat, a,
2183 2134 sz, &ppa[pidx], sptd->spt_prot,
2184 2135 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2185 2136 }
2186 2137 } else {
2187 - if (hat == seg->s_as->a_hat) {
2138 + /*
2139 + * Migrate pages marked for migration.
2140 + */
2141 + if (lgrp_optimizations())
2142 + page_migrate(seg, shm_addr, ppa, npages);
2188 2143
2189 - /*
2190 - * Migrate pages marked for migration.
2191 - */
2192 - if (lgrp_optimizations())
2193 - page_migrate(seg, shm_addr, ppa,
2194 - npages);
2195 -
2196 - /* CPU HAT */
2197 - for (; pidx < npages;
2198 - a += pgsz, pidx += pgcnt) {
2199 - sz = MIN(pgsz, ptob(npages - pidx));
2200 - hat_memload_array(sptseg->s_as->a_hat,
2201 - a, sz, &ppa[pidx],
2202 - sptd->spt_prot, HAT_LOAD_SHARE);
2203 - }
2204 - } else {
2205 - /* XHAT. Pass real address */
2206 - hat_memload_array(hat, shm_addr,
2207 - ptob(npages), ppa, sptd->spt_prot,
2208 - HAT_LOAD_SHARE);
2144 + for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2145 + sz = MIN(pgsz, ptob(npages - pidx));
2146 + hat_memload_array(sptseg->s_as->a_hat,
2147 + a, sz, &ppa[pidx],
2148 + sptd->spt_prot, HAT_LOAD_SHARE);
2209 2149 }
2210 2150
2211 2151 /*
2212 2152 * And now drop the SE_SHARED lock(s).
2213 2153 */
2214 2154 for (i = 0; i < npages; i++)
2215 2155 page_unlock(ppa[i]);
2216 2156 }
2217 2157 AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
2218 2158
2219 2159 kmem_free(ppa, sizeof (page_t *) * npages);
2220 2160 return (0);
2221 2161 case F_SOFTUNLOCK:
2222 2162
2223 2163 /*
2224 2164 * This is a bit ugly, we pass in the real seg pointer,
2225 2165 * but the sptseg_addr is the virtual address within the
2226 2166 * dummy seg.
2227 2167 */
2228 2168 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2229 2169 return (0);
2230 2170
2231 2171 case F_PROT:
2232 2172
2233 2173 /*
2234 2174 * This takes care of the unusual case where a user
2235 2175 * allocates a stack in shared memory and a register
2236 2176 * window overflow is written to that stack page before
2237 2177 * it is otherwise modified.
2238 2178 *
2239 2179 * We can get away with this because ISM segments are
2240 2180 * always rw. Other than this unusual case, there
2241 2181 * should be no instances of protection violations.
2242 2182 */
2243 2183 return (0);
2244 2184
2245 2185 default:
2246 2186 #ifdef DEBUG
2247 2187 cmn_err(CE_WARN, "segspt_shmfault default type?");
2248 2188 #endif
2249 2189 return (FC_NOMAP);
2250 2190 }
2251 2191 }
2252 2192
2253 2193 /*ARGSUSED*/
2254 2194 static faultcode_t
2255 2195 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2256 2196 {
2257 2197 return (0);
2258 2198 }
2259 2199
2260 2200 /*ARGSUSED*/
2261 2201 static int
2262 2202 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2263 2203 {
2264 2204 return (0);
2265 2205 }
2266 2206
2267 -/*ARGSUSED*/
2268 -static size_t
2269 -segspt_shmswapout(struct seg *seg)
2270 -{
2271 - return (0);
2272 -}
2273 -
2274 2207 /*
2275 2208 * duplicate the shared page tables
2276 2209 */
2277 2210 int
2278 2211 segspt_shmdup(struct seg *seg, struct seg *newseg)
2279 2212 {
2280 2213 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2281 2214 struct anon_map *amp = shmd->shm_amp;
2282 2215 struct shm_data *shmd_new;
2283 2216 struct seg *spt_seg = shmd->shm_sptseg;
2284 2217 struct spt_data *sptd = spt_seg->s_data;
2285 2218 int error = 0;
2286 2219
2287 2220 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
2288 2221
2289 2222 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2290 2223 newseg->s_data = (void *)shmd_new;
2291 2224 shmd_new->shm_sptas = shmd->shm_sptas;
2292 2225 shmd_new->shm_amp = amp;
2293 2226 shmd_new->shm_sptseg = shmd->shm_sptseg;
2294 2227 newseg->s_ops = &segspt_shmops;
2295 2228 newseg->s_szc = seg->s_szc;
2296 2229 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2297 2230
2298 2231 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2299 2232 amp->refcnt++;
2300 2233 	ANON_LOCK_EXIT(&amp->a_rwlock);
2301 2234
2302 2235 if (sptd->spt_flags & SHM_PAGEABLE) {
2303 2236 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2304 2237 shmd_new->shm_lckpgs = 0;
2305 2238 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2306 2239 if ((error = hat_share(newseg->s_as->a_hat,
2307 2240 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2308 2241 seg->s_size, seg->s_szc)) != 0) {
2309 2242 kmem_free(shmd_new->shm_vpage,
2310 2243 btopr(amp->size));
2311 2244 }
2312 2245 }
2313 2246 return (error);
2314 2247 } else {
2315 2248 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2316 2249 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2317 2250 seg->s_szc));
2318 2251
2319 2252 }
2320 2253 }
2321 2254
2322 2255 /*ARGSUSED*/
2323 2256 int
2324 2257 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2325 2258 {
2326 2259 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2327 2260 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2328 2261
2329 2262 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2330 2263
2331 2264 /*
2332 2265 * ISM segment is always rw.
2333 2266 */
2334 2267 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2335 2268 }
2336 2269
2337 2270 /*
2338 2271 * Return an array of locked large pages, for empty slots allocate
2339 2272 * private zero-filled anon pages.
2340 2273 */
2341 2274 static int
2342 2275 spt_anon_getpages(
2343 2276 struct seg *sptseg,
2344 2277 caddr_t sptaddr,
2345 2278 size_t len,
2346 2279 page_t *ppa[])
2347 2280 {
2348 2281 struct spt_data *sptd = sptseg->s_data;
2349 2282 struct anon_map *amp = sptd->spt_amp;
2350 2283 enum seg_rw rw = sptd->spt_prot;
2351 2284 uint_t szc = sptseg->s_szc;
2352 2285 size_t pg_sz, share_sz = page_get_pagesize(szc);
2353 2286 pgcnt_t lp_npgs;
2354 2287 caddr_t lp_addr, e_sptaddr;
2355 2288 uint_t vpprot, ppa_szc = 0;
2356 2289 struct vpage *vpage = NULL;
2357 2290 ulong_t j, ppa_idx;
2358 2291 int err, ierr = 0;
2359 2292 pgcnt_t an_idx;
2360 2293 anon_sync_obj_t cookie;
2361 2294 int anon_locked = 0;
2362 2295 pgcnt_t amp_pgs;
2363 2296
2364 2297
2365 2298 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2366 2299 ASSERT(len != 0);
2367 2300
2368 2301 pg_sz = share_sz;
2369 2302 lp_npgs = btop(pg_sz);
2370 2303 lp_addr = sptaddr;
2371 2304 e_sptaddr = sptaddr + len;
2372 2305 an_idx = seg_page(sptseg, sptaddr);
2373 2306 ppa_idx = 0;
2374 2307
2375 2308 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2376 2309
2377 2310 amp_pgs = page_get_pagecnt(amp->a_szc);
2378 2311
2379 2312 /*CONSTCOND*/
2380 2313 while (1) {
2381 2314 for (; lp_addr < e_sptaddr;
2382 2315 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2383 2316
2384 2317 /*
2385 2318 * If we're currently locked, and we get to a new
2386 2319 * page, unlock our current anon chunk.
2387 2320 */
2388 2321 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2389 2322 anon_array_exit(&cookie);
2390 2323 anon_locked = 0;
2391 2324 }
2392 2325 if (!anon_locked) {
2393 2326 anon_array_enter(amp, an_idx, &cookie);
2394 2327 anon_locked = 1;
2395 2328 }
2396 2329 ppa_szc = (uint_t)-1;
2397 2330 ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2398 2331 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2399 2332 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2400 2333
2401 2334 if (ierr != 0) {
2402 2335 if (ierr > 0) {
2403 2336 err = FC_MAKE_ERR(ierr);
2404 2337 goto lpgs_err;
2405 2338 }
2406 2339 break;
2407 2340 }
2408 2341 }
2409 2342 if (lp_addr == e_sptaddr) {
2410 2343 break;
2411 2344 }
2412 2345 ASSERT(lp_addr < e_sptaddr);
2413 2346
2414 2347 /*
2415 2348 * ierr == -1 means we failed to allocate a large page.
2416 2349 * so do a size down operation.
2417 2350 *
2418 2351 * ierr == -2 means some other process that privately shares
2419 2352 * pages with this process has allocated a larger page and we
2420 2353 * need to retry with larger pages. So do a size up
2421 2354 * operation. This relies on the fact that large pages are
2422 2355 * never partially shared i.e. if we share any constituent
2423 2356 * page of a large page with another process we must share the
2424 2357 * entire large page. Note this cannot happen for SOFTLOCK
2425 2358 * case, unless current address (lpaddr) is at the beginning
2426 2359 * of the next page size boundary because the other process
2427 2360 * couldn't have relocated locked pages.
2428 2361 */
2429 2362 ASSERT(ierr == -1 || ierr == -2);
2430 2363 if (segvn_anypgsz) {
2431 2364 ASSERT(ierr == -2 || szc != 0);
2432 2365 ASSERT(ierr == -1 || szc < sptseg->s_szc);
2433 2366 szc = (ierr == -1) ? szc - 1 : szc + 1;
2434 2367 } else {
2435 2368 /*
2436 2369 * For faults and segvn_anypgsz == 0
2437 2370 * we need to be careful not to loop forever
2438 2371 * if existing page is found with szc other
2439 2372 * than 0 or seg->s_szc. This could be due
2440 2373 * to page relocations on behalf of DR or
2441 2374 * more likely large page creation. For this
2442 2375 * case simply re-size to existing page's szc
2443 2376 * if returned by anon_map_getpages().
2444 2377 */
2445 2378 if (ppa_szc == (uint_t)-1) {
2446 2379 szc = (ierr == -1) ? 0 : sptseg->s_szc;
2447 2380 } else {
2448 2381 ASSERT(ppa_szc <= sptseg->s_szc);
2449 2382 ASSERT(ierr == -2 || ppa_szc < szc);
2450 2383 ASSERT(ierr == -1 || ppa_szc > szc);
2451 2384 szc = ppa_szc;
2452 2385 }
2453 2386 }
2454 2387 pg_sz = page_get_pagesize(szc);
2455 2388 lp_npgs = btop(pg_sz);
2456 2389 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2457 2390 }
2458 2391 if (anon_locked) {
2459 2392 anon_array_exit(&cookie);
2460 2393 }
2461 2394 	ANON_LOCK_EXIT(&amp->a_rwlock);
2462 2395 return (0);
2463 2396
2464 2397 lpgs_err:
2465 2398 if (anon_locked) {
2466 2399 anon_array_exit(&cookie);
2467 2400 }
2468 2401 	ANON_LOCK_EXIT(&amp->a_rwlock);
2469 2402 for (j = 0; j < ppa_idx; j++)
2470 2403 page_unlock(ppa[j]);
2471 2404 return (err);
2472 2405 }
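
The retry loop above adjusts the requested page size code whenever anon_map_getpages() reports -1 (a page of that size could not be built) or -2 (a sharer already has a larger page), either one step at a time when segvn_anypgsz is set or by jumping straight to 0 / the segment's szc otherwise. A standalone sketch of just that size-up/size-down policy, with a stub standing in for anon_map_getpages() and with the ppa_szc refinement left out:

#include <stdio.h>

#define	SEG_SZC		3	/* example: segment built from szc-3 pages */

/*
 * Illustrative stand-in for anon_map_getpages(): pretend only szc-1 (or
 * smaller) pages can be created right now, so larger requests fail with -1.
 */
static int
fake_getpages(unsigned int szc)
{
	return (szc > 1 ? -1 : 0);
}

int
main(void)
{
	int segvn_anypgsz = 1;		/* mirrors the kernel tunable */
	unsigned int szc = SEG_SZC;
	int ierr;

	while ((ierr = fake_getpages(szc)) != 0) {
		if (segvn_anypgsz) {
			/* -1: size down one step, -2: size up one step */
			szc = (ierr == -1) ? szc - 1 : szc + 1;
		} else {
			/* without anypgsz, fall back to szc 0 or the seg szc */
			szc = (ierr == -1) ? 0 : SEG_SZC;
		}
		printf("retrying with szc %u\n", szc);
	}
	printf("succeeded with szc %u\n", szc);
	return (0);
}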
2473 2406
2474 2407 /*
2475 2408 * count the number of bytes in a set of spt pages that are currently not
2476 2409 * locked
2477 2410 */
2478 2411 static rctl_qty_t
2479 2412 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2480 2413 {
2481 2414 ulong_t i;
2482 2415 rctl_qty_t unlocked = 0;
2483 2416
2484 2417 for (i = 0; i < npages; i++) {
2485 2418 if (ppa[i]->p_lckcnt == 0)
2486 2419 unlocked += PAGESIZE;
2487 2420 }
2488 2421 return (unlocked);
2489 2422 }
2490 2423
2491 2424 extern u_longlong_t randtick(void);
2492 2425 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2493 2426 #define NLCK (NCPU_P2)
2494 2427 /* Random number with a range [0, n-1], n must be power of two */
2495 2428 #define RAND_P2(n) \
2496 2429 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
2497 2430
2498 2431 int
2499 2432 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2500 2433 page_t **ppa, ulong_t *lockmap, size_t pos,
2501 2434 rctl_qty_t *locked)
2502 2435 {
2503 2436 struct shm_data *shmd = seg->s_data;
2504 2437 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2505 2438 ulong_t i;
2506 2439 int kernel;
2507 2440 pgcnt_t nlck = 0;
2508 2441 int rv = 0;
2509 2442 int use_reserved = 1;
2510 2443
2511 2444 /* return the number of bytes actually locked */
2512 2445 *locked = 0;
2513 2446
2514 2447 /*
2515 2448 * To avoid contention on freemem_lock, availrmem and pages_locked
2516 2449 * global counters are updated only every nlck locked pages instead of
2517 2450 * every time. Reserve nlck locks up front and deduct from this
2518 2451 * reservation for each page that requires a lock. When the reservation
2519 2452 * is consumed, reserve again. nlck is randomized, so the competing
2520 2453 * threads do not fall into a cyclic lock contention pattern. When
2521 2454 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2522 2455 * is used to lock pages.
2523 2456 */
2524 2457 for (i = 0; i < npages; anon_index++, pos++, i++) {
2525 2458 if (nlck == 0 && use_reserved == 1) {
2526 2459 nlck = NLCK + RAND_P2(NLCK);
2527 2460 /* if fewer loops left, decrease nlck */
2528 2461 nlck = MIN(nlck, npages - i);
2529 2462 /*
2530 2463 * Reserve nlck locks up front and deduct from this
2531 2464 * reservation for each page that requires a lock. When
2532 2465 * the reservation is consumed, reserve again.
2533 2466 */
2534 2467 mutex_enter(&freemem_lock);
2535 2468 if ((availrmem - nlck) < pages_pp_maximum) {
2536 2469 /* Do not do advance memory reserves */
2537 2470 use_reserved = 0;
2538 2471 } else {
2539 2472 availrmem -= nlck;
2540 2473 pages_locked += nlck;
2541 2474 }
2542 2475 mutex_exit(&freemem_lock);
2543 2476 }
2544 2477 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2545 2478 if (sptd->spt_ppa_lckcnt[anon_index] <
2546 2479 (ushort_t)DISM_LOCK_MAX) {
2547 2480 if (++sptd->spt_ppa_lckcnt[anon_index] ==
2548 2481 (ushort_t)DISM_LOCK_MAX) {
2549 2482 cmn_err(CE_WARN,
2550 2483 "DISM page lock limit "
2551 2484 "reached on DISM offset 0x%lx\n",
2552 2485 anon_index << PAGESHIFT);
2553 2486 }
2554 2487 kernel = (sptd->spt_ppa &&
2555 2488 sptd->spt_ppa[anon_index]);
2556 2489 if (!page_pp_lock(ppa[i], 0, kernel ||
2557 2490 use_reserved)) {
2558 2491 sptd->spt_ppa_lckcnt[anon_index]--;
2559 2492 rv = EAGAIN;
2560 2493 break;
2561 2494 }
2562 2495 /* if this is a newly locked page, count it */
2563 2496 if (ppa[i]->p_lckcnt == 1) {
2564 2497 if (kernel == 0 && use_reserved == 1)
2565 2498 nlck--;
2566 2499 *locked += PAGESIZE;
2567 2500 }
2568 2501 shmd->shm_lckpgs++;
2569 2502 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2570 2503 if (lockmap != NULL)
2571 2504 BT_SET(lockmap, pos);
2572 2505 }
2573 2506 }
2574 2507 }
2575 2508 /* Return unused lock reservation */
2576 2509 if (nlck != 0 && use_reserved == 1) {
2577 2510 mutex_enter(&freemem_lock);
2578 2511 availrmem += nlck;
2579 2512 pages_locked -= nlck;
2580 2513 mutex_exit(&freemem_lock);
2581 2514 }
2582 2515
2583 2516 return (rv);
2584 2517 }
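
spt_lockpages() amortizes freemem_lock traffic by reserving a randomized batch of NLCK to 2*NLCK-1 locks at a time, consuming one reservation per newly locked page and returning whatever is left at the end; spt_unlockpages() below batches its availrmem updates the same way in the other direction. A minimal userland sketch of the batching idea, with invented counters and rand() standing in for RAND_P2 (and without the low-memory fallback to page_pp_lock()):

#include <stdio.h>
#include <stdlib.h>

#define	NLCK	8	/* example batch size; the kernel derives it from NCPU */

static long availrmem = 1000;	/* stand-ins for the global counters */
static long pages_locked = 0;

int
main(void)
{
	long npages = 50;	/* pages we intend to lock (example) */
	long nlck = 0;		/* reservations left from the last batch */
	long i;

	for (i = 0; i < npages; i++) {
		if (nlck == 0) {
			/* randomize the batch so threads don't collide in lockstep */
			nlck = NLCK + (rand() & (NLCK - 1));
			if (nlck > npages - i)
				nlck = npages - i;
			/* one counter update covers the whole batch */
			availrmem -= nlck;
			pages_locked += nlck;
		}
		nlck--;		/* consume one reservation for this page */
	}
	/* return whatever is left of the final batch */
	availrmem += nlck;
	pages_locked -= nlck;

	printf("availrmem %ld, pages_locked %ld\n", availrmem, pages_locked);
	return (0);
}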
2585 2518
2586 2519 int
2587 2520 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2588 2521 rctl_qty_t *unlocked)
2589 2522 {
2590 2523 struct shm_data *shmd = seg->s_data;
2591 2524 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2592 2525 struct anon_map *amp = sptd->spt_amp;
2593 2526 struct anon *ap;
2594 2527 struct vnode *vp;
2595 2528 u_offset_t off;
2596 2529 struct page *pp;
2597 2530 int kernel;
2598 2531 anon_sync_obj_t cookie;
2599 2532 ulong_t i;
2600 2533 pgcnt_t nlck = 0;
2601 2534 pgcnt_t nlck_limit = NLCK;
2602 2535
2603 2536 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2604 2537 for (i = 0; i < npages; i++, anon_index++) {
2605 2538 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2606 2539 anon_array_enter(amp, anon_index, &cookie);
2607 2540 ap = anon_get_ptr(amp->ahp, anon_index);
2608 2541 ASSERT(ap);
2609 2542
2610 2543 swap_xlate(ap, &vp, &off);
2611 2544 anon_array_exit(&cookie);
2612 2545 pp = page_lookup(vp, off, SE_SHARED);
2613 2546 ASSERT(pp);
2614 2547 /*
2615 2548 * availrmem is decremented only for pages which are not
2616 2549 * in seg pcache, for pages in seg pcache availrmem was
2617 2550 * decremented in _dismpagelock()
2618 2551 */
2619 2552 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2620 2553 ASSERT(pp->p_lckcnt > 0);
2621 2554
2622 2555 /*
2623 2556 * lock page but do not change availrmem, we do it
2624 2557 * ourselves every nlck loops.
2625 2558 */
2626 2559 page_pp_unlock(pp, 0, 1);
2627 2560 if (pp->p_lckcnt == 0) {
2628 2561 if (kernel == 0)
2629 2562 nlck++;
2630 2563 *unlocked += PAGESIZE;
2631 2564 }
2632 2565 page_unlock(pp);
2633 2566 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2634 2567 sptd->spt_ppa_lckcnt[anon_index]--;
2635 2568 shmd->shm_lckpgs--;
2636 2569 }
2637 2570
2638 2571 /*
2639 2572 * To reduce freemem_lock contention, do not update availrmem
2640 2573 * until at least NLCK pages have been unlocked.
2641 2574 * 1. No need to update if nlck is zero
2642 2575 * 2. Always update if the last iteration
2643 2576 */
2644 2577 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2645 2578 mutex_enter(&freemem_lock);
2646 2579 availrmem += nlck;
2647 2580 pages_locked -= nlck;
2648 2581 mutex_exit(&freemem_lock);
2649 2582 nlck = 0;
2650 2583 nlck_limit = NLCK + RAND_P2(NLCK);
2651 2584 }
2652 2585 }
2653 2586 	ANON_LOCK_EXIT(&amp->a_rwlock);
2654 2587
2655 2588 return (0);
2656 2589 }
2657 2590
2658 2591 /*ARGSUSED*/
2659 2592 static int
2660 2593 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2661 2594 int attr, int op, ulong_t *lockmap, size_t pos)
2662 2595 {
2663 2596 struct shm_data *shmd = seg->s_data;
2664 2597 struct seg *sptseg = shmd->shm_sptseg;
2665 2598 struct spt_data *sptd = sptseg->s_data;
2666 2599 struct kshmid *sp = sptd->spt_amp->a_sp;
2667 2600 pgcnt_t npages, a_npages;
2668 2601 page_t **ppa;
2669 2602 pgcnt_t an_idx, a_an_idx, ppa_idx;
2670 2603 caddr_t spt_addr, a_addr; /* spt and aligned address */
2671 2604 size_t a_len; /* aligned len */
2672 2605 size_t share_sz;
2673 2606 ulong_t i;
2674 2607 int sts = 0;
2675 2608 rctl_qty_t unlocked = 0;
2676 2609 rctl_qty_t locked = 0;
2677 2610 struct proc *p = curproc;
2678 2611 kproject_t *proj;
2679 2612
2680 2613 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2681 2614 ASSERT(sp != NULL);
2682 2615
2683 2616 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2684 2617 return (0);
2685 2618 }
2686 2619
2687 2620 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2688 2621 an_idx = seg_page(seg, addr);
2689 2622 npages = btopr(len);
2690 2623
2691 2624 if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2692 2625 return (ENOMEM);
2693 2626 }
2694 2627
2695 2628 /*
2696 2629 * A shm's project never changes, so no lock needed.
2697 2630 * The shm has a hold on the project, so it will not go away.
2698 2631 * Since we have a mapping to shm within this zone, we know
2699 2632 * that the zone will not go away.
2700 2633 */
2701 2634 proj = sp->shm_perm.ipc_proj;
2702 2635
2703 2636 if (op == MC_LOCK) {
2704 2637
2705 2638 /*
2706 2639 * Need to align addr and size request if they are not
2707 2640 * aligned so we can always allocate large page(s) however
2708 2641 * we only lock what was requested in initial request.
2709 2642 */
2710 2643 share_sz = page_get_pagesize(sptseg->s_szc);
2711 2644 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2712 2645 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2713 2646 share_sz);
2714 2647 a_npages = btop(a_len);
2715 2648 a_an_idx = seg_page(seg, a_addr);
2716 2649 spt_addr = sptseg->s_base + ptob(a_an_idx);
2717 2650 ppa_idx = an_idx - a_an_idx;
2718 2651
2719 2652 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2720 2653 KM_NOSLEEP)) == NULL) {
2721 2654 return (ENOMEM);
2722 2655 }
2723 2656
2724 2657 /*
2725 2658 * Don't cache any new pages for IO and
2726 2659 * flush any cached pages.
2727 2660 */
2728 2661 mutex_enter(&sptd->spt_lock);
2729 2662 if (sptd->spt_ppa != NULL)
2730 2663 sptd->spt_flags |= DISM_PPA_CHANGED;
2731 2664
2732 2665 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2733 2666 if (sts != 0) {
2734 2667 mutex_exit(&sptd->spt_lock);
2735 2668 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2736 2669 return (sts);
2737 2670 }
2738 2671
2739 2672 mutex_enter(&sp->shm_mlock);
2740 2673 /* enforce locked memory rctl */
2741 2674 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2742 2675
2743 2676 mutex_enter(&p->p_lock);
2744 2677 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2745 2678 mutex_exit(&p->p_lock);
2746 2679 sts = EAGAIN;
2747 2680 } else {
2748 2681 mutex_exit(&p->p_lock);
2749 2682 sts = spt_lockpages(seg, an_idx, npages,
2750 2683 &ppa[ppa_idx], lockmap, pos, &locked);
2751 2684
2752 2685 /*
2753 2686 * correct locked count if not all pages could be
2754 2687 * locked
2755 2688 */
2756 2689 if ((unlocked - locked) > 0) {
2757 2690 rctl_decr_locked_mem(NULL, proj,
2758 2691 (unlocked - locked), 0);
2759 2692 }
2760 2693 }
2761 2694 /*
2762 2695 * unlock pages
2763 2696 */
2764 2697 for (i = 0; i < a_npages; i++)
2765 2698 page_unlock(ppa[i]);
2766 2699 if (sptd->spt_ppa != NULL)
2767 2700 sptd->spt_flags |= DISM_PPA_CHANGED;
2768 2701 mutex_exit(&sp->shm_mlock);
2769 2702 mutex_exit(&sptd->spt_lock);
2770 2703
2771 2704 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2772 2705
2773 2706 } else if (op == MC_UNLOCK) { /* unlock */
2774 2707 page_t **ppa;
2775 2708
2776 2709 mutex_enter(&sptd->spt_lock);
2777 2710 if (shmd->shm_lckpgs == 0) {
2778 2711 mutex_exit(&sptd->spt_lock);
2779 2712 return (0);
2780 2713 }
2781 2714 /*
2782 2715 * Don't cache new IO pages.
2783 2716 */
2784 2717 if (sptd->spt_ppa != NULL)
2785 2718 sptd->spt_flags |= DISM_PPA_CHANGED;
2786 2719
2787 2720 mutex_enter(&sp->shm_mlock);
2788 2721 sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2789 2722 if ((ppa = sptd->spt_ppa) != NULL)
2790 2723 sptd->spt_flags |= DISM_PPA_CHANGED;
2791 2724 mutex_exit(&sptd->spt_lock);
2792 2725
2793 2726 rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2794 2727 mutex_exit(&sp->shm_mlock);
2795 2728
2796 2729 if (ppa != NULL)
2797 2730 seg_ppurge_wiredpp(ppa);
2798 2731 }
2799 2732 return (sts);
2800 2733 }
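
In the MC_LOCK path above, the project's locked-memory rctl is charged up front for every byte in the range that is not yet locked, and the charge is reduced afterwards by however much spt_lockpages() failed to lock. A toy sketch of that charge-then-refund accounting (the counter and all numbers are invented):

#include <stdio.h>

int
main(void)
{
	long locked_mem = 0;		/* stand-in for the project rctl counter */
	long unlocked = 16 * 4096;	/* bytes not yet locked in the range */
	long locked = 10 * 4096;	/* bytes spt_lockpages() managed to lock */

	locked_mem += unlocked;			/* rctl_incr_locked_mem() */
	if (unlocked - locked > 0)
		locked_mem -= unlocked - locked;	/* refund the shortfall */

	printf("project charged for %ld bytes\n", locked_mem);
	return (0);
}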
2801 2734
2802 2735 /*ARGSUSED*/
2803 2736 int
2804 2737 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2805 2738 {
2806 2739 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2807 2740 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2808 2741 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2809 2742
2810 2743 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2811 2744
2812 2745 /*
2813 2746 * ISM segment is always rw.
2814 2747 */
2815 2748 while (--pgno >= 0)
2816 2749 *protv++ = sptd->spt_prot;
2817 2750 return (0);
2818 2751 }
2819 2752
2820 2753 /*ARGSUSED*/
2821 2754 u_offset_t
2822 2755 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2823 2756 {
2824 2757 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2825 2758
2826 2759 /* Offset does not matter in ISM memory */
2827 2760
2828 2761 return ((u_offset_t)0);
2829 2762 }
2830 2763
2831 2764 /* ARGSUSED */
2832 2765 int
2833 2766 segspt_shmgettype(struct seg *seg, caddr_t addr)
2834 2767 {
2835 2768 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2836 2769 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2837 2770
2838 2771 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2839 2772
2840 2773 /*
2841 2774 * The shared memory mapping is always MAP_SHARED, SWAP is only
2842 2775 * reserved for DISM
2843 2776 */
2844 2777 return (MAP_SHARED |
2845 2778 ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2846 2779 }
2847 2780
2848 2781 /*ARGSUSED*/
2849 2782 int
2850 2783 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2851 2784 {
2852 2785 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2853 2786 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2854 2787
2855 2788 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2856 2789
2857 2790 *vpp = sptd->spt_vp;
2858 2791 return (0);
2859 2792 }
2860 2793
2861 2794 /*
2862 2795 * We need to wait for pending IO to complete to a DISM segment in order for
2863 2796 * pages to get kicked out of the seg_pcache. 120 seconds should be more
2864 2797 * than enough time to wait.
2865 2798 */
2866 2799 static clock_t spt_pcache_wait = 120;
2867 2800
2868 2801 /*ARGSUSED*/
2869 2802 static int
2870 2803 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2871 2804 {
2872 2805 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2873 2806 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2874 2807 struct anon_map *amp;
2875 2808 pgcnt_t pg_idx;
2876 2809 ushort_t gen;
2877 2810 clock_t end_lbolt;
2878 2811 int writer;
2879 2812 page_t **ppa;
2880 2813
2881 2814 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2882 2815
2883 2816 if (behav == MADV_FREE) {
2884 2817 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
2885 2818 return (0);
2886 2819
2887 2820 amp = sptd->spt_amp;
2888 2821 pg_idx = seg_page(seg, addr);
2889 2822
2890 2823 mutex_enter(&sptd->spt_lock);
2891 2824 if ((ppa = sptd->spt_ppa) == NULL) {
2892 2825 mutex_exit(&sptd->spt_lock);
2893 2826 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2894 2827 anon_disclaim(amp, pg_idx, len);
2895 2828 			ANON_LOCK_EXIT(&amp->a_rwlock);
2896 2829 return (0);
2897 2830 }
2898 2831
2899 2832 sptd->spt_flags |= DISM_PPA_CHANGED;
2900 2833 gen = sptd->spt_gen;
2901 2834
2902 2835 mutex_exit(&sptd->spt_lock);
2903 2836
2904 2837 /*
2905 2838 * Purge all DISM cached pages
2906 2839 */
2907 2840 seg_ppurge_wiredpp(ppa);
2908 2841
2909 2842 /*
2910 2843 * Drop the AS_LOCK so that other threads can grab it
2911 2844 * in the as_pageunlock path and hopefully get the segment
2912 2845 * kicked out of the seg_pcache. We bump the shm_softlockcnt
2913 2846 * to keep this segment resident.
2914 2847 */
2915 2848 writer = AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock);
2916 2849 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2917 2850 AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock);
2918 2851
2919 2852 mutex_enter(&sptd->spt_lock);
2920 2853
2921 2854 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
2922 2855
2923 2856 /*
2924 2857 * Try to wait for pages to get kicked out of the seg_pcache.
2925 2858 */
2926 2859 while (sptd->spt_gen == gen &&
2927 2860 (sptd->spt_flags & DISM_PPA_CHANGED) &&
2928 2861 ddi_get_lbolt() < end_lbolt) {
2929 2862 if (!cv_timedwait_sig(&sptd->spt_cv,
2930 2863 &sptd->spt_lock, end_lbolt)) {
2931 2864 break;
2932 2865 }
2933 2866 }
2934 2867
2935 2868 mutex_exit(&sptd->spt_lock);
2936 2869
2937 2870 /* Regrab the AS_LOCK and release our hold on the segment */
2938 2871 AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock,
2939 2872 writer ? RW_WRITER : RW_READER);
2940 2873 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2941 2874 if (shmd->shm_softlockcnt <= 0) {
2942 2875 if (AS_ISUNMAPWAIT(seg->s_as)) {
2943 2876 mutex_enter(&seg->s_as->a_contents);
2944 2877 if (AS_ISUNMAPWAIT(seg->s_as)) {
2945 2878 AS_CLRUNMAPWAIT(seg->s_as);
2946 2879 cv_broadcast(&seg->s_as->a_cv);
2947 2880 }
2948 2881 mutex_exit(&seg->s_as->a_contents);
2949 2882 }
2950 2883 }
2951 2884
2952 2885 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2953 2886 anon_disclaim(amp, pg_idx, len);
2954 2887 		ANON_LOCK_EXIT(&amp->a_rwlock);
2955 2888 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
2956 2889 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
2957 2890 int already_set;
2958 2891 ulong_t anon_index;
2959 2892 lgrp_mem_policy_t policy;
2960 2893 caddr_t shm_addr;
2961 2894 size_t share_size;
2962 2895 size_t size;
2963 2896 struct seg *sptseg = shmd->shm_sptseg;
2964 2897 caddr_t sptseg_addr;
2965 2898
2966 2899 /*
2967 2900 * Align address and length to page size of underlying segment
2968 2901 */
2969 2902 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
2970 2903 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
2971 2904 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
2972 2905 share_size);
2973 2906
2974 2907 amp = shmd->shm_amp;
2975 2908 anon_index = seg_page(seg, shm_addr);
2976 2909
2977 2910 /*
2978 2911 * And now we may have to adjust size downward if we have
2979 2912 * exceeded the realsize of the segment or initial anon
2980 2913 * allocations.
2981 2914 */
2982 2915 sptseg_addr = sptseg->s_base + ptob(anon_index);
2983 2916 if ((sptseg_addr + size) >
2984 2917 (sptseg->s_base + sptd->spt_realsize))
2985 2918 size = (sptseg->s_base + sptd->spt_realsize) -
2986 2919 sptseg_addr;
2987 2920
2988 2921 /*
2989 2922 * Set memory allocation policy for this segment
2990 2923 */
2991 2924 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
2992 2925 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
2993 2926 NULL, 0, len);
2994 2927
2995 2928 /*
2996 2929 * If random memory allocation policy set already,
2997 2930 * don't bother reapplying it.
2998 2931 */
2999 2932 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
3000 2933 return (0);
3001 2934
3002 2935 /*
3003 2936 * Mark any existing pages in the given range for
3004 2937 * migration, flushing the I/O page cache, and using
3005 2938 * underlying segment to calculate anon index and get
3006 2939 * anonmap and vnode pointer from
3007 2940 */
3008 2941 if (shmd->shm_softlockcnt > 0)
3009 2942 segspt_purge(seg);
3010 2943
3011 2944 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
3012 2945 }
3013 2946
3014 2947 return (0);
3015 2948 }
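
For MADV_FREE the code above purges the wired-page cache and then waits, bounded by spt_pcache_wait, for spt_gen to move (or DISM_PPA_CHANGED to clear) before disclaiming the anon pages. A rough sketch of that bounded generation wait, assuming POSIX threads in place of the kernel mutex/cv and a purely illustrative thread standing in for the pcache side:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t spt_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t spt_cv = PTHREAD_COND_INITIALIZER;
static unsigned int spt_gen;	/* bumped when the cached page list is rebuilt */

/* illustrative stand-in for the pcache side dropping the old page list */
static void *
pcache_side(void *arg)
{
	pthread_mutex_lock(&spt_lock);
	spt_gen++;
	pthread_cond_broadcast(&spt_cv);
	pthread_mutex_unlock(&spt_lock);
	return (arg);
}

int
main(void)
{
	pthread_t t;
	struct timespec deadline;
	unsigned int gen;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 120;		/* same bound as spt_pcache_wait */

	pthread_mutex_lock(&spt_lock);
	gen = spt_gen;
	pthread_create(&t, NULL, pcache_side, NULL);

	/* wait for the generation to move, but never past the deadline */
	while (spt_gen == gen) {
		if (pthread_cond_timedwait(&spt_cv, &spt_lock, &deadline) != 0)
			break;
	}
	pthread_mutex_unlock(&spt_lock);
	pthread_join(t, NULL);

	printf("spt_gen now %u\n", spt_gen);
	return (0);
}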
3016 2949
3017 -/*ARGSUSED*/
3018 -void
3019 -segspt_shmdump(struct seg *seg)
3020 -{
3021 - /* no-op for ISM segment */
3022 -}
3023 -
3024 -/*ARGSUSED*/
3025 -static faultcode_t
3026 -segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3027 -{
3028 - return (ENOTSUP);
3029 -}
3030 -
3031 2950 /*
3032 2951 * get a memory ID for an addr in a given segment
3033 2952 */
3034 2953 static int
3035 2954 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3036 2955 {
3037 2956 struct shm_data *shmd = (struct shm_data *)seg->s_data;
3038 2957 struct anon *ap;
3039 2958 size_t anon_index;
3040 2959 struct anon_map *amp = shmd->shm_amp;
3041 2960 struct spt_data *sptd = shmd->shm_sptseg->s_data;
3042 2961 struct seg *sptseg = shmd->shm_sptseg;
3043 2962 anon_sync_obj_t cookie;
3044 2963
3045 2964 anon_index = seg_page(seg, addr);
3046 2965
3047 2966 if (addr > (seg->s_base + sptd->spt_realsize)) {
3048 2967 return (EFAULT);
3049 2968 }
3050 2969
3051 2970 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3052 2971 anon_array_enter(amp, anon_index, &cookie);
3053 2972 ap = anon_get_ptr(amp->ahp, anon_index);
3054 2973 if (ap == NULL) {
3055 2974 struct page *pp;
3056 2975 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3057 2976
3058 2977 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3059 2978 if (pp == NULL) {
3060 2979 anon_array_exit(&cookie);
3061 2980 			ANON_LOCK_EXIT(&amp->a_rwlock);
3062 2981 return (ENOMEM);
3063 2982 }
3064 2983 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3065 2984 page_unlock(pp);
3066 2985 }
3067 2986 anon_array_exit(&cookie);
3068 2987 	ANON_LOCK_EXIT(&amp->a_rwlock);
3069 2988 memidp->val[0] = (uintptr_t)ap;
3070 2989 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3071 2990 return (0);
3072 2991 }
3073 2992
3074 2993 /*
3075 2994 * Get memory allocation policy info for specified address in given segment
3076 2995 */
3077 2996 static lgrp_mem_policy_info_t *
3078 2997 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3079 2998 {
3080 2999 struct anon_map *amp;
3081 3000 ulong_t anon_index;
3082 3001 lgrp_mem_policy_info_t *policy_info;
3083 3002 struct shm_data *shm_data;
3084 3003
3085 3004 ASSERT(seg != NULL);
3086 3005
3087 3006 /*
3088 3007 * Get anon_map from segshm
3089 3008 *
3090 3009 * Assume that no lock needs to be held on anon_map, since
3091 3010 * it should be protected by its reference count which must be
3092 3011 * nonzero for an existing segment
3093 3012 * Need to grab readers lock on policy tree though
3094 3013 */
3095 3014 shm_data = (struct shm_data *)seg->s_data;
3096 3015 if (shm_data == NULL)
3097 3016 return (NULL);
3098 3017 amp = shm_data->shm_amp;
3099 3018 ASSERT(amp->refcnt != 0);
3100 3019
3101 3020 /*
3102 3021 * Get policy info
3103 3022 *
3104 3023 * Assume starting anon index of 0
3105 3024 */
3106 3025 anon_index = seg_page(seg, addr);
3107 3026 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3108 3027
3109 3028 return (policy_info);
3110 -}
3111 -
3112 -/*ARGSUSED*/
3113 -static int
3114 -segspt_shmcapable(struct seg *seg, segcapability_t capability)
3115 -{
3116 - return (0);
3117 3029 }