Print this page
remove whole-process swapping
Long before Unix supported paging, it used process swapping to reclaim
memory. The code is there and in theory it runs when we get *extremely* low
on memory. In practice, it never runs since the definition of low-on-memory
is antiquated. (XXX: define what antiquated means)
You can check the number of swapout/swapin events with kstats:
$ kstat -p ::vm:swapin ::vm:swapout
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/vm/vm_anon.c
+++ new/usr/src/uts/common/vm/vm_anon.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
26 26 /* All Rights Reserved */
27 27
28 28 /*
29 29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 30 * The Regents of the University of California
31 31 * All Rights Reserved
32 32 *
33 33 * University Acknowledgment- Portions of this document are derived from
34 34 * software developed by the University of California, Berkeley, and its
35 35 * contributors.
36 36 */
37 37
38 38 /*
39 39 * VM - anonymous pages.
40 40 *
41 41 * This layer sits immediately above the vm_swap layer. It manages
42 42 * physical pages that have no permanent identity in the file system
43 43 * name space, using the services of the vm_swap layer to allocate
44 44 * backing storage for these pages. Since these pages have no external
45 45 * identity, they are discarded when the last reference is removed.
46 46 *
47 47 * An important function of this layer is to manage low-level sharing
48 48 * of pages that are logically distinct but that happen to be
49 49 * physically identical (e.g., the corresponding pages of the processes
50 50 * resulting from a fork before one process or the other changes their
51 51 * contents). This pseudo-sharing is present only as an optimization
52 52 * and is not to be confused with true sharing in which multiple
53 53 * address spaces deliberately contain references to the same object;
54 54 * such sharing is managed at a higher level.
55 55 *
56 56 * The key data structure here is the anon struct, which contains a
57 57 * reference count for its associated physical page and a hint about
58 58 * the identity of that page. Anon structs typically live in arrays,
59 59 * with an instance's position in its array determining where the
↓ open down ↓ |
59 lines elided |
↑ open up ↑ |
60 60 * corresponding backing storage is allocated; however, the swap_xlate()
61 61 * routine abstracts away this representation information so that the
62 62 * rest of the anon layer need not know it. (See the swap layer for
63 63 * more details on anon struct layout.)
64 64 *
65 65 * In the future versions of the system, the association between an
66 66 * anon struct and its position on backing store will change so that
67 67 * we don't require backing store for all anonymous pages in the system.
68 68 * This is an important consideration for large memory systems.
69 69 * We can also use this technique to delay binding physical locations
70 - * to anonymous pages until pageout/swapout time where we can make
71 - * smarter allocation decisions to improve anonymous klustering.
70 + * to anonymous pages until pageout time where we can make smarter
71 + * allocation decisions to improve anonymous klustering.
72 72 *
73 73 * Many of the routines defined here take a (struct anon **) argument,
74 74 * which allows the code at this level to manage anon pages directly,
75 75 * so that callers can regard anon structs as opaque objects and not be
76 76 * concerned with assigning or inspecting their contents.
77 77 *
78 78 * Clients of this layer refer to anon pages indirectly. That is, they
79 79 * maintain arrays of pointers to anon structs rather than maintaining
80 80 * anon structs themselves. The (struct anon **) arguments mentioned
81 81 * above are pointers to entries in these arrays. It is these arrays
82 82 * that capture the mapping between offsets within a given segment and
83 83 * the corresponding anonymous backing storage address.
84 84 */
85 85
86 86 #ifdef DEBUG
87 87 #define ANON_DEBUG
88 88 #endif
89 89
90 90 #include <sys/types.h>
91 91 #include <sys/t_lock.h>
92 92 #include <sys/param.h>
93 93 #include <sys/systm.h>
94 94 #include <sys/mman.h>
95 95 #include <sys/cred.h>
96 96 #include <sys/thread.h>
97 97 #include <sys/vnode.h>
98 98 #include <sys/cpuvar.h>
99 99 #include <sys/swap.h>
100 100 #include <sys/cmn_err.h>
101 101 #include <sys/vtrace.h>
102 102 #include <sys/kmem.h>
103 103 #include <sys/sysmacros.h>
104 104 #include <sys/bitmap.h>
105 105 #include <sys/vmsystm.h>
106 106 #include <sys/tuneable.h>
107 107 #include <sys/debug.h>
108 108 #include <sys/fs/swapnode.h>
109 109 #include <sys/tnf_probe.h>
110 110 #include <sys/lgrp.h>
111 111 #include <sys/policy.h>
112 112 #include <sys/condvar_impl.h>
113 113 #include <sys/mutex_impl.h>
114 114 #include <sys/rctl.h>
115 115
116 116 #include <vm/as.h>
117 117 #include <vm/hat.h>
118 118 #include <vm/anon.h>
119 119 #include <vm/page.h>
120 120 #include <vm/vpage.h>
121 121 #include <vm/seg.h>
122 122 #include <vm/rm.h>
123 123
124 124 #include <fs/fs_subr.h>
125 125
126 126 struct vnode *anon_vp;
127 127
128 128 int anon_debug;
129 129
130 130 kmutex_t anoninfo_lock;
131 131 struct k_anoninfo k_anoninfo;
132 132 ani_free_t *ani_free_pool;
133 133 pad_mutex_t anon_array_lock[ANON_LOCKSIZE];
134 134 kcondvar_t anon_array_cv[ANON_LOCKSIZE];
135 135
136 136 /*
137 137 * Global hash table for (vp, off) -> anon slot
138 138 */
139 139 extern int swap_maxcontig;
140 140 size_t anon_hash_size;
141 141 unsigned int anon_hash_shift;
142 142 struct anon **anon_hash;
143 143
144 144 static struct kmem_cache *anon_cache;
145 145 static struct kmem_cache *anonmap_cache;
146 146
147 147 pad_mutex_t *anonhash_lock;
148 148
149 149 /*
150 150 * Used to make the increment of all refcnts of all anon slots of a large
151 151 * page appear to be atomic. The lock is grabbed for the first anon slot of
152 152 * a large page.
153 153 */
154 154 pad_mutex_t *anonpages_hash_lock;
155 155
156 156 #define APH_MUTEX(vp, off) \
157 157 (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \
158 158 (AH_LOCK_SIZE - 1))].pad_mutex)
159 159
160 160 #ifdef VM_STATS
161 161 static struct anonvmstats_str {
162 162 ulong_t getpages[30];
163 163 ulong_t privatepages[10];
164 164 ulong_t demotepages[9];
165 165 ulong_t decrefpages[9];
166 166 ulong_t dupfillholes[4];
167 167 ulong_t freepages[1];
168 168 } anonvmstats;
169 169 #endif /* VM_STATS */
170 170
171 171 /*ARGSUSED*/
172 172 static int
173 173 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
174 174 {
175 175 struct anon_map *amp = buf;
176 176
177 177 rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
178 178 cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
179 179 mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
180 180 mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
181 181 return (0);
182 182 }
183 183
184 184 /*ARGSUSED1*/
185 185 static void
186 186 anonmap_cache_destructor(void *buf, void *cdrarg)
187 187 {
188 188 struct anon_map *amp = buf;
189 189
190 190 rw_destroy(&amp->a_rwlock);
191 191 cv_destroy(&amp->a_purgecv);
192 192 mutex_destroy(&amp->a_pmtx);
193 193 mutex_destroy(&amp->a_purgemtx);
194 194 }
195 195
196 196 void
197 197 anon_init(void)
198 198 {
199 199 int i;
200 200 pad_mutex_t *tmp;
201 201
202 202 /* These both need to be powers of 2 so round up to the next power */
203 203 anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1);
204 204 anon_hash_size = 1L << anon_hash_shift;
205 205
206 206 /*
207 207 * We need to align the anonhash_lock and anonpages_hash_lock arrays
208 208 * to a 64B boundary to avoid false sharing. We add 63B to our
209 209 * allocation so that we can get a 64B aligned address to use.
210 210 * We allocate both of these together to avoid wasting an additional
211 211 * 63B.
212 212 */
213 213 tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63,
214 214 KM_SLEEP);
215 215 anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64);
216 216 anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE;
217 217
218 218 for (i = 0; i < AH_LOCK_SIZE; i++) {
219 219 mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT,
220 220 NULL);
221 221 mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL,
222 222 MUTEX_DEFAULT, NULL);
223 223 }
224 224
225 225 for (i = 0; i < ANON_LOCKSIZE; i++) {
226 226 mutex_init(&anon_array_lock[i].pad_mutex, NULL,
227 227 MUTEX_DEFAULT, NULL);
228 228 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
229 229 }
230 230
231 231 anon_hash = (struct anon **)
232 232 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
233 233 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
234 234 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
235 235 anonmap_cache = kmem_cache_create("anonmap_cache",
236 236 sizeof (struct anon_map), 0,
237 237 anonmap_cache_constructor, anonmap_cache_destructor, NULL,
238 238 NULL, NULL, 0);
239 239 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */
240 240
241 241 tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP);
242 242 /* Round ani_free_pool to cacheline boundary to avoid false sharing. */
243 243 ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64);
244 244
245 245 anon_vp = vn_alloc(KM_SLEEP);
246 246 vn_setops(anon_vp, swap_vnodeops);
247 247 anon_vp->v_type = VREG;
248 248 anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
249 249 }
250 250
251 251 /*
252 252 * Global anon slot hash table manipulation.
253 253 */
254 254
255 255 static void
256 256 anon_addhash(struct anon *ap)
257 257 {
258 258 int index;
259 259
260 260 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
261 261 index = ANON_HASH(ap->an_vp, ap->an_off);
262 262 ap->an_hash = anon_hash[index];
263 263 anon_hash[index] = ap;
264 264 }
265 265
266 266 static void
267 267 anon_rmhash(struct anon *ap)
268 268 {
269 269 struct anon **app;
270 270
271 271 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
272 272
273 273 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
274 274 *app; app = &((*app)->an_hash)) {
275 275 if (*app == ap) {
276 276 *app = ap->an_hash;
277 277 break;
278 278 }
279 279 }
280 280 }
281 281
282 282 /*
283 283 * The anon array interfaces. Functions allocating,
284 284 * freeing array of pointers, and returning/setting
285 285 * entries in the array of pointers for a given offset.
286 286 *
287 287 * Create the list of pointers
288 288 */
289 289 struct anon_hdr *
290 290 anon_create(pgcnt_t npages, int flags)
291 291 {
292 292 struct anon_hdr *ahp;
293 293 ulong_t nchunks;
294 294 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
295 295
296 296 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
297 297 return (NULL);
298 298 }
299 299
300 300 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
301 301 /*
302 302 * Single level case.
303 303 */
304 304 ahp->size = npages;
305 305 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
306 306
307 307 if (flags & ANON_ALLOC_FORCE)
308 308 ahp->flags |= ANON_ALLOC_FORCE;
309 309
310 310 ahp->array_chunk = kmem_zalloc(
311 311 ahp->size * sizeof (struct anon *), kmemflags);
312 312
313 313 if (ahp->array_chunk == NULL) {
314 314 kmem_free(ahp, sizeof (struct anon_hdr));
315 315 return (NULL);
316 316 }
317 317 } else {
318 318 /*
319 319 * 2 Level case.
320 320 * anon hdr size needs to be rounded off to be a multiple
321 321 * of ANON_CHUNK_SIZE. This is important as various anon
322 322 * related functions depend on this.
323 323 * NOTE -
324 324 * anon_grow() makes anon hdr size a multiple of
325 325 * ANON_CHUNK_SIZE.
326 326 * amp size is <= anon hdr size.
327 327 * anon_index + seg_pgs <= anon hdr size.
328 328 */
329 329 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE);
330 330 nchunks = ahp->size >> ANON_CHUNK_SHIFT;
331 331
332 332 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
333 333 kmemflags);
334 334
335 335 if (ahp->array_chunk == NULL) {
336 336 kmem_free(ahp, sizeof (struct anon_hdr));
337 337 return (NULL);
338 338 }
339 339 }
340 340 return (ahp);
341 341 }
342 342
343 343 /*
344 344 * Free the array of pointers
345 345 */
346 346 void
347 347 anon_release(struct anon_hdr *ahp, pgcnt_t npages)
348 348 {
349 349 ulong_t i;
350 350 void **ppp;
351 351 ulong_t nchunks;
352 352
353 353 ASSERT(npages <= ahp->size);
354 354
355 355 /*
356 356 * Single level case.
357 357 */
358 358 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
359 359 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
360 360 } else {
361 361 /*
362 362 * 2 level case.
363 363 */
364 364 nchunks = ahp->size >> ANON_CHUNK_SHIFT;
365 365 for (i = 0; i < nchunks; i++) {
366 366 ppp = &ahp->array_chunk[i];
367 367 if (*ppp != NULL)
368 368 kmem_free(*ppp, PAGESIZE);
369 369 }
370 370 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
371 371 }
372 372 mutex_destroy(&ahp->serial_lock);
373 373 kmem_free(ahp, sizeof (struct anon_hdr));
374 374 }
375 375
376 376 /*
377 377 * Return the pointer from the list for a
378 378 * specified anon index.
379 379 */
380 380 struct anon *
381 381 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
382 382 {
383 383 struct anon **app;
384 384
385 385 ASSERT(an_idx < ahp->size);
386 386
387 387 /*
388 388 * Single level case.
389 389 */
390 390 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
391 391 return ((struct anon *)
392 392 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
393 393 } else {
394 394
395 395 /*
396 396 * 2 level case.
397 397 */
398 398 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
399 399 if (app) {
400 400 return ((struct anon *)
401 401 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
402 402 ANON_PTRMASK));
403 403 } else {
404 404 return (NULL);
405 405 }
406 406 }
407 407 }
408 408
409 409 /*
410 410 * Return the anon pointer for the first valid entry in the anon list,
411 411 * starting from the given index.
412 412 */
413 413 struct anon *
414 414 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
415 415 {
416 416 struct anon *ap;
417 417 struct anon **app;
418 418 ulong_t chunkoff;
419 419 ulong_t i;
420 420 ulong_t j;
421 421 pgcnt_t size;
422 422
423 423 i = *index;
424 424 size = ahp->size;
425 425
426 426 ASSERT(i < size);
427 427
428 428 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
429 429 /*
430 430 * 1 level case
431 431 */
432 432 while (i < size) {
433 433 ap = (struct anon *)
434 434 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
435 435 if (ap) {
436 436 *index = i;
437 437 return (ap);
438 438 }
439 439 i++;
440 440 }
441 441 } else {
442 442 /*
443 443 * 2 level case
444 444 */
445 445 chunkoff = i & ANON_CHUNK_OFF;
446 446 while (i < size) {
447 447 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
448 448 if (app)
449 449 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
450 450 ap = (struct anon *)
451 451 ((uintptr_t)app[j] & ANON_PTRMASK);
452 452 if (ap) {
453 453 *index = i + (j - chunkoff);
454 454 return (ap);
455 455 }
456 456 }
457 457 chunkoff = 0;
458 458 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
459 459 }
460 460 }
461 461 *index = size;
462 462 return (NULL);
463 463 }
464 464
465 465 /*
466 466 * Set list entry with a given pointer for a specified offset
467 467 */
468 468 int
469 469 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
470 470 {
471 471 void **ppp;
472 472 struct anon **app;
473 473 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
474 474 uintptr_t *ap_addr;
475 475
476 476 ASSERT(an_idx < ahp->size);
477 477
478 478 /*
479 479 * Single level case.
480 480 */
481 481 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
482 482 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
483 483 } else {
484 484
485 485 /*
486 486 * 2 level case.
487 487 */
488 488 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
489 489
490 490 ASSERT(ppp != NULL);
491 491 if (*ppp == NULL) {
492 492 mutex_enter(&ahp->serial_lock);
493 493 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
494 494 if (*ppp == NULL) {
495 495 *ppp = kmem_zalloc(PAGESIZE, kmemflags);
496 496 if (*ppp == NULL) {
497 497 mutex_exit(&ahp->serial_lock);
498 498 return (ENOMEM);
499 499 }
500 500 }
501 501 mutex_exit(&ahp->serial_lock);
502 502 }
503 503 app = *ppp;
504 504 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
505 505 }
506 506 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
507 507 return (0);
508 508 }
509 509
510 510 /*
511 511 * Copy anon array into a given new anon array
512 512 */
513 513 int
514 514 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx,
515 515 struct anon_hdr *dahp, ulong_t d_idx,
516 516 pgcnt_t npages, int flags)
517 517 {
518 518 void **sapp, **dapp;
519 519 void *ap;
520 520 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
521 521
522 522 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
523 523 ASSERT((npages <= sahp->size) && (npages <= dahp->size));
524 524
525 525 /*
526 526 * Both arrays are 1 level.
527 527 */
528 528 if (((sahp->size <= ANON_CHUNK_SIZE) &&
529 529 (dahp->size <= ANON_CHUNK_SIZE)) ||
530 530 ((sahp->flags & ANON_ALLOC_FORCE) &&
531 531 (dahp->flags & ANON_ALLOC_FORCE))) {
532 532
533 533 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
534 534 npages * sizeof (struct anon *));
535 535 return (0);
536 536 }
537 537
538 538 /*
539 539 * Both arrays are 2 levels.
540 540 */
541 541 if (sahp->size > ANON_CHUNK_SIZE &&
542 542 dahp->size > ANON_CHUNK_SIZE &&
543 543 ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
544 544 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {
545 545
546 546 ulong_t sapidx, dapidx;
547 547 ulong_t *sap, *dap;
548 548 ulong_t chknp;
549 549
550 550 while (npages != 0) {
551 551
552 552 sapidx = s_idx & ANON_CHUNK_OFF;
553 553 dapidx = d_idx & ANON_CHUNK_OFF;
554 554 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
555 555 if (chknp > npages)
556 556 chknp = npages;
557 557
558 558 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
559 559 if ((sap = *sapp) != NULL) {
560 560 dapp = &dahp->array_chunk[d_idx
561 561 >> ANON_CHUNK_SHIFT];
562 562 if ((dap = *dapp) == NULL) {
563 563 *dapp = kmem_zalloc(PAGESIZE,
564 564 kmemflags);
565 565 if ((dap = *dapp) == NULL)
566 566 return (ENOMEM);
567 567 }
568 568 bcopy((sap + sapidx), (dap + dapidx),
569 569 chknp << ANON_PTRSHIFT);
570 570 }
571 571 s_idx += chknp;
572 572 d_idx += chknp;
573 573 npages -= chknp;
574 574 }
575 575 return (0);
576 576 }
577 577
578 578 /*
579 579 * At least one of the arrays is 2 level.
580 580 */
581 581 while (npages--) {
582 582 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
583 583 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
584 584 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
585 585 return (ENOMEM);
586 586 }
587 587 s_idx++;
588 588 d_idx++;
589 589 }
590 590 return (0);
591 591 }
592 592
593 593
594 594 /*
595 595 * ANON_INITBUF is a convenience macro for anon_grow() below. It
596 596 * takes a buffer dst, which is at least as large as buffer src. It
597 597 * does a bcopy from src into dst, and then bzeros the extra bytes
598 598 * of dst. If tail is set, the data in src is tail aligned within
599 599 * dst instead of head aligned.
600 600 */
601 601
602 602 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \
603 603 if (tail) { \
604 604 bzero((dst), (dstsize) - (srclen)); \
605 605 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
606 606 } else { \
607 607 bcopy((src), (dst), (srclen)); \
608 608 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \
609 609 }
610 610
611 611 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8)
612 612 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
613 613
614 614 /*
615 615 * anon_grow() is used to efficiently extend an existing anon array.
616 616 * startidx_p points to the index into the anon array of the first page
617 617 * that is in use. oldseg_pgs is the number of pages in use, starting at
618 618 * *startidx_p. newseg_pgs is the number of additional pages desired.
619 619 *
620 620 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
621 621 *
622 622 * The growth is done by creating a new top level of the anon array,
623 623 * and (if the array is 2-level) reusing the existing second level arrays.
624 624 *
625 625 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
626 626 *
627 627 * Returns the new number of pages in the anon array.
628 628 */
629 629 pgcnt_t
630 630 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs,
631 631 pgcnt_t newseg_pgs, int flags)
632 632 {
633 633 ulong_t startidx = startidx_p ? *startidx_p : 0;
634 634 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs;
635 635 pgcnt_t oelems, nelems, totpages;
636 636 void **level1;
637 637 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
638 638 int growdown = (flags & ANON_GROWDOWN);
639 639 size_t newarrsz, oldarrsz;
640 640 void *level2;
641 641
642 642 ASSERT(!(startidx_p == NULL && growdown));
643 643 ASSERT(startidx + oldseg_pgs <= ahp->size);
644 644
645 645 /*
646 646 * Determine the total number of pages needed in the new
647 647 * anon array. If growing down, totpages is all pages from
648 648 * startidx through the end of the array, plus <newseg_pgs>
649 649 * pages. If growing up, keep all pages from page 0 through
650 650 * the last page currently in use, plus <newseg_pgs> pages.
651 651 */
652 652 if (growdown)
653 653 totpages = oldamp_pgs - startidx + newseg_pgs;
654 654 else
655 655 totpages = startidx + oldseg_pgs + newseg_pgs;
656 656
657 657 /* If the array is already large enough, just return. */
658 658
659 659 if (oldamp_pgs >= totpages) {
660 660 if (growdown)
661 661 *startidx_p = oldamp_pgs - totpages;
662 662 return (oldamp_pgs);
663 663 }
664 664
665 665 /*
666 666 * oldamp_pgs/newamp_pgs are the total numbers of pages represented
667 667 * by the corresponding arrays.
668 668 * oelems/nelems are the number of pointers in the top level arrays
669 669 * which may be either level 1 or level 2.
670 670 * Will the new anon array be one level or two levels?
671 671 */
672 672 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
673 673 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
674 674 oelems = oldamp_pgs;
675 675 nelems = newamp_pgs;
676 676 } else {
677 677 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
678 678 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
679 679 nelems = newamp_pgs >> ANON_CHUNK_SHIFT;
680 680 }
681 681
682 682 newarrsz = nelems * sizeof (void *);
683 683 level1 = kmem_alloc(newarrsz, kmemflags);
684 684 if (level1 == NULL)
685 685 return (0);
686 686
687 687 /* Are we converting from a one level to a two level anon array? */
688 688
689 689 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE &&
690 690 !(ahp->flags & ANON_ALLOC_FORCE)) {
691 691
692 692 /*
693 693 * Yes, we're converting to a two level. Reuse old level 1
694 694 * as new level 2 if it is exactly PAGESIZE. Otherwise
695 695 * alloc a new level 2 and copy the old level 1 data into it.
696 696 */
697 697 if (oldamp_pgs == ANON_CHUNK_SIZE) {
698 698 level2 = (void *)ahp->array_chunk;
699 699 } else {
700 700 level2 = kmem_alloc(PAGESIZE, kmemflags);
701 701 if (level2 == NULL) {
702 702 kmem_free(level1, newarrsz);
703 703 return (0);
704 704 }
705 705 oldarrsz = oldamp_pgs * sizeof (void *);
706 706
707 707 ANON_INITBUF(ahp->array_chunk, oldarrsz,
708 708 level2, PAGESIZE, growdown);
709 709 kmem_free(ahp->array_chunk, oldarrsz);
710 710 }
711 711 bzero(level1, newarrsz);
712 712 if (growdown)
713 713 level1[nelems - 1] = level2;
714 714 else
715 715 level1[0] = level2;
716 716 } else {
717 717 oldarrsz = oelems * sizeof (void *);
718 718
719 719 ANON_INITBUF(ahp->array_chunk, oldarrsz,
720 720 level1, newarrsz, growdown);
721 721 kmem_free(ahp->array_chunk, oldarrsz);
722 722 }
723 723
724 724 ahp->array_chunk = level1;
725 725 ahp->size = newamp_pgs;
726 726 if (growdown)
727 727 *startidx_p = newamp_pgs - totpages;
728 728
729 729 return (newamp_pgs);
730 730 }
731 731
732 732
733 733 /*
734 734 * Called to sync ani_free value.
735 735 */
736 736
737 737 void
738 738 set_anoninfo(void)
739 739 {
740 740 processorid_t ix, max_seqid;
741 741 pgcnt_t total = 0;
742 742 static clock_t last_time;
743 743 clock_t new_time;
744 744
745 745 if (ani_free_pool == NULL)
746 746 return;
747 747
748 748 /*
749 749 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to
750 750 * identify the maximum number of CPUs that were ever online.
751 751 */
752 752 new_time = ddi_get_lbolt();
753 753 if (new_time > last_time) {
754 754
755 755 max_seqid = max_cpu_seqid_ever;
756 756 ASSERT(ANI_MAX_POOL > max_seqid);
757 757 for (ix = 0; ix <= max_seqid; ix++)
758 758 total += ani_free_pool[ix].ani_count;
759 759
760 760 last_time = new_time;
761 761 k_anoninfo.ani_free = total;
762 762 }
763 763 }
764 764
765 765 /*
766 766 * Reserve anon space.
767 767 *
768 768 * It's no longer simply a matter of incrementing ani_resv to
769 769 * reserve swap space, we need to check memory-based as well
770 770 * as disk-backed (physical) swap. The following algorithm
771 771 * is used:
772 772 * Check the space on physical swap
773 773 * i.e. amount needed < ani_max - ani_phys_resv
774 774 * If we are swapping on swapfs check
775 775 * amount needed < (availrmem - swapfs_minfree)
776 776 * Since the algorithm to check for the quantity of swap space is
777 777 * almost the same as that for reserving it, we'll just use anon_resvmem
778 778 * with a flag to decrement availrmem.
779 779 *
780 780 * Return non-zero on success.
781 781 */
782 782 int
783 783 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
784 784 {
785 785 pgcnt_t npages = btopr(size);
786 786 pgcnt_t mswap_pages = 0;
787 787 pgcnt_t pswap_pages = 0;
788 788 proc_t *p = curproc;
789 789
790 790 if (zone != NULL && takemem) {
791 791 /* test zone.max-swap resource control */
792 792 mutex_enter(&p->p_lock);
793 793 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
794 794 mutex_exit(&p->p_lock);
795 795 return (0);
796 796 }
797 797 mutex_exit(&p->p_lock);
798 798 }
799 799 mutex_enter(&anoninfo_lock);
800 800
801 801 /*
802 802 * pswap_pages is the number of pages we can take from
803 803 * physical (i.e. disk-backed) swap.
804 804 */
805 805 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
806 806 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
807 807
808 808 ANON_PRINT(A_RESV,
809 809 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
810 810 npages, takemem, pswap_pages, (void *)caller()));
811 811
812 812 if (npages <= pswap_pages) {
813 813 /*
814 814 * we have enough space on a physical swap
815 815 */
816 816 if (takemem)
817 817 k_anoninfo.ani_phys_resv += npages;
818 818 mutex_exit(&anoninfo_lock);
819 819 return (1);
820 820 } else if (pswap_pages != 0) {
821 821 /*
822 822 * we have some space on a physical swap
823 823 */
824 824 if (takemem) {
825 825 /*
826 826 * use up remainder of phys swap
827 827 */
828 828 k_anoninfo.ani_phys_resv += pswap_pages;
829 829 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
830 830 }
831 831 }
832 832 /*
833 833 * since (npages > pswap_pages) we need mem swap
834 834 * mswap_pages is the number of pages needed from availrmem
835 835 */
836 836 ASSERT(npages > pswap_pages);
837 837 mswap_pages = npages - pswap_pages;
838 838
839 839 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
840 840 mswap_pages));
841 841
842 842 /*
843 843 * priv processes can reserve memory as swap as long as availrmem
844 844 * remains greater than swapfs_minfree; in the case of non-priv
845 845 * processes, memory can be reserved as swap only if availrmem
846 846 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
847 847 * swapfs_reserve amount of memswap is not available to non-priv
848 848 * processes. This protects daemons such as automounter from dying
849 849 * as a result of application processes eating away almost the entire
850 850 * membased swap. This safeguard becomes useless if apps are run
851 851 * with root access.
852 852 *
853 853 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
854 854 *
855 855 */
856 856 if (tryhard) {
857 857 pgcnt_t floor_pages;
858 858
859 859 if (secpolicy_resource_anon_mem(CRED())) {
860 860 floor_pages = swapfs_minfree;
861 861 } else {
862 862 floor_pages = swapfs_minfree + swapfs_reserve;
863 863 }
864 864
865 865 mutex_exit(&anoninfo_lock);
866 866 (void) page_reclaim_mem(mswap_pages, floor_pages, 0);
867 867 mutex_enter(&anoninfo_lock);
868 868 }
869 869
870 870 mutex_enter(&freemem_lock);
871 871 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
872 872 (availrmem > (swapfs_minfree + mswap_pages) &&
873 873 secpolicy_resource(CRED()) == 0)) {
874 874
875 875 if (takemem) {
876 876 /*
877 877 * Take the memory from the rest of the system.
878 878 */
879 879 availrmem -= mswap_pages;
880 880 mutex_exit(&freemem_lock);
881 881 k_anoninfo.ani_mem_resv += mswap_pages;
882 882 ANI_ADD(mswap_pages);
883 883 ANON_PRINT((A_RESV | A_MRESV),
884 884 ("anon_resvmem: took %ld pages of availrmem\n",
885 885 mswap_pages));
886 886 } else {
887 887 mutex_exit(&freemem_lock);
888 888 }
889 889
890 890 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
891 891 mutex_exit(&anoninfo_lock);
892 892 return (1);
893 893 } else {
894 894 /*
895 895 * Fail if not enough memory
896 896 */
897 897 if (takemem) {
898 898 k_anoninfo.ani_phys_resv -= pswap_pages;
899 899 }
900 900
901 901 mutex_exit(&freemem_lock);
902 902 mutex_exit(&anoninfo_lock);
903 903 ANON_PRINT(A_RESV,
904 904 ("anon_resvmem: not enough space from swapfs\n"));
905 905 if (zone != NULL && takemem)
906 906 rctl_decr_swap(zone, ptob(npages));
907 907 return (0);
908 908 }
909 909 }
910 910
911 911 /*
912 912 * Give back an anon reservation.
913 913 */
914 914 void
915 915 anon_unresvmem(size_t size, zone_t *zone)
916 916 {
917 917 pgcnt_t npages = btopr(size);
918 918 spgcnt_t mem_free_pages = 0;
919 919 pgcnt_t phys_free_slots;
920 920 #ifdef ANON_DEBUG
921 921 pgcnt_t mem_resv;
922 922 #endif
923 923 if (zone != NULL)
924 924 rctl_decr_swap(zone, ptob(npages));
925 925
926 926 mutex_enter(&anoninfo_lock);
927 927
928 928 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
929 929
930 930 /*
931 931 * If some of this reservation belonged to swapfs
932 932 * give it back to availrmem.
933 933 * ani_mem_resv is the amount of availrmem swapfs has reserved.
934 934 * but some of that memory could be locked by segspt so we can only
935 935 * return non locked ani_mem_resv back to availrmem
936 936 */
937 937 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
938 938 ANON_PRINT((A_RESV | A_MRESV),
939 939 ("anon_unresv: growing availrmem by %ld pages\n",
940 940 MIN(k_anoninfo.ani_mem_resv, npages)));
941 941
942 942 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
943 943 k_anoninfo.ani_locked_swap), npages);
944 944 mutex_enter(&freemem_lock);
945 945 availrmem += mem_free_pages;
946 946 mutex_exit(&freemem_lock);
947 947 k_anoninfo.ani_mem_resv -= mem_free_pages;
948 948
949 949 ANI_ADD(-mem_free_pages);
950 950 }
951 951 /*
952 952 * The remainder of the pages is returned to phys swap
953 953 */
954 954 ASSERT(npages >= mem_free_pages);
955 955 phys_free_slots = npages - mem_free_pages;
956 956
957 957 if (phys_free_slots) {
958 958 k_anoninfo.ani_phys_resv -= phys_free_slots;
959 959 }
960 960
961 961 #ifdef ANON_DEBUG
962 962 mem_resv = k_anoninfo.ani_mem_resv;
963 963 #endif
964 964
965 965 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
966 966 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
967 967
968 968 mutex_exit(&anoninfo_lock);
969 969
970 970 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
971 971 npages, mem_resv, (void *)caller()));
972 972 }
973 973
974 974 /*
975 975 * Allocate an anon slot and return it with the lock held.
976 976 */
977 977 struct anon *
978 978 anon_alloc(struct vnode *vp, anoff_t off)
979 979 {
980 980 struct anon *ap;
981 981 kmutex_t *ahm;
982 982
983 983 ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
984 984 if (vp == NULL) {
985 985 swap_alloc(ap);
986 986 } else {
987 987 ap->an_vp = vp;
988 988 ap->an_off = off;
989 989 }
990 990 ap->an_refcnt = 1;
991 991 ap->an_pvp = NULL;
992 992 ap->an_poff = 0;
993 993 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
994 994 mutex_enter(ahm);
995 995 anon_addhash(ap);
996 996 mutex_exit(ahm);
997 997 ANI_ADD(-1);
998 998 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
999 999 (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
1000 1000 return (ap);
1001 1001 }
1002 1002
1003 1003 /*
1004 1004 * Called for pages locked in memory via softlock/pagelock/mlock to make sure
1005 1005 * such pages don't consume any physical swap resources needed for swapping
1006 1006 * unlocked pages.
1007 1007 */
1008 1008 void
1009 1009 anon_swap_free(struct anon *ap, page_t *pp)
1010 1010 {
1011 1011 kmutex_t *ahm;
1012 1012
1013 1013 ASSERT(ap != NULL);
1014 1014 ASSERT(pp != NULL);
1015 1015 ASSERT(PAGE_LOCKED(pp));
1016 1016 ASSERT(pp->p_vnode != NULL);
1017 1017 ASSERT(IS_SWAPFSVP(pp->p_vnode));
1018 1018 ASSERT(ap->an_refcnt != 0);
1019 1019 ASSERT(pp->p_vnode == ap->an_vp);
1020 1020 ASSERT(pp->p_offset == ap->an_off);
1021 1021
1022 1022 if (ap->an_pvp == NULL)
1023 1023 return;
1024 1024
1025 1025 page_io_lock(pp);
1026 1026 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1027 1027 mutex_enter(ahm);
1028 1028
1029 1029 ASSERT(ap->an_refcnt != 0);
1030 1030 ASSERT(pp->p_vnode == ap->an_vp);
1031 1031 ASSERT(pp->p_offset == ap->an_off);
1032 1032
1033 1033 if (ap->an_pvp != NULL) {
1034 1034 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1035 1035 ap->an_pvp = NULL;
1036 1036 ap->an_poff = 0;
1037 1037 mutex_exit(ahm);
1038 1038 hat_setmod(pp);
1039 1039 } else {
1040 1040 mutex_exit(ahm);
1041 1041 }
1042 1042 page_io_unlock(pp);
1043 1043 }
1044 1044
1045 1045 /*
1046 1046 * Decrement the reference count of an anon page.
1047 1047 * If reference count goes to zero, free it and
1048 1048 * its associated page (if any).
1049 1049 */
1050 1050 void
1051 1051 anon_decref(struct anon *ap)
1052 1052 {
1053 1053 page_t *pp;
1054 1054 struct vnode *vp;
1055 1055 anoff_t off;
1056 1056 kmutex_t *ahm;
1057 1057
1058 1058 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1059 1059 mutex_enter(ahm);
1060 1060 ASSERT(ap->an_refcnt != 0);
1061 1061 if (ap->an_refcnt == 0)
1062 1062 panic("anon_decref: slot count 0");
1063 1063 if (--ap->an_refcnt == 0) {
1064 1064 swap_xlate(ap, &vp, &off);
1065 1065 anon_rmhash(ap);
1066 1066 if (ap->an_pvp != NULL)
1067 1067 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1068 1068 mutex_exit(ahm);
1069 1069
1070 1070 /*
1071 1071 * If there is a page for this anon slot we will need to
1072 1072 * call VN_DISPOSE to get rid of the vp association and
1073 1073 * put the page back on the free list as really free.
1074 1074 * Acquire the "exclusive" lock to ensure that any
1075 1075 * pending i/o always completes before the swap slot
1076 1076 * is freed.
1077 1077 */
1078 1078 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
1079 1079 if (pp != NULL) {
1080 1080 /*LINTED: constant in conditional context */
1081 1081 VN_DISPOSE(pp, B_INVAL, 0, kcred);
1082 1082 }
1083 1083 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
1084 1084 (void *)ap, (void *)ap->an_vp));
1085 1085
1086 1086 kmem_cache_free(anon_cache, ap);
1087 1087
1088 1088 ANI_ADD(1);
1089 1089 } else {
1090 1090 mutex_exit(ahm);
1091 1091 }
1092 1092 }
1093 1093
1094 1094
1095 1095 /*
1096 1096 * check an_refcnt of the root anon slot (anon_index argument is aligned at
1097 1097 * seg->s_szc level) to determine whether COW processing is required.
1098 1098 * anonpages_hash_lock[] held on the root ap ensures that if root's
1099 1099 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
1100 1100 * later since this process can't fork while its AS lock is held).
1101 1101 *
1102 1102 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0.
1103 1103 */
1104 1104 int
1105 1105 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
1106 1106 {
1107 1107 struct anon *ap;
1108 1108 kmutex_t *ahmpages = NULL;
1109 1109
1110 1110 ap = anon_get_ptr(ahp, anon_index);
1111 1111 if (ap == NULL)
1112 1112 return (0);
1113 1113
1114 1114 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1115 1115 mutex_enter(ahmpages);
1116 1116 ASSERT(ap->an_refcnt >= 1);
1117 1117 if (ap->an_refcnt == 1) {
1118 1118 mutex_exit(ahmpages);
1119 1119 return (0);
1120 1120 }
1121 1121 mutex_exit(ahmpages);
1122 1122 return (1);
1123 1123 }
1124 1124 /*
1125 1125 * Check 'nslots' anon slots for refcnt > 1.
1126 1126 *
1127 1127 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
1128 1128 * returns 0.
1129 1129 */
1130 1130 static int
1131 1131 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
1132 1132 {
1133 1133 struct anon *ap;
1134 1134
1135 1135 while (nslots-- > 0) {
1136 1136 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
1137 1137 ap->an_refcnt > 1)
1138 1138 return (1);
1139 1139 anon_index++;
1140 1140 }
1141 1141
1142 1142 return (0);
1143 1143 }
1144 1144
/*
 * Drop one reference on every anon slot of the large page region that
 * starts at an_idx in ahp, clearing the corresponding anon array entries.
 *
 * szc is the (non-zero) page size code of the region; an_idx must be
 * aligned to that page count.  The region is clipped to the end of the
 * anon array.
 *
 * The root slot's refcnt is sampled under its anonpages hash lock.  If it
 * is 1 the lock is dropped and each slot is freed together with its page
 * (or, for large pages, all constituent pages).  Otherwise the lock stays
 * held for the whole walk and the slots are only decremented.
 */
1145 1145 static void
1146 1146 anon_decref_pages(
1147 1147 struct anon_hdr *ahp,
1148 1148 ulong_t an_idx,
1149 1149 uint_t szc)
1150 1150 {
1151 1151 struct anon *ap = anon_get_ptr(ahp, an_idx);
1152 1152 kmutex_t *ahmpages = NULL;
1153 1153 page_t *pp;
1154 1154 pgcnt_t pgcnt = page_get_pagecnt(szc);
1155 1155 pgcnt_t i;
1156 1156 struct vnode *vp;
1157 1157 anoff_t off;
1158 1158 kmutex_t *ahm;
/* refcnt exists only in DEBUG builds and is referenced only from ASSERTs. */
1159 1159 #ifdef DEBUG
1160 1160 int refcnt = 1;
1161 1161 #endif
1162 1162
1163 1163 ASSERT(szc != 0);
1164 1164 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1165 1165 ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1166 1166 ASSERT(an_idx < ahp->size);
1167 1167
1168 1168 if (ahp->size - an_idx < pgcnt) {
1169 1169 /*
1170 1170 * In case of shared mappings total anon map size may not be
1171 1171 * the largest page size aligned.
1172 1172 */
1173 1173 pgcnt = ahp->size - an_idx;
1174 1174 }
1175 1175
1176 1176 VM_STAT_ADD(anonvmstats.decrefpages[0]);
1177 1177
/*
 * Sample the root slot's refcnt under its anonpages hash lock.  The lock
 * is released right away when the region is unshared (refcnt == 1) and
 * kept until the end of the function otherwise.
 */
1178 1178 if (ap != NULL) {
1179 1179 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1180 1180 mutex_enter(ahmpages);
1181 1181 ASSERT((refcnt = ap->an_refcnt) != 0);
1182 1182 VM_STAT_ADD(anonvmstats.decrefpages[1]);
1183 1183 if (ap->an_refcnt == 1) {
1184 1184 VM_STAT_ADD(anonvmstats.decrefpages[2]);
1185 1185 ASSERT(!anon_share(ahp, an_idx, pgcnt));
1186 1186 mutex_exit(ahmpages);
1187 1187 ahmpages = NULL;
1188 1188 }
1189 1189 }
1190 1190
1191 1191 i = 0;
1192 1192 while (i < pgcnt) {
1193 1193 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
1194 1194 ASSERT(refcnt == 1 && ahmpages == NULL);
1195 1195 i++;
1196 1196 continue;
1197 1197 }
1198 1198 ASSERT(ap->an_refcnt == refcnt);
1199 1199 ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1200 1200 ASSERT(ahmpages == NULL || ap->an_refcnt > 1);
1201 1201
/* ahmpages == NULL: unshared region, this is the last reference. */
1202 1202 if (ahmpages == NULL) {
1203 1203 swap_xlate(ap, &vp, &off);
1204 1204 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
/* No resident page, or a small page: free this one slot on its own. */
1205 1205 if (pp == NULL || pp->p_szc == 0) {
1206 1206 VM_STAT_ADD(anonvmstats.decrefpages[3]);
1207 1207 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1208 1208 (void) anon_set_ptr(ahp, an_idx + i, NULL,
1209 1209 ANON_SLEEP);
1210 1210 mutex_enter(ahm);
1211 1211 ap->an_refcnt--;
1212 1212 ASSERT(ap->an_refcnt == 0);
1213 1213 anon_rmhash(ap);
1214 1214 if (ap->an_pvp)
1215 1215 swap_phys_free(ap->an_pvp, ap->an_poff,
1216 1216 PAGESIZE);
1217 1217 mutex_exit(ahm);
/* Look the page up a second time if the first lookup found nothing. */
1218 1218 if (pp == NULL) {
1219 1219 pp = page_lookup(vp, (u_offset_t)off,
1220 1220 SE_EXCL);
1221 1221 ASSERT(pp == NULL || pp->p_szc == 0);
1222 1222 }
1223 1223 if (pp != NULL) {
1224 1224 VM_STAT_ADD(anonvmstats.decrefpages[4]);
1225 1225 /*LINTED*/
1226 1226 VN_DISPOSE(pp, B_INVAL, 0, kcred);
1227 1227 }
1228 1228 kmem_cache_free(anon_cache, ap);
1229 1229 ANI_ADD(1);
1230 1230 i++;
1231 1231 } else {
/*
 * Large page: collect all constituent pages in ppa[] so the whole large
 * page can be destroyed, or disposed of page by page, in one go.
 */
1232 1232 pgcnt_t j;
1233 1233 pgcnt_t curpgcnt =
1234 1234 page_get_pagecnt(pp->p_szc);
1235 1235 size_t ppasize = curpgcnt * sizeof (page_t *);
1236 1236 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
1237 1237 int dispose = 0;
1238 1238
1239 1239 VM_STAT_ADD(anonvmstats.decrefpages[5]);
1240 1240
1241 1241 ASSERT(pp->p_szc <= szc);
1242 1242 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
1243 1243 ASSERT(IS_P2ALIGNED(i, curpgcnt));
1244 1244 ASSERT(i + curpgcnt <= pgcnt);
1245 1245 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
1246 1246 ppa[0] = pp;
/* Lock and unload the remaining constituent pages (the root is ppa[0]). */
1247 1247 for (j = i + 1; j < i + curpgcnt; j++) {
1248 1248 ap = anon_get_ptr(ahp, an_idx + j);
1249 1249 ASSERT(ap != NULL &&
1250 1250 ap->an_refcnt == 1);
1251 1251 swap_xlate(ap, &vp, &off);
1252 1252 pp = page_lookup(vp, (u_offset_t)off,
1253 1253 SE_EXCL);
1254 1254 if (pp == NULL)
1255 1255 panic("anon_decref_pages: "
1256 1256 "no page");
1257 1257
1258 1258 (void) hat_pageunload(pp,
1259 1259 HAT_FORCE_PGUNLOAD);
1260 1260 ASSERT(pp->p_szc == ppa[0]->p_szc);
1261 1261 ASSERT(page_pptonum(pp) - 1 ==
1262 1262 page_pptonum(ppa[j - i - 1]));
1263 1263 ppa[j - i] = pp;
/*
 * A backing vnode whose dispose operation is not fs_dispose forces the
 * per-page VN_DISPOSE path below instead of page_destroy_pages().
 */
1264 1264 if (ap->an_pvp != NULL &&
1265 1265 !vn_matchopval(ap->an_pvp,
1266 1266 VOPNAME_DISPOSE,
1267 1267 (fs_generic_func_p)fs_dispose))
1268 1268 dispose = 1;
1269 1269 }
/* Free every anon slot covered by this large page. */
1270 1270 for (j = i; j < i + curpgcnt; j++) {
1271 1271 ap = anon_get_ptr(ahp, an_idx + j);
1272 1272 ASSERT(ap != NULL &&
1273 1273 ap->an_refcnt == 1);
1274 1274 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1275 1275 (void) anon_set_ptr(ahp, an_idx + j,
1276 1276 NULL, ANON_SLEEP);
1277 1277 mutex_enter(ahm);
1278 1278 ap->an_refcnt--;
1279 1279 ASSERT(ap->an_refcnt == 0);
1280 1280 anon_rmhash(ap);
1281 1281 if (ap->an_pvp)
1282 1282 swap_phys_free(ap->an_pvp,
1283 1283 ap->an_poff, PAGESIZE);
1284 1284 mutex_exit(ahm);
1285 1285 kmem_cache_free(anon_cache, ap);
1286 1286 ANI_ADD(1);
1287 1287 }
1288 1288 if (!dispose) {
1289 1289 VM_STAT_ADD(anonvmstats.decrefpages[6]);
1290 1290 page_destroy_pages(ppa[0]);
1291 1291 } else {
1292 1292 VM_STAT_ADD(anonvmstats.decrefpages[7]);
/* Reset p_szc on every constituent page before disposing of them singly. */
1293 1293 for (j = 0; j < curpgcnt; j++) {
1294 1294 ASSERT(PAGE_EXCL(ppa[j]));
1295 1295 ppa[j]->p_szc = 0;
1296 1296 }
1297 1297 for (j = 0; j < curpgcnt; j++) {
1298 1298 ASSERT(!hat_page_is_mapped(
1299 1299 ppa[j]));
1300 1300 /*LINTED*/
1301 1301 VN_DISPOSE(ppa[j], B_INVAL, 0,
1302 1302 kcred);
1303 1303 }
1304 1304 }
1305 1305 kmem_free(ppa, ppasize);
1306 1306 i += curpgcnt;
1307 1307 }
1308 1308 } else {
/* Shared region: only clear the array entry and drop this reference. */
1309 1309 VM_STAT_ADD(anonvmstats.decrefpages[8]);
1310 1310 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
1311 1311 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1312 1312 mutex_enter(ahm);
1313 1313 ap->an_refcnt--;
1314 1314 mutex_exit(ahm);
1315 1315 i++;
1316 1316 }
1317 1317 }
1318 1318
1319 1319 if (ahmpages != NULL) {
1320 1320 mutex_exit(ahmpages);
1321 1321 }
1322 1322 }
1323 1323
1324 1324 /*
1325 1325 * Duplicate references to size bytes worth of anon pages.
1326 1326 * Used when duplicating a segment that contains private anon pages.
1327 1327 * This code assumes that procedure calling this one has already used
1328 1328 * hat_chgprot() to disable write access to the range of addresses that
1329 1329 * that *old actually refers to.
1330 1330 */
1331 1331 void
1332 1332 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
1333 1333 ulong_t new_idx, size_t size)
1334 1334 {
1335 1335 spgcnt_t npages;
1336 1336 kmutex_t *ahm;
1337 1337 struct anon *ap;
1338 1338 ulong_t off;
1339 1339 ulong_t index;
1340 1340
1341 1341 npages = btopr(size);
1342 1342 while (npages > 0) {
1343 1343 index = old_idx;
1344 1344 if ((ap = anon_get_next_ptr(old, &index)) == NULL)
1345 1345 break;
1346 1346
1347 1347 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1348 1348 off = index - old_idx;
1349 1349 npages -= off;
1350 1350 if (npages <= 0)
1351 1351 break;
1352 1352
1353 1353 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
1354 1354 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1355 1355
1356 1356 mutex_enter(ahm);
1357 1357 ap->an_refcnt++;
1358 1358 mutex_exit(ahm);
1359 1359
1360 1360 off++;
1361 1361 new_idx += off;
1362 1362 old_idx += off;
1363 1363 npages--;
1364 1364 }
1365 1365 }
1366 1366
1367 1367 /*
1368 1368 * Just like anon_dup but also guarantees there are no holes (unallocated anon
1369 1369 * slots) within any large page region. That means if a large page region is
1370 1370 * empty in the old array it will skip it. If there are 1 or more valid slots
1371 1371 * in the large page region of the old array it will make sure to fill in any
1372 1372 * unallocated ones and also copy them to the new array. If noalloc is 1 large
1373 1373 * page region should either have no valid anon slots or all slots should be
1374 1374 * valid.
1375 1375 */
/*
 * Arguments:
 *	old, old_idx	source anon array and start index (asserted to be
 *			aligned to the page count of szc)
 *	new, new_idx	destination anon array and start index
 *	size		length in bytes; the page count is asserted to be a
 *			multiple of the page count of szc
 *	szc		page size code, must be non-zero
 *	noalloc		when non-zero, an empty slot inside a populated
 *			region causes a panic instead of being filled
 */
1376 1376 void
1377 1377 anon_dup_fill_holes(
1378 1378 struct anon_hdr *old,
1379 1379 ulong_t old_idx,
1380 1380 struct anon_hdr *new,
1381 1381 ulong_t new_idx,
1382 1382 size_t size,
1383 1383 uint_t szc,
1384 1384 int noalloc)
1385 1385 {
1386 1386 struct anon *ap;
1387 1387 spgcnt_t npages;
1388 1388 kmutex_t *ahm, *ahmpages = NULL;
1389 1389 pgcnt_t pgcnt, i;
1390 1390 ulong_t index, off;
/* refcnt exists only in DEBUG builds and is referenced only from ASSERTs. */
1391 1391 #ifdef DEBUG
1392 1392 int refcnt;
1393 1393 #endif
1394 1394
1395 1395 ASSERT(szc != 0);
1396 1396 pgcnt = page_get_pagecnt(szc);
1397 1397 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1398 1398 npages = btopr(size);
1399 1399 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1400 1400 ASSERT(IS_P2ALIGNED(old_idx, pgcnt));
1401 1401
1402 1402 VM_STAT_ADD(anonvmstats.dupfillholes[0]);
1403 1403
1404 1404 while (npages > 0) {
1405 1405 index = old_idx;
1406 1406
1407 1407 /*
1408 1408 * Find the next valid slot.
1409 1409 */
1410 1410 if (anon_get_next_ptr(old, &index) == NULL)
1411 1411 break;
1412 1412
1413 1413 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1414 1414 /*
1415 1415 * Now backup index to the beginning of the
1416 1416 * current large page region of the old array.
1417 1417 */
1418 1418 index = P2ALIGN(index, pgcnt);
/* off is the number of slots in the empty regions that were skipped. */
1419 1419 off = index - old_idx;
1420 1420 ASSERT(IS_P2ALIGNED(off, pgcnt));
1421 1421 npages -= off;
1422 1422 if (npages <= 0)
1423 1423 break;
1424 1424
1425 1425 /*
1426 1426 * Fill and copy a large page regions worth
1427 1427 * of anon slots.
1428 1428 */
1429 1429 for (i = 0; i < pgcnt; i++) {
1430 1430 if ((ap = anon_get_ptr(old, index + i)) == NULL) {
1431 1431 if (noalloc) {
1432 1432 panic("anon_dup_fill_holes: "
1433 1433 "empty anon slot\n");
1434 1434 }
1435 1435 VM_STAT_ADD(anonvmstats.dupfillholes[1]);
/* Fill the hole with a freshly allocated slot (refcnt 1). */
1436 1436 ap = anon_alloc(NULL, 0);
1437 1437 (void) anon_set_ptr(old, index + i, ap,
1438 1438 ANON_SLEEP);
1439 1439 } else if (i == 0) {
1440 1440 /*
1441 1441 * make the increment of all refcnts of all
1442 1442 * anon slots of a large page appear atomic by
1443 1443 * getting an anonpages_hash_lock for the
1444 1444 * first anon slot of a large page.
1445 1445 */
1446 1446 VM_STAT_ADD(anonvmstats.dupfillholes[2]);
1447 1447
1448 1448 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1449 1449 mutex_enter(ahmpages);
1450 1450 /*LINTED*/
1451 1451 ASSERT(refcnt = ap->an_refcnt);
1452 1452
1453 1453 VM_STAT_COND_ADD(ap->an_refcnt > 1,
1454 1454 anonvmstats.dupfillholes[3]);
1455 1455 }
/* Enter the slot in the new array and take a reference for it. */
1456 1456 (void) anon_set_ptr(new, new_idx + off + i, ap,
1457 1457 ANON_SLEEP);
1458 1458 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1459 1459 mutex_enter(ahm);
1460 1460 ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1461 1461 ASSERT(i == 0 || ahmpages == NULL ||
1462 1462 refcnt == ap->an_refcnt);
1463 1463 ap->an_refcnt++;
1464 1464 mutex_exit(ahm);
1465 1465 }
1466 1466 if (ahmpages != NULL) {
1467 1467 mutex_exit(ahmpages);
1468 1468 ahmpages = NULL;
1469 1469 }
/* Step both arrays past the skipped slots plus the region just copied. */
1470 1470 off += pgcnt;
1471 1471 new_idx += off;
1472 1472 old_idx += off;
1473 1473 npages -= pgcnt;
1474 1474 }
1475 1475 }
1476 1476
1477 1477 /*
1478 1478 * Used when a segment with a vnode changes szc. similarly to
1479 1479 * anon_dup_fill_holes() makes sure each large page region either has no anon
1480 1480 * slots or all of them. but new slots are created by COWing the file
1481 1481 * pages. on entrance no anon slots should be shared.
1482 1482 */
1483 1483 int
1484 1484 anon_fill_cow_holes(
1485 1485 struct seg *seg,
1486 1486 caddr_t addr,
1487 1487 struct anon_hdr *ahp,
1488 1488 ulong_t an_idx,
1489 1489 struct vnode *vp,
1490 1490 u_offset_t vp_off,
1491 1491 size_t size,
1492 1492 uint_t szc,
1493 1493 uint_t prot,
1494 1494 struct vpage vpage[],
1495 1495 struct cred *cred)
1496 1496 {
1497 1497 struct anon *ap;
1498 1498 spgcnt_t npages;
1499 1499 pgcnt_t pgcnt, i;
1500 1500 ulong_t index, off;
1501 1501 int err = 0;
1502 1502 int pageflags = 0;
1503 1503
1504 1504 ASSERT(szc != 0);
1505 1505 pgcnt = page_get_pagecnt(szc);
1506 1506 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1507 1507 npages = btopr(size);
1508 1508 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1509 1509 ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1510 1510
1511 1511 while (npages > 0) {
1512 1512 index = an_idx;
1513 1513
1514 1514 /*
1515 1515 * Find the next valid slot.
1516 1516 */
1517 1517 if (anon_get_next_ptr(ahp, &index) == NULL) {
1518 1518 break;
1519 1519 }
1520 1520
1521 1521 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1522 1522 /*
1523 1523 * Now backup index to the beginning of the
1524 1524 * current large page region of the anon array.
1525 1525 */
1526 1526 index = P2ALIGN(index, pgcnt);
1527 1527 off = index - an_idx;
1528 1528 ASSERT(IS_P2ALIGNED(off, pgcnt));
1529 1529 npages -= off;
1530 1530 if (npages <= 0)
1531 1531 break;
1532 1532 an_idx += off;
1533 1533 vp_off += ptob(off);
1534 1534 addr += ptob(off);
1535 1535 if (vpage != NULL) {
1536 1536 vpage += off;
1537 1537 }
1538 1538
1539 1539 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) {
1540 1540 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) {
1541 1541 page_t *pl[1 + 1];
1542 1542 page_t *pp;
1543 1543
1544 1544 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL,
1545 1545 pl, PAGESIZE, seg, addr, S_READ, cred,
1546 1546 NULL);
1547 1547 if (err) {
1548 1548 break;
1549 1549 }
1550 1550 if (vpage != NULL) {
1551 1551 prot = VPP_PROT(vpage);
1552 1552 pageflags = VPP_ISPPLOCK(vpage) ?
1553 1553 LOCK_PAGE : 0;
1554 1554 }
1555 1555 pp = anon_private(&ap, seg, addr, prot, pl[0],
1556 1556 pageflags, cred);
1557 1557 if (pp == NULL) {
1558 1558 err = ENOMEM;
1559 1559 break;
1560 1560 }
1561 1561 (void) anon_set_ptr(ahp, an_idx, ap,
1562 1562 ANON_SLEEP);
1563 1563 page_unlock(pp);
1564 1564 }
1565 1565 ASSERT(ap->an_refcnt == 1);
1566 1566 addr += PAGESIZE;
1567 1567 if (vpage != NULL) {
1568 1568 vpage++;
1569 1569 }
1570 1570 }
1571 1571 npages -= pgcnt;
1572 1572 }
1573 1573
1574 1574 return (err);
1575 1575 }
1576 1576
1577 1577 /*
1578 1578 * Free a group of "size" anon pages, size in bytes,
1579 1579 * and clear out the pointers to the anon entries.
1580 1580 */
1581 1581 void
1582 1582 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size)
1583 1583 {
1584 1584 spgcnt_t npages;
1585 1585 struct anon *ap;
1586 1586 ulong_t old;
1587 1587
1588 1588 npages = btopr(size);
1589 1589
1590 1590 while (npages > 0) {
1591 1591 old = index;
1592 1592 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1593 1593 break;
1594 1594
1595 1595 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1596 1596 npages -= index - old;
1597 1597 if (npages <= 0)
1598 1598 break;
1599 1599
1600 1600 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP);
1601 1601 anon_decref(ap);
1602 1602 /*
1603 1603 * Bump index and decrement page count
1604 1604 */
1605 1605 index++;
1606 1606 npages--;
1607 1607 }
1608 1608 }
1609 1609
1610 1610 void
1611 1611 anon_free_pages(
1612 1612 struct anon_hdr *ahp,
1613 1613 ulong_t an_idx,
1614 1614 size_t size,
1615 1615 uint_t szc)
1616 1616 {
1617 1617 spgcnt_t npages;
1618 1618 pgcnt_t pgcnt;
1619 1619 ulong_t index, off;
1620 1620
1621 1621 ASSERT(szc != 0);
1622 1622 pgcnt = page_get_pagecnt(szc);
1623 1623 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1624 1624 npages = btopr(size);
1625 1625 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1626 1626 ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1627 1627 ASSERT(an_idx < ahp->size);
1628 1628
1629 1629 VM_STAT_ADD(anonvmstats.freepages[0]);
1630 1630
1631 1631 while (npages > 0) {
1632 1632 index = an_idx;
1633 1633
1634 1634 /*
1635 1635 * Find the next valid slot.
1636 1636 */
1637 1637 if (anon_get_next_ptr(ahp, &index) == NULL)
1638 1638 break;
1639 1639
1640 1640 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1641 1641 /*
1642 1642 * Now backup index to the beginning of the
1643 1643 * current large page region of the old array.
1644 1644 */
1645 1645 index = P2ALIGN(index, pgcnt);
1646 1646 off = index - an_idx;
1647 1647 ASSERT(IS_P2ALIGNED(off, pgcnt));
1648 1648 npages -= off;
1649 1649 if (npages <= 0)
1650 1650 break;
1651 1651
1652 1652 anon_decref_pages(ahp, index, szc);
1653 1653
1654 1654 off += pgcnt;
1655 1655 an_idx += off;
1656 1656 npages -= pgcnt;
1657 1657 }
1658 1658 }
1659 1659
1660 1660 /*
1661 1661 * Make anonymous pages discardable
1662 1662 */
1663 1663 void
1664 1664 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size)
1665 1665 {
1666 1666 spgcnt_t npages = btopr(size);
1667 1667 struct anon *ap;
1668 1668 struct vnode *vp;
1669 1669 anoff_t off;
1670 1670 page_t *pp, *root_pp;
1671 1671 kmutex_t *ahm;
1672 1672 pgcnt_t pgcnt;
1673 1673 ulong_t old_idx, idx, i;
1674 1674 struct anon_hdr *ahp = amp->ahp;
1675 1675 anon_sync_obj_t cookie;
1676 1676
1677 1677 ASSERT(RW_READ_HELD(&->a_rwlock));
1678 1678 pgcnt = 1;
1679 1679 for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
1680 1680 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {
1681 1681
1682 1682 /*
1683 1683 * get anon pointer and index for the first valid entry
1684 1684 * in the anon list, starting from "index"
1685 1685 */
1686 1686 old_idx = index;
1687 1687 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1688 1688 break;
1689 1689
1690 1690 /*
1691 1691 * decrement npages by number of NULL anon slots we skipped
1692 1692 */
1693 1693 npages -= index - old_idx;
1694 1694 if (npages <= 0)
1695 1695 break;
1696 1696
1697 1697 anon_array_enter(amp, index, &cookie);
1698 1698 ap = anon_get_ptr(ahp, index);
1699 1699 ASSERT(ap != NULL);
1700 1700
1701 1701 /*
1702 1702 * Get anonymous page and try to lock it SE_EXCL;
1703 1703 * if we couldn't grab the lock we skip to next page.
1704 1704 */
1705 1705 swap_xlate(ap, &vp, &off);
1706 1706 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
1707 1707 if (pp == NULL) {
1708 1708 segadvstat.MADV_FREE_miss.value.ul++;
1709 1709 pgcnt = 1;
1710 1710 anon_array_exit(&cookie);
1711 1711 continue;
1712 1712 }
1713 1713 pgcnt = page_get_pagecnt(pp->p_szc);
1714 1714
1715 1715 /*
1716 1716 * we cannot free a page which is permanently locked.
1717 1717 * The page_struct_lock need not be acquired to examine
1718 1718 * these fields since the page has an "exclusive" lock.
1719 1719 */
1720 1720 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1721 1721 page_unlock(pp);
1722 1722 segadvstat.MADV_FREE_miss.value.ul++;
1723 1723 anon_array_exit(&cookie);
1724 1724 continue;
1725 1725 }
1726 1726
1727 1727 ahm = AH_MUTEX(vp, off);
1728 1728 mutex_enter(ahm);
1729 1729 ASSERT(ap->an_refcnt != 0);
1730 1730 /*
1731 1731 * skip this one if copy-on-write is not yet broken.
1732 1732 */
1733 1733 if (ap->an_refcnt > 1) {
1734 1734 mutex_exit(ahm);
1735 1735 page_unlock(pp);
1736 1736 segadvstat.MADV_FREE_miss.value.ul++;
1737 1737 anon_array_exit(&cookie);
1738 1738 continue;
1739 1739 }
1740 1740
1741 1741 if (pp->p_szc == 0) {
1742 1742 pgcnt = 1;
1743 1743
1744 1744 /*
1745 1745 * free swap slot;
1746 1746 */
1747 1747 if (ap->an_pvp) {
1748 1748 swap_phys_free(ap->an_pvp, ap->an_poff,
1749 1749 PAGESIZE);
1750 1750 ap->an_pvp = NULL;
1751 1751 ap->an_poff = 0;
1752 1752 }
1753 1753 mutex_exit(ahm);
1754 1754 segadvstat.MADV_FREE_hit.value.ul++;
1755 1755
1756 1756 /*
1757 1757 * while we are at it, unload all the translations
1758 1758 * and attempt to free the page.
1759 1759 */
1760 1760 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1761 1761 /*LINTED: constant in conditional context */
1762 1762 VN_DISPOSE(pp, B_FREE, 0, kcred);
1763 1763 anon_array_exit(&cookie);
1764 1764 continue;
1765 1765 }
1766 1766
1767 1767 pgcnt = page_get_pagecnt(pp->p_szc);
1768 1768 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) {
1769 1769 if (!page_try_demote_pages(pp)) {
1770 1770 mutex_exit(ahm);
1771 1771 page_unlock(pp);
1772 1772 segadvstat.MADV_FREE_miss.value.ul++;
1773 1773 anon_array_exit(&cookie);
1774 1774 continue;
1775 1775 } else {
1776 1776 pgcnt = 1;
1777 1777 if (ap->an_pvp) {
1778 1778 swap_phys_free(ap->an_pvp,
1779 1779 ap->an_poff, PAGESIZE);
1780 1780 ap->an_pvp = NULL;
1781 1781 ap->an_poff = 0;
1782 1782 }
1783 1783 mutex_exit(ahm);
1784 1784 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1785 1785 /*LINTED*/
1786 1786 VN_DISPOSE(pp, B_FREE, 0, kcred);
1787 1787 segadvstat.MADV_FREE_hit.value.ul++;
1788 1788 anon_array_exit(&cookie);
1789 1789 continue;
1790 1790 }
1791 1791 }
1792 1792 mutex_exit(ahm);
1793 1793 root_pp = pp;
1794 1794
1795 1795 /*
1796 1796 * try to lock remaining pages
1797 1797 */
1798 1798 for (idx = 1; idx < pgcnt; idx++) {
1799 1799 pp++;
1800 1800 if (!page_trylock(pp, SE_EXCL))
1801 1801 break;
1802 1802 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1803 1803 page_unlock(pp);
1804 1804 break;
1805 1805 }
1806 1806 }
1807 1807
1808 1808 if (idx == pgcnt) {
1809 1809 for (i = 0; i < pgcnt; i++) {
1810 1810 ap = anon_get_ptr(ahp, index + i);
1811 1811 if (ap == NULL)
1812 1812 break;
1813 1813 swap_xlate(ap, &vp, &off);
1814 1814 ahm = AH_MUTEX(vp, off);
1815 1815 mutex_enter(ahm);
1816 1816 ASSERT(ap->an_refcnt != 0);
1817 1817
1818 1818 /*
1819 1819 * skip this one if copy-on-write
1820 1820 * is not yet broken.
1821 1821 */
1822 1822 if (ap->an_refcnt > 1) {
1823 1823 mutex_exit(ahm);
1824 1824 goto skiplp;
1825 1825 }
1826 1826 if (ap->an_pvp) {
1827 1827 swap_phys_free(ap->an_pvp,
1828 1828 ap->an_poff, PAGESIZE);
1829 1829 ap->an_pvp = NULL;
1830 1830 ap->an_poff = 0;
1831 1831 }
1832 1832 mutex_exit(ahm);
1833 1833 }
1834 1834 page_destroy_pages(root_pp);
1835 1835 segadvstat.MADV_FREE_hit.value.ul += pgcnt;
1836 1836 anon_array_exit(&cookie);
1837 1837 continue;
1838 1838 }
1839 1839 skiplp:
1840 1840 segadvstat.MADV_FREE_miss.value.ul += pgcnt;
1841 1841 for (i = 0, pp = root_pp; i < idx; pp++, i++)
1842 1842 page_unlock(pp);
1843 1843 anon_array_exit(&cookie);
1844 1844 }
1845 1845 }
1846 1846
1847 1847 /*
1848 1848 * Return the kept page(s) and protections back to the segment driver.
1849 1849 */
1850 1850 int
1851 1851 anon_getpage(
1852 1852 struct anon **app,
1853 1853 uint_t *protp,
1854 1854 page_t *pl[],
1855 1855 size_t plsz,
1856 1856 struct seg *seg,
1857 1857 caddr_t addr,
1858 1858 enum seg_rw rw,
1859 1859 struct cred *cred)
1860 1860 {
1861 1861 page_t *pp;
1862 1862 struct anon *ap = *app;
1863 1863 struct vnode *vp;
1864 1864 anoff_t off;
1865 1865 int err;
1866 1866 kmutex_t *ahm;
1867 1867
1868 1868 swap_xlate(ap, &vp, &off);
1869 1869
1870 1870 /*
1871 1871 * Lookup the page. If page is being paged in,
1872 1872 * wait for it to finish as we must return a list of
1873 1873 * pages since this routine acts like the VOP_GETPAGE
1874 1874 * routine does.
1875 1875 */
1876 1876 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) {
1877 1877 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1878 1878 mutex_enter(ahm);
1879 1879 if (ap->an_refcnt == 1)
1880 1880 *protp = PROT_ALL;
1881 1881 else
1882 1882 *protp = PROT_ALL & ~PROT_WRITE;
1883 1883 mutex_exit(ahm);
1884 1884 pl[0] = pp;
1885 1885 pl[1] = NULL;
1886 1886 return (0);
1887 1887 }
1888 1888
1889 1889 /*
1890 1890 * Simply treat it as a vnode fault on the anon vp.
1891 1891 */
1892 1892
1893 1893 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
1894 1894 "anon_getpage:seg %x addr %x vp %x",
1895 1895 seg, addr, vp);
1896 1896
1897 1897 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
1898 1898 seg, addr, rw, cred, NULL);
1899 1899
1900 1900 if (err == 0 && pl != NULL) {
1901 1901 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1902 1902 mutex_enter(ahm);
1903 1903 if (ap->an_refcnt != 1)
1904 1904 *protp &= ~PROT_WRITE; /* make read-only */
1905 1905 mutex_exit(ahm);
1906 1906 }
1907 1907 return (err);
1908 1908 }
1909 1909
1910 1910 /*
1911 1911 * Creates or returns kept pages to the segment driver. returns -1 if a large
1912 1912 * page cannot be allocated. returns -2 if some other process has allocated a
1913 1913 * larger page.
1914 1914 *
1915 1915 * For cowfault it will allocate any size pages to fill the requested area to
1916 1916 * avoid partially overwriting anon slots (i.e. sharing only some of the anon
1917 1917 * slots within a large page with other processes). This policy greatly
1918 1918 * simplifies large page freeing (which is only freed when all anon slot
1919 1919 * refcnts are 0).
1920 1920 */
/*
 * Interface summary, as implemented below:
 *
 *	amp, start_idx	anon map and index of the first anon slot of the
 *			request.  For szc != 0, start_idx is asserted to be
 *			aligned to the number of pages in a page of size szc.
 *	szc		requested page size code; asserted <= seg->s_szc.
 *	seg, addr	segment and virtual address the pages are wanted for.
 *	prot, vpage,
 *	anypgsz		only handed on to anon_map_privatepages().
 *	pgflags		handed to page_alloc_pages() and
 *			anon_map_privatepages().
 *	protp		out: set to PROT_ALL on entry; PROT_WRITE is cleared
 *			when a slot turns out to be shared (and, for szc 0,
 *			whatever anon_getpage() stores there).
 *	ppa		out: the constituent pages of the request.
 *	ppa_szc		out: page size code reported together with a -2 return.
 *	rw		access type; asserted not to be S_CREATE.
 *	brkcow		non-zero when a non-writable result must be resolved
 *			by copying (anon_map_privatepages()).
 *
 * Return values:
 *	0	success.
 *	> 0	error from anon_getpage()/swap_getconpage(), or ENOMEM when
 *		anon_zero() fails in the szc 0 case.
 *	-1	a large page could not be allocated or relocated and the
 *		request is not a cow fault on shared slots at the segment's
 *		page size.
 *	-2	a page larger than szc already exists and szc is smaller
 *		than seg->s_szc.
 *	other	whatever anon_map_privatepages() returns on the cow paths.
 */
1921 1921 int
1922 1922 anon_map_getpages(
1923 1923 struct anon_map *amp,
1924 1924 ulong_t start_idx,
1925 1925 uint_t szc,
1926 1926 struct seg *seg,
1927 1927 caddr_t addr,
1928 1928 uint_t prot,
1929 1929 uint_t *protp,
1930 1930 page_t *ppa[],
1931 1931 uint_t *ppa_szc,
1932 1932 struct vpage vpage[],
1933 1933 enum seg_rw rw,
1934 1934 int brkcow,
1935 1935 int anypgsz,
1936 1936 int pgflags,
1937 1937 struct cred *cred)
1938 1938 {
1939 1939 pgcnt_t pgcnt;
1940 1940 struct anon *ap;
1941 1941 struct vnode *vp;
1942 1942 anoff_t off;
1943 1943 page_t *pp, *pl[2], *conpp = NULL;
1944 1944 caddr_t vaddr;
1945 1945 ulong_t pg_idx, an_idx, i;
1946 1946 spgcnt_t nreloc = 0;
1947 1947 int prealloc = 1;
1948 1948 int err, slotcreate;
1949 1949 uint_t vpprot;
/* upsize: the request is for a smaller page size than the segment's. */
1950 1950 int upsize = (szc < seg->s_szc);
1951 1951
1952 1952 #if !defined(__i386) && !defined(__amd64)
1953 1953 ASSERT(seg->s_szc != 0);
1954 1954 #endif
1955 1955 ASSERT(szc <= seg->s_szc);
1956 1956 ASSERT(ppa_szc != NULL);
1957 1957 ASSERT(rw != S_CREATE);
1958 1958
1959 1959 *protp = PROT_ALL;
1960 1960
1961 1961 VM_STAT_ADD(anonvmstats.getpages[0]);
1962 1962
/*
 * PAGESIZE request: only one anon slot is involved and every path
 * through this block returns (or panics).
 */
1963 1963 if (szc == 0) {
1964 1964 VM_STAT_ADD(anonvmstats.getpages[1]);
1965 1965 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
1966 1966 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
1967 1967 addr, rw, cred);
1968 1968 if (err)
1969 1969 return (err);
1970 1970 ppa[0] = pl[0];
1971 1971 if (brkcow == 0 || (*protp & PROT_WRITE)) {
1972 1972 VM_STAT_ADD(anonvmstats.getpages[2]);
1973 1973 if (ppa[0]->p_szc != 0 && upsize) {
1974 1974 VM_STAT_ADD(anonvmstats.getpages[3]);
1975 1975 *ppa_szc = MIN(ppa[0]->p_szc,
1976 1976 seg->s_szc);
1977 1977 page_unlock(ppa[0]);
1978 1978 return (-2);
1979 1979 }
1980 1980 return (0);
1981 1981 }
/* brkcow is set but the page came back without PROT_WRITE. */
1982 1982 panic("anon_map_getpages: cowfault for szc 0");
1983 1983 } else {
1984 1984 VM_STAT_ADD(anonvmstats.getpages[4]);
1985 1985 ppa[0] = anon_zero(seg, addr, &ap, cred);
1986 1986 if (ppa[0] == NULL)
1987 1987 return (ENOMEM);
1988 1988 (void) anon_set_ptr(amp->ahp, start_idx, ap,
1989 1989 ANON_SLEEP);
1990 1990 return (0);
1991 1991 }
1992 1992 }
1993 1993
1994 1994 pgcnt = page_get_pagecnt(szc);
1995 1995 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1996 1996 ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
1997 1997
1998 1998 /*
1999 1999 * First we check for the case that the requested large
2000 2000 * page or larger page already exists in the system.
2001 2001 * Actually we only check if the first constituent page
2002 2002 * exists and only preallocate if it's not found.
2003 2003 */
2004 2004 ap = anon_get_ptr(amp->ahp, start_idx);
2005 2005 if (ap) {
2006 2006 uint_t pszc;
2007 2007 swap_xlate(ap, &vp, &off);
2008 2008 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
2009 2009 if (pszc > szc && upsize) {
2010 2010 *ppa_szc = MIN(pszc, seg->s_szc);
2011 2011 return (-2);
2012 2012 }
2013 2013 if (pszc >= szc) {
2014 2014 prealloc = 0;
2015 2015 }
2016 2016 }
2017 2017 }
2018 2018
2019 2019 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
2020 2020 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);
2021 2021
/*
 * Re-entered with prealloc forced to 1 when the page found at pg_idx 0
 * turns out to be smaller than szc.
 */
2022 2022 top:
2023 2023 /*
2024 2024 * If a smaller page or no page at all was found,
2025 2025 * grab a large page off the freelist.
2026 2026 */
2027 2027 if (prealloc) {
2028 2028 ASSERT(conpp == NULL);
2029 2029 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
2030 2030 szc, 0, pgflags) != 0) {
2031 2031 VM_STAT_ADD(anonvmstats.getpages[7]);
2032 2032 if (brkcow == 0 || szc < seg->s_szc ||
2033 2033 !anon_szcshare(amp->ahp, start_idx)) {
2034 2034 /*
2035 2035 * If the refcnt's of all anon slots are <= 1
2036 2036 * they can't increase since we are holding
2037 2037 * the address space's lock. So segvn can
2038 2038 * safely decrease szc without risking to
2039 2039 * generate a cow fault for the region smaller
2040 2040 * than the segment's largest page size.
2041 2041 */
2042 2042 VM_STAT_ADD(anonvmstats.getpages[8]);
2043 2043 return (-1);
2044 2044 }
/* Also the target of the goto at the end of the io_err handling. */
2045 2045 docow:
2046 2046 /*
2047 2047 * This is a cow fault. Copy away the entire 1 large
2048 2048 * page region of this segment.
2049 2049 */
2050 2050 if (szc != seg->s_szc)
2051 2051 panic("anon_map_getpages: cowfault for szc %d",
2052 2052 szc);
2053 2053 vaddr = addr;
2054 2054 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
2055 2055 pg_idx++, an_idx++, vaddr += PAGESIZE) {
2056 2056 if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
2057 2057 NULL) {
2058 2058 err = anon_getpage(&ap, &vpprot, pl,
2059 2059 PAGESIZE, seg, vaddr, rw, cred);
2060 2060 if (err) {
2061 2061 for (i = 0; i < pg_idx; i++) {
2062 2062 if ((pp = ppa[i]) !=
2063 2063 NULL)
2064 2064 page_unlock(pp);
2065 2065 }
2066 2066 return (err);
2067 2067 }
2068 2068 ppa[pg_idx] = pl[0];
2069 2069 } else {
2070 2070 /*
2071 2071 * Since this is a cowfault we know
2072 2072 * that this address space has a
2073 2073 * parent or children which means
2074 2074 * anon_dup_fill_holes() has initialized
2075 2075 * all anon slots within a large page
2076 2076 * region that had at least one anon
2077 2077 * slot at the time of fork().
2078 2078 */
2079 2079 panic("anon_map_getpages: "
2080 2080 "cowfault but anon slot is empty");
2081 2081 }
2082 2082 }
2083 2083 VM_STAT_ADD(anonvmstats.getpages[9]);
2084 2084 *protp = PROT_ALL;
2085 2085 return (anon_map_privatepages(amp, start_idx, szc, seg,
2086 2086 addr, prot, ppa, vpage, anypgsz, pgflags, cred));
2087 2087 }
2088 2088 }
2089 2089
2090 2090 VM_STAT_ADD(anonvmstats.getpages[10]);
2091 2091
2092 2092 an_idx = start_idx;
2093 2093 pg_idx = 0;
2094 2094 vaddr = addr;
2095 2095 while (pg_idx < pgcnt) {
2096 2096 slotcreate = 0;
2097 2097 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
2098 2098 VM_STAT_ADD(anonvmstats.getpages[11]);
2099 2099 /*
2100 2100 * For us to have decided not to preallocate
2101 2101 * would have meant that a large page
2102 2102 * was found. Which also means that all of the
2103 2103 * anon slots for that page would have been
2104 2104 * already created for us.
2105 2105 */
2106 2106 if (prealloc == 0)
2107 2107 panic("anon_map_getpages: prealloc = 0");
2108 2108
2109 2109 slotcreate = 1;
2110 2110 ap = anon_alloc(NULL, 0);
2111 2111 }
2112 2112 swap_xlate(ap, &vp, &off);
2113 2113
2114 2114 /*
2115 2115 * Now setup our preallocated page to pass down
2116 2116 * to swap_getpage().
2117 2117 */
2118 2118 if (prealloc) {
2119 2119 ASSERT(ppa[pg_idx]->p_szc == szc);
2120 2120 conpp = ppa[pg_idx];
2121 2121 }
2122 2122 ASSERT(prealloc || conpp == NULL);
2123 2123
2124 2124 /*
2125 2125 * If we just created this anon slot then call
2126 2126 * with S_CREATE to prevent doing IO on the page.
2127 2127 * Similar to the anon_zero case.
2128 2128 */
2129 2129 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
2130 2130 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
2131 2131 slotcreate == 1 ? S_CREATE : rw, cred);
2132 2132
2133 2133 if (err) {
2134 2134 ASSERT(err != -2 || upsize);
2135 2135 VM_STAT_ADD(anonvmstats.getpages[12]);
2136 2136 ASSERT(slotcreate == 0);
2137 2137 goto io_err;
2138 2138 }
2139 2139
2140 2140 pp = pl[0];
2141 2141
2142 2142 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
2143 2143 VM_STAT_ADD(anonvmstats.getpages[13]);
2144 2144 ASSERT(slotcreate == 0);
2145 2145 ASSERT(prealloc == 0);
2146 2146 ASSERT(pg_idx == 0);
2147 2147 if (pp->p_szc > szc) {
2148 2148 ASSERT(upsize);
2149 2149 *ppa_szc = MIN(pp->p_szc, seg->s_szc);
2150 2150 page_unlock(pp);
2151 2151 VM_STAT_ADD(anonvmstats.getpages[14]);
2152 2152 return (-2);
2153 2153 }
2154 2154 page_unlock(pp);
2155 2155 prealloc = 1;
2156 2156 goto top;
2157 2157 }
2158 2158
2159 2159 /*
2160 2160 * If we decided to preallocate but VOP_GETPAGE
2161 2161 * found a page in the system that satisfies our
2162 2162 * request then free up our preallocated large page
2163 2163 * and continue looping across the existing large
2164 2164 * page via VOP_GETPAGE.
2165 2165 */
2166 2166 if (prealloc && pp != ppa[pg_idx]) {
2167 2167 VM_STAT_ADD(anonvmstats.getpages[15]);
2168 2168 ASSERT(slotcreate == 0);
2169 2169 ASSERT(pg_idx == 0);
2170 2170 conpp = NULL;
2171 2171 prealloc = 0;
2172 2172 page_free_pages(ppa[0]);
2173 2173 }
2174 2174
2175 2175 if (prealloc && nreloc > 1) {
2176 2176 /*
2177 2177 * we have relocated out of a smaller large page.
2178 2178 * skip npgs - 1 iterations and continue which will
2179 2179 * increment by one the loop indices.
2180 2180 */
2181 2181 spgcnt_t npgs = nreloc;
2182 2182
2183 2183 VM_STAT_ADD(anonvmstats.getpages[16]);
2184 2184
2185 2185 ASSERT(pp == ppa[pg_idx]);
2186 2186 ASSERT(slotcreate == 0);
2187 2187 ASSERT(pg_idx + npgs <= pgcnt);
2188 2188 if ((*protp & PROT_WRITE) &&
2189 2189 anon_share(amp->ahp, an_idx, npgs)) {
2190 2190 *protp &= ~PROT_WRITE;
2191 2191 }
2192 2192 pg_idx += npgs;
2193 2193 an_idx += npgs;
2194 2194 vaddr += PAGESIZE * npgs;
2195 2195 continue;
2196 2196 }
2197 2197
2198 2198 VM_STAT_ADD(anonvmstats.getpages[17]);
2199 2199
2200 2200 /*
2201 2201 * Anon_zero case.
2202 2202 */
2203 2203 if (slotcreate) {
2204 2204 ASSERT(prealloc);
2205 2205 pagezero(pp, 0, PAGESIZE);
2206 2206 CPU_STATS_ADD_K(vm, zfod, 1);
2207 2207 hat_setrefmod(pp);
2208 2208 }
2209 2209
2210 2210 ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
2211 2211 ASSERT(prealloc != 0 || PAGE_SHARED(pp));
2212 2212 ASSERT(prealloc == 0 || PAGE_EXCL(pp));
2213 2213
/*
 * Constituent pages must be physically consecutive, of one size, and
 * the first one aligned to the large page boundary.
 */
2214 2214 if (pg_idx > 0 &&
2215 2215 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
2216 2216 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
2217 2217 panic("anon_map_getpages: unexpected page");
2218 2218 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
2219 2219 panic("anon_map_getpages: unaligned page");
2220 2220 }
2221 2221
2222 2222 if (prealloc == 0) {
2223 2223 ppa[pg_idx] = pp;
2224 2224 }
2225 2225
2226 2226 if (ap->an_refcnt > 1) {
2227 2227 VM_STAT_ADD(anonvmstats.getpages[18]);
2228 2228 *protp &= ~PROT_WRITE;
2229 2229 }
2230 2230
2231 2231 /*
2232 2232 * If this is a new anon slot then initialize
2233 2233 * the anon array entry.
2234 2234 */
2235 2235 if (slotcreate) {
2236 2236 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2237 2237 }
2238 2238 pg_idx++;
2239 2239 an_idx++;
2240 2240 vaddr += PAGESIZE;
2241 2241 }
2242 2242
2243 2243 /*
2244 2244 * Since preallocated pages come off the freelist
2245 2245 * they are locked SE_EXCL. Simply downgrade and return.
2246 2246 */
2247 2247 if (prealloc) {
2248 2248 VM_STAT_ADD(anonvmstats.getpages[19]);
2249 2249 conpp = NULL;
2250 2250 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2251 2251 page_downgrade(ppa[pg_idx]);
2252 2252 }
2253 2253 }
2254 2254 ASSERT(conpp == NULL);
2255 2255
2256 2256 if (brkcow == 0 || (*protp & PROT_WRITE)) {
2257 2257 VM_STAT_ADD(anonvmstats.getpages[20]);
2258 2258 return (0);
2259 2259 }
2260 2260
2261 2261 if (szc < seg->s_szc)
2262 2262 panic("anon_map_getpages: cowfault for szc %d", szc);
2263 2263
2264 2264 VM_STAT_ADD(anonvmstats.getpages[21]);
2265 2265
2266 2266 *protp = PROT_ALL;
2267 2267 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
2268 2268 ppa, vpage, anypgsz, pgflags, cred));
2269 2269 io_err:
2270 2270 /*
2271 2271 * We got an IO error somewhere in our large page.
2272 2272 * If we were using a preallocated page then just demote
2273 2273 * all the constituent pages that we've succeeded with so far
2274 2274 * to PAGESIZE pages and leave them in the system
2275 2275 * unlocked.
2276 2276 */
2277 2277
2278 2278 ASSERT(err != -2 || ((pg_idx == 0) && upsize));
2279 2279
2280 2280 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
2281 2281 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
2282 2282 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);
2283 2283
2284 2284 if (prealloc) {
2285 2285 conpp = NULL;
2286 2286 if (pg_idx > 0) {
2287 2287 VM_STAT_ADD(anonvmstats.getpages[25]);
2288 2288 for (i = 0; i < pgcnt; i++) {
2289 2289 pp = ppa[i];
2290 2290 ASSERT(PAGE_EXCL(pp));
2291 2291 ASSERT(pp->p_szc == szc);
2292 2292 pp->p_szc = 0;
2293 2293 }
2294 2294 for (i = 0; i < pg_idx; i++) {
2295 2295 ASSERT(!hat_page_is_mapped(ppa[i]));
2296 2296 page_unlock(ppa[i]);
2297 2297 }
2298 2298 /*
2299 2299 * Now free up the remaining unused constituent
2300 2300 * pages.
2301 2301 */
2302 2302 while (pg_idx < pgcnt) {
2303 2303 ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
2304 2304 page_free(ppa[pg_idx], 0);
2305 2305 pg_idx++;
2306 2306 }
2307 2307 } else {
2308 2308 VM_STAT_ADD(anonvmstats.getpages[26]);
2309 2309 page_free_pages(ppa[0]);
2310 2310 }
2311 2311 } else {
2312 2312 VM_STAT_ADD(anonvmstats.getpages[27]);
2313 2313 ASSERT(err > 0);
2314 2314 for (i = 0; i < pg_idx; i++)
2315 2315 page_unlock(ppa[i]);
2316 2316 }
2317 2317 ASSERT(conpp == NULL);
2318 2318 if (err != -1)
2319 2319 return (err);
2320 2320 /*
2321 2321 * we are here because we failed to relocate.
2322 2322 */
2323 2323 ASSERT(prealloc);
2324 2324 if (brkcow == 0 || szc < seg->s_szc ||
2325 2325 !anon_szcshare(amp->ahp, start_idx)) {
2326 2326 VM_STAT_ADD(anonvmstats.getpages[28]);
2327 2327 return (-1);
2328 2328 }
2329 2329 VM_STAT_ADD(anonvmstats.getpages[29]);
2330 2330 goto docow;
2331 2331 }
2332 2332
2333 2333
2334 2334 /*
2335 2335 * Turn a reference to an object or shared anon page
2336 2336 * into a private page with a copy of the data from the
2337 2337 * original page which is always locked by the caller.
2338 2338 * This routine unloads the translation and unlocks the
2339 2339 * original page, if it isn't being stolen, before returning
2340 2340 * to the caller.
2341 2341 *
2342 2342 * NOTE: The original anon slot is not freed by this routine
2343 2343 * It must be freed by the caller while holding the
2344 2344 * "anon_map" lock to prevent races which can occur if
2345 2345 * a process has multiple lwps in its address space.
 *
 * Arguments:
 *	app		in/out: on success *app is the newly allocated anon
 *			slot; on failure it is restored to its value on entry.
 *	seg, addr	segment and address of the fault; passed to
 *			VOP_GETPAGE() and used to unload the old translation.
 *	prot		only PROT_WRITE is examined, and only with LOCK_PAGE.
 *	opp		the original page.
 *	oppflags	STEAL_PAGE: rename opp into the new slot instead of
 *			copying it.  LOCK_PAGE: move the lock accounting of
 *			opp to the new page via page_pp_useclaim().
 *	cred		credentials passed to VOP_GETPAGE().
 *
 * Returns the private page after page_downgrade(), or NULL on failure,
 * in which case the new slot has been released and opp unlocked.
2346 2346 */
2347 2347 page_t *
2348 2348 anon_private(
2349 2349 struct anon **app,
2350 2350 struct seg *seg,
2351 2351 caddr_t addr,
2352 2352 uint_t prot,
2353 2353 page_t *opp,
2354 2354 int oppflags,
2355 2355 struct cred *cred)
2356 2356 {
2357 2357 struct anon *old = *app;
2358 2358 struct anon *new;
2359 2359 page_t *pp = NULL;
2360 2360 struct vnode *vp;
2361 2361 anoff_t off;
2362 2362 page_t *anon_pl[1 + 1];
2363 2363 int err;
2364 2364
2365 2365 if (oppflags & STEAL_PAGE)
2366 2366 ASSERT(PAGE_EXCL(opp));
2367 2367 else
2368 2368 ASSERT(PAGE_LOCKED(opp));
2369 2369
2370 2370 CPU_STATS_ADD_K(vm, cow_fault, 1);
2371 2371
2372 2372 /* Kernel probe */
2373 2373 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
2374 2374 tnf_opaque, address, addr);
2375 2375
/* The new slot is published through *app right away; "out" undoes this. */
2376 2376 *app = new = anon_alloc(NULL, 0);
2377 2377 swap_xlate(new, &vp, &off);
2378 2378
/*
 * Stealing: give the original page the new slot's <vp, off> identity
 * with page_rename(); no copy is made.
 */
2379 2379 if (oppflags & STEAL_PAGE) {
2380 2380 page_rename(opp, vp, (u_offset_t)off);
2381 2381 pp = opp;
2382 2382 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
2383 2383 "anon_private:seg %p addr %x pp %p vp %p off %lx",
2384 2384 seg, addr, pp, vp, off);
2385 2385 hat_setmod(pp);
2386 2386
2387 2387 /* bug 4026339 */
2388 2388 page_downgrade(pp);
2389 2389 return (pp);
2390 2390 }
2391 2391
2392 2392 /*
2393 2393 * Call the VOP_GETPAGE routine to create the page, thereby
2394 2394 * enabling the vnode driver to allocate any filesystem
2395 2395 * space (e.g., disk block allocation for UFS). This also
2396 2396 * prevents more than one page from being added to the
2397 2397 * vnode at the same time.
2398 2398 */
2399 2399 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
2400 2400 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2401 2401 if (err)
2402 2402 goto out;
2403 2403
2404 2404 pp = anon_pl[0];
2405 2405
2406 2406 /*
2407 2407 * If the original page was locked, we need to move the lock
2408 2408 * to the new page by transferring 'cowcnt/lckcnt' of the original
2409 2409 * page to 'cowcnt/lckcnt' of the new page.
2410 2410 *
2411 2411 * See Statement at the beginning of segvn_lockop() and
2412 2412 * comments in page_pp_useclaim() regarding the way
2413 2413 * cowcnts/lckcnts are handled.
2414 2414 *
2415 2415 * Also availrmem must be decremented up front for read only mapping
2416 2416 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
2417 2417 * if availrmem did not need to be decremented after all.
2418 2418 */
2419 2419 if (oppflags & LOCK_PAGE) {
2420 2420 if ((prot & PROT_WRITE) == 0) {
2421 2421 mutex_enter(&freemem_lock);
2422 2422 if (availrmem > pages_pp_maximum) {
2423 2423 availrmem--;
2424 2424 pages_useclaim++;
2425 2425 } else {
/* availrmem is at its floor: give up instead of reserving. */
2426 2426 mutex_exit(&freemem_lock);
2427 2427 goto out;
2428 2428 }
2429 2429 mutex_exit(&freemem_lock);
2430 2430 }
2431 2431 page_pp_useclaim(opp, pp, prot & PROT_WRITE);
2432 2432 }
2433 2433
2434 2434 /*
2435 2435 * Now copy the contents from the original page,
2436 2436 * which is locked and loaded in the MMU by
2437 2437 * the caller to prevent yet another page fault.
2438 2438 */
2439 2439 /* XXX - should set mod bit in here */
2440 2440 if (ppcopy(opp, pp) == 0) {
2441 2441 /*
2442 2442 * Before ppcopy could handle UE or other faults, we
2443 2443 * would have panicked here, and still have no option
2444 2444 * but to do so now.
2445 2445 */
2446 2446 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
2447 2447 (void *)opp, (void *)pp);
2448 2448 }
2449 2449
2450 2450 hat_setrefmod(pp); /* mark as modified */
2451 2451
2452 2452 /*
2453 2453 * Unload the old translation.
2454 2454 */
2455 2455 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);
2456 2456
2457 2457 /*
2458 2458 * Free unmapped, unmodified original page.
2459 2459 * or release the lock on the original page,
2460 2460 * otherwise the process will sleep forever in
2461 2461 * anon_decref() waiting for the "exclusive" lock
2462 2462 * on the page.
2463 2463 */
2464 2464 (void) page_release(opp, 1);
2465 2465
2466 2466 /*
2467 2467 * we are done with page creation so downgrade the new
2468 2468 * page's selock to shared, this helps when multiple
2469 2469 * as_fault(...SOFTLOCK...) are done to the same
2470 2470 * page(aio)
2471 2471 */
2472 2472 page_downgrade(pp);
2473 2473
2474 2474 /*
2475 2475 * NOTE: The original anon slot must be freed by the
2476 2476 * caller while holding the "anon_map" lock, if we
2477 2477 * copied away from an anonymous page.
2478 2478 */
2479 2479 return (pp);
2480 2480
/*
 * Failure: restore the caller's slot pointer, unlock the new page if
 * one was obtained, drop the new slot and unlock the original page.
 */
2481 2481 out:
2482 2482 *app = old;
2483 2483 if (pp)
2484 2484 page_unlock(pp);
2485 2485 anon_decref(new);
2486 2486 page_unlock(opp);
2487 2487 return ((page_t *)NULL);
2488 2488 }
2489 2489
/*
 * Give the caller private copies of every constituent page of the large
 * page region that starts at anon index start_idx.  szc is asserted to be
 * non-zero and equal to seg->s_szc.
 *
 * On entry ppa[] holds the original pages.  For each of the pgcnt slots a
 * new anon slot is allocated, a new page is obtained with
 * swap_getconpage() (a constituent of a preallocated large page when one
 * could be allocated), the original page is copied into it and unlocked,
 * the old slot's reference is dropped and the new slot is stored in the
 * anon array.  ppa[] is overwritten with the new pages.
 *
 * anypgsz == -1 disables the large page preallocation; otherwise anypgsz
 * and pgflags are handed to page_alloc_pages().  vpage, when non-NULL, is
 * indexed from 0 for addr and is consulted for the per-page lock state.
 *
 * Returns:
 *	0	the pages were privatized, or the first slot's refcnt was
 *		already 1 and ppa[0] already has size szc (nothing copied).
 *	-1	the first slot's refcnt was 1 but ppa[0] is smaller than
 *		szc; all pages in ppa[] have been unlocked.
 *	ENOMEM	availrmem could not be reserved for locked pages of a
 *		mapping without PROT_WRITE; the non-NULL pages in ppa[]
 *		have been unlocked.
 */
2490 2490 int
2491 2491 anon_map_privatepages(
2492 2492 struct anon_map *amp,
2493 2493 ulong_t start_idx,
2494 2494 uint_t szc,
2495 2495 struct seg *seg,
2496 2496 caddr_t addr,
2497 2497 uint_t prot,
2498 2498 page_t *ppa[],
2499 2499 struct vpage vpage[],
2500 2500 int anypgsz,
2501 2501 int pgflags,
2502 2502 struct cred *cred)
2503 2503 {
2504 2504 pgcnt_t pgcnt;
2505 2505 struct vnode *vp;
2506 2506 anoff_t off;
2507 2507 page_t *pl[2], *conpp = NULL;
2508 2508 int err;
2509 2509 int prealloc = 1;
2510 2510 struct anon *ap, *oldap;
2511 2511 caddr_t vaddr;
2512 2512 page_t *pplist, *pp;
2513 2513 ulong_t pg_idx, an_idx;
2514 2514 spgcnt_t nreloc = 0;
2515 2515 int pagelock = 0;
2516 2516 kmutex_t *ahmpages = NULL;
2517 2517 #ifdef DEBUG
2518 2518 int refcnt;
2519 2519 #endif
2520 2520
2521 2521 ASSERT(szc != 0);
2522 2522 ASSERT(szc == seg->s_szc);
2523 2523
2524 2524 VM_STAT_ADD(anonvmstats.privatepages[0]);
2525 2525
2526 2526 pgcnt = page_get_pagecnt(szc);
2527 2527 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
2528 2528 ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
2529 2529
2530 2530 ASSERT(amp != NULL);
2531 2531 ap = anon_get_ptr(amp->ahp, start_idx);
2532 2532 ASSERT(ap == NULL || ap->an_refcnt >= 1);
2533 2533
2534 2534 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);
2535 2535
2536 2536 /*
2537 2537 * Now try and allocate the large page. If we fail then just
2538 2538 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
2539 2539 * the caller make this decision but to avoid added complexity
2540 2540 * it's simpler to handle that case here.
2541 2541 */
2542 2542 if (anypgsz == -1) {
2543 2543 VM_STAT_ADD(anonvmstats.privatepages[2]);
2544 2544 prealloc = 0;
2545 2545 } else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
2546 2546 anypgsz, pgflags) != 0) {
2547 2547 VM_STAT_ADD(anonvmstats.privatepages[3]);
2548 2548 prealloc = 0;
2549 2549 }
2550 2550
2551 2551 /*
2552 2552 * make the decrement of all refcnts of all
2553 2553 * anon slots of a large page appear atomic by
2554 2554 * getting an anonpages_hash_lock for the
2555 2555 * first anon slot of a large page.
2556 2556 */
2557 2557 if (ap != NULL) {
2558 2558 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
2559 2559 mutex_enter(ahmpages);
/*
 * refcnt 1: the slots are not shared, so there is nothing to copy;
 * give back the preallocated page and return.
 */
2560 2560 if (ap->an_refcnt == 1) {
2561 2561 VM_STAT_ADD(anonvmstats.privatepages[4]);
2562 2562 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
2563 2563 mutex_exit(ahmpages);
2564 2564
2565 2565 if (prealloc) {
2566 2566 page_free_replacement_page(pplist);
2567 2567 page_create_putback(pgcnt);
2568 2568 }
2569 2569 ASSERT(ppa[0]->p_szc <= szc);
2570 2570 if (ppa[0]->p_szc == szc) {
2571 2571 VM_STAT_ADD(anonvmstats.privatepages[5]);
2572 2572 return (0);
2573 2573 }
2574 2574 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2575 2575 ASSERT(ppa[pg_idx] != NULL);
2576 2576 page_unlock(ppa[pg_idx]);
2577 2577 }
2578 2578 return (-1);
2579 2579 }
2580 2580 }
2581 2581
2582 2582 /*
2583 2583 * If we are passed in the vpage array and this is
2584 2584 * not PROT_WRITE then we need to decrement availrmem
2585 2585 * up front before we try anything. If we need to and
2586 2586 * can't decrement availrmem then it's better to fail now
2587 2587 * than in the middle of processing the new large page.
2588 2588 * page_pp_useclaim() on behalf of each constituent page
2589 2589 * below will adjust availrmem back for the cases not needed.
2590 2590 */
2591 2591 if (vpage != NULL && (prot & PROT_WRITE) == 0) {
2592 2592 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2593 2593 if (VPP_ISPPLOCK(&vpage[pg_idx])) {
2594 2594 pagelock = 1;
2595 2595 break;
2596 2596 }
2597 2597 }
2598 2598 if (pagelock) {
2599 2599 VM_STAT_ADD(anonvmstats.privatepages[6]);
2600 2600 mutex_enter(&freemem_lock);
2601 2601 if (availrmem >= pages_pp_maximum + pgcnt) {
2602 2602 availrmem -= pgcnt;
2603 2603 pages_useclaim += pgcnt;
2604 2604 } else {
2605 2605 VM_STAT_ADD(anonvmstats.privatepages[7]);
2606 2606 mutex_exit(&freemem_lock);
2607 2607 if (ahmpages != NULL) {
2608 2608 mutex_exit(ahmpages);
2609 2609 }
2610 2610 if (prealloc) {
2611 2611 page_free_replacement_page(pplist);
2612 2612 page_create_putback(pgcnt);
2613 2613 }
2614 2614 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
2615 2615 if (ppa[pg_idx] != NULL)
2616 2616 page_unlock(ppa[pg_idx]);
2617 2617 return (ENOMEM);
2618 2618 }
2619 2619 mutex_exit(&freemem_lock);
2620 2620 }
2621 2621 }
2622 2622
2623 2623 CPU_STATS_ADD_K(vm, cow_fault, pgcnt);
2624 2624
2625 2625 VM_STAT_ADD(anonvmstats.privatepages[8]);
2626 2626
/* Copy loop; ahmpages (if taken above) stays held until after it. */
2627 2627 an_idx = start_idx;
2628 2628 pg_idx = 0;
2629 2629 vaddr = addr;
2630 2630 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
2631 2631 ASSERT(ppa[pg_idx] != NULL);
2632 2632 oldap = anon_get_ptr(amp->ahp, an_idx);
2633 2633 ASSERT(ahmpages != NULL || oldap == NULL);
2634 2634 ASSERT(ahmpages == NULL || oldap != NULL);
2635 2635 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
/*
 * DEBUG only: remember the first slot's refcnt and check that every
 * other slot of the large page has the same one.
 */
2636 2636 ASSERT(ahmpages == NULL || pg_idx != 0 ||
2637 2637 (refcnt = oldap->an_refcnt));
2638 2638 ASSERT(ahmpages == NULL || pg_idx == 0 ||
2639 2639 refcnt == oldap->an_refcnt);
2640 2640
2641 2641 ap = anon_alloc(NULL, 0);
2642 2642
2643 2643 swap_xlate(ap, &vp, &off);
2644 2644
2645 2645 /*
2646 2646 * Now setup our preallocated page to pass down to
2647 2647 * swap_getpage().
2648 2648 */
2649 2649 if (prealloc) {
2650 2650 pp = pplist;
2651 2651 page_sub(&pplist, pp);
2652 2652 conpp = pp;
2653 2653 }
2654 2654
2655 2655 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
2656 2656 PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
2657 2657 S_CREATE, cred);
2658 2658
2659 2659 /*
2660 2660 * Impossible to fail this is S_CREATE.
2661 2661 */
2662 2662 if (err)
2663 2663 panic("anon_map_privatepages: VOP_GETPAGE failed");
2664 2664
2665 2665 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
2666 2666 ASSERT(prealloc == 0 || nreloc == 1);
2667 2667
2668 2668 pp = pl[0];
2669 2669
2670 2670 /*
2671 2671 * If the original page was locked, we need to move
2672 2672 * the lock to the new page by transferring
2673 2673 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
2674 2674 * of the new page. pg_idx can be used to index
2675 2675 * into the vpage array since the caller will guarantee
2676 2676 * that vpage struct passed in corresponds to addr
2677 2677 * and forward.
2678 2678 */
2679 2679 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
2680 2680 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
2681 2681 } else if (pagelock) {
/* This page is not locked: return its share of the up-front reservation. */
2682 2682 mutex_enter(&freemem_lock);
2683 2683 availrmem++;
2684 2684 pages_useclaim--;
2685 2685 mutex_exit(&freemem_lock);
2686 2686 }
2687 2687
2688 2688 /*
2689 2689 * Now copy the contents from the original page.
2690 2690 */
2691 2691 if (ppcopy(ppa[pg_idx], pp) == 0) {
2692 2692 /*
2693 2693 * Before ppcopy could handle UE or other faults, we
2694 2694 * would have panicked here, and still have no option
2695 2695 * but to do so now.
2696 2696 */
2697 2697 panic("anon_map_privatepages, ppcopy failed");
2698 2698 }
2699 2699
2700 2700 hat_setrefmod(pp); /* mark as modified */
2701 2701
2702 2702 /*
2703 2703 * Release the lock on the original page,
2704 2704 * decrement the old slot, and down grade the lock
2705 2705 * on the new copy.
2706 2706 */
2707 2707 page_unlock(ppa[pg_idx]);
2708 2708
/* Preallocated constituents are downgraded together after the loop. */
2709 2709 if (!prealloc)
2710 2710 page_downgrade(pp);
2711 2711
2712 2712 ppa[pg_idx] = pp;
2713 2713
2714 2714 /*
2715 2715 * Now reflect the copy in the new anon array.
2716 2716 */
2717 2717 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
2718 2718 if (oldap != NULL)
2719 2719 anon_decref(oldap);
2720 2720 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2721 2721 }
2722 2722
2723 2723 /*
2724 2724 * Unload the old large page translation.
2725 2725 */
2726 2726 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
2727 2727
2728 2728 if (ahmpages != NULL) {
2729 2729 mutex_exit(ahmpages);
2730 2730 }
2731 2731 ASSERT(prealloc == 0 || pplist == NULL);
2732 2732 if (prealloc) {
2733 2733 VM_STAT_ADD(anonvmstats.privatepages[9]);
2734 2734 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2735 2735 page_downgrade(ppa[pg_idx]);
2736 2736 }
2737 2737 }
2738 2738
2739 2739 return (0);
2740 2740 }
2741 2741
2742 2742 /*
2743 2743 * Allocate a private zero-filled anon page.
2744 2744 */
2745 2745 page_t *
2746 2746 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
2747 2747 {
2748 2748 struct anon *ap;
2749 2749 page_t *pp;
2750 2750 struct vnode *vp;
2751 2751 anoff_t off;
2752 2752 page_t *anon_pl[1 + 1];
2753 2753 int err;
2754 2754
2755 2755 /* Kernel probe */
2756 2756 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
2757 2757 tnf_opaque, address, addr);
2758 2758
2759 2759 *app = ap = anon_alloc(NULL, 0);
2760 2760 swap_xlate(ap, &vp, &off);
2761 2761
2762 2762 /*
2763 2763 * Call the VOP_GETPAGE routine to create the page, thereby
2764 2764 * enabling the vnode driver to allocate any filesystem
2765 2765 * dependent structures (e.g., disk block allocation for UFS).
2766 2766 * This also prevents more than on page from being added to
2767 2767 * the vnode at the same time since it is locked.
2768 2768 */
2769 2769 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
2770 2770 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2771 2771 if (err) {
2772 2772 *app = NULL;
2773 2773 anon_decref(ap);
2774 2774 return (NULL);
2775 2775 }
2776 2776 pp = anon_pl[0];
2777 2777
2778 2778 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */
2779 2779 page_downgrade(pp);
2780 2780 CPU_STATS_ADD_K(vm, zfod, 1);
2781 2781 hat_setrefmod(pp); /* mark as modified so pageout writes back */
2782 2782 return (pp);
2783 2783 }
2784 2784
2785 2785
2786 2786 /*
2787 2787 * Allocate array of private zero-filled anon pages for empty slots
2788 2788 * and kept pages for non empty slots within given range.
2789 2789 *
2790 2790 * NOTE: This rontine will try and use large pages
2791 2791 * if available and supported by underlying platform.
2792 2792 */
2793 2793 int
2794 2794 anon_map_createpages(
2795 2795 struct anon_map *amp,
2796 2796 ulong_t start_index,
2797 2797 size_t len,
2798 2798 page_t *ppa[],
2799 2799 struct seg *seg,
2800 2800 caddr_t addr,
2801 2801 enum seg_rw rw,
2802 2802 struct cred *cred)
2803 2803 {
2804 2804
2805 2805 struct anon *ap;
2806 2806 struct vnode *ap_vp;
2807 2807 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
2808 2808 int err = 0;
2809 2809 ulong_t p_index, index;
2810 2810 pgcnt_t npgs, pg_cnt;
2811 2811 spgcnt_t nreloc = 0;
2812 2812 uint_t l_szc, szc, prot;
2813 2813 anoff_t ap_off;
2814 2814 size_t pgsz;
2815 2815 lgrp_t *lgrp;
2816 2816 kmutex_t *ahm;
2817 2817
2818 2818 /*
2819 2819 * XXX For now only handle S_CREATE.
2820 2820 */
2821 2821 ASSERT(rw == S_CREATE);
2822 2822
2823 2823 index = start_index;
2824 2824 p_index = 0;
2825 2825 npgs = btopr(len);
2826 2826
2827 2827 /*
2828 2828 * If this platform supports multiple page sizes
2829 2829 * then try and allocate directly from the free
2830 2830 * list for pages larger than PAGESIZE.
2831 2831 *
2832 2832 * NOTE:When we have page_create_ru we can stop
2833 2833 * directly allocating from the freelist.
2834 2834 */
2835 2835 l_szc = seg->s_szc;
2836 2836 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
2837 2837 while (npgs) {
2838 2838
2839 2839 /*
2840 2840 * if anon slot already exists
2841 2841 * (means page has been created)
2842 2842 * so 1) look up the page
2843 2843 * 2) if the page is still in memory, get it.
2844 2844 * 3) if not, create a page and
2845 2845 * page in from physical swap device.
2846 2846 * These are done in anon_getpage().
2847 2847 */
2848 2848 ap = anon_get_ptr(amp->ahp, index);
2849 2849 if (ap) {
2850 2850 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
2851 2851 seg, addr, S_READ, cred);
2852 2852 if (err) {
2853 2853 ANON_LOCK_EXIT(&->a_rwlock);
2854 2854 panic("anon_map_createpages: anon_getpage");
2855 2855 }
2856 2856 pp = anon_pl[0];
2857 2857 ppa[p_index++] = pp;
2858 2858
2859 2859 /*
2860 2860 * an_pvp can become non-NULL after SysV's page was
2861 2861 * paged out before ISM was attached to this SysV
2862 2862 * shared memory segment. So free swap slot if needed.
2863 2863 */
2864 2864 if (ap->an_pvp != NULL) {
2865 2865 page_io_lock(pp);
2866 2866 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
2867 2867 mutex_enter(ahm);
2868 2868 if (ap->an_pvp != NULL) {
2869 2869 swap_phys_free(ap->an_pvp,
2870 2870 ap->an_poff, PAGESIZE);
2871 2871 ap->an_pvp = NULL;
2872 2872 ap->an_poff = 0;
2873 2873 mutex_exit(ahm);
2874 2874 hat_setmod(pp);
2875 2875 } else {
2876 2876 mutex_exit(ahm);
2877 2877 }
2878 2878 page_io_unlock(pp);
2879 2879 }
2880 2880
2881 2881 addr += PAGESIZE;
2882 2882 index++;
2883 2883 npgs--;
2884 2884 continue;
2885 2885 }
2886 2886 /*
2887 2887 * Now try and allocate the largest page possible
2888 2888 * for the current address and range.
2889 2889 * Keep dropping down in page size until:
2890 2890 *
2891 2891 * 1) Properly aligned
2892 2892 * 2) Does not overlap existing anon pages
2893 2893 * 3) Fits in remaining range.
2894 2894 * 4) able to allocate one.
2895 2895 *
2896 2896 * NOTE: XXX When page_create_ru is completed this code
2897 2897 * will change.
2898 2898 */
2899 2899 szc = l_szc;
2900 2900 pplist = NULL;
2901 2901 pg_cnt = 0;
2902 2902 while (szc) {
2903 2903 pgsz = page_get_pagesize(szc);
2904 2904 pg_cnt = pgsz >> PAGESHIFT;
2905 2905 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
2906 2906 anon_pages(amp->ahp, index, pg_cnt) == 0) {
2907 2907 /*
2908 2908 * XXX
2909 2909 * Since we are faking page_create()
2910 2910 * we also need to do the freemem and
2911 2911 * pcf accounting.
2912 2912 */
2913 2913 (void) page_create_wait(pg_cnt, PG_WAIT);
2914 2914
2915 2915 /*
2916 2916 * Get lgroup to allocate next page of shared
2917 2917 * memory from and use it to specify where to
2918 2918 * allocate the physical memory
2919 2919 */
2920 2920 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2921 2921
2922 2922 pplist = page_get_freelist(
2923 2923 anon_vp, (u_offset_t)0, seg,
2924 2924 addr, pgsz, 0, lgrp);
2925 2925
2926 2926 if (pplist == NULL) {
2927 2927 page_create_putback(pg_cnt);
2928 2928 }
2929 2929
2930 2930 /*
2931 2931 * If a request for a page of size
2932 2932 * larger than PAGESIZE failed
2933 2933 * then don't try that size anymore.
2934 2934 */
2935 2935 if (pplist == NULL) {
2936 2936 l_szc = szc - 1;
2937 2937 } else {
2938 2938 break;
2939 2939 }
2940 2940 }
2941 2941 szc--;
2942 2942 }
2943 2943
2944 2944 /*
2945 2945 * If just using PAGESIZE pages then don't
2946 2946 * directly allocate from the free list.
2947 2947 */
2948 2948 if (pplist == NULL) {
2949 2949 ASSERT(szc == 0);
2950 2950 pp = anon_zero(seg, addr, &ap, cred);
2951 2951 if (pp == NULL) {
2952 2952 ANON_LOCK_EXIT(&->a_rwlock);
2953 2953 panic("anon_map_createpages: anon_zero");
2954 2954 }
2955 2955 ppa[p_index++] = pp;
2956 2956
2957 2957 ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
2958 2958 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
2959 2959
2960 2960 addr += PAGESIZE;
2961 2961 index++;
2962 2962 npgs--;
2963 2963 continue;
2964 2964 }
2965 2965
2966 2966 /*
2967 2967 * pplist is a list of pg_cnt PAGESIZE pages.
2968 2968 * These pages are locked SE_EXCL since they
2969 2969 * came directly off the free list.
2970 2970 */
2971 2971 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
2972 2972 ASSERT(IS_P2ALIGNED(index, pg_cnt));
2973 2973 ASSERT(conpp == NULL);
2974 2974 while (pg_cnt--) {
2975 2975
2976 2976 ap = anon_alloc(NULL, 0);
2977 2977 swap_xlate(ap, &ap_vp, &ap_off);
2978 2978
2979 2979 ASSERT(pplist != NULL);
2980 2980 pp = pplist;
2981 2981 page_sub(&pplist, pp);
2982 2982 PP_CLRFREE(pp);
2983 2983 PP_CLRAGED(pp);
2984 2984 conpp = pp;
2985 2985
2986 2986 err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
2987 2987 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL,
2988 2988 &nreloc, seg, addr, S_CREATE, cred);
2989 2989
2990 2990 if (err) {
2991 2991 ANON_LOCK_EXIT(&amp->a_rwlock);
2992 2992 panic("anon_map_createpages: S_CREATE");
2993 2993 }
2994 2994
2995 2995 ASSERT(anon_pl[0] == pp);
2996 2996 ASSERT(nreloc == 1);
2997 2997 pagezero(pp, 0, PAGESIZE);
2998 2998 CPU_STATS_ADD_K(vm, zfod, 1);
2999 2999 hat_setrefmod(pp);
3000 3000
3001 3001 ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
3002 3002 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
3003 3003
3004 3004 ppa[p_index++] = pp;
3005 3005
3006 3006 addr += PAGESIZE;
3007 3007 index++;
3008 3008 npgs--;
3009 3009 }
3010 3010 conpp = NULL;
3011 3011 pg_cnt = pgsz >> PAGESHIFT;
3012 3012 p_index = p_index - pg_cnt;
3013 3013 while (pg_cnt--) {
3014 3014 page_downgrade(ppa[p_index++]);
3015 3015 }
3016 3016 }
3017 3017 ANON_LOCK_EXIT(&amp->a_rwlock);
3018 3018 return (0);
3019 3019 }
3020 3020
3021 3021 static int
3022 3022 anon_try_demote_pages(
3023 3023 struct anon_hdr *ahp,
3024 3024 ulong_t sidx,
3025 3025 uint_t szc,
3026 3026 page_t **ppa,
3027 3027 int private)
3028 3028 {
3029 3029 struct anon *ap;
3030 3030 pgcnt_t pgcnt = page_get_pagecnt(szc);
3031 3031 page_t *pp;
3032 3032 pgcnt_t i;
3033 3033 kmutex_t *ahmpages = NULL;
3034 3034 int root = 0;
3035 3035 pgcnt_t npgs;
3036 3036 pgcnt_t curnpgs = 0;
3037 3037 size_t ppasize = 0;
3038 3038
3039 3039 ASSERT(szc != 0);
3040 3040 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3041 3041 ASSERT(IS_P2ALIGNED(sidx, pgcnt));
3042 3042 ASSERT(sidx < ahp->size);
3043 3043
3044 3044 if (ppa == NULL) {
3045 3045 ppasize = pgcnt * sizeof (page_t *);
3046 3046 ppa = kmem_alloc(ppasize, KM_SLEEP);
3047 3047 }
3048 3048
3049 3049 ap = anon_get_ptr(ahp, sidx);
3050 3050 if (ap != NULL && private) {
3051 3051 VM_STAT_ADD(anonvmstats.demotepages[1]);
3052 3052 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
3053 3053 mutex_enter(ahmpages);
3054 3054 }
3055 3055
3056 3056 if (ap != NULL && ap->an_refcnt > 1) {
3057 3057 if (ahmpages != NULL) {
3058 3058 VM_STAT_ADD(anonvmstats.demotepages[2]);
3059 3059 mutex_exit(ahmpages);
3060 3060 }
3061 3061 if (ppasize != 0) {
3062 3062 kmem_free(ppa, ppasize);
3063 3063 }
3064 3064 return (0);
3065 3065 }
3066 3066 if (ahmpages != NULL) {
3067 3067 mutex_exit(ahmpages);
3068 3068 }
3069 3069 if (ahp->size - sidx < pgcnt) {
3070 3070 ASSERT(private == 0);
3071 3071 pgcnt = ahp->size - sidx;
3072 3072 }
3073 3073 for (i = 0; i < pgcnt; i++, sidx++) {
3074 3074 ap = anon_get_ptr(ahp, sidx);
3075 3075 if (ap != NULL) {
3076 3076 if (ap->an_refcnt != 1) {
3077 3077 panic("anon_try_demote_pages: an_refcnt != 1");
3078 3078 }
3079 3079 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
3080 3080 SE_EXCL);
3081 3081 if (pp != NULL) {
3082 3082 (void) hat_pageunload(pp,
3083 3083 HAT_FORCE_PGUNLOAD);
3084 3084 }
3085 3085 } else {
3086 3086 ppa[i] = NULL;
3087 3087 }
3088 3088 }
3089 3089 for (i = 0; i < pgcnt; i++) {
3090 3090 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
3091 3091 ASSERT(pp->p_szc <= szc);
3092 3092 if (!root) {
3093 3093 VM_STAT_ADD(anonvmstats.demotepages[3]);
3094 3094 if (curnpgs != 0)
3095 3095 panic("anon_try_demote_pages: "
3096 3096 "bad large page");
3097 3097
3098 3098 root = 1;
3099 3099 curnpgs = npgs =
3100 3100 page_get_pagecnt(pp->p_szc);
3101 3101
3102 3102 ASSERT(npgs <= pgcnt);
3103 3103 ASSERT(IS_P2ALIGNED(npgs, npgs));
3104 3104 ASSERT(!(page_pptonum(pp) & (npgs - 1)));
3105 3105 } else {
3106 3106 ASSERT(i > 0);
3107 3107 ASSERT(page_pptonum(pp) - 1 ==
3108 3108 page_pptonum(ppa[i - 1]));
3109 3109 if ((page_pptonum(pp) & (npgs - 1)) ==
3110 3110 npgs - 1)
3111 3111 root = 0;
3112 3112 }
3113 3113 ASSERT(PAGE_EXCL(pp));
3114 3114 pp->p_szc = 0;
3115 3115 ASSERT(curnpgs > 0);
3116 3116 curnpgs--;
3117 3117 }
3118 3118 }
3119 3119 if (root != 0 || curnpgs != 0)
3120 3120 panic("anon_try_demote_pages: bad large page");
3121 3121
3122 3122 for (i = 0; i < pgcnt; i++) {
3123 3123 if ((pp = ppa[i]) != NULL) {
3124 3124 ASSERT(!hat_page_is_mapped(pp));
3125 3125 ASSERT(pp->p_szc == 0);
3126 3126 page_unlock(pp);
3127 3127 }
3128 3128 }
3129 3129 if (ppasize != 0) {
3130 3130 kmem_free(ppa, ppasize);
3131 3131 }
3132 3132 return (1);
3133 3133 }
3134 3134
3135 3135 /*
3136 3136 * anon_map_demotepages() can only be called by MAP_PRIVATE segments.
3137 3137 */
3138 3138 int
3139 3139 anon_map_demotepages(
3140 3140 struct anon_map *amp,
3141 3141 ulong_t start_idx,
3142 3142 struct seg *seg,
3143 3143 caddr_t addr,
3144 3144 uint_t prot,
3145 3145 struct vpage vpage[],
3146 3146 struct cred *cred)
3147 3147 {
3148 3148 struct anon *ap;
3149 3149 uint_t szc = seg->s_szc;
3150 3150 pgcnt_t pgcnt = page_get_pagecnt(szc);
3151 3151 size_t ppasize = pgcnt * sizeof (page_t *);
3152 3152 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
3153 3153 page_t *pp;
3154 3154 page_t *pl[2];
3155 3155 pgcnt_t i, pg_idx;
3156 3156 ulong_t an_idx;
3157 3157 caddr_t vaddr;
3158 3158 int err;
3159 3159 int retry = 0;
3160 3160 uint_t vpprot;
3161 3161
3162 3162 ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
3163 3163 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3164 3164 ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
3165 3165 ASSERT(ppa != NULL);
3166 3166 ASSERT(szc != 0);
3167 3167 ASSERT(szc == amp->a_szc);
3168 3168
3169 3169 VM_STAT_ADD(anonvmstats.demotepages[0]);
3170 3170
3171 3171 top:
3172 3172 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) {
3173 3173 kmem_free(ppa, ppasize);
3174 3174 return (0);
3175 3175 }
3176 3176
3177 3177 VM_STAT_ADD(anonvmstats.demotepages[4]);
3178 3178
3179 3179 ASSERT(retry == 0); /* we can be here only once */
3180 3180
3181 3181 vaddr = addr;
3182 3182 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
3183 3183 pg_idx++, an_idx++, vaddr += PAGESIZE) {
3184 3184 ap = anon_get_ptr(amp->ahp, an_idx);
3185 3185 if (ap == NULL)
3186 3186 panic("anon_map_demotepages: no anon slot");
3187 3187 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
3188 3188 S_READ, cred);
3189 3189 if (err) {
3190 3190 for (i = 0; i < pg_idx; i++) {
3191 3191 if ((pp = ppa[i]) != NULL)
3192 3192 page_unlock(pp);
3193 3193 }
3194 3194 kmem_free(ppa, ppasize);
3195 3195 return (err);
3196 3196 }
3197 3197 ppa[pg_idx] = pl[0];
3198 3198 }
3199 3199
3200 3200 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
3201 3201 vpage, -1, 0, cred);
3202 3202 if (err > 0) {
3203 3203 VM_STAT_ADD(anonvmstats.demotepages[5]);
3204 3204 kmem_free(ppa, ppasize);
3205 3205 return (err);
3206 3206 }
3207 3207 ASSERT(err == 0 || err == -1);
3208 3208 if (err == -1) {
3209 3209 VM_STAT_ADD(anonvmstats.demotepages[6]);
3210 3210 retry = 1;
3211 3211 goto top;
3212 3212 }
3213 3213 for (i = 0; i < pgcnt; i++) {
3214 3214 ASSERT(ppa[i] != NULL);
3215 3215 if (ppa[i]->p_szc != 0)
3216 3216 retry = 1;
3217 3217 page_unlock(ppa[i]);
3218 3218 }
3219 3219 if (retry) {
3220 3220 VM_STAT_ADD(anonvmstats.demotepages[7]);
3221 3221 goto top;
3222 3222 }
3223 3223
3224 3224 VM_STAT_ADD(anonvmstats.demotepages[8]);
3225 3225
3226 3226 kmem_free(ppa, ppasize);
3227 3227
3228 3228 return (0);
3229 3229 }
3230 3230
3231 3231 /*
3232 3232 * Free pages of shared anon map. It's assumed that anon maps don't share anon
3233 3233 * structures with private anon maps. Therefore all anon structures should
3234 3234 * have at most one reference at this point. This means underlying pages can
3235 3235 * be exclusively locked and demoted or freed. If not freeing the entire
3236 3236 * large pages demote the ends of the region we free to be able to free
3237 3237 * subpages. Page roots correspond to aligned index positions in anon map.
3238 3238 */
3239 3239 void
3240 3240 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
3241 3241 {
3242 3242 ulong_t eidx = sidx + btopr(len);
3243 3243 pgcnt_t pages = page_get_pagecnt(amp->a_szc);
3244 3244 struct anon_hdr *ahp = amp->ahp;
3245 3245 ulong_t tidx;
3246 3246 size_t size;
3247 3247 ulong_t sidx_aligned;
3248 3248 ulong_t eidx_aligned;
3249 3249
3250 3250 ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3251 3251 ASSERT(amp->refcnt <= 1);
3252 3252 ASSERT(amp->a_szc > 0);
3253 3253 ASSERT(eidx <= ahp->size);
3254 3254 ASSERT(!anon_share(ahp, sidx, btopr(len)));
3255 3255
3256 3256 if (len == 0) { /* XXX */
3257 3257 return;
3258 3258 }
3259 3259
3260 3260 sidx_aligned = P2ALIGN(sidx, pages);
3261 3261 if (sidx_aligned != sidx ||
3262 3262 (eidx < sidx_aligned + pages && eidx < ahp->size)) {
3263 3263 if (!anon_try_demote_pages(ahp, sidx_aligned,
3264 3264 amp->a_szc, NULL, 0)) {
3265 3265 panic("anon_shmap_free_pages: demote failed");
3266 3266 }
3267 3267 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) :
3268 3268 P2NPHASE(sidx, pages);
3269 3269 size <<= PAGESHIFT;
3270 3270 anon_free(ahp, sidx, size);
3271 3271 sidx = sidx_aligned + pages;
3272 3272 if (eidx <= sidx) {
3273 3273 return;
3274 3274 }
3275 3275 }
3276 3276 eidx_aligned = P2ALIGN(eidx, pages);
3277 3277 if (sidx < eidx_aligned) {
3278 3278 anon_free_pages(ahp, sidx,
3279 3279 (eidx_aligned - sidx) << PAGESHIFT,
3280 3280 amp->a_szc);
3281 3281 sidx = eidx_aligned;
3282 3282 }
3283 3283 ASSERT(sidx == eidx_aligned);
3284 3284 if (eidx == eidx_aligned) {
3285 3285 return;
3286 3286 }
3287 3287 tidx = eidx;
3288 3288 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL &&
3289 3289 tidx - sidx < pages) {
3290 3290 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) {
3291 3291 panic("anon_shmap_free_pages: demote failed");
3292 3292 }
3293 3293 size = (eidx - sidx) << PAGESHIFT;
3294 3294 anon_free(ahp, sidx, size);
3295 3295 } else {
3296 3296 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc);
3297 3297 }
3298 3298 }
3299 3299
3300 3300 /*
3301 3301 * This routine should be called with amp's writer lock when there're no other
3302 3302 * users of amp. All pcache entries of this amp must have been already
3303 3303 * inactivated. We must not drop a_rwlock here to prevent new users from
3304 3304 * attaching to this amp.
3305 3305 */
3306 3306 void
3307 3307 anonmap_purge(struct anon_map *amp)
3308 3308 {
3309 3309 ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3310 3310 ASSERT(amp->refcnt <= 1);
3311 3311
3312 3312 if (amp->a_softlockcnt != 0) {
3313 3313 seg_ppurge(NULL, amp, 0);
3314 3314 }
3315 3315
3316 3316 /*
3317 3317 * Since all pcache entries were already inactive before this routine
3318 3318 * was called seg_ppurge() couldn't return while there're still
3319 3319 * entries that can be found via the list anchored at a_phead. So we
3320 3320 * can assert this list is empty now. a_softlockcnt may be still non 0
3321 3321 * if asynchronous thread that manages pcache already removed pcache
3322 3322 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non
3323 3323 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if
3324 3324 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map
3325 3325 * before shamp_reclaim() is done with it. a_purgemtx also taken by
3326 3326 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a
3327 3327 * barrier that prevents anonmap_purge() to complete while
3328 3328 * shamp_reclaim() may still be referencing this amp.
3329 3329 */
3330 3330 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3331 3331 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3332 3332
3333 3333 mutex_enter(&amp->a_purgemtx);
3334 3334 while (amp->a_softlockcnt != 0) {
3335 3335 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3336 3336 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3337 3337 amp->a_purgewait = 1;
3338 3338 cv_wait(&amp->a_purgecv, &amp->a_purgemtx);
3339 3339 }
3340 3340 mutex_exit(&amp->a_purgemtx);
3341 3341
3342 3342 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3343 3343 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3344 3344 ASSERT(amp->a_softlockcnt == 0);
3345 3345 }
3346 3346
3347 3347 /*
3348 3348 * Allocate and initialize an anon_map structure for seg
3349 3349 * associating the given swap reservation with the new anon_map.
3350 3350 */
3351 3351 struct anon_map *
3352 3352 anonmap_alloc(size_t size, size_t swresv, int flags)
3353 3353 {
3354 3354 struct anon_map *amp;
3355 3355 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
3356 3356
3357 3357 amp = kmem_cache_alloc(anonmap_cache, kmflags);
3358 3358 if (amp == NULL) {
3359 3359 ASSERT(kmflags == KM_NOSLEEP);
3360 3360 return (NULL);
3361 3361 }
3362 3362
3363 3363 amp->ahp = anon_create(btopr(size), flags);
3364 3364 if (amp->ahp == NULL) {
3365 3365 ASSERT(flags == ANON_NOSLEEP);
3366 3366 kmem_cache_free(anonmap_cache, amp);
3367 3367 return (NULL);
3368 3368 }
3369 3369 amp->refcnt = 1;
3370 3370 amp->size = size;
3371 3371 amp->swresv = swresv;
3372 3372 amp->locality = 0;
3373 3373 amp->a_szc = 0;
3374 3374 amp->a_sp = NULL;
3375 3375 amp->a_softlockcnt = 0;
3376 3376 amp->a_purgewait = 0;
3377 3377 amp->a_phead.p_lnext = &amp->a_phead;
3378 3378 amp->a_phead.p_lprev = &amp->a_phead;
3379 3379
3380 3380 return (amp);
3381 3381 }
3382 3382
3383 3383 void
3384 3384 anonmap_free(struct anon_map *amp)
3385 3385 {
3386 3386 ASSERT(amp->ahp != NULL);
3387 3387 ASSERT(amp->refcnt == 0);
3388 3388 ASSERT(amp->a_softlockcnt == 0);
3389 3389 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3390 3390 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3391 3391
3392 3392 lgrp_shm_policy_fini(amp, NULL);
3393 3393 anon_release(amp->ahp, btopr(amp->size));
3394 3394 kmem_cache_free(anonmap_cache, amp);
3395 3395 }
3396 3396
3397 3397 /*
3398 3398 * Returns true if the app array has some empty slots.
3399 3399 * The offp and lenp parameters are in/out parameters. On entry
3400 3400 * these values represent the starting offset and length of the
3401 3401 * mapping. When true is returned, these values may be modified
3402 3402 * to be the largest range which includes empty slots.
3403 3403 */
3404 3404 int
3405 3405 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
3406 3406 size_t *lenp)
3407 3407 {
3408 3408 ulong_t i, el;
3409 3409 ssize_t low, high;
3410 3410 struct anon *ap;
3411 3411
3412 3412 low = -1;
3413 3413 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
3414 3414 ap = anon_get_ptr(ahp, anon_idx);
3415 3415 if (ap == NULL) {
3416 3416 if (low == -1)
3417 3417 low = i;
3418 3418 high = i;
3419 3419 }
3420 3420 }
3421 3421 if (low != -1) {
3422 3422 /*
3423 3423 * Found at least one non-anon page.
3424 3424 * Set up the off and len return values.
3425 3425 */
3426 3426 if (low != 0)
3427 3427 *offp += low;
3428 3428 *lenp = high - low + PAGESIZE;
3429 3429 return (1);
3430 3430 }
3431 3431 return (0);
3432 3432 }
3433 3433
3434 3434 /*
3435 3435 * Return a count of the number of existing anon pages in the anon array
3436 3436 * app in the range (off, off+len). The array and slots must be guaranteed
3437 3437 * stable by the caller.
3438 3438 */
3439 3439 pgcnt_t
3440 3440 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
3441 3441 {
3442 3442 pgcnt_t cnt = 0;
3443 3443
3444 3444 while (nslots-- > 0) {
3445 3445 if ((anon_get_ptr(ahp, anon_index)) != NULL)
3446 3446 cnt++;
3447 3447 anon_index++;
3448 3448 }
3449 3449 return (cnt);
3450 3450 }
3451 3451
3452 3452 /*
3453 3453 * Move reserved phys swap into memory swap (unreserve phys swap
3454 3454 * and reserve mem swap by the same amount).
3455 3455 * Used by segspt when it needs to lock reserved swap npages in memory
3456 3456 */
3457 3457 int
3458 3458 anon_swap_adjust(pgcnt_t npages)
3459 3459 {
3460 3460 pgcnt_t unlocked_mem_swap;
3461 3461
3462 3462 mutex_enter(&anoninfo_lock);
3463 3463
3464 3464 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3465 3465 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3466 3466
3467 3467 unlocked_mem_swap = k_anoninfo.ani_mem_resv
3468 3468 - k_anoninfo.ani_locked_swap;
3469 3469 if (npages > unlocked_mem_swap) {
3470 3470 spgcnt_t adjusted_swap = npages - unlocked_mem_swap;
3471 3471
3472 3472 /*
3473 3473 * if there is not enough unlocked mem swap we take missing
3474 3474 * amount from phys swap and give it to mem swap
3475 3475 */
3476 3476 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) {
3477 3477 mutex_exit(&anoninfo_lock);
3478 3478 return (ENOMEM);
3479 3479 }
3480 3480
3481 3481 k_anoninfo.ani_mem_resv += adjusted_swap;
3482 3482 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
3483 3483 k_anoninfo.ani_phys_resv -= adjusted_swap;
3484 3484
3485 3485 ANI_ADD(adjusted_swap);
3486 3486 }
3487 3487 k_anoninfo.ani_locked_swap += npages;
3488 3488
3489 3489 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3490 3490 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3491 3491
3492 3492 mutex_exit(&anoninfo_lock);
3493 3493
3494 3494 return (0);
3495 3495 }
3496 3496
3497 3497 /*
3498 3498 * 'unlocked' reserved mem swap so when it is unreserved it
3499 3499 * can be moved back phys (disk) swap
3500 3500 */
3501 3501 void
3502 3502 anon_swap_restore(pgcnt_t npages)
3503 3503 {
3504 3504 mutex_enter(&anoninfo_lock);
3505 3505
3506 3506 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3507 3507
3508 3508 ASSERT(k_anoninfo.ani_locked_swap >= npages);
3509 3509 k_anoninfo.ani_locked_swap -= npages;
3510 3510
3511 3511 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3512 3512
3513 3513 mutex_exit(&anoninfo_lock);
3514 3514 }
3515 3515
3516 3516 /*
3517 3517 * Return the pointer from the list for a
3518 3518 * specified anon index.
3519 3519 */
3520 3520 ulong_t *
3521 3521 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
3522 3522 {
3523 3523 struct anon **app;
3524 3524 void **ppp;
3525 3525
3526 3526 ASSERT(an_idx < ahp->size);
3527 3527
3528 3528 /*
3529 3529 * Single level case.
3530 3530 */
3531 3531 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
3532 3532 return ((ulong_t *)&ahp->array_chunk[an_idx]);
3533 3533 } else {
3534 3534
3535 3535 /*
3536 3536 * 2 level case.
3537 3537 */
3538 3538 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3539 3539 if (*ppp == NULL) {
3540 3540 mutex_enter(&ahp->serial_lock);
3541 3541 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3542 3542 if (*ppp == NULL)
3543 3543 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
3544 3544 mutex_exit(&ahp->serial_lock);
3545 3545 }
3546 3546 app = *ppp;
3547 3547 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
3548 3548 }
3549 3549 }
3550 3550
3551 3551 void
3552 3552 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
3553 3553 {
3554 3554 ulong_t *ap_slot;
3555 3555 kmutex_t *mtx;
3556 3556 kcondvar_t *cv;
3557 3557 int hash;
3558 3558
3559 3559 /*
3560 3560 * Use szc to determine anon slot(s) to appear atomic.
3561 3561 * If szc = 0, then lock the anon slot and mark it busy.
3562 3562 * If szc > 0, then lock the range of slots by getting the
3563 3563 * anon_array_lock for the first anon slot, and mark only the
3564 3564 * first anon slot busy to represent whole range being busy.
3565 3565 */
3566 3566
3567 3567 ASSERT(RW_READ_HELD(&amp->a_rwlock));
3568 3568 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
↓ open down ↓ |
3487 lines elided |
↑ open up ↑ |
3569 3569 hash = ANON_ARRAY_HASH(amp, an_idx);
3570 3570 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3571 3571 sobj->sync_cv = cv = &anon_array_cv[hash];
3572 3572 mutex_enter(mtx);
3573 3573 ap_slot = anon_get_slot(amp->ahp, an_idx);
3574 3574 while (ANON_ISBUSY(ap_slot))
3575 3575 cv_wait(cv, mtx);
3576 3576 ANON_SETBUSY(ap_slot);
3577 3577 sobj->sync_data = ap_slot;
3578 3578 mutex_exit(mtx);
3579 -}
3580 -
3581 -int
3582 -anon_array_try_enter(struct anon_map *amp, ulong_t an_idx,
3583 - anon_sync_obj_t *sobj)
3584 -{
3585 - ulong_t *ap_slot;
3586 - kmutex_t *mtx;
3587 - int hash;
3588 -
3589 - /*
3590 - * Try to lock a range of anon slots.
3591 - * Use szc to determine anon slot(s) to appear atomic.
3592 - * If szc = 0, then lock the anon slot and mark it busy.
3593 - * If szc > 0, then lock the range of slots by getting the
3594 - * anon_array_lock for the first anon slot, and mark only the
3595 - * first anon slot busy to represent whole range being busy.
3596 - * Fail if the mutex or the anon_array are busy.
3597 - */
3598 -
3599 - ASSERT(RW_READ_HELD(&amp->a_rwlock));
3600 - an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3601 - hash = ANON_ARRAY_HASH(amp, an_idx);
3602 - sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3603 - sobj->sync_cv = &anon_array_cv[hash];
3604 - if (!mutex_tryenter(mtx)) {
3605 - return (EWOULDBLOCK);
3606 - }
3607 - ap_slot = anon_get_slot(amp->ahp, an_idx);
3608 - if (ANON_ISBUSY(ap_slot)) {
3609 - mutex_exit(mtx);
3610 - return (EWOULDBLOCK);
3611 - }
3612 - ANON_SETBUSY(ap_slot);
3613 - sobj->sync_data = ap_slot;
3614 - mutex_exit(mtx);
3615 - return (0);
3616 3579 }
3617 3580
3618 3581 void
3619 3582 anon_array_exit(anon_sync_obj_t *sobj)
3620 3583 {
3621 3584 mutex_enter(sobj->sync_mutex);
3622 3585 ASSERT(ANON_ISBUSY(sobj->sync_data));
3623 3586 ANON_CLRBUSY(sobj->sync_data);
3624 3587 if (CV_HAS_WAITERS(sobj->sync_cv))
3625 3588 cv_broadcast(sobj->sync_cv);
3626 3589 mutex_exit(sobj->sync_mutex);
3627 3590 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX