5045 use atomic_{inc,dec}_* instead of atomic_add_*
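The substitution this change applies is mechanical: wherever a counter was bumped with atomic_add_*(&x, 1) or dropped with atomic_add_*(&x, -1), the dedicated atomic_inc_* / atomic_dec_* routine is used instead, so the intent reads directly from the call. The sketch below only illustrates the pattern; the counter and function names are made up for illustration and are not part of this file.

	#include <sys/atomic.h>

	static uint32_t example_counter;	/* hypothetical counter, illustration only */

	/* Old style: general-purpose add with a literal delta of 1. */
	void
	bump_old(void)
	{
		atomic_add_32(&example_counter, 1);
	}

	/* New style: dedicated increment routine; same effect, clearer intent. */
	void
	bump_new(void)
	{
		atomic_inc_32(&example_counter);
	}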
--- old/usr/src/uts/common/vm/vm_page.c
+++ new/usr/src/uts/common/vm/vm_page.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 26 /* All Rights Reserved */
27 27
28 28 /*
29 29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 30 * The Regents of the University of California
31 31 * All Rights Reserved
32 32 *
33 33 * University Acknowledgment- Portions of this document are derived from
34 34 * software developed by the University of California, Berkeley, and its
35 35 * contributors.
36 36 */
37 37
38 38 /*
39 39 * VM - physical page management.
40 40 */
41 41
42 42 #include <sys/types.h>
43 43 #include <sys/t_lock.h>
44 44 #include <sys/param.h>
45 45 #include <sys/systm.h>
46 46 #include <sys/errno.h>
47 47 #include <sys/time.h>
48 48 #include <sys/vnode.h>
49 49 #include <sys/vm.h>
50 50 #include <sys/vtrace.h>
51 51 #include <sys/swap.h>
52 52 #include <sys/cmn_err.h>
53 53 #include <sys/tuneable.h>
54 54 #include <sys/sysmacros.h>
55 55 #include <sys/cpuvar.h>
56 56 #include <sys/callb.h>
57 57 #include <sys/debug.h>
58 58 #include <sys/tnf_probe.h>
59 59 #include <sys/condvar_impl.h>
60 60 #include <sys/mem_config.h>
61 61 #include <sys/mem_cage.h>
62 62 #include <sys/kmem.h>
63 63 #include <sys/atomic.h>
64 64 #include <sys/strlog.h>
65 65 #include <sys/mman.h>
66 66 #include <sys/ontrap.h>
67 67 #include <sys/lgrp.h>
68 68 #include <sys/vfs.h>
69 69
70 70 #include <vm/hat.h>
71 71 #include <vm/anon.h>
72 72 #include <vm/page.h>
73 73 #include <vm/seg.h>
74 74 #include <vm/pvn.h>
75 75 #include <vm/seg_kmem.h>
76 76 #include <vm/vm_dep.h>
77 77 #include <sys/vm_usage.h>
78 78 #include <fs/fs_subr.h>
79 79 #include <sys/ddi.h>
80 80 #include <sys/modctl.h>
81 81
82 82 static int nopageage = 0;
83 83
84 84 static pgcnt_t max_page_get; /* max page_get request size in pages */
85 85 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
86 86
87 87 /*
88 88 * freemem_lock protects all freemem variables:
89 89 * availrmem. Also this lock protects the globals which track the
90 90 * availrmem changes for accurate kernel footprint calculation.
91 91 * See below for an explanation of these
92 92 * globals.
93 93 */
94 94 kmutex_t freemem_lock;
95 95 pgcnt_t availrmem;
96 96 pgcnt_t availrmem_initial;
97 97
98 98 /*
99 99 * These globals track availrmem changes to get a more accurate
100 100 * estimate of the kernel size. Historically pp_kernel is used for
101 101 * kernel size and is based on availrmem. But availrmem is adjusted for
102 102 * locked pages in the system not just for kernel locked pages.
103 103 * These new counters will track the pages locked through segvn and
104 104 * by explicit user locking.
105 105 *
106 106 * pages_locked : How many pages are locked because of user specified
107 107 * locking through mlock or plock.
108 108 *
109 109 * pages_useclaim,pages_claimed : These two variables track the
110 110 * claim adjustments because of the protection changes on a segvn segment.
111 111 *
112 112 * All these globals are protected by the same lock which protects availrmem.
113 113 */
114 114 pgcnt_t pages_locked = 0;
115 115 pgcnt_t pages_useclaim = 0;
116 116 pgcnt_t pages_claimed = 0;
117 117
118 118
119 119 /*
120 120 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
121 121 */
122 122 static kmutex_t new_freemem_lock;
123 123 static uint_t freemem_wait; /* someone waiting for freemem */
124 124 static kcondvar_t freemem_cv;
125 125
126 126 /*
127 127 * The logical page free list is maintained as two lists, the 'free'
128 128 * and the 'cache' lists.
129 129 * The free list contains those pages that should be reused first.
130 130 *
131 131 * The implementation of the lists is machine dependent.
132 132 * page_get_freelist(), page_get_cachelist(),
133 133 * page_list_sub(), and page_list_add()
134 134 * form the interface to the machine dependent implementation.
135 135 *
136 136 * Pages with p_free set are on the cache list.
137 137 * Pages with p_free and p_age set are on the free list.
138 138 *
139 139 * A page may be locked while on either list.
140 140 */
141 141
142 142 /*
143 143 * free list accounting stuff.
144 144 *
145 145 *
146 146 * Spread out the value for the number of pages on the
147 147 * page free and page cache lists. If there is just one
148 148 * value, then it must be under just one lock.
149 149 * The lock contention and cache traffic are a real bother.
150 150 *
151 151 * When we acquire and then drop a single pcf lock
152 152 * we can start in the middle of the array of pcf structures.
153 153 * If we acquire more than one pcf lock at a time, we need to
154 154 * start at the front to avoid deadlocking.
155 155 *
156 156 * pcf_count holds the number of pages in each pool.
157 157 *
158 158 * pcf_block is set when page_create_get_something() has asked the
159 159 * PSM page freelist and page cachelist routines without specifying
160 160 * a color and nothing came back. This is used to block anything
161 161 * else from moving pages from one list to the other while the
162 162 * lists are searched again. If a page is freed while pcf_block is
163 163 * set, then pcf_reserve is incremented. pcgs_unblock() takes care
164 164 * of clearing pcf_block, doing the wakeups, etc.
165 165 */
166 166
167 167 #define MAX_PCF_FANOUT NCPU
168 168 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
169 169 static uint_t pcf_fanout_mask = 0;
170 170
171 171 struct pcf {
172 172 kmutex_t pcf_lock; /* protects the structure */
173 173 uint_t pcf_count; /* page count */
174 174 uint_t pcf_wait; /* number of waiters */
175 175 uint_t pcf_block; /* pcgs flag to page_free() */
176 176 uint_t pcf_reserve; /* pages freed after pcf_block set */
177 177 uint_t pcf_fill[10]; /* to line up on the caches */
178 178 };
179 179
180 180 /*
181 181 * PCF_INDEX hash needs to be dynamic (every so often the hash changes where
182 182 * it will hash the cpu to). This is done to prevent a drain condition
183 183 * from happening. This drain condition will occur when pcf_count decrement
184 184 * occurs on cpu A and the increment of pcf_count always occurs on cpu B. An
185 185 * example of this shows up with device interrupts. The dma buffer is allocated
186 186 * by the cpu requesting the IO thus the pcf_count is decremented based on that.
187 187 * When the memory is returned by the interrupt thread, the pcf_count will be
188 188 * incremented based on the cpu servicing the interrupt.
189 189 */
190 190 static struct pcf pcf[MAX_PCF_FANOUT];
191 191 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
192 192 (randtick() >> 24)) & (pcf_fanout_mask))
193 193
194 194 static int pcf_decrement_bucket(pgcnt_t);
195 195 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
196 196
197 197 kmutex_t pcgs_lock; /* serializes page_create_get_ */
198 198 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
199 199 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
200 200 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
201 201
202 202 #ifdef VM_STATS
203 203
204 204 /*
205 205 * No locks, but so what, they are only statistics.
206 206 */
207 207
208 208 static struct page_tcnt {
209 209 int pc_free_cache; /* free's into cache list */
210 210 int pc_free_dontneed; /* free's with dontneed */
211 211 int pc_free_pageout; /* free's from pageout */
212 212 int pc_free_free; /* free's into free list */
213 213 int pc_free_pages; /* free's into large page free list */
214 214 int pc_destroy_pages; /* large page destroy's */
215 215 int pc_get_cache; /* get's from cache list */
216 216 int pc_get_free; /* get's from free list */
217 217 int pc_reclaim; /* reclaim's */
218 218 int pc_abortfree; /* abort's of free pages */
219 219 int pc_find_hit; /* find's that find page */
220 220 int pc_find_miss; /* find's that don't find page */
221 221 int pc_destroy_free; /* # of free pages destroyed */
222 222 #define PC_HASH_CNT (4*PAGE_HASHAVELEN)
223 223 int pc_find_hashlen[PC_HASH_CNT+1];
224 224 int pc_addclaim_pages;
225 225 int pc_subclaim_pages;
226 226 int pc_free_replacement_page[2];
227 227 int pc_try_demote_pages[6];
228 228 int pc_demote_pages[2];
229 229 } pagecnt;
230 230
231 231 uint_t hashin_count;
232 232 uint_t hashin_not_held;
233 233 uint_t hashin_already;
234 234
235 235 uint_t hashout_count;
236 236 uint_t hashout_not_held;
237 237
238 238 uint_t page_create_count;
239 239 uint_t page_create_not_enough;
240 240 uint_t page_create_not_enough_again;
241 241 uint_t page_create_zero;
242 242 uint_t page_create_hashout;
243 243 uint_t page_create_page_lock_failed;
244 244 uint_t page_create_trylock_failed;
245 245 uint_t page_create_found_one;
246 246 uint_t page_create_hashin_failed;
247 247 uint_t page_create_dropped_phm;
248 248
249 249 uint_t page_create_new;
250 250 uint_t page_create_exists;
251 251 uint_t page_create_putbacks;
252 252 uint_t page_create_overshoot;
253 253
254 254 uint_t page_reclaim_zero;
255 255 uint_t page_reclaim_zero_locked;
256 256
257 257 uint_t page_rename_exists;
258 258 uint_t page_rename_count;
259 259
260 260 uint_t page_lookup_cnt[20];
261 261 uint_t page_lookup_nowait_cnt[10];
262 262 uint_t page_find_cnt;
263 263 uint_t page_exists_cnt;
264 264 uint_t page_exists_forreal_cnt;
265 265 uint_t page_lookup_dev_cnt;
266 266 uint_t get_cachelist_cnt;
267 267 uint_t page_create_cnt[10];
268 268 uint_t alloc_pages[9];
269 269 uint_t page_exphcontg[19];
270 270 uint_t page_create_large_cnt[10];
271 271
272 272 /*
273 273 * Collects statistics.
274 274 */
275 275 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
276 276 uint_t mylen = 0; \
277 277 \
278 278 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
279 279 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
280 280 break; \
281 281 } \
282 282 if ((pp) != NULL) \
283 283 pagecnt.pc_find_hit++; \
284 284 else \
285 285 pagecnt.pc_find_miss++; \
286 286 if (mylen > PC_HASH_CNT) \
287 287 mylen = PC_HASH_CNT; \
288 288 pagecnt.pc_find_hashlen[mylen]++; \
289 289 }
290 290
291 291 #else /* VM_STATS */
292 292
293 293 /*
294 294 * Don't collect statistics
295 295 */
296 296 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
297 297 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
298 298 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
299 299 break; \
300 300 } \
301 301 }
302 302
303 303 #endif /* VM_STATS */
304 304
305 305
306 306
307 307 #ifdef DEBUG
308 308 #define MEMSEG_SEARCH_STATS
309 309 #endif
310 310
311 311 #ifdef MEMSEG_SEARCH_STATS
312 312 struct memseg_stats {
313 313 uint_t nsearch;
314 314 uint_t nlastwon;
315 315 uint_t nhashwon;
316 316 uint_t nnotfound;
317 317 } memseg_stats;
318 318
319 319 #define MEMSEG_STAT_INCR(v) \
320 - atomic_add_32(&memseg_stats.v, 1)
320 + atomic_inc_32(&memseg_stats.v)
321 321 #else
322 322 #define MEMSEG_STAT_INCR(x)
323 323 #endif
324 324
325 325 struct memseg *memsegs; /* list of memory segments */
326 326
327 327 /*
328 328 * /etc/system tunable to control large page allocation heuristic.
329 329 *
330 330 * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
331 331 * for large page allocation requests. If a large page is not readily
332 332 * available on the local freelists, we will go through additional effort
333 333 * to create a large page, potentially moving smaller pages around to coalesce
334 334 * larger pages in the local lgroup.
335 335 * Default value of LPAP_DEFAULT will go to remote freelists if large pages
336 336 * are not readily available in the local lgroup.
337 337 */
338 338 enum lpap {
339 339 LPAP_DEFAULT, /* default large page allocation policy */
340 340 LPAP_LOCAL /* local large page allocation policy */
341 341 };
342 342
343 343 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
344 344
345 345 static void page_init_mem_config(void);
346 346 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
347 347 static void page_do_hashout(page_t *);
348 348 static void page_capture_init();
349 349 int page_capture_take_action(page_t *, uint_t, void *);
350 350
351 351 static void page_demote_vp_pages(page_t *);
352 352
353 353
354 354 void
355 355 pcf_init(void)
356 356
357 357 {
358 358 if (boot_ncpus != -1) {
359 359 pcf_fanout = boot_ncpus;
360 360 } else {
361 361 pcf_fanout = max_ncpus;
362 362 }
363 363 #ifdef sun4v
364 364 /*
365 365 * Force at least 4 buckets if possible for sun4v.
366 366 */
367 367 pcf_fanout = MAX(pcf_fanout, 4);
368 368 #endif /* sun4v */
369 369
370 370 /*
371 371 * Round up to the nearest power of 2.
372 372 */
373 373 pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
374 374 if (!ISP2(pcf_fanout)) {
375 375 pcf_fanout = 1 << highbit(pcf_fanout);
376 376
377 377 if (pcf_fanout > MAX_PCF_FANOUT) {
378 378 pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
379 379 }
380 380 }
381 381 pcf_fanout_mask = pcf_fanout - 1;
382 382 }
383 383
384 384 /*
385 385 * vm subsystem related initialization
386 386 */
387 387 void
388 388 vm_init(void)
389 389 {
390 390 boolean_t callb_vm_cpr(void *, int);
391 391
392 392 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
393 393 page_init_mem_config();
394 394 page_retire_init();
395 395 vm_usage_init();
396 396 page_capture_init();
397 397 }
398 398
399 399 /*
400 400 * This function is called at startup and when memory is added or deleted.
401 401 */
402 402 void
403 403 init_pages_pp_maximum()
404 404 {
405 405 static pgcnt_t p_min;
406 406 static pgcnt_t pages_pp_maximum_startup;
407 407 static pgcnt_t avrmem_delta;
408 408 static int init_done;
409 409 static int user_set; /* true if set in /etc/system */
410 410
411 411 if (init_done == 0) {
412 412
413 413 /* If the user specified a value, save it */
414 414 if (pages_pp_maximum != 0) {
415 415 user_set = 1;
416 416 pages_pp_maximum_startup = pages_pp_maximum;
417 417 }
418 418
419 419 /*
420 420 * Setting of pages_pp_maximum is based first time
421 421 * on the value of availrmem just after the start-up
422 422 * allocations. To preserve this relationship at run
423 423 * time, use a delta from availrmem_initial.
424 424 */
425 425 ASSERT(availrmem_initial >= availrmem);
426 426 avrmem_delta = availrmem_initial - availrmem;
427 427
428 428 /* The allowable floor of pages_pp_maximum */
429 429 p_min = tune.t_minarmem + 100;
430 430
431 431 /* Make sure we don't come through here again. */
432 432 init_done = 1;
433 433 }
434 434 /*
435 435 * Determine pages_pp_maximum, the number of currently available
436 436 * pages (availrmem) that can't be `locked'. If not set by
437 437 * the user, we set it to 4% of the currently available memory
438 438 * plus 4MB.
439 439 * But we also insist that it be greater than tune.t_minarmem;
440 440 * otherwise a process could lock down a lot of memory, get swapped
441 441 * out, and never have enough to get swapped back in.
442 442 */
443 443 if (user_set)
444 444 pages_pp_maximum = pages_pp_maximum_startup;
445 445 else
446 446 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
447 447 + btop(4 * 1024 * 1024);
448 448
449 449 if (pages_pp_maximum <= p_min) {
450 450 pages_pp_maximum = p_min;
451 451 }
452 452 }
453 453
454 454 void
455 455 set_max_page_get(pgcnt_t target_total_pages)
456 456 {
457 457 max_page_get = target_total_pages / 2;
458 458 }
459 459
460 460 static pgcnt_t pending_delete;
461 461
462 462 /*ARGSUSED*/
463 463 static void
464 464 page_mem_config_post_add(
465 465 void *arg,
466 466 pgcnt_t delta_pages)
467 467 {
468 468 set_max_page_get(total_pages - pending_delete);
469 469 init_pages_pp_maximum();
470 470 }
471 471
472 472 /*ARGSUSED*/
473 473 static int
474 474 page_mem_config_pre_del(
475 475 void *arg,
476 476 pgcnt_t delta_pages)
477 477 {
478 478 pgcnt_t nv;
479 479
480 480 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
481 481 set_max_page_get(total_pages - nv);
482 482 return (0);
483 483 }
484 484
485 485 /*ARGSUSED*/
486 486 static void
487 487 page_mem_config_post_del(
488 488 void *arg,
489 489 pgcnt_t delta_pages,
490 490 int cancelled)
491 491 {
492 492 pgcnt_t nv;
493 493
494 494 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
495 495 set_max_page_get(total_pages - nv);
496 496 if (!cancelled)
497 497 init_pages_pp_maximum();
498 498 }
499 499
500 500 static kphysm_setup_vector_t page_mem_config_vec = {
501 501 KPHYSM_SETUP_VECTOR_VERSION,
502 502 page_mem_config_post_add,
503 503 page_mem_config_pre_del,
504 504 page_mem_config_post_del,
505 505 };
506 506
507 507 static void
508 508 page_init_mem_config(void)
509 509 {
510 510 int ret;
511 511
512 512 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
513 513 ASSERT(ret == 0);
514 514 }
515 515
516 516 /*
517 517 * Evenly spread out the PCF counters for large free pages
518 518 */
519 519 static void
520 520 page_free_large_ctr(pgcnt_t npages)
521 521 {
522 522 static struct pcf *p = pcf;
523 523 pgcnt_t lump;
524 524
525 525 freemem += npages;
526 526
527 527 lump = roundup(npages, pcf_fanout) / pcf_fanout;
528 528
529 529 while (npages > 0) {
530 530
531 531 ASSERT(!p->pcf_block);
532 532
533 533 if (lump < npages) {
534 534 p->pcf_count += (uint_t)lump;
535 535 npages -= lump;
536 536 } else {
537 537 p->pcf_count += (uint_t)npages;
538 538 npages = 0;
539 539 }
540 540
541 541 ASSERT(!p->pcf_wait);
542 542
543 543 if (++p > &pcf[pcf_fanout - 1])
544 544 p = pcf;
545 545 }
546 546
547 547 ASSERT(npages == 0);
548 548 }
549 549
550 550 /*
551 551 * Add a physical chunk of memory to the system free lists during startup.
552 552 * Platform specific startup() allocates the memory for the page structs.
553 553 *
554 554 * num - number of page structures
555 555 * base - page number (pfn) to be associated with the first page.
556 556 *
557 557 * Since we are doing this during startup (ie. single threaded), we will
558 558 * use shortcut routines to avoid any locking overhead while putting all
559 559 * these pages on the freelists.
560 560 *
561 561 * NOTE: Any changes performed to page_free(), must also be performed to
562 562 * add_physmem() since this is how we initialize all page_t's at
563 563 * boot time.
564 564 */
565 565 void
566 566 add_physmem(
567 567 page_t *pp,
568 568 pgcnt_t num,
569 569 pfn_t pnum)
570 570 {
571 571 page_t *root = NULL;
572 572 uint_t szc = page_num_pagesizes() - 1;
573 573 pgcnt_t large = page_get_pagecnt(szc);
574 574 pgcnt_t cnt = 0;
575 575
576 576 TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
577 577 "add_physmem:pp %p num %lu", pp, num);
578 578
579 579 /*
580 580 * Arbitrarily limit the max page_get request
581 581 * to 1/2 of the page structs we have.
582 582 */
583 583 total_pages += num;
584 584 set_max_page_get(total_pages);
585 585
586 586 PLCNT_MODIFY_MAX(pnum, (long)num);
587 587
588 588 /*
589 589 * The physical space for the pages array
590 590 * representing ram pages has already been
591 591 * allocated. Here we initialize each lock
592 592 * in the page structure, and put each on
593 593 * the free list
594 594 */
595 595 for (; num; pp++, pnum++, num--) {
596 596
597 597 /*
598 598 * this needs to fill in the page number
599 599 * and do any other arch specific initialization
600 600 */
601 601 add_physmem_cb(pp, pnum);
602 602
603 603 pp->p_lckcnt = 0;
604 604 pp->p_cowcnt = 0;
605 605 pp->p_slckcnt = 0;
606 606
607 607 /*
608 608 * Initialize the page lock as unlocked, since nobody
609 609 * can see or access this page yet.
610 610 */
611 611 pp->p_selock = 0;
612 612
613 613 /*
614 614 * Initialize IO lock
615 615 */
616 616 page_iolock_init(pp);
617 617
618 618 /*
619 619 * initialize other fields in the page_t
620 620 */
621 621 PP_SETFREE(pp);
622 622 page_clr_all_props(pp);
623 623 PP_SETAGED(pp);
624 624 pp->p_offset = (u_offset_t)-1;
625 625 pp->p_next = pp;
626 626 pp->p_prev = pp;
627 627
628 628 /*
629 629 * Simple case: System doesn't support large pages.
630 630 */
631 631 if (szc == 0) {
632 632 pp->p_szc = 0;
633 633 page_free_at_startup(pp);
634 634 continue;
635 635 }
636 636
637 637 /*
638 638 * Handle unaligned pages, we collect them up onto
639 639 * the root page until we have a full large page.
640 640 */
641 641 if (!IS_P2ALIGNED(pnum, large)) {
642 642
643 643 /*
644 644 * If not in a large page,
645 645 * just free as small page.
646 646 */
647 647 if (root == NULL) {
648 648 pp->p_szc = 0;
649 649 page_free_at_startup(pp);
650 650 continue;
651 651 }
652 652
653 653 /*
654 654 * Link a constituent page into the large page.
655 655 */
656 656 pp->p_szc = szc;
657 657 page_list_concat(&root, &pp);
658 658
659 659 /*
660 660 * When large page is fully formed, free it.
661 661 */
662 662 if (++cnt == large) {
663 663 page_free_large_ctr(cnt);
664 664 page_list_add_pages(root, PG_LIST_ISINIT);
665 665 root = NULL;
666 666 cnt = 0;
667 667 }
668 668 continue;
669 669 }
670 670
671 671 /*
672 672 * At this point we have a page number which
673 673 * is aligned. We assert that we aren't already
674 674 * in a different large page.
675 675 */
676 676 ASSERT(IS_P2ALIGNED(pnum, large));
677 677 ASSERT(root == NULL && cnt == 0);
678 678
679 679 /*
680 680 * If insufficient number of pages left to form
681 681 * a large page, just free the small page.
682 682 */
683 683 if (num < large) {
684 684 pp->p_szc = 0;
685 685 page_free_at_startup(pp);
686 686 continue;
687 687 }
688 688
689 689 /*
690 690 * Otherwise start a new large page.
691 691 */
692 692 pp->p_szc = szc;
693 693 cnt++;
694 694 root = pp;
695 695 }
696 696 ASSERT(root == NULL && cnt == 0);
697 697 }
698 698
699 699 /*
700 700 * Find a page representing the specified [vp, offset].
701 701 * If we find the page but it is intransit coming in,
702 702 * it will have an "exclusive" lock and we wait for
703 703 * the i/o to complete. A page found on the free list
704 704 * is always reclaimed and then locked. On success, the page
705 705 * is locked, its data is valid and it isn't on the free
706 706 * list, while a NULL is returned if the page doesn't exist.
707 707 */
708 708 page_t *
709 709 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
710 710 {
711 711 return (page_lookup_create(vp, off, se, NULL, NULL, 0));
712 712 }
713 713
714 714 /*
715 715 * Find a page representing the specified [vp, offset].
716 716 * We either return the one we found or, if passed in,
717 717 * create one with identity of [vp, offset] of the
718 718 * pre-allocated page. If we find existing page but it is
719 719 * intransit coming in, it will have an "exclusive" lock
720 720 * and we wait for the i/o to complete. A page found on
721 721 * the free list is always reclaimed and then locked.
722 722 * On success, the page is locked, its data is valid and
723 723 * it isn't on the free list, while a NULL is returned
724 724 * if the page doesn't exist and newpp is NULL.
725 725 */
726 726 page_t *
727 727 page_lookup_create(
728 728 vnode_t *vp,
729 729 u_offset_t off,
730 730 se_t se,
731 731 page_t *newpp,
732 732 spgcnt_t *nrelocp,
733 733 int flags)
734 734 {
735 735 page_t *pp;
736 736 kmutex_t *phm;
737 737 ulong_t index;
738 738 uint_t hash_locked;
739 739 uint_t es;
740 740
741 741 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
742 742 VM_STAT_ADD(page_lookup_cnt[0]);
743 743 ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
744 744
745 745 /*
746 746 * Acquire the appropriate page hash lock since
747 747 * we have to search the hash list. Pages that
748 748 * hash to this list can't change identity while
749 749 * this lock is held.
750 750 */
751 751 hash_locked = 0;
752 752 index = PAGE_HASH_FUNC(vp, off);
753 753 phm = NULL;
754 754 top:
755 755 PAGE_HASH_SEARCH(index, pp, vp, off);
756 756 if (pp != NULL) {
757 757 VM_STAT_ADD(page_lookup_cnt[1]);
758 758 es = (newpp != NULL) ? 1 : 0;
759 759 es |= flags;
760 760 if (!hash_locked) {
761 761 VM_STAT_ADD(page_lookup_cnt[2]);
762 762 if (!page_try_reclaim_lock(pp, se, es)) {
763 763 /*
764 764 * On a miss, acquire the phm. Then
765 765 * next time, page_lock() will be called,
766 766 * causing a wait if the page is busy.
767 767 * just looping with page_trylock() would
768 768 * get pretty boring.
769 769 */
770 770 VM_STAT_ADD(page_lookup_cnt[3]);
771 771 phm = PAGE_HASH_MUTEX(index);
772 772 mutex_enter(phm);
773 773 hash_locked = 1;
774 774 goto top;
775 775 }
776 776 } else {
777 777 VM_STAT_ADD(page_lookup_cnt[4]);
778 778 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
779 779 VM_STAT_ADD(page_lookup_cnt[5]);
780 780 goto top;
781 781 }
782 782 }
783 783
784 784 /*
785 785 * Since `pp' is locked it can not change identity now.
786 786 * Reconfirm we locked the correct page.
787 787 *
788 788 * Both the p_vnode and p_offset *must* be cast volatile
789 789 * to force a reload of their values: The PAGE_HASH_SEARCH
790 790 * macro will have stuffed p_vnode and p_offset into
791 791 * registers before calling page_trylock(); another thread,
792 792 * actually holding the hash lock, could have changed the
793 793 * page's identity in memory, but our registers would not
794 794 * be changed, fooling the reconfirmation. If the hash
795 795 * lock was held during the search, the casting would
796 796 * not be needed.
797 797 */
798 798 VM_STAT_ADD(page_lookup_cnt[6]);
799 799 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
800 800 ((volatile u_offset_t)(pp->p_offset) != off)) {
801 801 VM_STAT_ADD(page_lookup_cnt[7]);
802 802 if (hash_locked) {
803 803 panic("page_lookup_create: lost page %p",
804 804 (void *)pp);
805 805 /*NOTREACHED*/
806 806 }
807 807 page_unlock(pp);
808 808 phm = PAGE_HASH_MUTEX(index);
809 809 mutex_enter(phm);
810 810 hash_locked = 1;
811 811 goto top;
812 812 }
813 813
814 814 /*
815 815 * If page_trylock() was called, then pp may still be on
816 816 * the cachelist (can't be on the free list, it would not
817 817 * have been found in the search). If it is on the
818 818 * cachelist it must be pulled now. To pull the page from
819 819 * the cachelist, it must be exclusively locked.
820 820 *
821 821 * The other big difference between page_trylock() and
822 822 * page_lock(), is that page_lock() will pull the
823 823 * page from whatever free list (the cache list in this
824 824 * case) the page is on. If page_trylock() was used
825 825 * above, then we have to do the reclaim ourselves.
826 826 */
827 827 if ((!hash_locked) && (PP_ISFREE(pp))) {
828 828 ASSERT(PP_ISAGED(pp) == 0);
829 829 VM_STAT_ADD(page_lookup_cnt[8]);
830 830
831 831 /*
832 832 * page_reclaim will ensure that we
833 833 * have this page exclusively
834 834 */
835 835
836 836 if (!page_reclaim(pp, NULL)) {
837 837 /*
838 838 * Page_reclaim dropped whatever lock
839 839 * we held.
840 840 */
841 841 VM_STAT_ADD(page_lookup_cnt[9]);
842 842 phm = PAGE_HASH_MUTEX(index);
843 843 mutex_enter(phm);
844 844 hash_locked = 1;
845 845 goto top;
846 846 } else if (se == SE_SHARED && newpp == NULL) {
847 847 VM_STAT_ADD(page_lookup_cnt[10]);
848 848 page_downgrade(pp);
849 849 }
850 850 }
851 851
852 852 if (hash_locked) {
853 853 mutex_exit(phm);
854 854 }
855 855
856 856 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
857 857 PAGE_EXCL(pp) && nrelocp != NULL) {
858 858 ASSERT(nrelocp != NULL);
859 859 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
860 860 NULL);
861 861 if (*nrelocp > 0) {
862 862 VM_STAT_COND_ADD(*nrelocp == 1,
863 863 page_lookup_cnt[11]);
864 864 VM_STAT_COND_ADD(*nrelocp > 1,
865 865 page_lookup_cnt[12]);
866 866 pp = newpp;
867 867 se = SE_EXCL;
868 868 } else {
869 869 if (se == SE_SHARED) {
870 870 page_downgrade(pp);
871 871 }
872 872 VM_STAT_ADD(page_lookup_cnt[13]);
873 873 }
874 874 } else if (newpp != NULL && nrelocp != NULL) {
875 875 if (PAGE_EXCL(pp) && se == SE_SHARED) {
876 876 page_downgrade(pp);
877 877 }
878 878 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
879 879 page_lookup_cnt[14]);
880 880 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
881 881 page_lookup_cnt[15]);
882 882 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
883 883 page_lookup_cnt[16]);
884 884 } else if (newpp != NULL && PAGE_EXCL(pp)) {
885 885 se = SE_EXCL;
886 886 }
887 887 } else if (!hash_locked) {
888 888 VM_STAT_ADD(page_lookup_cnt[17]);
889 889 phm = PAGE_HASH_MUTEX(index);
890 890 mutex_enter(phm);
891 891 hash_locked = 1;
892 892 goto top;
893 893 } else if (newpp != NULL) {
894 894 /*
895 895 * If we have a preallocated page then
896 896 * insert it now and basically behave like
897 897 * page_create.
898 898 */
899 899 VM_STAT_ADD(page_lookup_cnt[18]);
900 900 /*
901 901 * Since we hold the page hash mutex and
902 902 * just searched for this page, page_hashin
903 903 * had better not fail. If it does, that
904 904 * means some thread did not follow the
905 905 * page hash mutex rules. Panic now and
906 906 * get it over with. As usual, go down
907 907 * holding all the locks.
908 908 */
909 909 ASSERT(MUTEX_HELD(phm));
910 910 if (!page_hashin(newpp, vp, off, phm)) {
911 911 ASSERT(MUTEX_HELD(phm));
912 912 panic("page_lookup_create: hashin failed %p %p %llx %p",
913 913 (void *)newpp, (void *)vp, off, (void *)phm);
914 914 /*NOTREACHED*/
915 915 }
916 916 ASSERT(MUTEX_HELD(phm));
917 917 mutex_exit(phm);
918 918 phm = NULL;
919 919 page_set_props(newpp, P_REF);
920 920 page_io_lock(newpp);
921 921 pp = newpp;
922 922 se = SE_EXCL;
923 923 } else {
924 924 VM_STAT_ADD(page_lookup_cnt[19]);
925 925 mutex_exit(phm);
926 926 }
927 927
928 928 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
929 929
930 930 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
931 931
932 932 return (pp);
933 933 }
934 934
935 935 /*
936 936 * Search the hash list for the page representing the
937 937 * specified [vp, offset] and return it locked. Skip
938 938 * free pages and pages that cannot be locked as requested.
939 939 * Used while attempting to kluster pages.
940 940 */
941 941 page_t *
942 942 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
943 943 {
944 944 page_t *pp;
945 945 kmutex_t *phm;
946 946 ulong_t index;
947 947 uint_t locked;
948 948
949 949 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
950 950 VM_STAT_ADD(page_lookup_nowait_cnt[0]);
951 951
952 952 index = PAGE_HASH_FUNC(vp, off);
953 953 PAGE_HASH_SEARCH(index, pp, vp, off);
954 954 locked = 0;
955 955 if (pp == NULL) {
956 956 top:
957 957 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
958 958 locked = 1;
959 959 phm = PAGE_HASH_MUTEX(index);
960 960 mutex_enter(phm);
961 961 PAGE_HASH_SEARCH(index, pp, vp, off);
962 962 }
963 963
964 964 if (pp == NULL || PP_ISFREE(pp)) {
965 965 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
966 966 pp = NULL;
967 967 } else {
968 968 if (!page_trylock(pp, se)) {
969 969 VM_STAT_ADD(page_lookup_nowait_cnt[3]);
970 970 pp = NULL;
971 971 } else {
972 972 VM_STAT_ADD(page_lookup_nowait_cnt[4]);
973 973 /*
974 974 * See the comment in page_lookup()
975 975 */
976 976 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
977 977 ((u_offset_t)(pp->p_offset) != off)) {
978 978 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
979 979 if (locked) {
980 980 panic("page_lookup_nowait %p",
981 981 (void *)pp);
982 982 /*NOTREACHED*/
983 983 }
984 984 page_unlock(pp);
985 985 goto top;
986 986 }
987 987 if (PP_ISFREE(pp)) {
988 988 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
989 989 page_unlock(pp);
990 990 pp = NULL;
991 991 }
992 992 }
993 993 }
994 994 if (locked) {
995 995 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
996 996 mutex_exit(phm);
997 997 }
998 998
999 999 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
1000 1000
1001 1001 return (pp);
1002 1002 }
1003 1003
1004 1004 /*
1005 1005 * Search the hash list for a page with the specified [vp, off]
1006 1006 * that is known to exist and is already locked. This routine
1007 1007 * is typically used by segment SOFTUNLOCK routines.
1008 1008 */
1009 1009 page_t *
1010 1010 page_find(vnode_t *vp, u_offset_t off)
1011 1011 {
1012 1012 page_t *pp;
1013 1013 kmutex_t *phm;
1014 1014 ulong_t index;
1015 1015
1016 1016 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1017 1017 VM_STAT_ADD(page_find_cnt);
1018 1018
1019 1019 index = PAGE_HASH_FUNC(vp, off);
1020 1020 phm = PAGE_HASH_MUTEX(index);
1021 1021
1022 1022 mutex_enter(phm);
1023 1023 PAGE_HASH_SEARCH(index, pp, vp, off);
1024 1024 mutex_exit(phm);
1025 1025
1026 1026 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1027 1027 return (pp);
1028 1028 }
1029 1029
1030 1030 /*
1031 1031 * Determine whether a page with the specified [vp, off]
1032 1032 * currently exists in the system. Obviously this should
1033 1033 * only be considered as a hint since nothing prevents the
1034 1034 * page from disappearing or appearing immediately after
1035 1035 * the return from this routine. Subsequently, we don't
1036 1036 * even bother to lock the list.
1037 1037 */
1038 1038 page_t *
1039 1039 page_exists(vnode_t *vp, u_offset_t off)
1040 1040 {
1041 1041 page_t *pp;
1042 1042 ulong_t index;
1043 1043
1044 1044 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1045 1045 VM_STAT_ADD(page_exists_cnt);
1046 1046
1047 1047 index = PAGE_HASH_FUNC(vp, off);
1048 1048 PAGE_HASH_SEARCH(index, pp, vp, off);
1049 1049
1050 1050 return (pp);
1051 1051 }
1052 1052
1053 1053 /*
1054 1054 * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
1055 1055 * page_size(szc)) range. If they exist and ppa is not NULL, fill the ppa array
1056 1056 * with these pages locked SHARED. If necessary reclaim pages from
1057 1057 * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1058 1058 *
1059 1059 * If we fail to lock pages still return 1 if pages exist and contiguous.
1060 1060 * But in this case return value is just a hint. ppa array won't be filled.
1061 1061 * Caller should initialize ppa[0] as NULL to distinguish return value.
1062 1062 *
1063 1063 * Returns 0 if pages don't exist or not physically contiguous.
1064 1064 *
1065 1065 * This routine doesn't work for anonymous(swapfs) pages.
1066 1066 */
1067 1067 int
1068 1068 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1069 1069 {
1070 1070 pgcnt_t pages;
1071 1071 pfn_t pfn;
1072 1072 page_t *rootpp;
1073 1073 pgcnt_t i;
1074 1074 pgcnt_t j;
1075 1075 u_offset_t save_off = off;
1076 1076 ulong_t index;
1077 1077 kmutex_t *phm;
1078 1078 page_t *pp;
1079 1079 uint_t pszc;
1080 1080 int loopcnt = 0;
1081 1081
1082 1082 ASSERT(szc != 0);
1083 1083 ASSERT(vp != NULL);
1084 1084 ASSERT(!IS_SWAPFSVP(vp));
1085 1085 ASSERT(!VN_ISKAS(vp));
1086 1086
1087 1087 again:
1088 1088 if (++loopcnt > 3) {
1089 1089 VM_STAT_ADD(page_exphcontg[0]);
1090 1090 return (0);
1091 1091 }
1092 1092
1093 1093 index = PAGE_HASH_FUNC(vp, off);
1094 1094 phm = PAGE_HASH_MUTEX(index);
1095 1095
1096 1096 mutex_enter(phm);
1097 1097 PAGE_HASH_SEARCH(index, pp, vp, off);
1098 1098 mutex_exit(phm);
1099 1099
1100 1100 VM_STAT_ADD(page_exphcontg[1]);
1101 1101
1102 1102 if (pp == NULL) {
1103 1103 VM_STAT_ADD(page_exphcontg[2]);
1104 1104 return (0);
1105 1105 }
1106 1106
1107 1107 pages = page_get_pagecnt(szc);
1108 1108 rootpp = pp;
1109 1109 pfn = rootpp->p_pagenum;
1110 1110
1111 1111 if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1112 1112 VM_STAT_ADD(page_exphcontg[3]);
1113 1113 if (!page_trylock(pp, SE_SHARED)) {
1114 1114 VM_STAT_ADD(page_exphcontg[4]);
1115 1115 return (1);
1116 1116 }
1117 1117 /*
1118 1118 * Also check whether p_pagenum was modified by DR.
1119 1119 */
1120 1120 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1121 1121 pp->p_offset != off || pp->p_pagenum != pfn) {
1122 1122 VM_STAT_ADD(page_exphcontg[5]);
1123 1123 page_unlock(pp);
1124 1124 off = save_off;
1125 1125 goto again;
1126 1126 }
1127 1127 /*
1128 1128 * Since szc was non-zero and the vnode and offset matched after we
1129 1129 * locked the page, it can't become free on us.
1130 1130 */
1131 1131 ASSERT(!PP_ISFREE(pp));
1132 1132 if (!IS_P2ALIGNED(pfn, pages)) {
1133 1133 page_unlock(pp);
1134 1134 return (0);
1135 1135 }
1136 1136 ppa[0] = pp;
1137 1137 pp++;
1138 1138 off += PAGESIZE;
1139 1139 pfn++;
1140 1140 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1141 1141 if (!page_trylock(pp, SE_SHARED)) {
1142 1142 VM_STAT_ADD(page_exphcontg[6]);
1143 1143 pp--;
1144 1144 while (i-- > 0) {
1145 1145 page_unlock(pp);
1146 1146 pp--;
1147 1147 }
1148 1148 ppa[0] = NULL;
1149 1149 return (1);
1150 1150 }
1151 1151 if (pp->p_szc != pszc) {
1152 1152 VM_STAT_ADD(page_exphcontg[7]);
1153 1153 page_unlock(pp);
1154 1154 pp--;
1155 1155 while (i-- > 0) {
1156 1156 page_unlock(pp);
1157 1157 pp--;
1158 1158 }
1159 1159 ppa[0] = NULL;
1160 1160 off = save_off;
1161 1161 goto again;
1162 1162 }
1163 1163 /*
1164 1164 * The szc is the same as for the previously locked pages
1165 1165 * with the right identity. Since this page had the correct
1166 1166 * szc after we locked it, it can't get freed or destroyed
1167 1167 * and therefore must have the expected identity.
1168 1168 */
1169 1169 ASSERT(!PP_ISFREE(pp));
1170 1170 if (pp->p_vnode != vp ||
1171 1171 pp->p_offset != off) {
1172 1172 panic("page_exists_physcontig: "
1173 1173 "large page identity doesn't match");
1174 1174 }
1175 1175 ppa[i] = pp;
1176 1176 ASSERT(pp->p_pagenum == pfn);
1177 1177 }
1178 1178 VM_STAT_ADD(page_exphcontg[8]);
1179 1179 ppa[pages] = NULL;
1180 1180 return (1);
1181 1181 } else if (pszc >= szc) {
1182 1182 VM_STAT_ADD(page_exphcontg[9]);
1183 1183 if (!IS_P2ALIGNED(pfn, pages)) {
1184 1184 return (0);
1185 1185 }
1186 1186 return (1);
1187 1187 }
1188 1188
1189 1189 if (!IS_P2ALIGNED(pfn, pages)) {
1190 1190 VM_STAT_ADD(page_exphcontg[10]);
1191 1191 return (0);
1192 1192 }
1193 1193
1194 1194 if (page_numtomemseg_nolock(pfn) !=
1195 1195 page_numtomemseg_nolock(pfn + pages - 1)) {
1196 1196 VM_STAT_ADD(page_exphcontg[11]);
1197 1197 return (0);
1198 1198 }
1199 1199
1200 1200 /*
1201 1201 * We loop up to 4 times across pages to promote page size.
1202 1202 * We're extra cautious to promote page size atomically with respect
1203 1203 * to everybody else. But we can probably optimize into 1 loop if
1204 1204 * this becomes an issue.
1205 1205 */
1206 1206
1207 1207 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1208 1208 if (!page_trylock(pp, SE_EXCL)) {
1209 1209 VM_STAT_ADD(page_exphcontg[12]);
1210 1210 break;
1211 1211 }
1212 1212 /*
1213 1213 * Check whether p_pagenum was modified by DR.
1214 1214 */
1215 1215 if (pp->p_pagenum != pfn) {
1216 1216 page_unlock(pp);
1217 1217 break;
1218 1218 }
1219 1219 if (pp->p_vnode != vp ||
1220 1220 pp->p_offset != off) {
1221 1221 VM_STAT_ADD(page_exphcontg[13]);
1222 1222 page_unlock(pp);
1223 1223 break;
1224 1224 }
1225 1225 if (pp->p_szc >= szc) {
1226 1226 ASSERT(i == 0);
1227 1227 page_unlock(pp);
1228 1228 off = save_off;
1229 1229 goto again;
1230 1230 }
1231 1231 }
1232 1232
1233 1233 if (i != pages) {
1234 1234 VM_STAT_ADD(page_exphcontg[14]);
1235 1235 --pp;
1236 1236 while (i-- > 0) {
1237 1237 page_unlock(pp);
1238 1238 --pp;
1239 1239 }
1240 1240 return (0);
1241 1241 }
1242 1242
1243 1243 pp = rootpp;
1244 1244 for (i = 0; i < pages; i++, pp++) {
1245 1245 if (PP_ISFREE(pp)) {
1246 1246 VM_STAT_ADD(page_exphcontg[15]);
1247 1247 ASSERT(!PP_ISAGED(pp));
1248 1248 ASSERT(pp->p_szc == 0);
1249 1249 if (!page_reclaim(pp, NULL)) {
1250 1250 break;
1251 1251 }
1252 1252 } else {
1253 1253 ASSERT(pp->p_szc < szc);
1254 1254 VM_STAT_ADD(page_exphcontg[16]);
1255 1255 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1256 1256 }
1257 1257 }
1258 1258 if (i < pages) {
1259 1259 VM_STAT_ADD(page_exphcontg[17]);
1260 1260 /*
1261 1261 * Drop the rest of the locks and return because this page
1262 1262 * must already have been reallocated anyway.
1263 1263 * must be already reallocated anyway.
1264 1264 */
1265 1265 pp = rootpp;
1266 1266 for (j = 0; j < pages; j++, pp++) {
1267 1267 if (j != i) {
1268 1268 page_unlock(pp);
1269 1269 }
1270 1270 }
1271 1271 return (0);
1272 1272 }
1273 1273
1274 1274 off = save_off;
1275 1275 pp = rootpp;
1276 1276 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1277 1277 ASSERT(PAGE_EXCL(pp));
1278 1278 ASSERT(!PP_ISFREE(pp));
1279 1279 ASSERT(!hat_page_is_mapped(pp));
1280 1280 ASSERT(pp->p_vnode == vp);
1281 1281 ASSERT(pp->p_offset == off);
1282 1282 pp->p_szc = szc;
1283 1283 }
1284 1284 pp = rootpp;
1285 1285 for (i = 0; i < pages; i++, pp++) {
1286 1286 if (ppa == NULL) {
1287 1287 page_unlock(pp);
1288 1288 } else {
1289 1289 ppa[i] = pp;
1290 1290 page_downgrade(ppa[i]);
1291 1291 }
1292 1292 }
1293 1293 if (ppa != NULL) {
1294 1294 ppa[pages] = NULL;
1295 1295 }
1296 1296 VM_STAT_ADD(page_exphcontg[18]);
1297 1297 ASSERT(vp->v_pages != NULL);
1298 1298 return (1);
1299 1299 }
1300 1300
1301 1301 /*
1302 1302 * Determine whether a page with the specified [vp, off]
1303 1303 * currently exists in the system and if so return its
1304 1304 * size code. Obviously this should only be considered as
1305 1305 * a hint since nothing prevents the page from disappearing
1306 1306 * or appearing immediately after the return from this routine.
1307 1307 */
1308 1308 int
1309 1309 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1310 1310 {
1311 1311 page_t *pp;
1312 1312 kmutex_t *phm;
1313 1313 ulong_t index;
1314 1314 int rc = 0;
1315 1315
1316 1316 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1317 1317 ASSERT(szc != NULL);
1318 1318 VM_STAT_ADD(page_exists_forreal_cnt);
1319 1319
1320 1320 index = PAGE_HASH_FUNC(vp, off);
1321 1321 phm = PAGE_HASH_MUTEX(index);
1322 1322
1323 1323 mutex_enter(phm);
1324 1324 PAGE_HASH_SEARCH(index, pp, vp, off);
1325 1325 if (pp != NULL) {
1326 1326 *szc = pp->p_szc;
1327 1327 rc = 1;
1328 1328 }
1329 1329 mutex_exit(phm);
1330 1330 return (rc);
1331 1331 }
1332 1332
1333 1333 /* wakeup threads waiting for pages in page_create_get_something() */
1334 1334 void
1335 1335 wakeup_pcgs(void)
1336 1336 {
1337 1337 if (!CV_HAS_WAITERS(&pcgs_cv))
1338 1338 return;
1339 1339 cv_broadcast(&pcgs_cv);
1340 1340 }
1341 1341
1342 1342 /*
1343 1343 * 'freemem' is used all over the kernel as an indication of how many
1344 1344 * pages are free (either on the cache list or on the free page list)
1345 1345 * in the system. In very few places is a really accurate 'freemem'
1346 1346 * needed. To avoid contention on the lock protecting the
1347 1347 * single freemem, it was spread out into NCPU buckets. Set_freemem
1348 1348 * sets freemem to the total of all NCPU buckets. It is called from
1349 1349 * clock() on each TICK.
1350 1350 */
1351 1351 void
1352 1352 set_freemem()
1353 1353 {
1354 1354 struct pcf *p;
1355 1355 ulong_t t;
1356 1356 uint_t i;
1357 1357
1358 1358 t = 0;
1359 1359 p = pcf;
1360 1360 for (i = 0; i < pcf_fanout; i++) {
1361 1361 t += p->pcf_count;
1362 1362 p++;
1363 1363 }
1364 1364 freemem = t;
1365 1365
1366 1366 /*
1367 1367 * Don't worry about grabbing mutex. It's not that
1368 1368 * critical if we miss a tick or two. This is
1369 1369 * where we wakeup possible delayers in
1370 1370 * page_create_get_something().
1371 1371 */
1372 1372 wakeup_pcgs();
1373 1373 }
1374 1374
1375 1375 ulong_t
1376 1376 get_freemem()
1377 1377 {
1378 1378 struct pcf *p;
1379 1379 ulong_t t;
1380 1380 uint_t i;
1381 1381
1382 1382 t = 0;
1383 1383 p = pcf;
1384 1384 for (i = 0; i < pcf_fanout; i++) {
1385 1385 t += p->pcf_count;
1386 1386 p++;
1387 1387 }
1388 1388 /*
1389 1389 * We just calculated it, might as well set it.
1390 1390 */
1391 1391 freemem = t;
1392 1392 return (t);
1393 1393 }
1394 1394
1395 1395 /*
1396 1396 * Acquire all of the page cache & free (pcf) locks.
1397 1397 */
1398 1398 void
1399 1399 pcf_acquire_all()
1400 1400 {
1401 1401 struct pcf *p;
1402 1402 uint_t i;
1403 1403
1404 1404 p = pcf;
1405 1405 for (i = 0; i < pcf_fanout; i++) {
1406 1406 mutex_enter(&p->pcf_lock);
1407 1407 p++;
1408 1408 }
1409 1409 }
1410 1410
1411 1411 /*
1412 1412 * Release all the pcf_locks.
1413 1413 */
1414 1414 void
1415 1415 pcf_release_all()
1416 1416 {
1417 1417 struct pcf *p;
1418 1418 uint_t i;
1419 1419
1420 1420 p = pcf;
1421 1421 for (i = 0; i < pcf_fanout; i++) {
1422 1422 mutex_exit(&p->pcf_lock);
1423 1423 p++;
1424 1424 }
1425 1425 }
1426 1426
1427 1427 /*
1428 1428 * Inform the VM system that we need some pages freed up.
1429 1429 * Calls must be symmetric, e.g.:
1430 1430 *
1431 1431 * page_needfree(100);
1432 1432 * wait a bit;
1433 1433 * page_needfree(-100);
1434 1434 */
1435 1435 void
1436 1436 page_needfree(spgcnt_t npages)
1437 1437 {
1438 1438 mutex_enter(&new_freemem_lock);
1439 1439 needfree += npages;
1440 1440 mutex_exit(&new_freemem_lock);
1441 1441 }
1442 1442
1443 1443 /*
1444 1444 * Throttle for page_create(): try to prevent freemem from dropping
1445 1445 * below throttlefree. We can't provide a 100% guarantee because
1446 1446 * KM_NOSLEEP allocations, page_reclaim(), and various other things
1447 1447 * nibble away at the freelist. However, we can block all PG_WAIT
1448 1448 * allocations until memory becomes available. The motivation is
1449 1449 * that several things can fall apart when there's no free memory:
1450 1450 *
1451 1451 * (1) If pageout() needs memory to push a page, the system deadlocks.
1452 1452 *
1453 1453 * (2) By (broken) specification, timeout(9F) can neither fail nor
1454 1454 * block, so it has no choice but to panic the system if it
1455 1455 * cannot allocate a callout structure.
1456 1456 *
1457 1457 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1458 1458 * it panics if it cannot allocate a callback structure.
1459 1459 *
1460 1460 * (4) Untold numbers of third-party drivers have not yet been hardened
1461 1461 * against KM_NOSLEEP and/or allocb() failures; they simply assume
1462 1462 * success and panic the system with a data fault on failure.
1463 1463 * (The long-term solution to this particular problem is to ship
1464 1464 * hostile fault-injecting DEBUG kernels with the DDK.)
1465 1465 *
1466 1466 * It is theoretically impossible to guarantee success of non-blocking
1467 1467 * allocations, but in practice, this throttle is very hard to break.
1468 1468 */
1469 1469 static int
1470 1470 page_create_throttle(pgcnt_t npages, int flags)
1471 1471 {
1472 1472 ulong_t fm;
1473 1473 uint_t i;
1474 1474 pgcnt_t tf; /* effective value of throttlefree */
1475 1475
1476 1476 /*
1477 1477 * Normal priority allocations.
1478 1478 */
1479 1479 if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1480 1480 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1481 1481 return (freemem >= npages + throttlefree);
1482 1482 }
1483 1483
1484 1484 /*
1485 1485 * Never deny pages when:
1486 1486 * - it's a thread that cannot block [NOMEMWAIT()]
1487 1487 * - the allocation cannot block and must not fail
1488 1488 * - the allocation cannot block and is pageout dispensated
1489 1489 */
1490 1490 if (NOMEMWAIT() ||
1491 1491 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1492 1492 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1493 1493 return (1);
1494 1494
1495 1495 /*
1496 1496 * If the allocation can't block, we look favorably upon it
1497 1497 * unless we're below pageout_reserve. In that case we fail
1498 1498 * the allocation because we want to make sure there are a few
1499 1499 * pages available for pageout.
1500 1500 */
1501 1501 if ((flags & PG_WAIT) == 0)
1502 1502 return (freemem >= npages + pageout_reserve);
1503 1503
1504 1504 /* Calculate the effective throttlefree value */
1505 1505 tf = throttlefree -
1506 1506 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1507 1507
1508 1508 cv_signal(&proc_pageout->p_cv);
1509 1509
1510 1510 for (;;) {
1511 1511 fm = 0;
1512 1512 pcf_acquire_all();
1513 1513 mutex_enter(&new_freemem_lock);
1514 1514 for (i = 0; i < pcf_fanout; i++) {
1515 1515 fm += pcf[i].pcf_count;
1516 1516 pcf[i].pcf_wait++;
1517 1517 mutex_exit(&pcf[i].pcf_lock);
1518 1518 }
1519 1519 freemem = fm;
1520 1520 if (freemem >= npages + tf) {
1521 1521 mutex_exit(&new_freemem_lock);
1522 1522 break;
1523 1523 }
1524 1524 needfree += npages;
1525 1525 freemem_wait++;
1526 1526 cv_wait(&freemem_cv, &new_freemem_lock);
1527 1527 freemem_wait--;
1528 1528 needfree -= npages;
1529 1529 mutex_exit(&new_freemem_lock);
1530 1530 }
1531 1531 return (1);
1532 1532 }
1533 1533
1534 1534 /*
1535 1535 * page_create_wait() is called to either coalesce pages from the
1536 1536 * different pcf buckets or to wait because there simply are not
1537 1537 * enough pages to satisfy the caller's request.
1538 1538 *
1539 1539 * Sadly, this is called from platform/vm/vm_machdep.c
1540 1540 */
1541 1541 int
1542 1542 page_create_wait(pgcnt_t npages, uint_t flags)
1543 1543 {
1544 1544 pgcnt_t total;
1545 1545 uint_t i;
1546 1546 struct pcf *p;
1547 1547
1548 1548 /*
1549 1549 * Wait until there are enough free pages to satisfy our
1550 1550 * entire request.
1551 1551 * We set needfree += npages before prodding pageout, to make sure
1552 1552 * it does real work when npages > lotsfree > freemem.
1553 1553 */
1554 1554 VM_STAT_ADD(page_create_not_enough);
1555 1555
1556 1556 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1557 1557 checkagain:
1558 1558 if ((flags & PG_NORELOC) &&
1559 1559 kcage_freemem < kcage_throttlefree + npages)
1560 1560 (void) kcage_create_throttle(npages, flags);
1561 1561
1562 1562 if (freemem < npages + throttlefree)
1563 1563 if (!page_create_throttle(npages, flags))
1564 1564 return (0);
1565 1565
1566 1566 if (pcf_decrement_bucket(npages) ||
1567 1567 pcf_decrement_multiple(&total, npages, 0))
1568 1568 return (1);
1569 1569
1570 1570 /*
1571 1571 * All of the pcf locks are held, there are not enough pages
1572 1572 * to satisfy the request (npages < total).
1573 1573 * Be sure to acquire the new_freemem_lock before dropping
1574 1574 * the pcf locks. This prevents dropping wakeups in page_free().
1575 1575 * The order is always pcf_lock then new_freemem_lock.
1576 1576 *
1577 1577 * Since we hold all the pcf locks, it is a good time to set freemem.
1578 1578 *
1579 1579 * If the caller does not want to wait, return now.
1580 1580 * Else turn the pageout daemon loose to find something
1581 1581 * and wait till it does.
1582 1582 *
1583 1583 */
1584 1584 freemem = total;
1585 1585
1586 1586 if ((flags & PG_WAIT) == 0) {
1587 1587 pcf_release_all();
1588 1588
1589 1589 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1590 1590 "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1591 1591 return (0);
1592 1592 }
1593 1593
1594 1594 ASSERT(proc_pageout != NULL);
1595 1595 cv_signal(&proc_pageout->p_cv);
1596 1596
1597 1597 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1598 1598 "page_create_sleep_start: freemem %ld needfree %ld",
1599 1599 freemem, needfree);
1600 1600
1601 1601 /*
1602 1602 * We are going to wait.
1603 1603 * We currently hold all of the pcf_locks,
1604 1604 * get the new_freemem_lock (it protects freemem_wait),
1605 1605 * before dropping the pcf_locks.
1606 1606 */
1607 1607 mutex_enter(&new_freemem_lock);
1608 1608
1609 1609 p = pcf;
1610 1610 for (i = 0; i < pcf_fanout; i++) {
1611 1611 p->pcf_wait++;
1612 1612 mutex_exit(&p->pcf_lock);
1613 1613 p++;
1614 1614 }
1615 1615
1616 1616 needfree += npages;
1617 1617 freemem_wait++;
1618 1618
1619 1619 cv_wait(&freemem_cv, &new_freemem_lock);
1620 1620
1621 1621 freemem_wait--;
1622 1622 needfree -= npages;
1623 1623
1624 1624 mutex_exit(&new_freemem_lock);
1625 1625
1626 1626 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1627 1627 "page_create_sleep_end: freemem %ld needfree %ld",
1628 1628 freemem, needfree);
1629 1629
1630 1630 VM_STAT_ADD(page_create_not_enough_again);
1631 1631 goto checkagain;
1632 1632 }
1633 1633 /*
1634 1634 * A routine to do the opposite of page_create_wait().
1635 1635 */
1636 1636 void
1637 1637 page_create_putback(spgcnt_t npages)
1638 1638 {
1639 1639 struct pcf *p;
1640 1640 pgcnt_t lump;
1641 1641 uint_t *which;
1642 1642
1643 1643 /*
1644 1644 * When a contiguous lump is broken up, we have to
1645 1645 * deal with lots of pages (min 64), so let's spread
1646 1646 * the wealth around.
1647 1647 */
1648 1648 lump = roundup(npages, pcf_fanout) / pcf_fanout;
1649 1649 freemem += npages;
1650 1650
1651 1651 for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1652 1652 which = &p->pcf_count;
1653 1653
1654 1654 mutex_enter(&p->pcf_lock);
1655 1655
1656 1656 if (p->pcf_block) {
1657 1657 which = &p->pcf_reserve;
1658 1658 }
1659 1659
1660 1660 if (lump < npages) {
1661 1661 *which += (uint_t)lump;
1662 1662 npages -= lump;
1663 1663 } else {
1664 1664 *which += (uint_t)npages;
1665 1665 npages = 0;
1666 1666 }
1667 1667
1668 1668 if (p->pcf_wait) {
1669 1669 mutex_enter(&new_freemem_lock);
1670 1670 /*
1671 1671 * Check to see if some other thread
1672 1672 * is actually waiting. Another bucket
1673 1673 * may have woken it up by now. If there
1674 1674 * are no waiters, then set our pcf_wait
1675 1675 * count to zero to avoid coming in here
1676 1676 * next time.
1677 1677 */
1678 1678 if (freemem_wait) {
1679 1679 if (npages > 1) {
1680 1680 cv_broadcast(&freemem_cv);
1681 1681 } else {
1682 1682 cv_signal(&freemem_cv);
1683 1683 }
1684 1684 p->pcf_wait--;
1685 1685 } else {
1686 1686 p->pcf_wait = 0;
1687 1687 }
1688 1688 mutex_exit(&new_freemem_lock);
1689 1689 }
1690 1690 mutex_exit(&p->pcf_lock);
1691 1691 }
1692 1692 ASSERT(npages == 0);
1693 1693 }
1694 1694
1695 1695 /*
1696 1696 * A helper routine for page_create_get_something.
1697 1697 * The indenting got too deep down there.
1698 1698 * Unblock the pcf counters. Any pages freed after
1699 1699 * pcf_block got set are moved to pcf_count and
1700 1700 * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1701 1701 */
1702 1702 static void
1703 1703 pcgs_unblock(void)
1704 1704 {
1705 1705 int i;
1706 1706 struct pcf *p;
1707 1707
1708 1708 /* Update freemem while we're here. */
1709 1709 freemem = 0;
1710 1710 p = pcf;
1711 1711 for (i = 0; i < pcf_fanout; i++) {
1712 1712 mutex_enter(&p->pcf_lock);
1713 1713 ASSERT(p->pcf_count == 0);
1714 1714 p->pcf_count = p->pcf_reserve;
1715 1715 p->pcf_block = 0;
1716 1716 freemem += p->pcf_count;
1717 1717 if (p->pcf_wait) {
1718 1718 mutex_enter(&new_freemem_lock);
1719 1719 if (freemem_wait) {
1720 1720 if (p->pcf_reserve > 1) {
1721 1721 cv_broadcast(&freemem_cv);
1722 1722 p->pcf_wait = 0;
1723 1723 } else {
1724 1724 cv_signal(&freemem_cv);
1725 1725 p->pcf_wait--;
1726 1726 }
1727 1727 } else {
1728 1728 p->pcf_wait = 0;
1729 1729 }
1730 1730 mutex_exit(&new_freemem_lock);
1731 1731 }
1732 1732 p->pcf_reserve = 0;
1733 1733 mutex_exit(&p->pcf_lock);
1734 1734 p++;
1735 1735 }
1736 1736 }
1737 1737
1738 1738 /*
1739 1739 * Called from page_create_va() when both the cache and free lists
1740 1740 * have been checked once.
1741 1741 *
1742 1742 * Either returns a page or panics since the accounting was done
1743 1743 * way before we got here.
1744 1744 *
1745 1745 * We don't come here often, so leave the accounting on permanently.
1746 1746 */
1747 1747
1748 1748 #define MAX_PCGS 100
1749 1749
1750 1750 #ifdef DEBUG
1751 1751 #define PCGS_TRIES 100
1752 1752 #else /* DEBUG */
1753 1753 #define PCGS_TRIES 10
1754 1754 #endif /* DEBUG */
1755 1755
1756 1756 #ifdef VM_STATS
1757 1757 uint_t pcgs_counts[PCGS_TRIES];
1758 1758 uint_t pcgs_too_many;
1759 1759 uint_t pcgs_entered;
1760 1760 uint_t pcgs_entered_noreloc;
1761 1761 uint_t pcgs_locked;
1762 1762 uint_t pcgs_cagelocked;
1763 1763 #endif /* VM_STATS */
1764 1764
1765 1765 static page_t *
1766 1766 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1767 1767 caddr_t vaddr, uint_t flags)
1768 1768 {
1769 1769 uint_t count;
1770 1770 page_t *pp;
1771 1771 uint_t locked, i;
1772 1772 struct pcf *p;
1773 1773 lgrp_t *lgrp;
1774 1774 int cagelocked = 0;
1775 1775
1776 1776 VM_STAT_ADD(pcgs_entered);
1777 1777
1778 1778 /*
1779 1779 * Tap any reserve freelists: if we fail now, we'll die
1780 1780 * since the page(s) we're looking for have already been
1781 1781 * accounted for.
1782 1782 */
1783 1783 flags |= PG_PANIC;
1784 1784
1785 1785 if ((flags & PG_NORELOC) != 0) {
1786 1786 VM_STAT_ADD(pcgs_entered_noreloc);
1787 1787 /*
1788 1788 * Requests for free pages from critical threads
1789 1789 * such as pageout still won't throttle here, but
1790 1790 * we must try again, to give the cageout thread
1791 1791 * another chance to catch up. Since we already
1792 1792 * accounted for the pages, we had better get them
1793 1793 * this time.
1794 1794 *
1795 1795 * N.B. All non-critical threads acquire the pcgs_cagelock
1796 1796 * to serialize access to the freelists. This implements a
1797 1797  * turnstile-type synchronization to avoid starvation of
1798 1798 * critical requests for PG_NORELOC memory by non-critical
1799 1799 * threads: all non-critical threads must acquire a 'ticket'
1800 1800 * before passing through, which entails making sure
1801 1801 * kcage_freemem won't fall below minfree prior to grabbing
1802 1802 * pages from the freelists.
1803 1803 */
1804 1804 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1805 1805 mutex_enter(&pcgs_cagelock);
1806 1806 cagelocked = 1;
1807 1807 VM_STAT_ADD(pcgs_cagelocked);
1808 1808 }
1809 1809 }
1810 1810
1811 1811 /*
1812 1812 * Time to get serious.
1813 1813 * We failed to get a `correctly colored' page from both the
1814 1814 * free and cache lists.
1815 1815 * We escalate in stage.
1816 1816 *
1817 1817  * First try both lists without worrying about color.
1818 1818 *
1819 1819 * Then, grab all page accounting locks (ie. pcf[]) and
1820 1820 * steal any pages that they have and set the pcf_block flag to
1821 1821 * stop deletions from the lists. This will help because
1822 1822 * a page can get added to the free list while we are looking
1823 1823 * at the cache list, then another page could be added to the cache
1824 1824 * list allowing the page on the free list to be removed as we
1825 1825 * move from looking at the cache list to the free list. This
1826 1826 * could happen over and over. We would never find the page
1827 1827 * we have accounted for.
1828 1828 *
1829 1829 * Noreloc pages are a subset of the global (relocatable) page pool.
1830 1830 * They are not tracked separately in the pcf bins, so it is
1831 1831 * impossible to know when doing pcf accounting if the available
1832 1832 * page(s) are noreloc pages or not. When looking for a noreloc page
1833 1833 * it is quite easy to end up here even if the global (relocatable)
1834 1834 * page pool has plenty of free pages but the noreloc pool is empty.
1835 1835 *
1836 1836 * When the noreloc pool is empty (or low), additional noreloc pages
1837 1837 * are created by converting pages from the global page pool. This
1838 1838 * process will stall during pcf accounting if the pcf bins are
1839 1839 * already locked. Such is the case when a noreloc allocation is
1840 1840 * looping here in page_create_get_something waiting for more noreloc
1841 1841 * pages to appear.
1842 1842 *
1843 1843 * Short of adding a new field to the pcf bins to accurately track
1844 1844 * the number of free noreloc pages, we instead do not grab the
1845 1845 * pcgs_lock, do not set the pcf blocks and do not timeout when
1846 1846 * allocating a noreloc page. This allows noreloc allocations to
1847 1847 * loop without blocking global page pool allocations.
1848 1848 *
1849 1849 * NOTE: the behaviour of page_create_get_something has not changed
1850 1850 * for the case of global page pool allocations.
1851 1851 */
1852 1852
1853 1853 flags &= ~PG_MATCH_COLOR;
1854 1854 locked = 0;
1855 1855 #if defined(__i386) || defined(__amd64)
1856 1856 flags = page_create_update_flags_x86(flags);
1857 1857 #endif
1858 1858
1859 1859 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1860 1860
1861 1861 for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1862 1862 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1863 1863 flags, lgrp);
1864 1864 if (pp == NULL) {
1865 1865 pp = page_get_cachelist(vp, off, seg, vaddr,
1866 1866 flags, lgrp);
1867 1867 }
1868 1868 if (pp == NULL) {
1869 1869 /*
1870 1870 * Serialize. Don't fight with other pcgs().
1871 1871 */
1872 1872 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1873 1873 mutex_enter(&pcgs_lock);
1874 1874 VM_STAT_ADD(pcgs_locked);
1875 1875 locked = 1;
1876 1876 p = pcf;
1877 1877 for (i = 0; i < pcf_fanout; i++) {
1878 1878 mutex_enter(&p->pcf_lock);
1879 1879 ASSERT(p->pcf_block == 0);
1880 1880 p->pcf_block = 1;
1881 1881 p->pcf_reserve = p->pcf_count;
1882 1882 p->pcf_count = 0;
1883 1883 mutex_exit(&p->pcf_lock);
1884 1884 p++;
1885 1885 }
1886 1886 freemem = 0;
1887 1887 }
1888 1888
1889 1889 if (count) {
1890 1890 /*
1891 1891 * Since page_free() puts pages on
1892 1892 * a list then accounts for it, we
1893 1893 * just have to wait for page_free()
1894 1894 * to unlock any page it was working
1895 1895 * with. The page_lock()-page_reclaim()
1896 1896 * path falls in the same boat.
1897 1897 *
1898 1898 * We don't need to check on the
1899 1899 * PG_WAIT flag, we have already
1900 1900 * accounted for the page we are
1901 1901 * looking for in page_create_va().
1902 1902 *
1903 1903 * We just wait a moment to let any
1904 1904 * locked pages on the lists free up,
1905 1905 * then continue around and try again.
1906 1906 *
1907 1907 * Will be awakened by set_freemem().
1908 1908 */
1909 1909 mutex_enter(&pcgs_wait_lock);
1910 1910 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1911 1911 mutex_exit(&pcgs_wait_lock);
1912 1912 }
1913 1913 } else {
1914 1914 #ifdef VM_STATS
1915 1915 if (count >= PCGS_TRIES) {
1916 1916 VM_STAT_ADD(pcgs_too_many);
1917 1917 } else {
1918 1918 VM_STAT_ADD(pcgs_counts[count]);
1919 1919 }
1920 1920 #endif
1921 1921 if (locked) {
1922 1922 pcgs_unblock();
1923 1923 mutex_exit(&pcgs_lock);
1924 1924 }
1925 1925 if (cagelocked)
1926 1926 mutex_exit(&pcgs_cagelock);
1927 1927 return (pp);
1928 1928 }
1929 1929 }
1930 1930 /*
1931 1931 * we go down holding the pcf locks.
1932 1932 */
1933 1933 panic("no %spage found %d",
1934 1934 ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1935 1935 /*NOTREACHED*/
1936 1936 }
1937 1937
1938 1938 /*
1939 1939 * Create enough pages for "bytes" worth of data starting at
1940 1940 * "off" in "vp".
1941 1941 *
1942 1942  * Where flags must be one of:
1943 1943 *
1944 1944 * PG_EXCL: Exclusive create (fail if any page already
1945 1945 * exists in the page cache) which does not
1946 1946 * wait for memory to become available.
1947 1947 *
1948 1948 * PG_WAIT: Non-exclusive create which can wait for
1949 1949 * memory to become available.
1950 1950 *
1951 1951 * PG_PHYSCONTIG: Allocate physically contiguous pages.
1952 1952 * (Not Supported)
1953 1953 *
1954 1954 * A doubly linked list of pages is returned to the caller. Each page
1955 1955 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1956 1956 * lock.
1957 1957 *
1958 1958 * Unable to change the parameters to page_create() in a minor release,
1959 1959 * we renamed page_create() to page_create_va(), changed all known calls
1960 1960 * from page_create() to page_create_va(), and created this wrapper.
1961 1961 *
1962 1962 * Upon a major release, we should break compatibility by deleting this
1963 1963 * wrapper, and replacing all the strings "page_create_va", with "page_create".
1964 1964 *
1965 1965 * NOTE: There is a copy of this interface as page_create_io() in
1966 1966 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1967 1967 * there.
1968 1968 */
1969 1969 page_t *
1970 1970 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1971 1971 {
1972 1972 caddr_t random_vaddr;
1973 1973 struct seg kseg;
1974 1974
1975 1975 #ifdef DEBUG
1976 1976 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1977 1977 (void *)caller());
1978 1978 #endif
1979 1979
1980 1980 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1981 1981 (uintptr_t)(off >> PAGESHIFT));
1982 1982 kseg.s_as = &kas;
1983 1983
1984 1984 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1985 1985 }
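
As a hedged illustration of the interface described above (not code from this file), a caller that wants brand-new, zeroed pages for a range of a vnode might look roughly like the sketch below. sample_create_pages() is hypothetical; pagezero() and page_downgrade() are existing VM routines used here only for illustration.

	static int
	sample_create_pages(vnode_t *vp, u_offset_t off, size_t len,
	    struct seg *seg, caddr_t vaddr)
	{
		page_t *plist, *pp;

		plist = page_create_va(vp, off, len, PG_EXCL | PG_WAIT, seg, vaddr);
		if (plist == NULL)
			return (EEXIST);	/* some page in the range already exists */

		/* each page comes back holding both p_selock (EXCL) and the i/o lock */
		while ((pp = plist) != NULL) {
			page_sub(&plist, pp);
			pagezero(pp, 0, PAGESIZE);
			page_io_unlock(pp);
			page_downgrade(pp);	/* keep a shared lock; page_unlock() also works */
		}
		return (0);
	}
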
1986 1986
1987 1987 #ifdef DEBUG
1988 1988 uint32_t pg_alloc_pgs_mtbf = 0;
1989 1989 #endif
1990 1990
1991 1991 /*
1992 1992 * Used for large page support. It will attempt to allocate
1993 1993 * a large page(s) off the freelist.
1994 1994 *
1995 1995  * Returns non-zero on failure.
1996 1996 */
1997 1997 int
1998 1998 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1999 1999 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
2000 2000 {
2001 2001 pgcnt_t npgs, curnpgs, totpgs;
2002 2002 size_t pgsz;
2003 2003 page_t *pplist = NULL, *pp;
2004 2004 int err = 0;
2005 2005 lgrp_t *lgrp;
2006 2006
2007 2007 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
2008 2008 ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
2009 2009
2010 2010 /*
2011 2011 * Check if system heavily prefers local large pages over remote
2012 2012 * on systems with multiple lgroups.
2013 2013 */
2014 2014 if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2015 2015 pgflags = PG_LOCAL;
2016 2016 }
2017 2017
2018 2018 VM_STAT_ADD(alloc_pages[0]);
2019 2019
2020 2020 #ifdef DEBUG
2021 2021 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2022 2022 return (ENOMEM);
2023 2023 }
2024 2024 #endif
2025 2025
2026 2026 /*
2027 2027 	 * Exactly one of basepp and ppa must be NULL,
2028 2028 	 * and exactly one must be non-NULL.
2029 2029 */
2030 2030 ASSERT(basepp != NULL || ppa != NULL);
2031 2031 ASSERT(basepp == NULL || ppa == NULL);
2032 2032
2033 2033 #if defined(__i386) || defined(__amd64)
2034 2034 while (page_chk_freelist(szc) == 0) {
2035 2035 VM_STAT_ADD(alloc_pages[8]);
2036 2036 if (anypgsz == 0 || --szc == 0)
2037 2037 return (ENOMEM);
2038 2038 }
2039 2039 #endif
2040 2040
2041 2041 pgsz = page_get_pagesize(szc);
2042 2042 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2043 2043
2044 2044 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2045 2045
2046 2046 (void) page_create_wait(npgs, PG_WAIT);
2047 2047
2048 2048 while (npgs && szc) {
2049 2049 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2050 2050 if (pgflags == PG_LOCAL) {
2051 2051 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2052 2052 pgflags, lgrp);
2053 2053 if (pp == NULL) {
2054 2054 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2055 2055 0, lgrp);
2056 2056 }
2057 2057 } else {
2058 2058 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2059 2059 0, lgrp);
2060 2060 }
2061 2061 if (pp != NULL) {
2062 2062 VM_STAT_ADD(alloc_pages[1]);
2063 2063 page_list_concat(&pplist, &pp);
2064 2064 ASSERT(npgs >= curnpgs);
2065 2065 npgs -= curnpgs;
2066 2066 } else if (anypgsz) {
2067 2067 VM_STAT_ADD(alloc_pages[2]);
2068 2068 szc--;
2069 2069 pgsz = page_get_pagesize(szc);
2070 2070 curnpgs = pgsz >> PAGESHIFT;
2071 2071 } else {
2072 2072 VM_STAT_ADD(alloc_pages[3]);
2073 2073 ASSERT(npgs == totpgs);
2074 2074 page_create_putback(npgs);
2075 2075 return (ENOMEM);
2076 2076 }
2077 2077 }
2078 2078 if (szc == 0) {
2079 2079 VM_STAT_ADD(alloc_pages[4]);
2080 2080 ASSERT(npgs != 0);
2081 2081 page_create_putback(npgs);
2082 2082 err = ENOMEM;
2083 2083 } else if (basepp != NULL) {
2084 2084 ASSERT(npgs == 0);
2085 2085 ASSERT(ppa == NULL);
2086 2086 *basepp = pplist;
2087 2087 }
2088 2088
2089 2089 npgs = totpgs - npgs;
2090 2090 pp = pplist;
2091 2091
2092 2092 /*
2093 2093 * Clear the free and age bits. Also if we were passed in a ppa then
2094 2094 * fill it in with all the constituent pages from the large page. But
2095 2095 * if we failed to allocate all the pages just free what we got.
2096 2096 */
2097 2097 while (npgs != 0) {
2098 2098 ASSERT(PP_ISFREE(pp));
2099 2099 ASSERT(PP_ISAGED(pp));
2100 2100 if (ppa != NULL || err != 0) {
2101 2101 if (err == 0) {
2102 2102 VM_STAT_ADD(alloc_pages[5]);
2103 2103 PP_CLRFREE(pp);
2104 2104 PP_CLRAGED(pp);
2105 2105 page_sub(&pplist, pp);
2106 2106 *ppa++ = pp;
2107 2107 npgs--;
2108 2108 } else {
2109 2109 VM_STAT_ADD(alloc_pages[6]);
2110 2110 ASSERT(pp->p_szc != 0);
2111 2111 curnpgs = page_get_pagecnt(pp->p_szc);
2112 2112 page_list_break(&pp, &pplist, curnpgs);
2113 2113 page_list_add_pages(pp, 0);
2114 2114 page_create_putback(curnpgs);
2115 2115 ASSERT(npgs >= curnpgs);
2116 2116 npgs -= curnpgs;
2117 2117 }
2118 2118 pp = pplist;
2119 2119 } else {
2120 2120 VM_STAT_ADD(alloc_pages[7]);
2121 2121 PP_CLRFREE(pp);
2122 2122 PP_CLRAGED(pp);
2123 2123 pp = pp->p_next;
2124 2124 npgs--;
2125 2125 }
2126 2126 }
2127 2127 return (err);
2128 2128 }
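
A hedged sketch of a caller, not taken from this file: allocate the constituent pages of one large page of size class szc into a caller-supplied array, failing rather than degrading to a smaller size class. sample_alloc_large() and its ppap out-parameter are hypothetical.

	static int
	sample_alloc_large(struct vnode *vp, struct seg *seg, caddr_t addr,
	    uint_t szc, page_t ***ppap)
	{
		pgcnt_t npgs = page_get_pagesize(szc) >> PAGESHIFT;
		page_t **ppa = kmem_zalloc(npgs * sizeof (page_t *), KM_SLEEP);
		int err;

		/* anypgsz == 0: fail instead of retrying with a smaller szc */
		err = page_alloc_pages(vp, seg, addr, NULL, ppa, szc, 0, 0);
		if (err != 0) {
			kmem_free(ppa, npgs * sizeof (page_t *));
			return (err);
		}
		*ppap = ppa;	/* constituent pages are EXCL locked, free/age bits clear */
		return (0);
	}
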
2129 2129
2130 2130 /*
2131 2131 * Get a single large page off of the freelists, and set it up for use.
2132 2132 * Number of bytes requested must be a supported page size.
2133 2133 *
2134 2134 * Note that this call may fail even if there is sufficient
2135 2135 * memory available or PG_WAIT is set, so the caller must
2136 2136  * be willing to fall back on page_create_va(), block and retry,
2137 2137 * or fail the requester.
2138 2138 */
2139 2139 page_t *
2140 2140 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2141 2141 struct seg *seg, caddr_t vaddr, void *arg)
2142 2142 {
2143 2143 pgcnt_t npages;
2144 2144 page_t *pp;
2145 2145 page_t *rootpp;
2146 2146 lgrp_t *lgrp;
2147 2147 lgrp_id_t *lgrpid = (lgrp_id_t *)arg;
2148 2148
2149 2149 ASSERT(vp != NULL);
2150 2150
2151 2151 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2152 2152 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2153 2153 /* but no others */
2154 2154
2155 2155 ASSERT((flags & PG_EXCL) == PG_EXCL);
2156 2156
2157 2157 npages = btop(bytes);
2158 2158
2159 2159 if (!kcage_on || panicstr) {
2160 2160 /*
2161 2161 * Cage is OFF, or we are single threaded in
2162 2162 * panic, so make everything a RELOC request.
2163 2163 */
2164 2164 flags &= ~PG_NORELOC;
2165 2165 }
2166 2166
2167 2167 /*
2168 2168 * Make sure there's adequate physical memory available.
2169 2169 * Note: PG_WAIT is ignored here.
2170 2170 */
2171 2171 if (freemem <= throttlefree + npages) {
2172 2172 VM_STAT_ADD(page_create_large_cnt[1]);
2173 2173 return (NULL);
2174 2174 }
2175 2175
2176 2176 /*
2177 2177 * If cage is on, dampen draw from cage when available
2178 2178 * cage space is low.
2179 2179 */
2180 2180 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) &&
2181 2181 kcage_freemem < kcage_throttlefree + npages) {
2182 2182
2183 2183 /*
2184 2184 * The cage is on, the caller wants PG_NORELOC
2185 2185 * pages and available cage memory is very low.
2186 2186 * Call kcage_create_throttle() to attempt to
2187 2187 * control demand on the cage.
2188 2188 */
2189 2189 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2190 2190 VM_STAT_ADD(page_create_large_cnt[2]);
2191 2191 return (NULL);
2192 2192 }
2193 2193 }
2194 2194
2195 2195 if (!pcf_decrement_bucket(npages) &&
2196 2196 !pcf_decrement_multiple(NULL, npages, 1)) {
2197 2197 VM_STAT_ADD(page_create_large_cnt[4]);
2198 2198 return (NULL);
2199 2199 }
2200 2200
2201 2201 /*
2202 2202 * This is where this function behaves fundamentally differently
2203 2203 * than page_create_va(); since we're intending to map the page
2204 2204 * with a single TTE, we have to get it as a physically contiguous
2205 2205 * hardware pagesize chunk. If we can't, we fail.
2206 2206 */
2207 2207 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2208 2208 LGRP_EXISTS(lgrp_table[*lgrpid]))
2209 2209 lgrp = lgrp_table[*lgrpid];
2210 2210 else
2211 2211 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2212 2212
2213 2213 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2214 2214 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2215 2215 page_create_putback(npages);
2216 2216 VM_STAT_ADD(page_create_large_cnt[5]);
2217 2217 return (NULL);
2218 2218 }
2219 2219
2220 2220 /*
2221 2221 	 * If we got the page with the wrong mtype, give it back; this is a
2222 2222 	 * workaround for CR 6249718. When CR 6249718 is fixed we will never
2223 2223 	 * get inside the "if" and the workaround becomes just a nop.
2224 2224 */
2225 2225 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2226 2226 page_list_add_pages(rootpp, 0);
2227 2227 page_create_putback(npages);
2228 2228 VM_STAT_ADD(page_create_large_cnt[6]);
2229 2229 return (NULL);
2230 2230 }
2231 2231
2232 2232 /*
2233 2233 * If satisfying this request has left us with too little
2234 2234 * memory, start the wheels turning to get some back. The
2235 2235 * first clause of the test prevents waking up the pageout
2236 2236 * daemon in situations where it would decide that there's
2237 2237 * nothing to do.
2238 2238 */
2239 2239 if (nscan < desscan && freemem < minfree) {
2240 2240 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2241 2241 "pageout_cv_signal:freemem %ld", freemem);
2242 2242 cv_signal(&proc_pageout->p_cv);
2243 2243 }
2244 2244
2245 2245 pp = rootpp;
2246 2246 while (npages--) {
2247 2247 ASSERT(PAGE_EXCL(pp));
2248 2248 ASSERT(pp->p_vnode == NULL);
2249 2249 ASSERT(!hat_page_is_mapped(pp));
2250 2250 PP_CLRFREE(pp);
2251 2251 PP_CLRAGED(pp);
2252 2252 if (!page_hashin(pp, vp, off, NULL))
2253 2253 panic("page_create_large: hashin failed: page %p",
2254 2254 (void *)pp);
2255 2255 page_io_lock(pp);
2256 2256 off += PAGESIZE;
2257 2257 pp = pp->p_next;
2258 2258 }
2259 2259
2260 2260 VM_STAT_ADD(page_create_large_cnt[0]);
2261 2261 return (rootpp);
2262 2262 }
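
Because this call may fail even when memory is available, the caller is expected to have a fallback, as the comment above says. A minimal sketch (not from this file), assuming vp, off, pgsz, seg and vaddr belong to the hypothetical caller:

	page_t *pp;

	pp = page_create_va_large(vp, off, pgsz, PG_EXCL, seg, vaddr, NULL);
	if (pp == NULL) {
		/* large allocation failed: fall back to ordinary PAGESIZE pages */
		pp = page_create_va(vp, off, pgsz, PG_EXCL | PG_WAIT, seg, vaddr);
	}
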
2263 2263
2264 2264 page_t *
2265 2265 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2266 2266 struct seg *seg, caddr_t vaddr)
2267 2267 {
2268 2268 page_t *plist = NULL;
2269 2269 pgcnt_t npages;
2270 2270 pgcnt_t found_on_free = 0;
2271 2271 pgcnt_t pages_req;
2272 2272 page_t *npp = NULL;
2273 2273 struct pcf *p;
2274 2274 lgrp_t *lgrp;
2275 2275
2276 2276 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2277 2277 "page_create_start:vp %p off %llx bytes %lu flags %x",
2278 2278 vp, off, bytes, flags);
2279 2279
2280 2280 ASSERT(bytes != 0 && vp != NULL);
2281 2281
2282 2282 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2283 2283 panic("page_create: invalid flags");
2284 2284 /*NOTREACHED*/
2285 2285 }
2286 2286 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2287 2287 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2288 2288 /* but no others */
2289 2289
2290 2290 pages_req = npages = btopr(bytes);
2291 2291 /*
2292 2292 * Try to see whether request is too large to *ever* be
2293 2293 * satisfied, in order to prevent deadlock. We arbitrarily
2294 2294 * decide to limit maximum size requests to max_page_get.
2295 2295 */
2296 2296 if (npages >= max_page_get) {
2297 2297 if ((flags & PG_WAIT) == 0) {
2298 2298 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2299 2299 "page_create_toobig:vp %p off %llx npages "
2300 2300 "%lu max_page_get %lu",
2301 2301 vp, off, npages, max_page_get);
2302 2302 return (NULL);
2303 2303 } else {
2304 2304 cmn_err(CE_WARN,
2305 2305 "Request for too much kernel memory "
2306 2306 "(%lu bytes), will hang forever", bytes);
2307 2307 for (;;)
2308 2308 delay(1000000000);
2309 2309 }
2310 2310 }
2311 2311
2312 2312 if (!kcage_on || panicstr) {
2313 2313 /*
2314 2314 * Cage is OFF, or we are single threaded in
2315 2315 * panic, so make everything a RELOC request.
2316 2316 */
2317 2317 flags &= ~PG_NORELOC;
2318 2318 }
2319 2319
2320 2320 if (freemem <= throttlefree + npages)
2321 2321 if (!page_create_throttle(npages, flags))
2322 2322 return (NULL);
2323 2323
2324 2324 /*
2325 2325 * If cage is on, dampen draw from cage when available
2326 2326 * cage space is low.
2327 2327 */
2328 2328 if ((flags & PG_NORELOC) &&
2329 2329 kcage_freemem < kcage_throttlefree + npages) {
2330 2330
2331 2331 /*
2332 2332 * The cage is on, the caller wants PG_NORELOC
2333 2333 * pages and available cage memory is very low.
2334 2334 * Call kcage_create_throttle() to attempt to
2335 2335 * control demand on the cage.
2336 2336 */
2337 2337 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2338 2338 return (NULL);
2339 2339 }
2340 2340
2341 2341 VM_STAT_ADD(page_create_cnt[0]);
2342 2342
2343 2343 if (!pcf_decrement_bucket(npages)) {
2344 2344 /*
2345 2345 * Have to look harder. If npages is greater than
2346 2346 * one, then we might have to coalesce the counters.
2347 2347 *
2348 2348 * Go wait. We come back having accounted
2349 2349 * for the memory.
2350 2350 */
2351 2351 VM_STAT_ADD(page_create_cnt[1]);
2352 2352 if (!page_create_wait(npages, flags)) {
2353 2353 VM_STAT_ADD(page_create_cnt[2]);
2354 2354 return (NULL);
2355 2355 }
2356 2356 }
2357 2357
2358 2358 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2359 2359 "page_create_success:vp %p off %llx", vp, off);
2360 2360
2361 2361 /*
2362 2362 * If satisfying this request has left us with too little
2363 2363 * memory, start the wheels turning to get some back. The
2364 2364 * first clause of the test prevents waking up the pageout
2365 2365 * daemon in situations where it would decide that there's
2366 2366 * nothing to do.
2367 2367 */
2368 2368 if (nscan < desscan && freemem < minfree) {
2369 2369 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2370 2370 "pageout_cv_signal:freemem %ld", freemem);
2371 2371 cv_signal(&proc_pageout->p_cv);
2372 2372 }
2373 2373
2374 2374 /*
2375 2375 * Loop around collecting the requested number of pages.
2376 2376 * Most of the time, we have to `create' a new page. With
2377 2377 * this in mind, pull the page off the free list before
2378 2378 * getting the hash lock. This will minimize the hash
2379 2379 * lock hold time, nesting, and the like. If it turns
2380 2380 * out we don't need the page, we put it back at the end.
2381 2381 */
2382 2382 while (npages--) {
2383 2383 page_t *pp;
2384 2384 kmutex_t *phm = NULL;
2385 2385 ulong_t index;
2386 2386
2387 2387 index = PAGE_HASH_FUNC(vp, off);
2388 2388 top:
2389 2389 ASSERT(phm == NULL);
2390 2390 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2391 2391 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2392 2392
2393 2393 if (npp == NULL) {
2394 2394 /*
2395 2395 * Try to get a page from the freelist (ie,
2396 2396 * a page with no [vp, off] tag). If that
2397 2397 * fails, use the cachelist.
2398 2398 *
2399 2399 * During the first attempt at both the free
2400 2400 * and cache lists we try for the correct color.
2401 2401 */
2402 2402 /*
2403 2403 * XXXX-how do we deal with virtual indexed
2404 2404 			 * caches and colors?
2405 2405 */
2406 2406 VM_STAT_ADD(page_create_cnt[4]);
2407 2407 /*
2408 2408 * Get lgroup to allocate next page of shared memory
2409 2409 * from and use it to specify where to allocate
2410 2410 * the physical memory
2411 2411 */
2412 2412 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2413 2413 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2414 2414 flags | PG_MATCH_COLOR, lgrp);
2415 2415 if (npp == NULL) {
2416 2416 npp = page_get_cachelist(vp, off, seg,
2417 2417 vaddr, flags | PG_MATCH_COLOR, lgrp);
2418 2418 if (npp == NULL) {
2419 2419 npp = page_create_get_something(vp,
2420 2420 off, seg, vaddr,
2421 2421 flags & ~PG_MATCH_COLOR);
2422 2422 }
2423 2423
2424 2424 if (PP_ISAGED(npp) == 0) {
2425 2425 /*
2426 2426 * Since this page came from the
2427 2427 * cachelist, we must destroy the
2428 2428 * old vnode association.
2429 2429 */
2430 2430 page_hashout(npp, NULL);
2431 2431 }
2432 2432 }
2433 2433 }
2434 2434
2435 2435 /*
2436 2436 * We own this page!
2437 2437 */
2438 2438 ASSERT(PAGE_EXCL(npp));
2439 2439 ASSERT(npp->p_vnode == NULL);
2440 2440 ASSERT(!hat_page_is_mapped(npp));
2441 2441 PP_CLRFREE(npp);
2442 2442 PP_CLRAGED(npp);
2443 2443
2444 2444 /*
2445 2445 		 * Here we have a page in our hot little mitts and are
2446 2446 * just waiting to stuff it on the appropriate lists.
2447 2447 * Get the mutex and check to see if it really does
2448 2448 * not exist.
2449 2449 */
2450 2450 phm = PAGE_HASH_MUTEX(index);
2451 2451 mutex_enter(phm);
2452 2452 PAGE_HASH_SEARCH(index, pp, vp, off);
2453 2453 if (pp == NULL) {
2454 2454 VM_STAT_ADD(page_create_new);
2455 2455 pp = npp;
2456 2456 npp = NULL;
2457 2457 if (!page_hashin(pp, vp, off, phm)) {
2458 2458 /*
2459 2459 * Since we hold the page hash mutex and
2460 2460 * just searched for this page, page_hashin
2461 2461 * had better not fail. If it does, that
2462 2462 				 * means some thread did not follow the
2463 2463 * page hash mutex rules. Panic now and
2464 2464 * get it over with. As usual, go down
2465 2465 * holding all the locks.
2466 2466 */
2467 2467 ASSERT(MUTEX_HELD(phm));
2468 2468 panic("page_create: "
2469 2469 "hashin failed %p %p %llx %p",
2470 2470 (void *)pp, (void *)vp, off, (void *)phm);
2471 2471 /*NOTREACHED*/
2472 2472 }
2473 2473 ASSERT(MUTEX_HELD(phm));
2474 2474 mutex_exit(phm);
2475 2475 phm = NULL;
2476 2476
2477 2477 /*
2478 2478 * Hat layer locking need not be done to set
2479 2479 * the following bits since the page is not hashed
2480 2480 * and was on the free list (i.e., had no mappings).
2481 2481 *
2482 2482 * Set the reference bit to protect
2483 2483 * against immediate pageout
2484 2484 *
2485 2485 * XXXmh modify freelist code to set reference
2486 2486 * bit so we don't have to do it here.
2487 2487 */
2488 2488 page_set_props(pp, P_REF);
2489 2489 found_on_free++;
2490 2490 } else {
2491 2491 VM_STAT_ADD(page_create_exists);
2492 2492 if (flags & PG_EXCL) {
2493 2493 /*
2494 2494 * Found an existing page, and the caller
2495 2495 * wanted all new pages. Undo all of the work
2496 2496 * we have done.
2497 2497 */
2498 2498 mutex_exit(phm);
2499 2499 phm = NULL;
2500 2500 while (plist != NULL) {
2501 2501 pp = plist;
2502 2502 page_sub(&plist, pp);
2503 2503 page_io_unlock(pp);
2504 2504 /* large pages should not end up here */
2505 2505 ASSERT(pp->p_szc == 0);
2506 2506 /*LINTED: constant in conditional ctx*/
2507 2507 VN_DISPOSE(pp, B_INVAL, 0, kcred);
2508 2508 }
2509 2509 VM_STAT_ADD(page_create_found_one);
2510 2510 goto fail;
2511 2511 }
2512 2512 ASSERT(flags & PG_WAIT);
2513 2513 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2514 2514 /*
2515 2515 * Start all over again if we blocked trying
2516 2516 * to lock the page.
2517 2517 */
2518 2518 mutex_exit(phm);
2519 2519 VM_STAT_ADD(page_create_page_lock_failed);
2520 2520 phm = NULL;
2521 2521 goto top;
2522 2522 }
2523 2523 mutex_exit(phm);
2524 2524 phm = NULL;
2525 2525
2526 2526 if (PP_ISFREE(pp)) {
2527 2527 ASSERT(PP_ISAGED(pp) == 0);
2528 2528 VM_STAT_ADD(pagecnt.pc_get_cache);
2529 2529 page_list_sub(pp, PG_CACHE_LIST);
2530 2530 PP_CLRFREE(pp);
2531 2531 found_on_free++;
2532 2532 }
2533 2533 }
2534 2534
2535 2535 /*
2536 2536 * Got a page! It is locked. Acquire the i/o
2537 2537 * lock since we are going to use the p_next and
2538 2538 * p_prev fields to link the requested pages together.
2539 2539 */
2540 2540 page_io_lock(pp);
2541 2541 page_add(&plist, pp);
2542 2542 plist = plist->p_next;
2543 2543 off += PAGESIZE;
2544 2544 vaddr += PAGESIZE;
2545 2545 }
2546 2546
2547 2547 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2548 2548 fail:
2549 2549 if (npp != NULL) {
2550 2550 /*
2551 2551 * Did not need this page after all.
2552 2552 * Put it back on the free list.
2553 2553 */
2554 2554 VM_STAT_ADD(page_create_putbacks);
2555 2555 PP_SETFREE(npp);
2556 2556 PP_SETAGED(npp);
2557 2557 npp->p_offset = (u_offset_t)-1;
2558 2558 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2559 2559 page_unlock(npp);
2560 2560
2561 2561 }
2562 2562
2563 2563 ASSERT(pages_req >= found_on_free);
2564 2564
2565 2565 {
2566 2566 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2567 2567
2568 2568 if (overshoot) {
2569 2569 VM_STAT_ADD(page_create_overshoot);
2570 2570 p = &pcf[PCF_INDEX()];
2571 2571 mutex_enter(&p->pcf_lock);
2572 2572 if (p->pcf_block) {
2573 2573 p->pcf_reserve += overshoot;
2574 2574 } else {
2575 2575 p->pcf_count += overshoot;
2576 2576 if (p->pcf_wait) {
2577 2577 mutex_enter(&new_freemem_lock);
2578 2578 if (freemem_wait) {
2579 2579 cv_signal(&freemem_cv);
2580 2580 p->pcf_wait--;
2581 2581 } else {
2582 2582 p->pcf_wait = 0;
2583 2583 }
2584 2584 mutex_exit(&new_freemem_lock);
2585 2585 }
2586 2586 }
2587 2587 mutex_exit(&p->pcf_lock);
2588 2588 /* freemem is approximate, so this test OK */
2589 2589 if (!p->pcf_block)
2590 2590 freemem += overshoot;
2591 2591 }
2592 2592 }
2593 2593
2594 2594 return (plist);
2595 2595 }
2596 2596
2597 2597 /*
2598 2598  * One or more constituent pages of this large page have been marked
2599 2599 * toxic. Simply demote the large page to PAGESIZE pages and let
2600 2600 * page_free() handle it. This routine should only be called by
2601 2601  * large page free routines (page_free_pages() and page_destroy_pages()).
2602 2602 * All pages are locked SE_EXCL and have already been marked free.
2603 2603 */
2604 2604 static void
2605 2605 page_free_toxic_pages(page_t *rootpp)
2606 2606 {
2607 2607 page_t *tpp;
2608 2608 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2609 2609 uint_t szc = rootpp->p_szc;
2610 2610
2611 2611 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2612 2612 ASSERT(tpp->p_szc == szc);
2613 2613 ASSERT((PAGE_EXCL(tpp) &&
2614 2614 !page_iolock_assert(tpp)) || panicstr);
2615 2615 tpp->p_szc = 0;
2616 2616 }
2617 2617
2618 2618 while (rootpp != NULL) {
2619 2619 tpp = rootpp;
2620 2620 page_sub(&rootpp, tpp);
2621 2621 ASSERT(PP_ISFREE(tpp));
2622 2622 PP_CLRFREE(tpp);
2623 2623 page_free(tpp, 1);
2624 2624 }
2625 2625 }
2626 2626
2627 2627 /*
2628 2628 * Put page on the "free" list.
2629 2629 * The free list is really two lists maintained by
2630 2630 * the PSM of whatever machine we happen to be on.
2631 2631 */
2632 2632 void
2633 2633 page_free(page_t *pp, int dontneed)
2634 2634 {
2635 2635 struct pcf *p;
2636 2636 uint_t pcf_index;
2637 2637
2638 2638 ASSERT((PAGE_EXCL(pp) &&
2639 2639 !page_iolock_assert(pp)) || panicstr);
2640 2640
2641 2641 if (PP_ISFREE(pp)) {
2642 2642 panic("page_free: page %p is free", (void *)pp);
2643 2643 }
2644 2644
2645 2645 if (pp->p_szc != 0) {
2646 2646 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2647 2647 PP_ISKAS(pp)) {
2648 2648 panic("page_free: anon or kernel "
2649 2649 "or no vnode large page %p", (void *)pp);
2650 2650 }
2651 2651 page_demote_vp_pages(pp);
2652 2652 ASSERT(pp->p_szc == 0);
2653 2653 }
2654 2654
2655 2655 /*
2656 2656 * The page_struct_lock need not be acquired to examine these
2657 2657 * fields since the page has an "exclusive" lock.
2658 2658 */
2659 2659 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2660 2660 pp->p_slckcnt != 0) {
2661 2661 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2662 2662 "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2663 2663 pp->p_cowcnt, pp->p_slckcnt);
2664 2664 /*NOTREACHED*/
2665 2665 }
2666 2666
2667 2667 ASSERT(!hat_page_getshare(pp));
2668 2668
2669 2669 PP_SETFREE(pp);
2670 2670 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2671 2671 !hat_ismod(pp));
2672 2672 page_clr_all_props(pp);
2673 2673 ASSERT(!hat_page_getshare(pp));
2674 2674
2675 2675 /*
2676 2676 * Now we add the page to the head of the free list.
2677 2677 * But if this page is associated with a paged vnode
2678 2678 * then we adjust the head forward so that the page is
2679 2679 * effectively at the end of the list.
2680 2680 */
2681 2681 if (pp->p_vnode == NULL) {
2682 2682 /*
2683 2683 * Page has no identity, put it on the free list.
2684 2684 */
2685 2685 PP_SETAGED(pp);
2686 2686 pp->p_offset = (u_offset_t)-1;
2687 2687 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2688 2688 VM_STAT_ADD(pagecnt.pc_free_free);
2689 2689 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2690 2690 "page_free_free:pp %p", pp);
2691 2691 } else {
2692 2692 PP_CLRAGED(pp);
2693 2693
2694 2694 if (!dontneed || nopageage) {
2695 2695 /* move it to the tail of the list */
2696 2696 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2697 2697
2698 2698 VM_STAT_ADD(pagecnt.pc_free_cache);
2699 2699 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2700 2700 "page_free_cache_tail:pp %p", pp);
2701 2701 } else {
2702 2702 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2703 2703
2704 2704 VM_STAT_ADD(pagecnt.pc_free_dontneed);
2705 2705 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2706 2706 "page_free_cache_head:pp %p", pp);
2707 2707 }
2708 2708 }
2709 2709 page_unlock(pp);
2710 2710
2711 2711 /*
2712 2712 * Now do the `freemem' accounting.
2713 2713 */
2714 2714 pcf_index = PCF_INDEX();
2715 2715 p = &pcf[pcf_index];
2716 2716
2717 2717 mutex_enter(&p->pcf_lock);
2718 2718 if (p->pcf_block) {
2719 2719 p->pcf_reserve += 1;
2720 2720 } else {
2721 2721 p->pcf_count += 1;
2722 2722 if (p->pcf_wait) {
2723 2723 mutex_enter(&new_freemem_lock);
2724 2724 /*
2725 2725 * Check to see if some other thread
2726 2726 * is actually waiting. Another bucket
2727 2727 * may have woken it up by now. If there
2728 2728 * are no waiters, then set our pcf_wait
2729 2729 * count to zero to avoid coming in here
2730 2730 * next time. Also, since only one page
2731 2731 * was put on the free list, just wake
2732 2732 * up one waiter.
2733 2733 */
2734 2734 if (freemem_wait) {
2735 2735 cv_signal(&freemem_cv);
2736 2736 p->pcf_wait--;
2737 2737 } else {
2738 2738 p->pcf_wait = 0;
2739 2739 }
2740 2740 mutex_exit(&new_freemem_lock);
2741 2741 }
2742 2742 }
2743 2743 mutex_exit(&p->pcf_lock);
2744 2744
2745 2745 /* freemem is approximate, so this test OK */
2746 2746 if (!p->pcf_block)
2747 2747 freemem += 1;
2748 2748 }
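
As a hedged usage note, not code from this file: a caller that is finished with a clean, unmapped vnode page and does not expect it to be reused soon can pass dontneed, which sends the page to the head of the cachelist so it is recycled ahead of "warmer" cache pages.

	ASSERT(PAGE_EXCL(pp) && !hat_page_is_mapped(pp));
	page_free(pp, 1);	/* keeps the [vp, off] identity; page_free() drops the lock */
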
2749 2749
2750 2750 /*
2751 2751  * Put page on the "free" list during initial startup.
2752 2752 * This happens during initial single threaded execution.
2753 2753 */
2754 2754 void
2755 2755 page_free_at_startup(page_t *pp)
2756 2756 {
2757 2757 struct pcf *p;
2758 2758 uint_t pcf_index;
2759 2759
2760 2760 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2761 2761 VM_STAT_ADD(pagecnt.pc_free_free);
2762 2762
2763 2763 /*
2764 2764 * Now do the `freemem' accounting.
2765 2765 */
2766 2766 pcf_index = PCF_INDEX();
2767 2767 p = &pcf[pcf_index];
2768 2768
2769 2769 ASSERT(p->pcf_block == 0);
2770 2770 ASSERT(p->pcf_wait == 0);
2771 2771 p->pcf_count += 1;
2772 2772
2773 2773 /* freemem is approximate, so this is OK */
2774 2774 freemem += 1;
2775 2775 }
2776 2776
2777 2777 void
2778 2778 page_free_pages(page_t *pp)
2779 2779 {
2780 2780 page_t *tpp, *rootpp = NULL;
2781 2781 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2782 2782 pgcnt_t i;
2783 2783 uint_t szc = pp->p_szc;
2784 2784
2785 2785 VM_STAT_ADD(pagecnt.pc_free_pages);
2786 2786 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2787 2787 "page_free_free:pp %p", pp);
2788 2788
2789 2789 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2790 2790 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2791 2791 panic("page_free_pages: not root page %p", (void *)pp);
2792 2792 /*NOTREACHED*/
2793 2793 }
2794 2794
2795 2795 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2796 2796 ASSERT((PAGE_EXCL(tpp) &&
2797 2797 !page_iolock_assert(tpp)) || panicstr);
2798 2798 if (PP_ISFREE(tpp)) {
2799 2799 panic("page_free_pages: page %p is free", (void *)tpp);
2800 2800 /*NOTREACHED*/
2801 2801 }
2802 2802 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2803 2803 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2804 2804 panic("page_free_pages %p", (void *)tpp);
2805 2805 /*NOTREACHED*/
2806 2806 }
2807 2807
2808 2808 ASSERT(!hat_page_getshare(tpp));
2809 2809 ASSERT(tpp->p_vnode == NULL);
2810 2810 ASSERT(tpp->p_szc == szc);
2811 2811
2812 2812 PP_SETFREE(tpp);
2813 2813 page_clr_all_props(tpp);
2814 2814 PP_SETAGED(tpp);
2815 2815 tpp->p_offset = (u_offset_t)-1;
2816 2816 ASSERT(tpp->p_next == tpp);
2817 2817 ASSERT(tpp->p_prev == tpp);
2818 2818 page_list_concat(&rootpp, &tpp);
2819 2819 }
2820 2820 ASSERT(rootpp == pp);
2821 2821
2822 2822 page_list_add_pages(rootpp, 0);
2823 2823 page_create_putback(pgcnt);
2824 2824 }
2825 2825
2826 2826 int free_pages = 1;
2827 2827
2828 2828 /*
2829 2829 * This routine attempts to return pages to the cachelist via page_release().
2830 2830 * It does not *have* to be successful in all cases, since the pageout scanner
2831 2831 * will catch any pages it misses. It does need to be fast and not introduce
2832 2832 * too much overhead.
2833 2833 *
2834 2834 * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2835 2835 * don't lock and retry. This is ok, since the page scanner will eventually
2836 2836 * find any page we miss in free_vp_pages().
2837 2837 */
2838 2838 void
2839 2839 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2840 2840 {
2841 2841 page_t *pp;
2842 2842 u_offset_t eoff;
2843 2843 extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2844 2844
2845 2845 eoff = off + len;
2846 2846
2847 2847 if (free_pages == 0)
2848 2848 return;
2849 2849 if (swap_in_range(vp, off, len))
2850 2850 return;
2851 2851
2852 2852 for (; off < eoff; off += PAGESIZE) {
2853 2853
2854 2854 /*
2855 2855 * find the page using a fast, but inexact search. It'll be OK
2856 2856 * if a few pages slip through the cracks here.
2857 2857 */
2858 2858 pp = page_exists(vp, off);
2859 2859
2860 2860 /*
2861 2861 		 * If we didn't find the page (it may not exist), if the page
2862 2862 		 * is free, if it still looks in use (shared), or if we can't
2863 2863 		 * lock it, just give up.
2864 2864 */
2865 2865 if (pp == NULL ||
2866 2866 PP_ISFREE(pp) ||
2867 2867 page_share_cnt(pp) > 0 ||
2868 2868 !page_trylock(pp, SE_EXCL))
2869 2869 continue;
2870 2870
2871 2871 /*
2872 2872 * Once we have locked pp, verify that it's still the
2873 2873 * correct page and not already free
2874 2874 */
2875 2875 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2876 2876 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2877 2877 page_unlock(pp);
2878 2878 continue;
2879 2879 }
2880 2880
2881 2881 /*
2882 2882 * try to release the page...
2883 2883 */
2884 2884 (void) page_release(pp, 1);
2885 2885 }
2886 2886 }
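
A hedged sketch of the intended use, not from this file: after a large sequential read that the caller does not expect to revisit, hint the just-used window back to the cachelist; vp, off and len are the hypothetical caller's.

	/* after completing read-ahead of [off, off + len) that won't be reused soon */
	free_vp_pages(vp, off, len);	/* best effort; missed pages are left to the scanner */
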
2887 2887
2888 2888 /*
2889 2889 * Reclaim the given page from the free list.
2890 2890  * If pp is part of a large page, only the given constituent page is reclaimed
2891 2891 * and the large page it belonged to will be demoted. This can only happen
2892 2892 * if the page is not on the cachelist.
2893 2893 *
2894 2894 * Returns 1 on success or 0 on failure.
2895 2895 *
2896 2896 * The page is unlocked if it can't be reclaimed (when freemem == 0).
2897 2897 * If `lock' is non-null, it will be dropped and re-acquired if
2898 2898 * the routine must wait while freemem is 0.
2899 2899 *
2900 2900 * As it turns out, boot_getpages() does this. It picks a page,
2901 2901 * based on where OBP mapped in some address, gets its pfn, searches
2902 2902 * the memsegs, locks the page, then pulls it off the free list!
2903 2903 */
2904 2904 int
2905 2905 page_reclaim(page_t *pp, kmutex_t *lock)
2906 2906 {
2907 2907 struct pcf *p;
2908 2908 struct cpu *cpup;
2909 2909 int enough;
2910 2910 uint_t i;
2911 2911
2912 2912 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2913 2913 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2914 2914
2915 2915 /*
2916 2916 * If `freemem' is 0, we cannot reclaim this page from the
2917 2917 * freelist, so release every lock we might hold: the page,
2918 2918 * and the `lock' before blocking.
2919 2919 *
2920 2920 * The only way `freemem' can become 0 while there are pages
2921 2921 * marked free (have their p->p_free bit set) is when the
2922 2922 * system is low on memory and doing a page_create(). In
2923 2923 * order to guarantee that once page_create() starts acquiring
2924 2924 * pages it will be able to get all that it needs since `freemem'
2925 2925 * was decreased by the requested amount. So, we need to release
2926 2926 * this page, and let page_create() have it.
2927 2927 *
2928 2928 * Since `freemem' being zero is not supposed to happen, just
2929 2929 * use the usual hash stuff as a starting point. If that bucket
2930 2930 * is empty, then assume the worst, and start at the beginning
2931 2931 * of the pcf array. If we always start at the beginning
2932 2932 * when acquiring more than one pcf lock, there won't be any
2933 2933 * deadlock problems.
2934 2934 */
2935 2935
2936 2936 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2937 2937
2938 2938 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2939 2939 pcf_acquire_all();
2940 2940 goto page_reclaim_nomem;
2941 2941 }
2942 2942
2943 2943 enough = pcf_decrement_bucket(1);
2944 2944
2945 2945 if (!enough) {
2946 2946 VM_STAT_ADD(page_reclaim_zero);
2947 2947 /*
2948 2948 * Check again. Its possible that some other thread
2949 2949 * could have been right behind us, and added one
2950 2950 * to a list somewhere. Acquire each of the pcf locks
2951 2951 * until we find a page.
2952 2952 */
2953 2953 p = pcf;
2954 2954 for (i = 0; i < pcf_fanout; i++) {
2955 2955 mutex_enter(&p->pcf_lock);
2956 2956 if (p->pcf_count >= 1) {
2957 2957 p->pcf_count -= 1;
2958 2958 /*
2959 2959 * freemem is not protected by any lock. Thus,
2960 2960 * we cannot have any assertion containing
2961 2961 * freemem here.
2962 2962 */
2963 2963 freemem -= 1;
2964 2964 enough = 1;
2965 2965 break;
2966 2966 }
2967 2967 p++;
2968 2968 }
2969 2969
2970 2970 if (!enough) {
2971 2971 page_reclaim_nomem:
2972 2972 /*
2973 2973 * We really can't have page `pp'.
2974 2974 * Time for the no-memory dance with
2975 2975 * page_free(). This is just like
2976 2976 * page_create_wait(). Plus the added
2977 2977 * attraction of releasing whatever mutex
2978 2978 			 * we were passed in `lock' when we were called.
2979 2979 * Page_unlock() will wakeup any thread
2980 2980 * waiting around for this page.
2981 2981 */
2982 2982 if (lock) {
2983 2983 VM_STAT_ADD(page_reclaim_zero_locked);
2984 2984 mutex_exit(lock);
2985 2985 }
2986 2986 page_unlock(pp);
2987 2987
2988 2988 /*
2989 2989 * get this before we drop all the pcf locks.
2990 2990 */
2991 2991 mutex_enter(&new_freemem_lock);
2992 2992
2993 2993 p = pcf;
2994 2994 for (i = 0; i < pcf_fanout; i++) {
2995 2995 p->pcf_wait++;
2996 2996 mutex_exit(&p->pcf_lock);
2997 2997 p++;
2998 2998 }
2999 2999
3000 3000 freemem_wait++;
3001 3001 cv_wait(&freemem_cv, &new_freemem_lock);
3002 3002 freemem_wait--;
3003 3003
3004 3004 mutex_exit(&new_freemem_lock);
3005 3005
3006 3006 if (lock) {
3007 3007 mutex_enter(lock);
3008 3008 }
3009 3009 return (0);
3010 3010 }
3011 3011
3012 3012 /*
3013 3013 * The pcf accounting has been done,
3014 3014 * though none of the pcf_wait flags have been set,
3015 3015 * drop the locks and continue on.
3016 3016 */
3017 3017 while (p >= pcf) {
3018 3018 mutex_exit(&p->pcf_lock);
3019 3019 p--;
3020 3020 }
3021 3021 }
3022 3022
3023 3023
3024 3024 VM_STAT_ADD(pagecnt.pc_reclaim);
3025 3025
3026 3026 /*
3027 3027 * page_list_sub will handle the case where pp is a large page.
3028 3028 * It's possible that the page was promoted while on the freelist
3029 3029 */
3030 3030 if (PP_ISAGED(pp)) {
3031 3031 page_list_sub(pp, PG_FREE_LIST);
3032 3032 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3033 3033 "page_reclaim_free:pp %p", pp);
3034 3034 } else {
3035 3035 page_list_sub(pp, PG_CACHE_LIST);
3036 3036 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3037 3037 "page_reclaim_cache:pp %p", pp);
3038 3038 }
3039 3039
3040 3040 /*
3041 3041 * clear the p_free & p_age bits since this page is no longer
3042 3042 * on the free list. Notice that there was a brief time where
3043 3043 * a page is marked as free, but is not on the list.
3044 3044 *
3045 3045 * Set the reference bit to protect against immediate pageout.
3046 3046 */
3047 3047 PP_CLRFREE(pp);
3048 3048 PP_CLRAGED(pp);
3049 3049 page_set_props(pp, P_REF);
3050 3050
3051 3051 CPU_STATS_ENTER_K();
3052 3052 cpup = CPU; /* get cpup now that CPU cannot change */
3053 3053 CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3054 3054 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3055 3055 CPU_STATS_EXIT_K();
3056 3056 ASSERT(pp->p_szc == 0);
3057 3057
3058 3058 return (1);
3059 3059 }
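
A hedged sketch of a caller in the spirit of the boot_getpages() example above, not code from this file: lock a page that happens to be on the free list and pull it off. On failure page_reclaim() has already dropped the page lock, so pp must not be touched again without re-locking it.

	if (page_trylock(pp, SE_EXCL)) {
		if (!PP_ISFREE(pp)) {
			page_unlock(pp);	/* someone else claimed it first */
		} else if (page_reclaim(pp, NULL)) {
			/* pp is off the free list, EXCL locked, with P_REF set */
			page_unlock(pp);	/* ... once the caller is done with it */
		}
		/* else page_reclaim() failed and has already dropped the lock */
	}
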
3060 3060
3061 3061 /*
3062 3062 * Destroy identity of the page and put it back on
3063 3063 * the page free list. Assumes that the caller has
3064 3064 * acquired the "exclusive" lock on the page.
3065 3065 */
3066 3066 void
3067 3067 page_destroy(page_t *pp, int dontfree)
3068 3068 {
3069 3069 ASSERT((PAGE_EXCL(pp) &&
3070 3070 !page_iolock_assert(pp)) || panicstr);
3071 3071 ASSERT(pp->p_slckcnt == 0 || panicstr);
3072 3072
3073 3073 if (pp->p_szc != 0) {
3074 3074 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3075 3075 PP_ISKAS(pp)) {
3076 3076 panic("page_destroy: anon or kernel or no vnode "
3077 3077 "large page %p", (void *)pp);
3078 3078 }
3079 3079 page_demote_vp_pages(pp);
3080 3080 ASSERT(pp->p_szc == 0);
3081 3081 }
3082 3082
3083 3083 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3084 3084
3085 3085 /*
3086 3086 * Unload translations, if any, then hash out the
3087 3087 * page to erase its identity.
3088 3088 */
3089 3089 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3090 3090 page_hashout(pp, NULL);
3091 3091
3092 3092 if (!dontfree) {
3093 3093 /*
3094 3094 * Acquire the "freemem_lock" for availrmem.
3095 3095 * The page_struct_lock need not be acquired for lckcnt
3096 3096 * and cowcnt since the page has an "exclusive" lock.
3097 3097 * We are doing a modified version of page_pp_unlock here.
3098 3098 */
3099 3099 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3100 3100 mutex_enter(&freemem_lock);
3101 3101 if (pp->p_lckcnt != 0) {
3102 3102 availrmem++;
3103 3103 pages_locked--;
3104 3104 pp->p_lckcnt = 0;
3105 3105 }
3106 3106 if (pp->p_cowcnt != 0) {
3107 3107 availrmem += pp->p_cowcnt;
3108 3108 pages_locked -= pp->p_cowcnt;
3109 3109 pp->p_cowcnt = 0;
3110 3110 }
3111 3111 mutex_exit(&freemem_lock);
3112 3112 }
3113 3113 /*
3114 3114 * Put the page on the "free" list.
3115 3115 */
3116 3116 page_free(pp, 0);
3117 3117 }
3118 3118 }
3119 3119
3120 3120 void
3121 3121 page_destroy_pages(page_t *pp)
3122 3122 {
3123 3123
3124 3124 page_t *tpp, *rootpp = NULL;
3125 3125 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3126 3126 pgcnt_t i, pglcks = 0;
3127 3127 uint_t szc = pp->p_szc;
3128 3128
3129 3129 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3130 3130
3131 3131 VM_STAT_ADD(pagecnt.pc_destroy_pages);
3132 3132
3133 3133 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3134 3134
3135 3135 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3136 3136 panic("page_destroy_pages: not root page %p", (void *)pp);
3137 3137 /*NOTREACHED*/
3138 3138 }
3139 3139
3140 3140 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3141 3141 ASSERT((PAGE_EXCL(tpp) &&
3142 3142 !page_iolock_assert(tpp)) || panicstr);
3143 3143 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3144 3144 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3145 3145 page_hashout(tpp, NULL);
3146 3146 ASSERT(tpp->p_offset == (u_offset_t)-1);
3147 3147 if (tpp->p_lckcnt != 0) {
3148 3148 pglcks++;
3149 3149 tpp->p_lckcnt = 0;
3150 3150 } else if (tpp->p_cowcnt != 0) {
3151 3151 pglcks += tpp->p_cowcnt;
3152 3152 tpp->p_cowcnt = 0;
3153 3153 }
3154 3154 ASSERT(!hat_page_getshare(tpp));
3155 3155 ASSERT(tpp->p_vnode == NULL);
3156 3156 ASSERT(tpp->p_szc == szc);
3157 3157
3158 3158 PP_SETFREE(tpp);
3159 3159 page_clr_all_props(tpp);
3160 3160 PP_SETAGED(tpp);
3161 3161 ASSERT(tpp->p_next == tpp);
3162 3162 ASSERT(tpp->p_prev == tpp);
3163 3163 page_list_concat(&rootpp, &tpp);
3164 3164 }
3165 3165
3166 3166 ASSERT(rootpp == pp);
3167 3167 if (pglcks != 0) {
3168 3168 mutex_enter(&freemem_lock);
3169 3169 availrmem += pglcks;
3170 3170 mutex_exit(&freemem_lock);
3171 3171 }
3172 3172
3173 3173 page_list_add_pages(rootpp, 0);
3174 3174 page_create_putback(pgcnt);
3175 3175 }
3176 3176
3177 3177 /*
3178 3178 * Similar to page_destroy(), but destroys pages which are
3179 3179 * locked and known to be on the page free list. Since
3180 3180 * the page is known to be free and locked, no one can access
3181 3181 * it.
3182 3182 *
3183 3183 * Also, the number of free pages does not change.
3184 3184 */
3185 3185 void
3186 3186 page_destroy_free(page_t *pp)
3187 3187 {
3188 3188 ASSERT(PAGE_EXCL(pp));
3189 3189 ASSERT(PP_ISFREE(pp));
3190 3190 ASSERT(pp->p_vnode);
3191 3191 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3192 3192 ASSERT(!hat_page_is_mapped(pp));
3193 3193 ASSERT(PP_ISAGED(pp) == 0);
3194 3194 ASSERT(pp->p_szc == 0);
3195 3195
3196 3196 VM_STAT_ADD(pagecnt.pc_destroy_free);
3197 3197 page_list_sub(pp, PG_CACHE_LIST);
3198 3198
3199 3199 page_hashout(pp, NULL);
3200 3200 ASSERT(pp->p_vnode == NULL);
3201 3201 ASSERT(pp->p_offset == (u_offset_t)-1);
3202 3202 ASSERT(pp->p_hash == NULL);
3203 3203
3204 3204 PP_SETAGED(pp);
3205 3205 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3206 3206 page_unlock(pp);
3207 3207
3208 3208 mutex_enter(&new_freemem_lock);
3209 3209 if (freemem_wait) {
3210 3210 cv_signal(&freemem_cv);
3211 3211 }
3212 3212 mutex_exit(&new_freemem_lock);
3213 3213 }
3214 3214
3215 3215 /*
3216 3216 * Rename the page "opp" to have an identity specified
3217 3217 * by [vp, off]. If a page already exists with this name
3218 3218 * it is locked and destroyed. Note that the page's
3219 3219 * translations are not unloaded during the rename.
3220 3220 *
3221 3221 * This routine is used by the anon layer to "steal" the
3222 3222 * original page and is not unlike destroying a page and
3223 3223 * creating a new page using the same page frame.
3224 3224 *
3225 3225 * XXX -- Could deadlock if caller 1 tries to rename A to B while
3226 3226 * caller 2 tries to rename B to A.
3227 3227 */
3228 3228 void
3229 3229 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3230 3230 {
3231 3231 page_t *pp;
3232 3232 int olckcnt = 0;
3233 3233 int ocowcnt = 0;
3234 3234 kmutex_t *phm;
3235 3235 ulong_t index;
3236 3236
3237 3237 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3238 3238 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3239 3239 ASSERT(PP_ISFREE(opp) == 0);
3240 3240
3241 3241 VM_STAT_ADD(page_rename_count);
3242 3242
3243 3243 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3244 3244 "page rename:pp %p vp %p off %llx", opp, vp, off);
3245 3245
3246 3246 /*
3247 3247 * CacheFS may call page_rename for a large NFS page
3248 3248 * when both CacheFS and NFS mount points are used
3249 3249 * by applications. Demote this large page before
3250 3250 * renaming it, to ensure that there are no "partial"
3251 3251 * large pages left lying around.
3252 3252 */
3253 3253 if (opp->p_szc != 0) {
3254 3254 vnode_t *ovp = opp->p_vnode;
3255 3255 ASSERT(ovp != NULL);
3256 3256 ASSERT(!IS_SWAPFSVP(ovp));
3257 3257 ASSERT(!VN_ISKAS(ovp));
3258 3258 page_demote_vp_pages(opp);
3259 3259 ASSERT(opp->p_szc == 0);
3260 3260 }
3261 3261
3262 3262 page_hashout(opp, NULL);
3263 3263 PP_CLRAGED(opp);
3264 3264
3265 3265 /*
3266 3266 * Acquire the appropriate page hash lock, since
3267 3267 * we're going to rename the page.
3268 3268 */
3269 3269 index = PAGE_HASH_FUNC(vp, off);
3270 3270 phm = PAGE_HASH_MUTEX(index);
3271 3271 mutex_enter(phm);
3272 3272 top:
3273 3273 /*
3274 3274 * Look for an existing page with this name and destroy it if found.
3275 3275 * By holding the page hash lock all the way to the page_hashin()
3276 3276 * call, we are assured that no page can be created with this
3277 3277 * identity. In the case when the phm lock is dropped to undo any
3278 3278 * hat layer mappings, the existing page is held with an "exclusive"
3279 3279 * lock, again preventing another page from being created with
3280 3280 * this identity.
3281 3281 */
3282 3282 PAGE_HASH_SEARCH(index, pp, vp, off);
3283 3283 if (pp != NULL) {
3284 3284 VM_STAT_ADD(page_rename_exists);
3285 3285
3286 3286 /*
3287 3287 * As it turns out, this is one of only two places where
3288 3288 * page_lock() needs to hold the passed in lock in the
3289 3289 * successful case. In all of the others, the lock could
3290 3290 * be dropped as soon as the attempt is made to lock
3291 3291 		 * the page. It is tempting to add yet another argument,
3292 3292 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3293 3293 */
3294 3294 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3295 3295 /*
3296 3296 * Went to sleep because the page could not
3297 3297 * be locked. We were woken up when the page
3298 3298 * was unlocked, or when the page was destroyed.
3299 3299 * In either case, `phm' was dropped while we
3300 3300 * slept. Hence we should not just roar through
3301 3301 * this loop.
3302 3302 */
3303 3303 goto top;
3304 3304 }
3305 3305
3306 3306 /*
3307 3307 * If an existing page is a large page, then demote
3308 3308 * it to ensure that no "partial" large pages are
3309 3309 * "created" after page_rename. An existing page
3310 3310 * can be a CacheFS page, and can't belong to swapfs.
3311 3311 */
3312 3312 if (hat_page_is_mapped(pp)) {
3313 3313 /*
3314 3314 * Unload translations. Since we hold the
3315 3315 * exclusive lock on this page, the page
3316 3316 * can not be changed while we drop phm.
3317 3317 * This is also not a lock protocol violation,
3318 3318 * but rather the proper way to do things.
3319 3319 */
3320 3320 mutex_exit(phm);
3321 3321 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3322 3322 if (pp->p_szc != 0) {
3323 3323 ASSERT(!IS_SWAPFSVP(vp));
3324 3324 ASSERT(!VN_ISKAS(vp));
3325 3325 page_demote_vp_pages(pp);
3326 3326 ASSERT(pp->p_szc == 0);
3327 3327 }
3328 3328 mutex_enter(phm);
3329 3329 } else if (pp->p_szc != 0) {
3330 3330 ASSERT(!IS_SWAPFSVP(vp));
3331 3331 ASSERT(!VN_ISKAS(vp));
3332 3332 mutex_exit(phm);
3333 3333 page_demote_vp_pages(pp);
3334 3334 ASSERT(pp->p_szc == 0);
3335 3335 mutex_enter(phm);
3336 3336 }
3337 3337 page_hashout(pp, phm);
3338 3338 }
3339 3339 /*
3340 3340 * Hash in the page with the new identity.
3341 3341 */
3342 3342 if (!page_hashin(opp, vp, off, phm)) {
3343 3343 /*
3344 3344 * We were holding phm while we searched for [vp, off]
3345 3345 * and only dropped phm if we found and locked a page.
3346 3346 		 * If we can't create this page now, then something
3347 3347 * is really broken.
3348 3348 */
3349 3349 panic("page_rename: Can't hash in page: %p", (void *)pp);
3350 3350 /*NOTREACHED*/
3351 3351 }
3352 3352
3353 3353 ASSERT(MUTEX_HELD(phm));
3354 3354 mutex_exit(phm);
3355 3355
3356 3356 /*
3357 3357 * Now that we have dropped phm, lets get around to finishing up
3358 3358 * with pp.
3359 3359 */
3360 3360 if (pp != NULL) {
3361 3361 ASSERT(!hat_page_is_mapped(pp));
3362 3362 /* for now large pages should not end up here */
3363 3363 ASSERT(pp->p_szc == 0);
3364 3364 /*
3365 3365 * Save the locks for transfer to the new page and then
3366 3366 * clear them so page_free doesn't think they're important.
3367 3367 * The page_struct_lock need not be acquired for lckcnt and
3368 3368 * cowcnt since the page has an "exclusive" lock.
3369 3369 */
3370 3370 olckcnt = pp->p_lckcnt;
3371 3371 ocowcnt = pp->p_cowcnt;
3372 3372 pp->p_lckcnt = pp->p_cowcnt = 0;
3373 3373
3374 3374 /*
3375 3375 * Put the page on the "free" list after we drop
3376 3376 * the lock. The less work under the lock the better.
3377 3377 */
3378 3378 /*LINTED: constant in conditional context*/
3379 3379 VN_DISPOSE(pp, B_FREE, 0, kcred);
3380 3380 }
3381 3381
3382 3382 /*
3383 3383 * Transfer the lock count from the old page (if any).
3384 3384 * The page_struct_lock need not be acquired for lckcnt and
3385 3385 * cowcnt since the page has an "exclusive" lock.
3386 3386 */
3387 3387 opp->p_lckcnt += olckcnt;
3388 3388 opp->p_cowcnt += ocowcnt;
3389 3389 }
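
A hedged sketch of the anon-layer style of use described above, not from this file: move an exclusively held page from its current name to [nvp, noff], keeping its translations loaded; nvp and noff are hypothetical.

	ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
	page_rename(opp, nvp, noff);
	/* opp now answers to [nvp, noff]; any page that already had that name was freed */
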
3390 3390
3391 3391 /*
3392 3392 * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3393 3393 *
3394 3394 * Pages are normally inserted at the start of a vnode's v_pages list.
3395 3395 * If the vnode is VMODSORT and the page is modified, it goes at the end.
3396 3396 * This can happen when a modified page is relocated for DR.
3397 3397 *
3398 3398 * Returns 1 on success and 0 on failure.
3399 3399 */
3400 3400 static int
3401 3401 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3402 3402 {
3403 3403 page_t **listp;
3404 3404 page_t *tp;
3405 3405 ulong_t index;
3406 3406
3407 3407 ASSERT(PAGE_EXCL(pp));
3408 3408 ASSERT(vp != NULL);
3409 3409 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3410 3410
3411 3411 /*
3412 3412 * Be sure to set these up before the page is inserted on the hash
3413 3413 * list. As soon as the page is placed on the list some other
3414 3414 * thread might get confused and wonder how this page could
3415 3415 * possibly hash to this list.
3416 3416 */
3417 3417 pp->p_vnode = vp;
3418 3418 pp->p_offset = offset;
3419 3419
3420 3420 /*
3421 3421 * record if this page is on a swap vnode
3422 3422 */
3423 3423 if ((vp->v_flag & VISSWAP) != 0)
3424 3424 PP_SETSWAP(pp);
3425 3425
3426 3426 index = PAGE_HASH_FUNC(vp, offset);
3427 3427 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3428 3428 listp = &page_hash[index];
3429 3429
3430 3430 /*
3431 3431 * If this page is already hashed in, fail this attempt to add it.
3432 3432 */
3433 3433 for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3434 3434 if (tp->p_vnode == vp && tp->p_offset == offset) {
3435 3435 pp->p_vnode = NULL;
3436 3436 pp->p_offset = (u_offset_t)(-1);
3437 3437 return (0);
3438 3438 }
3439 3439 }
3440 3440 pp->p_hash = *listp;
3441 3441 *listp = pp;
3442 3442
3443 3443 /*
3444 3444 * Add the page to the vnode's list of pages
3445 3445 */
3446 3446 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3447 3447 listp = &vp->v_pages->p_vpprev->p_vpnext;
3448 3448 else
3449 3449 listp = &vp->v_pages;
3450 3450
3451 3451 page_vpadd(listp, pp);
3452 3452
3453 3453 return (1);
3454 3454 }
3455 3455
3456 3456 /*
3457 3457 * Add page `pp' to both the hash and vp chains for [vp, offset].
3458 3458 *
3459 3459 * Returns 1 on success and 0 on failure.
3460 3460 * If hold is passed in, it is not dropped.
3461 3461 */
3462 3462 int
3463 3463 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3464 3464 {
3465 3465 kmutex_t *phm = NULL;
3466 3466 kmutex_t *vphm;
3467 3467 int rc;
3468 3468
3469 3469 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3470 3470 ASSERT(pp->p_fsdata == 0 || panicstr);
3471 3471
3472 3472 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3473 3473 "page_hashin:pp %p vp %p offset %llx",
3474 3474 pp, vp, offset);
3475 3475
3476 3476 VM_STAT_ADD(hashin_count);
3477 3477
3478 3478 if (hold != NULL)
3479 3479 phm = hold;
3480 3480 else {
3481 3481 VM_STAT_ADD(hashin_not_held);
3482 3482 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3483 3483 mutex_enter(phm);
3484 3484 }
3485 3485
3486 3486 vphm = page_vnode_mutex(vp);
3487 3487 mutex_enter(vphm);
3488 3488 rc = page_do_hashin(pp, vp, offset);
3489 3489 mutex_exit(vphm);
3490 3490 if (hold == NULL)
3491 3491 mutex_exit(phm);
3492 3492 if (rc == 0)
3493 3493 VM_STAT_ADD(hashin_already);
3494 3494 return (rc);
3495 3495 }
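
/*
 * Editor's sketch (not part of the webrev): one plausible caller pattern for
 * page_hashin()/page_hashout() above, assuming an EXCL-locked page with no
 * current identity and kernel context; the helper name and error handling
 * are illustrative assumptions only.
 */
static int
example_set_identity(page_t *pp, vnode_t *vp, u_offset_t off)
{
	ASSERT(PAGE_EXCL(pp));

	/*
	 * No hash mutex is held by this caller, so pass NULL and let
	 * page_hashin() take and drop PAGE_HASH_MUTEX() itself.
	 */
	if (!page_hashin(pp, vp, off, NULL))
		return (0);		/* [vp, off] already hashed in */

	/* ... use the page under its new identity ... */

	/* Tear the identity down again; NULL means "take the mutex". */
	page_hashout(pp, NULL);
	return (1);
}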
3496 3496
3497 3497 /*
3498 3498 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3499 3499 * All mutexes must be held
3500 3500 */
3501 3501 static void
3502 3502 page_do_hashout(page_t *pp)
3503 3503 {
3504 3504 page_t **hpp;
3505 3505 page_t *hp;
3506 3506 vnode_t *vp = pp->p_vnode;
3507 3507
3508 3508 ASSERT(vp != NULL);
3509 3509 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3510 3510
3511 3511 /*
3512 3512 * First, take pp off of its hash chain.
3513 3513 */
3514 3514 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3515 3515
3516 3516 for (;;) {
3517 3517 hp = *hpp;
3518 3518 if (hp == pp)
3519 3519 break;
3520 3520 if (hp == NULL) {
3521 3521 panic("page_do_hashout");
3522 3522 /*NOTREACHED*/
3523 3523 }
3524 3524 hpp = &hp->p_hash;
3525 3525 }
3526 3526 *hpp = pp->p_hash;
3527 3527
3528 3528 /*
3529 3529 * Now remove it from its associated vnode.
3530 3530 */
3531 3531 if (vp->v_pages)
3532 3532 page_vpsub(&vp->v_pages, pp);
3533 3533
3534 3534 pp->p_hash = NULL;
3535 3535 page_clr_all_props(pp);
3536 3536 PP_CLRSWAP(pp);
3537 3537 pp->p_vnode = NULL;
3538 3538 pp->p_offset = (u_offset_t)-1;
3539 3539 pp->p_fsdata = 0;
3540 3540 }
3541 3541
3542 3542 /*
3543 3543 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3544 3544 *
3545 3545 * When `phm' is non-NULL it contains the address of the mutex protecting the
3546 3546 * hash list pp is on. It is not dropped.
3547 3547 */
3548 3548 void
3549 3549 page_hashout(page_t *pp, kmutex_t *phm)
3550 3550 {
3551 3551 vnode_t *vp;
3552 3552 ulong_t index;
3553 3553 kmutex_t *nphm;
3554 3554 kmutex_t *vphm;
3555 3555 kmutex_t *sep;
3556 3556
3557 3557 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3558 3558 ASSERT(pp->p_vnode != NULL);
3559 3559 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3560 3560 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3561 3561
3562 3562 vp = pp->p_vnode;
3563 3563
3564 3564 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3565 3565 "page_hashout:pp %p vp %p", pp, vp);
3566 3566
3567 3567 /* Kernel probe */
3568 3568 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3569 3569 tnf_opaque, vnode, vp,
3570 3570 tnf_offset, offset, pp->p_offset);
3571 3571
3572 3572 	/*
3573 3573 	 * Find the page's hash bucket; take its mutex if the caller did not.
3574 3574 	 */
3575 3575 VM_STAT_ADD(hashout_count);
3576 3576 index = PAGE_HASH_FUNC(vp, pp->p_offset);
3577 3577 if (phm == NULL) {
3578 3578 VM_STAT_ADD(hashout_not_held);
3579 3579 nphm = PAGE_HASH_MUTEX(index);
3580 3580 mutex_enter(nphm);
3581 3581 }
3582 3582 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3583 3583
3584 3584
3585 3585 /*
3586 3586 	 * grab the page vnode mutex and remove the page from its lists...
3587 3587 */
3588 3588 vphm = page_vnode_mutex(vp);
3589 3589 mutex_enter(vphm);
3590 3590
3591 3591 page_do_hashout(pp);
3592 3592
3593 3593 mutex_exit(vphm);
3594 3594 if (phm == NULL)
3595 3595 mutex_exit(nphm);
3596 3596
3597 3597 /*
3598 3598 * Wake up processes waiting for this page. The page's
3599 3599 * identity has been changed, and is probably not the
3600 3600 * desired page any longer.
3601 3601 */
3602 3602 sep = page_se_mutex(pp);
3603 3603 mutex_enter(sep);
3604 3604 pp->p_selock &= ~SE_EWANTED;
3605 3605 if (CV_HAS_WAITERS(&pp->p_cv))
3606 3606 cv_broadcast(&pp->p_cv);
3607 3607 mutex_exit(sep);
3608 3608 }
3609 3609
3610 3610 /*
3611 3611 * Add the page to the front of a linked list of pages
3612 3612 * using the p_next & p_prev pointers for the list.
3613 3613 * The caller is responsible for protecting the list pointers.
3614 3614 */
3615 3615 void
3616 3616 page_add(page_t **ppp, page_t *pp)
3617 3617 {
3618 3618 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3619 3619
3620 3620 page_add_common(ppp, pp);
3621 3621 }
3622 3622
3623 3623
3624 3624
3625 3625 /*
3626 3626 * Common code for page_add() and mach_page_add()
3627 3627 */
3628 3628 void
3629 3629 page_add_common(page_t **ppp, page_t *pp)
3630 3630 {
3631 3631 if (*ppp == NULL) {
3632 3632 pp->p_next = pp->p_prev = pp;
3633 3633 } else {
3634 3634 pp->p_next = *ppp;
3635 3635 pp->p_prev = (*ppp)->p_prev;
3636 3636 (*ppp)->p_prev = pp;
3637 3637 pp->p_prev->p_next = pp;
3638 3638 }
3639 3639 *ppp = pp;
3640 3640 }
3641 3641
3642 3642
3643 3643 /*
3644 3644 * Remove this page from a linked list of pages
3645 3645 * using the p_next & p_prev pointers for the list.
3646 3646 *
3647 3647 * The caller is responsible for protecting the list pointers.
3648 3648 */
3649 3649 void
3650 3650 page_sub(page_t **ppp, page_t *pp)
3651 3651 {
3652 3652 ASSERT((PP_ISFREE(pp)) ? 1 :
3653 3653 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3654 3654
3655 3655 if (*ppp == NULL || pp == NULL) {
3656 3656 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3657 3657 (void *)pp, (void *)(*ppp));
3658 3658 /*NOTREACHED*/
3659 3659 }
3660 3660
3661 3661 page_sub_common(ppp, pp);
3662 3662 }
3663 3663
3664 3664
3665 3665 /*
3666 3666 * Common code for page_sub() and mach_page_sub()
3667 3667 */
3668 3668 void
3669 3669 page_sub_common(page_t **ppp, page_t *pp)
3670 3670 {
3671 3671 if (*ppp == pp)
3672 3672 *ppp = pp->p_next; /* go to next page */
3673 3673
3674 3674 if (*ppp == pp)
3675 3675 *ppp = NULL; /* page list is gone */
3676 3676 else {
3677 3677 pp->p_prev->p_next = pp->p_next;
3678 3678 pp->p_next->p_prev = pp->p_prev;
3679 3679 }
3680 3680 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
3681 3681 }
3682 3682
3683 3683
3684 3684 /*
3685 3685  * Break page list oppp into two lists with npages in the first list.
3686 3686 * The tail is returned in nppp.
3687 3687 */
3688 3688 void
3689 3689 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3690 3690 {
3691 3691 page_t *s1pp = *oppp;
3692 3692 page_t *s2pp;
3693 3693 page_t *e1pp, *e2pp;
3694 3694 long n = 0;
3695 3695
3696 3696 if (s1pp == NULL) {
3697 3697 *nppp = NULL;
3698 3698 return;
3699 3699 }
3700 3700 if (npages == 0) {
3701 3701 *nppp = s1pp;
3702 3702 *oppp = NULL;
3703 3703 return;
3704 3704 }
3705 3705 for (n = 0, s2pp = *oppp; n < npages; n++) {
3706 3706 s2pp = s2pp->p_next;
3707 3707 }
3708 3708 /* Fix head and tail of new lists */
3709 3709 e1pp = s2pp->p_prev;
3710 3710 e2pp = s1pp->p_prev;
3711 3711 s1pp->p_prev = e1pp;
3712 3712 e1pp->p_next = s1pp;
3713 3713 s2pp->p_prev = e2pp;
3714 3714 e2pp->p_next = s2pp;
3715 3715
3716 3716 /* second list empty */
3717 3717 if (s2pp == s1pp) {
3718 3718 *oppp = s1pp;
3719 3719 *nppp = NULL;
3720 3720 } else {
3721 3721 *oppp = s1pp;
3722 3722 *nppp = s2pp;
3723 3723 }
3724 3724 }
3725 3725
3726 3726 /*
3727 3727 * Concatenate page list nppp onto the end of list ppp.
3728 3728 */
3729 3729 void
3730 3730 page_list_concat(page_t **ppp, page_t **nppp)
3731 3731 {
3732 3732 page_t *s1pp, *s2pp, *e1pp, *e2pp;
3733 3733
3734 3734 if (*nppp == NULL) {
3735 3735 return;
3736 3736 }
3737 3737 if (*ppp == NULL) {
3738 3738 *ppp = *nppp;
3739 3739 return;
3740 3740 }
3741 3741 s1pp = *ppp;
3742 3742 e1pp = s1pp->p_prev;
3743 3743 s2pp = *nppp;
3744 3744 e2pp = s2pp->p_prev;
3745 3745 s1pp->p_prev = e2pp;
3746 3746 e2pp->p_next = s1pp;
3747 3747 e1pp->p_next = s2pp;
3748 3748 s2pp->p_prev = e1pp;
3749 3749 }
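
/*
 * Editor's sketch (not part of the webrev): splitting the first npages off a
 * circular p_next/p_prev list with page_list_break() and gluing the tail
 * back on with page_list_concat().  Purely illustrative; "plist" is whatever
 * list the caller already owns and protects.
 */
static void
example_split_and_rejoin(page_t **plist, pgcnt_t npages)
{
	page_t *head = *plist;
	page_t *tail = NULL;

	page_list_break(&head, &tail, npages);	/* head keeps npages pages */

	/* ... operate on the pages in "head" ... */

	page_list_concat(&head, &tail);		/* tail rejoined at the end */
	*plist = head;
}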
3750 3750
3751 3751 /*
3752 3752 * return the next page in the page list
3753 3753 */
3754 3754 page_t *
3755 3755 page_list_next(page_t *pp)
3756 3756 {
3757 3757 return (pp->p_next);
3758 3758 }
3759 3759
3760 3760
3761 3761 /*
3762 3762 * Add the page to the front of the linked list of pages
3763 3763 * using p_vpnext/p_vpprev pointers for the list.
3764 3764 *
3765 3765 * The caller is responsible for protecting the lists.
3766 3766 */
3767 3767 void
3768 3768 page_vpadd(page_t **ppp, page_t *pp)
3769 3769 {
3770 3770 if (*ppp == NULL) {
3771 3771 pp->p_vpnext = pp->p_vpprev = pp;
3772 3772 } else {
3773 3773 pp->p_vpnext = *ppp;
3774 3774 pp->p_vpprev = (*ppp)->p_vpprev;
3775 3775 (*ppp)->p_vpprev = pp;
3776 3776 pp->p_vpprev->p_vpnext = pp;
3777 3777 }
3778 3778 *ppp = pp;
3779 3779 }
3780 3780
3781 3781 /*
3782 3782 * Remove this page from the linked list of pages
3783 3783 * using p_vpnext/p_vpprev pointers for the list.
3784 3784 *
3785 3785 * The caller is responsible for protecting the lists.
3786 3786 */
3787 3787 void
3788 3788 page_vpsub(page_t **ppp, page_t *pp)
3789 3789 {
3790 3790 if (*ppp == NULL || pp == NULL) {
3791 3791 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3792 3792 (void *)pp, (void *)(*ppp));
3793 3793 /*NOTREACHED*/
3794 3794 }
3795 3795
3796 3796 if (*ppp == pp)
3797 3797 *ppp = pp->p_vpnext; /* go to next page */
3798 3798
3799 3799 if (*ppp == pp)
3800 3800 *ppp = NULL; /* page list is gone */
3801 3801 else {
3802 3802 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3803 3803 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3804 3804 }
3805 3805 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3806 3806 }
3807 3807
3808 3808 /*
3809 3809 * Lock a physical page into memory "long term". Used to support "lock
3810 3810 * in memory" functions. Accepts the page to be locked, and a cow variable
3811 3811  * to indicate whether the lock will travel to the new page during
3812 3812 * a potential copy-on-write.
3813 3813 */
3814 3814 int
3815 3815 page_pp_lock(
3816 3816 page_t *pp, /* page to be locked */
3817 3817 int cow, /* cow lock */
3818 3818 int kernel) /* must succeed -- ignore checking */
3819 3819 {
3820 3820 int r = 0; /* result -- assume failure */
3821 3821
3822 3822 ASSERT(PAGE_LOCKED(pp));
3823 3823
3824 3824 page_struct_lock(pp);
3825 3825 /*
3826 3826 * Acquire the "freemem_lock" for availrmem.
3827 3827 */
3828 3828 if (cow) {
3829 3829 mutex_enter(&freemem_lock);
3830 3830 if ((availrmem > pages_pp_maximum) &&
3831 3831 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3832 3832 availrmem--;
3833 3833 pages_locked++;
3834 3834 mutex_exit(&freemem_lock);
3835 3835 r = 1;
3836 3836 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3837 3837 cmn_err(CE_WARN,
3838 3838 "COW lock limit reached on pfn 0x%lx",
3839 3839 page_pptonum(pp));
3840 3840 }
3841 3841 } else
3842 3842 mutex_exit(&freemem_lock);
3843 3843 } else {
3844 3844 if (pp->p_lckcnt) {
3845 3845 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3846 3846 r = 1;
3847 3847 if (++pp->p_lckcnt ==
3848 3848 (ushort_t)PAGE_LOCK_MAXIMUM) {
3849 3849 cmn_err(CE_WARN, "Page lock limit "
3850 3850 "reached on pfn 0x%lx",
3851 3851 page_pptonum(pp));
3852 3852 }
3853 3853 }
3854 3854 } else {
3855 3855 if (kernel) {
3856 3856 /* availrmem accounting done by caller */
3857 3857 ++pp->p_lckcnt;
3858 3858 r = 1;
3859 3859 } else {
3860 3860 mutex_enter(&freemem_lock);
3861 3861 if (availrmem > pages_pp_maximum) {
3862 3862 availrmem--;
3863 3863 pages_locked++;
3864 3864 ++pp->p_lckcnt;
3865 3865 r = 1;
3866 3866 }
3867 3867 mutex_exit(&freemem_lock);
3868 3868 }
3869 3869 }
3870 3870 }
3871 3871 page_struct_unlock(pp);
3872 3872 return (r);
3873 3873 }
3874 3874
3875 3875 /*
3876 3876 * Decommit a lock on a physical page frame. Account for cow locks if
3877 3877 * appropriate.
3878 3878 */
3879 3879 void
3880 3880 page_pp_unlock(
3881 3881 page_t *pp, /* page to be unlocked */
3882 3882 int cow, /* expect cow lock */
3883 3883 int kernel) /* this was a kernel lock */
3884 3884 {
3885 3885 ASSERT(PAGE_LOCKED(pp));
3886 3886
3887 3887 page_struct_lock(pp);
3888 3888 /*
3889 3889 * Acquire the "freemem_lock" for availrmem.
3890 3890  * If cowcnt or lckcnt is already 0, do nothing; i.e., we
3891 3891 * could be called to unlock even if nothing is locked. This could
3892 3892 * happen if locked file pages were truncated (removing the lock)
3893 3893 * and the file was grown again and new pages faulted in; the new
3894 3894 * pages are unlocked but the segment still thinks they're locked.
3895 3895 */
3896 3896 if (cow) {
3897 3897 if (pp->p_cowcnt) {
3898 3898 mutex_enter(&freemem_lock);
3899 3899 pp->p_cowcnt--;
3900 3900 availrmem++;
3901 3901 pages_locked--;
3902 3902 mutex_exit(&freemem_lock);
3903 3903 }
3904 3904 } else {
3905 3905 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3906 3906 if (!kernel) {
3907 3907 mutex_enter(&freemem_lock);
3908 3908 availrmem++;
3909 3909 pages_locked--;
3910 3910 mutex_exit(&freemem_lock);
3911 3911 }
3912 3912 }
3913 3913 }
3914 3914 page_struct_unlock(pp);
3915 3915 }
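
/*
 * Editor's sketch (not part of the webrev): page_pp_lock() and
 * page_pp_unlock() are expected to be called with matching "cow" and
 * "kernel" arguments.  This hypothetical wrapper only shows the pairing for
 * a non-COW, non-kernel long-term lock.
 */
static int
example_lock_long_term(page_t *pp)
{
	ASSERT(PAGE_LOCKED(pp));

	if (!page_pp_lock(pp, 0, 0))	/* no cow transfer, honor availrmem */
		return (0);

	/* ... page stays locked in memory while it is in use ... */

	page_pp_unlock(pp, 0, 0);	/* undo with the same cow/kernel args */
	return (1);
}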
3916 3916
3917 3917 /*
3918 3918 * This routine reserves availrmem for npages;
3919 3919 * flags: KM_NOSLEEP or KM_SLEEP
3920 3920 * returns 1 on success or 0 on failure
3921 3921 */
3922 3922 int
3923 3923 page_resv(pgcnt_t npages, uint_t flags)
3924 3924 {
3925 3925 mutex_enter(&freemem_lock);
3926 3926 while (availrmem < tune.t_minarmem + npages) {
3927 3927 if (flags & KM_NOSLEEP) {
3928 3928 mutex_exit(&freemem_lock);
3929 3929 return (0);
3930 3930 }
3931 3931 mutex_exit(&freemem_lock);
3932 3932 page_needfree(npages);
3933 3933 kmem_reap();
3934 3934 delay(hz >> 2);
3935 3935 page_needfree(-(spgcnt_t)npages);
3936 3936 mutex_enter(&freemem_lock);
3937 3937 }
3938 3938 availrmem -= npages;
3939 3939 mutex_exit(&freemem_lock);
3940 3940 return (1);
3941 3941 }
3942 3942
3943 3943 /*
3944 3944 * This routine unreserves availrmem for npages;
3945 3945 */
3946 3946 void
3947 3947 page_unresv(pgcnt_t npages)
3948 3948 {
3949 3949 mutex_enter(&freemem_lock);
3950 3950 availrmem += npages;
3951 3951 mutex_exit(&freemem_lock);
3952 3952 }
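
/*
 * Editor's sketch (not part of the webrev): reserving availrmem up front with
 * page_resv() and releasing it again with page_unresv().  KM_NOSLEEP keeps
 * the caller from blocking; KM_SLEEP would retry until the reservation
 * succeeds.  The error value is an illustrative choice.
 */
static int
example_reserve(pgcnt_t npages)
{
	if (page_resv(npages, KM_NOSLEEP) == 0)
		return (ENOMEM);	/* not enough unreserved memory */

	/* ... allocate and use the pages ... */

	page_unresv(npages);		/* give the reservation back */
	return (0);
}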
3953 3953
3954 3954 /*
3955 3955 * See Statement at the beginning of segvn_lockop() regarding
3956 3956 * the way we handle cowcnts and lckcnts.
3957 3957 *
3958 3958 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3959 3959 * that breaks COW has PROT_WRITE.
3960 3960 *
3961 3961  * Note that we may also break COW in case we are softlocking
3962 3962 * on read access during physio;
3963 3963 * in this softlock case, the vpage may not have PROT_WRITE.
3964 3964 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3965 3965 * if the vpage doesn't have PROT_WRITE.
3966 3966 *
3967 3967 * This routine is never called if we are stealing a page
3968 3968 * in anon_private.
3969 3969 *
3970 3970  * The caller subtracted from availrmem for a read-only mapping;
3971 3971  * if lckcnt is 1, increment availrmem.
3972 3972 */
3973 3973 void
3974 3974 page_pp_useclaim(
3975 3975 page_t *opp, /* original page frame losing lock */
3976 3976 page_t *npp, /* new page frame gaining lock */
3977 3977 uint_t write_perm) /* set if vpage has PROT_WRITE */
3978 3978 {
3979 3979 int payback = 0;
3980 3980 int nidx, oidx;
3981 3981
3982 3982 ASSERT(PAGE_LOCKED(opp));
3983 3983 ASSERT(PAGE_LOCKED(npp));
3984 3984
3985 3985 /*
3986 3986 * Since we have two pages we probably have two locks. We need to take
3987 3987 * them in a defined order to avoid deadlocks. It's also possible they
3988 3988 * both hash to the same lock in which case this is a non-issue.
3989 3989 */
3990 3990 nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3991 3991 oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3992 3992 if (nidx < oidx) {
3993 3993 page_struct_lock(npp);
3994 3994 page_struct_lock(opp);
3995 3995 } else if (oidx < nidx) {
3996 3996 page_struct_lock(opp);
3997 3997 page_struct_lock(npp);
3998 3998 } else { /* The pages hash to the same lock */
3999 3999 page_struct_lock(npp);
4000 4000 }
4001 4001
4002 4002 ASSERT(npp->p_cowcnt == 0);
4003 4003 ASSERT(npp->p_lckcnt == 0);
4004 4004
4005 4005 /* Don't use claim if nothing is locked (see page_pp_unlock above) */
4006 4006 if ((write_perm && opp->p_cowcnt != 0) ||
4007 4007 (!write_perm && opp->p_lckcnt != 0)) {
4008 4008
4009 4009 if (write_perm) {
4010 4010 npp->p_cowcnt++;
4011 4011 ASSERT(opp->p_cowcnt != 0);
4012 4012 opp->p_cowcnt--;
4013 4013 } else {
4014 4014
4015 4015 ASSERT(opp->p_lckcnt != 0);
4016 4016
4017 4017 /*
4018 4018 			 * We didn't need availrmem decremented if p_lckcnt on the
4019 4019 			 * original page is 1. Here, we are unlocking the
4020 4020 			 * read-only copy belonging to the original page and
4021 4021 			 * locking a copy belonging to the new page.
4022 4022 */
4023 4023 if (opp->p_lckcnt == 1)
4024 4024 payback = 1;
4025 4025
4026 4026 npp->p_lckcnt++;
4027 4027 opp->p_lckcnt--;
4028 4028 }
4029 4029 }
4030 4030 if (payback) {
4031 4031 mutex_enter(&freemem_lock);
4032 4032 availrmem++;
4033 4033 pages_useclaim--;
4034 4034 mutex_exit(&freemem_lock);
4035 4035 }
4036 4036
4037 4037 if (nidx < oidx) {
4038 4038 page_struct_unlock(opp);
4039 4039 page_struct_unlock(npp);
4040 4040 } else if (oidx < nidx) {
4041 4041 page_struct_unlock(npp);
4042 4042 page_struct_unlock(opp);
4043 4043 } else { /* The pages hash to the same lock */
4044 4044 page_struct_unlock(npp);
4045 4045 }
4046 4046 }
4047 4047
4048 4048 /*
4049 4049 * Simple claim adjust functions -- used to support changes in
4050 4050 * claims due to changes in access permissions. Used by segvn_setprot().
4051 4051 */
4052 4052 int
4053 4053 page_addclaim(page_t *pp)
4054 4054 {
4055 4055 int r = 0; /* result */
4056 4056
4057 4057 ASSERT(PAGE_LOCKED(pp));
4058 4058
4059 4059 page_struct_lock(pp);
4060 4060 ASSERT(pp->p_lckcnt != 0);
4061 4061
4062 4062 if (pp->p_lckcnt == 1) {
4063 4063 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4064 4064 --pp->p_lckcnt;
4065 4065 r = 1;
4066 4066 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4067 4067 cmn_err(CE_WARN,
4068 4068 "COW lock limit reached on pfn 0x%lx",
4069 4069 page_pptonum(pp));
4070 4070 }
4071 4071 }
4072 4072 } else {
4073 4073 mutex_enter(&freemem_lock);
4074 4074 if ((availrmem > pages_pp_maximum) &&
4075 4075 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4076 4076 --availrmem;
4077 4077 ++pages_claimed;
4078 4078 mutex_exit(&freemem_lock);
4079 4079 --pp->p_lckcnt;
4080 4080 r = 1;
4081 4081 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4082 4082 cmn_err(CE_WARN,
4083 4083 "COW lock limit reached on pfn 0x%lx",
4084 4084 page_pptonum(pp));
4085 4085 }
4086 4086 } else
4087 4087 mutex_exit(&freemem_lock);
4088 4088 }
4089 4089 page_struct_unlock(pp);
4090 4090 return (r);
4091 4091 }
4092 4092
4093 4093 int
4094 4094 page_subclaim(page_t *pp)
4095 4095 {
4096 4096 int r = 0;
4097 4097
4098 4098 ASSERT(PAGE_LOCKED(pp));
4099 4099
4100 4100 page_struct_lock(pp);
4101 4101 ASSERT(pp->p_cowcnt != 0);
4102 4102
4103 4103 if (pp->p_lckcnt) {
4104 4104 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4105 4105 r = 1;
4106 4106 /*
4107 4107 * for availrmem
4108 4108 */
4109 4109 mutex_enter(&freemem_lock);
4110 4110 availrmem++;
4111 4111 pages_claimed--;
4112 4112 mutex_exit(&freemem_lock);
4113 4113
4114 4114 pp->p_cowcnt--;
4115 4115
4116 4116 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4117 4117 cmn_err(CE_WARN,
4118 4118 "Page lock limit reached on pfn 0x%lx",
4119 4119 page_pptonum(pp));
4120 4120 }
4121 4121 }
4122 4122 } else {
4123 4123 r = 1;
4124 4124 pp->p_cowcnt--;
4125 4125 pp->p_lckcnt++;
4126 4126 }
4127 4127 page_struct_unlock(pp);
4128 4128 return (r);
4129 4129 }
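
/*
 * Editor's sketch (not part of the webrev): a setprot-style caller flipping a
 * single page's claim between lckcnt and cowcnt.  This is hypothetical; a
 * real caller must know the page already has the appropriate non-zero count
 * and must check the return values, which are ignored here for brevity.
 */
static void
example_setprot_flip(page_t *pp, int make_writable)
{
	ASSERT(PAGE_LOCKED(pp));

	if (make_writable)
		(void) page_addclaim(pp);	/* lckcnt -> cowcnt */
	else
		(void) page_subclaim(pp);	/* cowcnt -> lckcnt */
}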
4130 4130
4131 4131 /*
4132 4132 * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4133 4133 * page.
4134 4134 */
4135 4135 int
4136 4136 page_addclaim_pages(page_t **ppa)
4137 4137 {
4138 4138 pgcnt_t lckpgs = 0, pg_idx;
4139 4139
4140 4140 VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4141 4141
4142 4142 /*
4143 4143 * Only need to take the page struct lock on the large page root.
4144 4144 */
4145 4145 page_struct_lock(ppa[0]);
4146 4146 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4147 4147
4148 4148 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4149 4149 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4150 4150 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4151 4151 page_struct_unlock(ppa[0]);
4152 4152 return (0);
4153 4153 }
4154 4154 if (ppa[pg_idx]->p_lckcnt > 1)
4155 4155 lckpgs++;
4156 4156 }
4157 4157
4158 4158 if (lckpgs != 0) {
4159 4159 mutex_enter(&freemem_lock);
4160 4160 if (availrmem >= pages_pp_maximum + lckpgs) {
4161 4161 availrmem -= lckpgs;
4162 4162 pages_claimed += lckpgs;
4163 4163 } else {
4164 4164 mutex_exit(&freemem_lock);
4165 4165 page_struct_unlock(ppa[0]);
4166 4166 return (0);
4167 4167 }
4168 4168 mutex_exit(&freemem_lock);
4169 4169 }
4170 4170
4171 4171 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4172 4172 ppa[pg_idx]->p_lckcnt--;
4173 4173 ppa[pg_idx]->p_cowcnt++;
4174 4174 }
4175 4175 page_struct_unlock(ppa[0]);
4176 4176 return (1);
4177 4177 }
4178 4178
4179 4179 /*
4180 4180 * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4181 4181 * page.
4182 4182 */
4183 4183 int
4184 4184 page_subclaim_pages(page_t **ppa)
4185 4185 {
4186 4186 pgcnt_t ulckpgs = 0, pg_idx;
4187 4187
4188 4188 VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4189 4189
4190 4190 /*
4191 4191 * Only need to take the page struct lock on the large page root.
4192 4192 */
4193 4193 page_struct_lock(ppa[0]);
4194 4194 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4195 4195
4196 4196 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4197 4197 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4198 4198 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4199 4199 page_struct_unlock(ppa[0]);
4200 4200 return (0);
4201 4201 }
4202 4202 if (ppa[pg_idx]->p_lckcnt != 0)
4203 4203 ulckpgs++;
4204 4204 }
4205 4205
4206 4206 if (ulckpgs != 0) {
4207 4207 mutex_enter(&freemem_lock);
4208 4208 availrmem += ulckpgs;
4209 4209 pages_claimed -= ulckpgs;
4210 4210 mutex_exit(&freemem_lock);
4211 4211 }
4212 4212
4213 4213 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4214 4214 ppa[pg_idx]->p_cowcnt--;
4215 4215 ppa[pg_idx]->p_lckcnt++;
4216 4216
4217 4217 }
4218 4218 page_struct_unlock(ppa[0]);
4219 4219 return (1);
4220 4220 }
4221 4221
4222 4222 page_t *
4223 4223 page_numtopp(pfn_t pfnum, se_t se)
4224 4224 {
4225 4225 page_t *pp;
4226 4226
4227 4227 retry:
4228 4228 pp = page_numtopp_nolock(pfnum);
4229 4229 if (pp == NULL) {
4230 4230 return ((page_t *)NULL);
4231 4231 }
4232 4232
4233 4233 /*
4234 4234 * Acquire the appropriate lock on the page.
4235 4235 */
4236 4236 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4237 4237 if (page_pptonum(pp) != pfnum)
4238 4238 goto retry;
4239 4239 continue;
4240 4240 }
4241 4241
4242 4242 if (page_pptonum(pp) != pfnum) {
4243 4243 page_unlock(pp);
4244 4244 goto retry;
4245 4245 }
4246 4246
4247 4247 return (pp);
4248 4248 }
4249 4249
4250 4250 page_t *
4251 4251 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4252 4252 {
4253 4253 page_t *pp;
4254 4254
4255 4255 retry:
4256 4256 pp = page_numtopp_nolock(pfnum);
4257 4257 if (pp == NULL) {
4258 4258 return ((page_t *)NULL);
4259 4259 }
4260 4260
4261 4261 /*
4262 4262 * Acquire the appropriate lock on the page.
4263 4263 */
4264 4264 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4265 4265 if (page_pptonum(pp) != pfnum)
4266 4266 goto retry;
4267 4267 continue;
4268 4268 }
4269 4269
4270 4270 if (page_pptonum(pp) != pfnum) {
4271 4271 page_unlock(pp);
4272 4272 goto retry;
4273 4273 }
4274 4274
4275 4275 return (pp);
4276 4276 }
4277 4277
4278 4278 /*
4279 4279  * This routine is like page_numtopp, but it only returns page structs
4280 4280  * for pages that are OK for loading into hardware, and it never blocks.
4281 4281 */
4282 4282 page_t *
4283 4283 page_numtopp_nowait(pfn_t pfnum, se_t se)
4284 4284 {
4285 4285 page_t *pp;
4286 4286
4287 4287 retry:
4288 4288 pp = page_numtopp_nolock(pfnum);
4289 4289 if (pp == NULL) {
4290 4290 return ((page_t *)NULL);
4291 4291 }
4292 4292
4293 4293 /*
4294 4294 * Try to acquire the appropriate lock on the page.
4295 4295 */
4296 4296 if (PP_ISFREE(pp))
4297 4297 pp = NULL;
4298 4298 else {
4299 4299 if (!page_trylock(pp, se))
4300 4300 pp = NULL;
4301 4301 else {
4302 4302 if (page_pptonum(pp) != pfnum) {
4303 4303 page_unlock(pp);
4304 4304 goto retry;
4305 4305 }
4306 4306 if (PP_ISFREE(pp)) {
4307 4307 page_unlock(pp);
4308 4308 pp = NULL;
4309 4309 }
4310 4310 }
4311 4311 }
4312 4312 return (pp);
4313 4313 }
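
/*
 * Editor's sketch (not part of the webrev): the three pfn-to-page lookup
 * flavors above differ only in how hard they try for the page lock.  A
 * hypothetical caller that must not block might do the following.
 */
static int
example_touch_pfn(pfn_t pfn)
{
	page_t *pp;

	pp = page_numtopp_nowait(pfn, SE_EXCL);	/* NULL if busy or free */
	if (pp == NULL)
		return (0);

	/* ... the caller holds the exclusive lock on pp here ... */

	page_unlock(pp);
	return (1);
}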
4314 4314
4315 4315 #define SYNC_PROGRESS_NPAGES 1000
4316 4316
4317 4317 /*
4318 4318 * Returns a count of dirty pages that are in the process
4319 4319 * of being written out. If 'cleanit' is set, try to push the page.
4320 4320 */
4321 4321 pgcnt_t
4322 4322 page_busy(int cleanit)
4323 4323 {
4324 4324 page_t *page0 = page_first();
4325 4325 page_t *pp = page0;
4326 4326 pgcnt_t nppbusy = 0;
4327 4327 int counter = 0;
4328 4328 u_offset_t off;
4329 4329
4330 4330 do {
4331 4331 vnode_t *vp = pp->p_vnode;
4332 4332
4333 4333 /*
4334 4334 * Reset the sync timeout. The page list is very long
4335 4335 * on large memory systems.
4336 4336 */
4337 4337 if (++counter > SYNC_PROGRESS_NPAGES) {
4338 4338 counter = 0;
4339 4339 vfs_syncprogress();
4340 4340 }
4341 4341
4342 4342 /*
4343 4343 * A page is a candidate for syncing if it is:
4344 4344 *
4345 4345 * (a) On neither the freelist nor the cachelist
4346 4346 * (b) Hashed onto a vnode
4347 4347 * (c) Not a kernel page
4348 4348 * (d) Dirty
4349 4349 * (e) Not part of a swapfile
4350 4350 		 * (f) Associated with a real vnode, i.e., one with a non-null
4351 4351 		 *     v_vfsp pointer.
4352 4352 * (g) Backed by a filesystem which doesn't have a
4353 4353 * stubbed-out sync operation
4354 4354 */
4355 4355 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4356 4356 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4357 4357 vfs_can_sync(vp->v_vfsp)) {
4358 4358 nppbusy++;
4359 4359
4360 4360 if (!cleanit)
4361 4361 continue;
4362 4362 if (!page_trylock(pp, SE_EXCL))
4363 4363 continue;
4364 4364
4365 4365 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4366 4366 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4367 4367 !(hat_pagesync(pp,
4368 4368 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4369 4369 page_unlock(pp);
4370 4370 continue;
4371 4371 }
4372 4372 off = pp->p_offset;
4373 4373 VN_HOLD(vp);
4374 4374 page_unlock(pp);
4375 4375 (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4376 4376 B_ASYNC | B_FREE, kcred, NULL);
4377 4377 VN_RELE(vp);
4378 4378 }
4379 4379 } while ((pp = page_next(pp)) != page0);
4380 4380
4381 4381 vfs_syncprogress();
4382 4382 return (nppbusy);
4383 4383 }
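
/*
 * Editor's sketch (not part of the webrev): page_busy(0) only counts the
 * dirty candidate pages, page_busy(1) also tries to push them.  A
 * hypothetical sync-style loop might poll until the count drains or it
 * gives up; the retry count and delay are illustrative.
 */
static void
example_sync_wait(int max_tries)
{
	int i;

	for (i = 0; i < max_tries; i++) {
		if (page_busy(1) == 0)	/* push and recount dirty pages */
			break;
		delay(hz);		/* give the async writes some time */
	}
}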
4384 4384
4385 4385 void page_invalidate_pages(void);
4386 4386
4387 4387 /*
4388 4388 * callback handler to vm sub-system
4389 4389 *
4390 4390  * Callers must ensure there are no recursive entries to this function.
4391 4391 */
4392 4392 /*ARGSUSED*/
4393 4393 boolean_t
4394 4394 callb_vm_cpr(void *arg, int code)
4395 4395 {
4396 4396 if (code == CB_CODE_CPR_CHKPT)
4397 4397 page_invalidate_pages();
4398 4398 return (B_TRUE);
4399 4399 }
4400 4400
4401 4401 /*
4402 4402 * Invalidate all pages of the system.
4403 4403  * It shouldn't be called until all user page activity has stopped.
4404 4404 */
4405 4405 void
4406 4406 page_invalidate_pages()
4407 4407 {
4408 4408 page_t *pp;
4409 4409 page_t *page0;
4410 4410 pgcnt_t nbusypages;
4411 4411 int retry = 0;
4412 4412 const int MAXRETRIES = 4;
4413 4413 top:
4414 4414 /*
4415 4415 * Flush dirty pages and destroy the clean ones.
4416 4416 */
4417 4417 nbusypages = 0;
4418 4418
4419 4419 pp = page0 = page_first();
4420 4420 do {
4421 4421 struct vnode *vp;
4422 4422 u_offset_t offset;
4423 4423 int mod;
4424 4424
4425 4425 /*
4426 4426 		 * skip the page if it has no vnode, or if it is associated
4427 4427 		 * with the kernel vnode or prom-allocated kernel memory.
4428 4428 */
4429 4429 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4430 4430 continue;
4431 4431
4432 4432 /*
4433 4433 		 * skip the page if it has already been freed and invalidated.
4434 4434 */
4435 4435 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4436 4436 continue;
4437 4437
4438 4438 /*
4439 4439 * skip pages that are already locked or can't be "exclusively"
4440 4440 * locked or are already free. After we lock the page, check
4441 4441 * the free and age bits again to be sure it's not destroyed
4442 4442 * yet.
4443 4443 * To achieve max. parallelization, we use page_trylock instead
4444 4444 		 * of page_lock so that we don't block on individual pages
4445 4445 * while we have thousands of other pages to process.
4446 4446 */
4447 4447 if (!page_trylock(pp, SE_EXCL)) {
4448 4448 nbusypages++;
4449 4449 continue;
4450 4450 } else if (PP_ISFREE(pp)) {
4451 4451 if (!PP_ISAGED(pp)) {
4452 4452 page_destroy_free(pp);
4453 4453 } else {
4454 4454 page_unlock(pp);
4455 4455 }
4456 4456 continue;
4457 4457 }
4458 4458 /*
4459 4459 * Is this page involved in some I/O? shared?
4460 4460 *
4461 4461 * The page_struct_lock need not be acquired to
4462 4462 * examine these fields since the page has an
4463 4463 * "exclusive" lock.
4464 4464 */
4465 4465 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4466 4466 page_unlock(pp);
4467 4467 continue;
4468 4468 }
4469 4469
4470 4470 if (vp->v_type == VCHR) {
4471 4471 panic("vp->v_type == VCHR");
4472 4472 /*NOTREACHED*/
4473 4473 }
4474 4474
4475 4475 if (!page_try_demote_pages(pp)) {
4476 4476 page_unlock(pp);
4477 4477 continue;
4478 4478 }
4479 4479
4480 4480 /*
4481 4481 * Check the modified bit. Leave the bits alone in hardware
4482 4482 * (they will be modified if we do the putpage).
4483 4483 */
4484 4484 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4485 4485 & P_MOD);
4486 4486 if (mod) {
4487 4487 offset = pp->p_offset;
4488 4488 /*
4489 4489 * Hold the vnode before releasing the page lock
4490 4490 * to prevent it from being freed and re-used by
4491 4491 * some other thread.
4492 4492 */
4493 4493 VN_HOLD(vp);
4494 4494 page_unlock(pp);
4495 4495 /*
4496 4496 * No error return is checked here. Callers such as
4497 4497 			 * cpr deal with the dirty pages at dump time
4498 4498 * if this putpage fails.
4499 4499 */
4500 4500 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4501 4501 kcred, NULL);
4502 4502 VN_RELE(vp);
4503 4503 } else {
4504 4504 /*LINTED: constant in conditional context*/
4505 4505 VN_DISPOSE(pp, B_INVAL, 0, kcred);
4506 4506 }
4507 4507 } while ((pp = page_next(pp)) != page0);
4508 4508 if (nbusypages && retry++ < MAXRETRIES) {
4509 4509 delay(1);
4510 4510 goto top;
4511 4511 }
4512 4512 }
4513 4513
4514 4514 /*
4515 4515 * Replace the page "old" with the page "new" on the page hash and vnode lists
4516 4516 *
4517 4517  * The replacement must be done in place, i.e., the equivalent sequence:
4518 4518 *
4519 4519 * vp = old->p_vnode;
4520 4520 * off = old->p_offset;
4521 4521 * page_do_hashout(old)
4522 4522 * page_do_hashin(new, vp, off)
4523 4523 *
4524 4524 * doesn't work, since
4525 4525 * 1) if old is the only page on the vnode, the v_pages list has a window
4526 4526 * where it looks empty. This will break file system assumptions.
4527 4527 * and
4528 4528 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4529 4529 */
4530 4530 static void
4531 4531 page_do_relocate_hash(page_t *new, page_t *old)
4532 4532 {
4533 4533 page_t **hash_list;
4534 4534 vnode_t *vp = old->p_vnode;
4535 4535 kmutex_t *sep;
4536 4536
4537 4537 ASSERT(PAGE_EXCL(old));
4538 4538 ASSERT(PAGE_EXCL(new));
4539 4539 ASSERT(vp != NULL);
4540 4540 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4541 4541 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4542 4542
4543 4543 /*
4544 4544 * First find old page on the page hash list
4545 4545 */
4546 4546 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4547 4547
4548 4548 for (;;) {
4549 4549 if (*hash_list == old)
4550 4550 break;
4551 4551 if (*hash_list == NULL) {
4552 4552 			panic("page_do_relocate_hash");
4553 4553 /*NOTREACHED*/
4554 4554 }
4555 4555 hash_list = &(*hash_list)->p_hash;
4556 4556 }
4557 4557
4558 4558 /*
4559 4559 * update new and replace old with new on the page hash list
4560 4560 */
4561 4561 new->p_vnode = old->p_vnode;
4562 4562 new->p_offset = old->p_offset;
4563 4563 new->p_hash = old->p_hash;
4564 4564 *hash_list = new;
4565 4565
4566 4566 if ((new->p_vnode->v_flag & VISSWAP) != 0)
4567 4567 PP_SETSWAP(new);
4568 4568
4569 4569 /*
4570 4570 * replace old with new on the vnode's page list
4571 4571 */
4572 4572 if (old->p_vpnext == old) {
4573 4573 new->p_vpnext = new;
4574 4574 new->p_vpprev = new;
4575 4575 } else {
4576 4576 new->p_vpnext = old->p_vpnext;
4577 4577 new->p_vpprev = old->p_vpprev;
4578 4578 new->p_vpnext->p_vpprev = new;
4579 4579 new->p_vpprev->p_vpnext = new;
4580 4580 }
4581 4581 if (vp->v_pages == old)
4582 4582 vp->v_pages = new;
4583 4583
4584 4584 /*
4585 4585 * clear out the old page
4586 4586 */
4587 4587 old->p_hash = NULL;
4588 4588 old->p_vpnext = NULL;
4589 4589 old->p_vpprev = NULL;
4590 4590 old->p_vnode = NULL;
4591 4591 PP_CLRSWAP(old);
4592 4592 old->p_offset = (u_offset_t)-1;
4593 4593 page_clr_all_props(old);
4594 4594
4595 4595 /*
4596 4596 * Wake up processes waiting for this page. The page's
4597 4597 * identity has been changed, and is probably not the
4598 4598 * desired page any longer.
4599 4599 */
4600 4600 sep = page_se_mutex(old);
4601 4601 mutex_enter(sep);
4602 4602 old->p_selock &= ~SE_EWANTED;
4603 4603 if (CV_HAS_WAITERS(&old->p_cv))
4604 4604 cv_broadcast(&old->p_cv);
4605 4605 mutex_exit(sep);
4606 4606 }
4607 4607
4608 4608 /*
4609 4609 * This function moves the identity of page "pp_old" to page "pp_new".
4610 4610 * Both pages must be locked on entry. "pp_new" is free, has no identity,
4611 4611 * and need not be hashed out from anywhere.
4612 4612 */
4613 4613 void
4614 4614 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4615 4615 {
4616 4616 vnode_t *vp = pp_old->p_vnode;
4617 4617 u_offset_t off = pp_old->p_offset;
4618 4618 kmutex_t *phm, *vphm;
4619 4619
4620 4620 /*
4621 4621 * Rehash two pages
4622 4622 */
4623 4623 ASSERT(PAGE_EXCL(pp_old));
4624 4624 ASSERT(PAGE_EXCL(pp_new));
4625 4625 ASSERT(vp != NULL);
4626 4626 ASSERT(pp_new->p_vnode == NULL);
4627 4627
4628 4628 /*
4629 4629 * hashout then hashin while holding the mutexes
4630 4630 */
4631 4631 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4632 4632 mutex_enter(phm);
4633 4633 vphm = page_vnode_mutex(vp);
4634 4634 mutex_enter(vphm);
4635 4635
4636 4636 page_do_relocate_hash(pp_new, pp_old);
4637 4637
4638 4638 /* The following comment preserved from page_flip(). */
4639 4639 pp_new->p_fsdata = pp_old->p_fsdata;
4640 4640 pp_old->p_fsdata = 0;
4641 4641 mutex_exit(vphm);
4642 4642 mutex_exit(phm);
4643 4643
4644 4644 /*
4645 4645 * The page_struct_lock need not be acquired for lckcnt and
4646 4646 * cowcnt since the page has an "exclusive" lock.
4647 4647 */
4648 4648 ASSERT(pp_new->p_lckcnt == 0);
4649 4649 ASSERT(pp_new->p_cowcnt == 0);
4650 4650 pp_new->p_lckcnt = pp_old->p_lckcnt;
4651 4651 pp_new->p_cowcnt = pp_old->p_cowcnt;
4652 4652 pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4653 4653
4654 4654 }
4655 4655
4656 4656 /*
4657 4657 * Helper routine used to lock all remaining members of a
4658 4658 * large page. The caller is responsible for passing in a locked
4659 4659 * pp. If pp is a large page, then it succeeds in locking all the
4660 4660 * remaining constituent pages or it returns with only the
4661 4661 * original page locked.
4662 4662 *
4663 4663 * Returns 1 on success, 0 on failure.
4664 4664 *
4665 4665 * If success is returned this routine guarantees p_szc for all constituent
4666 4666 * pages of a large page pp belongs to can't change. To achieve this we
4667 4667 * recheck szc of pp after locking all constituent pages and retry if szc
4668 4668 * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4669 4669 * lock on one of constituent pages it can't be running after all constituent
4670 4670 * pages are locked. hat_page_demote() with a lock on a constituent page
4671 4671 * outside of this large page (i.e. pp belonged to a larger large page) is
4672 4672 * already done with all constituent pages of pp since the root's p_szc is
4673 4673 * changed last. Therefore no need to synchronize with hat_page_demote() that
4674 4674 * locked a constituent page outside of pp's current large page.
4675 4675 */
4676 4676 #ifdef DEBUG
4677 4677 uint32_t gpg_trylock_mtbf = 0;
4678 4678 #endif
4679 4679
4680 4680 int
4681 4681 group_page_trylock(page_t *pp, se_t se)
4682 4682 {
4683 4683 page_t *tpp;
4684 4684 pgcnt_t npgs, i, j;
4685 4685 uint_t pszc = pp->p_szc;
4686 4686
4687 4687 #ifdef DEBUG
4688 4688 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4689 4689 return (0);
4690 4690 }
4691 4691 #endif
4692 4692
4693 4693 if (pp != PP_GROUPLEADER(pp, pszc)) {
4694 4694 return (0);
4695 4695 }
4696 4696
4697 4697 retry:
4698 4698 ASSERT(PAGE_LOCKED_SE(pp, se));
4699 4699 ASSERT(!PP_ISFREE(pp));
4700 4700 if (pszc == 0) {
4701 4701 return (1);
4702 4702 }
4703 4703 npgs = page_get_pagecnt(pszc);
4704 4704 tpp = pp + 1;
4705 4705 for (i = 1; i < npgs; i++, tpp++) {
4706 4706 if (!page_trylock(tpp, se)) {
4707 4707 tpp = pp + 1;
4708 4708 for (j = 1; j < i; j++, tpp++) {
4709 4709 page_unlock(tpp);
4710 4710 }
4711 4711 return (0);
4712 4712 }
4713 4713 }
4714 4714 if (pp->p_szc != pszc) {
4715 4715 ASSERT(pp->p_szc < pszc);
4716 4716 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4717 4717 !IS_SWAPFSVP(pp->p_vnode));
4718 4718 tpp = pp + 1;
4719 4719 for (i = 1; i < npgs; i++, tpp++) {
4720 4720 page_unlock(tpp);
4721 4721 }
4722 4722 pszc = pp->p_szc;
4723 4723 goto retry;
4724 4724 }
4725 4725 return (1);
4726 4726 }
4727 4727
4728 4728 void
4729 4729 group_page_unlock(page_t *pp)
4730 4730 {
4731 4731 page_t *tpp;
4732 4732 pgcnt_t npgs, i;
4733 4733
4734 4734 ASSERT(PAGE_LOCKED(pp));
4735 4735 ASSERT(!PP_ISFREE(pp));
4736 4736 ASSERT(pp == PP_PAGEROOT(pp));
4737 4737 npgs = page_get_pagecnt(pp->p_szc);
4738 4738 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4739 4739 page_unlock(tpp);
4740 4740 }
4741 4741 }
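
/*
 * Editor's sketch (not part of the webrev): group_page_trylock() expects the
 * (allocated) root constituent page to already be locked by the caller; on
 * success every other constituent is locked too, and those must be released
 * with group_page_unlock() before the root's own lock is dropped.
 */
static int
example_lock_large_page(page_t *rootpp)
{
	if (!page_trylock(rootpp, SE_EXCL))
		return (0);
	if (!group_page_trylock(rootpp, SE_EXCL)) {
		page_unlock(rootpp);
		return (0);
	}

	/* ... all constituent pages are held SE_EXCL here ... */

	group_page_unlock(rootpp);	/* drops every page but the root */
	page_unlock(rootpp);
	return (1);
}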
4742 4742
4743 4743 /*
4744 4744 * returns
4745 4745 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages
4746 4746 * ERANGE : this is not a base page
4747 4747 * EBUSY : failure to get locks on the page/pages
4748 4748 * ENOMEM : failure to obtain replacement pages
4749 4749 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel
4750 4750 * EIO : An error occurred while trying to copy the page data
4751 4751 *
4752 4752 * Return with all constituent members of target and replacement
4753 4753 * SE_EXCL locked. It is the callers responsibility to drop the
4754 4754  * SE_EXCL locked. It is the caller's responsibility to drop the
4755 4755 */
4756 4756 int
4757 4757 do_page_relocate(
4758 4758 page_t **target,
4759 4759 page_t **replacement,
4760 4760 int grouplock,
4761 4761 spgcnt_t *nrelocp,
4762 4762 lgrp_t *lgrp)
4763 4763 {
4764 4764 page_t *first_repl;
4765 4765 page_t *repl;
4766 4766 page_t *targ;
4767 4767 page_t *pl = NULL;
4768 4768 uint_t ppattr;
4769 4769 pfn_t pfn, repl_pfn;
4770 4770 uint_t szc;
4771 4771 spgcnt_t npgs, i;
4772 4772 int repl_contig = 0;
4773 4773 uint_t flags = 0;
4774 4774 spgcnt_t dofree = 0;
4775 4775
4776 4776 *nrelocp = 0;
4777 4777
4778 4778 #if defined(__sparc)
4779 4779 /*
4780 4780 * We need to wait till OBP has completed
4781 4781 * its boot-time handoff of its resources to the kernel
4782 4782 * before we allow page relocation
4783 4783 */
4784 4784 if (page_relocate_ready == 0) {
4785 4785 return (EAGAIN);
4786 4786 }
4787 4787 #endif
4788 4788
4789 4789 /*
4790 4790 * If this is not a base page,
4791 4791 * just return with 0x0 pages relocated.
4792 4792 */
4793 4793 targ = *target;
4794 4794 ASSERT(PAGE_EXCL(targ));
4795 4795 ASSERT(!PP_ISFREE(targ));
4796 4796 szc = targ->p_szc;
4797 4797 ASSERT(szc < mmu_page_sizes);
4798 4798 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4799 4799 pfn = targ->p_pagenum;
4800 4800 if (pfn != PFN_BASE(pfn, szc)) {
4801 4801 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4802 4802 return (ERANGE);
4803 4803 }
4804 4804
4805 4805 if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4806 4806 repl_pfn = repl->p_pagenum;
4807 4807 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4808 4808 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4809 4809 return (ERANGE);
4810 4810 }
4811 4811 repl_contig = 1;
4812 4812 }
4813 4813
4814 4814 /*
4815 4815 * We must lock all members of this large page or we cannot
4816 4816 * relocate any part of it.
4817 4817 */
4818 4818 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4819 4819 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4820 4820 return (EBUSY);
4821 4821 }
4822 4822
4823 4823 /*
4824 4824 	 * Reread szc; it could have been decreased before
4825 4825 * group_page_trylock() was done.
4826 4826 */
4827 4827 szc = targ->p_szc;
4828 4828 ASSERT(szc < mmu_page_sizes);
4829 4829 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4830 4830 ASSERT(pfn == PFN_BASE(pfn, szc));
4831 4831
4832 4832 npgs = page_get_pagecnt(targ->p_szc);
4833 4833
4834 4834 if (repl == NULL) {
4835 4835 dofree = npgs; /* Size of target page in MMU pages */
4836 4836 if (!page_create_wait(dofree, 0)) {
4837 4837 if (grouplock != 0) {
4838 4838 group_page_unlock(targ);
4839 4839 }
4840 4840 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4841 4841 return (ENOMEM);
4842 4842 }
4843 4843
4844 4844 /*
4845 4845 * seg kmem pages require that the target and replacement
4846 4846 * page be the same pagesize.
4847 4847 */
4848 4848 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4849 4849 repl = page_get_replacement_page(targ, lgrp, flags);
4850 4850 if (repl == NULL) {
4851 4851 if (grouplock != 0) {
4852 4852 group_page_unlock(targ);
4853 4853 }
4854 4854 page_create_putback(dofree);
4855 4855 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4856 4856 return (ENOMEM);
4857 4857 }
4858 4858 }
4859 4859 #ifdef DEBUG
4860 4860 else {
4861 4861 ASSERT(PAGE_LOCKED(repl));
4862 4862 }
4863 4863 #endif /* DEBUG */
4864 4864
4865 4865 #if defined(__sparc)
4866 4866 /*
4867 4867 	 * Let hat_page_relocate() complete the relocation if it's a kernel page
4868 4868 */
4869 4869 if (VN_ISKAS(targ->p_vnode)) {
4870 4870 *replacement = repl;
4871 4871 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4872 4872 if (grouplock != 0) {
4873 4873 group_page_unlock(targ);
4874 4874 }
4875 4875 if (dofree) {
4876 4876 *replacement = NULL;
4877 4877 page_free_replacement_page(repl);
4878 4878 page_create_putback(dofree);
4879 4879 }
4880 4880 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4881 4881 return (EAGAIN);
4882 4882 }
4883 4883 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4884 4884 return (0);
4885 4885 }
4886 4886 #else
4887 4887 #if defined(lint)
4888 4888 dofree = dofree;
4889 4889 #endif
4890 4890 #endif
4891 4891
4892 4892 first_repl = repl;
4893 4893
4894 4894 for (i = 0; i < npgs; i++) {
4895 4895 ASSERT(PAGE_EXCL(targ));
4896 4896 ASSERT(targ->p_slckcnt == 0);
4897 4897 ASSERT(repl->p_slckcnt == 0);
4898 4898
4899 4899 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4900 4900
4901 4901 ASSERT(hat_page_getshare(targ) == 0);
4902 4902 ASSERT(!PP_ISFREE(targ));
4903 4903 ASSERT(targ->p_pagenum == (pfn + i));
4904 4904 ASSERT(repl_contig == 0 ||
4905 4905 repl->p_pagenum == (repl_pfn + i));
4906 4906
4907 4907 /*
4908 4908 * Copy the page contents and attributes then
4909 4909 * relocate the page in the page hash.
4910 4910 */
4911 4911 if (ppcopy(targ, repl) == 0) {
4912 4912 targ = *target;
4913 4913 repl = first_repl;
4914 4914 VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4915 4915 if (grouplock != 0) {
4916 4916 group_page_unlock(targ);
4917 4917 }
4918 4918 if (dofree) {
4919 4919 *replacement = NULL;
4920 4920 page_free_replacement_page(repl);
4921 4921 page_create_putback(dofree);
4922 4922 }
4923 4923 return (EIO);
4924 4924 }
4925 4925
4926 4926 targ++;
4927 4927 if (repl_contig != 0) {
4928 4928 repl++;
4929 4929 } else {
4930 4930 repl = repl->p_next;
4931 4931 }
4932 4932 }
4933 4933
4934 4934 repl = first_repl;
4935 4935 targ = *target;
4936 4936
4937 4937 for (i = 0; i < npgs; i++) {
4938 4938 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4939 4939 page_clr_all_props(repl);
4940 4940 page_set_props(repl, ppattr);
4941 4941 page_relocate_hash(repl, targ);
4942 4942
4943 4943 ASSERT(hat_page_getshare(targ) == 0);
4944 4944 ASSERT(hat_page_getshare(repl) == 0);
4945 4945 /*
4946 4946 		 * Now clear the props on targ; after the
4947 4947 		 * page_relocate_hash(), they no longer
4948 4948 		 * have any meaning.
4949 4949 */
4950 4950 page_clr_all_props(targ);
4951 4951 ASSERT(targ->p_next == targ);
4952 4952 ASSERT(targ->p_prev == targ);
4953 4953 page_list_concat(&pl, &targ);
4954 4954
4955 4955 targ++;
4956 4956 if (repl_contig != 0) {
4957 4957 repl++;
4958 4958 } else {
4959 4959 repl = repl->p_next;
4960 4960 }
4961 4961 }
4962 4962 /* assert that we have come full circle with repl */
4963 4963 ASSERT(repl_contig == 1 || first_repl == repl);
4964 4964
4965 4965 *target = pl;
4966 4966 if (*replacement == NULL) {
4967 4967 ASSERT(first_repl == repl);
4968 4968 *replacement = repl;
4969 4969 }
4970 4970 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4971 4971 *nrelocp = npgs;
4972 4972 return (0);
4973 4973 }
4974 4974 /*
4975 4975 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4976 4976 */
4977 4977 int
4978 4978 page_relocate(
4979 4979 page_t **target,
4980 4980 page_t **replacement,
4981 4981 int grouplock,
4982 4982 int freetarget,
4983 4983 spgcnt_t *nrelocp,
4984 4984 lgrp_t *lgrp)
4985 4985 {
4986 4986 spgcnt_t ret;
4987 4987
4988 4988 /* do_page_relocate returns 0 on success or errno value */
4989 4989 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4990 4990
4991 4991 if (ret != 0 || freetarget == 0) {
4992 4992 return (ret);
4993 4993 }
4994 4994 if (*nrelocp == 1) {
4995 4995 ASSERT(*target != NULL);
4996 4996 page_free(*target, 1);
4997 4997 } else {
4998 4998 page_t *tpp = *target;
4999 4999 uint_t szc = tpp->p_szc;
5000 5000 pgcnt_t npgs = page_get_pagecnt(szc);
5001 5001 ASSERT(npgs > 1);
5002 5002 ASSERT(szc != 0);
5003 5003 do {
5004 5004 ASSERT(PAGE_EXCL(tpp));
5005 5005 ASSERT(!hat_page_is_mapped(tpp));
5006 5006 ASSERT(tpp->p_szc == szc);
5007 5007 PP_SETFREE(tpp);
5008 5008 PP_SETAGED(tpp);
5009 5009 npgs--;
5010 5010 } while ((tpp = tpp->p_next) != *target);
5011 5011 ASSERT(npgs == 0);
5012 5012 page_list_add_pages(*target, 0);
5013 5013 npgs = page_get_pagecnt(szc);
5014 5014 page_create_putback(npgs);
5015 5015 }
5016 5016 return (ret);
5017 5017 }
5018 5018
5019 5019 /*
5020 5020  * It is up to the caller to deal with pcf accounting.
5021 5021 */
5022 5022 void
5023 5023 page_free_replacement_page(page_t *pplist)
5024 5024 {
5025 5025 page_t *pp;
5026 5026
5027 5027 while (pplist != NULL) {
5028 5028 /*
5029 5029 		 * pplist is a linked list.
5030 5030 */
5031 5031 pp = pplist;
5032 5032 if (pp->p_szc == 0) {
5033 5033 page_sub(&pplist, pp);
5034 5034 page_clr_all_props(pp);
5035 5035 PP_SETFREE(pp);
5036 5036 PP_SETAGED(pp);
5037 5037 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5038 5038 page_unlock(pp);
5039 5039 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5040 5040 } else {
5041 5041 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5042 5042 page_t *tpp;
5043 5043 page_list_break(&pp, &pplist, curnpgs);
5044 5044 tpp = pp;
5045 5045 do {
5046 5046 ASSERT(PAGE_EXCL(tpp));
5047 5047 ASSERT(!hat_page_is_mapped(tpp));
5048 5048 page_clr_all_props(tpp);
5049 5049 PP_SETFREE(tpp);
5050 5050 PP_SETAGED(tpp);
5051 5051 } while ((tpp = tpp->p_next) != pp);
5052 5052 page_list_add_pages(pp, 0);
5053 5053 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5054 5054 }
5055 5055 }
5056 5056 }
5057 5057
5058 5058 /*
5059 5059 * Relocate target to non-relocatable replacement page.
5060 5060 */
5061 5061 int
5062 5062 page_relocate_cage(page_t **target, page_t **replacement)
5063 5063 {
5064 5064 page_t *tpp, *rpp;
5065 5065 spgcnt_t pgcnt, npgs;
5066 5066 int result;
5067 5067
5068 5068 tpp = *target;
5069 5069
5070 5070 ASSERT(PAGE_EXCL(tpp));
5071 5071 ASSERT(tpp->p_szc == 0);
5072 5072
5073 5073 pgcnt = btop(page_get_pagesize(tpp->p_szc));
5074 5074
5075 5075 do {
5076 5076 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5077 5077 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5078 5078 if (rpp == NULL) {
5079 5079 page_create_putback(pgcnt);
5080 5080 kcage_cageout_wakeup();
5081 5081 }
5082 5082 } while (rpp == NULL);
5083 5083
5084 5084 ASSERT(PP_ISNORELOC(rpp));
5085 5085
5086 5086 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5087 5087
5088 5088 if (result == 0) {
5089 5089 *replacement = rpp;
5090 5090 if (pgcnt != npgs)
5091 5091 panic("page_relocate_cage: partial relocation");
5092 5092 }
5093 5093
5094 5094 return (result);
5095 5095 }
5096 5096
5097 5097 /*
5098 5098 * Release the page lock on a page, place on cachelist
5099 5099 * tail if no longer mapped. Caller can let us know if
5100 5100 * the page is known to be clean.
5101 5101 */
5102 5102 int
5103 5103 page_release(page_t *pp, int checkmod)
5104 5104 {
5105 5105 int status;
5106 5106
5107 5107 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5108 5108 (pp->p_vnode != NULL));
5109 5109
5110 5110 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5111 5111 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5112 5112 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5113 5113 !hat_page_is_mapped(pp)) {
5114 5114
5115 5115 /*
5116 5116 * If page is modified, unlock it
5117 5117 *
5118 5118 * (p_nrm & P_MOD) bit has the latest stuff because:
5119 5119 * (1) We found that this page doesn't have any mappings
5120 5120 * _after_ holding SE_EXCL and
5121 5121 * (2) We didn't drop SE_EXCL lock after the check in (1)
5122 5122 */
5123 5123 if (checkmod && hat_ismod(pp)) {
5124 5124 page_unlock(pp);
5125 5125 status = PGREL_MOD;
5126 5126 } else {
5127 5127 /*LINTED: constant in conditional context*/
5128 5128 VN_DISPOSE(pp, B_FREE, 0, kcred);
5129 5129 status = PGREL_CLEAN;
5130 5130 }
5131 5131 } else {
5132 5132 page_unlock(pp);
5133 5133 status = PGREL_NOTREL;
5134 5134 }
5135 5135 return (status);
5136 5136 }
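
/*
 * Editor's sketch (not part of the webrev): handling the three page_release()
 * status codes.  The caller passes checkmod = 1 when it cares whether the
 * page was still dirty; the comments describe what each status implies.
 */
static void
example_release(page_t *pp)
{
	switch (page_release(pp, 1)) {
	case PGREL_CLEAN:
		/* page was clean and was freed onto the cachelist */
		break;
	case PGREL_MOD:
		/* page was still dirty; it was unlocked but not freed */
		break;
	case PGREL_NOTREL:
		/* page was mapped, locked, or a swap page; just unlocked */
		break;
	}
}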
5137 5137
5138 5138 /*
5139 5139 * Given a constituent page, try to demote the large page on the freelist.
5140 5140 *
5141 5141 * Returns nonzero if the page could be demoted successfully. Returns with
5142 5142 * the constituent page still locked.
5143 5143 */
5144 5144 int
5145 5145 page_try_demote_free_pages(page_t *pp)
5146 5146 {
5147 5147 page_t *rootpp = pp;
5148 5148 pfn_t pfn = page_pptonum(pp);
5149 5149 spgcnt_t npgs;
5150 5150 uint_t szc = pp->p_szc;
5151 5151
5152 5152 ASSERT(PP_ISFREE(pp));
5153 5153 ASSERT(PAGE_EXCL(pp));
5154 5154
5155 5155 /*
5156 5156 * Adjust rootpp and lock it, if `pp' is not the base
5157 5157 * constituent page.
5158 5158 */
5159 5159 npgs = page_get_pagecnt(pp->p_szc);
5160 5160 if (npgs == 1) {
5161 5161 return (0);
5162 5162 }
5163 5163
5164 5164 if (!IS_P2ALIGNED(pfn, npgs)) {
5165 5165 pfn = P2ALIGN(pfn, npgs);
5166 5166 rootpp = page_numtopp_nolock(pfn);
5167 5167 }
5168 5168
5169 5169 if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5170 5170 return (0);
5171 5171 }
5172 5172
5173 5173 if (rootpp->p_szc != szc) {
5174 5174 if (pp != rootpp)
5175 5175 page_unlock(rootpp);
5176 5176 return (0);
5177 5177 }
5178 5178
5179 5179 page_demote_free_pages(rootpp);
5180 5180
5181 5181 if (pp != rootpp)
5182 5182 page_unlock(rootpp);
5183 5183
5184 5184 ASSERT(PP_ISFREE(pp));
5185 5185 ASSERT(PAGE_EXCL(pp));
5186 5186 return (1);
5187 5187 }
5188 5188
5189 5189 /*
5190 5190 * Given a constituent page, try to demote the large page.
5191 5191 *
5192 5192 * Returns nonzero if the page could be demoted successfully. Returns with
5193 5193 * the constituent page still locked.
5194 5194 */
5195 5195 int
5196 5196 page_try_demote_pages(page_t *pp)
5197 5197 {
5198 5198 page_t *tpp, *rootpp = pp;
5199 5199 pfn_t pfn = page_pptonum(pp);
5200 5200 spgcnt_t i, npgs;
5201 5201 uint_t szc = pp->p_szc;
5202 5202 vnode_t *vp = pp->p_vnode;
5203 5203
5204 5204 ASSERT(PAGE_EXCL(pp));
5205 5205
5206 5206 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5207 5207
5208 5208 if (pp->p_szc == 0) {
5209 5209 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5210 5210 return (1);
5211 5211 }
5212 5212
5213 5213 if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5214 5214 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5215 5215 page_demote_vp_pages(pp);
5216 5216 ASSERT(pp->p_szc == 0);
5217 5217 return (1);
5218 5218 }
5219 5219
5220 5220 /*
5221 5221 	 * Adjust rootpp if the page passed in is not the base
5222 5222 	 * constituent page.
5223 5223 */
5224 5224 npgs = page_get_pagecnt(pp->p_szc);
5225 5225 ASSERT(npgs > 1);
5226 5226 if (!IS_P2ALIGNED(pfn, npgs)) {
5227 5227 pfn = P2ALIGN(pfn, npgs);
5228 5228 rootpp = page_numtopp_nolock(pfn);
5229 5229 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5230 5230 ASSERT(rootpp->p_vnode != NULL);
5231 5231 ASSERT(rootpp->p_szc == szc);
5232 5232 }
5233 5233
5234 5234 /*
5235 5235 * We can't demote kernel pages since we can't hat_unload()
5236 5236 * the mappings.
5237 5237 */
5238 5238 if (VN_ISKAS(rootpp->p_vnode))
5239 5239 return (0);
5240 5240
5241 5241 /*
5242 5242 * Attempt to lock all constituent pages except the page passed
5243 5243 * in since it's already locked.
5244 5244 */
5245 5245 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5246 5246 ASSERT(!PP_ISFREE(tpp));
5247 5247 ASSERT(tpp->p_vnode != NULL);
5248 5248
5249 5249 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5250 5250 break;
5251 5251 ASSERT(tpp->p_szc == rootpp->p_szc);
5252 5252 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5253 5253 }
5254 5254
5255 5255 /*
5256 5256 * If we failed to lock them all then unlock what we have
5257 5257 * locked so far and bail.
5258 5258 */
5259 5259 if (i < npgs) {
5260 5260 tpp = rootpp;
5261 5261 while (i-- > 0) {
5262 5262 if (tpp != pp)
5263 5263 page_unlock(tpp);
5264 5264 tpp++;
5265 5265 }
5266 5266 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5267 5267 return (0);
5268 5268 }
5269 5269
5270 5270 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5271 5271 ASSERT(PAGE_EXCL(tpp));
5272 5272 ASSERT(tpp->p_slckcnt == 0);
5273 5273 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5274 5274 tpp->p_szc = 0;
5275 5275 }
5276 5276
5277 5277 /*
5278 5278 * Unlock all pages except the page passed in.
5279 5279 */
5280 5280 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5281 5281 ASSERT(!hat_page_is_mapped(tpp));
5282 5282 if (tpp != pp)
5283 5283 page_unlock(tpp);
5284 5284 }
5285 5285
5286 5286 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5287 5287 return (1);
5288 5288 }
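
/*
 * Editor's sketch (not part of the webrev): a caller holding one EXCL-locked
 * constituent page can attempt a demotion and carry on with the still-locked
 * page either way; the helper name is hypothetical.
 */
static void
example_demote(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));

	if (pp->p_szc != 0 && !page_try_demote_pages(pp)) {
		/* demotion failed (e.g. kernel page or busy constituents) */
		return;
	}
	/* on the success path pp->p_szc is now 0 and pp is still locked */
}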
5289 5289
5290 5290 /*
5291 5291 * Called by page_free() and page_destroy() to demote the page size code
5292 5292 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
5293 5293 * p_szc on free list, neither can we just clear p_szc of a single page_t
5294 5294 * within a large page since it will break other code that relies on p_szc
5295 5295 * being the same for all page_t's of a large page). Anonymous pages should
5296 5296 * never end up here because anon_map_getpages() cannot deal with p_szc
5297 5297 * changes after a single constituent page is locked. While anonymous or
5298 5298 * kernel large pages are demoted or freed an entire large page at a time,
5299 5299 * with all constituent pages locked EXCL, for file system pages we
5300 5300 * have to be able to demote a large page (i.e. decrease all constituent pages
5301 5301 * p_szc) with just an EXCL lock on one of the constituent pages. The reason
5302 5302 * we can easily demote anonymous pages an entire large page at a
5303 5303 * time is that those operations originate at the address space level and concern
5304 5304 * the entire large page region with actual demotion only done when pages are
5305 5305 * not shared with any other processes (therefore we can always get EXCL lock
5306 5306 * on all anonymous constituent pages after clearing segment page
5307 5307 * cache). However file system pages can be truncated or invalidated at a
5308 5308 * PAGESIZE level from the file system side and end up in page_free() or
5309 5309 * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
5310 5310 * and therefore pageout should be able to demote a large page by EXCL locking
5311 5311 * any constituent page that is not under SOFTLOCK). In those cases we cannot
5312 5312 * rely on being able to lock EXCL all constituent pages.
5313 5313 *
5314 5314 * To prevent szc changes on file system pages one has to lock all constituent
5315 5315 * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5316 5316 * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5317 5317 * prevent szc changes is hat layer that uses its own page level mlist
5318 5318 * locks. hat assumes that szc doesn't change after mlist lock for a page is
5319 5319 * taken. Therefore we need to change szc under hat level locks if we only
5320 5320 * have an EXCL lock on a single constituent page and hat still references any
5321 5321 * of constituent pages. (Note we can't "ignore" hat layer by simply
5322 5322 * hat_pageunload() all constituent pages without having EXCL locks on all of
5323 5323 * constituent pages). We use hat_page_demote() call to safely demote szc of
5324 5324 * all constituent pages under hat locks when we only have an EXCL lock on one
5325 5325 * of constituent pages.
5326 5326 *
5327 5327 * This routine calls page_szc_lock() before calling hat_page_demote() to
5328 5328 * allow segvn in one special case not to lock all constituent pages SHARED
5329 5329 * before calling hat_memload_array() that relies on p_szc not changing even
5330 5330 * before hat level mlist lock is taken. In that case segvn uses
5331 5331 * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5332 5332 *
5333 5333 * Anonymous or kernel page demotion still has to lock all pages exclusively
5334 5334 * and do hat_pageunload() on all constituent pages before demoting the page
5335 5335 * therefore there's no need for anonymous or kernel page demotion to use
5336 5336 * hat_page_demote() mechanism.
5337 5337 *
5338 5338 * hat_page_demote() removes all large mappings that map pp and then decreases
5339 5339 * p_szc starting from the last constituent page of the large page. Working
5340 5340 * from the tail of a large page in decreasing pfn order allows one looking at
5341 5341 * the root page to know that hat_page_demote() is done for the root's szc area.
5342 5342 * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5343 5343 * pages within szc 1 area to prevent szc changes because hat_page_demote()
5344 5344 * that started on this page when it had szc > 1 is done for this szc 1 area.
5345 5345 *
5346 5346 * We are guaranteed that all constituent pages of pp's large page belong to
5347 5347 * the same vnode with the consecutive offsets increasing in the direction of
5348 5348 * the pfn i.e. the identity of constituent pages can't change until their
5349 5349 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5350 5350 * large mappings to pp even though we don't lock any constituent page except
5351 5351 * pp (i.e. we won't unload e.g. kernel locked page).
5352 5352 */
5353 5353 static void
5354 5354 page_demote_vp_pages(page_t *pp)
5355 5355 {
5356 5356 kmutex_t *mtx;
5357 5357
5358 5358 ASSERT(PAGE_EXCL(pp));
5359 5359 ASSERT(!PP_ISFREE(pp));
5360 5360 ASSERT(pp->p_vnode != NULL);
5361 5361 ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5362 5362 ASSERT(!PP_ISKAS(pp));
5363 5363
5364 5364 VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5365 5365
5366 5366 mtx = page_szc_lock(pp);
5367 5367 if (mtx != NULL) {
5368 5368 hat_page_demote(pp);
5369 5369 mutex_exit(mtx);
5370 5370 }
5371 5371 ASSERT(pp->p_szc == 0);
5372 5372 }
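/*
 * Illustrative sketch, not part of this file: the comment above describes
 * two ways a caller can keep p_szc stable for a file system page without
 * holding EXCL locks on every constituent page -- lock all constituent
 * pages at least SHARED, or briefly take page_szc_lock() as segvn does
 * before hat_memload_array().  The hypothetical helper below shows the
 * second form, mirroring the usage in page_demote_vp_pages().
 */
static void
example_szc_stable_region(page_t *pp)
{
	kmutex_t *szcmtx;

	szcmtx = page_szc_lock(pp);
	if (szcmtx != NULL) {
		/*
		 * hat_page_demote() cannot change the p_szc of pp's
		 * large page while szcmtx is held.
		 */
		mutex_exit(szcmtx);
	}
}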
5373 5373
5374 5374 /*
5375 5375 * Mark any existing pages for migration in the given range
5376 5376 */
5377 5377 void
5378 5378 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5379 5379 struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5380 5380 u_offset_t vnoff, int rflag)
5381 5381 {
5382 5382 struct anon *ap;
5383 5383 vnode_t *curvp;
5384 5384 lgrp_t *from;
5385 5385 pgcnt_t nlocked;
5386 5386 u_offset_t off;
5387 5387 pfn_t pfn;
5388 5388 size_t pgsz;
5389 5389 size_t segpgsz;
5390 5390 pgcnt_t pages;
5391 5391 uint_t pszc;
5392 5392 page_t *pp0, *pp;
5393 5393 caddr_t va;
5394 5394 ulong_t an_idx;
5395 5395 anon_sync_obj_t cookie;
5396 5396
5397 5397 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5398 5398
5399 5399 /*
5400 5400 * Don't do anything if we don't need to do lgroup optimizations
5401 5401 * on this system
5402 5402 */
5403 5403 if (!lgrp_optimizations())
5404 5404 return;
5405 5405
5406 5406 /*
5407 5407 * Align address and length to (potentially large) page boundary
5408 5408 */
5409 5409 segpgsz = page_get_pagesize(seg->s_szc);
5410 5410 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5411 5411 if (rflag)
5412 5412 len = P2ROUNDUP(len, segpgsz);
5413 5413
5414 5414 /*
5415 5415 * Do one (large) page at a time
5416 5416 */
5417 5417 va = addr;
5418 5418 while (va < addr + len) {
5419 5419 /*
5420 5420 * Lookup (root) page for vnode and offset corresponding to
5421 5421 * this virtual address
5422 5422 * Try anonmap first since there may be copy-on-write
5423 5423 * pages, but initialize vnode pointer and offset using
5424 5424 * vnode arguments just in case there isn't an amp.
5425 5425 */
5426 5426 curvp = vp;
5427 5427 off = vnoff + va - seg->s_base;
5428 5428 if (amp) {
5429 5429 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5430 5430 an_idx = anon_index + seg_page(seg, va);
5431 5431 anon_array_enter(amp, an_idx, &cookie);
5432 5432 ap = anon_get_ptr(amp->ahp, an_idx);
5433 5433 if (ap)
5434 5434 swap_xlate(ap, &curvp, &off);
5435 5435 anon_array_exit(&cookie);
5436 5436 ANON_LOCK_EXIT(&amp->a_rwlock);
5437 5437 }
5438 5438
5439 5439 pp = NULL;
5440 5440 if (curvp)
5441 5441 pp = page_lookup(curvp, off, SE_SHARED);
5442 5442
5443 5443 /*
5444 5444 * If there isn't a page at this virtual address,
5445 5445 * skip to next page
5446 5446 */
5447 5447 if (pp == NULL) {
5448 5448 va += PAGESIZE;
5449 5449 continue;
5450 5450 }
5451 5451
5452 5452 /*
5453 5453 * Figure out which lgroup this page is in for kstats
5454 5454 */
5455 5455 pfn = page_pptonum(pp);
5456 5456 from = lgrp_pfn_to_lgrp(pfn);
5457 5457
5458 5458 /*
5459 5459 * Get page size, and round up and skip to next page boundary
5460 5460 * if unaligned address
5461 5461 */
5462 5462 pszc = pp->p_szc;
5463 5463 pgsz = page_get_pagesize(pszc);
5464 5464 pages = btop(pgsz);
5465 5465 if (!IS_P2ALIGNED(va, pgsz) ||
5466 5466 !IS_P2ALIGNED(pfn, pages) ||
5467 5467 pgsz > segpgsz) {
5468 5468 pgsz = MIN(pgsz, segpgsz);
5469 5469 page_unlock(pp);
5470 5470 pages = btop(P2END((uintptr_t)va, pgsz) -
5471 5471 (uintptr_t)va);
5472 5472 va = (caddr_t)P2END((uintptr_t)va, pgsz);
5473 5473 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5474 5474 continue;
5475 5475 }
5476 5476
5477 5477 /*
5478 5478 * Upgrade to exclusive lock on page
5479 5479 */
5480 5480 if (!page_tryupgrade(pp)) {
5481 5481 page_unlock(pp);
5482 5482 va += pgsz;
5483 5483 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5484 5484 btop(pgsz));
5485 5485 continue;
5486 5486 }
5487 5487
5488 5488 pp0 = pp++;
5489 5489 nlocked = 1;
5490 5490
5491 5491 /*
5492 5492 * Lock constituent pages if this is a large page
5493 5493 */
5494 5494 if (pages > 1) {
5495 5495 /*
5496 5496 * Lock all constituents except root page, since it
5497 5497 * should be locked already.
5498 5498 */
5499 5499 for (; nlocked < pages; nlocked++) {
5500 5500 if (!page_trylock(pp, SE_EXCL)) {
5501 5501 break;
5502 5502 }
5503 5503 if (PP_ISFREE(pp) ||
5504 5504 pp->p_szc != pszc) {
5505 5505 /*
5506 5506 * hat_page_demote() raced in with us.
5507 5507 */
5508 5508 ASSERT(!IS_SWAPFSVP(curvp));
5509 5509 page_unlock(pp);
5510 5510 break;
5511 5511 }
5512 5512 pp++;
5513 5513 }
5514 5514 }
5515 5515
5516 5516 /*
5517 5517 * If all constituent pages couldn't be locked,
5518 5518 * unlock pages locked so far and skip to next page.
5519 5519 */
5520 5520 if (nlocked < pages) {
5521 5521 while (pp0 < pp) {
5522 5522 page_unlock(pp0++);
5523 5523 }
5524 5524 va += pgsz;
5525 5525 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5526 5526 btop(pgsz));
5527 5527 continue;
5528 5528 }
5529 5529
5530 5530 /*
5531 5531 * hat_page_demote() can no longer happen
5532 5532 * since the last cons page had the right p_szc after
5533 5533 * all cons pages were locked. All cons pages
5534 5534 * should now have the same p_szc.
5535 5535 */
5536 5536
5537 5537 /*
5538 5538 * All constituent pages locked successfully, so mark
5539 5539 * large page for migration and unload the mappings of
5540 5540 * constituent pages, so a fault will occur on any part of the
5541 5541 * large page
5542 5542 */
5543 5543 PP_SETMIGRATE(pp0);
5544 5544 while (pp0 < pp) {
5545 5545 (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5546 5546 ASSERT(hat_page_getshare(pp0) == 0);
5547 5547 page_unlock(pp0++);
5548 5548 }
5549 5549 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5550 5550
5551 5551 va += pgsz;
5552 5552 }
5553 5553 }
5554 5554
5555 5555 /*
5556 5556 * Migrate any pages that have been marked for migration in the given range
5557 5557 */
5558 5558 void
5559 5559 page_migrate(
5560 5560 struct seg *seg,
5561 5561 caddr_t addr,
5562 5562 page_t **ppa,
5563 5563 pgcnt_t npages)
5564 5564 {
5565 5565 lgrp_t *from;
5566 5566 lgrp_t *to;
5567 5567 page_t *newpp;
5568 5568 page_t *pp;
5569 5569 pfn_t pfn;
5570 5570 size_t pgsz;
5571 5571 spgcnt_t page_cnt;
5572 5572 spgcnt_t i;
5573 5573 uint_t pszc;
5574 5574
5575 5575 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5576 5576
5577 5577 while (npages > 0) {
5578 5578 pp = *ppa;
5579 5579 pszc = pp->p_szc;
5580 5580 pgsz = page_get_pagesize(pszc);
5581 5581 page_cnt = btop(pgsz);
5582 5582
5583 5583 /*
5584 5584 * Check to see whether this page is marked for migration
5585 5585 *
5586 5586 * Assume that root page of large page is marked for
5587 5587 * migration and none of the other constituent pages
5588 5588 * are marked. This really simplifies clearing the
5589 5589 * migrate bit by not having to clear it from each
5590 5590 * constituent page.
5591 5591 *
5592 5592 * note we don't want to relocate an entire large page if
5593 5593 * someone is only using one subpage.
5594 5594 */
5595 5595 if (npages < page_cnt)
5596 5596 break;
5597 5597
5598 5598 /*
5599 5599 * Is it marked for migration?
5600 5600 */
5601 5601 if (!PP_ISMIGRATE(pp))
5602 5602 goto next;
5603 5603
5604 5604 /*
5605 5605 * Determine lgroups that page is being migrated between
5606 5606 */
5607 5607 pfn = page_pptonum(pp);
5608 5608 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5609 5609 break;
5610 5610 }
5611 5611 from = lgrp_pfn_to_lgrp(pfn);
5612 5612 to = lgrp_mem_choose(seg, addr, pgsz);
5613 5613
5614 5614 /*
5615 5615 * Need to get exclusive locks to migrate
5616 5616 */
5617 5617 for (i = 0; i < page_cnt; i++) {
5618 5618 ASSERT(PAGE_LOCKED(ppa[i]));
5619 5619 if (page_pptonum(ppa[i]) != pfn + i ||
5620 5620 ppa[i]->p_szc != pszc) {
5621 5621 break;
5622 5622 }
5623 5623 if (!page_tryupgrade(ppa[i])) {
5624 5624 lgrp_stat_add(from->lgrp_id,
5625 5625 LGRP_PM_FAIL_LOCK_PGS,
5626 5626 page_cnt);
5627 5627 break;
5628 5628 }
5629 5629
5630 5630 /*
5631 5631 * Check to see whether we are trying to migrate
5632 5632 * page to lgroup where it is allocated already.
5633 5633 * If so, clear the migrate bit and skip to next
5634 5634 * page.
5635 5635 */
5636 5636 if (i == 0 && to == from) {
5637 5637 PP_CLRMIGRATE(ppa[0]);
5638 5638 page_downgrade(ppa[0]);
5639 5639 goto next;
5640 5640 }
5641 5641 }
5642 5642
5643 5643 /*
5644 5644 * If all constituent pages couldn't be locked,
5645 5645 * unlock pages locked so far and skip to next page.
5646 5646 */
5647 5647 if (i != page_cnt) {
5648 5648 while (--i != -1) {
5649 5649 page_downgrade(ppa[i]);
5650 5650 }
5651 5651 goto next;
5652 5652 }
5653 5653
5654 5654 (void) page_create_wait(page_cnt, PG_WAIT);
5655 5655 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5656 5656 if (newpp == NULL) {
5657 5657 page_create_putback(page_cnt);
5658 5658 for (i = 0; i < page_cnt; i++) {
5659 5659 page_downgrade(ppa[i]);
5660 5660 }
5661 5661 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5662 5662 page_cnt);
5663 5663 goto next;
5664 5664 }
5665 5665 ASSERT(newpp->p_szc == pszc);
5666 5666 /*
5667 5667 * Clear migrate bit and relocate page
5668 5668 */
5669 5669 PP_CLRMIGRATE(pp);
5670 5670 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5671 5671 panic("page_migrate: page_relocate failed");
5672 5672 }
5673 5673 ASSERT(page_cnt * PAGESIZE == pgsz);
5674 5674
5675 5675 /*
5676 5676 * Keep stats for number of pages migrated from and to
5677 5677 * each lgroup
5678 5678 */
5679 5679 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5680 5680 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5681 5681 /*
5682 5682 * update the page_t array we were passed in and
5683 5683 * unlink constituent pages of a large page.
5684 5684 */
5685 5685 for (i = 0; i < page_cnt; ++i, ++pp) {
5686 5686 ASSERT(PAGE_EXCL(newpp));
5687 5687 ASSERT(newpp->p_szc == pszc);
5688 5688 ppa[i] = newpp;
5689 5689 pp = newpp;
5690 5690 page_sub(&newpp, pp);
5691 5691 page_downgrade(pp);
5692 5692 }
5693 5693 ASSERT(newpp == NULL);
5694 5694 next:
5695 5695 addr += pgsz;
5696 5696 ppa += page_cnt;
5697 5697 npages -= page_cnt;
5698 5698 }
5699 5699 }
5700 5700
5701 5701 #define MAX_CNT 60 /* max num of iterations */
5702 5702 /*
5703 5703 * Reclaim/reserve availrmem for npages.
5704 5704 * If there is not enough memory, start reaping the seg and kmem caches.
5705 5705 * Start pageout scanner (via page_needfree()).
5706 5706 * Exit after ~MAX_CNT seconds regardless of how much memory has been released.
5707 5707 * Note: There is no guarantee that any availrmem will be freed as
5708 5708 * this memory typically is locked (kernel heap) or reserved for swap.
5709 5709 * Also, due to memory fragmentation, the kmem allocator may not be able
5710 5710 * to free any memory (a single user-allocated buffer can prevent a
5711 5711 * slab or a page from being freed).
5712 5712 */
5713 5713 int
5714 5714 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5715 5715 {
5716 5716 int i = 0;
5717 5717 int ret = 0;
5718 5718 pgcnt_t deficit;
5719 5719 pgcnt_t old_availrmem;
5720 5720
5721 5721 mutex_enter(&freemem_lock);
5722 5722 old_availrmem = availrmem - 1;
5723 5723 while ((availrmem < tune.t_minarmem + npages + epages) &&
5724 5724 (old_availrmem < availrmem) && (i++ < MAX_CNT)) {
5725 5725 old_availrmem = availrmem;
5726 5726 deficit = tune.t_minarmem + npages + epages - availrmem;
5727 5727 mutex_exit(&freemem_lock);
5728 5728 page_needfree(deficit);
5729 5729 kmem_reap();
5730 5730 delay(hz);
5731 5731 page_needfree(-(spgcnt_t)deficit);
5732 5732 mutex_enter(&freemem_lock);
5733 5733 }
5734 5734
5735 5735 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5736 5736 availrmem -= npages;
5737 5737 ret = 1;
5738 5738 }
5739 5739
5740 5740 mutex_exit(&freemem_lock);
5741 5741
5742 5742 return (ret);
5743 5743 }
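/*
 * Illustrative sketch, not part of this file: a hypothetical caller that
 * reserves availrmem for npages with page_reclaim_mem() and adjust set.
 * On success availrmem has been decremented by npages, so the caller is
 * assumed to give the reservation back itself when it is finished, using
 * the same freemem_lock/availrmem pattern used elsewhere in this file.
 */
static int
example_reserve_pages(pgcnt_t npages)
{
	if (!page_reclaim_mem(npages, 0, 1))
		return (ENOMEM);		/* could not reserve availrmem */

	/* ... caller makes use of the reserved memory here ... */

	mutex_enter(&freemem_lock);
	availrmem += npages;			/* return the reservation */
	mutex_exit(&freemem_lock);
	return (0);
}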
5744 5744
5745 5745 /*
5746 5746 * Search the memory segments to locate the desired page. Within a
5747 5747 * segment, pages increase linearly with one page structure per
5748 5748 * physical page frame (size PAGESIZE). The search begins
5749 5749 * with the segment that was accessed last, to take advantage of locality.
5750 5750 * If the hint misses, we start from the beginning of the sorted memseg list
5751 5751 */
5752 5752
5753 5753
5754 5754 /*
5755 5755 * Some data structures for pfn to pp lookup.
5756 5756 */
5757 5757 ulong_t mhash_per_slot;
5758 5758 struct memseg *memseg_hash[N_MEM_SLOTS];
5759 5759
5760 5760 page_t *
5761 5761 page_numtopp_nolock(pfn_t pfnum)
5762 5762 {
5763 5763 struct memseg *seg;
5764 5764 page_t *pp;
5765 5765 vm_cpu_data_t *vc;
5766 5766
5767 5767 /*
5768 5768 * We need to disable kernel preemption while referencing the
5769 5769 * cpu_vm_data field in order to prevent us from being switched to
5770 5770 * another cpu and trying to reference it after it has been freed.
5771 5771 * This will keep us on cpu and prevent it from being removed while
5772 5772 * we are still on it.
5773 5773 *
5774 5774 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5775 5775 * which is being reused by DR, which will flush those references
5776 5776 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5777 5777 */
5778 5778 kpreempt_disable();
5779 5779 vc = CPU->cpu_vm_data;
5780 5780 ASSERT(vc != NULL);
5781 5781
5782 5782 MEMSEG_STAT_INCR(nsearch);
5783 5783
5784 5784 /* Try last winner first */
5785 5785 if (((seg = vc->vc_pnum_memseg) != NULL) &&
5786 5786 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5787 5787 MEMSEG_STAT_INCR(nlastwon);
5788 5788 pp = seg->pages + (pfnum - seg->pages_base);
5789 5789 if (pp->p_pagenum == pfnum) {
5790 5790 kpreempt_enable();
5791 5791 return ((page_t *)pp);
5792 5792 }
5793 5793 }
5794 5794
5795 5795 /* Else Try hash */
5796 5796 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5797 5797 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5798 5798 MEMSEG_STAT_INCR(nhashwon);
5799 5799 vc->vc_pnum_memseg = seg;
5800 5800 pp = seg->pages + (pfnum - seg->pages_base);
5801 5801 if (pp->p_pagenum == pfnum) {
5802 5802 kpreempt_enable();
5803 5803 return ((page_t *)pp);
5804 5804 }
5805 5805 }
5806 5806
5807 5807 /* Else Brute force */
5808 5808 for (seg = memsegs; seg != NULL; seg = seg->next) {
5809 5809 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5810 5810 vc->vc_pnum_memseg = seg;
5811 5811 pp = seg->pages + (pfnum - seg->pages_base);
5812 5812 if (pp->p_pagenum == pfnum) {
5813 5813 kpreempt_enable();
5814 5814 return ((page_t *)pp);
5815 5815 }
5816 5816 }
5817 5817 }
5818 5818 vc->vc_pnum_memseg = NULL;
5819 5819 kpreempt_enable();
5820 5820 MEMSEG_STAT_INCR(nnotfound);
5821 5821 return ((page_t *)NULL);
5822 5822
5823 5823 }
5824 5824
5825 5825 struct memseg *
5826 5826 page_numtomemseg_nolock(pfn_t pfnum)
5827 5827 {
5828 5828 struct memseg *seg;
5829 5829 page_t *pp;
5830 5830
5831 5831 /*
5832 5832 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5833 5833 * which is being reused by DR, which will flush those references
5834 5834 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5835 5835 */
5836 5836 kpreempt_disable();
5837 5837 /* Try hash */
5838 5838 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5839 5839 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5840 5840 pp = seg->pages + (pfnum - seg->pages_base);
5841 5841 if (pp->p_pagenum == pfnum) {
5842 5842 kpreempt_enable();
5843 5843 return (seg);
5844 5844 }
5845 5845 }
5846 5846
5847 5847 /* Else Brute force */
5848 5848 for (seg = memsegs; seg != NULL; seg = seg->next) {
5849 5849 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5850 5850 pp = seg->pages + (pfnum - seg->pages_base);
5851 5851 if (pp->p_pagenum == pfnum) {
5852 5852 kpreempt_enable();
5853 5853 return (seg);
5854 5854 }
5855 5855 }
5856 5856 }
5857 5857 kpreempt_enable();
5858 5858 return ((struct memseg *)NULL);
5859 5859 }
5860 5860
5861 5861 /*
5862 5862 * Given a page and a count return the page struct that is
5863 5863 * n structs away from the current one in the global page
5864 5864 * list.
5865 5865 *
5866 5866 * This function wraps to the first page upon
5867 5867 * reaching the end of the memseg list.
5868 5868 */
5869 5869 page_t *
5870 5870 page_nextn(page_t *pp, ulong_t n)
5871 5871 {
5872 5872 struct memseg *seg;
5873 5873 page_t *ppn;
5874 5874 vm_cpu_data_t *vc;
5875 5875
5876 5876 /*
5877 5877 * We need to disable kernel preemption while referencing the
5878 5878 * cpu_vm_data field in order to prevent us from being switched to
5879 5879 * another cpu and trying to reference it after it has been freed.
5880 5880 * This will keep us on cpu and prevent it from being removed while
5881 5881 * we are still on it.
5882 5882 *
5883 5883 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5884 5884 * which is being reused by DR, which will flush those references
5885 5885 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5886 5886 */
5887 5887 kpreempt_disable();
5888 5888 vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5889 5889
5890 5890 ASSERT(vc != NULL);
5891 5891
5892 5892 if (((seg = vc->vc_pnext_memseg) == NULL) ||
5893 5893 (seg->pages_base == seg->pages_end) ||
5894 5894 !(pp >= seg->pages && pp < seg->epages)) {
5895 5895
5896 5896 for (seg = memsegs; seg; seg = seg->next) {
5897 5897 if (pp >= seg->pages && pp < seg->epages)
5898 5898 break;
5899 5899 }
5900 5900
5901 5901 if (seg == NULL) {
5902 5902 /* Memory delete got in, return something valid. */
5903 5903 /* TODO: fix me. */
5904 5904 seg = memsegs;
5905 5905 pp = seg->pages;
5906 5906 }
5907 5907 }
5908 5908
5909 5909 /* check for wraparound - possible if n is large */
5910 5910 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5911 5911 n -= seg->epages - pp;
5912 5912 seg = seg->next;
5913 5913 if (seg == NULL)
5914 5914 seg = memsegs;
5915 5915 pp = seg->pages;
5916 5916 }
5917 5917 vc->vc_pnext_memseg = seg;
5918 5918 kpreempt_enable();
5919 5919 return (ppn);
5920 5920 }
5921 5921
5922 5922 /*
5923 5923 * Initialize for a loop using page_next_scan_large().
5924 5924 */
5925 5925 page_t *
5926 5926 page_next_scan_init(void **cookie)
5927 5927 {
5928 5928 ASSERT(cookie != NULL);
5929 5929 *cookie = (void *)memsegs;
5930 5930 return ((page_t *)memsegs->pages);
5931 5931 }
5932 5932
5933 5933 /*
5934 5934 * Return the next page in a scan of page_t's, assuming we want
5935 5935 * to skip over sub-pages within larger page sizes.
5936 5936 *
5937 5937 * The cookie is used to keep track of the current memseg.
5938 5938 */
5939 5939 page_t *
5940 5940 page_next_scan_large(
5941 5941 page_t *pp,
5942 5942 ulong_t *n,
5943 5943 void **cookie)
5944 5944 {
5945 5945 struct memseg *seg = (struct memseg *)*cookie;
5946 5946 page_t *new_pp;
5947 5947 ulong_t cnt;
5948 5948 pfn_t pfn;
5949 5949
5950 5950
5951 5951 /*
5952 5952 * get the count of page_t's to skip based on the page size
5953 5953 */
5954 5954 ASSERT(pp != NULL);
5955 5955 if (pp->p_szc == 0) {
5956 5956 cnt = 1;
5957 5957 } else {
5958 5958 pfn = page_pptonum(pp);
5959 5959 cnt = page_get_pagecnt(pp->p_szc);
5960 5960 cnt -= pfn & (cnt - 1);
5961 5961 }
5962 5962 *n += cnt;
5963 5963 new_pp = pp + cnt;
5964 5964
5965 5965 /*
5966 5966 * Catch if we went past the end of the current memory segment. If so,
5967 5967 * just move to the next segment with pages.
5968 5968 */
5969 5969 if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5970 5970 do {
5971 5971 seg = seg->next;
5972 5972 if (seg == NULL)
5973 5973 seg = memsegs;
5974 5974 } while (seg->pages_base == seg->pages_end);
5975 5975 new_pp = seg->pages;
5976 5976 *cookie = (void *)seg;
5977 5977 }
5978 5978
5979 5979 return (new_pp);
5980 5980 }
5981 5981
5982 5982
5983 5983 /*
5984 5984 * Returns next page in list. Note: this function wraps
5985 5985 * to the first page in the list upon reaching the end
5986 5986 * of the list. Callers should be aware of this fact.
5987 5987 */
5988 5988
5989 5989 /* We should change this to be a #define */
5990 5990
5991 5991 page_t *
5992 5992 page_next(page_t *pp)
5993 5993 {
5994 5994 return (page_nextn(pp, 1));
5995 5995 }
5996 5996
5997 5997 page_t *
5998 5998 page_first()
5999 5999 {
6000 6000 return ((page_t *)memsegs->pages);
6001 6001 }
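/*
 * Illustrative sketch, not part of this file: because page_next() (via
 * page_nextn()) wraps to the first page rather than returning NULL, a
 * walk over every page_t terminates by comparing against the starting
 * page, as in this hypothetical helper.
 */
static void
example_walk_all_pages(void)
{
	page_t *first = page_first();
	page_t *pp = first;

	do {
		/* ... examine *pp here ... */
		pp = page_next(pp);
	} while (pp != first);
}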
6002 6002
6003 6003
6004 6004 /*
6005 6005 * This routine is called at boot with the initial memory configuration
6006 6006 * and when memory is added or removed.
6007 6007 */
6008 6008 void
6009 6009 build_pfn_hash()
6010 6010 {
6011 6011 pfn_t cur;
6012 6012 pgcnt_t index;
6013 6013 struct memseg *pseg;
6014 6014 int i;
6015 6015
6016 6016 /*
6017 6017 * Clear memseg_hash array.
6018 6018 * Since memory add/delete is designed to operate concurrently
6019 6019 * with normal operation, the hash rebuild must be able to run
6020 6020 * concurrently with page_numtopp_nolock(). To support this
6021 6021 * functionality, assignments to memseg_hash array members must
6022 6022 * be done atomically.
6023 6023 *
6024 6024 * NOTE: bzero() does not currently guarantee this for kernel
6025 6025 * threads, and cannot be used here.
6026 6026 */
6027 6027 for (i = 0; i < N_MEM_SLOTS; i++)
6028 6028 memseg_hash[i] = NULL;
6029 6029
6030 6030 hat_kpm_mseghash_clear(N_MEM_SLOTS);
6031 6031
6032 6032 /*
6033 6033 * Physmax is the last valid pfn.
6034 6034 */
6035 6035 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6036 6036 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6037 6037 index = MEMSEG_PFN_HASH(pseg->pages_base);
6038 6038 cur = pseg->pages_base;
6039 6039 do {
6040 6040 if (index >= N_MEM_SLOTS)
6041 6041 index = MEMSEG_PFN_HASH(cur);
6042 6042
6043 6043 if (memseg_hash[index] == NULL ||
6044 6044 memseg_hash[index]->pages_base > pseg->pages_base) {
6045 6045 memseg_hash[index] = pseg;
6046 6046 hat_kpm_mseghash_update(index, pseg);
6047 6047 }
6048 6048 cur += mhash_per_slot;
6049 6049 index++;
6050 6050 } while (cur < pseg->pages_end);
6051 6051 }
6052 6052 }
6053 6053
6054 6054 /*
6055 6055 * Return the pagenum for the pp
6056 6056 */
6057 6057 pfn_t
6058 6058 page_pptonum(page_t *pp)
6059 6059 {
6060 6060 return (pp->p_pagenum);
6061 6061 }
6062 6062
6063 6063 /*
6064 6064 * interface to the referenced and modified etc bits
6065 6065 * in the PSM part of the page struct
6066 6066 * when no locking is desired.
6067 6067 */
6068 6068 void
6069 6069 page_set_props(page_t *pp, uint_t flags)
6070 6070 {
6071 6071 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6072 6072 pp->p_nrm |= (uchar_t)flags;
6073 6073 }
6074 6074
6075 6075 void
6076 6076 page_clr_all_props(page_t *pp)
6077 6077 {
6078 6078 pp->p_nrm = 0;
6079 6079 }
6080 6080
6081 6081 /*
6082 6082 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6083 6083 */
6084 6084 int
6085 6085 page_clear_lck_cow(page_t *pp, int adjust)
6086 6086 {
6087 6087 int f_amount;
6088 6088
6089 6089 ASSERT(PAGE_EXCL(pp));
6090 6090
6091 6091 /*
6092 6092 * The page_struct_lock need not be acquired here since
6093 6093 * we require the caller hold the page exclusively locked.
6094 6094 */
6095 6095 f_amount = 0;
6096 6096 if (pp->p_lckcnt) {
6097 6097 f_amount = 1;
6098 6098 pp->p_lckcnt = 0;
6099 6099 }
6100 6100 if (pp->p_cowcnt) {
6101 6101 f_amount += pp->p_cowcnt;
6102 6102 pp->p_cowcnt = 0;
6103 6103 }
6104 6104
6105 6105 if (adjust && f_amount) {
6106 6106 mutex_enter(&freemem_lock);
6107 6107 availrmem += f_amount;
6108 6108 mutex_exit(&freemem_lock);
6109 6109 }
6110 6110
6111 6111 return (f_amount);
6112 6112 }
6113 6113
6114 6114 /*
6115 6115 * The following function is called from free_vp_pages()
6116 6116 * for an inexact estimate of a newly freed page...
6117 6117 */
6118 6118 ulong_t
6119 6119 page_share_cnt(page_t *pp)
6120 6120 {
6121 6121 return (hat_page_getshare(pp));
6122 6122 }
6123 6123
6124 6124 int
6125 6125 page_isshared(page_t *pp)
6126 6126 {
6127 6127 return (hat_page_checkshare(pp, 1));
6128 6128 }
6129 6129
6130 6130 int
6131 6131 page_isfree(page_t *pp)
6132 6132 {
6133 6133 return (PP_ISFREE(pp));
6134 6134 }
6135 6135
6136 6136 int
6137 6137 page_isref(page_t *pp)
6138 6138 {
6139 6139 return (hat_page_getattr(pp, P_REF));
6140 6140 }
6141 6141
6142 6142 int
6143 6143 page_ismod(page_t *pp)
6144 6144 {
6145 6145 return (hat_page_getattr(pp, P_MOD));
6146 6146 }
6147 6147
6148 6148 /*
6149 6149 * The following code all currently relates to the page capture logic:
6150 6150 *
6151 6151 * This logic is used for cases where there is a desire to claim a certain
6152 6152 * physical page in the system for the caller. As it may not be possible
6153 6153 * to capture the page immediately, the p_toxic bits are used in the page
6154 6154 * structure to indicate that someone wants to capture this page. When the
6155 6155 * page gets unlocked, the toxic flag will be noted and an attempt to capture
6156 6156 * the page will be made. If it is successful, the original caller's callback
6157 6157 * will be called with the page to do with it what they please.
6158 6158 *
6159 6159 * There is also an async thread which wakes up occasionally to attempt to
6160 6160 * capture pages which have the capture bit set. All of the pages which
6161 6161 * need to be captured asynchronously have been inserted into the
6162 6162 * page_capture_hash and thus this thread walks that hash list. Items in the
6163 6163 * hash have an expiration time so this thread handles that as well by removing
6164 6164 * the item from the hash if it has expired.
6165 6165 *
6166 6166 * Some important things to note are:
6167 6167 * - if the PR_CAPTURE bit is set on a page, then the page is in the
6168 6168 * page_capture_hash. The page_capture_hash_head.pchh_mutex is needed
6169 6169 * to set and clear this bit, and entries can only be added to or removed
6170 6170 * from the hash while that lock is held.
6171 6171 * - the PR_CAPTURE bit can only be set and cleared while holding the
6172 6172 * page_capture_hash_head.pchh_mutex
6173 6173 * - the t_flag field of the thread struct is used with the T_CAPTURING
6174 6174 * flag to prevent recursion while dealing with large pages.
6175 6175 * - pages which need to be retired never expire on the page_capture_hash.
6176 6176 */
6177 6177
6178 6178 static void page_capture_thread(void);
6179 6179 static kthread_t *pc_thread_id;
6180 6180 kcondvar_t pc_cv;
6181 6181 static kmutex_t pc_thread_mutex;
6182 6182 static clock_t pc_thread_shortwait;
6183 6183 static clock_t pc_thread_longwait;
6184 6184 static int pc_thread_retry;
6185 6185
6186 6186 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6187 6187
6188 6188 /* Note that this is a circular linked list */
6189 6189 typedef struct page_capture_hash_bucket {
6190 6190 page_t *pp;
6191 6191 uchar_t szc;
6192 6192 uchar_t pri;
6193 6193 uint_t flags;
6194 6194 clock_t expires; /* lbolt at which this request expires. */
6195 6195 void *datap; /* Cached data passed in for callback */
6196 6196 struct page_capture_hash_bucket *next;
6197 6197 struct page_capture_hash_bucket *prev;
6198 6198 } page_capture_hash_bucket_t;
6199 6199
6200 6200 #define PC_PRI_HI 0 /* capture now */
6201 6201 #define PC_PRI_LO 1 /* capture later */
6202 6202 #define PC_NUM_PRI 2
6203 6203
6204 6204 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6205 6205
6206 6206
6207 6207 /*
6208 6208 * Each hash bucket will have its own mutex and two lists, which are:
6209 6209 * active (0): represents requests which have not been processed by
6210 6210 * the page_capture async thread yet.
6211 6211 * walked (1): represents requests which have been processed by the
6212 6212 * page_capture async thread within its given walk of this bucket.
6213 6213 *
6214 6214 * These are all needed so that we can synchronize all async page_capture
6215 6215 * events. When the async thread moves to a new bucket, it will append the
6216 6216 * walked list to the active list and walk each item one at a time, moving it
6217 6217 * from the active list to the walked list. Thus if there is an async request
6218 6218 * outstanding for a given page, it will always be in one of the two lists.
6219 6219 * New requests will always be added to the active list.
6220 6220 * If we were not able to capture a page before the request expired, we'd free
6221 6221 * up the request structure which would indicate to page_capture that there is
6222 6222 * no longer a need for the given page, and clear the PR_CAPTURE flag if
6223 6223 * possible.
6224 6224 */
6225 6225 typedef struct page_capture_hash_head {
6226 6226 kmutex_t pchh_mutex;
6227 6227 uint_t num_pages[PC_NUM_PRI];
6228 6228 page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6229 6229 } page_capture_hash_head_t;
6230 6230
6231 6231 #ifdef DEBUG
6232 6232 #define NUM_PAGE_CAPTURE_BUCKETS 4
6233 6233 #else
6234 6234 #define NUM_PAGE_CAPTURE_BUCKETS 64
6235 6235 #endif
6236 6236
6237 6237 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6238 6238
6239 6239 /* for now use a very simple hash based upon the size of a page struct */
6240 6240 #define PAGE_CAPTURE_HASH(pp) \
6241 6241 ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
6242 6242
6243 6243 extern pgcnt_t swapfs_minfree;
6244 6244
6245 6245 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6246 6246
6247 6247 /*
6248 6248 * a callback function is required for page capture requests.
6249 6249 * A callback function is required for page capture requests.
6250 6250 void
6251 6251 page_capture_register_callback(uint_t index, clock_t duration,
6252 6252 int (*cb_func)(page_t *, void *, uint_t))
6253 6253 {
6254 6254 ASSERT(pc_cb[index].cb_active == 0);
6255 6255 ASSERT(cb_func != NULL);
6256 6256 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6257 6257 pc_cb[index].duration = duration;
6258 6258 pc_cb[index].cb_func = cb_func;
6259 6259 pc_cb[index].cb_active = 1;
6260 6260 rw_exit(&pc_cb[index].cb_rwlock);
6261 6261 }
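/*
 * Illustrative sketch, not part of this file: a hypothetical capture
 * consumer registers its callback for its reserved callback index
 * (duration is in lbolt ticks, or -1 for no expiration) and then asks
 * for individual pages by passing its index bit in the flags.
 * MY_CB_INDEX and the duration are made-up placeholders; the callback
 * receives the page EXCL locked and cleaned and is responsible for it
 * from then on (the common failure-path action is just page_free()).
 */
static int
example_capture_cb(page_t *pp, void *datap, uint_t flags)
{
	/* consume the captured page; this sketch simply frees it */
	page_free(pp, 1);
	return (0);
}

static void
example_capture_setup(page_t *pp)
{
	page_capture_register_callback(MY_CB_INDEX, 60 * hz,
	    example_capture_cb);
	(void) page_trycapture(pp, 0, (1 << MY_CB_INDEX), NULL);
}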
6262 6262
6263 6263 void
6264 6264 page_capture_unregister_callback(uint_t index)
6265 6265 {
6266 6266 int i, j;
6267 6267 struct page_capture_hash_bucket *bp1;
6268 6268 struct page_capture_hash_bucket *bp2;
6269 6269 struct page_capture_hash_bucket *head = NULL;
6270 6270 uint_t flags = (1 << index);
6271 6271
6272 6272 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6273 6273 ASSERT(pc_cb[index].cb_active == 1);
6274 6274 pc_cb[index].duration = 0; /* Paranoia */
6275 6275 pc_cb[index].cb_func = NULL; /* Paranoia */
6276 6276 pc_cb[index].cb_active = 0;
6277 6277 rw_exit(&pc_cb[index].cb_rwlock);
6278 6278
6279 6279 /*
6280 6280 * Just move all the entries to a private list which we can walk
6281 6281 * through without the need to hold any locks.
6282 6282 * No more requests can get added to the hash lists for this consumer
6283 6283 * as the cb_active field for the callback has been cleared.
6284 6284 */
6285 6285 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6286 6286 mutex_enter(&page_capture_hash[i].pchh_mutex);
6287 6287 for (j = 0; j < 2; j++) {
6288 6288 bp1 = page_capture_hash[i].lists[j].next;
6289 6289 /* walk through all but first (sentinel) element */
6290 6290 while (bp1 != &page_capture_hash[i].lists[j]) {
6291 6291 bp2 = bp1;
6292 6292 if (bp2->flags & flags) {
6293 6293 bp1 = bp2->next;
6294 6294 bp1->prev = bp2->prev;
6295 6295 bp2->prev->next = bp1;
6296 6296 bp2->next = head;
6297 6297 head = bp2;
6298 6298 /*
6299 6299 * Clear the PR_CAPTURE bit as we
6300 6300 * hold appropriate locks here.
6301 6301 */
6302 6302 page_clrtoxic(head->pp, PR_CAPTURE);
6303 6303 page_capture_hash[i].
6304 6304 num_pages[bp2->pri]--;
6305 6305 continue;
6306 6306 }
6307 6307 bp1 = bp1->next;
6308 6308 }
6309 6309 }
6310 6310 mutex_exit(&page_capture_hash[i].pchh_mutex);
6311 6311 }
6312 6312
6313 6313 while (head != NULL) {
6314 6314 bp1 = head;
6315 6315 head = head->next;
6316 6316 kmem_free(bp1, sizeof (*bp1));
6317 6317 }
6318 6318 }
6319 6319
6320 6320
6321 6321 /*
6322 6322 * Find pp in the active list and move it to the walked list if it
6323 6323 * exists.
6324 6324 * Note that most often pp should be at the front of the active list
6325 6325 * as it is currently used and thus there is no other sort of optimization
6326 6326 * being done here as this is a linked list data structure.
6327 6327 * Returns 1 on successful move or 0 if page could not be found.
6328 6328 */
6329 6329 static int
6330 6330 page_capture_move_to_walked(page_t *pp)
6331 6331 {
6332 6332 page_capture_hash_bucket_t *bp;
6333 6333 int index;
6334 6334
6335 6335 index = PAGE_CAPTURE_HASH(pp);
6336 6336
6337 6337 mutex_enter(&page_capture_hash[index].pchh_mutex);
6338 6338 bp = page_capture_hash[index].lists[0].next;
6339 6339 while (bp != &page_capture_hash[index].lists[0]) {
6340 6340 if (bp->pp == pp) {
6341 6341 /* Remove from old list */
6342 6342 bp->next->prev = bp->prev;
6343 6343 bp->prev->next = bp->next;
6344 6344
6345 6345 /* Add to new list */
6346 6346 bp->next = page_capture_hash[index].lists[1].next;
6347 6347 bp->prev = &page_capture_hash[index].lists[1];
6348 6348 page_capture_hash[index].lists[1].next = bp;
6349 6349 bp->next->prev = bp;
6350 6350
6351 6351 /*
6352 6352 * There is a small probability of a page on a free
6353 6353 * list being retired while being allocated
6354 6354 * and before P_RAF is set on it. The page may
6355 6355 * end up marked as a high priority request instead
6356 6356 * of a low priority request.
6357 6357 * If a P_RAF page is not marked as a low priority request,
6358 6358 * change it to a low priority request.
6359 6359 */
6360 6360 page_capture_hash[index].num_pages[bp->pri]--;
6361 6361 bp->pri = PAGE_CAPTURE_PRIO(pp);
6362 6362 page_capture_hash[index].num_pages[bp->pri]++;
6363 6363 mutex_exit(&page_capture_hash[index].pchh_mutex);
6364 6364 return (1);
6365 6365 }
6366 6366 bp = bp->next;
6367 6367 }
6368 6368 mutex_exit(&page_capture_hash[index].pchh_mutex);
6369 6369 return (0);
6370 6370 }
6371 6371
6372 6372 /*
6373 6373 * Add a new entry to the page capture hash. The only case where a new
6374 6374 * entry is not added is when the page capture consumer is no longer registered.
6375 6375 * In this case, we'll silently not add the page to the hash. We know that
6376 6376 * page retire will always be registered for the case where we are currently
6377 6377 * unretiring a page and thus there are no conflicts.
6378 6378 */
6379 6379 static void
6380 6380 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6381 6381 {
6382 6382 page_capture_hash_bucket_t *bp1;
6383 6383 page_capture_hash_bucket_t *bp2;
6384 6384 int index;
6385 6385 int cb_index;
6386 6386 int i;
6387 6387 uchar_t pri;
6388 6388 #ifdef DEBUG
6389 6389 page_capture_hash_bucket_t *tp1;
6390 6390 int l;
6391 6391 #endif
6392 6392
6393 6393 ASSERT(!(flags & CAPTURE_ASYNC));
6394 6394
6395 6395 bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6396 6396
6397 6397 bp1->pp = pp;
6398 6398 bp1->szc = szc;
6399 6399 bp1->flags = flags;
6400 6400 bp1->datap = datap;
6401 6401
6402 6402 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6403 6403 if ((flags >> cb_index) & 1) {
6404 6404 break;
6405 6405 }
6406 6406 }
6407 6407
6408 6408 ASSERT(cb_index != PC_NUM_CALLBACKS);
6409 6409
6410 6410 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6411 6411 if (pc_cb[cb_index].cb_active) {
6412 6412 if (pc_cb[cb_index].duration == -1) {
6413 6413 bp1->expires = (clock_t)-1;
6414 6414 } else {
6415 6415 bp1->expires = ddi_get_lbolt() +
6416 6416 pc_cb[cb_index].duration;
6417 6417 }
6418 6418 } else {
6419 6419 /* There's no callback registered so don't add to the hash */
6420 6420 rw_exit(&pc_cb[cb_index].cb_rwlock);
6421 6421 kmem_free(bp1, sizeof (*bp1));
6422 6422 return;
6423 6423 }
6424 6424
6425 6425 index = PAGE_CAPTURE_HASH(pp);
6426 6426
6427 6427 /*
6428 6428 * Only allow capture flag to be modified under this mutex.
6429 6429 * Prevents multiple entries for same page getting added.
6430 6430 */
6431 6431 mutex_enter(&page_capture_hash[index].pchh_mutex);
6432 6432
6433 6433 /*
6434 6434 * if not already on the hash, set capture bit and add to the hash
6435 6435 */
6436 6436 if (!(pp->p_toxic & PR_CAPTURE)) {
6437 6437 #ifdef DEBUG
6438 6438 /* Check for duplicate entries */
6439 6439 for (l = 0; l < 2; l++) {
6440 6440 tp1 = page_capture_hash[index].lists[l].next;
6441 6441 while (tp1 != &page_capture_hash[index].lists[l]) {
6442 6442 if (tp1->pp == pp) {
6443 6443 panic("page pp 0x%p already on hash "
6444 6444 "at 0x%p\n",
6445 6445 (void *)pp, (void *)tp1);
6446 6446 }
6447 6447 tp1 = tp1->next;
6448 6448 }
6449 6449 }
6450 6450
6451 6451 #endif
6452 6452 page_settoxic(pp, PR_CAPTURE);
6453 6453 pri = PAGE_CAPTURE_PRIO(pp);
6454 6454 bp1->pri = pri;
6455 6455 bp1->next = page_capture_hash[index].lists[0].next;
6456 6456 bp1->prev = &page_capture_hash[index].lists[0];
6457 6457 bp1->next->prev = bp1;
6458 6458 page_capture_hash[index].lists[0].next = bp1;
6459 6459 page_capture_hash[index].num_pages[pri]++;
6460 6460 if (flags & CAPTURE_RETIRE) {
6461 6461 page_retire_incr_pend_count(datap);
6462 6462 }
6463 6463 mutex_exit(&page_capture_hash[index].pchh_mutex);
6464 6464 rw_exit(&pc_cb[cb_index].cb_rwlock);
6465 6465 cv_signal(&pc_cv);
6466 6466 return;
6467 6467 }
6468 6468
6469 6469 /*
6470 6470 * A page retire request will replace any other request.
6471 6471 * A second physmem request which is for a different process than
6472 6472 * the currently registered one will be dropped as there is
6473 6473 * no way to hold the private data for both calls.
6474 6474 * In the future, once there are more callers, this will have to
6475 6475 * be worked out better as there needs to be private storage for
6476 6476 * at least each type of caller (maybe have datap be an array of
6477 6477 * void *'s so that we can index based upon the caller's index).
6478 6478 */
6479 6479
6480 6480 /* walk hash list to update expire time */
6481 6481 for (i = 0; i < 2; i++) {
6482 6482 bp2 = page_capture_hash[index].lists[i].next;
6483 6483 while (bp2 != &page_capture_hash[index].lists[i]) {
6484 6484 if (bp2->pp == pp) {
6485 6485 if (flags & CAPTURE_RETIRE) {
6486 6486 if (!(bp2->flags & CAPTURE_RETIRE)) {
6487 6487 page_retire_incr_pend_count(
6488 6488 datap);
6489 6489 bp2->flags = flags;
6490 6490 bp2->expires = bp1->expires;
6491 6491 bp2->datap = datap;
6492 6492 }
6493 6493 } else {
6494 6494 ASSERT(flags & CAPTURE_PHYSMEM);
6495 6495 if (!(bp2->flags & CAPTURE_RETIRE) &&
6496 6496 (datap == bp2->datap)) {
6497 6497 bp2->expires = bp1->expires;
6498 6498 }
6499 6499 }
6500 6500 mutex_exit(&page_capture_hash[index].
6501 6501 pchh_mutex);
6502 6502 rw_exit(&pc_cb[cb_index].cb_rwlock);
6503 6503 kmem_free(bp1, sizeof (*bp1));
6504 6504 return;
6505 6505 }
6506 6506 bp2 = bp2->next;
6507 6507 }
6508 6508 }
6509 6509
6510 6510 /*
6511 6511 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6512 6512 * and thus it either has to be set or not set and can't change
6513 6513 * while holding the mutex above.
6514 6514 */
6515 6515 panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6516 6516 (void *)pp);
6517 6517 }
6518 6518
6519 6519 /*
6520 6520 * We have a page in our hands, let's try to make it ours by turning
6521 6521 * it into a clean page like it had just come off the freelists.
6522 6522 *
6523 6523 * Returns 0 on success, with the page still EXCL locked.
6524 6524 * On failure, the page will be unlocked and EAGAIN is returned.
6525 6525 */
6526 6526 static int
6527 6527 page_capture_clean_page(page_t *pp)
6528 6528 {
6529 6529 page_t *newpp;
6530 6530 int skip_unlock = 0;
6531 6531 spgcnt_t count;
6532 6532 page_t *tpp;
6533 6533 int ret = 0;
6534 6534 int extra;
6535 6535
6536 6536 ASSERT(PAGE_EXCL(pp));
6537 6537 ASSERT(!PP_RETIRED(pp));
6538 6538 ASSERT(curthread->t_flag & T_CAPTURING);
6539 6539
6540 6540 if (PP_ISFREE(pp)) {
6541 6541 if (!page_reclaim(pp, NULL)) {
6542 6542 skip_unlock = 1;
6543 6543 ret = EAGAIN;
6544 6544 goto cleanup;
6545 6545 }
6546 6546 ASSERT(pp->p_szc == 0);
6547 6547 if (pp->p_vnode != NULL) {
6548 6548 /*
6549 6549 * Since this page came from the
6550 6550 * cachelist, we must destroy the
6551 6551 * old vnode association.
6552 6552 */
6553 6553 page_hashout(pp, NULL);
6554 6554 }
6555 6555 goto cleanup;
6556 6556 }
6557 6557
6558 6558 /*
6559 6559 * If we know page_relocate will fail, skip it
6560 6560 * It could still fail due to a UE on another page but we
6561 6561 * can't do anything about that.
6562 6562 */
6563 6563 if (pp->p_toxic & PR_UE) {
6564 6564 goto skip_relocate;
6565 6565 }
6566 6566
6567 6567 /*
6568 6568 * It's possible that pages cannot have a vnode, as fsflush comes
6569 6569 * through and cleans up these pages. It's ugly but that's how it is.
6570 6570 */
6571 6571 if (pp->p_vnode == NULL) {
6572 6572 goto skip_relocate;
6573 6573 }
6574 6574
6575 6575 /*
6576 6576 * Page was not free, so let's try to relocate it.
6577 6577 * page_relocate only works with root pages, so if this is not a root
6578 6578 * page, we need to demote it to try and relocate it.
6579 6579 * page, we need to demote it to try to relocate it.
6580 6580 */
6581 6581 newpp = NULL;
6582 6582 if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6583 6583 if (page_try_demote_pages(pp) == 0) {
6584 6584 ret = EAGAIN;
6585 6585 goto cleanup;
6586 6586 }
6587 6587 }
6588 6588 ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6589 6589 if (ret == 0) {
6590 6590 page_t *npp;
6591 6591 /* unlock the new page(s) */
6592 6592 while (count-- > 0) {
6593 6593 ASSERT(newpp != NULL);
6594 6594 npp = newpp;
6595 6595 page_sub(&newpp, npp);
6596 6596 page_unlock(npp);
6597 6597 }
6598 6598 ASSERT(newpp == NULL);
6599 6599 /*
6600 6600 * Check to see if the page we have is too large.
6601 6601 * If so, demote it, freeing up the extra pages.
6602 6602 */
6603 6603 if (pp->p_szc > 0) {
6604 6604 /* For now demote extra pages to szc == 0 */
6605 6605 extra = page_get_pagecnt(pp->p_szc) - 1;
6606 6606 while (extra > 0) {
6607 6607 tpp = pp->p_next;
6608 6608 page_sub(&pp, tpp);
6609 6609 tpp->p_szc = 0;
6610 6610 page_free(tpp, 1);
6611 6611 extra--;
6612 6612 }
6613 6613 /* Make sure to set our page to szc 0 as well */
6614 6614 ASSERT(pp->p_next == pp && pp->p_prev == pp);
6615 6615 pp->p_szc = 0;
6616 6616 }
6617 6617 goto cleanup;
6618 6618 } else if (ret == EIO) {
6619 6619 ret = EAGAIN;
6620 6620 goto cleanup;
6621 6621 } else {
6622 6622 /*
6623 6623 * Need to reset return type as we failed to relocate the page
6624 6624 * but that does not mean that some of the next steps will not
6625 6625 * work.
6626 6626 */
6627 6627 ret = 0;
6628 6628 }
6629 6629
6630 6630 skip_relocate:
6631 6631
6632 6632 if (pp->p_szc > 0) {
6633 6633 if (page_try_demote_pages(pp) == 0) {
6634 6634 ret = EAGAIN;
6635 6635 goto cleanup;
6636 6636 }
6637 6637 }
6638 6638
6639 6639 ASSERT(pp->p_szc == 0);
6640 6640
6641 6641 if (hat_ismod(pp)) {
6642 6642 ret = EAGAIN;
6643 6643 goto cleanup;
6644 6644 }
6645 6645 if (PP_ISKAS(pp)) {
6646 6646 ret = EAGAIN;
6647 6647 goto cleanup;
6648 6648 }
6649 6649 if (pp->p_lckcnt || pp->p_cowcnt) {
6650 6650 ret = EAGAIN;
6651 6651 goto cleanup;
6652 6652 }
6653 6653
6654 6654 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6655 6655 ASSERT(!hat_page_is_mapped(pp));
6656 6656
6657 6657 if (hat_ismod(pp)) {
6658 6658 /*
6659 6659 * This is a semi-odd case as the page is now modified but not
6660 6660 * mapped as we just unloaded the mappings above.
6661 6661 */
6662 6662 ret = EAGAIN;
6663 6663 goto cleanup;
6664 6664 }
6665 6665 if (pp->p_vnode != NULL) {
6666 6666 page_hashout(pp, NULL);
6667 6667 }
6668 6668
6669 6669 /*
6670 6670 * At this point, the page should be in a clean state and
6671 6671 * we can do whatever we want with it.
6672 6672 */
6673 6673
6674 6674 cleanup:
6675 6675 if (ret != 0) {
6676 6676 if (!skip_unlock) {
6677 6677 page_unlock(pp);
6678 6678 }
6679 6679 } else {
6680 6680 ASSERT(pp->p_szc == 0);
6681 6681 ASSERT(PAGE_EXCL(pp));
6682 6682
6683 6683 pp->p_next = pp;
6684 6684 pp->p_prev = pp;
6685 6685 }
6686 6686 return (ret);
6687 6687 }
6688 6688
6689 6689 /*
6690 6690 * Various callers of page_trycapture() can have different restrictions upon
6691 6691 * what memory they have access to.
6692 6692 * Returns 0 on success, with the following error codes on failure:
6693 6693 * EPERM - The requested page is long term locked, and thus repeated
6694 6694 * requests to capture this page will likely fail.
6695 6695 * ENOMEM - There was not enough free memory in the system to safely
6696 6696 * map the requested page.
6697 6697 * ENOENT - The requested page was inside the kernel cage, and the
6698 6698 * PHYSMEM_CAGE flag was not set.
6699 6699 */
6700 6700 int
6701 6701 page_capture_pre_checks(page_t *pp, uint_t flags)
6702 6702 {
6703 6703 ASSERT(pp != NULL);
6704 6704
6705 6705 #if defined(__sparc)
6706 6706 if (pp->p_vnode == &promvp) {
6707 6707 return (EPERM);
6708 6708 }
6709 6709
6710 6710 if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6711 6711 (flags & CAPTURE_PHYSMEM)) {
6712 6712 return (ENOENT);
6713 6713 }
6714 6714
6715 6715 if (PP_ISNORELOCKERNEL(pp)) {
6716 6716 return (EPERM);
6717 6717 }
6718 6718 #else
6719 6719 if (PP_ISKAS(pp)) {
6720 6720 return (EPERM);
6721 6721 }
6722 6722 #endif /* __sparc */
6723 6723
6724 6724 /* only physmem currently has the restrictions checked below */
6725 6725 if (!(flags & CAPTURE_PHYSMEM)) {
6726 6726 return (0);
6727 6727 }
6728 6728
6729 6729 if (availrmem < swapfs_minfree) {
6730 6730 /*
6731 6731 * We won't try to capture this page as we are
6732 6732 * running low on memory.
6733 6733 */
6734 6734 return (ENOMEM);
6735 6735 }
6736 6736 return (0);
6737 6737 }
6738 6738
6739 6739 /*
6740 6740 * Once we have a page in our mitts, go ahead and complete the capture
6741 6741 * operation.
6742 6742 * Returns 1 on failure where page is no longer needed
6743 6743 * Returns 0 on success
6744 6744 * Returns -1 if there was a transient failure.
6745 6745 * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6746 6746 */
6747 6747 int
6748 6748 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6749 6749 {
6750 6750 int cb_index;
6751 6751 int ret = 0;
6752 6752 page_capture_hash_bucket_t *bp1;
6753 6753 page_capture_hash_bucket_t *bp2;
6754 6754 int index;
6755 6755 int found = 0;
6756 6756 int i;
6757 6757
6758 6758 ASSERT(PAGE_EXCL(pp));
6759 6759 ASSERT(curthread->t_flag & T_CAPTURING);
6760 6760
6761 6761 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6762 6762 if ((flags >> cb_index) & 1) {
6763 6763 break;
6764 6764 }
6765 6765 }
6766 6766 ASSERT(cb_index < PC_NUM_CALLBACKS);
6767 6767
6768 6768 /*
6769 6769 * Remove the entry from the page_capture hash, but don't free it yet
6770 6770 * as we may need to put it back.
6771 6771 * Since we own the page at this point in time, we should find it
6772 6772 * in the hash if this is an ASYNC call. If we don't it's likely
6773 6773 * that the page_capture_async() thread decided that this request
6774 6774 * had expired, in which case we just continue on.
6775 6775 */
6776 6776 if (flags & CAPTURE_ASYNC) {
6777 6777
6778 6778 index = PAGE_CAPTURE_HASH(pp);
6779 6779
6780 6780 mutex_enter(&page_capture_hash[index].pchh_mutex);
6781 6781 for (i = 0; i < 2 && !found; i++) {
6782 6782 bp1 = page_capture_hash[index].lists[i].next;
6783 6783 while (bp1 != &page_capture_hash[index].lists[i]) {
6784 6784 if (bp1->pp == pp) {
6785 6785 bp1->next->prev = bp1->prev;
6786 6786 bp1->prev->next = bp1->next;
6787 6787 page_capture_hash[index].
6788 6788 num_pages[bp1->pri]--;
6789 6789 page_clrtoxic(pp, PR_CAPTURE);
6790 6790 found = 1;
6791 6791 break;
6792 6792 }
6793 6793 bp1 = bp1->next;
6794 6794 }
6795 6795 }
6796 6796 mutex_exit(&page_capture_hash[index].pchh_mutex);
6797 6797 }
6798 6798
6799 6799 /* Synchronize with the unregister func. */
6800 6800 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6801 6801 if (!pc_cb[cb_index].cb_active) {
6802 6802 page_free(pp, 1);
6803 6803 rw_exit(&pc_cb[cb_index].cb_rwlock);
6804 6804 if (found) {
6805 6805 kmem_free(bp1, sizeof (*bp1));
6806 6806 }
6807 6807 return (1);
6808 6808 }
6809 6809
6810 6810 /*
6811 6811 * We need to remove the entry from the page capture hash and turn off
6812 6812 * the PR_CAPTURE bit before calling the callback. We'll need to cache
6813 6813 * the entry here, and then based upon the return value, clean up
6814 6814 * appropriately or re-add it to the hash, making sure that someone else
6815 6815 * hasn't already done so.
6816 6816 * It should be rare for the callback to fail and thus it's ok for
6817 6817 * the failure path to be a bit complicated as the success path is
6818 6818 * cleaner and the locking rules are easier to follow.
6819 6819 */
6820 6820
6821 6821 ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6822 6822
6823 6823 rw_exit(&pc_cb[cb_index].cb_rwlock);
6824 6824
6825 6825 /*
6826 6826 * If this was an ASYNC request, we need to cleanup the hash if the
6827 6827 * callback was successful or if the request was no longer valid.
6828 6828 * For non-ASYNC requests, we return failure to map and the caller
6829 6829 * will take care of adding the request to the hash.
6830 6830 * Note also that the callback itself is responsible for the page
6831 6831 * at this point in time in terms of locking ... The most common
6832 6832 * case for the failure path should just be a page_free.
6833 6833 */
6834 6834 if (ret >= 0) {
6835 6835 if (found) {
6836 6836 if (bp1->flags & CAPTURE_RETIRE) {
6837 6837 page_retire_decr_pend_count(datap);
6838 6838 }
6839 6839 kmem_free(bp1, sizeof (*bp1));
6840 6840 }
6841 6841 return (ret);
6842 6842 }
6843 6843 if (!found) {
6844 6844 return (ret);
6845 6845 }
6846 6846
6847 6847 ASSERT(flags & CAPTURE_ASYNC);
6848 6848
6849 6849 /*
6850 6850 * Check for expiration time first as we can just free it up if it's
6851 6851 * expired.
6852 6852 */
6853 6853 if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6854 6854 kmem_free(bp1, sizeof (*bp1));
6855 6855 return (ret);
6856 6856 }
6857 6857
6858 6858 /*
6859 6859 * The callback failed and there used to be an entry in the hash for
6860 6860 * this page, so we need to add it back to the hash.
6861 6861 */
6862 6862 mutex_enter(&page_capture_hash[index].pchh_mutex);
6863 6863 if (!(pp->p_toxic & PR_CAPTURE)) {
6864 6864 /* just add bp1 back to head of walked list */
6865 6865 page_settoxic(pp, PR_CAPTURE);
6866 6866 bp1->next = page_capture_hash[index].lists[1].next;
6867 6867 bp1->prev = &page_capture_hash[index].lists[1];
6868 6868 bp1->next->prev = bp1;
6869 6869 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6870 6870 page_capture_hash[index].lists[1].next = bp1;
6871 6871 page_capture_hash[index].num_pages[bp1->pri]++;
6872 6872 mutex_exit(&page_capture_hash[index].pchh_mutex);
6873 6873 return (ret);
6874 6874 }
6875 6875
6876 6876 /*
6877 6877 * Otherwise there was a new capture request added to list
6878 6878 * Need to make sure that our original data is represented if
6879 6879 * appropriate.
6880 6880 */
6881 6881 for (i = 0; i < 2; i++) {
6882 6882 bp2 = page_capture_hash[index].lists[i].next;
6883 6883 while (bp2 != &page_capture_hash[index].lists[i]) {
6884 6884 if (bp2->pp == pp) {
6885 6885 if (bp1->flags & CAPTURE_RETIRE) {
6886 6886 if (!(bp2->flags & CAPTURE_RETIRE)) {
6887 6887 bp2->szc = bp1->szc;
6888 6888 bp2->flags = bp1->flags;
6889 6889 bp2->expires = bp1->expires;
6890 6890 bp2->datap = bp1->datap;
6891 6891 }
6892 6892 } else {
6893 6893 ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6894 6894 if (!(bp2->flags & CAPTURE_RETIRE)) {
6895 6895 bp2->szc = bp1->szc;
6896 6896 bp2->flags = bp1->flags;
6897 6897 bp2->expires = bp1->expires;
6898 6898 bp2->datap = bp1->datap;
6899 6899 }
6900 6900 }
6901 6901 page_capture_hash[index].num_pages[bp2->pri]--;
6902 6902 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6903 6903 page_capture_hash[index].num_pages[bp2->pri]++;
6904 6904 mutex_exit(&page_capture_hash[index].
6905 6905 pchh_mutex);
6906 6906 kmem_free(bp1, sizeof (*bp1));
6907 6907 return (ret);
6908 6908 }
6909 6909 bp2 = bp2->next;
6910 6910 }
6911 6911 }
6912 6912 panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6913 6913 /*NOTREACHED*/
6914 6914 }
6915 6915
6916 6916 /*
6917 6917 * Try to capture the given page for the caller specified in the flags
6918 6918 * parameter. The page will either be captured and handed over to the
6919 6919 * appropriate callback, or will be queued up in the page capture hash
6920 6920 * to be captured asynchronously.
6921 6921 * If the current request is due to an async capture, the page must be
6922 6922 * exclusively locked before calling this function.
6923 6923 * Currently szc must be 0 but in the future this should be expandable to
6924 6924 * other page sizes.
6925 6925 * Returns 0 on success, with the following error codes on failure:
6926 6926 * EPERM - The requested page is long term locked, and thus repeated
6927 6927 * requests to capture this page will likely fail.
6928 6928 * ENOMEM - There was not enough free memory in the system to safely
6929 6929 * map the requested page.
6930 6930 * ENOENT - The requested page was inside the kernel cage, and the
6931 6931 * CAPTURE_GET_CAGE flag was not set.
6932 6932 * EAGAIN - The requested page could not be captured at this point in
6933 6933 * time but future requests will likely work.
6934 6934 * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6935 6935 * was not set.
6936 6936 */
6937 6937 int
6938 6938 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6939 6939 {
6940 6940 int ret;
6941 6941 int cb_index;
6942 6942
6943 6943 if (flags & CAPTURE_ASYNC) {
6944 6944 ASSERT(PAGE_EXCL(pp));
6945 6945 goto async;
6946 6946 }
6947 6947
6948 6948 /* Make sure there's enough availrmem ... */
6949 6949 ret = page_capture_pre_checks(pp, flags);
6950 6950 if (ret != 0) {
6951 6951 return (ret);
6952 6952 }
6953 6953
6954 6954 if (!page_trylock(pp, SE_EXCL)) {
6955 6955 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6956 6956 if ((flags >> cb_index) & 1) {
6957 6957 break;
6958 6958 }
6959 6959 }
6960 6960 ASSERT(cb_index < PC_NUM_CALLBACKS);
6961 6961 ret = EAGAIN;
6962 6962 /* Special case for retired pages */
6963 6963 if (PP_RETIRED(pp)) {
6964 6964 if (flags & CAPTURE_GET_RETIRED) {
6965 6965 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6966 6966 /*
6967 6967 * Need to set capture bit and add to
6968 6968 * hash so that the page will be
6969 6969 * retired when freed.
6970 6970 */
6971 6971 page_capture_add_hash(pp, szc,
6972 6972 CAPTURE_RETIRE, NULL);
6973 6973 ret = 0;
6974 6974 goto own_page;
6975 6975 }
6976 6976 } else {
6977 6977 return (EBUSY);
6978 6978 }
6979 6979 }
6980 6980 page_capture_add_hash(pp, szc, flags, datap);
6981 6981 return (ret);
6982 6982 }
6983 6983
6984 6984 async:
6985 6985 ASSERT(PAGE_EXCL(pp));
6986 6986
6987 6987 /* For physmem async requests, check that availrmem is sane */
6988 6988 if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6989 6989 (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6990 6990 (availrmem < swapfs_minfree)) {
6991 6991 page_unlock(pp);
6992 6992 return (ENOMEM);
6993 6993 }
6994 6994
6995 6995 ret = page_capture_clean_page(pp);
6996 6996
6997 6997 if (ret != 0) {
6998 6998 /* We failed to get the page, so let's add it to the hash */
6999 6999 if (!(flags & CAPTURE_ASYNC)) {
7000 7000 page_capture_add_hash(pp, szc, flags, datap);
7001 7001 }
7002 7002 return (ret);
7003 7003 }
7004 7004
7005 7005 own_page:
7006 7006 ASSERT(PAGE_EXCL(pp));
7007 7007 ASSERT(pp->p_szc == 0);
7008 7008
7009 7009 /* Call the callback */
7010 7010 ret = page_capture_take_action(pp, flags, datap);
7011 7011
7012 7012 if (ret == 0) {
7013 7013 return (0);
7014 7014 }
7015 7015
7016 7016 /*
7017 7017 * Note that in the failure cases from page_capture_take_action, the
7018 7018 * EXCL lock will have already been dropped.
7019 7019 */
7020 7020 if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7021 7021 page_capture_add_hash(pp, szc, flags, datap);
7022 7022 }
7023 7023 return (EAGAIN);
7024 7024 }
7025 7025
7026 7026 int
7027 7027 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7028 7028 {
7029 7029 int ret;
7030 7030
7031 7031 curthread->t_flag |= T_CAPTURING;
7032 7032 ret = page_itrycapture(pp, szc, flags, datap);
7033 7033 curthread->t_flag &= ~T_CAPTURING; /* clear the flag; we know we set it above */
7034 7034 return (ret);
7035 7035 }
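
For review context, here is a minimal sketch of how a synchronous caller might drive this interface. page_trycapture() and the CAPTURE_RETIRE flag come from this file; the wrapper function below is hypothetical and is not part of this change.

/*
 * Illustration only (not part of this webrev): ask to retire a single
 * small page.  Per the contract documented above, EAGAIN typically means
 * the request has been queued in the capture hash and will be retried
 * asynchronously, so a caller may treat it as "in progress".
 */
static int
example_retire_page(page_t *pp)
{
	int err;

	/* szc must currently be 0; retire requests pass a NULL datap */
	err = page_trycapture(pp, 0, CAPTURE_RETIRE, NULL);
	if (err == 0 || err == EAGAIN)
		return (0);
	return (err);	/* EPERM, ENOMEM, ENOENT or EBUSY */
}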
7036 7036
7037 7037 /*
7038 7038 * When unlocking a page which has the PR_CAPTURE bit set, this routine
7039 7039 * gets called to try and capture the page.
7040 7040 */
7041 7041 void
7042 7042 page_unlock_capture(page_t *pp)
7043 7043 {
7044 7044 page_capture_hash_bucket_t *bp;
7045 7045 int index;
7046 7046 int i;
7047 7047 uint_t szc;
7048 7048 uint_t flags = 0;
7049 7049 void *datap;
7050 7050 kmutex_t *mp;
7051 7051 extern vnode_t retired_pages;
7052 7052
7053 7053 /*
7054 7054 * We need to protect against a possible deadlock here where we own
7055 7055 * the vnode page hash mutex and would try to acquire it again: there
7056 7056 * are places in the code where we unlock a page while holding the
7057 7057 * mutex, which can lead to the page being captured and eventually
7058 7058 * ending up here. As we may be hashing out the old page and hashing
7059 7059 * into the retire vnode, we need to make sure we own neither mutex.
7060 7060 * Other callbacks that do hash operations also need to make sure,
7061 7061 * before they hashin to a vnode, that they do not currently own that
7062 7062 * vnode's vphm mutex; otherwise there will be a panic.
7063 7063 */
7064 7064 if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7065 7065 page_unlock_nocapture(pp);
7066 7066 return;
7067 7067 }
7068 7068 if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7069 7069 page_unlock_nocapture(pp);
7070 7070 return;
7071 7071 }
7072 7072
7073 7073 index = PAGE_CAPTURE_HASH(pp);
7074 7074
7075 7075 mp = &page_capture_hash[index].pchh_mutex;
7076 7076 mutex_enter(mp);
7077 7077 for (i = 0; i < 2; i++) {
7078 7078 bp = page_capture_hash[index].lists[i].next;
7079 7079 while (bp != &page_capture_hash[index].lists[i]) {
7080 7080 if (bp->pp == pp) {
7081 7081 szc = bp->szc;
7082 7082 flags = bp->flags | CAPTURE_ASYNC;
7083 7083 datap = bp->datap;
7084 7084 mutex_exit(mp);
7085 7085 (void) page_trycapture(pp, szc, flags, datap);
7086 7086 return;
7087 7087 }
7088 7088 bp = bp->next;
7089 7089 }
7090 7090 }
7091 7091
7092 7092 /* Failed to find page in hash so clear flags and unlock it. */
7093 7093 page_clrtoxic(pp, PR_CAPTURE);
7094 7094 page_unlock(pp);
7095 7095
7096 7096 mutex_exit(mp);
7097 7097 }
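
To make the callback requirement above concrete, here is a tiny guard a capture callback could apply before hashing a page into a vnode. The helper name is invented for illustration; only mutex_owned() and page_vnode_mutex(), both used above, are assumed.

/*
 * Illustration only: per the comment in page_unlock_capture(), a
 * callback must not hashin to a vnode whose page hash mutex it
 * already owns, or the hashin path will panic.
 */
static int
example_safe_to_hashin(vnode_t *vp)
{
	return (!mutex_owned(page_vnode_mutex(vp)));
}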
7098 7098
7099 7099 void
7100 7100 page_capture_init()
7101 7101 {
7102 7102 int i;
7103 7103 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7104 7104 page_capture_hash[i].lists[0].next =
7105 7105 &page_capture_hash[i].lists[0];
7106 7106 page_capture_hash[i].lists[0].prev =
7107 7107 &page_capture_hash[i].lists[0];
7108 7108 page_capture_hash[i].lists[1].next =
7109 7109 &page_capture_hash[i].lists[1];
7110 7110 page_capture_hash[i].lists[1].prev =
7111 7111 &page_capture_hash[i].lists[1];
7112 7112 }
7113 7113
7114 7114 pc_thread_shortwait = 23 * hz;
7115 7115 pc_thread_longwait = 1201 * hz;
7116 7116 pc_thread_retry = 3;
7117 7117 mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7118 7118 cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7119 7119 pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7120 7120 TS_RUN, minclsyspri);
7121 7121 }
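
As a side note for reviewers, the empty-list invariant established here is "each sentinel points at itself", and every insertion elsewhere in the file is the standard four-pointer head insert. A standalone sketch of that pattern follows; the helper name is hypothetical.

/*
 * Illustration only: head insertion into one of the circular,
 * sentinel-based capture hash lists initialized above (compare the
 * re-insertion done on the callback failure path earlier in the file).
 */
static void
example_hash_insert_head(page_capture_hash_bucket_t *sentinel,
    page_capture_hash_bucket_t *bp)
{
	bp->next = sentinel->next;
	bp->prev = sentinel;
	bp->next->prev = bp;
	sentinel->next = bp;
}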
7122 7122
7123 7123 /*
7124 7124 * It is necessary to scrub any failing pages prior to reboot in order to
7125 7125 * prevent a latent error trap from occurring on the next boot.
7126 7126 */
7127 7127 void
7128 7128 page_retire_mdboot()
7129 7129 {
7130 7130 page_t *pp;
7131 7131 int i, j;
7132 7132 page_capture_hash_bucket_t *bp;
7133 7133 uchar_t pri;
7134 7134
7135 7135 /* walk lists looking for pages to scrub */
7136 7136 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7137 7137 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7138 7138 if (page_capture_hash[i].num_pages[pri] != 0) {
7139 7139 break;
7140 7140 }
7141 7141 }
7142 7142 if (pri == PC_NUM_PRI)
7143 7143 continue;
7144 7144
7145 7145 mutex_enter(&page_capture_hash[i].pchh_mutex);
7146 7146
7147 7147 for (j = 0; j < 2; j++) {
7148 7148 bp = page_capture_hash[i].lists[j].next;
7149 7149 while (bp != &page_capture_hash[i].lists[j]) {
7150 7150 pp = bp->pp;
7151 7151 if (PP_TOXIC(pp)) {
7152 7152 if (page_trylock(pp, SE_EXCL)) {
7153 7153 PP_CLRFREE(pp);
7154 7154 pagescrub(pp, 0, PAGESIZE);
7155 7155 page_unlock(pp);
7156 7156 }
7157 7157 }
7158 7158 bp = bp->next;
7159 7159 }
7160 7160 }
7161 7161 mutex_exit(&page_capture_hash[i].pchh_mutex);
7162 7162 }
7163 7163 }
7164 7164
7165 7165 /*
7166 7166 * Walk the page_capture_hash trying to capture pages and also cleanup old
7167 7167 * entries which have expired.
7168 7168 */
7169 7169 void
7170 7170 page_capture_async()
7171 7171 {
7172 7172 page_t *pp;
7173 7173 int i;
7174 7174 int ret;
7175 7175 page_capture_hash_bucket_t *bp1, *bp2;
7176 7176 uint_t szc;
7177 7177 uint_t flags;
7178 7178 void *datap;
7179 7179 uchar_t pri;
7180 7180
7181 7181 /* If there are outstanding pages to be captured, get to work */
7182 7182 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7183 7183 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7184 7184 if (page_capture_hash[i].num_pages[pri] != 0)
7185 7185 break;
7186 7186 }
7187 7187 if (pri == PC_NUM_PRI)
7188 7188 continue;
7189 7189
7190 7190 /* Append list 1 to list 0 and then walk through list 0 */
7191 7191 mutex_enter(&page_capture_hash[i].pchh_mutex);
7192 7192 bp1 = &page_capture_hash[i].lists[1];
7193 7193 bp2 = bp1->next;
7194 7194 if (bp1 != bp2) {
7195 7195 bp1->prev->next = page_capture_hash[i].lists[0].next;
7196 7196 bp2->prev = &page_capture_hash[i].lists[0];
7197 7197 page_capture_hash[i].lists[0].next->prev = bp1->prev;
7198 7198 page_capture_hash[i].lists[0].next = bp2;
7199 7199 bp1->next = bp1;
7200 7200 bp1->prev = bp1;
7201 7201 }
7202 7202
7203 7203 /* list[1] will be empty now */
7204 7204
7205 7205 bp1 = page_capture_hash[i].lists[0].next;
7206 7206 while (bp1 != &page_capture_hash[i].lists[0]) {
7207 7207 /* Check expiration time */
7208 7208 if ((ddi_get_lbolt() > bp1->expires &&
7209 7209 bp1->expires != -1) ||
7210 7210 page_deleted(bp1->pp)) {
7211 7211 page_capture_hash[i].lists[0].next = bp1->next;
7212 7212 bp1->next->prev =
7213 7213 &page_capture_hash[i].lists[0];
7214 7214 page_capture_hash[i].num_pages[bp1->pri]--;
7215 7215
7216 7216 /*
7217 7217 * We can safely remove the PR_CAPTURE bit
7218 7218 * without holding the EXCL lock on the page
7219 7219 * as the PR_CAPTURE bit requires that the
7220 7220 * page_capture_hash[].pchh_mutex be held
7221 7221 * to modify it.
7222 7222 */
7223 7223 page_clrtoxic(bp1->pp, PR_CAPTURE);
7224 7224 mutex_exit(&page_capture_hash[i].pchh_mutex);
7225 7225 kmem_free(bp1, sizeof (*bp1));
7226 7226 mutex_enter(&page_capture_hash[i].pchh_mutex);
7227 7227 bp1 = page_capture_hash[i].lists[0].next;
7228 7228 continue;
7229 7229 }
7230 7230 pp = bp1->pp;
7231 7231 szc = bp1->szc;
7232 7232 flags = bp1->flags;
7233 7233 datap = bp1->datap;
7234 7234 mutex_exit(&page_capture_hash[i].pchh_mutex);
7235 7235 if (page_trylock(pp, SE_EXCL)) {
7236 7236 ret = page_trycapture(pp, szc,
7237 7237 flags | CAPTURE_ASYNC, datap);
7238 7238 } else {
7239 7239 ret = 1; /* move to walked hash */
7240 7240 }
7241 7241
7242 7242 if (ret != 0) {
7243 7243 /* Move to walked hash */
7244 7244 (void) page_capture_move_to_walked(pp);
7245 7245 }
7246 7246 mutex_enter(&page_capture_hash[i].pchh_mutex);
7247 7247 bp1 = page_capture_hash[i].lists[0].next;
7248 7248 }
7249 7249
7250 7250 mutex_exit(&page_capture_hash[i].pchh_mutex);
7251 7251 }
7252 7252 }
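
The list splice at the top of the walk above is easy to misread, so here is the same pointer surgery as a standalone sketch. The helper name is hypothetical; the bucket type and list layout are the ones used in this file.

/*
 * Illustration only: append every element of the circular, sentinel-
 * based list "src" to the front of "dst" and leave "src" empty, as
 * page_capture_async() does with lists[1] and lists[0].
 */
static void
example_splice_lists(page_capture_hash_bucket_t *dst,
    page_capture_hash_bucket_t *src)
{
	page_capture_hash_bucket_t *first = src->next;

	if (first == src)
		return;				/* src is already empty */

	src->prev->next = dst->next;		/* last src element -> old dst head */
	first->prev = dst;
	dst->next->prev = src->prev;
	dst->next = first;

	src->next = src;			/* reset src to empty */
	src->prev = src;
}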
7253 7253
7254 7254 /*
7255 7255 * This function is called by the page_capture_thread, and is needed
7256 7256 * in order to initiate aio cleanup, so that pages used in aio
7257 7257 * will be unlocked and subsequently retired by page_capture_thread.
7258 7258 */
7259 7259 static int
7260 7260 do_aio_cleanup(void)
7261 7261 {
7262 7262 proc_t *procp;
7263 7263 int (*aio_cleanup_dr_delete_memory)(proc_t *);
7264 7264 int cleaned = 0;
7265 7265
7266 7266 if (modload("sys", "kaio") == -1) {
7267 7267 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7268 7268 return (0);
7269 7269 }
7270 7270 /*
7271 7271 * We use the aio_cleanup_dr_delete_memory function to
7272 7272 * initiate the actual clean up; this function will wake
7273 7273 * up the per-process aio_cleanup_thread.
7274 7274 */
7275 7275 aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7276 7276 modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7277 7277 if (aio_cleanup_dr_delete_memory == NULL) {
7278 7278 cmn_err(CE_WARN,
7279 7279 "aio_cleanup_dr_delete_memory not found in kaio");
7280 7280 return (0);
7281 7281 }
7282 7282 mutex_enter(&pidlock);
7283 7283 for (procp = practive; (procp != NULL); procp = procp->p_next) {
7284 7284 mutex_enter(&procp->p_lock);
7285 7285 if (procp->p_aio != NULL) {
7286 7286 /* cleanup proc's outstanding kaio */
7287 7287 cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7288 7288 }
7289 7289 mutex_exit(&procp->p_lock);
7290 7290 }
7291 7291 mutex_exit(&pidlock);
7292 7292 return (cleaned);
7293 7293 }
7294 7294
7295 7295 /*
7296 7296 * helper function for page_capture_thread
7297 7297 */
7298 7298 static void
7299 7299 page_capture_handle_outstanding(void)
7300 7300 {
7301 7301 int ntry;
7302 7302
7303 7303 /* Reap kernel memory before attempting to capture pages */
7304 7304 kmem_reap();
7305 7305
7306 7306 if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
7307 7307 hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
7308 7308 /*
7309 7309 * Note: Purging is only done on platforms that support
7310 7310 * ISM hat_pageunload() - mainly SPARC. On x86/x64
7311 7311 * platforms ISM pages are SE_SHARED locked until destroyed.
7312 7312 */
7313 7313
7314 7314 /* disable and purge seg_pcache */
7315 7315 (void) seg_p_disable();
7316 7316 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7317 7317 if (!page_retire_pend_count())
7318 7318 break;
7319 7319 if (do_aio_cleanup()) {
7320 7320 /*
7321 7321 * allow the apps cleanup threads
7322 7322 * to run
7323 7323 */
7324 7324 delay(pc_thread_shortwait);
7325 7325 }
7326 7326 page_capture_async();
7327 7327 }
7328 7328 /* reenable seg_pcache */
7329 7329 seg_p_enable();
7330 7330
7331 7331 /* completed what can be done; return */
7332 7332 return;
7333 7333 }
7334 7334
7335 7335 /*
7336 7336 * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
7337 7337 * and then attempt to capture.
7338 7338 */
7339 7339 seg_preap();
7340 7340 page_capture_async();
7341 7341 }
7342 7342
7343 7343 /*
7344 7344 * The page_capture_thread loops forever, looking to see if there are
7345 7345 * pages still waiting to be captured.
7346 7346 */
7347 7347 static void
7348 7348 page_capture_thread(void)
7349 7349 {
7350 7350 callb_cpr_t c;
7351 7351 int i;
7352 7352 int high_pri_pages;
7353 7353 int low_pri_pages;
7354 7354 clock_t timeout;
7355 7355
7356 7356 CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7357 7357
7358 7358 mutex_enter(&pc_thread_mutex);
7359 7359 for (;;) {
7360 7360 high_pri_pages = 0;
7361 7361 low_pri_pages = 0;
7362 7362 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7363 7363 high_pri_pages +=
7364 7364 page_capture_hash[i].num_pages[PC_PRI_HI];
7365 7365 low_pri_pages +=
7366 7366 page_capture_hash[i].num_pages[PC_PRI_LO];
7367 7367 }
7368 7368
7369 7369 timeout = pc_thread_longwait;
7370 7370 if (high_pri_pages != 0) {
7371 7371 timeout = pc_thread_shortwait;
7372 7372 page_capture_handle_outstanding();
7373 7373 } else if (low_pri_pages != 0) {
7374 7374 page_capture_async();
7375 7375 }
7376 7376 CALLB_CPR_SAFE_BEGIN(&c);
7377 7377 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
7378 7378 timeout, TR_CLOCK_TICK);
7379 7379 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7380 7380 }
7381 7381 /*NOTREACHED*/
7382 7382 }
7383 7383 /*
7384 7384 * Attempt to locate a bucket that has enough pages to satisfy the request.
7385 7385 * The initial check is done without the lock to avoid unneeded contention.
7386 7386 * The function returns 1 if enough pages were found, else 0 if it could not
7387 7387 * find enough pages in a bucket.
7388 7388 */
7389 7389 static int
7390 7390 pcf_decrement_bucket(pgcnt_t npages)
7391 7391 {
7392 7392 struct pcf *p;
7393 7393 struct pcf *q;
7394 7394 int i;
7395 7395
7396 7396 p = &pcf[PCF_INDEX()];
7397 7397 q = &pcf[pcf_fanout];
7398 7398 for (i = 0; i < pcf_fanout; i++) {
7399 7399 if (p->pcf_count > npages) {
7400 7400 /*
7401 7401 * a good one to try.
7402 7402 */
7403 7403 mutex_enter(&p->pcf_lock);
7404 7404 if (p->pcf_count > npages) {
7405 7405 p->pcf_count -= (uint_t)npages;
7406 7406 /*
7407 7407 * freemem is not protected by any lock.
7408 7408 * Thus, we cannot have any assertion
7409 7409 * containing freemem here.
7410 7410 */
7411 7411 freemem -= npages;
7412 7412 mutex_exit(&p->pcf_lock);
7413 7413 return (1);
7414 7414 }
7415 7415 mutex_exit(&p->pcf_lock);
7416 7416 }
7417 7417 p++;
7418 7418 if (p >= q) {
7419 7419 p = pcf;
7420 7420 }
7421 7421 }
7422 7422 return (0);
7423 7423 }
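
The unlocked pre-check followed by a re-check under the lock is the idiom the comment above calls out. Isolated below as a sketch for a single bucket; the helper name is hypothetical, while struct pcf and its pcf_count/pcf_lock fields are the ones used in this file.

/*
 * Illustration only: optimistic check without the lock, then verify
 * and commit under the lock, as pcf_decrement_bucket() does per bucket.
 */
static int
example_try_take(struct pcf *p, pgcnt_t npages)
{
	if (p->pcf_count <= npages)		/* cheap, racy pre-check */
		return (0);

	mutex_enter(&p->pcf_lock);
	if (p->pcf_count > npages) {		/* re-check under the lock */
		p->pcf_count -= (uint_t)npages;
		mutex_exit(&p->pcf_lock);
		return (1);
	}
	mutex_exit(&p->pcf_lock);
	return (0);
}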
7424 7424
7425 7425 /*
7426 7426 * Arguments:
7427 7427 * pcftotal_ret: If the value is not NULL and we have walked all the
7428 7428 * buckets but did not find enough pages then it will
7429 7429 * be set to the total number of pages in all the pcf
7430 7430 * buckets.
7431 7431 * npages: Is the number of pages we have been requested to
7432 7432 * find.
7433 7433 * unlock: If set to 0 we will leave the buckets locked if the
7434 7434 * requested number of pages is not found.
7435 7435 *
7436 7436 * Go and try to satisfy the page request from any number of buckets.
7437 7437 * This can be a very expensive operation as we have to lock the buckets
7438 7438 * we are checking (and keep them locked), starting at bucket 0.
7439 7439 *
7440 7440 * The function returns 1 if enough pages were found, else 0 if it could not
7441 7441 * find enough pages in the buckets.
7442 7442 *
7443 7443 */
7444 7444 static int
7445 7445 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7446 7446 {
7447 7447 struct pcf *p;
7448 7448 pgcnt_t pcftotal;
7449 7449 int i;
7450 7450
7451 7451 p = pcf;
7452 7452 /* try to collect pages from several pcf bins */
7453 7453 for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7454 7454 mutex_enter(&p->pcf_lock);
7455 7455 pcftotal += p->pcf_count;
7456 7456 if (pcftotal >= npages) {
7457 7457 /*
7458 7458 * Wow! There are enough pages lying around
7459 7459 * to satisfy the request. Do the accounting,
7460 7460 * drop the locks we acquired, and go back.
7461 7461 *
7462 7462 * freemem is not protected by any lock. So,
7463 7463 * we cannot have any assertion containing
7464 7464 * freemem.
7465 7465 */
7466 7466 freemem -= npages;
7467 7467 while (p >= pcf) {
7468 7468 if (p->pcf_count <= npages) {
7469 7469 npages -= p->pcf_count;
7470 7470 p->pcf_count = 0;
7471 7471 } else {
7472 7472 p->pcf_count -= (uint_t)npages;
7473 7473 npages = 0;
7474 7474 }
7475 7475 mutex_exit(&p->pcf_lock);
7476 7476 p--;
7477 7477 }
7478 7478 ASSERT(npages == 0);
7479 7479 return (1);
7480 7480 }
7481 7481 p++;
7482 7482 }
7483 7483 if (unlock) {
7484 7484 /* failed to collect pages - release the locks */
7485 7485 while (--p >= pcf) {
7486 7486 mutex_exit(&p->pcf_lock);
7487 7487 }
7488 7488 }
7489 7489 if (pcftotal_ret != NULL)
7490 7490 *pcftotal_ret = pcftotal;
7491 7491 return (0);
7492 7492 }
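
For context, a sketch of how a hypothetical caller might combine the two helpers above: try the cheap single-bucket path first, then fall back to accumulating across all buckets, with unlock set so a failure leaves no bucket locks held. The wrapper name is invented for illustration.

/*
 * Illustration only: reserve npages from the pcf counters using the
 * fast path first and the multi-bucket path as a fallback.
 */
static int
example_reserve_pages(pgcnt_t npages)
{
	pgcnt_t pcftotal;

	if (pcf_decrement_bucket(npages))
		return (1);		/* a single bucket had enough */

	/* unlock == 1: drop all bucket locks if we come up short */
	if (pcf_decrement_multiple(&pcftotal, npages, 1))
		return (1);

	return (0);			/* not enough free pages right now */
}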
7162 lines elided