6145 instead of using SEGOP_* macros, define full-fledged segop_* functions
--- old/usr/src/uts/common/vm/vm_seg.c
+++ new/usr/src/uts/common/vm/vm_seg.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright (c) 2015, Joyent, Inc.
25 + * Copyright 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
25 26 */
26 27
27 28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 29 /* All Rights Reserved */
29 30
30 31 /*
31 32 * University Copyright- Copyright (c) 1982, 1986, 1988
32 33 * The Regents of the University of California
33 34 * All Rights Reserved
34 35 *
35 36 * University Acknowledgment- Portions of this document are derived from
36 37 * software developed by the University of California, Berkeley, and its
37 38 * contributors.
38 39 */
39 40
40 41 /*
41 42 * VM - segment management.
42 43 */
43 44
44 45 #include <sys/types.h>
45 46 #include <sys/inttypes.h>
46 47 #include <sys/t_lock.h>
47 48 #include <sys/param.h>
48 49 #include <sys/systm.h>
49 50 #include <sys/kmem.h>
50 51 #include <sys/sysmacros.h>
51 52 #include <sys/vmsystm.h>
52 53 #include <sys/tuneable.h>
53 54 #include <sys/debug.h>
54 55 #include <sys/fs/swapnode.h>
55 56 #include <sys/cmn_err.h>
56 57 #include <sys/callb.h>
57 58 #include <sys/mem_config.h>
58 59 #include <sys/mman.h>
59 60
60 61 #include <vm/hat.h>
61 62 #include <vm/as.h>
62 63 #include <vm/seg.h>
63 64 #include <vm/seg_kmem.h>
64 65 #include <vm/seg_spt.h>
65 66 #include <vm/seg_vn.h>
66 67 #include <vm/anon.h>
67 68
68 69 /*
69 70 * kstats for segment advise
70 71 */
71 72 segadvstat_t segadvstat = {
72 73 { "MADV_FREE_hit", KSTAT_DATA_ULONG },
73 74 { "MADV_FREE_miss", KSTAT_DATA_ULONG },
74 75 };
75 76
76 77 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
77 78 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
78 79
79 80 /*
80 81 * entry in the segment page cache
81 82 */
82 83 struct seg_pcache {
83 84 struct seg_pcache *p_hnext; /* list for hashed blocks */
84 85 struct seg_pcache *p_hprev;
85 86 pcache_link_t p_plink; /* per segment/amp list */
86 87 void *p_htag0; /* segment/amp pointer */
87 88 caddr_t p_addr; /* base address/anon_idx */
88 89 size_t p_len; /* total bytes */
89 90 size_t p_wlen; /* writable bytes at p_addr */
90 91 struct page **p_pp; /* pp shadow list */
91 92 seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
92 93 clock_t p_lbolt; /* lbolt from last use */
93 94 struct seg_phash *p_hashp; /* our pcache hash bucket */
94 95 uint_t p_active; /* active count */
95 96 uchar_t p_write; /* true if S_WRITE */
96 97 uchar_t p_ref; /* reference byte */
97 98 ushort_t p_flags; /* bit flags */
98 99 };
99 100
100 101 struct seg_phash {
101 102 struct seg_pcache *p_hnext; /* list for hashed blocks */
102 103 struct seg_pcache *p_hprev;
103 104 kmutex_t p_hmutex; /* protects hash bucket */
104 105 pcache_link_t p_halink[2]; /* active bucket linkages */
105 106 };
106 107
107 108 struct seg_phash_wired {
108 109 struct seg_pcache *p_hnext; /* list for hashed blocks */
109 110 struct seg_pcache *p_hprev;
110 111 kmutex_t p_hmutex; /* protects hash bucket */
111 112 };
112 113
113 114 /*
114 115 * A parameter to control a maximum number of bytes that can be
115 116 * purged from pcache at a time.
116 117 */
117 118 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
118 119
119 120 /*
120 121 * log2(fraction of pcache to reclaim at a time).
121 122 */
122 123 #define P_SHRINK_SHFT (5)
123 124
124 125 /*
125 126 * The following variables can be tuned via /etc/system.
126 127 */
127 128
128 129 int segpcache_enabled = 1; /* if 1, shadow lists are cached */
129 130 pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */
130 131 ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */
131 132 ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
132 133 int segpcache_reap_sec = 1; /* reap check rate in secs */
133 134 clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */
134 135 int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */
135 136 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
136 137 int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
137 138 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
138 139
139 140 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
140 141 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
141 142 static kcondvar_t seg_pasync_cv;
142 143
143 144 #pragma align 64(pctrl1)
144 145 #pragma align 64(pctrl2)
145 146 #pragma align 64(pctrl3)
146 147
147 148 /*
148 149 * Keep frequently used variables together in one cache line.
149 150 */
150 151 static struct p_ctrl1 {
151 152 uint_t p_disabled; /* if not 0, caching temporarily off */
152 153 pgcnt_t p_maxwin; /* max # of pages that can be cached */
153 154 size_t p_hashwin_sz; /* # of non wired buckets */
154 155 struct seg_phash *p_htabwin; /* hash table for non wired entries */
155 156 size_t p_hashwired_sz; /* # of wired buckets */
156 157 struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
157 158 kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */
158 159 #ifdef _LP64
159 160 ulong_t pad[1];
160 161 #endif /* _LP64 */
161 162 } pctrl1;
162 163
163 164 static struct p_ctrl2 {
164 165 kmutex_t p_mem_mtx; /* protects window counter and p_halinks */
165 166 pgcnt_t p_locked_win; /* # pages from window */
166 167 pgcnt_t p_locked; /* # of pages cached by pagelock */
167 168 uchar_t p_ahcur; /* current active links for insert/delete */
168 169 uchar_t p_athr_on; /* async reclaim thread is running. */
169 170 pcache_link_t p_ahhead[2]; /* active buckets linkages */
170 171 } pctrl2;
171 172
172 173 static struct p_ctrl3 {
173 174 clock_t p_pcp_maxage; /* max pcp age in ticks */
174 175 ulong_t p_athr_empty_ahb; /* athread walk stats */
175 176 ulong_t p_athr_full_ahb; /* athread walk stats */
176 177 pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */
177 178 int p_shrink_shft; /* reap shift factor */
178 179 #ifdef _LP64
179 180 ulong_t pad[3];
180 181 #endif /* _LP64 */
181 182 } pctrl3;
182 183
183 184 #define seg_pdisabled pctrl1.p_disabled
184 185 #define seg_pmaxwindow pctrl1.p_maxwin
185 186 #define seg_phashsize_win pctrl1.p_hashwin_sz
186 187 #define seg_phashtab_win pctrl1.p_htabwin
187 188 #define seg_phashsize_wired pctrl1.p_hashwired_sz
188 189 #define seg_phashtab_wired pctrl1.p_htabwired
189 190 #define seg_pkmcache pctrl1.p_kmcache
190 191 #define seg_pmem_mtx pctrl2.p_mem_mtx
191 192 #define seg_plocked_window pctrl2.p_locked_win
192 193 #define seg_plocked pctrl2.p_locked
193 194 #define seg_pahcur pctrl2.p_ahcur
194 195 #define seg_pathr_on pctrl2.p_athr_on
195 196 #define seg_pahhead pctrl2.p_ahhead
196 197 #define seg_pmax_pcpage pctrl3.p_pcp_maxage
197 198 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
198 199 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
199 200 #define seg_pshrink_shift pctrl3.p_shrink_shft
200 201 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
201 202
202 203 #define P_HASHWIN_MASK (seg_phashsize_win - 1)
203 204 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
204 205 #define P_BASESHIFT (6)
205 206
206 207 kthread_t *seg_pasync_thr;
207 208
208 209 extern struct seg_ops segvn_ops;
209 210 extern struct seg_ops segspt_shmops;
210 211
211 212 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
212 213 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
213 214
214 215 #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t)))
215 216
216 217 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
217 218
218 219 /*
219 220 * htag0 argument can be a seg or amp pointer.
220 221 */
221 222 #define P_HASHBP(seg, htag0, addr, flags) \
222 223 (IS_PFLAGS_WIRED((flags)) ? \
223 224 ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
224 225 ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
225 226 (&seg_phashtab_win[P_HASHWIN_MASK & \
226 227 (((uintptr_t)(htag0) >> 3) ^ \
227 228 ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \
228 229 (flags >> 16) : page_get_shift((seg)->s_szc))))]))
229 230
230 231 /*
231 232 * htag0 argument can be a seg or amp pointer.
232 233 */
233 234 #define P_MATCH(pcp, htag0, addr, len) \
234 235 ((pcp)->p_htag0 == (htag0) && \
235 236 (pcp)->p_addr == (addr) && \
236 237 (pcp)->p_len >= (len))
237 238
238 239 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \
239 240 ((pcp)->p_pp == (pp) && \
240 241 (pcp)->p_htag0 == (htag0) && \
241 242 (pcp)->p_addr == (addr) && \
242 243 (pcp)->p_len >= (len))
243 244
244 245 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
245 246 offsetof(struct seg_pcache, p_plink)))
246 247
247 248 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
248 249 offsetof(struct seg_phash, p_halink[l])))
249 250
250 251 /*
251 252 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
252 253 * active hash bucket lists. We maintain active bucket lists to reduce the
253 254 * overhead of finding active buckets during asynchronous purging since there
254 255 * can be 10s of millions of buckets on a large system but only a small subset
255 256 * of them in actual use.
256 257 *
257 258 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
258 259 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
259 260 * buckets. The other list is used by asynchronous purge thread. This allows
260 261 * the purge thread to walk its active list without holding seg_pmem_mtx for a
261 262 * long time. When asynchronous thread is done with its list it switches to
262 263 * current active list and makes the list it just finished processing as
263 264 * current active list.
264 265 *
265 266 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
266 267 * yet on any list. seg_premove_abuck() may remove the bucket from either
267 268 * list. If the bucket is on current list it will be always removed. Otherwise
268 269 * the bucket is only removed if asynchronous purge thread is not currently
269 270 * running or seg_premove_abuck() is called by asynchronous purge thread
270 271 * itself. A given bucket can only be on one of active lists at a time. These
271 272 * routines should be called with per bucket lock held. The routines use
272 273 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
273 274 * the first entry is added to the bucket chain and seg_premove_abuck() must
274 275 * be called after the last pcp entry is deleted from its chain. Per bucket
275 276 * lock should be held by the callers. This avoids a potential race condition
276 277 * when seg_premove_abuck() removes a bucket after pcp entries are added to
277 278 * its list after the caller checked that the bucket has no entries. (this
278 279 * race would cause a loss of an active bucket from the active lists).
279 280 *
280 281 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
281 282 * New entries are added to the end of the list since LRU is used as the
282 283 * purging policy.
283 284 */
284 285 static void
285 286 seg_padd_abuck(struct seg_phash *hp)
286 287 {
287 288 int lix;
288 289
289 290 ASSERT(MUTEX_HELD(&hp->p_hmutex));
290 291 ASSERT((struct seg_phash *)hp->p_hnext != hp);
291 292 ASSERT((struct seg_phash *)hp->p_hprev != hp);
292 293 ASSERT(hp->p_hnext == hp->p_hprev);
293 294 ASSERT(!IS_PCP_WIRED(hp->p_hnext));
294 295 ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
295 296 ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
296 297 ASSERT(hp >= seg_phashtab_win &&
297 298 hp < &seg_phashtab_win[seg_phashsize_win]);
298 299
299 300 /*
300 301 * This bucket can already be on one of active lists
301 302 * since seg_premove_abuck() may have failed to remove it
302 303 * before.
303 304 */
304 305 mutex_enter(&seg_pmem_mtx);
305 306 lix = seg_pahcur;
306 307 ASSERT(lix >= 0 && lix <= 1);
307 308 if (hp->p_halink[lix].p_lnext != NULL) {
308 309 ASSERT(hp->p_halink[lix].p_lprev != NULL);
309 310 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
310 311 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
311 312 mutex_exit(&seg_pmem_mtx);
312 313 return;
313 314 }
314 315 ASSERT(hp->p_halink[lix].p_lprev == NULL);
315 316
316 317 /*
317 318 * If this bucket is still on list !lix async thread can't yet remove
318 319 * it since we hold here per bucket lock. In this case just return
319 320 * since async thread will eventually find and process this bucket.
320 321 */
321 322 if (hp->p_halink[!lix].p_lnext != NULL) {
322 323 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
323 324 mutex_exit(&seg_pmem_mtx);
324 325 return;
325 326 }
326 327 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
327 328 /*
328 329 * This bucket is not on any active bucket list yet.
329 330 * Add the bucket to the tail of current active list.
330 331 */
331 332 hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
332 333 hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
333 334 seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
334 335 seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
335 336 mutex_exit(&seg_pmem_mtx);
336 337 }
337 338
338 339 static void
339 340 seg_premove_abuck(struct seg_phash *hp, int athr)
340 341 {
341 342 int lix;
342 343
343 344 ASSERT(MUTEX_HELD(&hp->p_hmutex));
344 345 ASSERT((struct seg_phash *)hp->p_hnext == hp);
345 346 ASSERT((struct seg_phash *)hp->p_hprev == hp);
346 347 ASSERT(hp >= seg_phashtab_win &&
347 348 hp < &seg_phashtab_win[seg_phashsize_win]);
348 349
349 350 if (athr) {
350 351 ASSERT(seg_pathr_on);
351 352 ASSERT(seg_pahcur <= 1);
352 353 /*
353 354 * We are called by asynchronous thread that found this bucket
354 355 * on not currently active (i.e. !seg_pahcur) list. Remove it
355 356 * from there. Per bucket lock we are holding makes sure
356 357 * seg_pinsert() can't sneak in and add pcp entries to this
357 358 * bucket right before we remove the bucket from its list.
358 359 */
359 360 lix = !seg_pahcur;
360 361 ASSERT(hp->p_halink[lix].p_lnext != NULL);
361 362 ASSERT(hp->p_halink[lix].p_lprev != NULL);
362 363 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
363 364 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
364 365 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
365 366 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
366 367 hp->p_halink[lix].p_lnext = NULL;
367 368 hp->p_halink[lix].p_lprev = NULL;
368 369 return;
369 370 }
370 371
371 372 mutex_enter(&seg_pmem_mtx);
372 373 lix = seg_pahcur;
373 374 ASSERT(lix >= 0 && lix <= 1);
374 375
375 376 /*
376 377 * If the bucket is on currently active list just remove it from
377 378 * there.
378 379 */
379 380 if (hp->p_halink[lix].p_lnext != NULL) {
380 381 ASSERT(hp->p_halink[lix].p_lprev != NULL);
381 382 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
382 383 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
383 384 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
384 385 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
385 386 hp->p_halink[lix].p_lnext = NULL;
386 387 hp->p_halink[lix].p_lprev = NULL;
387 388 mutex_exit(&seg_pmem_mtx);
388 389 return;
389 390 }
390 391 ASSERT(hp->p_halink[lix].p_lprev == NULL);
391 392
392 393 /*
393 394 * If asynchronous thread is not running we can remove the bucket from
394 395 * not currently active list. The bucket must be on this list since we
395 396 * already checked that it's not on the other list and the bucket from
396 397 * which we just deleted the last pcp entry must be still on one of the
397 398 * active bucket lists.
398 399 */
399 400 lix = !lix;
400 401 ASSERT(hp->p_halink[lix].p_lnext != NULL);
401 402 ASSERT(hp->p_halink[lix].p_lprev != NULL);
402 403
403 404 if (!seg_pathr_on) {
404 405 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
405 406 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
406 407 hp->p_halink[lix].p_lnext = NULL;
407 408 hp->p_halink[lix].p_lprev = NULL;
408 409 }
409 410 mutex_exit(&seg_pmem_mtx);
410 411 }
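
The hash chains and the active bucket lists above all rely on the same sentinel-anchored, circular doubly-linked list idiom: an empty list points back at its own anchor, new entries go on the tail (LRU order), and unlinking needs no head or empty special cases. A minimal standalone sketch of that idiom follows; the node and head types are hypothetical stand-ins, not the seg_pcache/pcache_link_t structures in this file.

struct node {
	struct node *next;
	struct node *prev;
};

/* An empty list is an anchor whose links point back at itself. */
static void
list_init(struct node *head)
{
	head->next = head;
	head->prev = head;
}

/* Mirrors the hp->p_hnext == (struct seg_pcache *)hp test above. */
static int
list_is_empty(struct node *head)
{
	return (head->next == head);
}

/* Tail insert, as seg_padd_abuck() does to keep LRU ordering. */
static void
list_insert_tail(struct node *head, struct node *np)
{
	np->next = head;
	np->prev = head->prev;
	head->prev->next = np;
	head->prev = np;
}

/* Unlink, as seg_premove_abuck() does; works for any list position. */
static void
list_remove(struct node *np)
{
	np->prev->next = np->next;
	np->next->prev = np->prev;
	np->next = NULL;
	np->prev = NULL;
}
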
411 412
412 413 /*
413 414 * Check if bucket pointed by hp already has a pcp entry that matches request
414 415 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
415 416 * Also delete matching entries that cover smaller address range but start
416 417 * at the same address as addr argument. Return the list of deleted entries if
417 418 * any. This is an internal helper function called from seg_pinsert() only
418 419 * for non wired shadow lists. The caller already holds a per seg/amp list
419 420 * lock.
420 421 */
421 422 static struct seg_pcache *
422 423 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
423 424 caddr_t addr, size_t len, int *found)
424 425 {
425 426 struct seg_pcache *pcp;
426 427 struct seg_pcache *delcallb_list = NULL;
427 428
428 429 ASSERT(MUTEX_HELD(&hp->p_hmutex));
429 430
430 431 *found = 0;
431 432 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
432 433 pcp = pcp->p_hnext) {
433 434 ASSERT(pcp->p_hashp == hp);
434 435 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
435 436 ASSERT(!IS_PCP_WIRED(pcp));
436 437 if (pcp->p_len < len) {
437 438 pcache_link_t *plinkp;
438 439 if (pcp->p_active) {
439 440 continue;
440 441 }
441 442 plinkp = &pcp->p_plink;
442 443 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
443 444 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
444 445 pcp->p_hprev->p_hnext = pcp->p_hnext;
445 446 pcp->p_hnext->p_hprev = pcp->p_hprev;
446 447 pcp->p_hprev = delcallb_list;
447 448 delcallb_list = pcp;
448 449 } else {
449 450 *found = 1;
450 451 break;
451 452 }
452 453 }
453 454 }
454 455 return (delcallb_list);
455 456 }
456 457
457 458 /*
458 459 * lookup an address range in pagelock cache. Return shadow list and bump up
459 460 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
460 461 * as a lookup tag.
461 462 */
462 463 struct page **
463 464 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
464 465 enum seg_rw rw, uint_t flags)
465 466 {
466 467 struct seg_pcache *pcp;
467 468 struct seg_phash *hp;
468 469 void *htag0;
469 470
470 471 ASSERT(seg != NULL);
471 472 ASSERT(rw == S_READ || rw == S_WRITE);
472 473
473 474 /*
474 475 * Skip pagelock cache, while DR is in progress or
475 476 * seg_pcache is off.
476 477 */
477 478 if (seg_pdisabled) {
478 479 return (NULL);
479 480 }
480 481 ASSERT(seg_phashsize_win != 0);
481 482
482 483 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
483 484 hp = P_HASHBP(seg, htag0, addr, flags);
484 485 mutex_enter(&hp->p_hmutex);
485 486 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
486 487 pcp = pcp->p_hnext) {
487 488 ASSERT(pcp->p_hashp == hp);
488 489 if (P_MATCH(pcp, htag0, addr, len)) {
489 490 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
490 491 /*
491 492 * If this request wants to write pages
492 493 * but write permissions starting from
493 494 * addr don't cover the entire length len
494 495 * return lookup failure back to the caller.
495 496 * It will check protections and fail this
496 497 * pagelock operation with EACCES error.
497 498 */
498 499 if (rw == S_WRITE && pcp->p_wlen < len) {
499 500 break;
500 501 }
501 502 if (pcp->p_active == UINT_MAX) {
502 503 break;
503 504 }
504 505 pcp->p_active++;
505 506 if (rw == S_WRITE && !pcp->p_write) {
506 507 pcp->p_write = 1;
507 508 }
508 509 mutex_exit(&hp->p_hmutex);
509 510 return (pcp->p_pp);
510 511 }
511 512 }
512 513 mutex_exit(&hp->p_hmutex);
513 514 return (NULL);
514 515 }
515 516
516 517 /*
517 518 * mark address range inactive. If the cache is off or the address range is
518 519 * not in the cache or another shadow list that covers bigger range is found
519 520 * we call the segment driver to reclaim the pages. Otherwise just decrement
520 521 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
521 522 * otherwise use seg as a lookup tag.
522 523 */
523 524 void
524 525 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
525 526 size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
526 527 seg_preclaim_cbfunc_t callback)
527 528 {
528 529 struct seg_pcache *pcp;
529 530 struct seg_phash *hp;
530 531 kmutex_t *pmtx = NULL;
531 532 pcache_link_t *pheadp;
532 533 void *htag0;
533 534 pgcnt_t npages = 0;
534 535 int keep = 0;
535 536
536 537 ASSERT(seg != NULL);
537 538 ASSERT(rw == S_READ || rw == S_WRITE);
538 539
539 540 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
540 541
541 542 /*
542 543 * Skip lookup if pcache is not configured.
543 544 */
544 545 if (seg_phashsize_win == 0) {
545 546 goto out;
546 547 }
547 548
548 549 /*
549 550 * Grab per seg/amp lock before hash lock if we are going to remove
550 551 * inactive entry from pcache.
551 552 */
552 553 if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
553 554 if (amp == NULL) {
554 555 pheadp = &seg->s_phead;
555 556 pmtx = &seg->s_pmtx;
556 557 } else {
557 558 pheadp = &amp->a_phead;
558 559 pmtx = &amp->a_pmtx;
559 560 }
560 561 mutex_enter(pmtx);
561 562 }
562 563
563 564 hp = P_HASHBP(seg, htag0, addr, flags);
564 565 mutex_enter(&hp->p_hmutex);
565 566 again:
566 567 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
567 568 pcp = pcp->p_hnext) {
568 569 ASSERT(pcp->p_hashp == hp);
569 570 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
570 571 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
571 572 ASSERT(pcp->p_active);
572 573 if (keep) {
573 574 /*
574 575 * Don't remove this pcp entry
575 576 * if we didn't find duplicate
576 577 * shadow lists on second search.
577 578 * Somebody removed those duplicates
578 579 * since we dropped hash lock after first
579 580 * search.
580 581 */
581 582 ASSERT(pmtx != NULL);
582 583 ASSERT(!IS_PFLAGS_WIRED(flags));
583 584 mutex_exit(pmtx);
584 585 pmtx = NULL;
585 586 }
586 587 pcp->p_active--;
587 588 if (pcp->p_active == 0 && (pmtx != NULL ||
588 589 (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
589 590
590 591 /*
591 592 * This entry is no longer active. Remove it
592 593 * now either because pcaching is temporarily
593 594 * disabled or there're other pcp entries that
594 595 * can match this pagelock request (i.e. this
595 596 * entry is a duplicate).
596 597 */
597 598
598 599 ASSERT(callback == pcp->p_callback);
599 600 if (pmtx != NULL) {
600 601 pcache_link_t *plinkp = &pcp->p_plink;
601 602 ASSERT(!IS_PCP_WIRED(pcp));
602 603 ASSERT(pheadp->p_lnext != pheadp);
603 604 ASSERT(pheadp->p_lprev != pheadp);
604 605 plinkp->p_lprev->p_lnext =
605 606 plinkp->p_lnext;
606 607 plinkp->p_lnext->p_lprev =
607 608 plinkp->p_lprev;
608 609 }
609 610 pcp->p_hprev->p_hnext = pcp->p_hnext;
610 611 pcp->p_hnext->p_hprev = pcp->p_hprev;
611 612 if (!IS_PCP_WIRED(pcp) &&
612 613 hp->p_hnext == (struct seg_pcache *)hp) {
613 614 /*
614 615 * We removed the last entry from this
615 616 * bucket. Now remove the bucket from
616 617 * its active list.
617 618 */
618 619 seg_premove_abuck(hp, 0);
619 620 }
620 621 mutex_exit(&hp->p_hmutex);
621 622 if (pmtx != NULL) {
622 623 mutex_exit(pmtx);
623 624 }
624 625 len = pcp->p_len;
625 626 npages = btop(len);
626 627 if (rw != S_WRITE && pcp->p_write) {
627 628 rw = S_WRITE;
628 629 }
629 630 kmem_cache_free(seg_pkmcache, pcp);
630 631 goto out;
631 632 } else {
632 633 /*
633 634 * We found a matching pcp entry but will not
634 635 * free it right away even if it's no longer
635 636 * active.
636 637 */
637 638 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
638 639 /*
639 640 * Set the reference bit and mark the
640 641 * time of last access to this pcp
641 642 * so that asynchronous thread doesn't
642 643 * free it immediately since
643 644 * it may be reactivated very soon.
644 645 */
645 646 pcp->p_lbolt = ddi_get_lbolt();
646 647 pcp->p_ref = 1;
647 648 }
648 649 mutex_exit(&hp->p_hmutex);
649 650 if (pmtx != NULL) {
650 651 mutex_exit(pmtx);
651 652 }
652 653 return;
653 654 }
654 655 } else if (!IS_PFLAGS_WIRED(flags) &&
655 656 P_MATCH(pcp, htag0, addr, len)) {
656 657 /*
657 658 * This is a duplicate pcp entry. This situation may
658 659 * happen if a bigger shadow list that covers our
659 660 * range was added while our entry was still active.
660 661 * Now we can free our pcp entry if it becomes
661 662 * inactive.
662 663 */
663 664 if (!pcp->p_active) {
664 665 /*
665 666 * Mark this entry as referenced just in case
666 667 * we'll free our own pcp entry soon.
667 668 */
668 669 pcp->p_lbolt = ddi_get_lbolt();
669 670 pcp->p_ref = 1;
670 671 }
671 672 if (pmtx != NULL) {
672 673 /*
673 674 * we are already holding pmtx and found a
674 675 * duplicate. Don't keep our own pcp entry.
675 676 */
676 677 keep = 0;
677 678 continue;
678 679 }
679 680 /*
680 681 * We have to use mutex_tryenter to attempt to lock
681 682 * seg/amp list lock since we already hold hash lock
682 683 * and seg/amp list lock is above hash lock in lock
683 684 * order. If mutex_tryenter fails drop hash lock and
684 685 * retake both locks in correct order and research
685 686 * retake both locks in correct order and re-search
686 687 */
687 688 ASSERT(keep == 0);
688 689 if (amp == NULL) {
689 690 pheadp = &seg->s_phead;
690 691 pmtx = &seg->s_pmtx;
691 692 } else {
692 693 pheadp = &amp->a_phead;
693 694 pmtx = &amp->a_pmtx;
694 695 }
695 696 if (!mutex_tryenter(pmtx)) {
696 697 mutex_exit(&hp->p_hmutex);
697 698 mutex_enter(pmtx);
698 699 mutex_enter(&hp->p_hmutex);
699 700 /*
700 701 * If we don't find bigger shadow list on
701 702 * second search (it may happen since we
702 703 * dropped bucket lock) keep the entry that
703 704 * matches our own shadow list.
704 705 */
705 706 keep = 1;
706 707 goto again;
707 708 }
708 709 }
709 710 }
710 711 mutex_exit(&hp->p_hmutex);
711 712 if (pmtx != NULL) {
712 713 mutex_exit(pmtx);
713 714 }
714 715 out:
715 716 (*callback)(htag0, addr, len, pp, rw, 0);
716 717 if (npages) {
717 718 mutex_enter(&seg_pmem_mtx);
718 719 ASSERT(seg_plocked >= npages);
719 720 seg_plocked -= npages;
720 721 if (!IS_PFLAGS_WIRED(flags)) {
721 722 ASSERT(seg_plocked_window >= npages);
722 723 seg_plocked_window -= npages;
723 724 }
724 725 mutex_exit(&seg_pmem_mtx);
725 726 }
726 727
727 728 }
728 729
729 730 #ifdef DEBUG
730 731 static uint32_t p_insert_chk_mtbf = 0;
731 732 #endif
732 733
733 734 /*
734 735 * The seg_pinsert_check() is used by segment drivers to predict whether
735 736 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
736 737 */
737 738 /*ARGSUSED*/
738 739 int
739 740 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
740 741 size_t len, uint_t flags)
741 742 {
742 743 ASSERT(seg != NULL);
743 744
744 745 #ifdef DEBUG
745 746 if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
746 747 return (SEGP_FAIL);
747 748 }
748 749 #endif
749 750
750 751 if (seg_pdisabled) {
751 752 return (SEGP_FAIL);
752 753 }
753 754 ASSERT(seg_phashsize_win != 0);
754 755
755 756 if (IS_PFLAGS_WIRED(flags)) {
756 757 return (SEGP_SUCCESS);
757 758 }
758 759
759 760 if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
760 761 return (SEGP_FAIL);
761 762 }
762 763
763 764 if (freemem < desfree) {
764 765 return (SEGP_FAIL);
765 766 }
766 767
767 768 return (SEGP_SUCCESS);
768 769 }
769 770
770 771 #ifdef DEBUG
771 772 static uint32_t p_insert_mtbf = 0;
772 773 #endif
773 774
774 775 /*
775 776 * Insert address range with shadow list into pagelock cache if there's no
776 777 * shadow list already cached for this address range. If the cache is off or
777 778 * caching is temporarily disabled or the allowed 'window' is exceeded return
778 779 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
779 780 *
780 781 * For non wired shadow lists (segvn case) include address in the hashing
781 782 * function to avoid linking all the entries from the same segment or amp on
782 783 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
783 784 * pcache entries are also linked on a per segment/amp list so that all
784 785 * entries can be found quickly during seg/amp purge without walking the
785 786 * entire pcache hash table. For wired shadow lists (segspt case) we
786 787 * don't use address hashing and per segment linking because the caller
787 788 * currently inserts only one entry per segment that covers the entire
788 789 * segment. If we used per segment linking even for segspt it would complicate
789 790 * seg_ppurge_wiredpp() locking.
790 791 *
791 792 * Both hash bucket and per seg/amp locks need to be held before adding a non
792 793 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
793 794 * first.
794 795 *
795 796 * This function will also remove from pcache old inactive shadow lists that
796 797 * overlap with this request but cover smaller range for the same start
797 798 * address.
798 799 */
799 800 int
800 801 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
801 802 size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
802 803 seg_preclaim_cbfunc_t callback)
803 804 {
804 805 struct seg_pcache *pcp;
805 806 struct seg_phash *hp;
806 807 pgcnt_t npages;
807 808 pcache_link_t *pheadp;
808 809 kmutex_t *pmtx;
809 810 struct seg_pcache *delcallb_list = NULL;
810 811
811 812 ASSERT(seg != NULL);
812 813 ASSERT(rw == S_READ || rw == S_WRITE);
813 814 ASSERT(rw == S_READ || wlen == len);
814 815 ASSERT(rw == S_WRITE || wlen <= len);
815 816 ASSERT(amp == NULL || wlen == len);
816 817
817 818 #ifdef DEBUG
818 819 if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
819 820 return (SEGP_FAIL);
820 821 }
821 822 #endif
822 823
823 824 if (seg_pdisabled) {
824 825 return (SEGP_FAIL);
825 826 }
826 827 ASSERT(seg_phashsize_win != 0);
827 828
828 829 ASSERT((len & PAGEOFFSET) == 0);
829 830 npages = btop(len);
830 831 mutex_enter(&seg_pmem_mtx);
831 832 if (!IS_PFLAGS_WIRED(flags)) {
832 833 if (seg_plocked_window + npages > seg_pmaxwindow) {
833 834 mutex_exit(&seg_pmem_mtx);
834 835 return (SEGP_FAIL);
835 836 }
836 837 seg_plocked_window += npages;
837 838 }
838 839 seg_plocked += npages;
839 840 mutex_exit(&seg_pmem_mtx);
840 841
841 842 pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
842 843 /*
843 844 * If amp is not NULL set htag0 to amp otherwise set it to seg.
844 845 */
845 846 if (amp == NULL) {
846 847 pcp->p_htag0 = (void *)seg;
847 848 pcp->p_flags = flags & 0xffff;
848 849 } else {
849 850 pcp->p_htag0 = (void *)amp;
850 851 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
851 852 }
852 853 pcp->p_addr = addr;
853 854 pcp->p_len = len;
854 855 pcp->p_wlen = wlen;
855 856 pcp->p_pp = pp;
856 857 pcp->p_write = (rw == S_WRITE);
857 858 pcp->p_callback = callback;
858 859 pcp->p_active = 1;
859 860
860 861 hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
861 862 if (!IS_PFLAGS_WIRED(flags)) {
862 863 int found;
863 864 void *htag0;
864 865 if (amp == NULL) {
865 866 pheadp = &seg->s_phead;
866 867 pmtx = &seg->s_pmtx;
867 868 htag0 = (void *)seg;
868 869 } else {
869 870 pheadp = &amp->a_phead;
870 871 pmtx = &amp->a_pmtx;
871 872 htag0 = (void *)amp;
872 873 }
873 874 mutex_enter(pmtx);
874 875 mutex_enter(&hp->p_hmutex);
875 876 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
876 877 len, &found);
877 878 if (found) {
878 879 mutex_exit(&hp->p_hmutex);
879 880 mutex_exit(pmtx);
880 881 mutex_enter(&seg_pmem_mtx);
881 882 seg_plocked -= npages;
882 883 seg_plocked_window -= npages;
883 884 mutex_exit(&seg_pmem_mtx);
884 885 kmem_cache_free(seg_pkmcache, pcp);
885 886 goto out;
886 887 }
887 888 pcp->p_plink.p_lnext = pheadp->p_lnext;
888 889 pcp->p_plink.p_lprev = pheadp;
889 890 pheadp->p_lnext->p_lprev = &pcp->p_plink;
890 891 pheadp->p_lnext = &pcp->p_plink;
891 892 } else {
892 893 mutex_enter(&hp->p_hmutex);
893 894 }
894 895 pcp->p_hashp = hp;
895 896 pcp->p_hnext = hp->p_hnext;
896 897 pcp->p_hprev = (struct seg_pcache *)hp;
897 898 hp->p_hnext->p_hprev = pcp;
898 899 hp->p_hnext = pcp;
899 900 if (!IS_PFLAGS_WIRED(flags) &&
900 901 hp->p_hprev == pcp) {
901 902 seg_padd_abuck(hp);
902 903 }
903 904 mutex_exit(&hp->p_hmutex);
904 905 if (!IS_PFLAGS_WIRED(flags)) {
905 906 mutex_exit(pmtx);
906 907 }
907 908
908 909 out:
909 910 npages = 0;
910 911 while (delcallb_list != NULL) {
911 912 pcp = delcallb_list;
912 913 delcallb_list = pcp->p_hprev;
913 914 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
914 915 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
915 916 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
916 917 npages += btop(pcp->p_len);
917 918 kmem_cache_free(seg_pkmcache, pcp);
918 919 }
919 920 if (npages) {
920 921 ASSERT(!IS_PFLAGS_WIRED(flags));
921 922 mutex_enter(&seg_pmem_mtx);
922 923 ASSERT(seg_plocked >= npages);
923 924 ASSERT(seg_plocked_window >= npages);
924 925 seg_plocked -= npages;
925 926 seg_plocked_window -= npages;
926 927 mutex_exit(&seg_pmem_mtx);
927 928 }
928 929
929 930 return (SEGP_SUCCESS);
930 931 }
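
A rough sketch of how a segment driver's pagelock path is meant to drive this interface; the real consumer (segvn_pagelock()) is considerably more involved, and my_lock_pages(), my_unlock_pages() and the errno choices below are hypothetical, assuming the headers this file already includes plus <sys/errno.h>.

/* Hypothetical helpers that do the actual page locking/unlocking. */
extern struct page **my_lock_pages(struct seg *, caddr_t, size_t, enum seg_rw);
extern void my_unlock_pages(struct page **, pgcnt_t, enum seg_rw);

/*
 * Reclaim callback handed to seg_pinsert()/seg_pinactive(); pcache calls
 * it when the shadow list is finally dropped.
 */
static int
my_reclaim(void *htag0, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw, int async)
{
	my_unlock_pages(pp, btop(len), rw);
	kmem_free(pp, btop(len) * sizeof (struct page *));
	return (0);
}

static int
my_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
    enum seg_rw rw)
{
	struct page **pp;

	/* Fast path: reuse a cached shadow list covering [addr, addr + len). */
	if ((pp = seg_plookup(seg, NULL, addr, len, rw, 0)) != NULL) {
		*ppp = pp;
		return (0);
	}

	/* Cheap pre-check before doing the expensive locking work. */
	if (seg_pinsert_check(seg, NULL, addr, len, 0) == SEGP_FAIL)
		return (ENOTSUP);	/* caller falls back to a slow path */

	if ((pp = my_lock_pages(seg, addr, len, rw)) == NULL)
		return (EFAULT);

	/*
	 * Cache the shadow list; on SEGP_FAIL it simply isn't cached and
	 * the matching unlock tears it down via the callback.
	 */
	(void) seg_pinsert(seg, NULL, addr, len, len, pp, rw, 0, my_reclaim);
	*ppp = pp;
	return (0);
}

static void
my_pageunlock(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw)
{
	/* Drops the active count; may reclaim now or leave it to pcache. */
	seg_pinactive(seg, NULL, addr, len, pp, rw, 0, my_reclaim);
}
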
931 932
932 933 /*
933 934 * purge entries from the pagelock cache if not active
934 935 * and not recently used.
935 936 */
936 937 static void
937 938 seg_ppurge_async(int force)
938 939 {
939 940 struct seg_pcache *delcallb_list = NULL;
940 941 struct seg_pcache *pcp;
941 942 struct seg_phash *hp;
942 943 pgcnt_t npages = 0;
943 944 pgcnt_t npages_window = 0;
944 945 pgcnt_t npgs_to_purge;
945 946 pgcnt_t npgs_purged = 0;
946 947 int hlinks = 0;
947 948 int hlix;
948 949 pcache_link_t *hlinkp;
949 950 pcache_link_t *hlnextp = NULL;
950 951 int lowmem;
951 952 int trim;
952 953
953 954 ASSERT(seg_phashsize_win != 0);
954 955
955 956 /*
956 957 * if the cache is off or empty, return
957 958 */
958 959 if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
959 960 return;
960 961 }
961 962
962 963 if (!force) {
963 964 lowmem = 0;
964 965 trim = 0;
965 966 if (freemem < lotsfree + needfree) {
966 967 spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
967 968 if (fmem <= 5 * (desfree >> 2)) {
968 969 lowmem = 1;
969 970 } else if (fmem <= 7 * (lotsfree >> 3)) {
970 971 if (seg_plocked_window >=
971 972 (availrmem_initial >> 1)) {
972 973 lowmem = 1;
973 974 }
974 975 } else if (fmem < lotsfree) {
975 976 if (seg_plocked_window >=
976 977 3 * (availrmem_initial >> 2)) {
977 978 lowmem = 1;
978 979 }
979 980 }
980 981 }
981 982 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
982 983 trim = 1;
983 984 }
984 985 if (!lowmem && !trim) {
985 986 return;
986 987 }
987 988 npgs_to_purge = seg_plocked_window >>
988 989 seg_pshrink_shift;
989 990 if (lowmem) {
990 991 npgs_to_purge = MIN(npgs_to_purge,
991 992 MAX(seg_pmaxapurge_npages, desfree));
992 993 } else {
993 994 npgs_to_purge = MIN(npgs_to_purge,
994 995 seg_pmaxapurge_npages);
995 996 }
996 997 if (npgs_to_purge == 0) {
997 998 return;
998 999 }
999 1000 } else {
1000 1001 struct seg_phash_wired *hpw;
1001 1002
1002 1003 ASSERT(seg_phashsize_wired != 0);
1003 1004
1004 1005 for (hpw = seg_phashtab_wired;
1005 1006 hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1006 1007
1007 1008 if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1008 1009 continue;
1009 1010 }
1010 1011
1011 1012 mutex_enter(&hpw->p_hmutex);
1012 1013
1013 1014 for (pcp = hpw->p_hnext;
1014 1015 pcp != (struct seg_pcache *)hpw;
1015 1016 pcp = pcp->p_hnext) {
1016 1017
1017 1018 ASSERT(IS_PCP_WIRED(pcp));
1018 1019 ASSERT(pcp->p_hashp ==
1019 1020 (struct seg_phash *)hpw);
1020 1021
1021 1022 if (pcp->p_active) {
1022 1023 continue;
1023 1024 }
1024 1025 pcp->p_hprev->p_hnext = pcp->p_hnext;
1025 1026 pcp->p_hnext->p_hprev = pcp->p_hprev;
1026 1027 pcp->p_hprev = delcallb_list;
1027 1028 delcallb_list = pcp;
1028 1029 }
1029 1030 mutex_exit(&hpw->p_hmutex);
1030 1031 }
1031 1032 }
1032 1033
1033 1034 mutex_enter(&seg_pmem_mtx);
1034 1035 if (seg_pathr_on) {
1035 1036 mutex_exit(&seg_pmem_mtx);
1036 1037 goto runcb;
1037 1038 }
1038 1039 seg_pathr_on = 1;
1039 1040 mutex_exit(&seg_pmem_mtx);
1040 1041 ASSERT(seg_pahcur <= 1);
1041 1042 hlix = !seg_pahcur;
1042 1043
1043 1044 again:
1044 1045 for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1045 1046 hlinkp = hlnextp) {
1046 1047
1047 1048 hlnextp = hlinkp->p_lnext;
1048 1049 ASSERT(hlnextp != NULL);
1049 1050
1050 1051 hp = hlink2phash(hlinkp, hlix);
1051 1052 if (hp->p_hnext == (struct seg_pcache *)hp) {
1052 1053 seg_pathr_empty_ahb++;
1053 1054 continue;
1054 1055 }
1055 1056 seg_pathr_full_ahb++;
1056 1057 mutex_enter(&hp->p_hmutex);
1057 1058
1058 1059 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1059 1060 pcp = pcp->p_hnext) {
1060 1061 pcache_link_t *pheadp;
1061 1062 pcache_link_t *plinkp;
1062 1063 void *htag0;
1063 1064 kmutex_t *pmtx;
1064 1065
1065 1066 ASSERT(!IS_PCP_WIRED(pcp));
1066 1067 ASSERT(pcp->p_hashp == hp);
1067 1068
1068 1069 if (pcp->p_active) {
1069 1070 continue;
1070 1071 }
1071 1072 if (!force && pcp->p_ref &&
1072 1073 PCP_AGE(pcp) < seg_pmax_pcpage) {
1073 1074 pcp->p_ref = 0;
1074 1075 continue;
1075 1076 }
1076 1077 plinkp = &pcp->p_plink;
1077 1078 htag0 = pcp->p_htag0;
1078 1079 if (pcp->p_flags & SEGP_AMP) {
1079 1080 pheadp = &((amp_t *)htag0)->a_phead;
1080 1081 pmtx = &((amp_t *)htag0)->a_pmtx;
1081 1082 } else {
1082 1083 pheadp = &((seg_t *)htag0)->s_phead;
1083 1084 pmtx = &((seg_t *)htag0)->s_pmtx;
1084 1085 }
1085 1086 if (!mutex_tryenter(pmtx)) {
1086 1087 continue;
1087 1088 }
1088 1089 ASSERT(pheadp->p_lnext != pheadp);
1089 1090 ASSERT(pheadp->p_lprev != pheadp);
1090 1091 plinkp->p_lprev->p_lnext =
1091 1092 plinkp->p_lnext;
1092 1093 plinkp->p_lnext->p_lprev =
1093 1094 plinkp->p_lprev;
1094 1095 pcp->p_hprev->p_hnext = pcp->p_hnext;
1095 1096 pcp->p_hnext->p_hprev = pcp->p_hprev;
1096 1097 mutex_exit(pmtx);
1097 1098 pcp->p_hprev = delcallb_list;
1098 1099 delcallb_list = pcp;
1099 1100 npgs_purged += btop(pcp->p_len);
1100 1101 }
1101 1102 if (hp->p_hnext == (struct seg_pcache *)hp) {
1102 1103 seg_premove_abuck(hp, 1);
1103 1104 }
1104 1105 mutex_exit(&hp->p_hmutex);
1105 1106 if (npgs_purged >= seg_plocked_window) {
1106 1107 break;
1107 1108 }
1108 1109 if (!force) {
1109 1110 if (npgs_purged >= npgs_to_purge) {
1110 1111 break;
1111 1112 }
1112 1113 if (!trim && !(seg_pathr_full_ahb & 15)) {
1113 1114 ASSERT(lowmem);
1114 1115 if (freemem >= lotsfree + needfree) {
1115 1116 break;
1116 1117 }
1117 1118 }
1118 1119 }
1119 1120 }
1120 1121
1121 1122 if (hlinkp == &seg_pahhead[hlix]) {
1122 1123 /*
1123 1124 * We processed the entire hlix active bucket list
1124 1125 * but didn't find enough pages to reclaim.
1125 1126 * Switch the lists and walk the other list
1126 1127 * if we haven't done it yet.
1127 1128 */
1128 1129 mutex_enter(&seg_pmem_mtx);
1129 1130 ASSERT(seg_pathr_on);
1130 1131 ASSERT(seg_pahcur == !hlix);
1131 1132 seg_pahcur = hlix;
1132 1133 mutex_exit(&seg_pmem_mtx);
1133 1134 if (++hlinks < 2) {
1134 1135 hlix = !hlix;
1135 1136 goto again;
1136 1137 }
1137 1138 } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1138 1139 seg_pahhead[hlix].p_lnext != hlinkp) {
1139 1140 ASSERT(hlinkp != NULL);
1140 1141 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1141 1142 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1142 1143 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1143 1144
1144 1145 /*
1145 1146 * Reinsert the header to point to hlinkp
1146 1147 * so that we start from hlinkp bucket next time around.
1147 1148 */
1148 1149 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1149 1150 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1150 1151 seg_pahhead[hlix].p_lnext = hlinkp;
1151 1152 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1152 1153 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1153 1154 hlinkp->p_lprev = &seg_pahhead[hlix];
1154 1155 }
1155 1156
1156 1157 mutex_enter(&seg_pmem_mtx);
1157 1158 ASSERT(seg_pathr_on);
1158 1159 seg_pathr_on = 0;
1159 1160 mutex_exit(&seg_pmem_mtx);
1160 1161
1161 1162 runcb:
1162 1163 /*
1163 1164 * Run the delayed callback list. segments/amps can't go away until
1164 1165 * callback is executed since they must have non 0 softlockcnt. That's
1165 1166 * why we don't need to hold as/seg/amp locks to execute the callback.
1166 1167 */
1167 1168 while (delcallb_list != NULL) {
1168 1169 pcp = delcallb_list;
1169 1170 delcallb_list = pcp->p_hprev;
1170 1171 ASSERT(!pcp->p_active);
1171 1172 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1172 1173 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1173 1174 npages += btop(pcp->p_len);
1174 1175 if (!IS_PCP_WIRED(pcp)) {
1175 1176 npages_window += btop(pcp->p_len);
1176 1177 }
1177 1178 kmem_cache_free(seg_pkmcache, pcp);
1178 1179 }
1179 1180 if (npages) {
1180 1181 mutex_enter(&seg_pmem_mtx);
1181 1182 ASSERT(seg_plocked >= npages);
1182 1183 ASSERT(seg_plocked_window >= npages_window);
1183 1184 seg_plocked -= npages;
1184 1185 seg_plocked_window -= npages_window;
1185 1186 mutex_exit(&seg_pmem_mtx);
1186 1187 }
1187 1188 }
1188 1189
1189 1190 /*
1190 1191 * Remove cached pages for segment(s) entries from hashtable. The segments
1191 1192 * are identified by pp array. This is useful for multiple seg's cached on
1192 1193 * behalf of dummy segment (ISM/DISM) with common pp array.
1193 1194 */
1194 1195 void
1195 1196 seg_ppurge_wiredpp(struct page **pp)
1196 1197 {
1197 1198 struct seg_pcache *pcp;
1198 1199 struct seg_phash_wired *hp;
1199 1200 pgcnt_t npages = 0;
1200 1201 struct seg_pcache *delcallb_list = NULL;
1201 1202
1202 1203 /*
1203 1204 * if the cache is empty, return
1204 1205 */
1205 1206 if (seg_plocked == 0) {
1206 1207 return;
1207 1208 }
1208 1209 ASSERT(seg_phashsize_wired != 0);
1209 1210
1210 1211 for (hp = seg_phashtab_wired;
1211 1212 hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1212 1213 if (hp->p_hnext == (struct seg_pcache *)hp) {
1213 1214 continue;
1214 1215 }
1215 1216 mutex_enter(&hp->p_hmutex);
1216 1217 pcp = hp->p_hnext;
1217 1218 while (pcp != (struct seg_pcache *)hp) {
1218 1219 ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1219 1220 ASSERT(IS_PCP_WIRED(pcp));
1220 1221 /*
1221 1222 * purge entries which are not active
1222 1223 */
1223 1224 if (!pcp->p_active && pcp->p_pp == pp) {
1224 1225 ASSERT(pcp->p_htag0 != NULL);
1225 1226 pcp->p_hprev->p_hnext = pcp->p_hnext;
1226 1227 pcp->p_hnext->p_hprev = pcp->p_hprev;
1227 1228 pcp->p_hprev = delcallb_list;
1228 1229 delcallb_list = pcp;
1229 1230 }
1230 1231 pcp = pcp->p_hnext;
1231 1232 }
1232 1233 mutex_exit(&hp->p_hmutex);
1233 1234 /*
1234 1235 * segments can't go away until callback is executed since
1235 1236 * they must have non 0 softlockcnt. That's why we don't
1236 1237 * need to hold as/seg locks to execute the callback.
1237 1238 */
1238 1239 while (delcallb_list != NULL) {
1239 1240 int done;
1240 1241 pcp = delcallb_list;
1241 1242 delcallb_list = pcp->p_hprev;
1242 1243 ASSERT(!pcp->p_active);
1243 1244 done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1244 1245 pcp->p_len, pcp->p_pp,
1245 1246 pcp->p_write ? S_WRITE : S_READ, 1);
1246 1247 npages += btop(pcp->p_len);
1247 1248 ASSERT(IS_PCP_WIRED(pcp));
1248 1249 kmem_cache_free(seg_pkmcache, pcp);
1249 1250 if (done) {
1250 1251 ASSERT(delcallb_list == NULL);
1251 1252 goto out;
1252 1253 }
1253 1254 }
1254 1255 }
1255 1256
1256 1257 out:
1257 1258 mutex_enter(&seg_pmem_mtx);
1258 1259 ASSERT(seg_plocked >= npages);
1259 1260 seg_plocked -= npages;
1260 1261 mutex_exit(&seg_pmem_mtx);
1261 1262 }
1262 1263
1263 1264 /*
1264 1265 * purge all entries for a given segment. Since we
1265 1266 * callback into the segment driver directly for page
1266 1267 * reclaim the caller needs to hold the right locks.
1267 1268 */
1268 1269 void
1269 1270 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1270 1271 {
1271 1272 struct seg_pcache *delcallb_list = NULL;
1272 1273 struct seg_pcache *pcp;
1273 1274 struct seg_phash *hp;
1274 1275 pgcnt_t npages = 0;
1275 1276 void *htag0;
1276 1277
1277 1278 if (seg_plocked == 0) {
1278 1279 return;
1279 1280 }
1280 1281 ASSERT(seg_phashsize_win != 0);
1281 1282
1282 1283 /*
1283 1284 * If amp is not NULL use amp as a lookup tag otherwise use seg
1284 1285 * as a lookup tag.
1285 1286 */
1286 1287 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1287 1288 ASSERT(htag0 != NULL);
1288 1289 if (IS_PFLAGS_WIRED(flags)) {
1289 1290 hp = P_HASHBP(seg, htag0, 0, flags);
1290 1291 mutex_enter(&hp->p_hmutex);
1291 1292 pcp = hp->p_hnext;
1292 1293 while (pcp != (struct seg_pcache *)hp) {
1293 1294 ASSERT(pcp->p_hashp == hp);
1294 1295 ASSERT(IS_PCP_WIRED(pcp));
1295 1296 if (pcp->p_htag0 == htag0) {
1296 1297 if (pcp->p_active) {
1297 1298 break;
1298 1299 }
1299 1300 pcp->p_hprev->p_hnext = pcp->p_hnext;
1300 1301 pcp->p_hnext->p_hprev = pcp->p_hprev;
1301 1302 pcp->p_hprev = delcallb_list;
1302 1303 delcallb_list = pcp;
1303 1304 }
1304 1305 pcp = pcp->p_hnext;
1305 1306 }
1306 1307 mutex_exit(&hp->p_hmutex);
1307 1308 } else {
1308 1309 pcache_link_t *plinkp;
1309 1310 pcache_link_t *pheadp;
1310 1311 kmutex_t *pmtx;
1311 1312
1312 1313 if (amp == NULL) {
1313 1314 ASSERT(seg != NULL);
1314 1315 pheadp = &seg->s_phead;
1315 1316 pmtx = &seg->s_pmtx;
1316 1317 } else {
1317 1318 pheadp = &amp->a_phead;
1318 1319 pmtx = &amp->a_pmtx;
1319 1320 }
1320 1321 mutex_enter(pmtx);
1321 1322 while ((plinkp = pheadp->p_lnext) != pheadp) {
1322 1323 pcp = plink2pcache(plinkp);
1323 1324 ASSERT(!IS_PCP_WIRED(pcp));
1324 1325 ASSERT(pcp->p_htag0 == htag0);
1325 1326 hp = pcp->p_hashp;
1326 1327 mutex_enter(&hp->p_hmutex);
1327 1328 if (pcp->p_active) {
1328 1329 mutex_exit(&hp->p_hmutex);
1329 1330 break;
1330 1331 }
1331 1332 ASSERT(plinkp->p_lprev == pheadp);
1332 1333 pheadp->p_lnext = plinkp->p_lnext;
1333 1334 plinkp->p_lnext->p_lprev = pheadp;
1334 1335 pcp->p_hprev->p_hnext = pcp->p_hnext;
1335 1336 pcp->p_hnext->p_hprev = pcp->p_hprev;
1336 1337 pcp->p_hprev = delcallb_list;
1337 1338 delcallb_list = pcp;
1338 1339 if (hp->p_hnext == (struct seg_pcache *)hp) {
1339 1340 seg_premove_abuck(hp, 0);
1340 1341 }
1341 1342 mutex_exit(&hp->p_hmutex);
1342 1343 }
1343 1344 mutex_exit(pmtx);
1344 1345 }
1345 1346 while (delcallb_list != NULL) {
1346 1347 pcp = delcallb_list;
1347 1348 delcallb_list = pcp->p_hprev;
1348 1349 ASSERT(!pcp->p_active);
1349 1350 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1350 1351 pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1351 1352 npages += btop(pcp->p_len);
1352 1353 kmem_cache_free(seg_pkmcache, pcp);
1353 1354 }
1354 1355 mutex_enter(&seg_pmem_mtx);
1355 1356 ASSERT(seg_plocked >= npages);
1356 1357 seg_plocked -= npages;
1357 1358 if (!IS_PFLAGS_WIRED(flags)) {
1358 1359 ASSERT(seg_plocked_window >= npages);
1359 1360 seg_plocked_window -= npages;
1360 1361 }
1361 1362 mutex_exit(&seg_pmem_mtx);
1362 1363 }
1363 1364
1364 1365 static void seg_pinit_mem_config(void);
1365 1366
1366 1367 /*
1367 1368 * setup the pagelock cache
1368 1369 */
1369 1370 static void
1370 1371 seg_pinit(void)
1371 1372 {
1372 1373 struct seg_phash *hp;
1373 1374 ulong_t i;
1374 1375 pgcnt_t physmegs;
1375 1376
1376 1377 seg_plocked = 0;
1377 1378 seg_plocked_window = 0;
1378 1379
1379 1380 if (segpcache_enabled == 0) {
1380 1381 seg_phashsize_win = 0;
1381 1382 seg_phashsize_wired = 0;
1382 1383 seg_pdisabled = 1;
1383 1384 return;
1384 1385 }
1385 1386
1386 1387 seg_pdisabled = 0;
1387 1388 seg_pkmcache = kmem_cache_create("seg_pcache",
1388 1389 sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1389 1390 if (segpcache_pcp_maxage_ticks <= 0) {
1390 1391 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1391 1392 }
1392 1393 seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1393 1394 seg_pathr_empty_ahb = 0;
1394 1395 seg_pathr_full_ahb = 0;
1395 1396 seg_pshrink_shift = segpcache_shrink_shift;
1396 1397 seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1397 1398
1398 1399 mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1399 1400 mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 1401 mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1401 1402 cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1402 1403
1403 1404 physmegs = physmem >> (20 - PAGESHIFT);
1404 1405
1405 1406 /*
1406 1407 * If segpcache_hashsize_win was not set in /etc/system or it has
1407 1408 * absurd value set it to a default.
1408 1409 */
1409 1410 if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1410 1411 /*
1411 1412 * Create one bucket per 32K (or at least per 8 pages) of
1412 1413 * available memory.
1413 1414 */
1414 1415 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1415 1416 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1416 1417 }
1417 1418 if (!ISP2(segpcache_hashsize_win)) {
1418 1419 ulong_t rndfac = ~(1UL <<
1419 1420 (highbit(segpcache_hashsize_win) - 1));
1420 1421 rndfac &= segpcache_hashsize_win;
1421 1422 segpcache_hashsize_win += rndfac;
1422 1423 segpcache_hashsize_win = 1 <<
1423 1424 (highbit(segpcache_hashsize_win) - 1);
1424 1425 }
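	/*
	 * Worked example of the rounding above (illustrative): a non power
	 * of two rounds to the nearest power of two. E.g. 1500: top bit
	 * 1024, remainder 476, 1500 + 476 = 1976, highbit(1976) = 11,
	 * result 1 << 10 = 1024 (rounds down); 1600: remainder 576,
	 * 1600 + 576 = 2176, highbit(2176) = 12, result 1 << 11 = 2048
	 * (rounds up).
	 */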
1425 1426 seg_phashsize_win = segpcache_hashsize_win;
1426 1427 seg_phashtab_win = kmem_zalloc(
1427 1428 seg_phashsize_win * sizeof (struct seg_phash),
1428 1429 KM_SLEEP);
1429 1430 for (i = 0; i < seg_phashsize_win; i++) {
1430 1431 hp = &seg_phashtab_win[i];
1431 1432 hp->p_hnext = (struct seg_pcache *)hp;
1432 1433 hp->p_hprev = (struct seg_pcache *)hp;
1433 1434 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1434 1435 }
1435 1436
1436 1437 seg_pahcur = 0;
1437 1438 seg_pathr_on = 0;
1438 1439 seg_pahhead[0].p_lnext = &seg_pahhead[0];
1439 1440 seg_pahhead[0].p_lprev = &seg_pahhead[0];
1440 1441 seg_pahhead[1].p_lnext = &seg_pahhead[1];
1441 1442 seg_pahhead[1].p_lprev = &seg_pahhead[1];
1442 1443
1443 1444 /*
1444 1445 * If segpcache_hashsize_wired was not set in /etc/system or it has
1445 1446 * absurd value set it to a default.
1446 1447 */
1447 1448 if (segpcache_hashsize_wired == 0 ||
1448 1449 segpcache_hashsize_wired > physmem / 4) {
1449 1450 /*
1450 1451 * Choose segpcache_hashsize_wired based on physmem.
1451 1452 * Create a bucket per 128K bytes, up to 256K buckets.
1452 1453 */
1453 1454 if (physmegs < 20 * 1024) {
1454 1455 segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1455 1456 } else {
1456 1457 segpcache_hashsize_wired = 256 * 1024;
1457 1458 }
1458 1459 }
1459 1460 if (!ISP2(segpcache_hashsize_wired)) {
1460 1461 segpcache_hashsize_wired = 1 <<
1461 1462 highbit(segpcache_hashsize_wired);
1462 1463 }
1463 1464 seg_phashsize_wired = segpcache_hashsize_wired;
1464 1465 seg_phashtab_wired = kmem_zalloc(
1465 1466 seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1466 1467 for (i = 0; i < seg_phashsize_wired; i++) {
1467 1468 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1468 1469 hp->p_hnext = (struct seg_pcache *)hp;
1469 1470 hp->p_hprev = (struct seg_pcache *)hp;
1470 1471 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1471 1472 }
1472 1473
1473 1474 if (segpcache_maxwindow == 0) {
1474 1475 if (physmegs < 64) {
1475 1476 /* 3% of memory */
1476 1477 segpcache_maxwindow = availrmem >> 5;
1477 1478 } else if (physmegs < 512) {
1478 1479 /* 12% of memory */
1479 1480 segpcache_maxwindow = availrmem >> 3;
1480 1481 } else if (physmegs < 1024) {
1481 1482 /* 25% of memory */
1482 1483 segpcache_maxwindow = availrmem >> 2;
1483 1484 } else if (physmegs < 2048) {
1484 1485 /* 50% of memory */
1485 1486 segpcache_maxwindow = availrmem >> 1;
1486 1487 } else {
1487 1488 /* no limit */
1488 1489 segpcache_maxwindow = (pgcnt_t)-1;
1489 1490 }
1490 1491 }
1491 1492 seg_pmaxwindow = segpcache_maxwindow;
1492 1493 seg_pinit_mem_config();
1493 1494 }
1494 1495
1495 1496 /*
1496 1497 * called by pageout if memory is low
1497 1498 */
1498 1499 void
1499 1500 seg_preap(void)
1500 1501 {
1501 1502 /*
1502 1503 * if the cache is off or empty, return
1503 1504 */
1504 1505 if (seg_plocked_window == 0) {
1505 1506 return;
1506 1507 }
1507 1508 ASSERT(seg_phashsize_win != 0);
1508 1509
1509 1510 /*
1510 1511 * If somebody is already purging pcache
1511 1512 * just return.
1512 1513 */
1513 1514 if (seg_pdisabled) {
1514 1515 return;
1515 1516 }
1516 1517
1517 1518 cv_signal(&seg_pasync_cv);
1518 1519 }
1519 1520
1520 1521 /*
1521 1522 * run as a background thread and reclaim pagelock
1522 1523 * pages which have not been used recently
1523 1524 */
1524 1525 void
1525 1526 seg_pasync_thread(void)
1526 1527 {
1527 1528 callb_cpr_t cpr_info;
1528 1529
1529 1530 if (seg_phashsize_win == 0) {
1530 1531 thread_exit();
1531 1532 /*NOTREACHED*/
1532 1533 }
1533 1534
1534 1535 seg_pasync_thr = curthread;
1535 1536
1536 1537 CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1537 1538 callb_generic_cpr, "seg_pasync");
1538 1539
1539 1540 if (segpcache_reap_ticks <= 0) {
1540 1541 segpcache_reap_ticks = segpcache_reap_sec * hz;
1541 1542 }
1542 1543
1543 1544 mutex_enter(&seg_pasync_mtx);
1544 1545 for (;;) {
1545 1546 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1546 1547 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1547 1548 segpcache_reap_ticks, TR_CLOCK_TICK);
1548 1549 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1549 1550 if (seg_pdisabled == 0) {
1550 1551 seg_ppurge_async(0);
1551 1552 }
1552 1553 }
1553 1554 }
1554 1555
1555 1556 static struct kmem_cache *seg_cache;
1556 1557
1557 1558 /*
1558 1559 * Initialize segment management data structures.
1559 1560 */
1560 1561 void
1561 1562 seg_init(void)
1562 1563 {
1563 1564 kstat_t *ksp;
1564 1565
1565 1566 seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566 1567 0, NULL, NULL, NULL, NULL, NULL, 0);
1567 1568
1568 1569 ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1569 1570 segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1570 1571 if (ksp) {
1571 1572 ksp->ks_data = (void *)segadvstat_ptr;
1572 1573 kstat_install(ksp);
1573 1574 }
1574 1575
1575 1576 seg_pinit();
1576 1577 }
1577 1578
1578 1579 /*
1579 1580 * Allocate a segment to cover [base, base+size]
1580 1581 * and attach it to the specified address space.
1581 1582 */
1582 1583 struct seg *
1583 1584 seg_alloc(struct as *as, caddr_t base, size_t size)
1584 1585 {
1585 1586 struct seg *new;
1586 1587 caddr_t segbase;
1587 1588 size_t segsize;
1588 1589
1589 1590 segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1590 1591 segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1591 1592 (uintptr_t)segbase;
1592 1593
1593 1594 if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1594 1595 return ((struct seg *)NULL); /* bad virtual addr range */
1595 1596
1596 1597 if (as != &kas &&
1597 1598 valid_usr_range(segbase, segsize, 0, as,
1598 1599 as->a_userlimit) != RANGE_OKAY)
1599 1600 return ((struct seg *)NULL); /* bad virtual addr range */
1600 1601
1601 1602 new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1602 1603 new->s_ops = NULL;
1603 1604 new->s_data = NULL;
1604 1605 new->s_szc = 0;
1605 1606 new->s_flags = 0;
1606 1607 mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607 1608 new->s_phead.p_lnext = &new->s_phead;
1608 1609 new->s_phead.p_lprev = &new->s_phead;
1609 1610 if (seg_attach(as, segbase, segsize, new) < 0) {
1610 1611 kmem_cache_free(seg_cache, new);
1611 1612 return ((struct seg *)NULL);
1612 1613 }
1613 1614 /* caller must fill in ops, data */
1614 1615 return (new);
1615 1616 }
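
A sketch of the intended calling pattern for seg_alloc(): the caller holds the address space locked as writer, lets a segment driver's create routine fill in s_ops and s_data, and uses seg_free() on failure since no mapping was established yet. segfoo_create() and the errno choice are hypothetical stand-ins (the real path is as_map() calling e.g. segvn_create()).

/* Hypothetical driver create routine that sets seg->s_ops and seg->s_data. */
extern int segfoo_create(struct seg *, void *);

static int
map_with_foo(struct as *as, caddr_t base, size_t size, void *crargs)
{
	struct seg *seg;
	int err;

	/* Page-aligns the range and links an empty segment into 'as'. */
	if ((seg = seg_alloc(as, base, size)) == NULL)
		return (ENOMEM);

	if ((err = segfoo_create(seg, crargs)) != 0) {
		/* No mapping established yet, so seg_free(), not seg_unmap(). */
		seg_free(seg);
		return (err);
	}
	return (0);
}
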
1616 1617
1617 1618 /*
1618 1619 * Attach a segment to the address space. Used by seg_alloc()
1619 1620 * and for kernel startup to attach to static segments.
1620 1621 */
1621 1622 int
1622 1623 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1623 1624 {
1624 1625 seg->s_as = as;
1625 1626 seg->s_base = base;
1626 1627 seg->s_size = size;
1627 1628
1628 1629 /*
1629 1630 * as_addseg() will add the segment at the appropriate point
1630 1631 * in the list. It will return -1 if there is overlap with
1631 1632 * an already existing segment.
1632 1633 */
1633 1634 return (as_addseg(as, seg));
1634 1635 }
1635 1636
1636 1637 /*
1637 1638 * Unmap a segment and free it from its associated address space.
1638 1639 * This should be called by anybody who's finished with a whole segment's
1639 1640 * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
1640 1641 * responsibility of the segment driver to unlink the segment
1641 1642 * from the address space, and to free public and private data structures
1642 1643 * associated with the segment. (This is typically done by a call to
1643 1644 * seg_free()).
1644 1645 */
1645 1646 void
1646 1647 seg_unmap(struct seg *seg)
1647 1648 {
1648 1649 #ifdef DEBUG
1649 1650 int ret;
1650 1651 #endif /* DEBUG */
1651 1652
1652 1653 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1653 1654
1654 1655 /* Shouldn't have called seg_unmap if mapping isn't yet established */
1655 1656 ASSERT(seg->s_data != NULL);
1656 1657
1657 1658 /* Unmap the whole mapping */
1658 1659 #ifdef DEBUG
1659 1660 ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1660 1661 ASSERT(ret == 0);
1661 1662 #else
1662 1663 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1663 1664 #endif /* DEBUG */
1664 1665 }
1665 1666
1666 1667 /*
1667 1668 * Free the segment from its associated as. This should only be called
1668 1669 * if a mapping to the segment has not yet been established (e.g., if
1669 1670 * an error occurs in the middle of doing an as_map when the segment
1670 1671 * has already been partially set up) or if it has already been deleted
1671 1672 * (e.g., from a segment driver unmap routine if the unmap applies to the
1672 1673 * entire segment). If the mapping is currently set up then seg_unmap() should
1673 1674 * be called instead.
1674 1675 */
1675 1676 void
1676 1677 seg_free(struct seg *seg)
1677 1678 {
1678 1679 register struct as *as = seg->s_as;
1679 1680 struct seg *tseg = as_removeseg(as, seg);
1680 1681
1681 1682 ASSERT(tseg == seg);
1682 1683
1683 1684 /*
1684 1685 * If the segment private data field is NULL,
1685 1686 * then segment driver is not attached yet.
1686 1687 */
1687 1688 if (seg->s_data != NULL)
1688 1689 SEGOP_FREE(seg);
1689 1690
1690 1691 mutex_destroy(&seg->s_pmtx);
1691 1692 ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692 1693 ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1693 1694 kmem_cache_free(seg_cache, seg);
1694 1695 }
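
A hedged sketch of the division of labor described in the seg_unmap()/seg_free() comments above, assuming a hypothetical driver (my_seg_unmap() and my_segdata_t are illustrative names): when the unmap covers the entire segment, the driver unloads the translations, frees its private data, and finishes with seg_free().

	/*
	 * Hedged sketch only: a hypothetical driver unmap entry point
	 * handling the whole-segment case and ending with seg_free().
	 */
	static int
	my_seg_unmap(struct seg *seg, caddr_t addr, size_t len)
	{
		my_segdata_t *data = seg->s_data;

		ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

		/* only the whole-segment case is sketched here */
		if (addr != seg->s_base || len != seg->s_size)
			return (EINVAL);

		hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
		kmem_free(data, sizeof (*data));
		seg->s_data = NULL;

		seg_free(seg);	/* unlinks the segment from its as and frees it */
		return (0);
	}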
1695 1696
1696 1697 /*ARGSUSED*/
1697 1698 static void
1698 1699 seg_p_mem_config_post_add(
1699 1700 void *arg,
1700 1701 pgcnt_t delta_pages)
1701 1702 {
1702 1703 /* Nothing to do. */
1703 1704 }
1704 1705
1705 1706 void
1706 1707 seg_p_enable(void)
1707 1708 {
1708 1709 mutex_enter(&seg_pcache_mtx);
1709 1710 ASSERT(seg_pdisabled != 0);
1710 1711 seg_pdisabled--;
1711 1712 mutex_exit(&seg_pcache_mtx);
1712 1713 }
1713 1714
1714 1715 /*
1715 1716 * seg_p_disable - disables seg_pcache, and then attempts to empty the
1716 1717 * cache.
1717 1718 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1718 1719 * SEGP_FAIL if the cache could not be emptied.
1719 1720 */
1720 1721 int
1721 1722 seg_p_disable(void)
1722 1723 {
1723 1724 pgcnt_t old_plocked;
1724 1725 int stall_count = 0;
1725 1726
1726 1727 mutex_enter(&seg_pcache_mtx);
1727 1728 seg_pdisabled++;
1728 1729 ASSERT(seg_pdisabled != 0);
1729 1730 mutex_exit(&seg_pcache_mtx);
1730 1731
1731 1732 /*
1732 1733 * Attempt to empty the cache. Terminate if seg_plocked does not
1733 1734 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1734 1735 */
1735 1736 while (seg_plocked != 0) {
1736 1737 ASSERT(seg_phashsize_win != 0);
1737 1738 old_plocked = seg_plocked;
1738 1739 seg_ppurge_async(1);
1739 1740 if (seg_plocked == old_plocked) {
1740 1741 if (stall_count++ > SEGP_STALL_THRESHOLD) {
1741 1742 return (SEGP_FAIL);
1742 1743 }
1743 1744 } else
1744 1745 stall_count = 0;
1745 1746 if (seg_plocked != 0)
1746 1747 delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1747 1748 }
1748 1749 return (SEGP_SUCCESS);
1749 1750 }
1750 1751
1751 1752 /*
1752 1753 * Attempt to purge seg_pcache. May need to return before this has
1753 1754 * completed to allow other pre_del callbacks to unlock pages. This is
1754 1755 * ok because:
1755 1756 * 1) The seg_pdisabled flag has been set so at least we won't
1756 1757 * cache any more locks and the locks we couldn't purge
1757 1758 * will not be held if they do get released by a subsequent
1758 1759 * pre-delete callback.
1759 1760 *
1760 1761 * 2) The rest of the memory delete thread processing does not
1761 1762 * depend on the changes made in this pre-delete callback. No
1762 1763 * panics will result, the worst that will happen is that the
1763 1764 * DR code will timeout and cancel the delete.
1764 1765 */
1765 1766 /*ARGSUSED*/
1766 1767 static int
1767 1768 seg_p_mem_config_pre_del(
1768 1769 void *arg,
1769 1770 pgcnt_t delta_pages)
1770 1771 {
1771 1772 if (seg_phashsize_win == 0) {
1772 1773 return (0);
1773 1774 }
1774 1775 if (seg_p_disable() != SEGP_SUCCESS)
1775 1776 cmn_err(CE_NOTE,
1776 1777 "!Pre-delete couldn't purge"" pagelock cache - continuing");
1777 1778 return (0);
1778 1779 }
1779 1780
1780 1781 /*ARGSUSED*/
1781 1782 static void
1782 1783 seg_p_mem_config_post_del(
1783 1784 void *arg,
1784 1785 pgcnt_t delta_pages,
1785 1786 int cancelled)
1786 1787 {
1787 1788 if (seg_phashsize_win == 0) {
1788 1789 return;
1789 1790 }
1790 1791 seg_p_enable();
1791 1792 }
1792 1793
1793 1794 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1794 1795 KPHYSM_SETUP_VECTOR_VERSION,
1795 1796 seg_p_mem_config_post_add,
1796 1797 seg_p_mem_config_pre_del,
1797 1798 seg_p_mem_config_post_del,
1798 1799 };
1799 1800
1800 1801 static void
1801 1802 seg_pinit_mem_config(void)
1802 1803 {
1803 1804 int ret;
1804 1805
1805 1806 ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1806 1807 /*
1807 1808 * Want to catch this in the debug kernel. At run time, if the
1808 1809 * callbacks don't get run all will be OK as the disable just makes
1809 1810 * it more likely that the pages can be collected.
1810 1811 */
1811 1812 ASSERT(ret == 0);
1812 1813 }
1813 1814
1814 1815 /*
1815 1816 * Verify that segment is not a shared anonymous segment which reserves
1816 1817 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
1817 1818 * from one zone to another if any segments are shared. This is because the
1818 1819 * last process to exit will credit the swap reservation. This could lead
1819 1820 * to the swap being reserved by one zone, and credited to another.
1820 1821 */
1821 1822 boolean_t
1822 1823 seg_can_change_zones(struct seg *seg)
1823 1824 {
1824 1825 struct segvn_data *svd;
1825 1826
1826 1827 if (seg->s_ops == &segspt_shmops)
1827 1828 return (B_FALSE);
1828 1829
1829 1830 if (seg->s_ops == &segvn_ops) {
1830 1831 svd = (struct segvn_data *)seg->s_data;
1831 1832 if (svd->type == MAP_SHARED &&
1832 1833 svd->amp != NULL &&
1833 1834 svd->amp->swresv > 0)
1834 1835 return (B_FALSE);
1835 1836 }
1836 1837 return (B_TRUE);
1837 1838 }
1838 1839
1839 1840 /*
1840 1841 * Return swap reserved by a segment backing a private mapping.
1841 1842 */
1842 1843 size_t
1843 1844 seg_swresv(struct seg *seg)
1844 1845 {
1845 1846 struct segvn_data *svd;
1846 1847 size_t swap = 0;
1847 1848
1848 1849 if (seg->s_ops == &segvn_ops) {
1849 1850 svd = (struct segvn_data *)seg->s_data;
1850 1851 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1851 1852 swap = svd->swresv;
1852 1853 }
1853 1854 return (swap);
1854 1855 }
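
A hedged sketch of how a caller might combine the two helpers above when moving a process's zone.max-swap accounting between zones; move_swap_accounting() and its AS_SEGFIRST()/AS_SEGNEXT() walk are illustrative, not taken from this change.

	/*
	 * Hedged sketch only: refuse the move if any segment is a
	 * swap-reserving shared anonymous segment, otherwise total the
	 * private swap so the caller can debit one zone and credit the other.
	 */
	static int
	move_swap_accounting(struct as *as, size_t *swresvp)
	{
		struct seg *seg;
		size_t swresv = 0;

		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		for (seg = AS_SEGFIRST(as); seg != NULL;
		    seg = AS_SEGNEXT(as, seg)) {
			if (!seg_can_change_zones(seg)) {
				AS_LOCK_EXIT(as, &as->a_lock);
				return (EINVAL);
			}
			swresv += seg_swresv(seg);
		}
		AS_LOCK_EXIT(as, &as->a_lock);

		*swresvp = swresv;	/* caller adjusts the zones' swap counts */
		return (0);
	}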
1855 1856
1856 1857 /*
1857 1858 * General not supported function for SEGOP_INHERIT
1858 1859 */
1859 1860 /* ARGSUSED */
1860 1861 int
1861 1862 seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1862 1863 {
1863 1864 return (ENOTSUP);
1865 +}
1866 +
1867 +/*
1868 + * segop wrappers
1869 + */
1870 +int
1871 +segop_dup(struct seg *seg, struct seg *new)
1872 +{
1873 + return (seg->s_ops->dup(seg, new));
1874 +}
1875 +
1876 +int
1877 +segop_unmap(struct seg *seg, caddr_t addr, size_t len)
1878 +{
1879 + return (seg->s_ops->unmap(seg, addr, len));
1880 +}
1881 +
1882 +void
1883 +segop_free(struct seg *seg)
1884 +{
1885 + seg->s_ops->free(seg);
1886 +}
1887 +
1888 +faultcode_t
1889 +segop_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
1890 + enum fault_type type, enum seg_rw rw)
1891 +{
1892 + return (seg->s_ops->fault(hat, seg, addr, len, type, rw));
1893 +}
1894 +
1895 +faultcode_t
1896 +segop_faulta(struct seg *seg, caddr_t addr)
1897 +{
1898 + return (seg->s_ops->faulta(seg, addr));
1899 +}
1900 +
1901 +int
1902 +segop_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1903 +{
1904 + return (seg->s_ops->setprot(seg, addr, len, prot));
1905 +}
1906 +
1907 +int
1908 +segop_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1909 +{
1910 + return (seg->s_ops->checkprot(seg, addr, len, prot));
1911 +}
1912 +
1913 +int
1914 +segop_kluster(struct seg *seg, caddr_t addr, ssize_t d)
1915 +{
1916 + return (seg->s_ops->kluster(seg, addr, d));
1917 +}
1918 +
1919 +size_t
1920 +segop_swapout(struct seg *seg)
1921 +{
1922 + return (seg->s_ops->swapout(seg));
1923 +}
1924 +
1925 +int
1926 +segop_sync(struct seg *seg, caddr_t addr, size_t len, int atr, uint_t f)
1927 +{
1928 + return (seg->s_ops->sync(seg, addr, len, atr, f));
1929 +}
1930 +
1931 +size_t
1932 +segop_incore(struct seg *seg, caddr_t addr, size_t len, char *v)
1933 +{
1934 + return (seg->s_ops->incore(seg, addr, len, v));
1935 +}
1936 +
1937 +int
1938 +segop_lockop(struct seg *seg, caddr_t addr, size_t len, int atr, int op,
1939 + ulong_t *b, size_t p)
1940 +{
1941 + return (seg->s_ops->lockop(seg, addr, len, atr, op, b, p));
1942 +}
1943 +
1944 +int
1945 +segop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *p)
1946 +{
1947 + return (seg->s_ops->getprot(seg, addr, len, p));
1948 +}
1949 +
1950 +u_offset_t
1951 +segop_getoffset(struct seg *seg, caddr_t addr)
1952 +{
1953 + return (seg->s_ops->getoffset(seg, addr));
1954 +}
1955 +
1956 +int
1957 +segop_gettype(struct seg *seg, caddr_t addr)
1958 +{
1959 + return (seg->s_ops->gettype(seg, addr));
1960 +}
1961 +
1962 +int
1963 +segop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
1964 +{
1965 + return (seg->s_ops->getvp(seg, addr, vpp));
1966 +}
1967 +
1968 +int
1969 +segop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t b)
1970 +{
1971 + return (seg->s_ops->advise(seg, addr, len, b));
1972 +}
1973 +
1974 +void
1975 +segop_dump(struct seg *seg)
1976 +{
1977 + seg->s_ops->dump(seg);
1978 +}
1979 +
1980 +int
1981 +segop_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***page,
1982 + enum lock_type type, enum seg_rw rw)
1983 +{
1984 + return (seg->s_ops->pagelock(seg, addr, len, page, type, rw));
1985 +}
1986 +
1987 +int
1988 +segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
1989 +{
1990 + return (seg->s_ops->setpagesize(seg, addr, len, szc));
1991 +}
1992 +
1993 +int
1994 +segop_getmemid(struct seg *seg, caddr_t addr, memid_t *mp)
1995 +{
1996 + return (seg->s_ops->getmemid(seg, addr, mp));
1997 +}
1998 +
1999 +struct lgrp_mem_policy_info *
2000 +segop_getpolicy(struct seg *seg, caddr_t addr)
2001 +{
2002 + if (seg->s_ops->getpolicy == NULL)
2003 + return (NULL);
2004 +
2005 + return (seg->s_ops->getpolicy(seg, addr));
2006 +}
2007 +
2008 +int
2009 +segop_capable(struct seg *seg, segcapability_t cap)
2010 +{
2011 + return (seg->s_ops->capable(seg, cap));
2012 +}
2013 +
2014 +int
2015 +segop_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t op)
2016 +{
2017 + if (seg->s_ops->inherit == NULL)
2018 + return (ENOTSUP);
2019 +
2020 + return (seg->s_ops->inherit(seg, addr, len, op));
1864 2021 }
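
A hedged illustration of what these wrappers change at call sites (example_protect() is a hypothetical caller, not part of this webrev): a SEGOP_* macro invocation becomes a call to the corresponding segop_* function with the same arguments and return value, giving one real function per operation that can be instrumented or extended.

	/*
	 * Hedged sketch only: converting a hypothetical call site from the
	 * SEGOP_SETPROT() macro to the segop_setprot() wrapper defined above.
	 */
	static int
	example_protect(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
	{
		/* was: return (SEGOP_SETPROT(seg, addr, len, prot)); */
		return (segop_setprot(seg, addr, len, prot));
	}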