patch segpcache-maxwindow-is-useless
use NULL dump segop as a shorthand for no-op
Instead of forcing every segment driver to implement a dummy function that
does nothing, handle a NULL dump segop function pointer as a no-op shorthand.
const-ify segment ops structures
There is no reason to keep the segment ops structures writable.
use NULL setpagesize segop as a shorthand for ENOTSUP
Instead of forcing every segment driver to implement a dummy function to
return (hopefully) ENOTSUP, handle a NULL setpagesize segop function pointer
as "return ENOTSUP" shorthand.
use NULL getmemid segop as a shorthand for ENODEV
Instead of forcing every segment driver to implement a dummy function to
return (hopefully) ENODEV, handle a NULL getmemid segop function pointer as
"return ENODEV" shorthand.
use NULL capable segop as a shorthand for no-capabilities
Instead of forcing every segment driver to implement a dummy "return 0"
function, handle a NULL capable segop function pointer as "no capabilities
supported" shorthand.
seg_inherit_notsup is redundant since segop_inherit checks for NULL properly
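To make the NULL shorthand concrete, here is a minimal sketch of the dispatch
pattern these changes enable. It is illustrative only, not the verbatim patch;
it assumes the usual illumos segop signatures from <vm/seg.h> and dispatch
through seg->s_ops:

	int
	segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
	{
		/* a driver with no setpagesize op just leaves the pointer NULL */
		if (seg->s_ops->setpagesize == NULL)
			return (ENOTSUP);
		return (seg->s_ops->setpagesize(seg, addr, len, szc));
	}

	void
	segop_dump(struct seg *seg)
	{
		/* NULL means "nothing to dump" -- no dummy function required */
		if (seg->s_ops->dump == NULL)
			return;
		seg->s_ops->dump(seg);
	}

getmemid and capable follow the same shape, returning ENODEV and 0
respectively when the function pointer is NULL.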
patch lower-case-segops
instead of using SEGOP_* macros, define full-fledged segop_* functions
This will allow us to do some sanity checking or even implement stub
functionality in one place instead of duplicating it wherever these wrappers
are used.
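A sketch of the macro-to-function conversion (hedged: the macro body below
follows the traditional <vm/seg.h> style, and the function is illustrative
rather than the exact patch text):

	/* before: a dispatch macro, expanded verbatim at every call site */
	#define	SEGOP_UNMAP(s, a, l)	(*(s)->s_ops->unmap)((s), (a), (l))

	/*
	 * after: an ordinary function -- shared sanity checks and NULL-op
	 * stubs can now live in exactly one place
	 */
	int
	segop_unmap(struct seg *seg, caddr_t addr, size_t len)
	{
		return (seg->s_ops->unmap(seg, addr, len));
	}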
--- old/usr/src/uts/common/vm/vm_seg.c
+++ new/usr/src/uts/common/vm/vm_seg.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright (c) 2015, Joyent, Inc.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 32 * The Regents of the University of California
33 33 * All Rights Reserved
34 34 *
35 35 * University Acknowledgment- Portions of this document are derived from
36 36 * software developed by the University of California, Berkeley, and its
37 37 * contributors.
38 38 */
39 39
40 40 /*
41 41 * VM - segment management.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/inttypes.h>
46 46 #include <sys/t_lock.h>
47 47 #include <sys/param.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/kmem.h>
50 50 #include <sys/sysmacros.h>
51 51 #include <sys/vmsystm.h>
52 52 #include <sys/tuneable.h>
53 53 #include <sys/debug.h>
54 54 #include <sys/fs/swapnode.h>
55 55 #include <sys/cmn_err.h>
56 56 #include <sys/callb.h>
57 57 #include <sys/mem_config.h>
58 58 #include <sys/mman.h>
59 59
60 60 #include <vm/hat.h>
61 61 #include <vm/as.h>
62 62 #include <vm/seg.h>
63 63 #include <vm/seg_kmem.h>
64 64 #include <vm/seg_spt.h>
65 65 #include <vm/seg_vn.h>
66 66 #include <vm/anon.h>
67 67
68 68 /*
69 69 * kstats for segment advise
70 70 */
71 71 segadvstat_t segadvstat = {
72 72 { "MADV_FREE_hit", KSTAT_DATA_ULONG },
73 73 { "MADV_FREE_miss", KSTAT_DATA_ULONG },
74 74 };
75 75
76 76 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
77 77 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
78 78
79 79 /*
80 80 * entry in the segment page cache
81 81 */
82 82 struct seg_pcache {
83 83 struct seg_pcache *p_hnext; /* list for hashed blocks */
84 84 struct seg_pcache *p_hprev;
85 85 pcache_link_t p_plink; /* per segment/amp list */
86 86 void *p_htag0; /* segment/amp pointer */
87 87 caddr_t p_addr; /* base address/anon_idx */
88 88 size_t p_len; /* total bytes */
89 89 size_t p_wlen; /* writable bytes at p_addr */
90 90 struct page **p_pp; /* pp shadow list */
91 91 seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
92 92 clock_t p_lbolt; /* lbolt from last use */
93 93 struct seg_phash *p_hashp; /* our pcache hash bucket */
94 94 uint_t p_active; /* active count */
95 95 uchar_t p_write; /* true if S_WRITE */
96 96 uchar_t p_ref; /* reference byte */
97 97 ushort_t p_flags; /* bit flags */
98 98 };
99 99
100 100 struct seg_phash {
101 101 struct seg_pcache *p_hnext; /* list for hashed blocks */
102 102 struct seg_pcache *p_hprev;
103 103 kmutex_t p_hmutex; /* protects hash bucket */
104 104 pcache_link_t p_halink[2]; /* active bucket linkages */
105 105 };
106 106
107 107 struct seg_phash_wired {
108 108 struct seg_pcache *p_hnext; /* list for hashed blocks */
109 109 struct seg_pcache *p_hprev;
110 110 kmutex_t p_hmutex; /* protects hash bucket */
111 111 };
112 112
113 113 /*
114 114 * A parameter to control a maximum number of bytes that can be
115 115 * purged from pcache at a time.
116 116 */
117 117 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
118 118
119 119 /*
120 120 * log2(fraction of pcache to reclaim at a time).
121 121 */
122 122 #define P_SHRINK_SHFT (5)
123 123
124 124 /*
125 125 * The following variables can be tuned via /etc/system.
126 126 */
127 127
128 128 int segpcache_enabled = 1; /* if 1, shadow lists are cached */
129 -pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */
130 129 ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */
131 130 ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
132 131 int segpcache_reap_sec = 1; /* reap check rate in secs */
133 132 clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */
134 133 int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */
135 134 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
136 135 int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
137 136 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
138 137
139 138 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
140 139 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
141 140 static kcondvar_t seg_pasync_cv;
142 141
143 142 #pragma align 64(pctrl1)
144 143 #pragma align 64(pctrl2)
145 144 #pragma align 64(pctrl3)
146 145
147 146 /*
148 147 * Keep frequently used variables together in one cache line.
149 148 */
150 149 static struct p_ctrl1 {
151 150 uint_t p_disabled; /* if not 0, caching temporarily off */
152 - pgcnt_t p_maxwin; /* max # of pages that can be cached */
153 151 size_t p_hashwin_sz; /* # of non wired buckets */
154 152 struct seg_phash *p_htabwin; /* hash table for non wired entries */
155 153 size_t p_hashwired_sz; /* # of wired buckets */
156 154 struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
157 155 kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */
158 156 #ifdef _LP64
159 - ulong_t pad[1];
157 + ulong_t pad[2];
160 158 #endif /* _LP64 */
161 159 } pctrl1;
162 160
163 161 static struct p_ctrl2 {
164 162 kmutex_t p_mem_mtx; /* protects window counter and p_halinks */
165 163 pgcnt_t p_locked_win; /* # pages from window */
166 164 pgcnt_t p_locked; /* # of pages cached by pagelock */
167 165 uchar_t p_ahcur; /* current active links for insert/delete */
168 166 uchar_t p_athr_on; /* async reclaim thread is running. */
169 167 pcache_link_t p_ahhead[2]; /* active buckets linkages */
170 168 } pctrl2;
171 169
172 170 static struct p_ctrl3 {
173 171 clock_t p_pcp_maxage; /* max pcp age in ticks */
174 172 ulong_t p_athr_empty_ahb; /* athread walk stats */
175 173 ulong_t p_athr_full_ahb; /* athread walk stats */
176 174 pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */
177 175 int p_shrink_shft; /* reap shift factor */
178 176 #ifdef _LP64
179 177 ulong_t pad[3];
180 178 #endif /* _LP64 */
181 179 } pctrl3;
182 180
183 181 #define seg_pdisabled pctrl1.p_disabled
184 -#define seg_pmaxwindow pctrl1.p_maxwin
185 182 #define seg_phashsize_win pctrl1.p_hashwin_sz
186 183 #define seg_phashtab_win pctrl1.p_htabwin
187 184 #define seg_phashsize_wired pctrl1.p_hashwired_sz
188 185 #define seg_phashtab_wired pctrl1.p_htabwired
189 186 #define seg_pkmcache pctrl1.p_kmcache
190 187 #define seg_pmem_mtx pctrl2.p_mem_mtx
191 188 #define seg_plocked_window pctrl2.p_locked_win
192 189 #define seg_plocked pctrl2.p_locked
193 190 #define seg_pahcur pctrl2.p_ahcur
194 191 #define seg_pathr_on pctrl2.p_athr_on
195 192 #define seg_pahhead pctrl2.p_ahhead
196 193 #define seg_pmax_pcpage pctrl3.p_pcp_maxage
197 194 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
198 195 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
199 196 #define seg_pshrink_shift pctrl3.p_shrink_shft
200 197 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
201 198
202 199 #define P_HASHWIN_MASK (seg_phashsize_win - 1)
203 200 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
204 201 #define P_BASESHIFT (6)
205 202
206 203 kthread_t *seg_pasync_thr;
207 204
208 -extern struct seg_ops segvn_ops;
209 -extern struct seg_ops segspt_shmops;
205 +extern const struct seg_ops segvn_ops;
206 +extern const struct seg_ops segspt_shmops;
210 207
211 208 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
212 209 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
213 210
214 211 #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t)))
215 212
216 213 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
217 214
218 215 /*
219 216 * htag0 argument can be a seg or amp pointer.
220 217 */
221 218 #define P_HASHBP(seg, htag0, addr, flags) \
222 219 (IS_PFLAGS_WIRED((flags)) ? \
223 220 ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
224 221 ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
225 222 (&seg_phashtab_win[P_HASHWIN_MASK & \
226 223 (((uintptr_t)(htag0) >> 3) ^ \
227 224 ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \
228 225 (flags >> 16) : page_get_shift((seg)->s_szc))))]))
229 226
230 227 /*
231 228 * htag0 argument can be a seg or amp pointer.
232 229 */
233 230 #define P_MATCH(pcp, htag0, addr, len) \
234 231 ((pcp)->p_htag0 == (htag0) && \
235 232 (pcp)->p_addr == (addr) && \
236 233 (pcp)->p_len >= (len))
237 234
238 235 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \
239 236 ((pcp)->p_pp == (pp) && \
240 237 (pcp)->p_htag0 == (htag0) && \
241 238 (pcp)->p_addr == (addr) && \
242 239 (pcp)->p_len >= (len))
243 240
244 241 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
245 242 offsetof(struct seg_pcache, p_plink)))
246 243
247 244 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
248 245 offsetof(struct seg_phash, p_halink[l])))
249 246
250 247 /*
251 248 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
252 249 * active hash bucket lists. We maintain active bucket lists to reduce the
253 250 * overhead of finding active buckets during asynchronous purging since there
254 251 * can be 10s of millions of buckets on a large system but only a small subset
255 252 * of them in actual use.
256 253 *
257 254 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
258 255 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
259 256 * buckets. The other list is used by asynchronous purge thread. This allows
260 257 * the purge thread to walk its active list without holding seg_pmem_mtx for a
261 258 * long time. When asynchronous thread is done with its list it switches to
262 259 * current active list and makes the list it just finished processing as
263 260 * current active list.
264 261 *
265 262 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
266 263 * yet on any list. seg_premove_abuck() may remove the bucket from either
267 264 * list. If the bucket is on current list it will be always removed. Otherwise
268 265 * the bucket is only removed if asynchronous purge thread is not currently
269 266 * running or seg_premove_abuck() is called by asynchronous purge thread
270 267 * itself. A given bucket can only be on one of active lists at a time. These
271 268 * routines should be called with per bucket lock held. The routines use
272 269 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
273 270 * the first entry is added to the bucket chain and seg_premove_abuck() must
274 271 * be called after the last pcp entry is deleted from its chain. Per bucket
275 272 * lock should be held by the callers. This avoids a potential race condition
276 273 * when seg_premove_abuck() removes a bucket after pcp entries are added to
277 274 * its list after the caller checked that the bucket has no entries. (this
278 275 * race would cause a loss of an active bucket from the active lists).
279 276 *
280 277 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
281 278 * New entries are added to the end of the list since LRU is used as the
282 279 * purging policy.
283 280 */
284 281 static void
285 282 seg_padd_abuck(struct seg_phash *hp)
286 283 {
287 284 int lix;
288 285
289 286 ASSERT(MUTEX_HELD(&hp->p_hmutex));
290 287 ASSERT((struct seg_phash *)hp->p_hnext != hp);
291 288 ASSERT((struct seg_phash *)hp->p_hprev != hp);
292 289 ASSERT(hp->p_hnext == hp->p_hprev);
293 290 ASSERT(!IS_PCP_WIRED(hp->p_hnext));
294 291 ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
295 292 ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
296 293 ASSERT(hp >= seg_phashtab_win &&
297 294 hp < &seg_phashtab_win[seg_phashsize_win]);
298 295
299 296 /*
300 297 * This bucket can already be on one of active lists
301 298 * since seg_premove_abuck() may have failed to remove it
302 299 * before.
303 300 */
304 301 mutex_enter(&seg_pmem_mtx);
305 302 lix = seg_pahcur;
306 303 ASSERT(lix >= 0 && lix <= 1);
307 304 if (hp->p_halink[lix].p_lnext != NULL) {
308 305 ASSERT(hp->p_halink[lix].p_lprev != NULL);
309 306 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
310 307 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
311 308 mutex_exit(&seg_pmem_mtx);
312 309 return;
313 310 }
314 311 ASSERT(hp->p_halink[lix].p_lprev == NULL);
315 312
316 313 /*
317 314 * If this bucket is still on list !lix async thread can't yet remove
318 315 * it since we hold here per bucket lock. In this case just return
319 316 * since async thread will eventually find and process this bucket.
320 317 */
321 318 if (hp->p_halink[!lix].p_lnext != NULL) {
322 319 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
323 320 mutex_exit(&seg_pmem_mtx);
324 321 return;
325 322 }
326 323 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
327 324 /*
328 325 * This bucket is not on any active bucket list yet.
329 326 * Add the bucket to the tail of current active list.
330 327 */
331 328 hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
332 329 hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
333 330 seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
334 331 seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
335 332 mutex_exit(&seg_pmem_mtx);
336 333 }
337 334
338 335 static void
339 336 seg_premove_abuck(struct seg_phash *hp, int athr)
340 337 {
341 338 int lix;
342 339
343 340 ASSERT(MUTEX_HELD(&hp->p_hmutex));
344 341 ASSERT((struct seg_phash *)hp->p_hnext == hp);
345 342 ASSERT((struct seg_phash *)hp->p_hprev == hp);
346 343 ASSERT(hp >= seg_phashtab_win &&
347 344 hp < &seg_phashtab_win[seg_phashsize_win]);
348 345
349 346 if (athr) {
350 347 ASSERT(seg_pathr_on);
351 348 ASSERT(seg_pahcur <= 1);
352 349 /*
353 350 * We are called by asynchronous thread that found this bucket
354 351 * on not currently active (i.e. !seg_pahcur) list. Remove it
355 352 * from there. Per bucket lock we are holding makes sure
356 353 * seg_pinsert() can't sneak in and add pcp entries to this
357 354 * bucket right before we remove the bucket from its list.
358 355 */
359 356 lix = !seg_pahcur;
360 357 ASSERT(hp->p_halink[lix].p_lnext != NULL);
361 358 ASSERT(hp->p_halink[lix].p_lprev != NULL);
362 359 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
363 360 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
364 361 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
365 362 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
366 363 hp->p_halink[lix].p_lnext = NULL;
367 364 hp->p_halink[lix].p_lprev = NULL;
368 365 return;
369 366 }
370 367
371 368 mutex_enter(&seg_pmem_mtx);
372 369 lix = seg_pahcur;
373 370 ASSERT(lix >= 0 && lix <= 1);
374 371
375 372 /*
376 373 * If the bucket is on currently active list just remove it from
377 374 * there.
378 375 */
379 376 if (hp->p_halink[lix].p_lnext != NULL) {
380 377 ASSERT(hp->p_halink[lix].p_lprev != NULL);
381 378 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
382 379 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
383 380 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
384 381 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
385 382 hp->p_halink[lix].p_lnext = NULL;
386 383 hp->p_halink[lix].p_lprev = NULL;
387 384 mutex_exit(&seg_pmem_mtx);
388 385 return;
389 386 }
390 387 ASSERT(hp->p_halink[lix].p_lprev == NULL);
391 388
392 389 /*
393 390 * If asynchronous thread is not running we can remove the bucket from
394 391 * not currently active list. The bucket must be on this list since we
395 392 * already checked that it's not on the other list and the bucket from
396 393 * which we just deleted the last pcp entry must be still on one of the
397 394 * active bucket lists.
398 395 */
399 396 lix = !lix;
400 397 ASSERT(hp->p_halink[lix].p_lnext != NULL);
401 398 ASSERT(hp->p_halink[lix].p_lprev != NULL);
402 399
403 400 if (!seg_pathr_on) {
404 401 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
405 402 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
406 403 hp->p_halink[lix].p_lnext = NULL;
407 404 hp->p_halink[lix].p_lprev = NULL;
408 405 }
409 406 mutex_exit(&seg_pmem_mtx);
410 407 }
411 408
412 409 /*
413 410 * Check if bucket pointed by hp already has a pcp entry that matches request
414 411 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
415 412 * Also delete matching entries that cover smaller address range but start
416 413 * at the same address as addr argument. Return the list of deleted entries if
417 414 * any. This is an internal helper function called from seg_pinsert() only
418 415 * for non wired shadow lists. The caller already holds a per seg/amp list
419 416 * lock.
420 417 */
421 418 static struct seg_pcache *
422 419 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
423 420 caddr_t addr, size_t len, int *found)
424 421 {
425 422 struct seg_pcache *pcp;
426 423 struct seg_pcache *delcallb_list = NULL;
427 424
428 425 ASSERT(MUTEX_HELD(&hp->p_hmutex));
429 426
430 427 *found = 0;
431 428 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
432 429 pcp = pcp->p_hnext) {
433 430 ASSERT(pcp->p_hashp == hp);
434 431 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
435 432 ASSERT(!IS_PCP_WIRED(pcp));
436 433 if (pcp->p_len < len) {
437 434 pcache_link_t *plinkp;
438 435 if (pcp->p_active) {
439 436 continue;
440 437 }
441 438 plinkp = &pcp->p_plink;
442 439 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
443 440 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
444 441 pcp->p_hprev->p_hnext = pcp->p_hnext;
445 442 pcp->p_hnext->p_hprev = pcp->p_hprev;
446 443 pcp->p_hprev = delcallb_list;
447 444 delcallb_list = pcp;
448 445 } else {
449 446 *found = 1;
450 447 break;
451 448 }
452 449 }
453 450 }
454 451 return (delcallb_list);
455 452 }
456 453
457 454 /*
458 455 * lookup an address range in pagelock cache. Return shadow list and bump up
459 456 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
460 457 * as a lookup tag.
461 458 */
462 459 struct page **
463 460 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
464 461 enum seg_rw rw, uint_t flags)
465 462 {
466 463 struct seg_pcache *pcp;
467 464 struct seg_phash *hp;
468 465 void *htag0;
469 466
470 467 ASSERT(seg != NULL);
471 468 ASSERT(rw == S_READ || rw == S_WRITE);
472 469
473 470 /*
474 471 * Skip pagelock cache, while DR is in progress or
475 472 * seg_pcache is off.
476 473 */
477 474 if (seg_pdisabled) {
478 475 return (NULL);
479 476 }
480 477 ASSERT(seg_phashsize_win != 0);
481 478
482 479 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
483 480 hp = P_HASHBP(seg, htag0, addr, flags);
484 481 mutex_enter(&hp->p_hmutex);
485 482 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
486 483 pcp = pcp->p_hnext) {
487 484 ASSERT(pcp->p_hashp == hp);
488 485 if (P_MATCH(pcp, htag0, addr, len)) {
489 486 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
490 487 /*
491 488 * If this request wants to write pages
492 489 * but write permissions starting from
493 490 * addr don't cover the entire length len
494 491 * return lookup failure back to the caller.
495 492 * It will check protections and fail this
497 494 * pagelock operation with EACCES error.
497 494 */
498 495 if (rw == S_WRITE && pcp->p_wlen < len) {
499 496 break;
500 497 }
501 498 if (pcp->p_active == UINT_MAX) {
502 499 break;
503 500 }
504 501 pcp->p_active++;
505 502 if (rw == S_WRITE && !pcp->p_write) {
506 503 pcp->p_write = 1;
507 504 }
508 505 mutex_exit(&hp->p_hmutex);
509 506 return (pcp->p_pp);
510 507 }
511 508 }
512 509 mutex_exit(&hp->p_hmutex);
513 510 return (NULL);
514 511 }
515 512
516 513 /*
517 514 * mark address range inactive. If the cache is off or the address range is
518 515 * not in the cache or another shadow list that covers bigger range is found
519 516 * we call the segment driver to reclaim the pages. Otherwise just decrement
520 517 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
521 518 * otherwise use seg as a lookup tag.
522 519 */
523 520 void
524 521 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
525 522 size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
526 523 seg_preclaim_cbfunc_t callback)
527 524 {
528 525 struct seg_pcache *pcp;
529 526 struct seg_phash *hp;
530 527 kmutex_t *pmtx = NULL;
531 528 pcache_link_t *pheadp;
532 529 void *htag0;
533 530 pgcnt_t npages = 0;
534 531 int keep = 0;
535 532
536 533 ASSERT(seg != NULL);
537 534 ASSERT(rw == S_READ || rw == S_WRITE);
538 535
539 536 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
540 537
541 538 /*
542 539 * Skip lookup if pcache is not configured.
543 540 */
544 541 if (seg_phashsize_win == 0) {
545 542 goto out;
546 543 }
547 544
548 545 /*
549 546 * Grab per seg/amp lock before hash lock if we are going to remove
550 547 * inactive entry from pcache.
551 548 */
552 549 if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
553 550 if (amp == NULL) {
554 551 pheadp = &seg->s_phead;
555 552 pmtx = &seg->s_pmtx;
556 553 } else {
557 554 pheadp = &amp->a_phead;
558 555 pmtx = &amp->a_pmtx;
559 556 }
560 557 mutex_enter(pmtx);
561 558 }
562 559
563 560 hp = P_HASHBP(seg, htag0, addr, flags);
564 561 mutex_enter(&hp->p_hmutex);
565 562 again:
566 563 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
567 564 pcp = pcp->p_hnext) {
568 565 ASSERT(pcp->p_hashp == hp);
569 566 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
570 567 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
571 568 ASSERT(pcp->p_active);
572 569 if (keep) {
573 570 /*
574 571 * Don't remove this pcp entry
575 572 * if we didn't find duplicate
576 573 * shadow lists on second search.
577 574 * Somebody removed those duplicates
578 575 * since we dropped hash lock after first
579 576 * search.
580 577 */
581 578 ASSERT(pmtx != NULL);
582 579 ASSERT(!IS_PFLAGS_WIRED(flags));
583 580 mutex_exit(pmtx);
584 581 pmtx = NULL;
585 582 }
586 583 pcp->p_active--;
587 584 if (pcp->p_active == 0 && (pmtx != NULL ||
588 585 (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
589 586
590 587 /*
591 588 * This entry is no longer active. Remove it
592 589 * now either because pcaching is temporarily
593 590 * disabled or there're other pcp entries that
594 591 * can match this pagelock request (i.e. this
595 592 * entry is a duplicate).
596 593 */
597 594
598 595 ASSERT(callback == pcp->p_callback);
599 596 if (pmtx != NULL) {
600 597 pcache_link_t *plinkp = &pcp->p_plink;
601 598 ASSERT(!IS_PCP_WIRED(pcp));
602 599 ASSERT(pheadp->p_lnext != pheadp);
603 600 ASSERT(pheadp->p_lprev != pheadp);
604 601 plinkp->p_lprev->p_lnext =
605 602 plinkp->p_lnext;
606 603 plinkp->p_lnext->p_lprev =
607 604 plinkp->p_lprev;
608 605 }
609 606 pcp->p_hprev->p_hnext = pcp->p_hnext;
610 607 pcp->p_hnext->p_hprev = pcp->p_hprev;
611 608 if (!IS_PCP_WIRED(pcp) &&
612 609 hp->p_hnext == (struct seg_pcache *)hp) {
613 610 /*
614 611 * We removed the last entry from this
615 612 * bucket. Now remove the bucket from
616 613 * its active list.
617 614 */
618 615 seg_premove_abuck(hp, 0);
619 616 }
620 617 mutex_exit(&hp->p_hmutex);
621 618 if (pmtx != NULL) {
622 619 mutex_exit(pmtx);
623 620 }
624 621 len = pcp->p_len;
625 622 npages = btop(len);
626 623 if (rw != S_WRITE && pcp->p_write) {
627 624 rw = S_WRITE;
628 625 }
629 626 kmem_cache_free(seg_pkmcache, pcp);
630 627 goto out;
631 628 } else {
632 629 /*
633 630 * We found a matching pcp entry but will not
634 631 * free it right away even if it's no longer
635 632 * active.
636 633 */
637 634 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
638 635 /*
639 636 * Set the reference bit and mark the
640 637 * time of last access to this pcp
641 638 * so that asynchronous thread doesn't
642 639 * free it immediately since
643 640 * it may be reactivated very soon.
644 641 */
645 642 pcp->p_lbolt = ddi_get_lbolt();
646 643 pcp->p_ref = 1;
647 644 }
648 645 mutex_exit(&hp->p_hmutex);
649 646 if (pmtx != NULL) {
650 647 mutex_exit(pmtx);
651 648 }
652 649 return;
653 650 }
654 651 } else if (!IS_PFLAGS_WIRED(flags) &&
655 652 P_MATCH(pcp, htag0, addr, len)) {
656 653 /*
657 654 * This is a duplicate pcp entry. This situation may
658 655 * happen if a bigger shadow list that covers our
659 656 * range was added while our entry was still active.
660 657 * Now we can free our pcp entry if it becomes
661 658 * inactive.
662 659 */
663 660 if (!pcp->p_active) {
664 661 /*
665 662 * Mark this entry as referenced just in case
666 663 * we'll free our own pcp entry soon.
667 664 */
668 665 pcp->p_lbolt = ddi_get_lbolt();
669 666 pcp->p_ref = 1;
670 667 }
671 668 if (pmtx != NULL) {
672 669 /*
673 670 * we are already holding pmtx and found a
674 671 * duplicate. Don't keep our own pcp entry.
675 672 */
676 673 keep = 0;
677 674 continue;
678 675 }
679 676 /*
680 677 * We have to use mutex_tryenter to attempt to lock
681 678 * seg/amp list lock since we already hold hash lock
682 679 * and seg/amp list lock is above hash lock in lock
683 680 * order. If mutex_tryenter fails drop hash lock and
684 681 * retake both locks in correct order and research
685 682 * this hash chain.
686 683 */
687 684 ASSERT(keep == 0);
688 685 if (amp == NULL) {
689 686 pheadp = &seg->s_phead;
690 687 pmtx = &seg->s_pmtx;
691 688 } else {
692 689 pheadp = &amp->a_phead;
693 690 pmtx = &amp->a_pmtx;
694 691 }
695 692 if (!mutex_tryenter(pmtx)) {
696 693 mutex_exit(&hp->p_hmutex);
697 694 mutex_enter(pmtx);
698 695 mutex_enter(&hp->p_hmutex);
699 696 /*
700 697 * If we don't find bigger shadow list on
701 698 * second search (it may happen since we
702 699 * dropped bucket lock) keep the entry that
703 700 * matches our own shadow list.
704 701 */
705 702 keep = 1;
706 703 goto again;
707 704 }
708 705 }
709 706 }
710 707 mutex_exit(&hp->p_hmutex);
711 708 if (pmtx != NULL) {
712 709 mutex_exit(pmtx);
713 710 }
714 711 out:
715 712 (*callback)(htag0, addr, len, pp, rw, 0);
716 713 if (npages) {
717 714 mutex_enter(&seg_pmem_mtx);
718 715 ASSERT(seg_plocked >= npages);
719 716 seg_plocked -= npages;
720 717 if (!IS_PFLAGS_WIRED(flags)) {
721 718 ASSERT(seg_plocked_window >= npages);
722 719 seg_plocked_window -= npages;
723 720 }
724 721 mutex_exit(&seg_pmem_mtx);
725 722 }
726 723
727 724 }
728 725
729 726 #ifdef DEBUG
730 727 static uint32_t p_insert_chk_mtbf = 0;
731 728 #endif
732 729
733 730 /*
734 731 * The seg_pinsert_check() is used by segment drivers to predict whether
735 732 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
736 733 */
737 734 /*ARGSUSED*/
738 735 int
739 736 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
740 737 size_t len, uint_t flags)
741 738 {
742 739 ASSERT(seg != NULL);
743 740
744 741 #ifdef DEBUG
745 742 if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
746 743 return (SEGP_FAIL);
747 744 }
748 745 #endif
749 746
750 747 if (seg_pdisabled) {
751 748 return (SEGP_FAIL);
752 749 }
753 750 ASSERT(seg_phashsize_win != 0);
754 751
755 752 if (IS_PFLAGS_WIRED(flags)) {
756 753 return (SEGP_SUCCESS);
757 754 }
758 755
759 - if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
760 - return (SEGP_FAIL);
761 - }
762 -
763 756 if (freemem < desfree) {
764 757 return (SEGP_FAIL);
765 758 }
766 759
767 760 return (SEGP_SUCCESS);
768 761 }
769 762
770 763 #ifdef DEBUG
771 764 static uint32_t p_insert_mtbf = 0;
772 765 #endif
773 766
774 767 /*
775 768 * Insert address range with shadow list into pagelock cache if there's no
776 769 * shadow list already cached for this address range. If the cache is off or
777 770 * caching is temporarily disabled or the allowed 'window' is exceeded return
778 771 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
779 772 *
780 773 * For non wired shadow lists (segvn case) include address in the hashing
781 774 * function to avoid linking all the entries from the same segment or amp on
782 775 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
783 776 * pcache entries are also linked on a per segment/amp list so that all
784 777 * entries can be found quickly during seg/amp purge without walking the
785 778 * entire pcache hash table. For wired shadow lists (segspt case) we
786 779 * don't use address hashing and per segment linking because the caller
787 780 * currently inserts only one entry per segment that covers the entire
788 781 * segment. If we used per segment linking even for segspt it would complicate
789 782 * seg_ppurge_wiredpp() locking.
790 783 *
791 784 * Both hash bucket and per seg/amp locks need to be held before adding a non
792 785 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
793 786 * first.
794 787 *
795 788 * This function will also remove from pcache old inactive shadow lists that
796 789 * overlap with this request but cover smaller range for the same start
797 790 * address.
798 791 */
799 792 int
800 793 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
801 794 size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
802 795 seg_preclaim_cbfunc_t callback)
803 796 {
804 797 struct seg_pcache *pcp;
805 798 struct seg_phash *hp;
806 799 pgcnt_t npages;
807 800 pcache_link_t *pheadp;
808 801 kmutex_t *pmtx;
809 802 struct seg_pcache *delcallb_list = NULL;
810 803
811 804 ASSERT(seg != NULL);
812 805 ASSERT(rw == S_READ || rw == S_WRITE);
813 806 ASSERT(rw == S_READ || wlen == len);
814 807 ASSERT(rw == S_WRITE || wlen <= len);
815 808 ASSERT(amp == NULL || wlen == len);
816 809
817 810 #ifdef DEBUG
818 811 if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
819 812 return (SEGP_FAIL);
820 813 }
821 814 #endif
822 815
823 816 if (seg_pdisabled) {
824 817 return (SEGP_FAIL);
825 818 }
826 819 ASSERT(seg_phashsize_win != 0);
827 820
828 821 ASSERT((len & PAGEOFFSET) == 0);
829 822 npages = btop(len);
830 823 mutex_enter(&seg_pmem_mtx);
831 824 if (!IS_PFLAGS_WIRED(flags)) {
832 - if (seg_plocked_window + npages > seg_pmaxwindow) {
833 - mutex_exit(&seg_pmem_mtx);
834 - return (SEGP_FAIL);
835 - }
836 825 seg_plocked_window += npages;
837 826 }
838 827 seg_plocked += npages;
839 828 mutex_exit(&seg_pmem_mtx);
840 829
841 830 pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
842 831 /*
843 832 * If amp is not NULL set htag0 to amp otherwise set it to seg.
844 833 */
845 834 if (amp == NULL) {
846 835 pcp->p_htag0 = (void *)seg;
847 836 pcp->p_flags = flags & 0xffff;
848 837 } else {
849 838 pcp->p_htag0 = (void *)amp;
850 839 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
851 840 }
852 841 pcp->p_addr = addr;
853 842 pcp->p_len = len;
854 843 pcp->p_wlen = wlen;
855 844 pcp->p_pp = pp;
856 845 pcp->p_write = (rw == S_WRITE);
857 846 pcp->p_callback = callback;
858 847 pcp->p_active = 1;
859 848
860 849 hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
861 850 if (!IS_PFLAGS_WIRED(flags)) {
862 851 int found;
863 852 void *htag0;
864 853 if (amp == NULL) {
865 854 pheadp = &seg->s_phead;
866 855 pmtx = &seg->s_pmtx;
867 856 htag0 = (void *)seg;
868 857 } else {
869 858 pheadp = &amp->a_phead;
870 859 pmtx = &amp->a_pmtx;
871 860 htag0 = (void *)amp;
872 861 }
873 862 mutex_enter(pmtx);
874 863 mutex_enter(&hp->p_hmutex);
875 864 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
876 865 len, &found);
877 866 if (found) {
878 867 mutex_exit(&hp->p_hmutex);
879 868 mutex_exit(pmtx);
880 869 mutex_enter(&seg_pmem_mtx);
881 870 seg_plocked -= npages;
882 871 seg_plocked_window -= npages;
883 872 mutex_exit(&seg_pmem_mtx);
884 873 kmem_cache_free(seg_pkmcache, pcp);
885 874 goto out;
886 875 }
887 876 pcp->p_plink.p_lnext = pheadp->p_lnext;
888 877 pcp->p_plink.p_lprev = pheadp;
889 878 pheadp->p_lnext->p_lprev = &pcp->p_plink;
890 879 pheadp->p_lnext = &pcp->p_plink;
891 880 } else {
892 881 mutex_enter(&hp->p_hmutex);
893 882 }
894 883 pcp->p_hashp = hp;
895 884 pcp->p_hnext = hp->p_hnext;
896 885 pcp->p_hprev = (struct seg_pcache *)hp;
897 886 hp->p_hnext->p_hprev = pcp;
898 887 hp->p_hnext = pcp;
899 888 if (!IS_PFLAGS_WIRED(flags) &&
900 889 hp->p_hprev == pcp) {
901 890 seg_padd_abuck(hp);
902 891 }
903 892 mutex_exit(&hp->p_hmutex);
904 893 if (!IS_PFLAGS_WIRED(flags)) {
905 894 mutex_exit(pmtx);
906 895 }
907 896
908 897 out:
909 898 npages = 0;
910 899 while (delcallb_list != NULL) {
911 900 pcp = delcallb_list;
912 901 delcallb_list = pcp->p_hprev;
913 902 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
914 903 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
915 904 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
916 905 npages += btop(pcp->p_len);
917 906 kmem_cache_free(seg_pkmcache, pcp);
918 907 }
919 908 if (npages) {
920 909 ASSERT(!IS_PFLAGS_WIRED(flags));
921 910 mutex_enter(&seg_pmem_mtx);
922 911 ASSERT(seg_plocked >= npages);
923 912 ASSERT(seg_plocked_window >= npages);
924 913 seg_plocked -= npages;
925 914 seg_plocked_window -= npages;
926 915 mutex_exit(&seg_pmem_mtx);
927 916 }
928 917
929 918 return (SEGP_SUCCESS);
930 919 }
931 920
932 921 /*
933 922 * purge entries from the pagelock cache if not active
934 923 * and not recently used.
935 924 */
936 925 static void
937 926 seg_ppurge_async(int force)
938 927 {
939 928 struct seg_pcache *delcallb_list = NULL;
940 929 struct seg_pcache *pcp;
941 930 struct seg_phash *hp;
942 931 pgcnt_t npages = 0;
943 932 pgcnt_t npages_window = 0;
944 933 pgcnt_t npgs_to_purge;
945 934 pgcnt_t npgs_purged = 0;
946 935 int hlinks = 0;
947 936 int hlix;
948 937 pcache_link_t *hlinkp;
949 938 pcache_link_t *hlnextp = NULL;
950 939 int lowmem;
951 - int trim;
952 940
953 941 ASSERT(seg_phashsize_win != 0);
954 942
955 943 /*
956 944 * if the cache is off or empty, return
957 945 */
958 946 if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
959 947 return;
960 948 }
961 949
962 950 if (!force) {
963 951 lowmem = 0;
964 - trim = 0;
965 952 if (freemem < lotsfree + needfree) {
966 953 spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
967 954 if (fmem <= 5 * (desfree >> 2)) {
968 955 lowmem = 1;
969 956 } else if (fmem <= 7 * (lotsfree >> 3)) {
970 957 if (seg_plocked_window >=
971 958 (availrmem_initial >> 1)) {
972 959 lowmem = 1;
973 960 }
974 961 } else if (fmem < lotsfree) {
975 962 if (seg_plocked_window >=
976 963 3 * (availrmem_initial >> 2)) {
977 964 lowmem = 1;
978 965 }
979 966 }
980 967 }
981 - if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
982 - trim = 1;
983 - }
984 - if (!lowmem && !trim) {
968 + if (!lowmem) {
985 969 return;
986 970 }
987 971 npgs_to_purge = seg_plocked_window >>
988 972 seg_pshrink_shift;
989 973 if (lowmem) {
990 974 npgs_to_purge = MIN(npgs_to_purge,
991 975 MAX(seg_pmaxapurge_npages, desfree));
992 976 } else {
993 977 npgs_to_purge = MIN(npgs_to_purge,
994 978 seg_pmaxapurge_npages);
995 979 }
996 980 if (npgs_to_purge == 0) {
997 981 return;
998 982 }
999 983 } else {
1000 984 struct seg_phash_wired *hpw;
1001 985
1002 986 ASSERT(seg_phashsize_wired != 0);
1003 987
1004 988 for (hpw = seg_phashtab_wired;
1005 989 hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1006 990
1007 991 if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1008 992 continue;
1009 993 }
1010 994
1011 995 mutex_enter(&hpw->p_hmutex);
1012 996
1013 997 for (pcp = hpw->p_hnext;
1014 998 pcp != (struct seg_pcache *)hpw;
1015 999 pcp = pcp->p_hnext) {
1016 1000
1017 1001 ASSERT(IS_PCP_WIRED(pcp));
1018 1002 ASSERT(pcp->p_hashp ==
1019 1003 (struct seg_phash *)hpw);
1020 1004
1021 1005 if (pcp->p_active) {
1022 1006 continue;
1023 1007 }
1024 1008 pcp->p_hprev->p_hnext = pcp->p_hnext;
1025 1009 pcp->p_hnext->p_hprev = pcp->p_hprev;
1026 1010 pcp->p_hprev = delcallb_list;
1027 1011 delcallb_list = pcp;
1028 1012 }
1029 1013 mutex_exit(&hpw->p_hmutex);
1030 1014 }
1031 1015 }
1032 1016
1033 1017 mutex_enter(&seg_pmem_mtx);
1034 1018 if (seg_pathr_on) {
1035 1019 mutex_exit(&seg_pmem_mtx);
1036 1020 goto runcb;
1037 1021 }
1038 1022 seg_pathr_on = 1;
1039 1023 mutex_exit(&seg_pmem_mtx);
1040 1024 ASSERT(seg_pahcur <= 1);
1041 1025 hlix = !seg_pahcur;
1042 1026
1043 1027 again:
1044 1028 for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1045 1029 hlinkp = hlnextp) {
1046 1030
1047 1031 hlnextp = hlinkp->p_lnext;
1048 1032 ASSERT(hlnextp != NULL);
1049 1033
1050 1034 hp = hlink2phash(hlinkp, hlix);
1051 1035 if (hp->p_hnext == (struct seg_pcache *)hp) {
1052 1036 seg_pathr_empty_ahb++;
1053 1037 continue;
1054 1038 }
1055 1039 seg_pathr_full_ahb++;
1056 1040 mutex_enter(&hp->p_hmutex);
1057 1041
1058 1042 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1059 1043 pcp = pcp->p_hnext) {
1060 1044 pcache_link_t *pheadp;
1061 1045 pcache_link_t *plinkp;
1062 1046 void *htag0;
1063 1047 kmutex_t *pmtx;
1064 1048
1065 1049 ASSERT(!IS_PCP_WIRED(pcp));
1066 1050 ASSERT(pcp->p_hashp == hp);
1067 1051
1068 1052 if (pcp->p_active) {
1069 1053 continue;
1070 1054 }
1071 1055 if (!force && pcp->p_ref &&
1072 1056 PCP_AGE(pcp) < seg_pmax_pcpage) {
1073 1057 pcp->p_ref = 0;
1074 1058 continue;
1075 1059 }
1076 1060 plinkp = &pcp->p_plink;
1077 1061 htag0 = pcp->p_htag0;
1078 1062 if (pcp->p_flags & SEGP_AMP) {
1079 1063 pheadp = &((amp_t *)htag0)->a_phead;
1080 1064 pmtx = &((amp_t *)htag0)->a_pmtx;
1081 1065 } else {
1082 1066 pheadp = &((seg_t *)htag0)->s_phead;
1083 1067 pmtx = &((seg_t *)htag0)->s_pmtx;
1084 1068 }
1085 1069 if (!mutex_tryenter(pmtx)) {
1086 1070 continue;
1087 1071 }
1088 1072 ASSERT(pheadp->p_lnext != pheadp);
1089 1073 ASSERT(pheadp->p_lprev != pheadp);
1090 1074 plinkp->p_lprev->p_lnext =
1091 1075 plinkp->p_lnext;
1092 1076 plinkp->p_lnext->p_lprev =
1093 1077 plinkp->p_lprev;
1094 1078 pcp->p_hprev->p_hnext = pcp->p_hnext;
1095 1079 pcp->p_hnext->p_hprev = pcp->p_hprev;
1096 1080 mutex_exit(pmtx);
1097 1081 pcp->p_hprev = delcallb_list;
1098 1082 delcallb_list = pcp;
1099 1083 npgs_purged += btop(pcp->p_len);
1100 1084 }
1101 1085 if (hp->p_hnext == (struct seg_pcache *)hp) {
1102 1086 seg_premove_abuck(hp, 1);
1103 1087 }
1104 1088 mutex_exit(&hp->p_hmutex);
1105 1089 if (npgs_purged >= seg_plocked_window) {
1106 1090 break;
1107 1091 }
1108 1092 if (!force) {
1109 1093 if (npgs_purged >= npgs_to_purge) {
1110 1094 break;
1111 1095 }
1112 - if (!trim && !(seg_pathr_full_ahb & 15)) {
1096 + if (!(seg_pathr_full_ahb & 15)) {
1113 1097 ASSERT(lowmem);
1114 1098 if (freemem >= lotsfree + needfree) {
1115 1099 break;
1116 1100 }
1117 1101 }
1118 1102 }
1119 1103 }
1120 1104
1121 1105 if (hlinkp == &seg_pahhead[hlix]) {
1122 1106 /*
1123 1107 * We processed the entire hlix active bucket list
1124 1108 * but didn't find enough pages to reclaim.
1125 1109 * Switch the lists and walk the other list
1126 1110 * if we haven't done it yet.
1127 1111 */
1128 1112 mutex_enter(&seg_pmem_mtx);
1129 1113 ASSERT(seg_pathr_on);
1130 1114 ASSERT(seg_pahcur == !hlix);
1131 1115 seg_pahcur = hlix;
1132 1116 mutex_exit(&seg_pmem_mtx);
1133 1117 if (++hlinks < 2) {
1134 1118 hlix = !hlix;
1135 1119 goto again;
1136 1120 }
1137 1121 } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1138 1122 seg_pahhead[hlix].p_lnext != hlinkp) {
1139 1123 ASSERT(hlinkp != NULL);
1140 1124 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1141 1125 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1142 1126 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1143 1127
1144 1128 /*
1145 1129 * Reinsert the header to point to hlinkp
1146 1130 * so that we start from hlinkp bucket next time around.
1147 1131 */
1148 1132 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1149 1133 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1150 1134 seg_pahhead[hlix].p_lnext = hlinkp;
1151 1135 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1152 1136 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1153 1137 hlinkp->p_lprev = &seg_pahhead[hlix];
1154 1138 }
1155 1139
1156 1140 mutex_enter(&seg_pmem_mtx);
1157 1141 ASSERT(seg_pathr_on);
1158 1142 seg_pathr_on = 0;
1159 1143 mutex_exit(&seg_pmem_mtx);
1160 1144
1161 1145 runcb:
1162 1146 /*
1163 1147 * Run the delayed callback list. segments/amps can't go away until
1164 1148 * callback is executed since they must have non 0 softlockcnt. That's
1165 1149 * why we don't need to hold as/seg/amp locks to execute the callback.
1166 1150 */
1167 1151 while (delcallb_list != NULL) {
1168 1152 pcp = delcallb_list;
1169 1153 delcallb_list = pcp->p_hprev;
1170 1154 ASSERT(!pcp->p_active);
1171 1155 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1172 1156 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1173 1157 npages += btop(pcp->p_len);
1174 1158 if (!IS_PCP_WIRED(pcp)) {
1175 1159 npages_window += btop(pcp->p_len);
1176 1160 }
1177 1161 kmem_cache_free(seg_pkmcache, pcp);
1178 1162 }
1179 1163 if (npages) {
1180 1164 mutex_enter(&seg_pmem_mtx);
1181 1165 ASSERT(seg_plocked >= npages);
1182 1166 ASSERT(seg_plocked_window >= npages_window);
1183 1167 seg_plocked -= npages;
1184 1168 seg_plocked_window -= npages_window;
1185 1169 mutex_exit(&seg_pmem_mtx);
1186 1170 }
1187 1171 }
1188 1172
1189 1173 /*
1190 1174 * Remove cached pages for segment(s) entries from hashtable. The segments
1191 1175 * are identified by pp array. This is useful for multiple seg's cached on
1192 1176 * behalf of dummy segment (ISM/DISM) with common pp array.
1193 1177 */
1194 1178 void
1195 1179 seg_ppurge_wiredpp(struct page **pp)
1196 1180 {
1197 1181 struct seg_pcache *pcp;
1198 1182 struct seg_phash_wired *hp;
1199 1183 pgcnt_t npages = 0;
1200 1184 struct seg_pcache *delcallb_list = NULL;
1201 1185
1202 1186 /*
1203 1187 * if the cache is empty, return
1204 1188 */
1205 1189 if (seg_plocked == 0) {
1206 1190 return;
1207 1191 }
1208 1192 ASSERT(seg_phashsize_wired != 0);
1209 1193
1210 1194 for (hp = seg_phashtab_wired;
1211 1195 hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1212 1196 if (hp->p_hnext == (struct seg_pcache *)hp) {
1213 1197 continue;
1214 1198 }
1215 1199 mutex_enter(&hp->p_hmutex);
1216 1200 pcp = hp->p_hnext;
1217 1201 while (pcp != (struct seg_pcache *)hp) {
1218 1202 ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1219 1203 ASSERT(IS_PCP_WIRED(pcp));
1220 1204 /*
1221 1205 * purge entries which are not active
1222 1206 */
1223 1207 if (!pcp->p_active && pcp->p_pp == pp) {
1224 1208 ASSERT(pcp->p_htag0 != NULL);
1225 1209 pcp->p_hprev->p_hnext = pcp->p_hnext;
1226 1210 pcp->p_hnext->p_hprev = pcp->p_hprev;
1227 1211 pcp->p_hprev = delcallb_list;
1228 1212 delcallb_list = pcp;
1229 1213 }
1230 1214 pcp = pcp->p_hnext;
1231 1215 }
1232 1216 mutex_exit(&hp->p_hmutex);
1233 1217 /*
1234 1218 * segments can't go away until callback is executed since
1235 1219 * they must have non 0 softlockcnt. That's why we don't
1236 1220 * need to hold as/seg locks to execute the callback.
1237 1221 */
1238 1222 while (delcallb_list != NULL) {
1239 1223 int done;
1240 1224 pcp = delcallb_list;
1241 1225 delcallb_list = pcp->p_hprev;
1242 1226 ASSERT(!pcp->p_active);
1243 1227 done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1244 1228 pcp->p_len, pcp->p_pp,
1245 1229 pcp->p_write ? S_WRITE : S_READ, 1);
1246 1230 npages += btop(pcp->p_len);
1247 1231 ASSERT(IS_PCP_WIRED(pcp));
1248 1232 kmem_cache_free(seg_pkmcache, pcp);
1249 1233 if (done) {
1250 1234 ASSERT(delcallb_list == NULL);
1251 1235 goto out;
1252 1236 }
1253 1237 }
1254 1238 }
1255 1239
1256 1240 out:
1257 1241 mutex_enter(&seg_pmem_mtx);
1258 1242 ASSERT(seg_plocked >= npages);
1259 1243 seg_plocked -= npages;
1260 1244 mutex_exit(&seg_pmem_mtx);
1261 1245 }
1262 1246
1263 1247 /*
1264 1248 * purge all entries for a given segment. Since we
1265 1249 * callback into the segment driver directly for page
1266 1250 * reclaim the caller needs to hold the right locks.
1267 1251 */
1268 1252 void
1269 1253 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1270 1254 {
1271 1255 struct seg_pcache *delcallb_list = NULL;
1272 1256 struct seg_pcache *pcp;
1273 1257 struct seg_phash *hp;
1274 1258 pgcnt_t npages = 0;
1275 1259 void *htag0;
1276 1260
1277 1261 if (seg_plocked == 0) {
1278 1262 return;
1279 1263 }
1280 1264 ASSERT(seg_phashsize_win != 0);
1281 1265
1282 1266 /*
1283 1267 * If amp is not NULL use amp as a lookup tag otherwise use seg
1284 1268 * as a lookup tag.
1285 1269 */
1286 1270 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1287 1271 ASSERT(htag0 != NULL);
1288 1272 if (IS_PFLAGS_WIRED(flags)) {
1289 1273 hp = P_HASHBP(seg, htag0, 0, flags);
1290 1274 mutex_enter(&hp->p_hmutex);
1291 1275 pcp = hp->p_hnext;
1292 1276 while (pcp != (struct seg_pcache *)hp) {
1293 1277 ASSERT(pcp->p_hashp == hp);
1294 1278 ASSERT(IS_PCP_WIRED(pcp));
1295 1279 if (pcp->p_htag0 == htag0) {
1296 1280 if (pcp->p_active) {
1297 1281 break;
1298 1282 }
1299 1283 pcp->p_hprev->p_hnext = pcp->p_hnext;
1300 1284 pcp->p_hnext->p_hprev = pcp->p_hprev;
1301 1285 pcp->p_hprev = delcallb_list;
1302 1286 delcallb_list = pcp;
1303 1287 }
1304 1288 pcp = pcp->p_hnext;
1305 1289 }
1306 1290 mutex_exit(&hp->p_hmutex);
1307 1291 } else {
1308 1292 pcache_link_t *plinkp;
1309 1293 pcache_link_t *pheadp;
1310 1294 kmutex_t *pmtx;
1311 1295
1312 1296 if (amp == NULL) {
1313 1297 ASSERT(seg != NULL);
1314 1298 pheadp = &seg->s_phead;
1315 1299 pmtx = &seg->s_pmtx;
1316 1300 } else {
1317 1301 pheadp = &amp->a_phead;
1318 1302 pmtx = &amp->a_pmtx;
1319 1303 }
1320 1304 mutex_enter(pmtx);
1321 1305 while ((plinkp = pheadp->p_lnext) != pheadp) {
1322 1306 pcp = plink2pcache(plinkp);
1323 1307 ASSERT(!IS_PCP_WIRED(pcp));
1324 1308 ASSERT(pcp->p_htag0 == htag0);
1325 1309 hp = pcp->p_hashp;
1326 1310 mutex_enter(&hp->p_hmutex);
1327 1311 if (pcp->p_active) {
1328 1312 mutex_exit(&hp->p_hmutex);
1329 1313 break;
1330 1314 }
1331 1315 ASSERT(plinkp->p_lprev == pheadp);
1332 1316 pheadp->p_lnext = plinkp->p_lnext;
1333 1317 plinkp->p_lnext->p_lprev = pheadp;
1334 1318 pcp->p_hprev->p_hnext = pcp->p_hnext;
1335 1319 pcp->p_hnext->p_hprev = pcp->p_hprev;
1336 1320 pcp->p_hprev = delcallb_list;
1337 1321 delcallb_list = pcp;
1338 1322 if (hp->p_hnext == (struct seg_pcache *)hp) {
1339 1323 seg_premove_abuck(hp, 0);
1340 1324 }
1341 1325 mutex_exit(&hp->p_hmutex);
1342 1326 }
1343 1327 mutex_exit(pmtx);
1344 1328 }
1345 1329 while (delcallb_list != NULL) {
1346 1330 pcp = delcallb_list;
1347 1331 delcallb_list = pcp->p_hprev;
1348 1332 ASSERT(!pcp->p_active);
1349 1333 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1350 1334 pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1351 1335 npages += btop(pcp->p_len);
1352 1336 kmem_cache_free(seg_pkmcache, pcp);
1353 1337 }
1354 1338 mutex_enter(&seg_pmem_mtx);
1355 1339 ASSERT(seg_plocked >= npages);
1356 1340 seg_plocked -= npages;
1357 1341 if (!IS_PFLAGS_WIRED(flags)) {
1358 1342 ASSERT(seg_plocked_window >= npages);
1359 1343 seg_plocked_window -= npages;
1360 1344 }
1361 1345 mutex_exit(&seg_pmem_mtx);
1362 1346 }
1363 1347
1364 1348 static void seg_pinit_mem_config(void);
1365 1349
1366 1350 /*
1367 1351 * setup the pagelock cache
1368 1352 */
1369 1353 static void
1370 1354 seg_pinit(void)
1371 1355 {
1372 1356 struct seg_phash *hp;
1373 1357 ulong_t i;
1374 1358 pgcnt_t physmegs;
1375 1359
1376 1360 seg_plocked = 0;
1377 1361 seg_plocked_window = 0;
1378 1362
1379 1363 if (segpcache_enabled == 0) {
1380 1364 seg_phashsize_win = 0;
1381 1365 seg_phashsize_wired = 0;
1382 1366 seg_pdisabled = 1;
1383 1367 return;
1384 1368 }
1385 1369
1386 1370 seg_pdisabled = 0;
1387 1371 seg_pkmcache = kmem_cache_create("seg_pcache",
1388 1372 sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1389 1373 if (segpcache_pcp_maxage_ticks <= 0) {
1390 1374 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1391 1375 }
1392 1376 seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1393 1377 seg_pathr_empty_ahb = 0;
1394 1378 seg_pathr_full_ahb = 0;
1395 1379 seg_pshrink_shift = segpcache_shrink_shift;
1396 1380 seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1397 1381
1398 1382 mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1399 1383 mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 1384 mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1401 1385 cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1402 1386
1403 1387 physmegs = physmem >> (20 - PAGESHIFT);
1404 1388
1405 1389 /*
1406 1390 * If segpcache_hashsize_win was not set in /etc/system or it has
1407 1391 * absurd value set it to a default.
1408 1392 */
1409 1393 if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1410 1394 /*
1411 1395 * Create one bucket per 32K (or at least per 8 pages) of
1412 1396 * available memory.
1413 1397 */
1414 1398 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1415 1399 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1416 1400 }
1417 1401 if (!ISP2(segpcache_hashsize_win)) {
1418 1402 ulong_t rndfac = ~(1UL <<
1419 1403 (highbit(segpcache_hashsize_win) - 1));
1420 1404 rndfac &= segpcache_hashsize_win;
1421 1405 segpcache_hashsize_win += rndfac;
1422 1406 segpcache_hashsize_win = 1 <<
1423 1407 (highbit(segpcache_hashsize_win) - 1);
1424 1408 }
1425 1409 seg_phashsize_win = segpcache_hashsize_win;
1426 1410 seg_phashtab_win = kmem_zalloc(
1427 1411 seg_phashsize_win * sizeof (struct seg_phash),
1428 1412 KM_SLEEP);
1429 1413 for (i = 0; i < seg_phashsize_win; i++) {
1430 1414 hp = &seg_phashtab_win[i];
1431 1415 hp->p_hnext = (struct seg_pcache *)hp;
1432 1416 hp->p_hprev = (struct seg_pcache *)hp;
1433 1417 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1434 1418 }
1435 1419
1436 1420 seg_pahcur = 0;
1437 1421 seg_pathr_on = 0;
1438 1422 seg_pahhead[0].p_lnext = &seg_pahhead[0];
1439 1423 seg_pahhead[0].p_lprev = &seg_pahhead[0];
1440 1424 seg_pahhead[1].p_lnext = &seg_pahhead[1];
1441 1425 seg_pahhead[1].p_lprev = &seg_pahhead[1];
1442 1426
1443 1427 /*
1444 1428 * If segpcache_hashsize_wired was not set in /etc/system or it has
1445 1429 * absurd value set it to a default.
1446 1430 */
1447 1431 if (segpcache_hashsize_wired == 0 ||
1448 1432 segpcache_hashsize_wired > physmem / 4) {
1449 1433 /*
1450 1434 * Choose segpcache_hashsize_wired based on physmem.
1451 1435 * Create a bucket per 128K bytes up to 256K buckets.
1452 1436 */
1453 1437 if (physmegs < 20 * 1024) {
1454 1438 segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1455 1439 } else {
1456 1440 segpcache_hashsize_wired = 256 * 1024;
1457 1441 }
1458 1442 }
1459 1443 if (!ISP2(segpcache_hashsize_wired)) {
1460 1444 segpcache_hashsize_wired = 1 <<
1461 1445 highbit(segpcache_hashsize_wired);
1462 1446 }
1463 1447 seg_phashsize_wired = segpcache_hashsize_wired;
1464 1448 seg_phashtab_wired = kmem_zalloc(
1465 1449 seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1466 1450 for (i = 0; i < seg_phashsize_wired; i++) {
1467 1451 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1468 1452 hp->p_hnext = (struct seg_pcache *)hp;
1469 1453 hp->p_hprev = (struct seg_pcache *)hp;
1470 1454 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1471 1455 }
1472 1456
1473 - if (segpcache_maxwindow == 0) {
1474 - if (physmegs < 64) {
1475 - /* 3% of memory */
1476 - segpcache_maxwindow = availrmem >> 5;
1477 - } else if (physmegs < 512) {
1478 - /* 12% of memory */
1479 - segpcache_maxwindow = availrmem >> 3;
1480 - } else if (physmegs < 1024) {
1481 - /* 25% of memory */
1482 - segpcache_maxwindow = availrmem >> 2;
1483 - } else if (physmegs < 2048) {
1484 - /* 50% of memory */
1485 - segpcache_maxwindow = availrmem >> 1;
1486 - } else {
1487 - /* no limit */
1488 - segpcache_maxwindow = (pgcnt_t)-1;
1489 - }
1490 - }
1491 - seg_pmaxwindow = segpcache_maxwindow;
1492 1457 seg_pinit_mem_config();
1493 1458 }
1494 1459
1495 1460 /*
1496 1461 * called by pageout if memory is low
1497 1462 */
1498 1463 void
1499 1464 seg_preap(void)
1500 1465 {
1501 1466 /*
1502 1467 * if the cache is off or empty, return
1503 1468 */
1504 1469 if (seg_plocked_window == 0) {
1505 1470 return;
1506 1471 }
1507 1472 ASSERT(seg_phashsize_win != 0);
1508 1473
1509 1474 /*
1510 1475 * If somebody is already purging pcache
1511 1476 * just return.
1512 1477 */
1513 1478 if (seg_pdisabled) {
1514 1479 return;
1515 1480 }
1516 1481
1517 1482 cv_signal(&seg_pasync_cv);
1518 1483 }
1519 1484
1520 1485 /*
1521 1486 * run as a background thread and reclaim pagelock
1522 1487 * pages which have not been used recently
1523 1488 */
1524 1489 void
1525 1490 seg_pasync_thread(void)
1526 1491 {
1527 1492 callb_cpr_t cpr_info;
1528 1493
1529 1494 if (seg_phashsize_win == 0) {
1530 1495 thread_exit();
1531 1496 /*NOTREACHED*/
1532 1497 }
1533 1498
1534 1499 seg_pasync_thr = curthread;
1535 1500
1536 1501 CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1537 1502 callb_generic_cpr, "seg_pasync");
1538 1503
1539 1504 if (segpcache_reap_ticks <= 0) {
1540 1505 segpcache_reap_ticks = segpcache_reap_sec * hz;
1541 1506 }
1542 1507
1543 1508 mutex_enter(&seg_pasync_mtx);
1544 1509 for (;;) {
1545 1510 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1546 1511 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1547 1512 segpcache_reap_ticks, TR_CLOCK_TICK);
1548 1513 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1549 1514 if (seg_pdisabled == 0) {
1550 1515 seg_ppurge_async(0);
1551 1516 }
1552 1517 }
1553 1518 }
1554 1519
1555 1520 static struct kmem_cache *seg_cache;
1556 1521
1557 1522 /*
1558 1523 * Initialize segment management data structures.
1559 1524 */
1560 1525 void
1561 1526 seg_init(void)
1562 1527 {
1563 1528 kstat_t *ksp;
1564 1529
1565 1530 seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566 1531 0, NULL, NULL, NULL, NULL, NULL, 0);
1567 1532
1568 1533 ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1569 1534 segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1570 1535 if (ksp) {
1571 1536 ksp->ks_data = (void *)segadvstat_ptr;
1572 1537 kstat_install(ksp);
1573 1538 }
1574 1539
1575 1540 seg_pinit();
1576 1541 }
1577 1542
1578 1543 /*
1579 1544 * Allocate a segment to cover [base, base+size]
1580 1545 * and attach it to the specified address space.
1581 1546 */
1582 1547 struct seg *
1583 1548 seg_alloc(struct as *as, caddr_t base, size_t size)
1584 1549 {
1585 1550 struct seg *new;
1586 1551 caddr_t segbase;
1587 1552 size_t segsize;
1588 1553
1589 1554 segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1590 1555 segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1591 1556 (uintptr_t)segbase;
1592 1557
1593 1558 if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1594 1559 return ((struct seg *)NULL); /* bad virtual addr range */
1595 1560
1596 1561 if (as != &kas &&
1597 1562 valid_usr_range(segbase, segsize, 0, as,
1598 1563 as->a_userlimit) != RANGE_OKAY)
1599 1564 return ((struct seg *)NULL); /* bad virtual addr range */
1600 1565
1601 1566 new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1602 1567 new->s_ops = NULL;
1603 1568 new->s_data = NULL;
1604 1569 new->s_szc = 0;
1605 1570 new->s_flags = 0;
1606 1571 mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607 1572 new->s_phead.p_lnext = &new->s_phead;
1608 1573 new->s_phead.p_lprev = &new->s_phead;
1609 1574 if (seg_attach(as, segbase, segsize, new) < 0) {
1610 1575 kmem_cache_free(seg_cache, new);
1611 1576 return ((struct seg *)NULL);
1612 1577 }
1613 1578 /* caller must fill in ops, data */
1614 1579 return (new);
1615 1580 }
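As the comment above says, seg_alloc() hands back a bare segment and the caller must install the driver's ops vector and private data before the segment can be used. A minimal sketch of that hand-off, using hypothetical mydrv_segops/struct mydrv_data names that are not part of this change:

	struct seg *seg;

	/* seg_alloc() page-aligns the range and attaches it to 'as' */
	if ((seg = seg_alloc(as, addr, len)) == NULL)
		return (ENOMEM);	/* bad range or overlapping segment */

	seg->s_ops = &mydrv_segops;	/* hypothetical const ops vector */
	seg->s_data = kmem_zalloc(sizeof (struct mydrv_data), KM_SLEEP);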
1616 1581
1617 1582 /*
1618 1583 * Attach a segment to the address space. Used by seg_alloc()
1619 1584 * and for kernel startup to attach to static segments.
1620 1585 */
1621 1586 int
1622 1587 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1623 1588 {
1624 1589 seg->s_as = as;
1625 1590 seg->s_base = base;
1626 1591 seg->s_size = size;
1627 1592
1628 1593 /*
1629 1594 	 * as_addseg() will add the segment at the appropriate point
1630 1595 * in the list. It will return -1 if there is overlap with
1631 1596 * an already existing segment.
1632 1597 */
1633 1598 return (as_addseg(as, seg));
1634 1599 }
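For the kernel-startup case mentioned above, attachment is done directly against &kas while the address-space lock is held as writer. Roughly the pattern, simplified from the platform startup code (cf. kvseg and segkmem_create(); error handling omitted):

	AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER);
	(void) seg_attach(&kas, base, size, &kvseg);
	(void) segkmem_create(&kvseg);
	AS_LOCK_EXIT(&kas, &kas.a_lock);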
1635 1600
1636 1601 /*
1637 1602 * Unmap a segment and free it from its associated address space.
1638 1603 * This should be called by anybody who's finished with a whole segment's
1639 - * mapping. Just calls SEGOP_UNMAP() on the whole mapping . It is the
1604 + * mapping. Just calls segop_unmap() on the whole mapping. It is the
1640 1605 	 * responsibility of the segment driver to unlink the segment
1641 1606 * from the address space, and to free public and private data structures
1642 1607 * associated with the segment. (This is typically done by a call to
1643 1608 * seg_free()).
1644 1609 */
1645 1610 void
1646 1611 seg_unmap(struct seg *seg)
1647 1612 {
1648 1613 #ifdef DEBUG
1649 1614 int ret;
1650 1615 #endif /* DEBUG */
1651 1616
1652 1617 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1653 1618
1654 1619 /* Shouldn't have called seg_unmap if mapping isn't yet established */
1655 1620 ASSERT(seg->s_data != NULL);
1656 1621
1657 1622 /* Unmap the whole mapping */
1658 1623 #ifdef DEBUG
1659 - ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1624 + ret = segop_unmap(seg, seg->s_base, seg->s_size);
1660 1625 ASSERT(ret == 0);
1661 1626 #else
1662 - SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1627 + segop_unmap(seg, seg->s_base, seg->s_size);
1663 1628 #endif /* DEBUG */
1664 1629 }
1665 1630
1666 1631 /*
1667 1632 * Free the segment from its associated as. This should only be called
1668 1633 * if a mapping to the segment has not yet been established (e.g., if
1669 1634 * an error occurs in the middle of doing an as_map when the segment
1670 1635 * has already been partially set up) or if it has already been deleted
1671 1636 * (e.g., from a segment driver unmap routine if the unmap applies to the
1672 1637 * entire segment). If the mapping is currently set up then seg_unmap() should
1673 1638 * be called instead.
1674 1639 */
1675 1640 void
1676 1641 seg_free(struct seg *seg)
1677 1642 {
1678 1643 register struct as *as = seg->s_as;
1679 1644 struct seg *tseg = as_removeseg(as, seg);
1680 1645
1681 1646 ASSERT(tseg == seg);
1682 1647
1683 1648 /*
1684 1649 * If the segment private data field is NULL,
1685 1650 * then segment driver is not attached yet.
1686 1651 */
1687 1652 if (seg->s_data != NULL)
1688 - SEGOP_FREE(seg);
1653 + segop_free(seg);
1689 1654
1690 1655 mutex_destroy(&seg->s_pmtx);
1691 1656 ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692 1657 ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1693 1658 kmem_cache_free(seg_cache, seg);
1694 1659 }
1695 1660
1696 1661 /*ARGSUSED*/
1697 1662 static void
1698 1663 seg_p_mem_config_post_add(
1699 1664 void *arg,
1700 1665 pgcnt_t delta_pages)
1701 1666 {
1702 1667 /* Nothing to do. */
1703 1668 }
1704 1669
1705 1670 void
1706 1671 seg_p_enable(void)
1707 1672 {
1708 1673 mutex_enter(&seg_pcache_mtx);
1709 1674 ASSERT(seg_pdisabled != 0);
1710 1675 seg_pdisabled--;
1711 1676 mutex_exit(&seg_pcache_mtx);
1712 1677 }
1713 1678
1714 1679 /*
1715 1680 * seg_p_disable - disables seg_pcache, and then attempts to empty the
1716 1681 * cache.
1717 1682 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1718 1683 * SEGP_FAIL if the cache could not be emptied.
1719 1684 */
1720 1685 int
1721 1686 seg_p_disable(void)
1722 1687 {
1723 1688 pgcnt_t old_plocked;
1724 1689 int stall_count = 0;
1725 1690
1726 1691 mutex_enter(&seg_pcache_mtx);
1727 1692 seg_pdisabled++;
1728 1693 ASSERT(seg_pdisabled != 0);
1729 1694 mutex_exit(&seg_pcache_mtx);
1730 1695
1731 1696 /*
1732 1697 * Attempt to empty the cache. Terminate if seg_plocked does not
1733 1698 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1734 1699 */
1735 1700 while (seg_plocked != 0) {
1736 1701 ASSERT(seg_phashsize_win != 0);
1737 1702 old_plocked = seg_plocked;
1738 1703 seg_ppurge_async(1);
1739 1704 if (seg_plocked == old_plocked) {
1740 1705 if (stall_count++ > SEGP_STALL_THRESHOLD) {
1741 1706 return (SEGP_FAIL);
1742 1707 }
1743 1708 } else
1744 1709 stall_count = 0;
1745 1710 if (seg_plocked != 0)
1746 1711 delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1747 1712 }
1748 1713 return (SEGP_SUCCESS);
1749 1714 }
1750 1715
1751 1716 /*
1752 1717 * Attempt to purge seg_pcache. May need to return before this has
1753 1718 * completed to allow other pre_del callbacks to unlock pages. This is
1754 1719 * ok because:
1755 1720 * 1) The seg_pdisabled flag has been set so at least we won't
1756 1721 	 *	cache any more locks and the locks we couldn't purge
1757 1722 * will not be held if they do get released by a subsequent
1758 1723 * pre-delete callback.
1759 1724 *
1760 1725 * 2) The rest of the memory delete thread processing does not
1761 1726 * depend on the changes made in this pre-delete callback. No
1762 1727 * panics will result, the worst that will happen is that the
1763 1728 * DR code will timeout and cancel the delete.
1764 1729 */
1765 1730 /*ARGSUSED*/
1766 1731 static int
1767 1732 seg_p_mem_config_pre_del(
1768 1733 void *arg,
1769 1734 pgcnt_t delta_pages)
1770 1735 {
1771 1736 if (seg_phashsize_win == 0) {
1772 1737 return (0);
1773 1738 }
1774 1739 if (seg_p_disable() != SEGP_SUCCESS)
1775 1740 cmn_err(CE_NOTE,
1776 1741 		    "!Pre-delete couldn't purge pagelock cache - continuing");
1777 1742 return (0);
1778 1743 }
1779 1744
1780 1745 /*ARGSUSED*/
1781 1746 static void
1782 1747 seg_p_mem_config_post_del(
1783 1748 void *arg,
1784 1749 pgcnt_t delta_pages,
1785 1750 int cancelled)
1786 1751 {
1787 1752 if (seg_phashsize_win == 0) {
1788 1753 return;
1789 1754 }
1790 1755 seg_p_enable();
1791 1756 }
1792 1757
1793 1758 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1794 1759 KPHYSM_SETUP_VECTOR_VERSION,
1795 1760 seg_p_mem_config_post_add,
1796 1761 seg_p_mem_config_pre_del,
1797 1762 seg_p_mem_config_post_del,
1798 1763 };
1799 1764
1800 1765 static void
1801 1766 seg_pinit_mem_config(void)
1802 1767 {
1803 1768 int ret;
1804 1769
1805 1770 ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1806 1771 /*
1807 1772 * Want to catch this in the debug kernel. At run time, if the
1808 1773 * callbacks don't get run all will be OK as the disable just makes
1809 1774 * it more likely that the pages can be collected.
1810 1775 */
1811 1776 ASSERT(ret == 0);
1812 1777 }
1813 1778
1814 1779 /*
1815 1780 * Verify that segment is not a shared anonymous segment which reserves
1816 1781  * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
1817 1782 * from one zone to another if any segments are shared. This is because the
1818 1783 * last process to exit will credit the swap reservation. This could lead
1819 1784 * to the swap being reserved by one zone, and credited to another.
1820 1785 */
1821 1786 boolean_t
1822 1787 seg_can_change_zones(struct seg *seg)
1823 1788 {
1824 1789 struct segvn_data *svd;
1825 1790
1826 1791 if (seg->s_ops == &segspt_shmops)
1827 1792 return (B_FALSE);
1828 1793
1829 1794 if (seg->s_ops == &segvn_ops) {
1830 1795 svd = (struct segvn_data *)seg->s_data;
1831 1796 if (svd->type == MAP_SHARED &&
1832 1797 svd->amp != NULL &&
1833 1798 svd->amp->swresv > 0)
1834 1799 return (B_FALSE);
1835 1800 }
1836 1801 return (B_TRUE);
1837 1802 }
1838 1803
1839 1804 /*
1840 1805 * Return swap reserved by a segment backing a private mapping.
1841 1806 */
1842 1807 size_t
1843 1808 seg_swresv(struct seg *seg)
1844 1809 {
1845 1810 struct segvn_data *svd;
1846 1811 size_t swap = 0;
1847 1812
1848 1813 if (seg->s_ops == &segvn_ops) {
1849 1814 svd = (struct segvn_data *)seg->s_data;
1850 1815 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1851 1816 swap = svd->swresv;
1852 1817 }
1853 1818 return (swap);
1854 1819 }
1855 1820
1856 1821 /*
1857 - * General not supported function for SEGOP_INHERIT
1822 + * segop wrappers
1858 1823 */
1859 -/* ARGSUSED */
1860 1824 int
1861 -seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1825 +segop_dup(struct seg *seg, struct seg *new)
1826 +{
1827 + VERIFY3P(seg->s_ops->dup, !=, NULL);
1828 +
1829 + return (seg->s_ops->dup(seg, new));
1830 +}
1831 +
1832 +int
1833 +segop_unmap(struct seg *seg, caddr_t addr, size_t len)
1834 +{
1835 + VERIFY3P(seg->s_ops->unmap, !=, NULL);
1836 +
1837 + return (seg->s_ops->unmap(seg, addr, len));
1838 +}
1839 +
1840 +void
1841 +segop_free(struct seg *seg)
1842 +{
1843 + VERIFY3P(seg->s_ops->free, !=, NULL);
1844 +
1845 + seg->s_ops->free(seg);
1846 +}
1847 +
1848 +faultcode_t
1849 +segop_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
1850 + enum fault_type type, enum seg_rw rw)
1851 +{
1852 + VERIFY3P(seg->s_ops->fault, !=, NULL);
1853 +
1854 + return (seg->s_ops->fault(hat, seg, addr, len, type, rw));
1855 +}
1856 +
1857 +faultcode_t
1858 +segop_faulta(struct seg *seg, caddr_t addr)
1859 +{
1860 + VERIFY3P(seg->s_ops->faulta, !=, NULL);
1861 +
1862 + return (seg->s_ops->faulta(seg, addr));
1863 +}
1864 +
1865 +int
1866 +segop_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1867 +{
1868 + VERIFY3P(seg->s_ops->setprot, !=, NULL);
1869 +
1870 + return (seg->s_ops->setprot(seg, addr, len, prot));
1871 +}
1872 +
1873 +int
1874 +segop_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1875 +{
1876 + VERIFY3P(seg->s_ops->checkprot, !=, NULL);
1877 +
1878 + return (seg->s_ops->checkprot(seg, addr, len, prot));
1879 +}
1880 +
1881 +int
1882 +segop_kluster(struct seg *seg, caddr_t addr, ssize_t d)
1883 +{
1884 + VERIFY3P(seg->s_ops->kluster, !=, NULL);
1885 +
1886 + return (seg->s_ops->kluster(seg, addr, d));
1887 +}
1888 +
1889 +int
1890 +segop_sync(struct seg *seg, caddr_t addr, size_t len, int atr, uint_t f)
1891 +{
1892 + VERIFY3P(seg->s_ops->sync, !=, NULL);
1893 +
1894 + return (seg->s_ops->sync(seg, addr, len, atr, f));
1895 +}
1896 +
1897 +size_t
1898 +segop_incore(struct seg *seg, caddr_t addr, size_t len, char *v)
1862 1899 {
1863 - return (ENOTSUP);
1900 + VERIFY3P(seg->s_ops->incore, !=, NULL);
1901 +
1902 + return (seg->s_ops->incore(seg, addr, len, v));
1903 +}
1904 +
1905 +int
1906 +segop_lockop(struct seg *seg, caddr_t addr, size_t len, int atr, int op,
1907 + ulong_t *b, size_t p)
1908 +{
1909 + VERIFY3P(seg->s_ops->lockop, !=, NULL);
1910 +
1911 + return (seg->s_ops->lockop(seg, addr, len, atr, op, b, p));
1912 +}
1913 +
1914 +int
1915 +segop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *p)
1916 +{
1917 + VERIFY3P(seg->s_ops->getprot, !=, NULL);
1918 +
1919 + return (seg->s_ops->getprot(seg, addr, len, p));
1920 +}
1921 +
1922 +u_offset_t
1923 +segop_getoffset(struct seg *seg, caddr_t addr)
1924 +{
1925 + VERIFY3P(seg->s_ops->getoffset, !=, NULL);
1926 +
1927 + return (seg->s_ops->getoffset(seg, addr));
1928 +}
1929 +
1930 +int
1931 +segop_gettype(struct seg *seg, caddr_t addr)
1932 +{
1933 + VERIFY3P(seg->s_ops->gettype, !=, NULL);
1934 +
1935 + return (seg->s_ops->gettype(seg, addr));
1936 +}
1937 +
1938 +int
1939 +segop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
1940 +{
1941 + VERIFY3P(seg->s_ops->getvp, !=, NULL);
1942 +
1943 + return (seg->s_ops->getvp(seg, addr, vpp));
1944 +}
1945 +
1946 +int
1947 +segop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t b)
1948 +{
1949 + VERIFY3P(seg->s_ops->advise, !=, NULL);
1950 +
1951 + return (seg->s_ops->advise(seg, addr, len, b));
1952 +}
1953 +
1954 +void
1955 +segop_dump(struct seg *seg)
1956 +{
1957 + if (seg->s_ops->dump == NULL)
1958 + return;
1959 +
1960 + seg->s_ops->dump(seg);
1961 +}
1962 +
1963 +int
1964 +segop_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***page,
1965 + enum lock_type type, enum seg_rw rw)
1966 +{
1967 + VERIFY3P(seg->s_ops->pagelock, !=, NULL);
1968 +
1969 + return (seg->s_ops->pagelock(seg, addr, len, page, type, rw));
1970 +}
1971 +
1972 +int
1973 +segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
1974 +{
1975 + if (seg->s_ops->setpagesize == NULL)
1976 + return (ENOTSUP);
1977 +
1978 + return (seg->s_ops->setpagesize(seg, addr, len, szc));
1979 +}
1980 +
1981 +int
1982 +segop_getmemid(struct seg *seg, caddr_t addr, memid_t *mp)
1983 +{
1984 + if (seg->s_ops->getmemid == NULL)
1985 + return (ENODEV);
1986 +
1987 + return (seg->s_ops->getmemid(seg, addr, mp));
1988 +}
1989 +
1990 +struct lgrp_mem_policy_info *
1991 +segop_getpolicy(struct seg *seg, caddr_t addr)
1992 +{
1993 + if (seg->s_ops->getpolicy == NULL)
1994 + return (NULL);
1995 +
1996 + return (seg->s_ops->getpolicy(seg, addr));
1997 +}
1998 +
1999 +int
2000 +segop_capable(struct seg *seg, segcapability_t cap)
2001 +{
2002 + if (seg->s_ops->capable == NULL)
2003 + return (0);
2004 +
2005 + return (seg->s_ops->capable(seg, cap));
2006 +}
2007 +
2008 +int
2009 +segop_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t op)
2010 +{
2011 + if (seg->s_ops->inherit == NULL)
2012 + return (ENOTSUP);
2013 +
2014 + return (seg->s_ops->inherit(seg, addr, len, op));
1864 2015 }
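With these wrappers in place, a driver only needs to supply the ops its segments actually implement: a NULL dump is a no-op, a NULL setpagesize returns ENOTSUP, a NULL getmemid returns ENODEV, a NULL getpolicy returns NULL, a NULL capable reports no capabilities, and a NULL inherit returns ENOTSUP. A hypothetical minimal ops vector, const-qualified to match the const-ification elsewhere in this series:

	/*
	 * Hypothetical example, not part of this change.  Every op the
	 * wrappers VERIFY as non-NULL (dup, unmap, free, fault, ...)
	 * must still be supplied; the optional ones may be left out.
	 */
	static const struct seg_ops mydrv_segops = {
		.dup		= mydrv_dup,
		.unmap		= mydrv_unmap,
		.free		= mydrv_free,
		.fault		= mydrv_fault,
		/* remaining mandatory entries elided for brevity */
	};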