patch lower-case-segops
--- old/usr/src/uts/common/vm/vm_seg.c
+++ new/usr/src/uts/common/vm/vm_seg.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright (c) 2015, Joyent, Inc.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 32 * The Regents of the University of California
33 33 * All Rights Reserved
34 34 *
35 35 * University Acknowledgment- Portions of this document are derived from
36 36 * software developed by the University of California, Berkeley, and its
37 37 * contributors.
38 38 */
39 39
40 40 /*
41 41 * VM - segment management.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/inttypes.h>
46 46 #include <sys/t_lock.h>
47 47 #include <sys/param.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/kmem.h>
50 50 #include <sys/sysmacros.h>
51 51 #include <sys/vmsystm.h>
52 52 #include <sys/tuneable.h>
53 53 #include <sys/debug.h>
54 54 #include <sys/fs/swapnode.h>
55 55 #include <sys/cmn_err.h>
56 56 #include <sys/callb.h>
57 57 #include <sys/mem_config.h>
58 58 #include <sys/mman.h>
59 59
60 60 #include <vm/hat.h>
61 61 #include <vm/as.h>
62 62 #include <vm/seg.h>
63 63 #include <vm/seg_kmem.h>
64 64 #include <vm/seg_spt.h>
65 65 #include <vm/seg_vn.h>
66 66 #include <vm/anon.h>
67 67
68 68 /*
69 69 * kstats for segment advise
70 70 */
71 71 segadvstat_t segadvstat = {
72 72 { "MADV_FREE_hit", KSTAT_DATA_ULONG },
73 73 { "MADV_FREE_miss", KSTAT_DATA_ULONG },
74 74 };
75 75
76 76 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
77 77 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
78 78
79 79 /*
80 80 * entry in the segment page cache
81 81 */
82 82 struct seg_pcache {
83 83 struct seg_pcache *p_hnext; /* list for hashed blocks */
84 84 struct seg_pcache *p_hprev;
85 85 pcache_link_t p_plink; /* per segment/amp list */
86 86 void *p_htag0; /* segment/amp pointer */
87 87 caddr_t p_addr; /* base address/anon_idx */
88 88 size_t p_len; /* total bytes */
 89 89 size_t p_wlen; /* writable bytes at p_addr */
90 90 struct page **p_pp; /* pp shadow list */
91 91 seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
92 92 clock_t p_lbolt; /* lbolt from last use */
93 93 struct seg_phash *p_hashp; /* our pcache hash bucket */
94 94 uint_t p_active; /* active count */
95 95 uchar_t p_write; /* true if S_WRITE */
96 96 uchar_t p_ref; /* reference byte */
97 97 ushort_t p_flags; /* bit flags */
98 98 };
99 99
100 100 struct seg_phash {
101 101 struct seg_pcache *p_hnext; /* list for hashed blocks */
102 102 struct seg_pcache *p_hprev;
103 103 kmutex_t p_hmutex; /* protects hash bucket */
104 104 pcache_link_t p_halink[2]; /* active bucket linkages */
105 105 };
106 106
107 107 struct seg_phash_wired {
108 108 struct seg_pcache *p_hnext; /* list for hashed blocks */
109 109 struct seg_pcache *p_hprev;
110 110 kmutex_t p_hmutex; /* protects hash bucket */
111 111 };
112 112
113 113 /*
114 114 * A parameter to control a maximum number of bytes that can be
115 115 * purged from pcache at a time.
116 116 */
117 117 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
118 118
119 119 /*
120 120 * log2(fraction of pcache to reclaim at a time).
121 121 */
122 122 #define P_SHRINK_SHFT (5)
123 123
124 124 /*
125 125 * The following variables can be tuned via /etc/system.
126 126 */
127 127
128 128 int segpcache_enabled = 1; /* if 1, shadow lists are cached */
129 129 pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */
130 130 ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */
131 131 ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
132 132 int segpcache_reap_sec = 1; /* reap check rate in secs */
133 133 clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */
134 134 int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */
135 135 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
136 136 int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
137 137 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
138 138
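/*
 * Illustrative note, not part of this patch: the tunables above are plain
 * kernel globals, so they can be overridden from /etc/system before the
 * cache is sized in seg_pinit().  A hypothetical example:
 *
 *	set segpcache_enabled = 1
 *	set segpcache_maxwindow = 0x20000	(cap the window at 128K pages)
 *	set segpcache_reap_sec = 5		(reap check every 5 seconds)
 *
 * A value of 0 for the sizing tunables means "let seg_pinit() pick a
 * default based on physmem/availrmem", as the code further below shows.
 */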
139 139 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
140 140 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
141 141 static kcondvar_t seg_pasync_cv;
142 142
143 143 #pragma align 64(pctrl1)
144 144 #pragma align 64(pctrl2)
145 145 #pragma align 64(pctrl3)
146 146
147 147 /*
148 148 * Keep frequently used variables together in one cache line.
149 149 */
150 150 static struct p_ctrl1 {
151 151 uint_t p_disabled; /* if not 0, caching temporarily off */
152 152 pgcnt_t p_maxwin; /* max # of pages that can be cached */
153 153 size_t p_hashwin_sz; /* # of non wired buckets */
154 154 struct seg_phash *p_htabwin; /* hash table for non wired entries */
155 155 size_t p_hashwired_sz; /* # of wired buckets */
156 156 struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
157 157 kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */
158 158 #ifdef _LP64
159 159 ulong_t pad[1];
160 160 #endif /* _LP64 */
161 161 } pctrl1;
162 162
163 163 static struct p_ctrl2 {
164 164 kmutex_t p_mem_mtx; /* protects window counter and p_halinks */
165 165 pgcnt_t p_locked_win; /* # pages from window */
166 166 pgcnt_t p_locked; /* # of pages cached by pagelock */
167 167 uchar_t p_ahcur; /* current active links for insert/delete */
168 168 uchar_t p_athr_on; /* async reclaim thread is running. */
169 169 pcache_link_t p_ahhead[2]; /* active buckets linkages */
170 170 } pctrl2;
171 171
172 172 static struct p_ctrl3 {
173 173 clock_t p_pcp_maxage; /* max pcp age in ticks */
174 174 ulong_t p_athr_empty_ahb; /* athread walk stats */
175 175 ulong_t p_athr_full_ahb; /* athread walk stats */
176 176 pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */
177 177 int p_shrink_shft; /* reap shift factor */
178 178 #ifdef _LP64
179 179 ulong_t pad[3];
180 180 #endif /* _LP64 */
181 181 } pctrl3;
182 182
183 183 #define seg_pdisabled pctrl1.p_disabled
184 184 #define seg_pmaxwindow pctrl1.p_maxwin
185 185 #define seg_phashsize_win pctrl1.p_hashwin_sz
186 186 #define seg_phashtab_win pctrl1.p_htabwin
187 187 #define seg_phashsize_wired pctrl1.p_hashwired_sz
188 188 #define seg_phashtab_wired pctrl1.p_htabwired
189 189 #define seg_pkmcache pctrl1.p_kmcache
190 190 #define seg_pmem_mtx pctrl2.p_mem_mtx
191 191 #define seg_plocked_window pctrl2.p_locked_win
192 192 #define seg_plocked pctrl2.p_locked
193 193 #define seg_pahcur pctrl2.p_ahcur
194 194 #define seg_pathr_on pctrl2.p_athr_on
195 195 #define seg_pahhead pctrl2.p_ahhead
196 196 #define seg_pmax_pcpage pctrl3.p_pcp_maxage
197 197 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
198 198 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
199 199 #define seg_pshrink_shift pctrl3.p_shrink_shft
200 200 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
201 201
202 202 #define P_HASHWIN_MASK (seg_phashsize_win - 1)
203 203 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
204 204 #define P_BASESHIFT (6)
205 205
206 206 kthread_t *seg_pasync_thr;
207 207
208 208 extern struct seg_ops segvn_ops;
209 209 extern struct seg_ops segspt_shmops;
210 210
211 211 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
212 212 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
213 213
214 214 #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t)))
215 215
216 216 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
217 217
218 218 /*
219 219 * htag0 argument can be a seg or amp pointer.
220 220 */
221 221 #define P_HASHBP(seg, htag0, addr, flags) \
222 222 (IS_PFLAGS_WIRED((flags)) ? \
223 223 ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
224 224 ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
225 225 (&seg_phashtab_win[P_HASHWIN_MASK & \
226 226 (((uintptr_t)(htag0) >> 3) ^ \
227 227 ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \
228 228 (flags >> 16) : page_get_shift((seg)->s_szc))))]))
229 229
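/*
 * Illustrative sketch, not part of this patch: a stand-alone model of the
 * P_HASHBP() index arithmetic above, with hypothetical bucket counts
 * (seg_pinit() guarantees the real table sizes are powers of two).  Wired
 * entries hash on the seg/amp pointer alone; non-wired entries also fold
 * in the shadow list base address, scaled by the segment's page shift, so
 * entries from one segment spread across buckets.
 */
#define	EX_HASHSIZE_WIN		1024UL	/* hypothetical seg_phashsize_win */
#define	EX_HASHSIZE_WIRED	256UL	/* hypothetical seg_phashsize_wired */
#define	EX_BASESHIFT		6	/* mirrors P_BASESHIFT */

static unsigned long
ex_bucket_win(uintptr_t htag0, uintptr_t addr, unsigned int pgshift)
{
	/* non-wired arm of P_HASHBP(): (htag0 >> 3) ^ (addr >> pgshift) */
	return (((htag0 >> 3) ^ (addr >> pgshift)) & (EX_HASHSIZE_WIN - 1));
}

static unsigned long
ex_bucket_wired(uintptr_t htag0)
{
	/* wired arm of P_HASHBP(): htag0 >> P_BASESHIFT */
	return ((htag0 >> EX_BASESHIFT) & (EX_HASHSIZE_WIRED - 1));
}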
230 230 /*
231 231 * htag0 argument can be a seg or amp pointer.
232 232 */
233 233 #define P_MATCH(pcp, htag0, addr, len) \
234 234 ((pcp)->p_htag0 == (htag0) && \
235 235 (pcp)->p_addr == (addr) && \
236 236 (pcp)->p_len >= (len))
237 237
238 238 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \
239 239 ((pcp)->p_pp == (pp) && \
240 240 (pcp)->p_htag0 == (htag0) && \
241 241 (pcp)->p_addr == (addr) && \
242 242 (pcp)->p_len >= (len))
243 243
244 244 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
245 245 offsetof(struct seg_pcache, p_plink)))
246 246
247 247 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
248 248 offsetof(struct seg_phash, p_halink[l])))
249 249
250 250 /*
251 251 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
252 252 * active hash bucket lists. We maintain active bucket lists to reduce the
253 253 * overhead of finding active buckets during asynchronous purging since there
254 254 * can be 10s of millions of buckets on a large system but only a small subset
255 255 * of them in actual use.
256 256 *
257 257 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
258 258 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
259 259 * buckets. The other list is used by asynchronous purge thread. This allows
260 260 * the purge thread to walk its active list without holding seg_pmem_mtx for a
261 261 * long time. When asynchronous thread is done with its list it switches to
262 262 * current active list and makes the list it just finished processing as
263 263 * current active list.
264 264 *
265 265 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
266 266 * yet on any list. seg_premove_abuck() may remove the bucket from either
267 267 * list. If the bucket is on current list it will be always removed. Otherwise
268 268 * the bucket is only removed if asynchronous purge thread is not currently
269 269 * running or seg_premove_abuck() is called by asynchronous purge thread
270 270 * itself. A given bucket can only be on one of active lists at a time. These
271 271 * routines should be called with per bucket lock held. The routines use
272 272 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
273 273 * the first entry is added to the bucket chain and seg_premove_abuck() must
274 274 * be called after the last pcp entry is deleted from its chain. Per bucket
275 275 * lock should be held by the callers. This avoids a potential race condition
276 276 * when seg_premove_abuck() removes a bucket after pcp entries are added to
277 277 * its list after the caller checked that the bucket has no entries. (this
278 278 * race would cause a loss of an active bucket from the active lists).
279 279 *
280 280 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
281 281 * New entries are added to the end of the list since LRU is used as the
282 282 * purging policy.
283 283 */
284 284 static void
285 285 seg_padd_abuck(struct seg_phash *hp)
286 286 {
287 287 int lix;
288 288
289 289 ASSERT(MUTEX_HELD(&hp->p_hmutex));
290 290 ASSERT((struct seg_phash *)hp->p_hnext != hp);
291 291 ASSERT((struct seg_phash *)hp->p_hprev != hp);
292 292 ASSERT(hp->p_hnext == hp->p_hprev);
293 293 ASSERT(!IS_PCP_WIRED(hp->p_hnext));
294 294 ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
295 295 ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
296 296 ASSERT(hp >= seg_phashtab_win &&
297 297 hp < &seg_phashtab_win[seg_phashsize_win]);
298 298
299 299 /*
300 300 * This bucket can already be on one of active lists
301 301 * since seg_premove_abuck() may have failed to remove it
302 302 * before.
303 303 */
304 304 mutex_enter(&seg_pmem_mtx);
305 305 lix = seg_pahcur;
306 306 ASSERT(lix >= 0 && lix <= 1);
307 307 if (hp->p_halink[lix].p_lnext != NULL) {
308 308 ASSERT(hp->p_halink[lix].p_lprev != NULL);
309 309 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
310 310 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
311 311 mutex_exit(&seg_pmem_mtx);
312 312 return;
313 313 }
314 314 ASSERT(hp->p_halink[lix].p_lprev == NULL);
315 315
316 316 /*
317 317 * If this bucket is still on list !lix async thread can't yet remove
318 318 * it since we hold here per bucket lock. In this case just return
319 319 * since async thread will eventually find and process this bucket.
320 320 */
321 321 if (hp->p_halink[!lix].p_lnext != NULL) {
322 322 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
323 323 mutex_exit(&seg_pmem_mtx);
324 324 return;
325 325 }
326 326 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
327 327 /*
328 328 * This bucket is not on any active bucket list yet.
329 329 * Add the bucket to the tail of current active list.
330 330 */
331 331 hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
332 332 hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
333 333 seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
334 334 seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
335 335 mutex_exit(&seg_pmem_mtx);
336 336 }
337 337
338 338 static void
339 339 seg_premove_abuck(struct seg_phash *hp, int athr)
340 340 {
341 341 int lix;
342 342
343 343 ASSERT(MUTEX_HELD(&hp->p_hmutex));
344 344 ASSERT((struct seg_phash *)hp->p_hnext == hp);
345 345 ASSERT((struct seg_phash *)hp->p_hprev == hp);
346 346 ASSERT(hp >= seg_phashtab_win &&
347 347 hp < &seg_phashtab_win[seg_phashsize_win]);
348 348
349 349 if (athr) {
350 350 ASSERT(seg_pathr_on);
351 351 ASSERT(seg_pahcur <= 1);
352 352 /*
353 353 * We are called by asynchronous thread that found this bucket
354 354 * on not currently active (i.e. !seg_pahcur) list. Remove it
355 355 * from there. Per bucket lock we are holding makes sure
356 356 * seg_pinsert() can't sneak in and add pcp entries to this
357 357 * bucket right before we remove the bucket from its list.
358 358 */
359 359 lix = !seg_pahcur;
360 360 ASSERT(hp->p_halink[lix].p_lnext != NULL);
361 361 ASSERT(hp->p_halink[lix].p_lprev != NULL);
362 362 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
363 363 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
364 364 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
365 365 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
366 366 hp->p_halink[lix].p_lnext = NULL;
367 367 hp->p_halink[lix].p_lprev = NULL;
368 368 return;
369 369 }
370 370
371 371 mutex_enter(&seg_pmem_mtx);
372 372 lix = seg_pahcur;
373 373 ASSERT(lix >= 0 && lix <= 1);
374 374
375 375 /*
376 376 * If the bucket is on currently active list just remove it from
377 377 * there.
378 378 */
379 379 if (hp->p_halink[lix].p_lnext != NULL) {
380 380 ASSERT(hp->p_halink[lix].p_lprev != NULL);
381 381 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
382 382 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
383 383 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
384 384 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
385 385 hp->p_halink[lix].p_lnext = NULL;
386 386 hp->p_halink[lix].p_lprev = NULL;
387 387 mutex_exit(&seg_pmem_mtx);
388 388 return;
389 389 }
390 390 ASSERT(hp->p_halink[lix].p_lprev == NULL);
391 391
392 392 /*
393 393 * If asynchronous thread is not running we can remove the bucket from
394 394 * not currently active list. The bucket must be on this list since we
395 395 * already checked that it's not on the other list and the bucket from
396 396 * which we just deleted the last pcp entry must be still on one of the
397 397 * active bucket lists.
398 398 */
399 399 lix = !lix;
400 400 ASSERT(hp->p_halink[lix].p_lnext != NULL);
401 401 ASSERT(hp->p_halink[lix].p_lprev != NULL);
402 402
403 403 if (!seg_pathr_on) {
404 404 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
405 405 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
406 406 hp->p_halink[lix].p_lnext = NULL;
407 407 hp->p_halink[lix].p_lprev = NULL;
408 408 }
409 409 mutex_exit(&seg_pmem_mtx);
410 410 }
411 411
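/*
 * Illustrative sketch, not part of this patch: the p_halink manipulation
 * in seg_padd_abuck()/seg_premove_abuck() above is ordinary circular
 * doubly linked list insertion/removal around a dummy head, shown here in
 * isolation with a simplified link type.  NULL links mean "not on a list".
 */
struct ex_link {
	struct ex_link *p_lnext;
	struct ex_link *p_lprev;
};

static void
ex_insert_tail(struct ex_link *head, struct ex_link *lp)
{
	lp->p_lnext = head;		/* new tail points back at the head */
	lp->p_lprev = head->p_lprev;
	head->p_lprev->p_lnext = lp;
	head->p_lprev = lp;
}

static void
ex_remove(struct ex_link *lp)
{
	lp->p_lnext->p_lprev = lp->p_lprev;
	lp->p_lprev->p_lnext = lp->p_lnext;
	lp->p_lnext = NULL;
	lp->p_lprev = NULL;
}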
412 412 /*
413 413 * Check if bucket pointed by hp already has a pcp entry that matches request
414 414 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
415 415 * Also delete matching entries that cover smaller address range but start
416 416 * at the same address as addr argument. Return the list of deleted entries if
417 417 * any. This is an internal helper function called from seg_pinsert() only
418 418 * for non wired shadow lists. The caller already holds a per seg/amp list
419 419 * lock.
420 420 */
421 421 static struct seg_pcache *
422 422 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
423 423 caddr_t addr, size_t len, int *found)
424 424 {
425 425 struct seg_pcache *pcp;
426 426 struct seg_pcache *delcallb_list = NULL;
427 427
428 428 ASSERT(MUTEX_HELD(&hp->p_hmutex));
429 429
430 430 *found = 0;
431 431 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
432 432 pcp = pcp->p_hnext) {
433 433 ASSERT(pcp->p_hashp == hp);
434 434 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
435 435 ASSERT(!IS_PCP_WIRED(pcp));
436 436 if (pcp->p_len < len) {
437 437 pcache_link_t *plinkp;
438 438 if (pcp->p_active) {
439 439 continue;
440 440 }
441 441 plinkp = &pcp->p_plink;
442 442 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
443 443 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
444 444 pcp->p_hprev->p_hnext = pcp->p_hnext;
445 445 pcp->p_hnext->p_hprev = pcp->p_hprev;
446 446 pcp->p_hprev = delcallb_list;
447 447 delcallb_list = pcp;
448 448 } else {
449 449 *found = 1;
450 450 break;
451 451 }
452 452 }
453 453 }
454 454 return (delcallb_list);
455 455 }
456 456
457 457 /*
458 458 * lookup an address range in pagelock cache. Return shadow list and bump up
459 459 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
460 460 * as a lookup tag.
461 461 */
462 462 struct page **
463 463 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
464 464 enum seg_rw rw, uint_t flags)
465 465 {
466 466 struct seg_pcache *pcp;
467 467 struct seg_phash *hp;
468 468 void *htag0;
469 469
470 470 ASSERT(seg != NULL);
471 471 ASSERT(rw == S_READ || rw == S_WRITE);
472 472
473 473 /*
474 474 * Skip pagelock cache, while DR is in progress or
475 475 * seg_pcache is off.
476 476 */
477 477 if (seg_pdisabled) {
478 478 return (NULL);
479 479 }
480 480 ASSERT(seg_phashsize_win != 0);
481 481
482 482 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
483 483 hp = P_HASHBP(seg, htag0, addr, flags);
484 484 mutex_enter(&hp->p_hmutex);
485 485 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
486 486 pcp = pcp->p_hnext) {
487 487 ASSERT(pcp->p_hashp == hp);
488 488 if (P_MATCH(pcp, htag0, addr, len)) {
489 489 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
490 490 /*
491 491 * If this request wants to write pages
492 492 * but write permissions starting from
493 493 * addr don't cover the entire length len
494 494 * return lookup failure back to the caller.
495 495 * It will check protections and fail this
496 496 * pagelock operation with EACCESS error.
497 497 */
498 498 if (rw == S_WRITE && pcp->p_wlen < len) {
499 499 break;
500 500 }
501 501 if (pcp->p_active == UINT_MAX) {
502 502 break;
503 503 }
504 504 pcp->p_active++;
505 505 if (rw == S_WRITE && !pcp->p_write) {
506 506 pcp->p_write = 1;
507 507 }
508 508 mutex_exit(&hp->p_hmutex);
509 509 return (pcp->p_pp);
510 510 }
511 511 }
512 512 mutex_exit(&hp->p_hmutex);
513 513 return (NULL);
514 514 }
515 515
516 516 /*
517 517 * mark address range inactive. If the cache is off or the address range is
518 518 * not in the cache or another shadow list that covers bigger range is found
519 519 * we call the segment driver to reclaim the pages. Otherwise just decrement
520 520 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
521 521 * otherwise use seg as a lookup tag.
522 522 */
523 523 void
524 524 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
525 525 size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
526 526 seg_preclaim_cbfunc_t callback)
527 527 {
528 528 struct seg_pcache *pcp;
529 529 struct seg_phash *hp;
530 530 kmutex_t *pmtx = NULL;
531 531 pcache_link_t *pheadp;
532 532 void *htag0;
533 533 pgcnt_t npages = 0;
534 534 int keep = 0;
535 535
536 536 ASSERT(seg != NULL);
537 537 ASSERT(rw == S_READ || rw == S_WRITE);
538 538
539 539 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
540 540
541 541 /*
542 542 * Skip lookup if pcache is not configured.
543 543 */
544 544 if (seg_phashsize_win == 0) {
545 545 goto out;
546 546 }
547 547
548 548 /*
549 549 * Grab per seg/amp lock before hash lock if we are going to remove
550 550 * inactive entry from pcache.
551 551 */
552 552 if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
553 553 if (amp == NULL) {
554 554 pheadp = &seg->s_phead;
555 555 pmtx = &seg->s_pmtx;
556 556 } else {
 557 557 			pheadp = &amp->a_phead;
 558 558 			pmtx = &amp->a_pmtx;
559 559 }
560 560 mutex_enter(pmtx);
561 561 }
562 562
563 563 hp = P_HASHBP(seg, htag0, addr, flags);
564 564 mutex_enter(&hp->p_hmutex);
565 565 again:
566 566 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
567 567 pcp = pcp->p_hnext) {
568 568 ASSERT(pcp->p_hashp == hp);
569 569 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
570 570 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
571 571 ASSERT(pcp->p_active);
572 572 if (keep) {
573 573 /*
574 574 * Don't remove this pcp entry
575 575 * if we didn't find duplicate
576 576 * shadow lists on second search.
577 577 * Somebody removed those duplicates
578 578 * since we dropped hash lock after first
579 579 * search.
580 580 */
581 581 ASSERT(pmtx != NULL);
582 582 ASSERT(!IS_PFLAGS_WIRED(flags));
583 583 mutex_exit(pmtx);
584 584 pmtx = NULL;
585 585 }
586 586 pcp->p_active--;
587 587 if (pcp->p_active == 0 && (pmtx != NULL ||
588 588 (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
589 589
590 590 /*
591 591 * This entry is no longer active. Remove it
592 592 * now either because pcaching is temporarily
593 593 * disabled or there're other pcp entries that
594 594 * can match this pagelock request (i.e. this
595 595 * entry is a duplicate).
596 596 */
597 597
598 598 ASSERT(callback == pcp->p_callback);
599 599 if (pmtx != NULL) {
600 600 pcache_link_t *plinkp = &pcp->p_plink;
601 601 ASSERT(!IS_PCP_WIRED(pcp));
602 602 ASSERT(pheadp->p_lnext != pheadp);
603 603 ASSERT(pheadp->p_lprev != pheadp);
604 604 plinkp->p_lprev->p_lnext =
605 605 plinkp->p_lnext;
606 606 plinkp->p_lnext->p_lprev =
607 607 plinkp->p_lprev;
608 608 }
609 609 pcp->p_hprev->p_hnext = pcp->p_hnext;
610 610 pcp->p_hnext->p_hprev = pcp->p_hprev;
611 611 if (!IS_PCP_WIRED(pcp) &&
612 612 hp->p_hnext == (struct seg_pcache *)hp) {
613 613 /*
614 614 * We removed the last entry from this
615 615 * bucket. Now remove the bucket from
616 616 * its active list.
617 617 */
618 618 seg_premove_abuck(hp, 0);
619 619 }
620 620 mutex_exit(&hp->p_hmutex);
621 621 if (pmtx != NULL) {
622 622 mutex_exit(pmtx);
623 623 }
624 624 len = pcp->p_len;
625 625 npages = btop(len);
626 626 if (rw != S_WRITE && pcp->p_write) {
627 627 rw = S_WRITE;
628 628 }
629 629 kmem_cache_free(seg_pkmcache, pcp);
630 630 goto out;
631 631 } else {
632 632 /*
633 633 * We found a matching pcp entry but will not
634 634 * free it right away even if it's no longer
635 635 * active.
636 636 */
637 637 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
638 638 /*
639 639 * Set the reference bit and mark the
640 640 * time of last access to this pcp
641 641 * so that asynchronous thread doesn't
642 642 * free it immediately since
643 643 * it may be reactivated very soon.
644 644 */
645 645 pcp->p_lbolt = ddi_get_lbolt();
646 646 pcp->p_ref = 1;
647 647 }
648 648 mutex_exit(&hp->p_hmutex);
649 649 if (pmtx != NULL) {
650 650 mutex_exit(pmtx);
651 651 }
652 652 return;
653 653 }
654 654 } else if (!IS_PFLAGS_WIRED(flags) &&
655 655 P_MATCH(pcp, htag0, addr, len)) {
656 656 /*
657 657 * This is a duplicate pcp entry. This situation may
658 658 * happen if a bigger shadow list that covers our
659 659 * range was added while our entry was still active.
660 660 * Now we can free our pcp entry if it becomes
661 661 * inactive.
662 662 */
663 663 if (!pcp->p_active) {
664 664 /*
665 665 * Mark this entry as referenced just in case
666 666 * we'll free our own pcp entry soon.
667 667 */
668 668 pcp->p_lbolt = ddi_get_lbolt();
669 669 pcp->p_ref = 1;
670 670 }
671 671 if (pmtx != NULL) {
672 672 /*
673 673 * we are already holding pmtx and found a
674 674 * duplicate. Don't keep our own pcp entry.
675 675 */
676 676 keep = 0;
677 677 continue;
678 678 }
679 679 /*
680 680 * We have to use mutex_tryenter to attempt to lock
681 681 * seg/amp list lock since we already hold hash lock
682 682 * and seg/amp list lock is above hash lock in lock
683 683 * order. If mutex_tryenter fails drop hash lock and
 684 684 * retake both locks in correct order and re-search
685 685 * this hash chain.
686 686 */
687 687 ASSERT(keep == 0);
688 688 if (amp == NULL) {
689 689 pheadp = &seg->s_phead;
690 690 pmtx = &seg->s_pmtx;
691 691 } else {
 692 692 			pheadp = &amp->a_phead;
 693 693 			pmtx = &amp->a_pmtx;
694 694 }
695 695 if (!mutex_tryenter(pmtx)) {
696 696 mutex_exit(&hp->p_hmutex);
697 697 mutex_enter(pmtx);
698 698 mutex_enter(&hp->p_hmutex);
699 699 /*
700 700 * If we don't find bigger shadow list on
701 701 * second search (it may happen since we
702 702 * dropped bucket lock) keep the entry that
703 703 * matches our own shadow list.
704 704 */
705 705 keep = 1;
706 706 goto again;
707 707 }
708 708 }
709 709 }
710 710 mutex_exit(&hp->p_hmutex);
711 711 if (pmtx != NULL) {
712 712 mutex_exit(pmtx);
713 713 }
714 714 out:
715 715 (*callback)(htag0, addr, len, pp, rw, 0);
716 716 if (npages) {
717 717 mutex_enter(&seg_pmem_mtx);
718 718 ASSERT(seg_plocked >= npages);
719 719 seg_plocked -= npages;
720 720 if (!IS_PFLAGS_WIRED(flags)) {
721 721 ASSERT(seg_plocked_window >= npages);
722 722 seg_plocked_window -= npages;
723 723 }
724 724 mutex_exit(&seg_pmem_mtx);
725 725 }
726 726
727 727 }
728 728
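/*
 * Illustrative sketch, not part of this patch: roughly how a segment
 * driver's L_PAGEUNLOCK path hands a shadow list back to pcache.  The
 * reclaim callback body is a stub here; a real driver (e.g. segvn)
 * unlocks the pages named by the shadow list and frees the list itself.
 * Only the seg_pinactive() call and the callback signature come from
 * this file.
 */
/*ARGSUSED*/
static int
ex_reclaim_cb(void *htag0, caddr_t addr, size_t len, struct page **pp,
    enum seg_rw rw, int async)
{
	/* unlock the pages in pp[0 .. btop(len) - 1], then free pp */
	return (0);
}

static void
ex_pageunlock(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, struct page **pplist, enum seg_rw rw)
{
	/*
	 * Drop the active count taken at lookup/insert time; pcache runs
	 * ex_reclaim_cb() itself if it decides not to keep the entry.
	 */
	seg_pinactive(seg, amp, addr, len, pplist, rw, 0, ex_reclaim_cb);
}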
729 729 #ifdef DEBUG
730 730 static uint32_t p_insert_chk_mtbf = 0;
731 731 #endif
732 732
733 733 /*
734 734 * The seg_pinsert_check() is used by segment drivers to predict whether
735 735 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
736 736 */
737 737 /*ARGSUSED*/
738 738 int
739 739 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
740 740 size_t len, uint_t flags)
741 741 {
742 742 ASSERT(seg != NULL);
743 743
744 744 #ifdef DEBUG
745 745 if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
746 746 return (SEGP_FAIL);
747 747 }
748 748 #endif
749 749
750 750 if (seg_pdisabled) {
751 751 return (SEGP_FAIL);
752 752 }
753 753 ASSERT(seg_phashsize_win != 0);
754 754
755 755 if (IS_PFLAGS_WIRED(flags)) {
756 756 return (SEGP_SUCCESS);
757 757 }
758 758
759 759 if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
760 760 return (SEGP_FAIL);
761 761 }
762 762
763 763 if (freemem < desfree) {
764 764 return (SEGP_FAIL);
765 765 }
766 766
767 767 return (SEGP_SUCCESS);
768 768 }
769 769
770 770 #ifdef DEBUG
771 771 static uint32_t p_insert_mtbf = 0;
772 772 #endif
773 773
774 774 /*
775 775 * Insert address range with shadow list into pagelock cache if there's no
776 776 * shadow list already cached for this address range. If the cache is off or
777 777 * caching is temporarily disabled or the allowed 'window' is exceeded return
778 778 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
779 779 *
780 780 * For non wired shadow lists (segvn case) include address in the hashing
781 781 * function to avoid linking all the entries from the same segment or amp on
782 782 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
783 783 * pcache entries are also linked on a per segment/amp list so that all
784 784 * entries can be found quickly during seg/amp purge without walking the
785 785 * entire pcache hash table. For wired shadow lists (segspt case) we
786 786 * don't use address hashing and per segment linking because the caller
787 787 * currently inserts only one entry per segment that covers the entire
788 788 * segment. If we used per segment linking even for segspt it would complicate
789 789 * seg_ppurge_wiredpp() locking.
790 790 *
791 791 * Both hash bucket and per seg/amp locks need to be held before adding a non
792 792 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
793 793 * first.
794 794 *
795 795 * This function will also remove from pcache old inactive shadow lists that
796 796 * overlap with this request but cover smaller range for the same start
797 797 * address.
798 798 */
799 799 int
800 800 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
801 801 size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
802 802 seg_preclaim_cbfunc_t callback)
803 803 {
804 804 struct seg_pcache *pcp;
805 805 struct seg_phash *hp;
806 806 pgcnt_t npages;
807 807 pcache_link_t *pheadp;
808 808 kmutex_t *pmtx;
809 809 struct seg_pcache *delcallb_list = NULL;
810 810
811 811 ASSERT(seg != NULL);
812 812 ASSERT(rw == S_READ || rw == S_WRITE);
813 813 ASSERT(rw == S_READ || wlen == len);
814 814 ASSERT(rw == S_WRITE || wlen <= len);
815 815 ASSERT(amp == NULL || wlen == len);
816 816
817 817 #ifdef DEBUG
818 818 if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
819 819 return (SEGP_FAIL);
820 820 }
821 821 #endif
822 822
823 823 if (seg_pdisabled) {
824 824 return (SEGP_FAIL);
825 825 }
826 826 ASSERT(seg_phashsize_win != 0);
827 827
828 828 ASSERT((len & PAGEOFFSET) == 0);
829 829 npages = btop(len);
830 830 mutex_enter(&seg_pmem_mtx);
831 831 if (!IS_PFLAGS_WIRED(flags)) {
832 832 if (seg_plocked_window + npages > seg_pmaxwindow) {
833 833 mutex_exit(&seg_pmem_mtx);
834 834 return (SEGP_FAIL);
835 835 }
836 836 seg_plocked_window += npages;
837 837 }
838 838 seg_plocked += npages;
839 839 mutex_exit(&seg_pmem_mtx);
840 840
841 841 pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
842 842 /*
843 843 * If amp is not NULL set htag0 to amp otherwise set it to seg.
844 844 */
845 845 if (amp == NULL) {
846 846 pcp->p_htag0 = (void *)seg;
847 847 pcp->p_flags = flags & 0xffff;
848 848 } else {
849 849 pcp->p_htag0 = (void *)amp;
850 850 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
851 851 }
852 852 pcp->p_addr = addr;
853 853 pcp->p_len = len;
854 854 pcp->p_wlen = wlen;
855 855 pcp->p_pp = pp;
856 856 pcp->p_write = (rw == S_WRITE);
857 857 pcp->p_callback = callback;
858 858 pcp->p_active = 1;
859 859
860 860 hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
861 861 if (!IS_PFLAGS_WIRED(flags)) {
862 862 int found;
863 863 void *htag0;
864 864 if (amp == NULL) {
865 865 pheadp = &seg->s_phead;
866 866 pmtx = &seg->s_pmtx;
867 867 htag0 = (void *)seg;
868 868 } else {
 869 869 			pheadp = &amp->a_phead;
 870 870 			pmtx = &amp->a_pmtx;
871 871 htag0 = (void *)amp;
872 872 }
873 873 mutex_enter(pmtx);
874 874 mutex_enter(&hp->p_hmutex);
875 875 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
876 876 len, &found);
877 877 if (found) {
878 878 mutex_exit(&hp->p_hmutex);
879 879 mutex_exit(pmtx);
880 880 mutex_enter(&seg_pmem_mtx);
881 881 seg_plocked -= npages;
882 882 seg_plocked_window -= npages;
883 883 mutex_exit(&seg_pmem_mtx);
884 884 kmem_cache_free(seg_pkmcache, pcp);
885 885 goto out;
886 886 }
887 887 pcp->p_plink.p_lnext = pheadp->p_lnext;
888 888 pcp->p_plink.p_lprev = pheadp;
889 889 pheadp->p_lnext->p_lprev = &pcp->p_plink;
890 890 pheadp->p_lnext = &pcp->p_plink;
891 891 } else {
892 892 mutex_enter(&hp->p_hmutex);
893 893 }
894 894 pcp->p_hashp = hp;
895 895 pcp->p_hnext = hp->p_hnext;
896 896 pcp->p_hprev = (struct seg_pcache *)hp;
897 897 hp->p_hnext->p_hprev = pcp;
898 898 hp->p_hnext = pcp;
899 899 if (!IS_PFLAGS_WIRED(flags) &&
900 900 hp->p_hprev == pcp) {
901 901 seg_padd_abuck(hp);
902 902 }
903 903 mutex_exit(&hp->p_hmutex);
904 904 if (!IS_PFLAGS_WIRED(flags)) {
905 905 mutex_exit(pmtx);
906 906 }
907 907
908 908 out:
909 909 npages = 0;
910 910 while (delcallb_list != NULL) {
911 911 pcp = delcallb_list;
912 912 delcallb_list = pcp->p_hprev;
913 913 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
914 914 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
915 915 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
916 916 npages += btop(pcp->p_len);
917 917 kmem_cache_free(seg_pkmcache, pcp);
918 918 }
919 919 if (npages) {
920 920 ASSERT(!IS_PFLAGS_WIRED(flags));
921 921 mutex_enter(&seg_pmem_mtx);
922 922 ASSERT(seg_plocked >= npages);
923 923 ASSERT(seg_plocked_window >= npages);
924 924 seg_plocked -= npages;
925 925 seg_plocked_window -= npages;
926 926 mutex_exit(&seg_pmem_mtx);
927 927 }
928 928
929 929 return (SEGP_SUCCESS);
930 930 }
931 931
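/*
 * Illustrative sketch, not part of this patch: the L_PAGELOCK fast path a
 * segment driver typically builds on top of seg_plookup(),
 * seg_pinsert_check() and seg_pinsert() above.  ex_build_shadow_list() is
 * a hypothetical driver helper (it would lock the pages and allocate the
 * pp array); ex_reclaim_cb is the stub callback sketched earlier.
 */
static struct page **ex_build_shadow_list(struct seg *, caddr_t, size_t,
    enum seg_rw);

static struct page **
ex_pagelock(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, enum seg_rw rw)
{
	struct page **pplist;

	/* Reuse a cached shadow list if one already covers the range. */
	pplist = seg_plookup(seg, amp, addr, len, rw, 0);
	if (pplist != NULL)
		return (pplist);

	/* Cheap pre-check so we don't lock pages we can't cache anyway. */
	if (seg_pinsert_check(seg, amp, addr, len, 0) == SEGP_FAIL)
		return (NULL);

	/* Lock the pages and build the shadow list (driver specific). */
	if ((pplist = ex_build_shadow_list(seg, addr, len, rw)) == NULL)
		return (NULL);

	/*
	 * Cache it.  On SEGP_FAIL the list simply isn't cached and the
	 * driver keeps managing (and later freeing) it on its own;
	 * seg_pinactive() handles both cases at unlock time.
	 */
	(void) seg_pinsert(seg, amp, addr, len, len, pplist, rw, 0,
	    ex_reclaim_cb);
	return (pplist);
}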
932 932 /*
933 933 * purge entries from the pagelock cache if not active
934 934 * and not recently used.
935 935 */
936 936 static void
937 937 seg_ppurge_async(int force)
938 938 {
939 939 struct seg_pcache *delcallb_list = NULL;
940 940 struct seg_pcache *pcp;
941 941 struct seg_phash *hp;
942 942 pgcnt_t npages = 0;
943 943 pgcnt_t npages_window = 0;
944 944 pgcnt_t npgs_to_purge;
945 945 pgcnt_t npgs_purged = 0;
946 946 int hlinks = 0;
947 947 int hlix;
948 948 pcache_link_t *hlinkp;
949 949 pcache_link_t *hlnextp = NULL;
950 950 int lowmem;
951 951 int trim;
952 952
953 953 ASSERT(seg_phashsize_win != 0);
954 954
955 955 /*
956 956 * if the cache is off or empty, return
957 957 */
958 958 if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
959 959 return;
960 960 }
961 961
962 962 if (!force) {
963 963 lowmem = 0;
964 964 trim = 0;
965 965 if (freemem < lotsfree + needfree) {
966 966 spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
967 967 if (fmem <= 5 * (desfree >> 2)) {
968 968 lowmem = 1;
969 969 } else if (fmem <= 7 * (lotsfree >> 3)) {
970 970 if (seg_plocked_window >=
971 971 (availrmem_initial >> 1)) {
972 972 lowmem = 1;
973 973 }
974 974 } else if (fmem < lotsfree) {
975 975 if (seg_plocked_window >=
976 976 3 * (availrmem_initial >> 2)) {
977 977 lowmem = 1;
978 978 }
979 979 }
980 980 }
981 981 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
982 982 trim = 1;
983 983 }
984 984 if (!lowmem && !trim) {
985 985 return;
986 986 }
987 987 npgs_to_purge = seg_plocked_window >>
988 988 seg_pshrink_shift;
989 989 if (lowmem) {
990 990 npgs_to_purge = MIN(npgs_to_purge,
991 991 MAX(seg_pmaxapurge_npages, desfree));
992 992 } else {
993 993 npgs_to_purge = MIN(npgs_to_purge,
994 994 seg_pmaxapurge_npages);
995 995 }
996 996 if (npgs_to_purge == 0) {
997 997 return;
998 998 }
999 999 } else {
1000 1000 struct seg_phash_wired *hpw;
1001 1001
1002 1002 ASSERT(seg_phashsize_wired != 0);
1003 1003
1004 1004 for (hpw = seg_phashtab_wired;
1005 1005 hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1006 1006
1007 1007 if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1008 1008 continue;
1009 1009 }
1010 1010
1011 1011 mutex_enter(&hpw->p_hmutex);
1012 1012
1013 1013 for (pcp = hpw->p_hnext;
1014 1014 pcp != (struct seg_pcache *)hpw;
1015 1015 pcp = pcp->p_hnext) {
1016 1016
1017 1017 ASSERT(IS_PCP_WIRED(pcp));
1018 1018 ASSERT(pcp->p_hashp ==
1019 1019 (struct seg_phash *)hpw);
1020 1020
1021 1021 if (pcp->p_active) {
1022 1022 continue;
1023 1023 }
1024 1024 pcp->p_hprev->p_hnext = pcp->p_hnext;
1025 1025 pcp->p_hnext->p_hprev = pcp->p_hprev;
1026 1026 pcp->p_hprev = delcallb_list;
1027 1027 delcallb_list = pcp;
1028 1028 }
1029 1029 mutex_exit(&hpw->p_hmutex);
1030 1030 }
1031 1031 }
1032 1032
1033 1033 mutex_enter(&seg_pmem_mtx);
1034 1034 if (seg_pathr_on) {
1035 1035 mutex_exit(&seg_pmem_mtx);
1036 1036 goto runcb;
1037 1037 }
1038 1038 seg_pathr_on = 1;
1039 1039 mutex_exit(&seg_pmem_mtx);
1040 1040 ASSERT(seg_pahcur <= 1);
1041 1041 hlix = !seg_pahcur;
1042 1042
1043 1043 again:
1044 1044 for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1045 1045 hlinkp = hlnextp) {
1046 1046
1047 1047 hlnextp = hlinkp->p_lnext;
1048 1048 ASSERT(hlnextp != NULL);
1049 1049
1050 1050 hp = hlink2phash(hlinkp, hlix);
1051 1051 if (hp->p_hnext == (struct seg_pcache *)hp) {
1052 1052 seg_pathr_empty_ahb++;
1053 1053 continue;
1054 1054 }
1055 1055 seg_pathr_full_ahb++;
1056 1056 mutex_enter(&hp->p_hmutex);
1057 1057
1058 1058 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1059 1059 pcp = pcp->p_hnext) {
1060 1060 pcache_link_t *pheadp;
1061 1061 pcache_link_t *plinkp;
1062 1062 void *htag0;
1063 1063 kmutex_t *pmtx;
1064 1064
1065 1065 ASSERT(!IS_PCP_WIRED(pcp));
1066 1066 ASSERT(pcp->p_hashp == hp);
1067 1067
1068 1068 if (pcp->p_active) {
1069 1069 continue;
1070 1070 }
1071 1071 if (!force && pcp->p_ref &&
1072 1072 PCP_AGE(pcp) < seg_pmax_pcpage) {
1073 1073 pcp->p_ref = 0;
1074 1074 continue;
1075 1075 }
1076 1076 plinkp = &pcp->p_plink;
1077 1077 htag0 = pcp->p_htag0;
1078 1078 if (pcp->p_flags & SEGP_AMP) {
1079 1079 pheadp = &((amp_t *)htag0)->a_phead;
1080 1080 pmtx = &((amp_t *)htag0)->a_pmtx;
1081 1081 } else {
1082 1082 pheadp = &((seg_t *)htag0)->s_phead;
1083 1083 pmtx = &((seg_t *)htag0)->s_pmtx;
1084 1084 }
1085 1085 if (!mutex_tryenter(pmtx)) {
1086 1086 continue;
1087 1087 }
1088 1088 ASSERT(pheadp->p_lnext != pheadp);
1089 1089 ASSERT(pheadp->p_lprev != pheadp);
1090 1090 plinkp->p_lprev->p_lnext =
1091 1091 plinkp->p_lnext;
1092 1092 plinkp->p_lnext->p_lprev =
1093 1093 plinkp->p_lprev;
1094 1094 pcp->p_hprev->p_hnext = pcp->p_hnext;
1095 1095 pcp->p_hnext->p_hprev = pcp->p_hprev;
1096 1096 mutex_exit(pmtx);
1097 1097 pcp->p_hprev = delcallb_list;
1098 1098 delcallb_list = pcp;
1099 1099 npgs_purged += btop(pcp->p_len);
1100 1100 }
1101 1101 if (hp->p_hnext == (struct seg_pcache *)hp) {
1102 1102 seg_premove_abuck(hp, 1);
1103 1103 }
1104 1104 mutex_exit(&hp->p_hmutex);
1105 1105 if (npgs_purged >= seg_plocked_window) {
1106 1106 break;
1107 1107 }
1108 1108 if (!force) {
1109 1109 if (npgs_purged >= npgs_to_purge) {
1110 1110 break;
1111 1111 }
1112 1112 if (!trim && !(seg_pathr_full_ahb & 15)) {
1113 1113 ASSERT(lowmem);
1114 1114 if (freemem >= lotsfree + needfree) {
1115 1115 break;
1116 1116 }
1117 1117 }
1118 1118 }
1119 1119 }
1120 1120
1121 1121 if (hlinkp == &seg_pahhead[hlix]) {
1122 1122 /*
1123 1123 * We processed the entire hlix active bucket list
1124 1124 * but didn't find enough pages to reclaim.
1125 1125 * Switch the lists and walk the other list
1126 1126 * if we haven't done it yet.
1127 1127 */
1128 1128 mutex_enter(&seg_pmem_mtx);
1129 1129 ASSERT(seg_pathr_on);
1130 1130 ASSERT(seg_pahcur == !hlix);
1131 1131 seg_pahcur = hlix;
1132 1132 mutex_exit(&seg_pmem_mtx);
1133 1133 if (++hlinks < 2) {
1134 1134 hlix = !hlix;
1135 1135 goto again;
1136 1136 }
1137 1137 } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1138 1138 seg_pahhead[hlix].p_lnext != hlinkp) {
1139 1139 ASSERT(hlinkp != NULL);
1140 1140 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1141 1141 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1142 1142 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1143 1143
1144 1144 /*
1145 1145 * Reinsert the header to point to hlinkp
1146 1146 * so that we start from hlinkp bucket next time around.
1147 1147 */
1148 1148 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1149 1149 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1150 1150 seg_pahhead[hlix].p_lnext = hlinkp;
1151 1151 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1152 1152 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1153 1153 hlinkp->p_lprev = &seg_pahhead[hlix];
1154 1154 }
1155 1155
1156 1156 mutex_enter(&seg_pmem_mtx);
1157 1157 ASSERT(seg_pathr_on);
1158 1158 seg_pathr_on = 0;
1159 1159 mutex_exit(&seg_pmem_mtx);
1160 1160
1161 1161 runcb:
1162 1162 /*
1163 1163 * Run the delayed callback list. segments/amps can't go away until
1164 1164 * callback is executed since they must have non 0 softlockcnt. That's
1165 1165 * why we don't need to hold as/seg/amp locks to execute the callback.
1166 1166 */
1167 1167 while (delcallb_list != NULL) {
1168 1168 pcp = delcallb_list;
1169 1169 delcallb_list = pcp->p_hprev;
1170 1170 ASSERT(!pcp->p_active);
1171 1171 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1172 1172 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1173 1173 npages += btop(pcp->p_len);
1174 1174 if (!IS_PCP_WIRED(pcp)) {
1175 1175 npages_window += btop(pcp->p_len);
1176 1176 }
1177 1177 kmem_cache_free(seg_pkmcache, pcp);
1178 1178 }
1179 1179 if (npages) {
1180 1180 mutex_enter(&seg_pmem_mtx);
1181 1181 ASSERT(seg_plocked >= npages);
1182 1182 ASSERT(seg_plocked_window >= npages_window);
1183 1183 seg_plocked -= npages;
1184 1184 seg_plocked_window -= npages_window;
1185 1185 mutex_exit(&seg_pmem_mtx);
1186 1186 }
1187 1187 }
1188 1188
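/*
 * Illustrative sketch, not part of this patch: the !force throttling
 * logic above, restated as a stand-alone predicate.  Page counts are
 * passed in rather than read from the kernel globals (freemem, lotsfree,
 * etc.) so the thresholds are easy to check with made-up numbers.
 * Returns nonzero when an asynchronous purge pass is wanted.
 */
static int
ex_purge_wanted(long freemem, long needfree, long lotsfree, long desfree,
    unsigned long locked_window, unsigned long availrmem_initial,
    unsigned long maxwindow)
{
	long fmem;
	int lowmem = 0;

	if (freemem < lotsfree + needfree) {
		fmem = freemem - needfree;
		if (fmem < 0)
			fmem = 0;
		if (fmem <= 5 * (desfree >> 2))		/* <= 1.25 desfree */
			lowmem = 1;
		else if (fmem <= 7 * (lotsfree >> 3))	/* <= 7/8 lotsfree */
			lowmem = (locked_window >= (availrmem_initial >> 1));
		else if (fmem < lotsfree)
			lowmem = (locked_window >=
			    3 * (availrmem_initial >> 2));
	}
	/* also trim when the window is more than 7/8 full */
	return (lowmem || locked_window >= 7 * (maxwindow >> 3));
}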
1189 1189 /*
1190 1190 * Remove cached pages for segment(s) entries from hashtable. The segments
1191 1191 * are identified by pp array. This is useful for multiple seg's cached on
1192 1192 * behalf of dummy segment (ISM/DISM) with common pp array.
1193 1193 */
1194 1194 void
1195 1195 seg_ppurge_wiredpp(struct page **pp)
1196 1196 {
1197 1197 struct seg_pcache *pcp;
1198 1198 struct seg_phash_wired *hp;
1199 1199 pgcnt_t npages = 0;
1200 1200 struct seg_pcache *delcallb_list = NULL;
1201 1201
1202 1202 /*
1203 1203 * if the cache is empty, return
1204 1204 */
1205 1205 if (seg_plocked == 0) {
1206 1206 return;
1207 1207 }
1208 1208 ASSERT(seg_phashsize_wired != 0);
1209 1209
1210 1210 for (hp = seg_phashtab_wired;
1211 1211 hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1212 1212 if (hp->p_hnext == (struct seg_pcache *)hp) {
1213 1213 continue;
1214 1214 }
1215 1215 mutex_enter(&hp->p_hmutex);
1216 1216 pcp = hp->p_hnext;
1217 1217 while (pcp != (struct seg_pcache *)hp) {
1218 1218 ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1219 1219 ASSERT(IS_PCP_WIRED(pcp));
1220 1220 /*
1221 1221 * purge entries which are not active
1222 1222 */
1223 1223 if (!pcp->p_active && pcp->p_pp == pp) {
1224 1224 ASSERT(pcp->p_htag0 != NULL);
1225 1225 pcp->p_hprev->p_hnext = pcp->p_hnext;
1226 1226 pcp->p_hnext->p_hprev = pcp->p_hprev;
1227 1227 pcp->p_hprev = delcallb_list;
1228 1228 delcallb_list = pcp;
1229 1229 }
1230 1230 pcp = pcp->p_hnext;
1231 1231 }
1232 1232 mutex_exit(&hp->p_hmutex);
1233 1233 /*
1234 1234 * segments can't go away until callback is executed since
1235 1235 * they must have non 0 softlockcnt. That's why we don't
1236 1236 * need to hold as/seg locks to execute the callback.
1237 1237 */
1238 1238 while (delcallb_list != NULL) {
1239 1239 int done;
1240 1240 pcp = delcallb_list;
1241 1241 delcallb_list = pcp->p_hprev;
1242 1242 ASSERT(!pcp->p_active);
1243 1243 done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1244 1244 pcp->p_len, pcp->p_pp,
1245 1245 pcp->p_write ? S_WRITE : S_READ, 1);
1246 1246 npages += btop(pcp->p_len);
1247 1247 ASSERT(IS_PCP_WIRED(pcp));
1248 1248 kmem_cache_free(seg_pkmcache, pcp);
1249 1249 if (done) {
1250 1250 ASSERT(delcallb_list == NULL);
1251 1251 goto out;
1252 1252 }
1253 1253 }
1254 1254 }
1255 1255
1256 1256 out:
1257 1257 mutex_enter(&seg_pmem_mtx);
1258 1258 ASSERT(seg_plocked >= npages);
1259 1259 seg_plocked -= npages;
1260 1260 mutex_exit(&seg_pmem_mtx);
1261 1261 }
1262 1262
1263 1263 /*
1264 1264 * purge all entries for a given segment. Since we
1265 1265 * callback into the segment driver directly for page
1266 1266 * reclaim the caller needs to hold the right locks.
1267 1267 */
1268 1268 void
1269 1269 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1270 1270 {
1271 1271 struct seg_pcache *delcallb_list = NULL;
1272 1272 struct seg_pcache *pcp;
1273 1273 struct seg_phash *hp;
1274 1274 pgcnt_t npages = 0;
1275 1275 void *htag0;
1276 1276
1277 1277 if (seg_plocked == 0) {
1278 1278 return;
1279 1279 }
1280 1280 ASSERT(seg_phashsize_win != 0);
1281 1281
1282 1282 /*
1283 1283 * If amp is not NULL use amp as a lookup tag otherwise use seg
1284 1284 * as a lookup tag.
1285 1285 */
1286 1286 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1287 1287 ASSERT(htag0 != NULL);
1288 1288 if (IS_PFLAGS_WIRED(flags)) {
1289 1289 hp = P_HASHBP(seg, htag0, 0, flags);
1290 1290 mutex_enter(&hp->p_hmutex);
1291 1291 pcp = hp->p_hnext;
1292 1292 while (pcp != (struct seg_pcache *)hp) {
1293 1293 ASSERT(pcp->p_hashp == hp);
1294 1294 ASSERT(IS_PCP_WIRED(pcp));
1295 1295 if (pcp->p_htag0 == htag0) {
1296 1296 if (pcp->p_active) {
1297 1297 break;
1298 1298 }
1299 1299 pcp->p_hprev->p_hnext = pcp->p_hnext;
1300 1300 pcp->p_hnext->p_hprev = pcp->p_hprev;
1301 1301 pcp->p_hprev = delcallb_list;
1302 1302 delcallb_list = pcp;
1303 1303 }
1304 1304 pcp = pcp->p_hnext;
1305 1305 }
1306 1306 mutex_exit(&hp->p_hmutex);
1307 1307 } else {
1308 1308 pcache_link_t *plinkp;
1309 1309 pcache_link_t *pheadp;
1310 1310 kmutex_t *pmtx;
1311 1311
1312 1312 if (amp == NULL) {
1313 1313 ASSERT(seg != NULL);
1314 1314 pheadp = &seg->s_phead;
1315 1315 pmtx = &seg->s_pmtx;
1316 1316 } else {
1317 1317 			pheadp = &amp->a_phead;
1318 1318 			pmtx = &amp->a_pmtx;
1319 1319 }
1320 1320 mutex_enter(pmtx);
1321 1321 while ((plinkp = pheadp->p_lnext) != pheadp) {
1322 1322 pcp = plink2pcache(plinkp);
1323 1323 ASSERT(!IS_PCP_WIRED(pcp));
1324 1324 ASSERT(pcp->p_htag0 == htag0);
1325 1325 hp = pcp->p_hashp;
1326 1326 mutex_enter(&hp->p_hmutex);
1327 1327 if (pcp->p_active) {
1328 1328 mutex_exit(&hp->p_hmutex);
1329 1329 break;
1330 1330 }
1331 1331 ASSERT(plinkp->p_lprev == pheadp);
1332 1332 pheadp->p_lnext = plinkp->p_lnext;
1333 1333 plinkp->p_lnext->p_lprev = pheadp;
1334 1334 pcp->p_hprev->p_hnext = pcp->p_hnext;
1335 1335 pcp->p_hnext->p_hprev = pcp->p_hprev;
1336 1336 pcp->p_hprev = delcallb_list;
1337 1337 delcallb_list = pcp;
1338 1338 if (hp->p_hnext == (struct seg_pcache *)hp) {
1339 1339 seg_premove_abuck(hp, 0);
1340 1340 }
1341 1341 mutex_exit(&hp->p_hmutex);
1342 1342 }
1343 1343 mutex_exit(pmtx);
1344 1344 }
1345 1345 while (delcallb_list != NULL) {
1346 1346 pcp = delcallb_list;
1347 1347 delcallb_list = pcp->p_hprev;
1348 1348 ASSERT(!pcp->p_active);
1349 1349 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1350 1350 pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1351 1351 npages += btop(pcp->p_len);
1352 1352 kmem_cache_free(seg_pkmcache, pcp);
1353 1353 }
1354 1354 mutex_enter(&seg_pmem_mtx);
1355 1355 ASSERT(seg_plocked >= npages);
1356 1356 seg_plocked -= npages;
1357 1357 if (!IS_PFLAGS_WIRED(flags)) {
1358 1358 ASSERT(seg_plocked_window >= npages);
1359 1359 seg_plocked_window -= npages;
1360 1360 }
1361 1361 mutex_exit(&seg_pmem_mtx);
1362 1362 }
1363 1363
1364 1364 static void seg_pinit_mem_config(void);
1365 1365
1366 1366 /*
1367 1367 * setup the pagelock cache
1368 1368 */
1369 1369 static void
1370 1370 seg_pinit(void)
1371 1371 {
1372 1372 struct seg_phash *hp;
1373 1373 ulong_t i;
1374 1374 pgcnt_t physmegs;
1375 1375
1376 1376 seg_plocked = 0;
1377 1377 seg_plocked_window = 0;
1378 1378
1379 1379 if (segpcache_enabled == 0) {
1380 1380 seg_phashsize_win = 0;
1381 1381 seg_phashsize_wired = 0;
1382 1382 seg_pdisabled = 1;
1383 1383 return;
1384 1384 }
1385 1385
1386 1386 seg_pdisabled = 0;
1387 1387 seg_pkmcache = kmem_cache_create("seg_pcache",
1388 1388 sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1389 1389 if (segpcache_pcp_maxage_ticks <= 0) {
1390 1390 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1391 1391 }
1392 1392 seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1393 1393 seg_pathr_empty_ahb = 0;
1394 1394 seg_pathr_full_ahb = 0;
1395 1395 seg_pshrink_shift = segpcache_shrink_shift;
1396 1396 seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1397 1397
1398 1398 mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1399 1399 mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 1400 mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1401 1401 cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1402 1402
1403 1403 physmegs = physmem >> (20 - PAGESHIFT);
1404 1404
1405 1405 /*
1406 1406 * If segpcache_hashsize_win was not set in /etc/system or it has
1407 1407 * an absurd value, set it to a default.
1408 1408 */
1409 1409 if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1410 1410 /*
1411 1411 * Create one bucket per 32K (or at least per 8 pages) of
1412 1412 * available memory.
1413 1413 */
1414 1414 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1415 1415 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1416 1416 }
1417 1417 if (!ISP2(segpcache_hashsize_win)) {
1418 1418 ulong_t rndfac = ~(1UL <<
1419 1419 (highbit(segpcache_hashsize_win) - 1));
1420 1420 rndfac &= segpcache_hashsize_win;
1421 1421 segpcache_hashsize_win += rndfac;
1422 1422 segpcache_hashsize_win = 1 <<
1423 1423 (highbit(segpcache_hashsize_win) - 1);
1424 1424 }
1425 1425 seg_phashsize_win = segpcache_hashsize_win;
1426 1426 seg_phashtab_win = kmem_zalloc(
1427 1427 seg_phashsize_win * sizeof (struct seg_phash),
1428 1428 KM_SLEEP);
1429 1429 for (i = 0; i < seg_phashsize_win; i++) {
1430 1430 hp = &seg_phashtab_win[i];
1431 1431 hp->p_hnext = (struct seg_pcache *)hp;
1432 1432 hp->p_hprev = (struct seg_pcache *)hp;
1433 1433 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1434 1434 }
1435 1435
1436 1436 seg_pahcur = 0;
1437 1437 seg_pathr_on = 0;
1438 1438 seg_pahhead[0].p_lnext = &seg_pahhead[0];
1439 1439 seg_pahhead[0].p_lprev = &seg_pahhead[0];
1440 1440 seg_pahhead[1].p_lnext = &seg_pahhead[1];
1441 1441 seg_pahhead[1].p_lprev = &seg_pahhead[1];
1442 1442
1443 1443 /*
1444 1444 * If segpcache_hashsize_wired was not set in /etc/system or it has
1445 1445 * an absurd value, set it to a default.
1446 1446 */
1447 1447 if (segpcache_hashsize_wired == 0 ||
1448 1448 segpcache_hashsize_wired > physmem / 4) {
1449 1449 /*
1450 1450 * Choose segpcache_hashsize_wired based on physmem.
1451 1451 * Create a bucket per 128K bytes, up to 256K buckets.
1452 1452 */
1453 1453 if (physmegs < 20 * 1024) {
1454 1454 segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1455 1455 } else {
1456 1456 segpcache_hashsize_wired = 256 * 1024;
1457 1457 }
1458 1458 }
1459 1459 if (!ISP2(segpcache_hashsize_wired)) {
1460 1460 segpcache_hashsize_wired = 1 <<
1461 1461 highbit(segpcache_hashsize_wired);
1462 1462 }
1463 1463 seg_phashsize_wired = segpcache_hashsize_wired;
1464 1464 seg_phashtab_wired = kmem_zalloc(
1465 1465 seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1466 1466 for (i = 0; i < seg_phashsize_wired; i++) {
1467 1467 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1468 1468 hp->p_hnext = (struct seg_pcache *)hp;
1469 1469 hp->p_hprev = (struct seg_pcache *)hp;
1470 1470 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1471 1471 }
1472 1472
1473 1473 if (segpcache_maxwindow == 0) {
1474 1474 if (physmegs < 64) {
1475 1475 /* 3% of memory */
1476 1476 segpcache_maxwindow = availrmem >> 5;
1477 1477 } else if (physmegs < 512) {
1478 1478 /* 12% of memory */
1479 1479 segpcache_maxwindow = availrmem >> 3;
1480 1480 } else if (physmegs < 1024) {
1481 1481 /* 25% of memory */
1482 1482 segpcache_maxwindow = availrmem >> 2;
1483 1483 } else if (physmegs < 2048) {
1484 1484 /* 50% of memory */
1485 1485 segpcache_maxwindow = availrmem >> 1;
1486 1486 } else {
1487 1487 /* no limit */
1488 1488 segpcache_maxwindow = (pgcnt_t)-1;
1489 1489 }
1490 1490 }
1491 1491 seg_pmaxwindow = segpcache_maxwindow;
1492 1492 seg_pinit_mem_config();
1493 1493 }
1494 1494
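/*
 * Illustrative worked example, not part of this patch: seg_pinit()'s
 * sizing defaults for a hypothetical 8 GB machine with 4K pages.
 *
 *	physmem   = 2M pages, physmegs = physmem >> 8 = 8192
 *	win hash:   MAX(1024, physmem / 8) = 256K buckets
 *	            (one bucket per 8 pages, i.e. per 32K of memory)
 *	wired hash: physmegs < 20K, so MAX(1024, physmegs << 3) = 64K buckets
 *	max window: physmegs >= 2048, so segpcache_maxwindow = (pgcnt_t)-1
 *	            (unlimited; smaller machines get ~3% to 50% of availrmem)
 */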
1495 1495 /*
1496 1496 * called by pageout if memory is low
1497 1497 */
1498 1498 void
1499 1499 seg_preap(void)
1500 1500 {
1501 1501 /*
1502 1502 * if the cache is off or empty, return
1503 1503 */
1504 1504 if (seg_plocked_window == 0) {
1505 1505 return;
1506 1506 }
1507 1507 ASSERT(seg_phashsize_win != 0);
1508 1508
1509 1509 /*
1510 1510 * If somebody is already purging pcache
1511 1511 * just return.
1512 1512 */
1513 1513 if (seg_pdisabled) {
1514 1514 return;
1515 1515 }
1516 1516
1517 1517 cv_signal(&seg_pasync_cv);
1518 1518 }
1519 1519
1520 1520 /*
1521 1521 * run as a background thread and reclaim pagelock
1522 1522 * pages which have not been used recently
1523 1523 */
1524 1524 void
1525 1525 seg_pasync_thread(void)
1526 1526 {
1527 1527 callb_cpr_t cpr_info;
1528 1528
1529 1529 if (seg_phashsize_win == 0) {
1530 1530 thread_exit();
1531 1531 /*NOTREACHED*/
1532 1532 }
1533 1533
1534 1534 seg_pasync_thr = curthread;
1535 1535
1536 1536 CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1537 1537 callb_generic_cpr, "seg_pasync");
1538 1538
1539 1539 if (segpcache_reap_ticks <= 0) {
1540 1540 segpcache_reap_ticks = segpcache_reap_sec * hz;
1541 1541 }
1542 1542
1543 1543 mutex_enter(&seg_pasync_mtx);
1544 1544 for (;;) {
1545 1545 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1546 1546 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1547 1547 segpcache_reap_ticks, TR_CLOCK_TICK);
1548 1548 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1549 1549 if (seg_pdisabled == 0) {
1550 1550 seg_ppurge_async(0);
1551 1551 }
1552 1552 }
1553 1553 }
1554 1554
1555 1555 static struct kmem_cache *seg_cache;
1556 1556
1557 1557 /*
1558 1558 * Initialize segment management data structures.
1559 1559 */
1560 1560 void
1561 1561 seg_init(void)
1562 1562 {
1563 1563 kstat_t *ksp;
1564 1564
1565 1565 seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566 1566 0, NULL, NULL, NULL, NULL, NULL, 0);
1567 1567
1568 1568 ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1569 1569 segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1570 1570 if (ksp) {
1571 1571 ksp->ks_data = (void *)segadvstat_ptr;
1572 1572 kstat_install(ksp);
1573 1573 }
1574 1574
1575 1575 seg_pinit();
1576 1576 }
1577 1577
1578 1578 /*
1579 1579 * Allocate a segment to cover [base, base+size]
1580 1580 * and attach it to the specified address space.
1581 1581 */
1582 1582 struct seg *
1583 1583 seg_alloc(struct as *as, caddr_t base, size_t size)
1584 1584 {
1585 1585 struct seg *new;
1586 1586 caddr_t segbase;
1587 1587 size_t segsize;
1588 1588
1589 1589 segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1590 1590 segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1591 1591 (uintptr_t)segbase;
1592 1592
1593 1593 if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1594 1594 return ((struct seg *)NULL); /* bad virtual addr range */
1595 1595
1596 1596 if (as != &kas &&
1597 1597 valid_usr_range(segbase, segsize, 0, as,
1598 1598 as->a_userlimit) != RANGE_OKAY)
1599 1599 return ((struct seg *)NULL); /* bad virtual addr range */
1600 1600
1601 1601 new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1602 1602 new->s_ops = NULL;
1603 1603 new->s_data = NULL;
1604 1604 new->s_szc = 0;
1605 1605 new->s_flags = 0;
1606 1606 mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607 1607 new->s_phead.p_lnext = &new->s_phead;
1608 1608 new->s_phead.p_lprev = &new->s_phead;
1609 1609 if (seg_attach(as, segbase, segsize, new) < 0) {
1610 1610 kmem_cache_free(seg_cache, new);
1611 1611 return ((struct seg *)NULL);
1612 1612 }
1613 1613 /* caller must fill in ops, data */
1614 1614 return (new);
1615 1615 }
1616 1616
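/*
 * Illustrative worked example, not part of this patch: the rounding in
 * seg_alloc() above simply page-aligns the requested range outward.
 * With hypothetical 4K pages, base = 0x10123 and size = 0x2000 give
 *
 *	segbase = 0x10123 & PAGEMASK            = 0x10000
 *	segsize = ((0x12123 + PAGEOFFSET) & PAGEMASK) - segbase
 *	        = 0x13000 - 0x10000             = 0x3000  (3 pages)
 */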
1617 1617 /*
1618 1618 * Attach a segment to the address space. Used by seg_alloc()
1619 1619 * and for kernel startup to attach to static segments.
1620 1620 */
1621 1621 int
1622 1622 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1623 1623 {
1624 1624 seg->s_as = as;
1625 1625 seg->s_base = base;
1626 1626 seg->s_size = size;
1627 1627
1628 1628 /*
(1628 lines elided)
1629 1629 * as_addseg() will add the segment at the appropriate point
1630 1630 * in the list. It will return -1 if there is overlap with
1631 1631 * an already existing segment.
1632 1632 */
1633 1633 return (as_addseg(as, seg));
1634 1634 }
1635 1635
1636 1636 /*
1637 1637 * Unmap a segment and free it from its associated address space.
1638 1638 * This should be called by anybody who's finished with a whole segment's
1639 -	 * mapping. Just calls SEGOP_UNMAP() on the whole mapping. It is the
1639 +	 * mapping. Just calls segop_unmap() on the whole mapping. It is the
1640 1640	 * responsibility of the segment driver to unlink the segment
1641 1641 * from the address space, and to free public and private data structures
1642 1642 * associated with the segment. (This is typically done by a call to
1643 1643 * seg_free()).
1644 1644 */
1645 1645 void
1646 1646 seg_unmap(struct seg *seg)
1647 1647 {
1648 1648 #ifdef DEBUG
1649 1649 int ret;
1650 1650 #endif /* DEBUG */
1651 1651
1652 1652 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1653 1653
1654 1654 /* Shouldn't have called seg_unmap if mapping isn't yet established */
1655 1655 ASSERT(seg->s_data != NULL);
1656 1656
1657 1657 /* Unmap the whole mapping */
1658 1658 #ifdef DEBUG
1659 - ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1659 + ret = segop_unmap(seg, seg->s_base, seg->s_size);
1660 1660 ASSERT(ret == 0);
1661 1661 #else
1662 - SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1662 + segop_unmap(seg, seg->s_base, seg->s_size);
1663 1663 #endif /* DEBUG */
1664 1664 }
1665 1665
1666 1666 /*
1667 1667 * Free the segment from its associated as. This should only be called
1668 1668 * if a mapping to the segment has not yet been established (e.g., if
1669 1669 * an error occurs in the middle of doing an as_map when the segment
1670 1670 * has already been partially set up) or if it has already been deleted
1671 1671 * (e.g., from a segment driver unmap routine if the unmap applies to the
1672 1672 * entire segment). If the mapping is currently set up then seg_unmap() should
1673 1673 * be called instead.
1674 1674 */
1675 1675 void
1676 1676 seg_free(struct seg *seg)
1677 1677 {
1678 1678 register struct as *as = seg->s_as;
1679 1679 struct seg *tseg = as_removeseg(as, seg);
1680 1680
1681 1681 ASSERT(tseg == seg);
1682 1682
1683 1683 /*
1684 1684 * If the segment private data field is NULL,
1685 1685 * then segment driver is not attached yet.
1686 1686 */
1687 1687 if (seg->s_data != NULL)
1688 - SEGOP_FREE(seg);
1688 + segop_free(seg);
1689 1689
1690 1690 mutex_destroy(&seg->s_pmtx);
1691 1691 ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692 1692 ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1693 1693 kmem_cache_free(seg_cache, seg);
1694 1694 }
1695 1695
1696 1696 /*ARGSUSED*/
1697 1697 static void
1698 1698 seg_p_mem_config_post_add(
1699 1699 void *arg,
1700 1700 pgcnt_t delta_pages)
1701 1701 {
1702 1702 /* Nothing to do. */
1703 1703 }
1704 1704
1705 1705 void
1706 1706 seg_p_enable(void)
1707 1707 {
1708 1708 mutex_enter(&seg_pcache_mtx);
1709 1709 ASSERT(seg_pdisabled != 0);
1710 1710 seg_pdisabled--;
1711 1711 mutex_exit(&seg_pcache_mtx);
1712 1712 }
1713 1713
1714 1714 /*
1715 1715 * seg_p_disable - disables seg_pcache, and then attempts to empty the
1716 1716 * cache.
1717 1717 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1718 1718 * SEGP_FAIL if the cache could not be emptied.
1719 1719 */
1720 1720 int
1721 1721 seg_p_disable(void)
1722 1722 {
1723 1723 pgcnt_t old_plocked;
1724 1724 int stall_count = 0;
1725 1725
1726 1726 mutex_enter(&seg_pcache_mtx);
1727 1727 seg_pdisabled++;
1728 1728 ASSERT(seg_pdisabled != 0);
1729 1729 mutex_exit(&seg_pcache_mtx);
1730 1730
1731 1731 /*
1732 1732 * Attempt to empty the cache. Terminate if seg_plocked does not
1733 1733 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1734 1734 */
1735 1735 while (seg_plocked != 0) {
1736 1736 ASSERT(seg_phashsize_win != 0);
1737 1737 old_plocked = seg_plocked;
1738 1738 seg_ppurge_async(1);
1739 1739 if (seg_plocked == old_plocked) {
1740 1740 if (stall_count++ > SEGP_STALL_THRESHOLD) {
1741 1741 return (SEGP_FAIL);
1742 1742 }
1743 1743 } else
1744 1744 stall_count = 0;
1745 1745 if (seg_plocked != 0)
1746 1746 delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1747 1747 }
1748 1748 return (SEGP_SUCCESS);
1749 1749 }
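Illustratively (the real callers live elsewhere; this is only a sketch of the
contract), a seg_p_disable() call must eventually be matched by a
seg_p_enable(), whether or not the purge succeeded, since the disable count is
bumped unconditionally:

	if (seg_p_disable() != SEGP_SUCCESS) {
		/* cache could not be emptied; carry on regardless */
		cmn_err(CE_NOTE, "!pagelock cache not fully purged");
	}
	/* ... work that must not race with new pagelock caching ... */
	seg_p_enable();

The memory-delete pre/post callbacks below follow exactly this pattern.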
1750 1750
1751 1751 /*
1752 1752 * Attempt to purge seg_pcache. May need to return before this has
1753 1753 * completed to allow other pre_del callbacks to unlock pages. This is
1754 1754 * ok because:
1755 1755 * 1) The seg_pdisabled flag has been set so at least we won't
1756 1756	 *    cache any more locks and the locks we couldn't purge
1757 1757 * will not be held if they do get released by a subsequent
1758 1758 * pre-delete callback.
1759 1759 *
1760 1760 * 2) The rest of the memory delete thread processing does not
1761 1761 * depend on the changes made in this pre-delete callback. No
1762 1762 * panics will result, the worst that will happen is that the
1763 1763 * DR code will timeout and cancel the delete.
1764 1764 */
1765 1765 /*ARGSUSED*/
1766 1766 static int
1767 1767 seg_p_mem_config_pre_del(
1768 1768 void *arg,
1769 1769 pgcnt_t delta_pages)
1770 1770 {
1771 1771 if (seg_phashsize_win == 0) {
1772 1772 return (0);
1773 1773 }
1774 1774 if (seg_p_disable() != SEGP_SUCCESS)
1775 1775 cmn_err(CE_NOTE,
1776 1776		    "!Pre-delete couldn't purge pagelock cache - continuing");
1777 1777 return (0);
1778 1778 }
1779 1779
1780 1780 /*ARGSUSED*/
1781 1781 static void
1782 1782 seg_p_mem_config_post_del(
1783 1783 void *arg,
1784 1784 pgcnt_t delta_pages,
1785 1785 int cancelled)
1786 1786 {
1787 1787 if (seg_phashsize_win == 0) {
1788 1788 return;
1789 1789 }
1790 1790 seg_p_enable();
1791 1791 }
1792 1792
1793 1793 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1794 1794 KPHYSM_SETUP_VECTOR_VERSION,
1795 1795 seg_p_mem_config_post_add,
1796 1796 seg_p_mem_config_pre_del,
1797 1797 seg_p_mem_config_post_del,
1798 1798 };
1799 1799
1800 1800 static void
1801 1801 seg_pinit_mem_config(void)
1802 1802 {
1803 1803 int ret;
1804 1804
1805 1805 ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1806 1806 /*
1807 1807 * Want to catch this in the debug kernel. At run time, if the
1808 1808 * callbacks don't get run all will be OK as the disable just makes
1809 1809 * it more likely that the pages can be collected.
1810 1810 */
1811 1811 ASSERT(ret == 0);
1812 1812 }
1813 1813
1814 1814 /*
1815 1815 * Verify that segment is not a shared anonymous segment which reserves
1816 1816	 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
1817 1817 * from one zone to another if any segments are shared. This is because the
1818 1818 * last process to exit will credit the swap reservation. This could lead
1819 1819 * to the swap being reserved by one zone, and credited to another.
1820 1820 */
1821 1821 boolean_t
1822 1822 seg_can_change_zones(struct seg *seg)
1823 1823 {
1824 1824 struct segvn_data *svd;
1825 1825
1826 1826 if (seg->s_ops == &segspt_shmops)
1827 1827 return (B_FALSE);
1828 1828
1829 1829 if (seg->s_ops == &segvn_ops) {
1830 1830 svd = (struct segvn_data *)seg->s_data;
1831 1831 if (svd->type == MAP_SHARED &&
1832 1832 svd->amp != NULL &&
1833 1833 svd->amp->swresv > 0)
1834 1834 return (B_FALSE);
1835 1835 }
1836 1836 return (B_TRUE);
1837 1837 }
1838 1838
1839 1839 /*
1840 1840 * Return swap reserved by a segment backing a private mapping.
1841 1841 */
1842 1842 size_t
1843 1843 seg_swresv(struct seg *seg)
1844 1844 {
1845 1845 struct segvn_data *svd;
1846 1846 size_t swap = 0;
1847 1847
1848 1848 if (seg->s_ops == &segvn_ops) {
1849 1849 svd = (struct segvn_data *)seg->s_data;
1850 1850 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1851 1851 swap = svd->swresv;
1852 1852 }
1853 1853 return (swap);
1854 1854 }
1855 1855
1856 1856 /*
1857 -	 * Generic not-supported function for SEGOP_INHERIT
1857 +	 * Generic not-supported function for segop_inherit
1858 1858 */
1859 1859 /* ARGSUSED */
1860 1860 int
1861 1861 seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1862 1862 {
1863 1863 return (ENOTSUP);
1864 1864 }
1865 1865
1866 1866 /*
1867 1867 * segop wrappers
1868 1868 */
1869 1869 int
1870 1870 segop_dup(struct seg *seg, struct seg *new)
1871 1871 {
1872 1872 VERIFY3P(seg->s_ops->dup, !=, NULL);
1873 1873
1874 1874 return (seg->s_ops->dup(seg, new));
1875 1875 }
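The wrappers in this block replace the old upper-case SEGOP_*() macros, which
expanded to direct s_ops calls at the callsites. A representative before/after
for this one (illustrative only, not quoted from any particular caller) would be:

	/* before: macro dispatch straight through s_ops */
	error = SEGOP_DUP(seg, newseg);

	/* after: function call that also VERIFYs the op is implemented */
	error = segop_dup(seg, newseg);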
1876 1876
1877 1877 int
1878 1878 segop_unmap(struct seg *seg, caddr_t addr, size_t len)
1879 1879 {
1880 1880 VERIFY3P(seg->s_ops->unmap, !=, NULL);
1881 1881
1882 1882 return (seg->s_ops->unmap(seg, addr, len));
1883 1883 }
1884 1884
1885 1885 void
1886 1886 segop_free(struct seg *seg)
1887 1887 {
1888 1888 VERIFY3P(seg->s_ops->free, !=, NULL);
1889 1889
1890 1890 seg->s_ops->free(seg);
1891 1891 }
1892 1892
1893 1893 faultcode_t
1894 1894 segop_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
1895 1895 enum fault_type type, enum seg_rw rw)
1896 1896 {
1897 1897 VERIFY3P(seg->s_ops->fault, !=, NULL);
1898 1898
1899 1899 return (seg->s_ops->fault(hat, seg, addr, len, type, rw));
1900 1900 }
1901 1901
1902 1902 faultcode_t
1903 1903 segop_faulta(struct seg *seg, caddr_t addr)
1904 1904 {
1905 1905 VERIFY3P(seg->s_ops->faulta, !=, NULL);
1906 1906
1907 1907 return (seg->s_ops->faulta(seg, addr));
1908 1908 }
1909 1909
1910 1910 int
1911 1911 segop_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1912 1912 {
1913 1913 VERIFY3P(seg->s_ops->setprot, !=, NULL);
1914 1914
1915 1915 return (seg->s_ops->setprot(seg, addr, len, prot));
1916 1916 }
1917 1917
1918 1918 int
1919 1919 segop_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1920 1920 {
1921 1921 VERIFY3P(seg->s_ops->checkprot, !=, NULL);
1922 1922
1923 1923 return (seg->s_ops->checkprot(seg, addr, len, prot));
1924 1924 }
1925 1925
1926 1926 int
1927 1927 segop_kluster(struct seg *seg, caddr_t addr, ssize_t d)
1928 1928 {
1929 1929 VERIFY3P(seg->s_ops->kluster, !=, NULL);
1930 1930
1931 1931 return (seg->s_ops->kluster(seg, addr, d));
1932 1932 }
1933 1933
1934 1934 int
1935 1935 segop_sync(struct seg *seg, caddr_t addr, size_t len, int atr, uint_t f)
1936 1936 {
1937 1937 VERIFY3P(seg->s_ops->sync, !=, NULL);
1938 1938
1939 1939 return (seg->s_ops->sync(seg, addr, len, atr, f));
1940 1940 }
1941 1941
1942 1942 size_t
1943 1943 segop_incore(struct seg *seg, caddr_t addr, size_t len, char *v)
1944 1944 {
1945 1945 VERIFY3P(seg->s_ops->incore, !=, NULL);
1946 1946
1947 1947 return (seg->s_ops->incore(seg, addr, len, v));
1948 1948 }
1949 1949
1950 1950 int
1951 1951 segop_lockop(struct seg *seg, caddr_t addr, size_t len, int atr, int op,
1952 1952 ulong_t *b, size_t p)
1953 1953 {
1954 1954 VERIFY3P(seg->s_ops->lockop, !=, NULL);
1955 1955
1956 1956 return (seg->s_ops->lockop(seg, addr, len, atr, op, b, p));
1957 1957 }
1958 1958
1959 1959 int
1960 1960 segop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *p)
1961 1961 {
1962 1962 VERIFY3P(seg->s_ops->getprot, !=, NULL);
1963 1963
1964 1964 return (seg->s_ops->getprot(seg, addr, len, p));
1965 1965 }
1966 1966
1967 1967 u_offset_t
1968 1968 segop_getoffset(struct seg *seg, caddr_t addr)
1969 1969 {
1970 1970 VERIFY3P(seg->s_ops->getoffset, !=, NULL);
1971 1971
1972 1972 return (seg->s_ops->getoffset(seg, addr));
1973 1973 }
1974 1974
1975 1975 int
1976 1976 segop_gettype(struct seg *seg, caddr_t addr)
1977 1977 {
1978 1978 VERIFY3P(seg->s_ops->gettype, !=, NULL);
1979 1979
1980 1980 return (seg->s_ops->gettype(seg, addr));
1981 1981 }
1982 1982
1983 1983 int
1984 1984 segop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
1985 1985 {
1986 1986 VERIFY3P(seg->s_ops->getvp, !=, NULL);
1987 1987
1988 1988 return (seg->s_ops->getvp(seg, addr, vpp));
1989 1989 }
1990 1990
1991 1991 int
1992 1992 segop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t b)
1993 1993 {
1994 1994 VERIFY3P(seg->s_ops->advise, !=, NULL);
1995 1995
1996 1996 return (seg->s_ops->advise(seg, addr, len, b));
1997 1997 }
1998 1998
1999 1999 void
2000 2000 segop_dump(struct seg *seg)
2001 2001 {
2002 2002 VERIFY3P(seg->s_ops->dump, !=, NULL);
2003 2003
2004 2004 seg->s_ops->dump(seg);
2005 2005 }
2006 2006
2007 2007 int
2008 2008 segop_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***page,
2009 2009 enum lock_type type, enum seg_rw rw)
2010 2010 {
2011 2011 VERIFY3P(seg->s_ops->pagelock, !=, NULL);
2012 2012
2013 2013 return (seg->s_ops->pagelock(seg, addr, len, page, type, rw));
2014 2014 }
2015 2015
2016 2016 int
2017 2017 segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
2018 2018 {
2019 2019 VERIFY3P(seg->s_ops->setpagesize, !=, NULL);
2020 2020
2021 2021 return (seg->s_ops->setpagesize(seg, addr, len, szc));
2022 2022 }
2023 2023
2024 2024 int
2025 2025 segop_getmemid(struct seg *seg, caddr_t addr, memid_t *mp)
2026 2026 {
2027 2027 VERIFY3P(seg->s_ops->getmemid, !=, NULL);
2028 2028
2029 2029 return (seg->s_ops->getmemid(seg, addr, mp));
2030 2030 }
2031 2031
2032 2032 struct lgrp_mem_policy_info *
2033 2033 segop_getpolicy(struct seg *seg, caddr_t addr)
2034 2034 {
2035 2035 if (seg->s_ops->getpolicy == NULL)
2036 2036 return (NULL);
2037 2037
2038 2038 return (seg->s_ops->getpolicy(seg, addr));
2039 2039 }
2040 2040
2041 2041 int
2042 2042 segop_capable(struct seg *seg, segcapability_t cap)
2043 2043 {
2044 2044 VERIFY3P(seg->s_ops->capable, !=, NULL);
2045 2045
2046 2046 return (seg->s_ops->capable(seg, cap));
2047 2047 }
2048 2048
2049 2049 int
2050 2050 segop_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t op)
2051 2051 {
2052 2052 if (seg->s_ops->inherit == NULL)
2053 2053 return (ENOTSUP);
2054 2054
2055 2055 return (seg->s_ops->inherit(seg, addr, len, op));
2056 2056 }
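Most of the wrappers above VERIFY that the entry point exists; only getpolicy
and inherit tolerate a NULL op (returning NULL and ENOTSUP respectively). A
hypothetical driver ops table (the "segfoo" names are invented; the member
names come from the wrappers above) could therefore leave those two unset:

	static struct seg_ops segfoo_ops = {
		.dup		= segfoo_dup,
		.unmap		= segfoo_unmap,
		.free		= segfoo_free,
		.fault		= segfoo_fault,
		/* ... remaining mandatory entry points ... */
		.getpolicy	= NULL,		/* segop_getpolicy() returns NULL */
		.inherit	= NULL,		/* segop_inherit() returns ENOTSUP */
	};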