remove whole-process swapping
Long before Unix supported paging, it used whole-process swapping to
reclaim memory. The code is still there and, in theory, runs when the
system gets *extremely* low on memory. In practice it never runs,
because the definition of low-on-memory is antiquated. (XXX: define
what antiquated means)
You can check the number of swapin/swapout events with kstat(1M):
$ kstat -p ::vm:swapin ::vm:swapout
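
On a system where the swapper never kicks in, both counters stay at
zero. A hypothetical example of the output (the per-CPU vm kstat
instances will vary from machine to machine):

$ kstat -p ::vm:swapin ::vm:swapout
cpu:0:vm:swapin	0
cpu:0:vm:swapout	0
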
--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 32 * The Regents of the University of California
33 33 * All Rights Reserved
34 34 *
35 35 * University Acknowledgment- Portions of this document are derived from
36 36 * software developed by the University of California, Berkeley, and its
37 37 * contributors.
38 38 */
39 39
40 40 /*
41 41 * VM - address spaces.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/t_lock.h>
46 46 #include <sys/param.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/mman.h>
50 50 #include <sys/sysmacros.h>
51 51 #include <sys/cpuvar.h>
52 52 #include <sys/sysinfo.h>
53 53 #include <sys/kmem.h>
54 54 #include <sys/vnode.h>
55 55 #include <sys/vmsystm.h>
56 56 #include <sys/cmn_err.h>
57 57 #include <sys/debug.h>
58 58 #include <sys/tnf_probe.h>
59 59 #include <sys/vtrace.h>
60 60
61 61 #include <vm/hat.h>
62 62 #include <vm/as.h>
63 63 #include <vm/seg.h>
64 64 #include <vm/seg_vn.h>
65 65 #include <vm/seg_dev.h>
66 66 #include <vm/seg_kmem.h>
67 67 #include <vm/seg_map.h>
68 68 #include <vm/seg_spt.h>
69 69 #include <vm/page.h>
70 70
71 71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
72 72
73 73 static struct kmem_cache *as_cache;
74 74
75 75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
76 76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
77 77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
78 78
79 79
80 80 /*
81 81 * Verifying the segment lists is very time-consuming; it may not be
82 82 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
83 83 */
84 84 #ifdef DEBUG
85 85 #define VERIFY_SEGLIST
86 86 int do_as_verify = 0;
87 87 #endif
88 88
89 89 /*
90 90 * Allocate a new callback data structure entry and fill in the events of
91 91 * interest, the address range of interest, and the callback argument.
92 92 * Link the entry on the as->a_callbacks list. A callback entry for the
93 93 * entire address space may be specified with vaddr = 0 and size = -1.
94 94 *
95 95 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
96 96 * the specified as, the caller must guarantee persistence of the specified as
97 97 * for the duration of this function (e.g. pages being locked within the as
98 98 * will guarantee persistence).
99 99 */
100 100 int
101 101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
102 102 caddr_t vaddr, size_t size, int sleepflag)
103 103 {
104 104 struct as_callback *current_head, *cb;
105 105 caddr_t saddr;
106 106 size_t rsize;
107 107
108 108 /* callback function and an event are mandatory */
109 109 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
110 110 return (EINVAL);
111 111
112 112 /* Adding a callback after as_free has been called is not allowed */
113 113 if (as == &kas)
114 114 return (ENOMEM);
115 115
116 116 /*
117 117 * vaddr = 0 and size = -1 is used to indicate that the callback range
118 118 * is the entire address space so no rounding is done in that case.
119 119 */
120 120 if (size != -1) {
121 121 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
122 122 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
123 123 (size_t)saddr;
124 124 /* check for wraparound */
125 125 if (saddr + rsize < saddr)
126 126 return (ENOMEM);
127 127 } else {
128 128 if (vaddr != 0)
129 129 return (EINVAL);
130 130 saddr = vaddr;
131 131 rsize = size;
132 132 }
133 133
134 134 /* Allocate and initialize a callback entry */
135 135 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
136 136 if (cb == NULL)
137 137 return (EAGAIN);
138 138
139 139 cb->ascb_func = cb_func;
140 140 cb->ascb_arg = arg;
141 141 cb->ascb_events = events;
142 142 cb->ascb_saddr = saddr;
143 143 cb->ascb_len = rsize;
144 144
145 145 /* Add the entry to the list */
146 146 mutex_enter(&as->a_contents);
147 147 current_head = as->a_callbacks;
148 148 as->a_callbacks = cb;
149 149 cb->ascb_next = current_head;
150 150
151 151 /*
152 152 * The call to this function may lose in a race with
153 153 * a pertinent event - e.g. a thread does long-term memory locking
154 154 * but before the callback is added another thread executes as_unmap.
155 155 * A broadcast here resolves that.
156 156 */
157 157 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
158 158 AS_CLRUNMAPWAIT(as);
159 159 cv_broadcast(&as->a_cv);
160 160 }
161 161
162 162 mutex_exit(&as->a_contents);
163 163 return (0);
164 164 }
165 165
166 166 /*
167 167 * Search the callback list for an entry which pertains to arg.
168 168 *
169 169 * This is called from within the client upon completion of the callback.
170 170 * RETURN VALUES:
171 171 * AS_CALLBACK_DELETED (callback entry found and deleted)
172 172 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
173 173 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
174 174 * entry will be made in as_do_callbacks)
175 175 *
176 176 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
177 177 * set, it indicates that as_do_callbacks is processing this entry. The
178 178 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
179 179 * to unblock as_do_callbacks, in case it is blocked.
180 180 *
181 181 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
182 182 * the specified as, the caller must guarantee persistence of the specified as
183 183 * for the duration of this function (e.g. pages being locked within the as
184 184 * will guarantee persistence).
185 185 */
186 186 uint_t
187 187 as_delete_callback(struct as *as, void *arg)
188 188 {
189 189 struct as_callback **prevcb = &as->a_callbacks;
190 190 struct as_callback *cb;
191 191 uint_t rc = AS_CALLBACK_NOTFOUND;
192 192
193 193 mutex_enter(&as->a_contents);
194 194 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
195 195 if (cb->ascb_arg != arg)
196 196 continue;
197 197
198 198 /*
199 199 * If the events indicate AS_CALLBACK_CALLED, just clear
200 200 * AS_ALL_EVENT in the events field and wakeup the thread
201 201 * that may be waiting in as_do_callbacks. as_do_callbacks
202 202 * will take care of removing this entry from the list. In
203 203 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
204 204 * (AS_CALLBACK_CALLED not set), just remove it from the
205 205 * list, return the memory and return AS_CALLBACK_DELETED.
206 206 */
207 207 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
208 208 /* leave AS_CALLBACK_CALLED */
209 209 cb->ascb_events &= ~AS_ALL_EVENT;
210 210 rc = AS_CALLBACK_DELETE_DEFERRED;
211 211 cv_broadcast(&as->a_cv);
212 212 } else {
213 213 *prevcb = cb->ascb_next;
214 214 kmem_free(cb, sizeof (struct as_callback));
215 215 rc = AS_CALLBACK_DELETED;
216 216 }
217 217 break;
218 218 }
219 219 mutex_exit(&as->a_contents);
220 220 return (rc);
221 221 }
222 222
223 223 /*
224 224 * Searches the as callback list for a matching entry.
225 225 * Returns a pointer to the first matching callback, or NULL if
226 226 * nothing is found.
227 227 * This function never sleeps so it is ok to call it with more
228 228 * locks held than the (required) a_contents mutex.
229 229 *
230 230 * See also comment on as_do_callbacks below.
231 231 */
232 232 static struct as_callback *
233 233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
234 234 size_t event_len)
235 235 {
236 236 struct as_callback *cb;
237 237
238 238 ASSERT(MUTEX_HELD(&as->a_contents));
239 239 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
240 240 /*
241 241 * If the callback has not already been called, then
242 242 * check if events or address range pertains. An event_len
243 243 * of zero means do an unconditional callback.
244 244 */
245 245 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
246 246 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
247 247 (event_addr + event_len < cb->ascb_saddr) ||
248 248 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
249 249 continue;
250 250 }
251 251 break;
252 252 }
253 253 return (cb);
254 254 }
255 255
256 256 /*
257 257 * Executes a given callback and removes it from the callback list for
258 258 * this address space.
259 259 * This function may sleep so the caller must drop all locks except
260 260 * a_contents before calling this func.
261 261 *
262 262 * See also comments on as_do_callbacks below.
263 263 */
264 264 static void
265 265 as_execute_callback(struct as *as, struct as_callback *cb,
266 266 uint_t events)
267 267 {
268 268 struct as_callback **prevcb;
269 269 void *cb_arg;
270 270
271 271 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
272 272 cb->ascb_events |= AS_CALLBACK_CALLED;
273 273 mutex_exit(&as->a_contents);
274 274 (*cb->ascb_func)(as, cb->ascb_arg, events);
275 275 mutex_enter(&as->a_contents);
276 276 /*
277 277 * the callback function is required to delete the callback
278 278 * when the callback function determines it is OK for
279 279 * this thread to continue. as_delete_callback will clear
280 280 * the AS_ALL_EVENT in the events field when it is deleted.
281 281 * If the callback function called as_delete_callback,
282 282 * events will already be cleared and there will be no blocking.
283 283 */
284 284 while ((cb->ascb_events & events) != 0) {
285 285 cv_wait(&as->a_cv, &as->a_contents);
286 286 }
287 287 /*
288 288 * This entry needs to be taken off the list. Normally, the
289 289 * callback func itself does that, but unfortunately the list
290 290 * may have changed while the callback was running because the
291 291 * a_contents mutex was dropped and someone else other than the
292 292 * callback func itself could have called as_delete_callback,
293 293 * so we have to search to find this entry again. The entry
294 294 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
295 295 */
296 296 cb_arg = cb->ascb_arg;
297 297 prevcb = &as->a_callbacks;
298 298 for (cb = as->a_callbacks; cb != NULL;
299 299 prevcb = &cb->ascb_next, cb = *prevcb) {
300 300 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
301 301 (cb_arg != cb->ascb_arg)) {
302 302 continue;
303 303 }
304 304 *prevcb = cb->ascb_next;
305 305 kmem_free(cb, sizeof (struct as_callback));
306 306 break;
307 307 }
308 308 }
309 309
310 310 /*
311 311 * Check the callback list for a matching event and intersection of
312 312 * address range. If there is a match invoke the callback. Skip an entry if:
313 313 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
314 314 * - not an event of interest
315 315 * - not an address range of interest
316 316 *
317 317 * An event_len of zero indicates a request for an unconditional callback
318 318 * (regardless of event); only the AS_CALLBACK_CALLED flag is checked. The
319 319 * a_contents lock must be dropped before a callback, so only one callback
320 320 * can be done before returning. Return -1 (true) if a callback was
321 321 * executed and removed from the list, else return 0 (false).
322 322 *
323 323 * The logically separate parts, i.e. finding a matching callback and
324 324 * executing a given callback have been separated into two functions
325 325 * so that they can be called with different sets of locks held beyond
326 326 * the always-required a_contents. as_find_callback does not sleep so
327 327 * it is ok to call it if more locks than a_contents (i.e. the a_lock
328 328 * rwlock) are held. as_execute_callback on the other hand may sleep
329 329 * so all locks beyond a_contents must be dropped by the caller if one
330 330 * does not want to end up comatose.
331 331 */
332 332 static int
333 333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
334 334 size_t event_len)
335 335 {
336 336 struct as_callback *cb;
337 337
338 338 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
339 339 as_execute_callback(as, cb, events);
340 340 return (-1);
341 341 }
342 342 return (0);
343 343 }
344 344
345 345 /*
346 346 * Search for the segment containing addr. If a segment containing addr
347 347 * exists, that segment is returned. If no such segment exists, and
348 348 * the list spans addresses greater than addr, then the first segment
349 349 * whose base is greater than addr is returned; otherwise, NULL is
350 350 * returned unless tail is true, in which case the last element of the
351 351 * list is returned.
352 352 *
353 353 * a_seglast is used to cache the last found segment for repeated
354 354 * searches to the same addr (which happens frequently).
355 355 */
356 356 struct seg *
357 357 as_findseg(struct as *as, caddr_t addr, int tail)
358 358 {
359 359 struct seg *seg = as->a_seglast;
360 360 avl_index_t where;
361 361
362 362 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
363 363
364 364 if (seg != NULL &&
365 365 seg->s_base <= addr &&
366 366 addr < seg->s_base + seg->s_size)
367 367 return (seg);
368 368
369 369 seg = avl_find(&as->a_segtree, &addr, &where);
370 370 if (seg != NULL)
371 371 return (as->a_seglast = seg);
372 372
373 373 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
374 374 if (seg == NULL && tail)
375 375 seg = avl_last(&as->a_segtree);
376 376 return (as->a_seglast = seg);
377 377 }
378 378
379 379 #ifdef VERIFY_SEGLIST
380 380 /*
381 381 * verify that the linked list is coherent
382 382 */
383 383 static void
384 384 as_verify(struct as *as)
385 385 {
386 386 struct seg *seg, *seglast, *p, *n;
387 387 uint_t nsegs = 0;
388 388
389 389 if (do_as_verify == 0)
390 390 return;
391 391
392 392 seglast = as->a_seglast;
393 393
394 394 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
395 395 ASSERT(seg->s_as == as);
396 396 p = AS_SEGPREV(as, seg);
397 397 n = AS_SEGNEXT(as, seg);
398 398 ASSERT(p == NULL || p->s_as == as);
399 399 ASSERT(p == NULL || p->s_base < seg->s_base);
400 400 ASSERT(n == NULL || n->s_base > seg->s_base);
401 401 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
402 402 if (seg == seglast)
403 403 seglast = NULL;
404 404 nsegs++;
405 405 }
406 406 ASSERT(seglast == NULL);
407 407 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
408 408 }
409 409 #endif /* VERIFY_SEGLIST */
410 410
411 411 /*
412 412 * Add a new segment to the address space. The avl_find()
413 413 * may be expensive, so we attempt to use the last segment accessed
414 414 * in as_gap() as an insertion point.
415 415 */
416 416 int
417 417 as_addseg(struct as *as, struct seg *newseg)
418 418 {
419 419 struct seg *seg;
420 420 caddr_t addr;
421 421 caddr_t eaddr;
422 422 avl_index_t where;
423 423
424 424 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
425 425
426 426 as->a_updatedir = 1; /* inform /proc */
427 427 gethrestime(&as->a_updatetime);
428 428
429 429 if (as->a_lastgaphl != NULL) {
430 430 struct seg *hseg = NULL;
431 431 struct seg *lseg = NULL;
432 432
433 433 if (as->a_lastgaphl->s_base > newseg->s_base) {
434 434 hseg = as->a_lastgaphl;
435 435 lseg = AVL_PREV(&as->a_segtree, hseg);
436 436 } else {
437 437 lseg = as->a_lastgaphl;
438 438 hseg = AVL_NEXT(&as->a_segtree, lseg);
439 439 }
440 440
441 441 if (hseg && lseg && lseg->s_base < newseg->s_base &&
442 442 hseg->s_base > newseg->s_base) {
443 443 avl_insert_here(&as->a_segtree, newseg, lseg,
444 444 AVL_AFTER);
445 445 as->a_lastgaphl = NULL;
446 446 as->a_seglast = newseg;
447 447 return (0);
448 448 }
449 449 as->a_lastgaphl = NULL;
450 450 }
451 451
452 452 addr = newseg->s_base;
453 453 eaddr = addr + newseg->s_size;
454 454 again:
455 455
456 456 seg = avl_find(&as->a_segtree, &addr, &where);
457 457
458 458 if (seg == NULL)
459 459 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
460 460
461 461 if (seg == NULL)
462 462 seg = avl_last(&as->a_segtree);
463 463
464 464 if (seg != NULL) {
465 465 caddr_t base = seg->s_base;
466 466
467 467 /*
468 468 * If top of seg is below the requested address, then
469 469 * the insertion point is at the end of the linked list,
470 470 * and seg points to the tail of the list. Otherwise,
471 471 * the insertion point is immediately before seg.
472 472 */
473 473 if (base + seg->s_size > addr) {
474 474 if (addr >= base || eaddr > base) {
475 475 #ifdef __sparc
476 476 extern struct seg_ops segnf_ops;
477 477
478 478 /*
479 479 * no-fault segs must disappear if overlaid.
480 480 * XXX need new segment type so
481 481 * we don't have to check s_ops
482 482 */
483 483 if (seg->s_ops == &segnf_ops) {
484 484 seg_unmap(seg);
485 485 goto again;
486 486 }
487 487 #endif
488 488 return (-1); /* overlapping segment */
489 489 }
490 490 }
491 491 }
492 492 as->a_seglast = newseg;
493 493 avl_insert(&as->a_segtree, newseg, where);
494 494
495 495 #ifdef VERIFY_SEGLIST
496 496 as_verify(as);
497 497 #endif
498 498 return (0);
499 499 }
500 500
501 501 struct seg *
502 502 as_removeseg(struct as *as, struct seg *seg)
503 503 {
504 504 avl_tree_t *t;
505 505
506 506 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
507 507
508 508 as->a_updatedir = 1; /* inform /proc */
509 509 gethrestime(&as->a_updatetime);
510 510
511 511 if (seg == NULL)
512 512 return (NULL);
513 513
514 514 t = &as->a_segtree;
515 515 if (as->a_seglast == seg)
516 516 as->a_seglast = NULL;
517 517 as->a_lastgaphl = NULL;
518 518
519 519 /*
520 520 * if this segment is at an address higher than
521 521 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
522 522 */
523 523 if (as->a_lastgap &&
524 524 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
525 525 as->a_lastgap = AVL_NEXT(t, seg);
526 526
527 527 /*
528 528 * remove the segment from the seg tree
529 529 */
530 530 avl_remove(t, seg);
531 531
532 532 #ifdef VERIFY_SEGLIST
533 533 as_verify(as);
534 534 #endif
535 535 return (seg);
536 536 }
537 537
538 538 /*
539 539 * Find a segment containing addr.
540 540 */
541 541 struct seg *
542 542 as_segat(struct as *as, caddr_t addr)
543 543 {
544 544 struct seg *seg = as->a_seglast;
545 545
546 546 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
547 547
548 548 if (seg != NULL && seg->s_base <= addr &&
549 549 addr < seg->s_base + seg->s_size)
550 550 return (seg);
551 551
552 552 seg = avl_find(&as->a_segtree, &addr, NULL);
553 553 return (seg);
554 554 }
555 555
556 556 /*
557 557 * Serialize all searches for holes in an address space to
558 558 * prevent two or more threads from allocating the same virtual
559 559 * address range. The address space must not be "read/write"
560 560 * locked by the caller since we may block.
561 561 */
562 562 void
563 563 as_rangelock(struct as *as)
564 564 {
565 565 mutex_enter(&as->a_contents);
566 566 while (AS_ISCLAIMGAP(as))
567 567 cv_wait(&as->a_cv, &as->a_contents);
568 568 AS_SETCLAIMGAP(as);
569 569 mutex_exit(&as->a_contents);
570 570 }
571 571
572 572 /*
573 573 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
574 574 */
575 575 void
576 576 as_rangeunlock(struct as *as)
577 577 {
578 578 mutex_enter(&as->a_contents);
579 579 AS_CLRCLAIMGAP(as);
580 580 cv_signal(&as->a_cv);
581 581 mutex_exit(&as->a_contents);
582 582 }
583 583
584 584 /*
585 585 * compare segments (or just an address) by segment address range
586 586 */
587 587 static int
588 588 as_segcompar(const void *x, const void *y)
589 589 {
590 590 struct seg *a = (struct seg *)x;
591 591 struct seg *b = (struct seg *)y;
592 592
593 593 if (a->s_base < b->s_base)
594 594 return (-1);
595 595 if (a->s_base >= b->s_base + b->s_size)
596 596 return (1);
597 597 return (0);
598 598 }
599 599
600 600
601 601 void
602 602 as_avlinit(struct as *as)
603 603 {
604 604 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
605 605 offsetof(struct seg, s_tree));
606 606 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
607 607 offsetof(struct watched_page, wp_link));
608 608 }
609 609
610 610 /*ARGSUSED*/
611 611 static int
612 612 as_constructor(void *buf, void *cdrarg, int kmflags)
613 613 {
614 614 struct as *as = buf;
615 615
616 616 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
617 617 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
618 618 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
619 619 as_avlinit(as);
620 620 return (0);
621 621 }
622 622
623 623 /*ARGSUSED1*/
624 624 static void
625 625 as_destructor(void *buf, void *cdrarg)
626 626 {
627 627 struct as *as = buf;
628 628
629 629 avl_destroy(&as->a_segtree);
630 630 mutex_destroy(&as->a_contents);
631 631 cv_destroy(&as->a_cv);
632 632 rw_destroy(&as->a_lock);
633 633 }
634 634
635 635 void
636 636 as_init(void)
637 637 {
638 638 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
639 639 as_constructor, as_destructor, NULL, NULL, NULL, 0);
640 640 }
641 641
642 642 /*
643 643 * Allocate and initialize an address space data structure.
644 644 * We call hat_alloc to allow any machine dependent
645 645 * information in the hat structure to be initialized.
646 646 */
647 647 struct as *
648 648 as_alloc(void)
649 649 {
650 650 struct as *as;
651 651
652 652 as = kmem_cache_alloc(as_cache, KM_SLEEP);
653 653
654 654 as->a_flags = 0;
655 655 as->a_vbits = 0;
656 656 as->a_hrm = NULL;
657 657 as->a_seglast = NULL;
658 658 as->a_size = 0;
659 659 as->a_resvsize = 0;
660 660 as->a_updatedir = 0;
661 661 gethrestime(&as->a_updatetime);
662 662 as->a_objectdir = NULL;
663 663 as->a_sizedir = 0;
664 664 as->a_userlimit = (caddr_t)USERLIMIT;
665 665 as->a_lastgap = NULL;
666 666 as->a_lastgaphl = NULL;
667 667 as->a_callbacks = NULL;
668 668
669 669 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
670 670 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
671 671 AS_LOCK_EXIT(as, &as->a_lock);
672 672
673 673 return (as);
674 674 }
675 675
676 676 /*
677 677 * Free an address space data structure.
678 678 * Need to free the hat first and then
679 679 * all the segments on this as and finally
680 680 * the space for the as struct itself.
681 681 */
682 682 void
683 683 as_free(struct as *as)
684 684 {
685 685 struct hat *hat = as->a_hat;
686 686 struct seg *seg, *next;
687 687 boolean_t free_started = B_FALSE;
688 688
689 689 top:
690 690 /*
691 691 * Invoke ALL callbacks. as_do_callbacks will do one callback
692 692 * per call, and not return (-1) until the callback has completed.
693 693 * When as_do_callbacks returns zero, all callbacks have completed.
694 694 */
695 695 mutex_enter(&as->a_contents);
696 696 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
697 697 ;
698 698
699 699 mutex_exit(&as->a_contents);
700 700 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
701 701
702 702 if (!free_started) {
703 703 free_started = B_TRUE;
704 704 hat_free_start(hat);
705 705 }
706 706 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
707 707 int err;
708 708
709 709 next = AS_SEGNEXT(as, seg);
710 710 retry:
711 711 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
712 712 if (err == EAGAIN) {
713 713 mutex_enter(&as->a_contents);
714 714 if (as->a_callbacks) {
715 715 AS_LOCK_EXIT(as, &as->a_lock);
716 716 } else if (!AS_ISNOUNMAPWAIT(as)) {
717 717 /*
718 718 * Memory is currently locked. Wait for a
719 719 * cv_signal that it has been unlocked, then
720 720 * try the operation again.
721 721 */
722 722 if (AS_ISUNMAPWAIT(as) == 0)
723 723 cv_broadcast(&as->a_cv);
724 724 AS_SETUNMAPWAIT(as);
725 725 AS_LOCK_EXIT(as, &as->a_lock);
726 726 while (AS_ISUNMAPWAIT(as))
727 727 cv_wait(&as->a_cv, &as->a_contents);
728 728 } else {
729 729 /*
730 730 * We may have raced with
731 731 * segvn_reclaim()/segspt_reclaim(). In this
732 732 * case clean nounmapwait flag and retry since
733 733 * softlockcnt in this segment may be already
734 734 * 0. We don't drop as writer lock so our
735 735 * number of retries without sleeping should
736 736 * be very small. See segvn_reclaim() for
737 737 * more comments.
738 738 */
739 739 AS_CLRNOUNMAPWAIT(as);
740 740 mutex_exit(&as->a_contents);
741 741 goto retry;
742 742 }
743 743 mutex_exit(&as->a_contents);
744 744 goto top;
745 745 } else {
746 746 /*
747 747 * We do not expect any other error return at this
748 748 * time. This is similar to an ASSERT in seg_unmap()
749 749 */
750 750 ASSERT(err == 0);
751 751 }
752 752 }
753 753 hat_free_end(hat);
754 754 AS_LOCK_EXIT(as, &as->a_lock);
755 755
756 756 /* /proc stuff */
757 757 ASSERT(avl_numnodes(&as->a_wpage) == 0);
758 758 if (as->a_objectdir) {
759 759 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
760 760 as->a_objectdir = NULL;
761 761 as->a_sizedir = 0;
762 762 }
763 763
764 764 /*
765 765 * Free the struct as back to kmem. Assert it has no segments.
766 766 */
767 767 ASSERT(avl_numnodes(&as->a_segtree) == 0);
768 768 kmem_cache_free(as_cache, as);
769 769 }
770 770
771 771 int
772 772 as_dup(struct as *as, struct proc *forkedproc)
773 773 {
774 774 struct as *newas;
775 775 struct seg *seg, *newseg;
776 776 size_t purgesize = 0;
777 777 int error;
778 778
779 779 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
780 780 as_clearwatch(as);
781 781 newas = as_alloc();
782 782 newas->a_userlimit = as->a_userlimit;
783 783 newas->a_proc = forkedproc;
784 784
785 785 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
786 786
787 787 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
788 788
789 789 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
790 790
791 791 if (seg->s_flags & S_PURGE) {
792 792 purgesize += seg->s_size;
793 793 continue;
794 794 }
795 795
796 796 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
797 797 if (newseg == NULL) {
798 798 AS_LOCK_EXIT(newas, &newas->a_lock);
799 799 as_setwatch(as);
800 800 AS_LOCK_EXIT(as, &as->a_lock);
801 801 as_free(newas);
802 802 return (-1);
803 803 }
804 804 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
805 805 /*
806 806 * We call seg_free() on the new seg
807 807 * because the segment is not set up
808 808 * completely; i.e. it has no ops.
809 809 */
810 810 as_setwatch(as);
811 811 AS_LOCK_EXIT(as, &as->a_lock);
812 812 seg_free(newseg);
813 813 AS_LOCK_EXIT(newas, &newas->a_lock);
814 814 as_free(newas);
815 815 return (error);
816 816 }
817 817 newas->a_size += seg->s_size;
818 818 }
819 819 newas->a_resvsize = as->a_resvsize - purgesize;
820 820
821 821 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
822 822
823 823 AS_LOCK_EXIT(newas, &newas->a_lock);
824 824
825 825 as_setwatch(as);
826 826 AS_LOCK_EXIT(as, &as->a_lock);
827 827 if (error != 0) {
828 828 as_free(newas);
829 829 return (error);
830 830 }
831 831 forkedproc->p_as = newas;
832 832 return (0);
833 833 }
834 834
835 835 /*
836 836 * Handle a ``fault'' at addr for size bytes.
837 837 */
838 838 faultcode_t
839 839 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
840 840 enum fault_type type, enum seg_rw rw)
841 841 {
842 842 struct seg *seg;
843 843 caddr_t raddr; /* rounded down addr */
844 844 size_t rsize; /* rounded up size */
845 845 size_t ssize;
846 846 faultcode_t res = 0;
847 847 caddr_t addrsav;
848 848 struct seg *segsav;
849 849 int as_lock_held;
850 850 klwp_t *lwp = ttolwp(curthread);
851 851 int holding_wpage = 0;
852 852
853 853
854 854
855 855 retry:
856 856 /*
857 857 * Indicate that the lwp is not to be stopped while waiting for a
858 858 * pagefault. This is to avoid deadlock while debugging a process
859 859 * via /proc over NFS (in particular).
860 860 */
861 861 if (lwp != NULL)
862 862 lwp->lwp_nostop++;
863 863
864 864 /*
865 865 * same length must be used when we softlock and softunlock. We
866 866 * don't support softunlocking lengths less than the original length
867 867 * when there is largepage support. See seg_dev.c for more
868 868 * comments.
869 869 */
870 870 switch (type) {
871 871
872 872 case F_SOFTLOCK:
873 873 CPU_STATS_ADD_K(vm, softlock, 1);
874 874 break;
875 875
876 876 case F_SOFTUNLOCK:
877 877 break;
878 878
879 879 case F_PROT:
880 880 CPU_STATS_ADD_K(vm, prot_fault, 1);
881 881 break;
882 882
883 883 case F_INVAL:
884 884 CPU_STATS_ENTER_K();
885 885 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
886 886 if (as == &kas)
887 887 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
888 888 CPU_STATS_EXIT_K();
889 889 break;
890 890 }
891 891
892 892 /* Kernel probe */
893 893 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
894 894 tnf_opaque, address, addr,
895 895 tnf_fault_type, fault_type, type,
896 896 tnf_seg_access, access, rw);
897 897
898 898 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
899 899 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
900 900 (size_t)raddr;
901 901
902 902 /*
903 903 * XXX -- Don't grab the as lock for segkmap. We should grab it for
904 904 * correctness, but then we could be stuck holding this lock for
905 905 * a LONG time if the fault needs to be resolved on a slow
906 906 * filesystem, and then no-one will be able to exec new commands,
907 907 * as exec'ing requires the write lock on the as.
908 908 */
909 909 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
910 910 raddr + size < segkmap->s_base + segkmap->s_size) {
911 911 seg = segkmap;
912 912 as_lock_held = 0;
913 913 } else {
914 914 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
915 915
916 916 seg = as_segat(as, raddr);
917 917 if (seg == NULL) {
918 918 AS_LOCK_EXIT(as, &as->a_lock);
919 919 if (lwp != NULL)
920 920 lwp->lwp_nostop--;
921 921 return (FC_NOMAP);
922 922 }
923 923
924 924 as_lock_held = 1;
925 925 }
926 926
927 927 addrsav = raddr;
928 928 segsav = seg;
929 929
930 930 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
931 931 if (raddr >= seg->s_base + seg->s_size) {
932 932 seg = AS_SEGNEXT(as, seg);
933 933 if (seg == NULL || raddr != seg->s_base) {
934 934 res = FC_NOMAP;
935 935 break;
936 936 }
937 937 }
938 938 if (raddr + rsize > seg->s_base + seg->s_size)
939 939 ssize = seg->s_base + seg->s_size - raddr;
940 940 else
941 941 ssize = rsize;
942 942
943 943 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
944 944
945 945 /* Restore watchpoints */
946 946 if (holding_wpage) {
947 947 as_setwatch(as);
948 948 holding_wpage = 0;
949 949 }
950 950
951 951 if (res != 0)
952 952 break;
953 953 }
954 954
955 955 /*
956 956 * If we were SOFTLOCKing and encountered a failure,
957 957 * we must SOFTUNLOCK the range we already did. (Maybe we
958 958 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
959 959 * right here...)
960 960 */
961 961 if (res != 0 && type == F_SOFTLOCK) {
962 962 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
963 963 if (addrsav >= seg->s_base + seg->s_size)
964 964 seg = AS_SEGNEXT(as, seg);
965 965 ASSERT(seg != NULL);
966 966 /*
967 967 * Now call the fault routine again to perform the
968 968 * unlock using S_OTHER instead of the rw variable
969 969 * since we never got a chance to touch the pages.
970 970 */
971 971 if (raddr > seg->s_base + seg->s_size)
972 972 ssize = seg->s_base + seg->s_size - addrsav;
973 973 else
974 974 ssize = raddr - addrsav;
975 975 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
976 976 F_SOFTUNLOCK, S_OTHER);
977 977 }
978 978 }
979 979 if (as_lock_held)
980 980 AS_LOCK_EXIT(as, &as->a_lock);
981 981 if (lwp != NULL)
982 982 lwp->lwp_nostop--;
983 983
984 984 /*
985 985 * If the lower levels returned EDEADLK for a fault,
986 986 * it means that we should retry the fault. Let's also wait
987 987 * a bit to let the deadlock-causing condition clear.
988 988 * This is part of a gross hack to work around a design flaw
989 989 * in the ufs/sds logging code and should go away when the
990 990 * logging code is re-designed to fix the problem. See bug
991 991 * 4125102 for details of the problem.
992 992 */
993 993 if (FC_ERRNO(res) == EDEADLK) {
994 994 delay(deadlk_wait);
995 995 res = 0;
996 996 goto retry;
997 997 }
998 998 return (res);
999 999 }
1000 1000
1001 1001
1002 1002
1003 1003 /*
1004 1004 * Asynchronous ``fault'' at addr for size bytes.
1005 1005 */
1006 1006 faultcode_t
1007 1007 as_faulta(struct as *as, caddr_t addr, size_t size)
1008 1008 {
1009 1009 struct seg *seg;
1010 1010 caddr_t raddr; /* rounded down addr */
1011 1011 size_t rsize; /* rounded up size */
1012 1012 faultcode_t res = 0;
1013 1013 klwp_t *lwp = ttolwp(curthread);
1014 1014
1015 1015 retry:
1016 1016 /*
1017 1017 * Indicate that the lwp is not to be stopped while waiting
1018 1018 * for a pagefault. This is to avoid deadlock while debugging
1019 1019 * a process via /proc over NFS (in particular).
1020 1020 */
1021 1021 if (lwp != NULL)
1022 1022 lwp->lwp_nostop++;
1023 1023
1024 1024 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1025 1025 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1026 1026 (size_t)raddr;
1027 1027
1028 1028 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1029 1029 seg = as_segat(as, raddr);
1030 1030 if (seg == NULL) {
1031 1031 AS_LOCK_EXIT(as, &as->a_lock);
1032 1032 if (lwp != NULL)
1033 1033 lwp->lwp_nostop--;
1034 1034 return (FC_NOMAP);
1035 1035 }
1036 1036
1037 1037 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1038 1038 if (raddr >= seg->s_base + seg->s_size) {
1039 1039 seg = AS_SEGNEXT(as, seg);
1040 1040 if (seg == NULL || raddr != seg->s_base) {
1041 1041 res = FC_NOMAP;
1042 1042 break;
1043 1043 }
1044 1044 }
1045 1045 res = SEGOP_FAULTA(seg, raddr);
1046 1046 if (res != 0)
1047 1047 break;
1048 1048 }
1049 1049 AS_LOCK_EXIT(as, &as->a_lock);
1050 1050 if (lwp != NULL)
1051 1051 lwp->lwp_nostop--;
1052 1052 /*
1053 1053 * If the lower levels returned EDEADLK for a fault,
1054 1054 * it means that we should retry the fault. Let's also wait
1055 1055 * a bit to let the deadlock-causing condition clear.
1056 1056 * This is part of a gross hack to work around a design flaw
1057 1057 * in the ufs/sds logging code and should go away when the
1058 1058 * logging code is re-designed to fix the problem. See bug
1059 1059 * 4125102 for details of the problem.
1060 1060 */
1061 1061 if (FC_ERRNO(res) == EDEADLK) {
1062 1062 delay(deadlk_wait);
1063 1063 res = 0;
1064 1064 goto retry;
1065 1065 }
1066 1066 return (res);
1067 1067 }
1068 1068
1069 1069 /*
1070 1070 * Set the virtual mapping for the interval from [addr : addr + size)
1071 1071 * in address space `as' to have the specified protection.
1072 1072 * It is ok for the range to cross over several segments,
1073 1073 * as long as they are contiguous.
1074 1074 */
1075 1075 int
1076 1076 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1077 1077 {
1078 1078 struct seg *seg;
1079 1079 struct as_callback *cb;
1080 1080 size_t ssize;
1081 1081 caddr_t raddr; /* rounded down addr */
1082 1082 size_t rsize; /* rounded up size */
1083 1083 int error = 0, writer = 0;
1084 1084 caddr_t saveraddr;
1085 1085 size_t saversize;
1086 1086
1087 1087 setprot_top:
1088 1088 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1089 1089 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1090 1090 (size_t)raddr;
1091 1091
1092 1092 if (raddr + rsize < raddr) /* check for wraparound */
1093 1093 return (ENOMEM);
1094 1094
1095 1095 saveraddr = raddr;
1096 1096 saversize = rsize;
1097 1097
1098 1098 /*
1099 1099 * Normally we only lock the as as a reader. But
1100 1100 * if due to setprot the segment driver needs to split
1101 1101 * a segment it will return IE_RETRY. Therefore we re-acquire
1102 1102 * the as lock as a writer so the segment driver can change
1103 1103 * the seg list. Also the segment driver will return IE_RETRY
1104 1104 * after it has changed the segment list, so we keep
1105 1105 * locking as a writer. Since these operations should be rare,
1106 1106 * we want to only lock as a writer when necessary.
1107 1107 */
1108 1108 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1109 1109 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1110 1110 } else {
1111 1111 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1112 1112 }
1113 1113
1114 1114 as_clearwatchprot(as, raddr, rsize);
1115 1115 seg = as_segat(as, raddr);
1116 1116 if (seg == NULL) {
1117 1117 as_setwatch(as);
1118 1118 AS_LOCK_EXIT(as, &as->a_lock);
1119 1119 return (ENOMEM);
1120 1120 }
1121 1121
1122 1122 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1123 1123 if (raddr >= seg->s_base + seg->s_size) {
1124 1124 seg = AS_SEGNEXT(as, seg);
1125 1125 if (seg == NULL || raddr != seg->s_base) {
1126 1126 error = ENOMEM;
1127 1127 break;
1128 1128 }
1129 1129 }
1130 1130 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1131 1131 ssize = seg->s_base + seg->s_size - raddr;
1132 1132 else
1133 1133 ssize = rsize;
1134 1134 retry:
1135 1135 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1136 1136
1137 1137 if (error == IE_NOMEM) {
1138 1138 error = EAGAIN;
1139 1139 break;
1140 1140 }
1141 1141
1142 1142 if (error == IE_RETRY) {
1143 1143 AS_LOCK_EXIT(as, &as->a_lock);
1144 1144 writer = 1;
1145 1145 goto setprot_top;
1146 1146 }
1147 1147
1148 1148 if (error == EAGAIN) {
1149 1149 /*
1150 1150 * Make sure we have a_lock as writer.
1151 1151 */
1152 1152 if (writer == 0) {
1153 1153 AS_LOCK_EXIT(as, &as->a_lock);
1154 1154 writer = 1;
1155 1155 goto setprot_top;
1156 1156 }
1157 1157
1158 1158 /*
1159 1159 * Memory is currently locked. It must be unlocked
1160 1160 * before this operation can succeed through a retry.
1161 1161 * The possible reasons for locked memory and
1162 1162 * corresponding strategies for unlocking are:
1163 1163 * (1) Normal I/O
1164 1164 * wait for a signal that the I/O operation
1165 1165 * has completed and the memory is unlocked.
1166 1166 * (2) Asynchronous I/O
1167 1167 * The aio subsystem does not unlock pages when
1168 1168 * the I/O is completed. Those pages are unlocked
1169 1169 * when the application calls aiowait/aioerror.
1170 1170 * So, to prevent blocking forever, cv_broadcast()
1171 1171 * is done to wake up aio_cleanup_thread.
1172 1172 * Subsequently, segvn_reclaim will be called, and
1173 1173 * that will do AS_CLRUNMAPWAIT() and wake us up.
1174 1174 * (3) Long term page locking:
1175 1175 * Drivers intending to have pages locked for a
1176 1176 * period considerably longer than for normal I/O
1177 1177 * (essentially forever) may have registered for a
1178 1178 * callback so they may unlock these pages on
1179 1179 * request. This is needed to allow this operation
1180 1180 * to succeed. Each entry on the callback list is
1181 1181 * examined. If the event or address range pertains
1182 1182 * the callback is invoked (unless it already is in
1183 1183 * progress). The a_contents lock must be dropped
1184 1184 * before the callback, so only one callback can
1185 1185 * be done at a time. Go to the top and do more
1186 1186 * until zero is returned. If zero is returned,
1187 1187 * either there were no callbacks for this event
1188 1188 * or they were already in progress.
1189 1189 */
1190 1190 mutex_enter(&as->a_contents);
1191 1191 if (as->a_callbacks &&
1192 1192 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1193 1193 seg->s_base, seg->s_size))) {
1194 1194 AS_LOCK_EXIT(as, &as->a_lock);
1195 1195 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1196 1196 } else if (!AS_ISNOUNMAPWAIT(as)) {
1197 1197 if (AS_ISUNMAPWAIT(as) == 0)
1198 1198 cv_broadcast(&as->a_cv);
1199 1199 AS_SETUNMAPWAIT(as);
1200 1200 AS_LOCK_EXIT(as, &as->a_lock);
1201 1201 while (AS_ISUNMAPWAIT(as))
1202 1202 cv_wait(&as->a_cv, &as->a_contents);
1203 1203 } else {
1204 1204 /*
1205 1205 * We may have raced with
1206 1206 * segvn_reclaim()/segspt_reclaim(). In this
1207 1207 * case clean nounmapwait flag and retry since
1208 1208 * softlockcnt in this segment may be already
1209 1209 * 0. We don't drop as writer lock so our
1210 1210 * number of retries without sleeping should
1211 1211 * be very small. See segvn_reclaim() for
1212 1212 * more comments.
1213 1213 */
1214 1214 AS_CLRNOUNMAPWAIT(as);
1215 1215 mutex_exit(&as->a_contents);
1216 1216 goto retry;
1217 1217 }
1218 1218 mutex_exit(&as->a_contents);
1219 1219 goto setprot_top;
1220 1220 } else if (error != 0)
1221 1221 break;
1222 1222 }
1223 1223 if (error != 0) {
1224 1224 as_setwatch(as);
1225 1225 } else {
1226 1226 as_setwatchprot(as, saveraddr, saversize, prot);
1227 1227 }
1228 1228 AS_LOCK_EXIT(as, &as->a_lock);
1229 1229 return (error);
1230 1230 }
1231 1231
1232 1232 /*
1233 1233 * Check to make sure that the interval [addr, addr + size)
1234 1234 * in address space `as' has at least the specified protection.
1235 1235 * It is ok for the range to cross over several segments, as long
1236 1236 * as they are contiguous.
1237 1237 */
1238 1238 int
1239 1239 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1240 1240 {
1241 1241 struct seg *seg;
1242 1242 size_t ssize;
1243 1243 caddr_t raddr; /* rounded down addr */
1244 1244 size_t rsize; /* rounded up size */
1245 1245 int error = 0;
1246 1246
1247 1247 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1248 1248 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1249 1249 (size_t)raddr;
1250 1250
1251 1251 if (raddr + rsize < raddr) /* check for wraparound */
1252 1252 return (ENOMEM);
1253 1253
1254 1254 /*
1255 1255 * This is ugly as sin...
1256 1256 * Normally, we only acquire the address space readers lock.
1257 1257 * However, if the address space has watchpoints present,
1258 1258 * we must acquire the writer lock on the address space for
1259 1259 * the benefit of as_clearwatchprot() and as_setwatchprot().
1260 1260 */
1261 1261 if (avl_numnodes(&as->a_wpage) != 0)
1262 1262 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1263 1263 else
1264 1264 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1265 1265 as_clearwatchprot(as, raddr, rsize);
1266 1266 seg = as_segat(as, raddr);
1267 1267 if (seg == NULL) {
1268 1268 as_setwatch(as);
1269 1269 AS_LOCK_EXIT(as, &as->a_lock);
1270 1270 return (ENOMEM);
1271 1271 }
1272 1272
1273 1273 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1274 1274 if (raddr >= seg->s_base + seg->s_size) {
1275 1275 seg = AS_SEGNEXT(as, seg);
1276 1276 if (seg == NULL || raddr != seg->s_base) {
1277 1277 error = ENOMEM;
1278 1278 break;
1279 1279 }
1280 1280 }
1281 1281 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1282 1282 ssize = seg->s_base + seg->s_size - raddr;
1283 1283 else
1284 1284 ssize = rsize;
1285 1285
1286 1286 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1287 1287 if (error != 0)
1288 1288 break;
1289 1289 }
1290 1290 as_setwatch(as);
1291 1291 AS_LOCK_EXIT(as, &as->a_lock);
1292 1292 return (error);
1293 1293 }
1294 1294
1295 1295 int
1296 1296 as_unmap(struct as *as, caddr_t addr, size_t size)
1297 1297 {
1298 1298 struct seg *seg, *seg_next;
1299 1299 struct as_callback *cb;
1300 1300 caddr_t raddr, eaddr;
1301 1301 size_t ssize, rsize = 0;
1302 1302 int err;
1303 1303
1304 1304 top:
1305 1305 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1306 1306 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1307 1307 (uintptr_t)PAGEMASK);
1308 1308
1309 1309 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1310 1310
1311 1311 as->a_updatedir = 1; /* inform /proc */
1312 1312 gethrestime(&as->a_updatetime);
1313 1313
1314 1314 /*
1315 1315 * Use as_findseg to find the first segment in the range, then
1316 1316 * step through the segments in order, following s_next.
1317 1317 */
1318 1318 as_clearwatchprot(as, raddr, eaddr - raddr);
1319 1319
1320 1320 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1321 1321 if (eaddr <= seg->s_base)
1322 1322 break; /* eaddr was in a gap; all done */
1323 1323
1324 1324 /* this is implied by the test above */
1325 1325 ASSERT(raddr < eaddr);
1326 1326
1327 1327 if (raddr < seg->s_base)
1328 1328 raddr = seg->s_base; /* raddr was in a gap */
1329 1329
1330 1330 if (eaddr > (seg->s_base + seg->s_size))
1331 1331 ssize = seg->s_base + seg->s_size - raddr;
1332 1332 else
1333 1333 ssize = eaddr - raddr;
1334 1334
1335 1335 /*
1336 1336 * Save next segment pointer since seg can be
1337 1337 * destroyed during the segment unmap operation.
1338 1338 */
1339 1339 seg_next = AS_SEGNEXT(as, seg);
1340 1340
1341 1341 /*
1342 1342 * We didn't count /dev/null mappings, so ignore them here.
1343 1343 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1344 1344 * we have to do this check here while we have seg.)
1345 1345 */
1346 1346 rsize = 0;
1347 1347 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1348 1348 !SEG_IS_PARTIAL_RESV(seg))
1349 1349 rsize = ssize;
1350 1350
1351 1351 retry:
1352 1352 err = SEGOP_UNMAP(seg, raddr, ssize);
1353 1353 if (err == EAGAIN) {
1354 1354 /*
1355 1355 * Memory is currently locked. It must be unlocked
1356 1356 * before this operation can succeed through a retry.
1357 1357 * The possible reasons for locked memory and
1358 1358 * corresponding strategies for unlocking are:
1359 1359 * (1) Normal I/O
1360 1360 * wait for a signal that the I/O operation
1361 1361 * has completed and the memory is unlocked.
1362 1362 * (2) Asynchronous I/O
1363 1363 * The aio subsystem does not unlock pages when
1364 1364 * the I/O is completed. Those pages are unlocked
1365 1365 * when the application calls aiowait/aioerror.
1366 1366 * So, to prevent blocking forever, cv_broadcast()
1367 1367 * is done to wake up aio_cleanup_thread.
1368 1368 * Subsequently, segvn_reclaim will be called, and
1369 1369 * that will do AS_CLRUNMAPWAIT() and wake us up.
1370 1370 * (3) Long term page locking:
1371 1371 * Drivers intending to have pages locked for a
1372 1372 * period considerably longer than for normal I/O
1373 1373 * (essentially forever) may have registered for a
1374 1374 * callback so they may unlock these pages on
1375 1375 * request. This is needed to allow this operation
1376 1376 * to succeed. Each entry on the callback list is
1377 1377 * examined. If the event or address range pertains
1378 1378 * the callback is invoked (unless it already is in
1379 1379 * progress). The a_contents lock must be dropped
1380 1380 * before the callback, so only one callback can
1381 1381 * be done at a time. Go to the top and do more
1382 1382 * until zero is returned. If zero is returned,
1383 1383 * either there were no callbacks for this event
1384 1384 * or they were already in progress.
1385 1385 */
1386 1386 mutex_enter(&as->a_contents);
1387 1387 if (as->a_callbacks &&
1388 1388 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1389 1389 seg->s_base, seg->s_size))) {
1390 1390 AS_LOCK_EXIT(as, &as->a_lock);
1391 1391 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1392 1392 } else if (!AS_ISNOUNMAPWAIT(as)) {
1393 1393 if (AS_ISUNMAPWAIT(as) == 0)
1394 1394 cv_broadcast(&as->a_cv);
1395 1395 AS_SETUNMAPWAIT(as);
1396 1396 AS_LOCK_EXIT(as, &as->a_lock);
1397 1397 while (AS_ISUNMAPWAIT(as))
1398 1398 cv_wait(&as->a_cv, &as->a_contents);
1399 1399 } else {
1400 1400 /*
1401 1401 * We may have raced with
1402 1402 * segvn_reclaim()/segspt_reclaim(). In this
1403 1403 * case clean nounmapwait flag and retry since
1404 1404 * softlockcnt in this segment may be already
1405 1405 * 0. We don't drop as writer lock so our
1406 1406 * number of retries without sleeping should
1407 1407 * be very small. See segvn_reclaim() for
1408 1408 * more comments.
1409 1409 */
1410 1410 AS_CLRNOUNMAPWAIT(as);
1411 1411 mutex_exit(&as->a_contents);
1412 1412 goto retry;
1413 1413 }
1414 1414 mutex_exit(&as->a_contents);
1415 1415 goto top;
1416 1416 } else if (err == IE_RETRY) {
1417 1417 AS_LOCK_EXIT(as, &as->a_lock);
1418 1418 goto top;
1419 1419 } else if (err) {
1420 1420 as_setwatch(as);
1421 1421 AS_LOCK_EXIT(as, &as->a_lock);
1422 1422 return (-1);
1423 1423 }
1424 1424
1425 1425 as->a_size -= ssize;
1426 1426 if (rsize)
1427 1427 as->a_resvsize -= rsize;
1428 1428 raddr += ssize;
1429 1429 }
1430 1430 AS_LOCK_EXIT(as, &as->a_lock);
1431 1431 return (0);
1432 1432 }
1433 1433
1434 1434 static int
1435 1435 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1436 1436 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1437 1437 {
1438 1438 uint_t szc;
1439 1439 uint_t nszc;
1440 1440 int error;
1441 1441 caddr_t a;
1442 1442 caddr_t eaddr;
1443 1443 size_t segsize;
1444 1444 struct seg *seg;
1445 1445 size_t pgsz;
1446 1446 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1447 1447 uint_t save_szcvec;
1448 1448
1449 1449 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1450 1450 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1451 1451 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1452 1452 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1453 1453 if (!do_off) {
1454 1454 vn_a->offset = 0;
1455 1455 }
1456 1456
1457 1457 if (szcvec <= 1) {
1458 1458 seg = seg_alloc(as, addr, size);
1459 1459 if (seg == NULL) {
1460 1460 return (ENOMEM);
1461 1461 }
1462 1462 vn_a->szc = 0;
1463 1463 error = (*crfp)(seg, vn_a);
1464 1464 if (error != 0) {
1465 1465 seg_free(seg);
1466 1466 } else {
1467 1467 as->a_size += size;
1468 1468 as->a_resvsize += size;
1469 1469 }
1470 1470 return (error);
1471 1471 }
1472 1472
1473 1473 eaddr = addr + size;
1474 1474 save_szcvec = szcvec;
1475 1475 szcvec >>= 1;
1476 1476 szc = 0;
1477 1477 nszc = 0;
1478 1478 while (szcvec) {
1479 1479 if ((szcvec & 0x1) == 0) {
1480 1480 nszc++;
1481 1481 szcvec >>= 1;
1482 1482 continue;
1483 1483 }
1484 1484 nszc++;
1485 1485 pgsz = page_get_pagesize(nszc);
1486 1486 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1487 1487 if (a != addr) {
1488 1488 ASSERT(a < eaddr);
1489 1489 segsize = a - addr;
1490 1490 seg = seg_alloc(as, addr, segsize);
1491 1491 if (seg == NULL) {
1492 1492 return (ENOMEM);
1493 1493 }
1494 1494 vn_a->szc = szc;
1495 1495 error = (*crfp)(seg, vn_a);
1496 1496 if (error != 0) {
1497 1497 seg_free(seg);
1498 1498 return (error);
1499 1499 }
1500 1500 as->a_size += segsize;
1501 1501 as->a_resvsize += segsize;
1502 1502 *segcreated = 1;
1503 1503 if (do_off) {
1504 1504 vn_a->offset += segsize;
1505 1505 }
1506 1506 addr = a;
1507 1507 }
1508 1508 szc = nszc;
1509 1509 szcvec >>= 1;
1510 1510 }
1511 1511
1512 1512 ASSERT(addr < eaddr);
1513 1513 szcvec = save_szcvec | 1; /* add 8K pages */
1514 1514 while (szcvec) {
1515 1515 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1516 1516 ASSERT(a >= addr);
1517 1517 if (a != addr) {
1518 1518 segsize = a - addr;
1519 1519 seg = seg_alloc(as, addr, segsize);
1520 1520 if (seg == NULL) {
1521 1521 return (ENOMEM);
1522 1522 }
1523 1523 vn_a->szc = szc;
1524 1524 error = (*crfp)(seg, vn_a);
1525 1525 if (error != 0) {
1526 1526 seg_free(seg);
1527 1527 return (error);
1528 1528 }
1529 1529 as->a_size += segsize;
1530 1530 as->a_resvsize += segsize;
1531 1531 *segcreated = 1;
1532 1532 if (do_off) {
1533 1533 vn_a->offset += segsize;
1534 1534 }
1535 1535 addr = a;
1536 1536 }
1537 1537 szcvec &= ~(1 << szc);
1538 1538 if (szcvec) {
1539 1539 szc = highbit(szcvec) - 1;
1540 1540 pgsz = page_get_pagesize(szc);
1541 1541 }
1542 1542 }
1543 1543 ASSERT(addr == eaddr);
1544 1544
1545 1545 return (0);
1546 1546 }
1547 1547
1548 1548 static int
1549 1549 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1550 1550 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1551 1551 {
1552 1552 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1553 1553 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1554 1554 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1555 1555 type, 0);
1556 1556 int error;
1557 1557 struct seg *seg;
1558 1558 struct vattr va;
1559 1559 u_offset_t eoff;
1560 1560 size_t save_size = 0;
1561 1561 extern size_t textrepl_size_thresh;
1562 1562
1563 1563 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1564 1564 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1565 1565 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1566 1566 ASSERT(vn_a->vp != NULL);
1567 1567 ASSERT(vn_a->amp == NULL);
1568 1568
1569 1569 again:
1570 1570 if (szcvec <= 1) {
1571 1571 seg = seg_alloc(as, addr, size);
1572 1572 if (seg == NULL) {
1573 1573 return (ENOMEM);
1574 1574 }
1575 1575 vn_a->szc = 0;
1576 1576 error = (*crfp)(seg, vn_a);
1577 1577 if (error != 0) {
1578 1578 seg_free(seg);
1579 1579 } else {
1580 1580 as->a_size += size;
1581 1581 as->a_resvsize += size;
1582 1582 }
1583 1583 return (error);
1584 1584 }
1585 1585
1586 1586 va.va_mask = AT_SIZE;
1587 1587 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1588 1588 szcvec = 0;
1589 1589 goto again;
1590 1590 }
1591 1591 eoff = vn_a->offset & PAGEMASK;
1592 1592 if (eoff >= va.va_size) {
1593 1593 szcvec = 0;
1594 1594 goto again;
1595 1595 }
1596 1596 eoff += size;
1597 1597 if (btopr(va.va_size) < btopr(eoff)) {
1598 1598 save_size = size;
1599 1599 size = va.va_size - (vn_a->offset & PAGEMASK);
1600 1600 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1601 1601 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1602 1602 type, 0);
1603 1603 if (szcvec <= 1) {
1604 1604 size = save_size;
1605 1605 goto again;
1606 1606 }
1607 1607 }
1608 1608
1609 1609 if (size > textrepl_size_thresh) {
1610 1610 vn_a->flags |= _MAP_TEXTREPL;
1611 1611 }
1612 1612 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1613 1613 segcreated);
1614 1614 if (error != 0) {
1615 1615 return (error);
1616 1616 }
1617 1617 if (save_size) {
1618 1618 addr += size;
1619 1619 size = save_size - size;
1620 1620 szcvec = 0;
1621 1621 goto again;
1622 1622 }
1623 1623 return (0);
1624 1624 }
1625 1625
1626 1626 /*
1627 1627 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1628 1628 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1629 1629 */
1630 1630 static int
1631 1631 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1632 1632 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1633 1633 {
1634 1634 uint_t szcvec;
1635 1635 uchar_t type;
1636 1636
1637 1637 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1638 1638 if (vn_a->type == MAP_SHARED) {
1639 1639 type = MAPPGSZC_SHM;
1640 1640 } else if (vn_a->type == MAP_PRIVATE) {
1641 1641 if (vn_a->szc == AS_MAP_HEAP) {
1642 1642 type = MAPPGSZC_HEAP;
1643 1643 } else if (vn_a->szc == AS_MAP_STACK) {
1644 1644 type = MAPPGSZC_STACK;
1645 1645 } else {
1646 1646 type = MAPPGSZC_PRIVM;
1647 1647 }
1648 1648 }
1649 1649 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1650 1650 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1651 1651 (vn_a->flags & MAP_TEXT), type, 0);
1652 1652 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1653 1653 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1654 1654 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1655 1655 ASSERT(vn_a->vp == NULL);
1656 1656
1657 1657 return (as_map_segvn_segs(as, addr, size, szcvec,
1658 1658 crfp, vn_a, segcreated));
1659 1659 }
1660 1660
1661 1661 int
1662 1662 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1663 1663 {
1664 1664 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1665 1665 return (as_map_locked(as, addr, size, crfp, argsp));
1666 1666 }
1667 1667
1668 1668 int
1669 1669 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1670 1670 void *argsp)
1671 1671 {
1672 1672 struct seg *seg = NULL;
1673 1673 caddr_t raddr; /* rounded down addr */
1674 1674 size_t rsize; /* rounded up size */
1675 1675 int error;
1676 1676 int unmap = 0;
1677 1677 struct proc *p = curproc;
1678 1678 struct segvn_crargs crargs;
1679 1679
1680 1680 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1681 1681 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1682 1682 (size_t)raddr;
1683 1683
1684 1684 /*
1685 1685 	 * check for wraparound
1686 1686 */
1687 1687 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1688 1688 AS_LOCK_EXIT(as, &as->a_lock);
1689 1689 return (ENOMEM);
1690 1690 }
1691 1691
1692 1692 as->a_updatedir = 1; /* inform /proc */
1693 1693 gethrestime(&as->a_updatetime);
1694 1694
1695 1695 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1696 1696 AS_LOCK_EXIT(as, &as->a_lock);
1697 1697
1698 1698 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1699 1699 RCA_UNSAFE_ALL);
1700 1700
1701 1701 return (ENOMEM);
1702 1702 }
1703 1703
1704 1704 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1705 1705 crargs = *(struct segvn_crargs *)argsp;
1706 1706 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1707 1707 if (error != 0) {
1708 1708 AS_LOCK_EXIT(as, &as->a_lock);
1709 1709 if (unmap) {
1710 1710 (void) as_unmap(as, addr, size);
1711 1711 }
1712 1712 return (error);
1713 1713 }
1714 1714 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1715 1715 crargs = *(struct segvn_crargs *)argsp;
1716 1716 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1717 1717 if (error != 0) {
1718 1718 AS_LOCK_EXIT(as, &as->a_lock);
1719 1719 if (unmap) {
1720 1720 (void) as_unmap(as, addr, size);
1721 1721 }
1722 1722 return (error);
1723 1723 }
1724 1724 } else {
1725 1725 seg = seg_alloc(as, addr, size);
1726 1726 if (seg == NULL) {
1727 1727 AS_LOCK_EXIT(as, &as->a_lock);
1728 1728 return (ENOMEM);
1729 1729 }
1730 1730
1731 1731 error = (*crfp)(seg, argsp);
1732 1732 if (error != 0) {
1733 1733 seg_free(seg);
1734 1734 AS_LOCK_EXIT(as, &as->a_lock);
1735 1735 return (error);
1736 1736 }
1737 1737 /*
1738 1738 * Add size now so as_unmap will work if as_ctl fails.
1739 1739 */
1740 1740 as->a_size += rsize;
1741 1741 as->a_resvsize += rsize;
1742 1742 }
1743 1743
1744 1744 as_setwatch(as);
1745 1745
1746 1746 /*
1747 1747 * If the address space is locked,
1748 1748 * establish memory locks for the new segment.
1749 1749 */
1750 1750 mutex_enter(&as->a_contents);
1751 1751 if (AS_ISPGLCK(as)) {
1752 1752 mutex_exit(&as->a_contents);
1753 1753 AS_LOCK_EXIT(as, &as->a_lock);
1754 1754 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1755 1755 if (error != 0)
1756 1756 (void) as_unmap(as, addr, size);
1757 1757 } else {
1758 1758 mutex_exit(&as->a_contents);
1759 1759 AS_LOCK_EXIT(as, &as->a_lock);
1760 1760 }
1761 1761 return (error);
1762 1762 }
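The raddr/rsize computation in as_map_locked() is the page-rounding idiom used throughout this file: the base is rounded down to a page boundary and the end is rounded up. A minimal standalone sketch, with a 4K page size assumed for illustration:

    #include <stdint.h>
    #include <stddef.h>

    #define	PAGESIZE	4096UL		/* assumed for illustration */
    #define	PAGEOFFSET	(PAGESIZE - 1)
    #define	PAGEMASK	(~PAGEOFFSET)

    /* Round [addr, addr + size) outward to whole pages. */
    static void
    round_to_pages(uintptr_t addr, size_t size, uintptr_t *raddr,
        size_t *rsize)
    {
    	*raddr = addr & PAGEMASK;	/* base rounded down */
    	/* end rounded up, minus the rounded base */
    	*rsize = ((addr + size + PAGEOFFSET) & PAGEMASK) - *raddr;
    }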
1763 1763
1764 1764
1765 1765 /*
1766 1766 * Delete all segments in the address space marked with S_PURGE.
1767 1767 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1768 1768 * These segments are deleted as a first step before calls to as_gap(), so
1769 1769 * that they don't affect mmap() or shmat().
1770 1770 */
1771 1771 void
1772 1772 as_purge(struct as *as)
1773 1773 {
1774 1774 struct seg *seg;
1775 1775 struct seg *next_seg;
1776 1776
1777 1777 /*
1778 1778 	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
1779 1779 * no need to grab a_contents mutex for this check
1780 1780 */
1781 1781 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1782 1782 return;
1783 1783
1784 1784 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1785 1785 next_seg = NULL;
1786 1786 seg = AS_SEGFIRST(as);
1787 1787 while (seg != NULL) {
1788 1788 next_seg = AS_SEGNEXT(as, seg);
1789 1789 if (seg->s_flags & S_PURGE)
1790 1790 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1791 1791 seg = next_seg;
1792 1792 }
1793 1793 AS_LOCK_EXIT(as, &as->a_lock);
1794 1794
1795 1795 mutex_enter(&as->a_contents);
1796 1796 as->a_flags &= ~AS_NEEDSPURGE;
1797 1797 mutex_exit(&as->a_contents);
1798 1798 }
1799 1799
1800 1800 /*
1801 1801 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1802 1802 * range of addresses at least "minlen" long, where the base of the range is
1803 1803 * at "off" phase from an "align" boundary and there is space for a
1804 1804  * "redzone"-sized redzone on either side of the range. Thus,
1805 1805 * if align was 4M and off was 16k, the user wants a hole which will start
1806 1806 * 16k into a 4M page.
1807 1807 *
1808 1808 * If flags specifies AH_HI, the hole will have the highest possible address
1809 1809 * in the range. We use the as->a_lastgap field to figure out where to
1810 1810 * start looking for a gap.
1811 1811 *
1812 1812 * Otherwise, the gap will have the lowest possible address.
1813 1813 *
1814 1814 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1815 1815 *
1816 1816 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1817 1817 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1818 1818 *
1819 1819 * NOTE: This routine is not correct when base+len overflows caddr_t.
1820 1820 */
1821 1821 int
1822 1822 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1823 1823 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1824 1824 {
1825 1825 caddr_t lobound = *basep;
1826 1826 caddr_t hibound = lobound + *lenp;
1827 1827 struct seg *lseg, *hseg;
1828 1828 caddr_t lo, hi;
1829 1829 int forward;
1830 1830 caddr_t save_base;
1831 1831 size_t save_len;
1832 1832 size_t save_minlen;
1833 1833 size_t save_redzone;
1834 1834 int fast_path = 1;
1835 1835
1836 1836 save_base = *basep;
1837 1837 save_len = *lenp;
1838 1838 save_minlen = minlen;
1839 1839 save_redzone = redzone;
1840 1840
1841 1841 /*
1842 1842 	 * For the first pass/fast_path, just fold align and both redzones
1843 1843 	 * into minlen: if we find a hole that large, we can guarantee that
1844 1844 	 * the allocation will fit the alignment and redzone requested.
1845 1845 	 * This increases the chance that hibound will be adjusted to
1846 1846 	 * a_lastgap->s_base, which will likely allow us to find an
1847 1847 	 * acceptable hole in the address space more quickly.
1848 1848 * If we can't find a hole with this fast_path, then we look for
1849 1849 * smaller holes in which the alignment and offset may allow
1850 1850 * the allocation to fit.
1851 1851 */
1852 1852 minlen += align;
1853 1853 minlen += 2 * redzone;
1854 1854 redzone = 0;
1855 1855
1856 1856 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1857 1857 if (AS_SEGFIRST(as) == NULL) {
1858 1858 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1859 1859 align, redzone, off)) {
1860 1860 AS_LOCK_EXIT(as, &as->a_lock);
1861 1861 return (0);
1862 1862 } else {
1863 1863 AS_LOCK_EXIT(as, &as->a_lock);
1864 1864 *basep = save_base;
1865 1865 *lenp = save_len;
1866 1866 return (-1);
1867 1867 }
1868 1868 }
1869 1869
1870 1870 retry:
1871 1871 /*
1872 1872 * Set up to iterate over all the inter-segment holes in the given
1873 1873 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1874 1874 * NULL for the highest-addressed hole. If moving backwards, we reset
1875 1875 	 * hseg to denote the highest-addressed segment.
1876 1876 */
1877 1877 forward = (flags & AH_DIR) == AH_LO;
1878 1878 if (forward) {
1879 1879 hseg = as_findseg(as, lobound, 1);
1880 1880 lseg = AS_SEGPREV(as, hseg);
1881 1881 } else {
1882 1882
1883 1883 /*
1884 1884 * If allocating at least as much as the last allocation,
1885 1885 * use a_lastgap's base as a better estimate of hibound.
1886 1886 */
1887 1887 if (as->a_lastgap &&
1888 1888 minlen >= as->a_lastgap->s_size &&
1889 1889 hibound >= as->a_lastgap->s_base)
1890 1890 hibound = as->a_lastgap->s_base;
1891 1891
1892 1892 hseg = as_findseg(as, hibound, 1);
1893 1893 if (hseg->s_base + hseg->s_size < hibound) {
1894 1894 lseg = hseg;
1895 1895 hseg = NULL;
1896 1896 } else {
1897 1897 lseg = AS_SEGPREV(as, hseg);
1898 1898 }
1899 1899 }
1900 1900
1901 1901 for (;;) {
1902 1902 /*
1903 1903 * Set lo and hi to the hole's boundaries. (We should really
1904 1904 * use MAXADDR in place of hibound in the expression below,
1905 1905 * but can't express it easily; using hibound in its place is
1906 1906 * harmless.)
1907 1907 */
1908 1908 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1909 1909 hi = (hseg == NULL) ? hibound : hseg->s_base;
1910 1910 /*
1911 1911 * If the iteration has moved past the interval from lobound
1912 1912 * to hibound it's pointless to continue.
1913 1913 */
1914 1914 if ((forward && lo > hibound) || (!forward && hi < lobound))
1915 1915 break;
1916 1916 else if (lo > hibound || hi < lobound)
1917 1917 goto cont;
1918 1918 /*
1919 1919 * Candidate hole lies at least partially within the allowable
1920 1920 * range. Restrict it to fall completely within that range,
1921 1921 * i.e., to [max(lo, lobound), min(hi, hibound)].
1922 1922 */
1923 1923 if (lo < lobound)
1924 1924 lo = lobound;
1925 1925 if (hi > hibound)
1926 1926 hi = hibound;
1927 1927 /*
1928 1928 * Verify that the candidate hole is big enough and meets
1929 1929 * hardware constraints. If the hole is too small, no need
1930 1930 * to do the further checks since they will fail.
1931 1931 */
1932 1932 *basep = lo;
1933 1933 *lenp = hi - lo;
1934 1934 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1935 1935 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1936 1936 ((flags & AH_CONTAIN) == 0 ||
1937 1937 (*basep <= addr && *basep + *lenp > addr))) {
1938 1938 if (!forward)
1939 1939 as->a_lastgap = hseg;
1940 1940 if (hseg != NULL)
1941 1941 as->a_lastgaphl = hseg;
1942 1942 else
1943 1943 as->a_lastgaphl = lseg;
1944 1944 AS_LOCK_EXIT(as, &as->a_lock);
1945 1945 return (0);
1946 1946 }
1947 1947 cont:
1948 1948 /*
1949 1949 * Move to the next hole.
1950 1950 */
1951 1951 if (forward) {
1952 1952 lseg = hseg;
1953 1953 if (lseg == NULL)
1954 1954 break;
1955 1955 hseg = AS_SEGNEXT(as, hseg);
1956 1956 } else {
1957 1957 hseg = lseg;
1958 1958 if (hseg == NULL)
1959 1959 break;
1960 1960 lseg = AS_SEGPREV(as, lseg);
1961 1961 }
1962 1962 }
1963 1963 if (fast_path && (align != 0 || save_redzone != 0)) {
1964 1964 fast_path = 0;
1965 1965 minlen = save_minlen;
1966 1966 redzone = save_redzone;
1967 1967 goto retry;
1968 1968 }
1969 1969 *basep = save_base;
1970 1970 *lenp = save_len;
1971 1971 AS_LOCK_EXIT(as, &as->a_lock);
1972 1972 return (-1);
1973 1973 }
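The fast path in as_gap_aligned() relies on a simple geometric fact: any hole of at least minlen + align + 2 * redzone bytes can host a minlen-byte allocation at the requested phase with both redzones intact, which is why the first pass just inflates minlen and zeroes redzone. A minimal sketch of the underlying check, assuming align is a nonzero power of two (as valid_va_range_aligned() requires) and off <= lo + redzone:

    #include <stdint.h>
    #include <stddef.h>

    #define	P2ROUNDUP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

    /* Can [lo, hi) hold minlen bytes at phase off from align, redzoned? */
    static int
    hole_fits(uintptr_t lo, uintptr_t hi, size_t minlen, size_t align,
        size_t redzone, size_t off)
    {
    	/* First properly-phased address past the low redzone. */
    	uintptr_t base = P2ROUNDUP(lo + redzone - off, align) + off;

    	return (base >= lo + redzone && base + minlen + redzone <= hi);
    }

Since P2ROUNDUP adds less than align and each redzone adds at most redzone, base + minlen + redzone never exceeds lo + minlen + align + 2 * redzone, so a hole that long always passes.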
1974 1974
1975 1975 /*
1976 1976 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1977 1977 *
1978 1978 * If flags specifies AH_HI, the hole will have the highest possible address
1979 1979 * in the range. We use the as->a_lastgap field to figure out where to
1980 1980 * start looking for a gap.
1981 1981 *
1982 1982 * Otherwise, the gap will have the lowest possible address.
1983 1983 *
1984 1984 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1985 1985 *
1986 1986 * If an adequate hole is found, base and len are set to reflect the part of
1987 1987 * the hole that is within range, and 0 is returned, otherwise,
1988 1988 * -1 is returned.
1989 1989 *
1990 1990 * NOTE: This routine is not correct when base+len overflows caddr_t.
1991 1991 */
1992 1992 int
1993 1993 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1994 1994 caddr_t addr)
1995 1995 {
1996 1996
1997 1997 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1998 1998 }
1999 1999
2000 2000 /*
2001 2001 * Return the next range within [base, base + len) that is backed
2002 2002 * with "real memory". Skip holes and non-seg_vn segments.
2003 2003 * We're lazy and only return one segment at a time.
2004 2004 */
2005 2005 int
2006 2006 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2007 2007 {
2008 2008 extern struct seg_ops segspt_shmops; /* needs a header file */
2009 2009 struct seg *seg;
2010 2010 caddr_t addr, eaddr;
2011 2011 caddr_t segend;
2012 2012
2013 2013 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2014 2014
2015 2015 addr = *basep;
2016 2016 eaddr = addr + *lenp;
2017 2017
2018 2018 seg = as_findseg(as, addr, 0);
2019 2019 if (seg != NULL)
2020 2020 addr = MAX(seg->s_base, addr);
2021 2021
2022 2022 for (;;) {
2023 2023 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2024 2024 AS_LOCK_EXIT(as, &as->a_lock);
2025 2025 return (EINVAL);
2026 2026 }
2027 2027
2028 2028 if (seg->s_ops == &segvn_ops) {
2029 2029 segend = seg->s_base + seg->s_size;
2030 2030 break;
2031 2031 }
2032 2032
2033 2033 /*
2034 2034 * We do ISM by looking into the private data
2035 2035 * to determine the real size of the segment.
2036 2036 */
2037 2037 if (seg->s_ops == &segspt_shmops) {
2038 2038 segend = seg->s_base + spt_realsize(seg);
2039 2039 if (addr < segend)
2040 2040 break;
2041 2041 }
2042 2042
2043 2043 seg = AS_SEGNEXT(as, seg);
2044 2044
2045 2045 if (seg != NULL)
2046 2046 addr = seg->s_base;
2047 2047 }
2048 2048
2049 2049 *basep = addr;
2050 2050
2051 2051 if (segend > eaddr)
2052 2052 *lenp = eaddr - addr;
2053 2053 else
2054 2054 *lenp = segend - addr;
2055 2055
2056 2056 AS_LOCK_EXIT(as, &as->a_lock);
2057 2057 return (0);
2058 2058 }
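Because as_memory() returns only one segment per call, callers loop, shrinking the window past each returned piece until the routine reports EINVAL. A minimal sketch of that pattern, kernel context assumed:

    /* Walk [base, base + len) one real-memory piece at a time. */
    static void
    walk_real_memory(struct as *as, caddr_t base, size_t len)
    {
    	caddr_t a = base;
    	size_t l;

    	for (;;) {
    		l = (size_t)((base + len) - a);	/* remaining window */
    		if (l == 0 || as_memory(as, &a, &l) != 0)
    			break;
    		/* [a, a + l) is backed by real memory; process it here. */
    		a += l;
    	}
    }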
2059 2059
2060 2060 /*
2061 - * Swap the pages associated with the address space as out to
2062 - * secondary storage, returning the number of bytes actually
2063 - * swapped.
2064 - *
2065 - * The value returned is intended to correlate well with the process's
2066 - * memory requirements. Its usefulness for this purpose depends on
2067 - * how well the segment-level routines do at returning accurate
2068 - * information.
2069 - */
2070 -size_t
2071 -as_swapout(struct as *as)
2072 -{
2073 - struct seg *seg;
2074 - size_t swpcnt = 0;
2075 -
2076 - /*
2077 - * Kernel-only processes have given up their address
2078 - * spaces. Of course, we shouldn't be attempting to
2079 - * swap out such processes in the first place...
2080 - */
2081 - if (as == NULL)
2082 - return (0);
2083 -
2084 - AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2085 -
2086 - /*
2087 - * Free all mapping resources associated with the address
2088 - * space. The segment-level swapout routines capitalize
2089 - 	 * on this unmapping by scavenging pages that have become
2090 - * unmapped here.
2091 - */
2092 - hat_swapout(as->a_hat);
2093 -
2094 - /*
2095 - * Call the swapout routines of all segments in the address
2096 - * space to do the actual work, accumulating the amount of
2097 - * space reclaimed.
2098 - */
2099 - for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2100 - struct seg_ops *ov = seg->s_ops;
2101 -
2102 - /*
2103 - * We have to check to see if the seg has
2104 - * an ops vector because the seg may have
2105 - * been in the middle of being set up when
2106 - * the process was picked for swapout.
2107 - */
2108 - if ((ov != NULL) && (ov->swapout != NULL))
2109 - swpcnt += SEGOP_SWAPOUT(seg);
2110 - }
2111 - AS_LOCK_EXIT(as, &as->a_lock);
2112 - return (swpcnt);
2113 -}
2114 -
2115 -/*
2116 2061 * Determine whether data from the mappings in interval [addr, addr + size)
2117 2062 * are in the primary memory (core) cache.
2118 2063 */
2119 2064 int
2120 2065 as_incore(struct as *as, caddr_t addr,
2121 2066 size_t size, char *vec, size_t *sizep)
2122 2067 {
2123 2068 struct seg *seg;
2124 2069 size_t ssize;
2125 2070 caddr_t raddr; /* rounded down addr */
2126 2071 size_t rsize; /* rounded up size */
2127 2072 size_t isize; /* iteration size */
2128 2073 int error = 0; /* result, assume success */
2129 2074
2130 2075 *sizep = 0;
2131 2076 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2132 2077 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2133 2078 (size_t)raddr;
2134 2079
2135 2080 if (raddr + rsize < raddr) /* check for wraparound */
2136 2081 return (ENOMEM);
2137 2082
2138 2083 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2139 2084 seg = as_segat(as, raddr);
2140 2085 if (seg == NULL) {
2141 2086 AS_LOCK_EXIT(as, &as->a_lock);
2142 2087 return (-1);
2143 2088 }
2144 2089
2145 2090 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2146 2091 if (raddr >= seg->s_base + seg->s_size) {
2147 2092 seg = AS_SEGNEXT(as, seg);
2148 2093 if (seg == NULL || raddr != seg->s_base) {
2149 2094 error = -1;
2150 2095 break;
2151 2096 }
2152 2097 }
2153 2098 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2154 2099 ssize = seg->s_base + seg->s_size - raddr;
2155 2100 else
2156 2101 ssize = rsize;
2157 2102 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2158 2103 if (isize != ssize) {
2159 2104 error = -1;
2160 2105 break;
2161 2106 }
2162 2107 vec += btopr(ssize);
2163 2108 }
2164 2109 AS_LOCK_EXIT(as, &as->a_lock);
2165 2110 return (error);
2166 2111 }
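as_incore() backs the mincore(2) system call, which reports one status char per page with the low bit set for resident pages. A minimal userland sketch:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Count resident pages in [addr, addr + len); -1 on error. */
    static int
    resident_pages(void *addr, size_t len)
    {
    	long pgsz = sysconf(_SC_PAGESIZE);
    	size_t npages = (len + pgsz - 1) / pgsz;
    	char *vec = malloc(npages);	/* one status char per page */
    	int n = 0;

    	if (vec == NULL || mincore((caddr_t)addr, len, vec) != 0) {
    		free(vec);
    		return (-1);
    	}
    	for (size_t i = 0; i < npages; i++)
    		if (vec[i] & 1)		/* low bit == resident */
    			n++;
    	free(vec);
    	return (n);
    }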
2167 2112
2168 2113 static void
2169 2114 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2170 2115 ulong_t *bitmap, size_t position, size_t npages)
2171 2116 {
2172 2117 caddr_t range_start;
2173 2118 size_t pos1 = position;
2174 2119 size_t pos2;
2175 2120 size_t size;
2176 2121 size_t end_pos = npages + position;
2177 2122
2178 2123 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2179 2124 size = ptob((pos2 - pos1));
2180 2125 range_start = (caddr_t)((uintptr_t)addr +
2181 2126 ptob(pos1 - position));
2182 2127
2183 2128 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2184 2129 (ulong_t *)NULL, (size_t)NULL);
2185 2130 pos1 = pos2;
2186 2131 }
2187 2132 }
2188 2133
2189 2134 static void
2190 2135 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2191 2136 caddr_t raddr, size_t rsize)
2192 2137 {
2193 2138 struct seg *seg = as_segat(as, raddr);
2194 2139 size_t ssize;
2195 2140
2196 2141 while (rsize != 0) {
2197 2142 if (raddr >= seg->s_base + seg->s_size)
2198 2143 seg = AS_SEGNEXT(as, seg);
2199 2144
2200 2145 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2201 2146 ssize = seg->s_base + seg->s_size - raddr;
2202 2147 else
2203 2148 ssize = rsize;
2204 2149
2205 2150 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2206 2151
2207 2152 rsize -= ssize;
2208 2153 raddr += ssize;
2209 2154 }
2210 2155 }
2211 2156
2212 2157 /*
2213 2158 * Cache control operations over the interval [addr, addr + size) in
2214 2159 * address space "as".
2215 2160 */
2216 2161 /*ARGSUSED*/
2217 2162 int
2218 2163 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2219 2164 uintptr_t arg, ulong_t *lock_map, size_t pos)
2220 2165 {
2221 2166 struct seg *seg; /* working segment */
2222 2167 caddr_t raddr; /* rounded down addr */
2223 2168 caddr_t initraddr; /* saved initial rounded down addr */
2224 2169 size_t rsize; /* rounded up size */
2225 2170 size_t initrsize; /* saved initial rounded up size */
2226 2171 size_t ssize; /* size of seg */
2227 2172 int error = 0; /* result */
2228 2173 size_t mlock_size; /* size of bitmap */
2229 2174 ulong_t *mlock_map; /* pointer to bitmap used */
2230 2175 /* to represent the locked */
2231 2176 /* pages. */
2232 2177 retry:
2233 2178 if (error == IE_RETRY)
2234 2179 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2235 2180 else
2236 2181 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2237 2182
2238 2183 /*
2239 2184 * If these are address space lock/unlock operations, loop over
2240 2185 * all segments in the address space, as appropriate.
2241 2186 */
2242 2187 if (func == MC_LOCKAS) {
2243 2188 size_t npages, idx;
2244 2189 size_t rlen = 0; /* rounded as length */
2245 2190
2246 2191 idx = pos;
2247 2192
2248 2193 if (arg & MCL_FUTURE) {
2249 2194 mutex_enter(&as->a_contents);
2250 2195 AS_SETPGLCK(as);
2251 2196 mutex_exit(&as->a_contents);
2252 2197 }
2253 2198 if ((arg & MCL_CURRENT) == 0) {
2254 2199 AS_LOCK_EXIT(as, &as->a_lock);
2255 2200 return (0);
2256 2201 }
2257 2202
2258 2203 seg = AS_SEGFIRST(as);
2259 2204 if (seg == NULL) {
2260 2205 AS_LOCK_EXIT(as, &as->a_lock);
2261 2206 return (0);
2262 2207 }
2263 2208
2264 2209 do {
2265 2210 raddr = (caddr_t)((uintptr_t)seg->s_base &
2266 2211 (uintptr_t)PAGEMASK);
2267 2212 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2268 2213 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2269 2214 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2270 2215
2271 2216 mlock_size = BT_BITOUL(btopr(rlen));
2272 2217 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2273 2218 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2274 2219 AS_LOCK_EXIT(as, &as->a_lock);
2275 2220 return (EAGAIN);
2276 2221 }
2277 2222
2278 2223 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2279 2224 error = SEGOP_LOCKOP(seg, seg->s_base,
2280 2225 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2281 2226 if (error != 0)
2282 2227 break;
2283 2228 pos += seg_pages(seg);
2284 2229 }
2285 2230
2286 2231 if (error) {
2287 2232 for (seg = AS_SEGFIRST(as); seg != NULL;
2288 2233 seg = AS_SEGNEXT(as, seg)) {
2289 2234
2290 2235 raddr = (caddr_t)((uintptr_t)seg->s_base &
2291 2236 (uintptr_t)PAGEMASK);
2292 2237 npages = seg_pages(seg);
2293 2238 as_segunlock(seg, raddr, attr, mlock_map,
2294 2239 idx, npages);
2295 2240 idx += npages;
2296 2241 }
2297 2242 }
2298 2243
2299 2244 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2300 2245 AS_LOCK_EXIT(as, &as->a_lock);
2301 2246 goto lockerr;
2302 2247 } else if (func == MC_UNLOCKAS) {
2303 2248 mutex_enter(&as->a_contents);
2304 2249 AS_CLRPGLCK(as);
2305 2250 mutex_exit(&as->a_contents);
2306 2251
2307 2252 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2308 2253 error = SEGOP_LOCKOP(seg, seg->s_base,
2309 2254 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2310 2255 if (error != 0)
2311 2256 break;
2312 2257 }
2313 2258
2314 2259 AS_LOCK_EXIT(as, &as->a_lock);
2315 2260 goto lockerr;
2316 2261 }
2317 2262
2318 2263 /*
2319 2264 * Normalize addresses and sizes.
2320 2265 */
2321 2266 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2322 2267 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2323 2268 (size_t)raddr;
2324 2269
2325 2270 if (raddr + rsize < raddr) { /* check for wraparound */
2326 2271 AS_LOCK_EXIT(as, &as->a_lock);
2327 2272 return (ENOMEM);
2328 2273 }
2329 2274
2330 2275 /*
2331 2276 * Get initial segment.
2332 2277 */
2333 2278 if ((seg = as_segat(as, raddr)) == NULL) {
2334 2279 AS_LOCK_EXIT(as, &as->a_lock);
2335 2280 return (ENOMEM);
2336 2281 }
2337 2282
2338 2283 if (func == MC_LOCK) {
2339 2284 mlock_size = BT_BITOUL(btopr(rsize));
2340 2285 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2341 2286 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2342 2287 AS_LOCK_EXIT(as, &as->a_lock);
2343 2288 return (EAGAIN);
2344 2289 }
2345 2290 }
2346 2291
2347 2292 /*
2348 2293 * Loop over all segments. If a hole in the address range is
2349 2294 * discovered, then fail. For each segment, perform the appropriate
2350 2295 * control operation.
2351 2296 */
2352 2297 while (rsize != 0) {
2353 2298
2354 2299 /*
2355 2300 * Make sure there's no hole, calculate the portion
2356 2301 * of the next segment to be operated over.
2357 2302 */
2358 2303 if (raddr >= seg->s_base + seg->s_size) {
2359 2304 seg = AS_SEGNEXT(as, seg);
2360 2305 if (seg == NULL || raddr != seg->s_base) {
2361 2306 if (func == MC_LOCK) {
2362 2307 as_unlockerr(as, attr, mlock_map,
2363 2308 initraddr, initrsize - rsize);
2364 2309 kmem_free(mlock_map,
2365 2310 mlock_size * sizeof (ulong_t));
2366 2311 }
2367 2312 AS_LOCK_EXIT(as, &as->a_lock);
2368 2313 return (ENOMEM);
2369 2314 }
2370 2315 }
2371 2316 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2372 2317 ssize = seg->s_base + seg->s_size - raddr;
2373 2318 else
2374 2319 ssize = rsize;
2375 2320
2376 2321 /*
2377 2322 * Dispatch on specific function.
2378 2323 */
2379 2324 switch (func) {
2380 2325
2381 2326 /*
2382 2327 * Synchronize cached data from mappings with backing
2383 2328 * objects.
2384 2329 */
2385 2330 case MC_SYNC:
2386 2331 if (error = SEGOP_SYNC(seg, raddr, ssize,
2387 2332 attr, (uint_t)arg)) {
2388 2333 AS_LOCK_EXIT(as, &as->a_lock);
2389 2334 return (error);
2390 2335 }
2391 2336 break;
2392 2337
2393 2338 /*
2394 2339 * Lock pages in memory.
2395 2340 */
2396 2341 case MC_LOCK:
2397 2342 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2398 2343 attr, func, mlock_map, pos)) {
2399 2344 as_unlockerr(as, attr, mlock_map, initraddr,
2400 2345 initrsize - rsize + ssize);
2401 2346 kmem_free(mlock_map, mlock_size *
2402 2347 sizeof (ulong_t));
2403 2348 AS_LOCK_EXIT(as, &as->a_lock);
2404 2349 goto lockerr;
2405 2350 }
2406 2351 break;
2407 2352
2408 2353 /*
2409 2354 * Unlock mapped pages.
2410 2355 */
2411 2356 case MC_UNLOCK:
2412 2357 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2413 2358 (ulong_t *)NULL, (size_t)NULL);
2414 2359 break;
2415 2360
2416 2361 /*
2417 2362 * Store VM advise for mapped pages in segment layer.
2418 2363 */
2419 2364 case MC_ADVISE:
2420 2365 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2421 2366
2422 2367 /*
2423 2368 * Check for regular errors and special retry error
2424 2369 */
2425 2370 if (error) {
2426 2371 if (error == IE_RETRY) {
2427 2372 /*
2428 2373 * Need to acquire writers lock, so
2429 2374 * have to drop readers lock and start
2430 2375 * all over again
2431 2376 */
2432 2377 AS_LOCK_EXIT(as, &as->a_lock);
2433 2378 goto retry;
2434 2379 } else if (error == IE_REATTACH) {
2435 2380 /*
2436 2381 * Find segment for current address
2437 2382 * because current segment just got
2438 2383 * split or concatenated
2439 2384 */
2440 2385 seg = as_segat(as, raddr);
2441 2386 if (seg == NULL) {
2442 2387 AS_LOCK_EXIT(as, &as->a_lock);
2443 2388 return (ENOMEM);
2444 2389 }
2445 2390 } else {
2446 2391 /*
2447 2392 * Regular error
2448 2393 */
2449 2394 AS_LOCK_EXIT(as, &as->a_lock);
2450 2395 return (error);
2451 2396 }
2452 2397 }
2453 2398 break;
2454 2399
2455 2400 case MC_INHERIT_ZERO:
2456 2401 if (seg->s_ops->inherit == NULL) {
2457 2402 error = ENOTSUP;
2458 2403 } else {
2459 2404 error = SEGOP_INHERIT(seg, raddr, ssize,
2460 2405 SEGP_INH_ZERO);
2461 2406 }
2462 2407 if (error != 0) {
2463 2408 AS_LOCK_EXIT(as, &as->a_lock);
2464 2409 return (error);
2465 2410 }
2466 2411 break;
2467 2412
2468 2413 /*
2469 2414 * Can't happen.
2470 2415 */
2471 2416 default:
2472 2417 panic("as_ctl: bad operation %d", func);
2473 2418 /*NOTREACHED*/
2474 2419 }
2475 2420
2476 2421 rsize -= ssize;
2477 2422 raddr += ssize;
2478 2423 }
2479 2424
2480 2425 if (func == MC_LOCK)
2481 2426 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2482 2427 AS_LOCK_EXIT(as, &as->a_lock);
2483 2428 return (0);
2484 2429 lockerr:
2485 2430
2486 2431 /*
2487 2432 * If the lower levels returned EDEADLK for a segment lockop,
2488 2433 * it means that we should retry the operation. Let's wait
2489 2434 * a bit also to let the deadlock causing condition clear.
2490 2435 * This is part of a gross hack to work around a design flaw
2491 2436 * in the ufs/sds logging code and should go away when the
2492 2437 * logging code is re-designed to fix the problem. See bug
2493 2438 * 4125102 for details of the problem.
2494 2439 */
2495 2440 if (error == EDEADLK) {
2496 2441 delay(deadlk_wait);
2497 2442 error = 0;
2498 2443 goto retry;
2499 2444 }
2500 2445 return (error);
2501 2446 }
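The MC_LOCK/MC_UNLOCK and MC_LOCKAS/MC_UNLOCKAS cases above are what mlock(3C), munlock(3C), and mlockall(3C) reach through memcntl(). A minimal userland sketch of the per-range path:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <stdio.h>

    /* Pin a range of pages, use it, then release the lock. */
    static int
    with_locked_range(void *addr, size_t len)
    {
    	/* Reaches as_ctl(..., MC_LOCK, ...) for this range. */
    	if (mlock((caddr_t)addr, len) != 0) {
    		perror("mlock");
    		return (-1);
    	}
    	/* ... touch the memory; it stays resident while locked ... */

    	/* Reaches as_ctl(..., MC_UNLOCK, ...). */
    	(void) munlock((caddr_t)addr, len);
    	return (0);
    }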
2502 2447
2503 2448 int
2504 2449 fc_decode(faultcode_t fault_err)
2505 2450 {
2506 2451 int error = 0;
2507 2452
2508 2453 switch (FC_CODE(fault_err)) {
2509 2454 case FC_OBJERR:
2510 2455 error = FC_ERRNO(fault_err);
2511 2456 break;
2512 2457 case FC_PROT:
2513 2458 error = EACCES;
2514 2459 break;
2515 2460 default:
2516 2461 error = EFAULT;
2517 2462 break;
2518 2463 }
2519 2464 return (error);
2520 2465 }
2521 2466
2522 2467 /*
2523 2468 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2524 2469 * lists from each segment and copy them to one contiguous shadow list (plist)
2525 2470 * as expected by the caller. Save pointers to per segment shadow lists at
2526 2471 * the tail of plist so that they can be used during as_pageunlock().
2527 2472 */
2528 2473 static int
2529 2474 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2530 2475 caddr_t addr, size_t size, enum seg_rw rw)
2531 2476 {
2532 2477 caddr_t sv_addr = addr;
2533 2478 size_t sv_size = size;
2534 2479 struct seg *sv_seg = seg;
2535 2480 ulong_t segcnt = 1;
2536 2481 ulong_t cnt;
2537 2482 size_t ssize;
2538 2483 pgcnt_t npages = btop(size);
2539 2484 page_t **plist;
2540 2485 page_t **pl;
2541 2486 int error;
2542 2487 caddr_t eaddr;
2543 2488 faultcode_t fault_err = 0;
2544 2489 pgcnt_t pl_off;
2545 2490 extern struct seg_ops segspt_shmops;
2546 2491
2547 2492 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2548 2493 ASSERT(seg != NULL);
2549 2494 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2550 2495 ASSERT(addr + size > seg->s_base + seg->s_size);
2551 2496 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2552 2497 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2553 2498
2554 2499 /*
2555 2500 * Count the number of segments covered by the range we are about to
2556 2501 * lock. The segment count is used to size the shadow list we return
2557 2502 * back to the caller.
2558 2503 */
2559 2504 for (; size != 0; size -= ssize, addr += ssize) {
2560 2505 if (addr >= seg->s_base + seg->s_size) {
2561 2506
2562 2507 seg = AS_SEGNEXT(as, seg);
2563 2508 if (seg == NULL || addr != seg->s_base) {
2564 2509 AS_LOCK_EXIT(as, &as->a_lock);
2565 2510 return (EFAULT);
2566 2511 }
2567 2512 /*
2568 2513 * Do a quick check if subsequent segments
2569 2514 * will most likely support pagelock.
2570 2515 */
2571 2516 if (seg->s_ops == &segvn_ops) {
2572 2517 vnode_t *vp;
2573 2518
2574 2519 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2575 2520 vp != NULL) {
2576 2521 AS_LOCK_EXIT(as, &as->a_lock);
2577 2522 goto slow;
2578 2523 }
2579 2524 } else if (seg->s_ops != &segspt_shmops) {
2580 2525 AS_LOCK_EXIT(as, &as->a_lock);
2581 2526 goto slow;
2582 2527 }
2583 2528 segcnt++;
2584 2529 }
2585 2530 if (addr + size > seg->s_base + seg->s_size) {
2586 2531 ssize = seg->s_base + seg->s_size - addr;
2587 2532 } else {
2588 2533 ssize = size;
2589 2534 }
2590 2535 }
2591 2536 ASSERT(segcnt > 1);
2592 2537
2593 2538 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2594 2539
2595 2540 addr = sv_addr;
2596 2541 size = sv_size;
2597 2542 seg = sv_seg;
2598 2543
2599 2544 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2600 2545 if (addr >= seg->s_base + seg->s_size) {
2601 2546 seg = AS_SEGNEXT(as, seg);
2602 2547 ASSERT(seg != NULL && addr == seg->s_base);
2603 2548 cnt++;
2604 2549 ASSERT(cnt < segcnt);
2605 2550 }
2606 2551 if (addr + size > seg->s_base + seg->s_size) {
2607 2552 ssize = seg->s_base + seg->s_size - addr;
2608 2553 } else {
2609 2554 ssize = size;
2610 2555 }
2611 2556 pl = &plist[npages + cnt];
2612 2557 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2613 2558 L_PAGELOCK, rw);
2614 2559 if (error) {
2615 2560 break;
2616 2561 }
2617 2562 ASSERT(plist[npages + cnt] != NULL);
2618 2563 ASSERT(pl_off + btop(ssize) <= npages);
2619 2564 bcopy(plist[npages + cnt], &plist[pl_off],
2620 2565 btop(ssize) * sizeof (page_t *));
2621 2566 pl_off += btop(ssize);
2622 2567 }
2623 2568
2624 2569 if (size == 0) {
2625 2570 AS_LOCK_EXIT(as, &as->a_lock);
2626 2571 ASSERT(cnt == segcnt - 1);
2627 2572 *ppp = plist;
2628 2573 return (0);
2629 2574 }
2630 2575
2631 2576 /*
2632 2577 	 * One of the pagelock calls failed.  The error type is in the error variable.
2633 2578 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2634 2579 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2635 2580 * back to the caller.
2636 2581 */
2637 2582
2638 2583 eaddr = addr;
2639 2584 seg = sv_seg;
2640 2585
2641 2586 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2642 2587 if (addr >= seg->s_base + seg->s_size) {
2643 2588 seg = AS_SEGNEXT(as, seg);
2644 2589 ASSERT(seg != NULL && addr == seg->s_base);
2645 2590 cnt++;
2646 2591 ASSERT(cnt < segcnt);
2647 2592 }
2648 2593 if (eaddr > seg->s_base + seg->s_size) {
2649 2594 ssize = seg->s_base + seg->s_size - addr;
2650 2595 } else {
2651 2596 ssize = eaddr - addr;
2652 2597 }
2653 2598 pl = &plist[npages + cnt];
2654 2599 ASSERT(*pl != NULL);
2655 2600 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2656 2601 L_PAGEUNLOCK, rw);
2657 2602 }
2658 2603
2659 2604 AS_LOCK_EXIT(as, &as->a_lock);
2660 2605
2661 2606 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2662 2607
2663 2608 if (error != ENOTSUP && error != EFAULT) {
2664 2609 return (error);
2665 2610 }
2666 2611
2667 2612 slow:
2668 2613 /*
2669 2614 	 * If we are here because pagelock failed due to the need to cow-fault
2670 2615 	 * in the pages we want to lock, F_SOFTLOCK will do that job, and the
2671 2616 	 * next as_pagelock() call for this address range will hopefully
2672 2617 	 * succeed.
2673 2618 */
2674 2619 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2675 2620 if (fault_err != 0) {
2676 2621 return (fc_decode(fault_err));
2677 2622 }
2678 2623 *ppp = NULL;
2679 2624
2680 2625 return (0);
2681 2626 }
2682 2627
2683 2628 /*
2684 2629 * lock pages in a given address space. Return shadow list. If
2685 2630 * the list is NULL, the MMU mapping is also locked.
2686 2631 */
2687 2632 int
2688 2633 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2689 2634 size_t size, enum seg_rw rw)
2690 2635 {
2691 2636 size_t rsize;
2692 2637 caddr_t raddr;
2693 2638 faultcode_t fault_err;
2694 2639 struct seg *seg;
2695 2640 int err;
2696 2641
2697 2642 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2698 2643 "as_pagelock_start: addr %p size %ld", addr, size);
2699 2644
2700 2645 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2701 2646 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2702 2647 (size_t)raddr;
2703 2648
2704 2649 /*
2705 2650 * if the request crosses two segments let
2706 2651 * as_fault handle it.
2707 2652 */
2708 2653 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2709 2654
2710 2655 seg = as_segat(as, raddr);
2711 2656 if (seg == NULL) {
2712 2657 AS_LOCK_EXIT(as, &as->a_lock);
2713 2658 return (EFAULT);
2714 2659 }
2715 2660 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2716 2661 if (raddr + rsize > seg->s_base + seg->s_size) {
2717 2662 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2718 2663 }
2719 2664 if (raddr + rsize <= raddr) {
2720 2665 AS_LOCK_EXIT(as, &as->a_lock);
2721 2666 return (EFAULT);
2722 2667 }
2723 2668
2724 2669 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2725 2670 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2726 2671
2727 2672 /*
2728 2673 * try to lock pages and pass back shadow list
2729 2674 */
2730 2675 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2731 2676
2732 2677 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2733 2678
2734 2679 AS_LOCK_EXIT(as, &as->a_lock);
2735 2680
2736 2681 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2737 2682 return (err);
2738 2683 }
2739 2684
2740 2685 /*
2741 2686 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2742 2687 	 * to no pagelock support for this segment or because pages need to be
2743 2688 	 * cow-faulted in.  If a fault is needed, F_SOFTLOCK will do this job
2744 2689 	 * for this as_pagelock() call, and in the next as_pagelock() call for
2745 2690 	 * the same address range the pagelock call will hopefully succeed.
2746 2691 */
2747 2692 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2748 2693 if (fault_err != 0) {
2749 2694 return (fc_decode(fault_err));
2750 2695 }
2751 2696 *ppp = NULL;
2752 2697
2753 2698 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2754 2699 return (0);
2755 2700 }
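A caller must hand the shadow list back to as_pageunlock() unchanged; a NULL list is legal and means the pages were soft-locked via as_fault() instead, which as_pageunlock() undoes with F_SOFTUNLOCK. A minimal sketch of the contract, kernel context assumed:

    static int
    with_locked_pages(struct as *as, caddr_t addr, size_t len)
    {
    	struct page **pplist;
    	int err;

    	err = as_pagelock(as, &pplist, addr, len, S_WRITE);
    	if (err != 0)
    		return (err);
    	/*
    	 * [addr, addr + len) is now held.  pplist may be NULL (the
    	 * F_SOFTLOCK fallback); either way the same pointer must be
    	 * passed back below.
    	 */
    	as_pageunlock(as, pplist, addr, len, S_WRITE);
    	return (0);
    }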
2756 2701
2757 2702 /*
2758 2703 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2759 2704 * lists from the end of plist and call pageunlock interface for each segment.
2760 2705 * Drop as lock and free plist.
2761 2706 */
2762 2707 static void
2763 2708 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2764 2709 struct page **plist, enum seg_rw rw)
2765 2710 {
2766 2711 ulong_t cnt;
2767 2712 caddr_t eaddr = addr + size;
2768 2713 pgcnt_t npages = btop(size);
2769 2714 size_t ssize;
2770 2715 page_t **pl;
2771 2716
2772 2717 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2773 2718 ASSERT(seg != NULL);
2774 2719 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2775 2720 ASSERT(addr + size > seg->s_base + seg->s_size);
2776 2721 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2777 2722 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2778 2723 ASSERT(plist != NULL);
2779 2724
2780 2725 for (cnt = 0; addr < eaddr; addr += ssize) {
2781 2726 if (addr >= seg->s_base + seg->s_size) {
2782 2727 seg = AS_SEGNEXT(as, seg);
2783 2728 ASSERT(seg != NULL && addr == seg->s_base);
2784 2729 cnt++;
2785 2730 }
2786 2731 if (eaddr > seg->s_base + seg->s_size) {
2787 2732 ssize = seg->s_base + seg->s_size - addr;
2788 2733 } else {
2789 2734 ssize = eaddr - addr;
2790 2735 }
2791 2736 pl = &plist[npages + cnt];
2792 2737 ASSERT(*pl != NULL);
2793 2738 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2794 2739 L_PAGEUNLOCK, rw);
2795 2740 }
2796 2741 ASSERT(cnt > 0);
2797 2742 AS_LOCK_EXIT(as, &as->a_lock);
2798 2743
2799 2744 cnt++;
2800 2745 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2801 2746 }
2802 2747
2803 2748 /*
2804 2749 * unlock pages in a given address range
2805 2750 */
2806 2751 void
2807 2752 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2808 2753 enum seg_rw rw)
2809 2754 {
2810 2755 struct seg *seg;
2811 2756 size_t rsize;
2812 2757 caddr_t raddr;
2813 2758
2814 2759 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2815 2760 "as_pageunlock_start: addr %p size %ld", addr, size);
2816 2761
2817 2762 /*
2818 2763 	 * if the shadow list is NULL, as_pagelock fell
2819 2764 	 * back to as_fault
2820 2765 */
2821 2766 if (pp == NULL) {
2822 2767 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2823 2768 return;
2824 2769 }
2825 2770
2826 2771 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2827 2772 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2828 2773 (size_t)raddr;
2829 2774
2830 2775 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2831 2776 seg = as_segat(as, raddr);
2832 2777 ASSERT(seg != NULL);
2833 2778
2834 2779 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2835 2780 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2836 2781
2837 2782 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2838 2783 if (raddr + rsize <= seg->s_base + seg->s_size) {
2839 2784 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2840 2785 } else {
2841 2786 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2842 2787 return;
2843 2788 }
2844 2789 AS_LOCK_EXIT(as, &as->a_lock);
2845 2790 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2846 2791 }
2847 2792
2848 2793 int
2849 2794 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2850 2795 boolean_t wait)
2851 2796 {
2852 2797 struct seg *seg;
2853 2798 size_t ssize;
2854 2799 caddr_t raddr; /* rounded down addr */
2855 2800 size_t rsize; /* rounded up size */
2856 2801 int error = 0;
2857 2802 size_t pgsz = page_get_pagesize(szc);
2858 2803
2859 2804 setpgsz_top:
2860 2805 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2861 2806 return (EINVAL);
2862 2807 }
2863 2808
2864 2809 raddr = addr;
2865 2810 rsize = size;
2866 2811
2867 2812 if (raddr + rsize < raddr) /* check for wraparound */
2868 2813 return (ENOMEM);
2869 2814
2870 2815 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2871 2816 as_clearwatchprot(as, raddr, rsize);
2872 2817 seg = as_segat(as, raddr);
2873 2818 if (seg == NULL) {
2874 2819 as_setwatch(as);
2875 2820 AS_LOCK_EXIT(as, &as->a_lock);
2876 2821 return (ENOMEM);
2877 2822 }
2878 2823
2879 2824 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2880 2825 if (raddr >= seg->s_base + seg->s_size) {
2881 2826 seg = AS_SEGNEXT(as, seg);
2882 2827 if (seg == NULL || raddr != seg->s_base) {
2883 2828 error = ENOMEM;
2884 2829 break;
2885 2830 }
2886 2831 }
2887 2832 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2888 2833 ssize = seg->s_base + seg->s_size - raddr;
2889 2834 } else {
2890 2835 ssize = rsize;
2891 2836 }
2892 2837
2893 2838 retry:
2894 2839 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2895 2840
2896 2841 if (error == IE_NOMEM) {
2897 2842 error = EAGAIN;
2898 2843 break;
2899 2844 }
2900 2845
2901 2846 if (error == IE_RETRY) {
2902 2847 AS_LOCK_EXIT(as, &as->a_lock);
2903 2848 goto setpgsz_top;
2904 2849 }
2905 2850
2906 2851 if (error == ENOTSUP) {
2907 2852 error = EINVAL;
2908 2853 break;
2909 2854 }
2910 2855
2911 2856 if (wait && (error == EAGAIN)) {
2912 2857 /*
2913 2858 * Memory is currently locked. It must be unlocked
2914 2859 * before this operation can succeed through a retry.
2915 2860 * The possible reasons for locked memory and
2916 2861 * corresponding strategies for unlocking are:
2917 2862 * (1) Normal I/O
2918 2863 * wait for a signal that the I/O operation
2919 2864 * has completed and the memory is unlocked.
2920 2865 * (2) Asynchronous I/O
2921 2866 * The aio subsystem does not unlock pages when
2922 2867 * the I/O is completed. Those pages are unlocked
2923 2868 * when the application calls aiowait/aioerror.
2924 2869 * So, to prevent blocking forever, cv_broadcast()
2925 2870 * is done to wake up aio_cleanup_thread.
2926 2871 * Subsequently, segvn_reclaim will be called, and
2927 2872 * that will do AS_CLRUNMAPWAIT() and wake us up.
2928 2873 * (3) Long term page locking:
2929 2874 * This is not relevant for as_setpagesize()
2930 2875 * because we cannot change the page size for
2931 2876 * driver memory. The attempt to do so will
2932 2877 * fail with a different error than EAGAIN so
2933 2878 * there's no need to trigger as callbacks like
2934 2879 * as_unmap, as_setprot or as_free would do.
2935 2880 */
2936 2881 mutex_enter(&as->a_contents);
2937 2882 if (!AS_ISNOUNMAPWAIT(as)) {
2938 2883 if (AS_ISUNMAPWAIT(as) == 0) {
2939 2884 cv_broadcast(&as->a_cv);
2940 2885 }
2941 2886 AS_SETUNMAPWAIT(as);
2942 2887 AS_LOCK_EXIT(as, &as->a_lock);
2943 2888 while (AS_ISUNMAPWAIT(as)) {
2944 2889 cv_wait(&as->a_cv, &as->a_contents);
2945 2890 }
2946 2891 } else {
2947 2892 /*
2948 2893 * We may have raced with
2949 2894 * segvn_reclaim()/segspt_reclaim(). In this
2950 2895 * case clean nounmapwait flag and retry since
2951 2896 * softlockcnt in this segment may be already
2952 2897 * 0. We don't drop as writer lock so our
2953 2898 * number of retries without sleeping should
2954 2899 * be very small. See segvn_reclaim() for
2955 2900 * more comments.
2956 2901 */
2957 2902 AS_CLRNOUNMAPWAIT(as);
2958 2903 mutex_exit(&as->a_contents);
2959 2904 goto retry;
2960 2905 }
2961 2906 mutex_exit(&as->a_contents);
2962 2907 goto setpgsz_top;
2963 2908 } else if (error != 0) {
2964 2909 break;
2965 2910 }
2966 2911 }
2967 2912 as_setwatch(as);
2968 2913 AS_LOCK_EXIT(as, &as->a_lock);
2969 2914 return (error);
2970 2915 }
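as_setpagesize() is reached from userland through memcntl(2) with MC_HAT_ADVISE; passing a page size of 0 instead lands in as_set_default_lpsize() below. A minimal sketch requesting 2M pages, assuming that size is supported on the platform:

    #include <sys/types.h>
    #include <sys/mman.h>

    /* Advise the kernel to back [addr, addr + len) with 2M pages. */
    static int
    request_large_pages(caddr_t addr, size_t len)
    {
    	struct memcntl_mha mha;

    	mha.mha_cmd = MHA_MAPSIZE_VA;
    	mha.mha_flags = 0;
    	mha.mha_pagesize = 2 * 1024 * 1024;	/* must be a supported size */

    	return (memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0));
    }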
2971 2916
2972 2917 /*
2973 2918 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2974 2919 * in its chunk where s_szc is less than the szc we want to set.
2975 2920 */
2976 2921 static int
2977 2922 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2978 2923 int *retry)
2979 2924 {
2980 2925 struct seg *seg;
2981 2926 size_t ssize;
2982 2927 int error;
2983 2928
2984 2929 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
2985 2930
2986 2931 seg = as_segat(as, raddr);
2987 2932 if (seg == NULL) {
2988 2933 panic("as_iset3_default_lpsize: no seg");
2989 2934 }
2990 2935
2991 2936 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2992 2937 if (raddr >= seg->s_base + seg->s_size) {
2993 2938 seg = AS_SEGNEXT(as, seg);
2994 2939 if (seg == NULL || raddr != seg->s_base) {
2995 2940 panic("as_iset3_default_lpsize: as changed");
2996 2941 }
2997 2942 }
2998 2943 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2999 2944 ssize = seg->s_base + seg->s_size - raddr;
3000 2945 } else {
3001 2946 ssize = rsize;
3002 2947 }
3003 2948
3004 2949 if (szc > seg->s_szc) {
3005 2950 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3006 2951 /* Only retry on EINVAL segments that have no vnode. */
3007 2952 if (error == EINVAL) {
3008 2953 vnode_t *vp = NULL;
3009 2954 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3010 2955 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3011 2956 vp == NULL)) {
3012 2957 *retry = 1;
3013 2958 } else {
3014 2959 *retry = 0;
3015 2960 }
3016 2961 }
3017 2962 if (error) {
3018 2963 return (error);
3019 2964 }
3020 2965 }
3021 2966 }
3022 2967 return (0);
3023 2968 }
3024 2969
3025 2970 /*
3026 2971 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3027 2972 * pagesize on each segment in its range, but if any fails with EINVAL,
3028 2973 * then it reduces the pagesizes to the next size in the bitmap and
3029 2974  * retries as_iset3_default_lpsize(). The code retries smaller allowed
3030 2975  * sizes on EINVAL because (a) the anon offset may not match the bigger
3031 2976  * sizes, and (b) it's hard to get this offset (to begin with) to pass
3032 2977  * to map_pgszcvec().
3033 2978 */
3034 2979 static int
3035 2980 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3036 2981 uint_t szcvec)
3037 2982 {
3038 2983 int error;
3039 2984 int retry;
3040 2985
3041 2986 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3042 2987
3043 2988 for (;;) {
3044 2989 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3045 2990 if (error == EINVAL && retry) {
3046 2991 szcvec &= ~(1 << szc);
3047 2992 if (szcvec <= 1) {
3048 2993 return (EINVAL);
3049 2994 }
3050 2995 szc = highbit(szcvec) - 1;
3051 2996 } else {
3052 2997 return (error);
3053 2998 }
3054 2999 }
3055 3000 }
3056 3001
3057 3002 /*
3058 3003 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3059 3004 * segments have a smaller szc than we want to set. For each such area,
3060 3005  * it calls as_iset2_default_lpsize().
3061 3006 */
3062 3007 static int
3063 3008 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3064 3009 uint_t szcvec)
3065 3010 {
3066 3011 struct seg *seg;
3067 3012 size_t ssize;
3068 3013 caddr_t setaddr = raddr;
3069 3014 size_t setsize = 0;
3070 3015 int set;
3071 3016 int error;
3072 3017
3073 3018 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3074 3019
3075 3020 seg = as_segat(as, raddr);
3076 3021 if (seg == NULL) {
3077 3022 panic("as_iset1_default_lpsize: no seg");
3078 3023 }
3079 3024 if (seg->s_szc < szc) {
3080 3025 set = 1;
3081 3026 } else {
3082 3027 set = 0;
3083 3028 }
3084 3029
3085 3030 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3086 3031 if (raddr >= seg->s_base + seg->s_size) {
3087 3032 seg = AS_SEGNEXT(as, seg);
3088 3033 if (seg == NULL || raddr != seg->s_base) {
3089 3034 panic("as_iset1_default_lpsize: as changed");
3090 3035 }
3091 3036 if (seg->s_szc >= szc && set) {
3092 3037 ASSERT(setsize != 0);
3093 3038 error = as_iset2_default_lpsize(as,
3094 3039 setaddr, setsize, szc, szcvec);
3095 3040 if (error) {
3096 3041 return (error);
3097 3042 }
3098 3043 set = 0;
3099 3044 } else if (seg->s_szc < szc && !set) {
3100 3045 setaddr = raddr;
3101 3046 setsize = 0;
3102 3047 set = 1;
3103 3048 }
3104 3049 }
3105 3050 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3106 3051 ssize = seg->s_base + seg->s_size - raddr;
3107 3052 } else {
3108 3053 ssize = rsize;
3109 3054 }
3110 3055 }
3111 3056 error = 0;
3112 3057 if (set) {
3113 3058 ASSERT(setsize != 0);
3114 3059 error = as_iset2_default_lpsize(as, setaddr, setsize,
3115 3060 szc, szcvec);
3116 3061 }
3117 3062 return (error);
3118 3063 }
3119 3064
3120 3065 /*
3121 3066 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3122 3067 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3123 3068 * chunk to as_iset1_default_lpsize().
3124 3069 */
3125 3070 static int
3126 3071 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3127 3072 int type)
3128 3073 {
3129 3074 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3130 3075 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3131 3076 flags, rtype, 1);
3132 3077 uint_t szc;
3133 3078 uint_t nszc;
3134 3079 int error;
3135 3080 caddr_t a;
3136 3081 caddr_t eaddr;
3137 3082 size_t segsize;
3138 3083 size_t pgsz;
3139 3084 uint_t save_szcvec;
3140 3085
3141 3086 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3142 3087 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3143 3088 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3144 3089
3145 3090 szcvec &= ~1;
3146 3091 if (szcvec <= 1) { /* skip if base page size */
3147 3092 return (0);
3148 3093 }
3149 3094
3150 3095 /* Get the pagesize of the first larger page size. */
3151 3096 szc = lowbit(szcvec) - 1;
3152 3097 pgsz = page_get_pagesize(szc);
3153 3098 eaddr = addr + size;
3154 3099 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3155 3100 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3156 3101
3157 3102 save_szcvec = szcvec;
3158 3103 szcvec >>= (szc + 1);
3159 3104 nszc = szc;
3160 3105 while (szcvec) {
3161 3106 if ((szcvec & 0x1) == 0) {
3162 3107 nszc++;
3163 3108 szcvec >>= 1;
3164 3109 continue;
3165 3110 }
3166 3111 nszc++;
3167 3112 pgsz = page_get_pagesize(nszc);
3168 3113 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3169 3114 if (a != addr) {
3170 3115 ASSERT(szc > 0);
3171 3116 ASSERT(a < eaddr);
3172 3117 segsize = a - addr;
3173 3118 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3174 3119 save_szcvec);
3175 3120 if (error) {
3176 3121 return (error);
3177 3122 }
3178 3123 addr = a;
3179 3124 }
3180 3125 szc = nszc;
3181 3126 szcvec >>= 1;
3182 3127 }
3183 3128
3184 3129 ASSERT(addr < eaddr);
3185 3130 szcvec = save_szcvec;
3186 3131 while (szcvec) {
3187 3132 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3188 3133 ASSERT(a >= addr);
3189 3134 if (a != addr) {
3190 3135 ASSERT(szc > 0);
3191 3136 segsize = a - addr;
3192 3137 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3193 3138 save_szcvec);
3194 3139 if (error) {
3195 3140 return (error);
3196 3141 }
3197 3142 addr = a;
3198 3143 }
3199 3144 szcvec &= ~(1 << szc);
3200 3145 if (szcvec) {
3201 3146 szc = highbit(szcvec) - 1;
3202 3147 pgsz = page_get_pagesize(szc);
3203 3148 }
3204 3149 }
3205 3150 ASSERT(addr == eaddr);
3206 3151
3207 3152 return (0);
3208 3153 }
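The ascending half of the walk above peels off a leading chunk at each size until addr reaches the alignment of the next larger size in szcvec; the second loop does the mirror image from the tail. A minimal sketch of the ascending half only, with a hypothetical size table standing in for page_get_pagesize():

    #include <stdint.h>
    #include <stddef.h>

    #define	P2ROUNDUP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

    /* Hypothetical, illustrative size table (small to large). */
    static const size_t pgsz[] = { 4096, 2097152, 1073741824 };
    #define	NSZ	(sizeof (pgsz) / sizeof (pgsz[0]))

    static void
    carve_ascending(uintptr_t addr, uintptr_t eaddr)
    {
    	for (size_t i = 0; i + 1 < NSZ && addr < eaddr; i++) {
    		/* Boundary where the next larger size becomes usable. */
    		uintptr_t next = P2ROUNDUP(addr, pgsz[i + 1]);

    		if (next > eaddr)
    			next = eaddr;
    		if (next != addr) {
    			/* [addr, next) would be set to size pgsz[i]. */
    			addr = next;
    		}
    	}
    }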
3209 3154
3210 3155 /*
3211 3156 * Set the default large page size for the range. Called via memcntl with
3212 3157 * page size set to 0. as_set_default_lpsize breaks the range down into
3213 3158  * chunks with the same type/flags, ignores non-segvn segments, and passes
3214 3159 * each chunk to as_iset_default_lpsize().
3215 3160 */
3216 3161 int
3217 3162 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3218 3163 {
3219 3164 struct seg *seg;
3220 3165 caddr_t raddr;
3221 3166 size_t rsize;
3222 3167 size_t ssize;
3223 3168 int rtype, rflags;
3224 3169 int stype, sflags;
3225 3170 int error;
3226 3171 caddr_t setaddr;
3227 3172 size_t setsize;
3228 3173 int segvn;
3229 3174
3230 3175 if (size == 0)
3231 3176 return (0);
3232 3177
3233 3178 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3234 3179 again:
3235 3180 error = 0;
3236 3181
3237 3182 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3238 3183 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3239 3184 (size_t)raddr;
3240 3185
3241 3186 if (raddr + rsize < raddr) { /* check for wraparound */
3242 3187 AS_LOCK_EXIT(as, &as->a_lock);
3243 3188 return (ENOMEM);
3244 3189 }
3245 3190 as_clearwatchprot(as, raddr, rsize);
3246 3191 seg = as_segat(as, raddr);
3247 3192 if (seg == NULL) {
3248 3193 as_setwatch(as);
3249 3194 AS_LOCK_EXIT(as, &as->a_lock);
3250 3195 return (ENOMEM);
3251 3196 }
3252 3197 if (seg->s_ops == &segvn_ops) {
3253 3198 rtype = SEGOP_GETTYPE(seg, addr);
3254 3199 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3255 3200 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3256 3201 segvn = 1;
3257 3202 } else {
3258 3203 segvn = 0;
3259 3204 }
3260 3205 setaddr = raddr;
3261 3206 setsize = 0;
3262 3207
3263 3208 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3264 3209 if (raddr >= (seg->s_base + seg->s_size)) {
3265 3210 seg = AS_SEGNEXT(as, seg);
3266 3211 if (seg == NULL || raddr != seg->s_base) {
3267 3212 error = ENOMEM;
3268 3213 break;
3269 3214 }
3270 3215 if (seg->s_ops == &segvn_ops) {
3271 3216 stype = SEGOP_GETTYPE(seg, raddr);
3272 3217 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3273 3218 stype &= (MAP_SHARED | MAP_PRIVATE);
3274 3219 if (segvn && (rflags != sflags ||
3275 3220 rtype != stype)) {
3276 3221 /*
3277 3222 * The next segment is also segvn but
3278 3223 * has different flags and/or type.
3279 3224 */
3280 3225 ASSERT(setsize != 0);
3281 3226 error = as_iset_default_lpsize(as,
3282 3227 setaddr, setsize, rflags, rtype);
3283 3228 if (error) {
3284 3229 break;
3285 3230 }
3286 3231 rflags = sflags;
3287 3232 rtype = stype;
3288 3233 setaddr = raddr;
3289 3234 setsize = 0;
3290 3235 } else if (!segvn) {
3291 3236 rflags = sflags;
3292 3237 rtype = stype;
3293 3238 setaddr = raddr;
3294 3239 setsize = 0;
3295 3240 segvn = 1;
3296 3241 }
3297 3242 } else if (segvn) {
3298 3243 /* The next segment is not segvn. */
3299 3244 ASSERT(setsize != 0);
3300 3245 error = as_iset_default_lpsize(as,
3301 3246 setaddr, setsize, rflags, rtype);
3302 3247 if (error) {
3303 3248 break;
3304 3249 }
3305 3250 segvn = 0;
3306 3251 }
3307 3252 }
3308 3253 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3309 3254 ssize = seg->s_base + seg->s_size - raddr;
3310 3255 } else {
3311 3256 ssize = rsize;
3312 3257 }
3313 3258 }
3314 3259 if (error == 0 && segvn) {
3315 3260 /* The last chunk when rsize == 0. */
3316 3261 ASSERT(setsize != 0);
3317 3262 error = as_iset_default_lpsize(as, setaddr, setsize,
3318 3263 rflags, rtype);
3319 3264 }
3320 3265
3321 3266 if (error == IE_RETRY) {
3322 3267 goto again;
3323 3268 } else if (error == IE_NOMEM) {
3324 3269 error = EAGAIN;
3325 3270 } else if (error == ENOTSUP) {
3326 3271 error = EINVAL;
3327 3272 } else if (error == EAGAIN) {
3328 3273 mutex_enter(&as->a_contents);
3329 3274 if (!AS_ISNOUNMAPWAIT(as)) {
3330 3275 if (AS_ISUNMAPWAIT(as) == 0) {
3331 3276 cv_broadcast(&as->a_cv);
3332 3277 }
3333 3278 AS_SETUNMAPWAIT(as);
3334 3279 AS_LOCK_EXIT(as, &as->a_lock);
3335 3280 while (AS_ISUNMAPWAIT(as)) {
3336 3281 cv_wait(&as->a_cv, &as->a_contents);
3337 3282 }
3338 3283 mutex_exit(&as->a_contents);
3339 3284 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3340 3285 } else {
3341 3286 /*
3342 3287 * We may have raced with
3343 3288 * segvn_reclaim()/segspt_reclaim(). In this case
3344 3289 * clean nounmapwait flag and retry since softlockcnt
3345 3290 * in this segment may be already 0. We don't drop as
3346 3291 * writer lock so our number of retries without
3347 3292 * sleeping should be very small. See segvn_reclaim()
3348 3293 * for more comments.
3349 3294 */
3350 3295 AS_CLRNOUNMAPWAIT(as);
3351 3296 mutex_exit(&as->a_contents);
3352 3297 }
3353 3298 goto again;
3354 3299 }
3355 3300
3356 3301 as_setwatch(as);
3357 3302 AS_LOCK_EXIT(as, &as->a_lock);
3358 3303 return (error);
3359 3304 }
3360 3305
3361 3306 /*
3362 3307 * Setup all of the uninitialized watched pages that we can.
3363 3308 */
3364 3309 void
3365 3310 as_setwatch(struct as *as)
3366 3311 {
3367 3312 struct watched_page *pwp;
3368 3313 struct seg *seg;
3369 3314 caddr_t vaddr;
3370 3315 uint_t prot;
3371 3316 int err, retrycnt;
3372 3317
3373 3318 if (avl_numnodes(&as->a_wpage) == 0)
3374 3319 return;
3375 3320
3376 3321 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3377 3322
3378 3323 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3379 3324 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3380 3325 retrycnt = 0;
3381 3326 retry:
3382 3327 vaddr = pwp->wp_vaddr;
3383 3328 if (pwp->wp_oprot != 0 || /* already set up */
3384 3329 (seg = as_segat(as, vaddr)) == NULL ||
3385 3330 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3386 3331 continue;
3387 3332
3388 3333 pwp->wp_oprot = prot;
3389 3334 if (pwp->wp_read)
3390 3335 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3391 3336 if (pwp->wp_write)
3392 3337 prot &= ~PROT_WRITE;
3393 3338 if (pwp->wp_exec)
3394 3339 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3395 3340 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3396 3341 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3397 3342 if (err == IE_RETRY) {
3398 3343 pwp->wp_oprot = 0;
3399 3344 ASSERT(retrycnt == 0);
3400 3345 retrycnt++;
3401 3346 goto retry;
3402 3347 }
3403 3348 }
3404 3349 pwp->wp_prot = prot;
3405 3350 }
3406 3351 }
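
The mask arithmetic in as_setwatch() is the core of the watchpoint mechanism: start from the segment's real protection (stashed in wp_oprot so it can be restored later) and knock out whatever must fault. A read watch has to observe every touch, so it clears read, write, and execute; a write watch needs only write faults; an exec watch also clears all three, since an instruction fetch can be satisfied through a readable mapping. A self-contained sketch of just that computation, using plain flags rather than the kernel's watched_page fields:

    #include <sys/mman.h>       /* PROT_READ, PROT_WRITE, PROT_EXEC */

    /* Effective page protection for a watchpoint (sketch). */
    static unsigned int
    watch_prot(unsigned int oprot, int wa_read, int wa_write, int wa_exec)
    {
            unsigned int prot = oprot;

            if (wa_read)        /* must fault on every access */
                    prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
            if (wa_write)       /* write faults are enough */
                    prot &= ~PROT_WRITE;
            if (wa_exec)        /* fetches may come through a read mapping */
                    prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);

            return (prot);
    }
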
3407 3352
3408 3353 /*
3409 3354 * Clear all of the watched pages in the address space.
3410 3355 */
3411 3356 void
3412 3357 as_clearwatch(struct as *as)
3413 3358 {
3414 3359 struct watched_page *pwp;
3415 3360 struct seg *seg;
3416 3361 caddr_t vaddr;
3417 3362 uint_t prot;
3418 3363 int err, retrycnt;
3419 3364
3420 3365 if (avl_numnodes(&as->a_wpage) == 0)
3421 3366 return;
3422 3367
3423 3368 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3424 3369
3425 3370 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3426 3371 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3427 3372 retrycnt = 0;
3428 3373 retry:
3429 3374 vaddr = pwp->wp_vaddr;
3430 3375 if (pwp->wp_oprot == 0 || /* not set up */
3431 3376 (seg = as_segat(as, vaddr)) == NULL)
3432 3377 continue;
3433 3378
3434 3379 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3435 3380 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3436 3381 if (err == IE_RETRY) {
3437 3382 ASSERT(retrycnt == 0);
3438 3383 retrycnt++;
3439 3384 goto retry;
3440 3385 }
3441 3386 }
3442 3387 pwp->wp_oprot = 0;
3443 3388 pwp->wp_prot = 0;
3444 3389 }
3445 3390 }
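
as_clearwatch() is the exact inverse: reinstall the protection saved in wp_oprot, then zero both fields so the entry reads as "not set up" to a later as_setwatch(). The same bookkeeping in sketch form, with the two fields pulled out into a plain struct:

    /* Per-page watch state, reduced to the two protection fields (sketch). */
    struct wp_state {
            unsigned int wp_prot;   /* protection currently installed */
            unsigned int wp_oprot;  /* saved original; 0 means not set up */
    };

    /* Return the protection to reinstall, and forget the watch state. */
    static unsigned int
    watch_restore(struct wp_state *wp)
    {
            unsigned int prot = wp->wp_oprot;

            wp->wp_oprot = 0;
            wp->wp_prot = 0;
            return (prot);
    }
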
3446 3391
3447 3392 /*
3448 3393 * Force a new setup for all the watched pages in the range.
3449 3394 */
3450 3395 static void
3451 3396 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3452 3397 {
3453 3398 struct watched_page *pwp;
3454 3399 struct watched_page tpw;
3455 3400 caddr_t eaddr = addr + size;
3456 3401 caddr_t vaddr;
3457 3402 struct seg *seg;
3458 3403 int err, retrycnt;
3459 3404 uint_t wprot;
3460 3405 avl_index_t where;
3461 3406
3462 3407 if (avl_numnodes(&as->a_wpage) == 0)
3463 3408 return;
3464 3409
3465 3410 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3466 3411
3467 3412 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3468 3413 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3469 3414 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3470 3415
3471 3416 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3472 3417 retrycnt = 0;
3473 3418 vaddr = pwp->wp_vaddr;
3474 3419
3475 3420 wprot = prot;
3476 3421 if (pwp->wp_read)
3477 3422 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3478 3423 if (pwp->wp_write)
3479 3424 wprot &= ~PROT_WRITE;
3480 3425 if (pwp->wp_exec)
3481 3426 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3482 3427 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3483 3428 retry:
3484 3429 seg = as_segat(as, vaddr);
3485 3430 if (seg == NULL) {
3486 3431 panic("as_setwatchprot: no seg");
3487 3432 /*NOTREACHED*/
3488 3433 }
3489 3434 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3490 3435 if (err == IE_RETRY) {
3491 3436 ASSERT(retrycnt == 0);
3492 3437 retrycnt++;
3493 3438 goto retry;
3494 3439 }
3495 3440 }
3496 3441 pwp->wp_oprot = prot;
3497 3442 pwp->wp_prot = wprot;
3498 3443
3499 3444 pwp = AVL_NEXT(&as->a_wpage, pwp);
3500 3445 }
3501 3446 }
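
Both range-based variants open their walk the same way: probe the AVL tree for the page-aligned start address, fall back to the nearest node after that address when there is no exact hit, then follow AVL_NEXT until the next node leaves the range. A generic sketch of the find-or-nearest idiom against the illumos AVL interfaces (the wpage struct, comparator setup, and visit callback are illustrative, not the kernel's):

    #include <sys/avl.h>

    struct wpage {
            avl_node_t      wp_node;
            uintptr_t       wp_vaddr;
    };

    /*
     * Visit every node with addr <= wp_vaddr < eaddr. Assumes 'tree' was
     * created with avl_create() and a comparator ordering on wp_vaddr.
     */
    static void
    walk_range(avl_tree_t *tree, uintptr_t addr, uintptr_t eaddr,
        void (*visit)(struct wpage *))
    {
            struct wpage key, *wp;
            avl_index_t where;

            key.wp_vaddr = addr;
            if ((wp = avl_find(tree, &key, &where)) == NULL)
                    wp = avl_nearest(tree, where, AVL_AFTER);

            while (wp != NULL && wp->wp_vaddr < eaddr) {
                    visit(wp);
                    wp = AVL_NEXT(tree, wp);
            }
    }
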
3502 3447
3503 3448 /*
3504 3449 * Clear all of the watched pages in the range.
3505 3450 */
3506 3451 static void
3507 3452 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3508 3453 {
3509 3454 caddr_t eaddr = addr + size;
3510 3455 struct watched_page *pwp;
3511 3456 struct watched_page tpw;
3512 3457 uint_t prot;
3513 3458 struct seg *seg;
3514 3459 int err, retrycnt;
3515 3460 avl_index_t where;
3516 3461
3517 3462 if (avl_numnodes(&as->a_wpage) == 0)
3518 3463 return;
3519 3464
3520 3465 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3521 3466 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3522 3467 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3523 3468
3524 3469 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3525 3470
3526 3471 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3527 3472
3528 3473 if ((prot = pwp->wp_oprot) != 0) {
3529 3474 retrycnt = 0;
3530 3475
3531 3476 if (prot != pwp->wp_prot) {
3532 3477 retry:
3533 3478 seg = as_segat(as, pwp->wp_vaddr);
3534 3479 if (seg == NULL)
3535 3480 continue;
3536 3481 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3537 3482 PAGESIZE, prot);
3538 3483 if (err == IE_RETRY) {
3539 3484 ASSERT(retrycnt == 0);
3540 3485 retrycnt++;
3541 3486 goto retry;
3542 3487
3543 3488 }
3544 3489 }
3545 3490 pwp->wp_oprot = 0;
3546 3491 pwp->wp_prot = 0;
3547 3492 }
3548 3493
3549 3494 pwp = AVL_NEXT(&as->a_wpage, pwp);
3550 3495 }
3551 3496 }
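
One hazard in this last walk: the advance to the next node happens only at the bottom of the loop, so the continue taken when as_segat() returns NULL re-tests the condition with the same pwp and would spin forever. It is harmless only as long as every watched page in the range still has a segment backing it. A defensive rewrite of the loop, sketched against the surrounding code (it also resets the stale entry, which the original leaves untouched, and elides the IE_RETRY handling):

    /* Sketch: clearing walk that always advances, even on a skip. */
    while (pwp != NULL && pwp->wp_vaddr < eaddr) {
            struct watched_page *next = AVL_NEXT(&as->a_wpage, pwp);

            if (pwp->wp_oprot != 0) {
                    if (pwp->wp_oprot != pwp->wp_prot &&
                        (seg = as_segat(as, pwp->wp_vaddr)) != NULL)
                            (void) SEGOP_SETPROT(seg, pwp->wp_vaddr,
                                PAGESIZE, pwp->wp_oprot);
                    pwp->wp_oprot = 0;
                    pwp->wp_prot = 0;
            }
            pwp = next;     /* always advance, even when the seg is gone */
    }
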
3552 3497
3553 3498 void
3554 3499 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3555 3500 {
3556 3501 struct proc *p;
3557 3502
3558 3503 mutex_enter(&pidlock);
3559 3504 for (p = practive; p; p = p->p_next) {
3560 3505 if (p->p_as == as) {
3561 3506 mutex_enter(&p->p_lock);
3562 3507 if (p->p_as == as)
3563 3508 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3564 3509 mutex_exit(&p->p_lock);
3565 3510 }
3566 3511 }
3567 3512 mutex_exit(&pidlock);
3568 3513 }
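
as_signal_proc() checks p_as twice on purpose: the first test, under pidlock alone, is a cheap filter over the whole process list; the second, after taking p_lock, is the authoritative one, since a process may change address spaces between the two. sigaddq() is passed KM_NOSLEEP because a sleeping allocation while holding p_lock would be unsafe. The same double-checked shape in a self-contained sketch, with pthread mutexes standing in for pidlock and p_lock:

    #include <pthread.h>
    #include <stddef.h>

    struct proc_s {
            pthread_mutex_t p_lock;
            void            *p_as;
            struct proc_s   *p_next;
    };

    static pthread_mutex_t pidlock = PTHREAD_MUTEX_INITIALIZER;
    static struct proc_s *practive;         /* head of the active list */

    /* Apply cb() to every process whose address space is 'as'. */
    static void
    for_each_proc_with_as(void *as, void (*cb)(struct proc_s *))
    {
            struct proc_s *p;

            pthread_mutex_lock(&pidlock);
            for (p = practive; p != NULL; p = p->p_next) {
                    if (p->p_as != as)              /* cheap filter */
                            continue;
                    pthread_mutex_lock(&p->p_lock);
                    if (p->p_as == as)              /* authoritative re-check */
                            cb(p);
                    pthread_mutex_unlock(&p->p_lock);
            }
            pthread_mutex_unlock(&pidlock);
    }
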
3569 3514
3570 3515 /*
3571 3516  * Return the memory object ID for the segment containing addr.
3572 3517 */
3573 3518 int
3574 3519 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3575 3520 {
3576 3521 struct seg *seg;
3577 3522 int sts;
3578 3523
3579 3524 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3580 3525 seg = as_segat(as, addr);
3581 3526 if (seg == NULL) {
3582 3527 AS_LOCK_EXIT(as, &as->a_lock);
3583 3528 return (EFAULT);
3584 3529 }
3585 3530 /*
3586 3531 * catch old drivers which may not support getmemid
3587 3532 */
3588 3533 if (seg->s_ops->getmemid == NULL) {
3589 3534 AS_LOCK_EXIT(as, &as->a_lock);
3590 3535 return (ENODEV);
3591 3536 }
3592 3537
3593 3538 sts = SEGOP_GETMEMID(seg, addr, memidp);
3594 3539
3595 3540 AS_LOCK_EXIT(as, &as->a_lock);
3596 3541 return (sts);
3597 3542 }
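
The NULL test on the getmemid slot is the usual way to tolerate segment drivers built before an entry point existed: probe the function pointer in the ops vector and return ENODEV instead of jumping through NULL. The same guard for a hypothetical ops table (my_ops and getid are illustrative names, not a real driver interface):

    #include <errno.h>
    #include <stddef.h>

    struct my_ops {
            int     (*getid)(void *obj, unsigned long long *idp);
    };

    /* Dispatch through an ops vector, tolerating drivers without getid. */
    static int
    obj_getid(const struct my_ops *ops, void *obj, unsigned long long *idp)
    {
            if (ops->getid == NULL)
                    return (ENODEV);        /* old driver: entry point absent */
            return (ops->getid(obj, idp));
    }
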
(1472 lines elided)