patch remove-as_swapout
--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29 /*
30 30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 31 * The Regents of the University of California
32 32 * All Rights Reserved
33 33 *
34 34 * University Acknowledgment- Portions of this document are derived from
35 35 * software developed by the University of California, Berkeley, and its
36 36 * contributors.
37 37 */
38 38
39 39 /*
40 40 * VM - address spaces.
41 41 */
42 42
43 43 #include <sys/types.h>
44 44 #include <sys/t_lock.h>
45 45 #include <sys/param.h>
46 46 #include <sys/errno.h>
47 47 #include <sys/systm.h>
48 48 #include <sys/mman.h>
49 49 #include <sys/sysmacros.h>
50 50 #include <sys/cpuvar.h>
51 51 #include <sys/sysinfo.h>
52 52 #include <sys/kmem.h>
53 53 #include <sys/vnode.h>
54 54 #include <sys/vmsystm.h>
55 55 #include <sys/cmn_err.h>
56 56 #include <sys/debug.h>
57 57 #include <sys/tnf_probe.h>
58 58 #include <sys/vtrace.h>
59 59
60 60 #include <vm/hat.h>
61 61 #include <vm/xhat.h>
62 62 #include <vm/as.h>
63 63 #include <vm/seg.h>
64 64 #include <vm/seg_vn.h>
65 65 #include <vm/seg_dev.h>
66 66 #include <vm/seg_kmem.h>
67 67 #include <vm/seg_map.h>
68 68 #include <vm/seg_spt.h>
69 69 #include <vm/page.h>
70 70
71 71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
72 72
73 73 static struct kmem_cache *as_cache;
74 74
75 75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
76 76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
77 77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
78 78
79 79
80 80 /*
81 81 * Verifying the segment lists is very time-consuming; it may not be
82 82 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
83 83 */
84 84 #ifdef DEBUG
85 85 #define VERIFY_SEGLIST
86 86 int do_as_verify = 0;
87 87 #endif
88 88
89 89 /*
90 90 * Allocate a new callback data structure entry and fill in the events of
91 91 * interest, the address range of interest, and the callback argument.
92 92 * Link the entry on the as->a_callbacks list. A callback entry for the
93 93 * entire address space may be specified with vaddr = 0 and size = -1.
94 94 *
95 95 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
96 96 * the specified as, the caller must guarantee persistence of the specified as
97 97 * for the duration of this function (e.g., pages being locked within the as
98 98 * will guarantee persistence).
99 99 */
100 100 int
101 101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
102 102 caddr_t vaddr, size_t size, int sleepflag)
103 103 {
104 104 struct as_callback *current_head, *cb;
105 105 caddr_t saddr;
106 106 size_t rsize;
107 107
108 108 /* callback function and an event are mandatory */
109 109 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
110 110 return (EINVAL);
111 111
112 112 /* Adding a callback after as_free has been called is not allowed */
113 113 if (as == &kas)
114 114 return (ENOMEM);
115 115
116 116 /*
117 117 * vaddr = 0 and size = -1 are used to indicate that the callback range
118 118 * is the entire address space so no rounding is done in that case.
119 119 */
120 120 if (size != -1) {
121 121 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
122 122 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
123 123 (size_t)saddr;
124 124 /* check for wraparound */
125 125 if (saddr + rsize < saddr)
126 126 return (ENOMEM);
127 127 } else {
128 128 if (vaddr != 0)
129 129 return (EINVAL);
130 130 saddr = vaddr;
131 131 rsize = size;
132 132 }
133 133
134 134 /* Allocate and initialize a callback entry */
135 135 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
136 136 if (cb == NULL)
137 137 return (EAGAIN);
138 138
139 139 cb->ascb_func = cb_func;
140 140 cb->ascb_arg = arg;
141 141 cb->ascb_events = events;
142 142 cb->ascb_saddr = saddr;
143 143 cb->ascb_len = rsize;
144 144
145 145 /* Add the entry to the list */
146 146 mutex_enter(&as->a_contents);
147 147 current_head = as->a_callbacks;
148 148 as->a_callbacks = cb;
149 149 cb->ascb_next = current_head;
150 150
151 151 /*
152 152 * The call to this function may lose in a race with
153 153 * a pertinent event - e.g., a thread does long-term memory locking
154 154 * but before the callback is added another thread executes as_unmap.
155 155 * A broadcast here resolves that.
156 156 */
157 157 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
158 158 AS_CLRUNMAPWAIT(as);
159 159 cv_broadcast(&as->a_cv);
160 160 }
161 161
162 162 mutex_exit(&as->a_contents);
163 163 return (0);
164 164 }
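
For context, a minimal caller-side sketch of the callback API above; mydrv_unlock_cb and mydrv_watch_range are hypothetical names, and a real driver would also handle the AS_CALLBACK_DELETE_DEFERRED return from as_delete_callback():

	static void
	mydrv_unlock_cb(struct as *as, void *arg, uint_t events)
	{
		/* release the driver's long-term locked pages for this range */
	}

	static int
	mydrv_watch_range(struct as *as, caddr_t addr, size_t len, void *arg)
	{
		/* have mydrv_unlock_cb run if the range is unmapped or reprotected */
		return (as_add_callback(as, mydrv_unlock_cb, arg,
		    AS_UNMAP_EVENT | AS_SETPROT_EVENT, addr, len, KM_SLEEP));
	}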
165 165
166 166 /*
167 167 * Search the callback list for an entry which pertains to arg.
168 168 *
169 169 * This is called from within the client upon completion of the callback.
170 170 * RETURN VALUES:
171 171 * AS_CALLBACK_DELETED (callback entry found and deleted)
172 172 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
173 173 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
174 174 * entry will be made in as_do_callbacks)
175 175 *
176 176 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
177 177 * set, it indicates that as_do_callbacks is processing this entry. The
178 178 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
179 179 * to unblock as_do_callbacks, in case it is blocked.
180 180 *
181 181 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
182 182 * the specified as, the caller must guarantee persistence of the specified as
183 183 * for the duration of this function (e.g., pages being locked within the as
184 184 * will guarantee persistence).
185 185 */
186 186 uint_t
187 187 as_delete_callback(struct as *as, void *arg)
188 188 {
189 189 struct as_callback **prevcb = &as->a_callbacks;
190 190 struct as_callback *cb;
191 191 uint_t rc = AS_CALLBACK_NOTFOUND;
192 192
193 193 mutex_enter(&as->a_contents);
194 194 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
195 195 if (cb->ascb_arg != arg)
196 196 continue;
197 197
198 198 /*
199 199 * If the events indicate AS_CALLBACK_CALLED, just clear
200 200 * AS_ALL_EVENT in the events field and wake up the thread
201 201 * that may be waiting in as_do_callbacks. as_do_callbacks
202 202 * will take care of removing this entry from the list. In
203 203 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
204 204 * (AS_CALLBACK_CALLED not set), just remove it from the
205 205 * list, return the memory and return AS_CALLBACK_DELETED.
206 206 */
207 207 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
208 208 /* leave AS_CALLBACK_CALLED */
209 209 cb->ascb_events &= ~AS_ALL_EVENT;
210 210 rc = AS_CALLBACK_DELETE_DEFERRED;
211 211 cv_broadcast(&as->a_cv);
212 212 } else {
213 213 *prevcb = cb->ascb_next;
214 214 kmem_free(cb, sizeof (struct as_callback));
215 215 rc = AS_CALLBACK_DELETED;
216 216 }
217 217 break;
218 218 }
219 219 mutex_exit(&as->a_contents);
220 220 return (rc);
221 221 }
222 222
223 223 /*
224 224 * Searches the as callback list for a matching entry.
225 225 * Returns a pointer to the first matching callback, or NULL if
226 226 * nothing is found.
227 227 * This function never sleeps, so it is ok to call it with more
228 228 * locks held than the (required) a_contents mutex.
229 229 *
230 230 * See also comment on as_do_callbacks below.
231 231 */
232 232 static struct as_callback *
233 233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
234 234 size_t event_len)
235 235 {
236 236 struct as_callback *cb;
237 237
238 238 ASSERT(MUTEX_HELD(&as->a_contents));
239 239 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
240 240 /*
241 241 * If the callback has not already been called, then
242 242 * check if events or address range pertains. An event_len
243 243 * of zero means do an unconditional callback.
244 244 */
245 245 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
246 246 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
247 247 (event_addr + event_len < cb->ascb_saddr) ||
248 248 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
249 249 continue;
250 250 }
251 251 break;
252 252 }
253 253 return (cb);
254 254 }
255 255
256 256 /*
257 257 * Executes a given callback and removes it from the callback list for
258 258 * this address space.
259 259 * This function may sleep so the caller must drop all locks except
260 260 * a_contents before calling this func.
261 261 *
262 262 * See also comments on as_do_callbacks below.
263 263 */
264 264 static void
265 265 as_execute_callback(struct as *as, struct as_callback *cb,
266 266 uint_t events)
267 267 {
268 268 struct as_callback **prevcb;
269 269 void *cb_arg;
270 270
271 271 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
272 272 cb->ascb_events |= AS_CALLBACK_CALLED;
273 273 mutex_exit(&as->a_contents);
274 274 (*cb->ascb_func)(as, cb->ascb_arg, events);
275 275 mutex_enter(&as->a_contents);
276 276 /*
277 277 * the callback function is required to delete the callback
278 278 * when the callback function determines it is OK for
279 279 * this thread to continue. as_delete_callback will clear
280 280 * the AS_ALL_EVENT in the events field when it is deleted.
281 281 * If the callback function called as_delete_callback,
282 282 * events will already be cleared and there will be no blocking.
283 283 */
284 284 while ((cb->ascb_events & events) != 0) {
285 285 cv_wait(&as->a_cv, &as->a_contents);
286 286 }
287 287 /*
288 288 * This entry needs to be taken off the list. Normally, the
289 289 * callback func itself does that, but unfortunately the list
290 290 * may have changed while the callback was running because the
291 291 * a_contents mutex was dropped and someone other than the
292 292 * callback func itself could have called as_delete_callback,
293 293 * so we have to search to find this entry again. The entry
294 294 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
295 295 */
296 296 cb_arg = cb->ascb_arg;
297 297 prevcb = &as->a_callbacks;
298 298 for (cb = as->a_callbacks; cb != NULL;
299 299 prevcb = &cb->ascb_next, cb = *prevcb) {
300 300 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
301 301 (cb_arg != cb->ascb_arg)) {
302 302 continue;
303 303 }
304 304 *prevcb = cb->ascb_next;
305 305 kmem_free(cb, sizeof (struct as_callback));
306 306 break;
307 307 }
308 308 }
309 309
310 310 /*
311 311 * Check the callback list for a matching event and intersection of
312 312 * address range. If there is a match invoke the callback. Skip an entry if:
313 313 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
314 314 * - no event of interest
315 315 * - no address range of interest
316 316 *
317 317 * An event_len of zero indicates a request for an unconditional callback
318 318 * (regardless of event); only the AS_CALLBACK_CALLED flag is checked. The
319 319 * a_contents lock must be dropped before a callback, so only one callback
320 320 * can be done before returning. Return -1 (true) if a callback was
321 321 * executed and removed from the list, else return 0 (false).
322 322 *
323 323 * The logically separate parts, i.e. finding a matching callback and
324 324 * executing a given callback have been separated into two functions
325 325 * so that they can be called with different sets of locks held beyond
326 326 * the always-required a_contents. as_find_callback does not sleep so
327 327 * it is ok to call it if more locks than a_contents (i.e. the a_lock
328 328 * rwlock) are held. as_execute_callback on the other hand may sleep
329 329 * so all locks beyond a_contents must be dropped by the caller if one
330 330 * does not want to end up comatose.
331 331 */
332 332 static int
333 333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
334 334 size_t event_len)
335 335 {
336 336 struct as_callback *cb;
337 337
338 338 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
339 339 as_execute_callback(as, cb, events);
340 340 return (-1);
341 341 }
342 342 return (0);
343 343 }
344 344
345 345 /*
346 346 * Search for the segment containing addr. If a segment containing addr
347 347 * exists, that segment is returned. If no such segment exists, and
348 348 * the list spans addresses greater than addr, then the first segment
349 349 * whose base is greater than addr is returned; otherwise, NULL is
350 350 * returned unless tail is true, in which case the last element of the
351 351 * list is returned.
352 352 *
353 353 * a_seglast is used to cache the last found segment for repeated
354 354 * searches to the same addr (which happens frequently).
355 355 */
356 356 struct seg *
357 357 as_findseg(struct as *as, caddr_t addr, int tail)
358 358 {
359 359 struct seg *seg = as->a_seglast;
360 360 avl_index_t where;
361 361
362 362 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
363 363
364 364 if (seg != NULL &&
365 365 seg->s_base <= addr &&
366 366 addr < seg->s_base + seg->s_size)
367 367 return (seg);
368 368
369 369 seg = avl_find(&as->a_segtree, &addr, &where);
370 370 if (seg != NULL)
371 371 return (as->a_seglast = seg);
372 372
373 373 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
374 374 if (seg == NULL && tail)
375 375 seg = avl_last(&as->a_segtree);
376 376 return (as->a_seglast = seg);
377 377 }
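
A sketch of how a caller might use as_findseg() under the required lock; find_first_seg_at is a hypothetical wrapper using the AS_LOCK_* macros seen throughout this file:

	static struct seg *
	find_first_seg_at(struct as *as, caddr_t addr)
	{
		struct seg *seg;

		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		seg = as_findseg(as, addr, 0);	/* containing or next-higher seg */
		AS_LOCK_EXIT(as, &as->a_lock);
		return (seg);	/* only meaningful while the as cannot change */
	}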
378 378
379 379 #ifdef VERIFY_SEGLIST
380 380 /*
381 381 * verify that the linked list is coherent
382 382 */
383 383 static void
384 384 as_verify(struct as *as)
385 385 {
386 386 struct seg *seg, *seglast, *p, *n;
387 387 uint_t nsegs = 0;
388 388
389 389 if (do_as_verify == 0)
390 390 return;
391 391
392 392 seglast = as->a_seglast;
393 393
394 394 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
395 395 ASSERT(seg->s_as == as);
396 396 p = AS_SEGPREV(as, seg);
397 397 n = AS_SEGNEXT(as, seg);
398 398 ASSERT(p == NULL || p->s_as == as);
399 399 ASSERT(p == NULL || p->s_base < seg->s_base);
400 400 ASSERT(n == NULL || n->s_base > seg->s_base);
401 401 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
402 402 if (seg == seglast)
403 403 seglast = NULL;
404 404 nsegs++;
405 405 }
406 406 ASSERT(seglast == NULL);
407 407 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
408 408 }
409 409 #endif /* VERIFY_SEGLIST */
410 410
411 411 /*
412 412 * Add a new segment to the address space. The avl_find()
413 413 * may be expensive, so we attempt to use the last segment accessed
414 414 * in as_gap() as an insertion point.
415 415 */
416 416 int
417 417 as_addseg(struct as *as, struct seg *newseg)
418 418 {
419 419 struct seg *seg;
420 420 caddr_t addr;
421 421 caddr_t eaddr;
422 422 avl_index_t where;
423 423
424 424 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
425 425
426 426 as->a_updatedir = 1; /* inform /proc */
427 427 gethrestime(&as->a_updatetime);
428 428
429 429 if (as->a_lastgaphl != NULL) {
430 430 struct seg *hseg = NULL;
431 431 struct seg *lseg = NULL;
432 432
433 433 if (as->a_lastgaphl->s_base > newseg->s_base) {
434 434 hseg = as->a_lastgaphl;
435 435 lseg = AVL_PREV(&as->a_segtree, hseg);
436 436 } else {
437 437 lseg = as->a_lastgaphl;
438 438 hseg = AVL_NEXT(&as->a_segtree, lseg);
439 439 }
440 440
441 441 if (hseg && lseg && lseg->s_base < newseg->s_base &&
442 442 hseg->s_base > newseg->s_base) {
443 443 avl_insert_here(&as->a_segtree, newseg, lseg,
444 444 AVL_AFTER);
445 445 as->a_lastgaphl = NULL;
446 446 as->a_seglast = newseg;
447 447 return (0);
448 448 }
449 449 as->a_lastgaphl = NULL;
450 450 }
451 451
452 452 addr = newseg->s_base;
453 453 eaddr = addr + newseg->s_size;
454 454 again:
455 455
456 456 seg = avl_find(&as->a_segtree, &addr, &where);
457 457
458 458 if (seg == NULL)
459 459 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
460 460
461 461 if (seg == NULL)
462 462 seg = avl_last(&as->a_segtree);
463 463
464 464 if (seg != NULL) {
465 465 caddr_t base = seg->s_base;
466 466
467 467 /*
468 468 * If top of seg is below the requested address, then
469 469 * the insertion point is at the end of the linked list,
470 470 * and seg points to the tail of the list. Otherwise,
471 471 * the insertion point is immediately before seg.
472 472 */
473 473 if (base + seg->s_size > addr) {
474 474 if (addr >= base || eaddr > base) {
475 475 #ifdef __sparc
476 476 extern struct seg_ops segnf_ops;
477 477
478 478 /*
479 479 * no-fault segs must disappear if overlaid.
480 480 * XXX need new segment type so
481 481 * we don't have to check s_ops
482 482 */
483 483 if (seg->s_ops == &segnf_ops) {
484 484 seg_unmap(seg);
485 485 goto again;
486 486 }
487 487 #endif
488 488 return (-1); /* overlapping segment */
489 489 }
490 490 }
491 491 }
492 492 as->a_seglast = newseg;
493 493 avl_insert(&as->a_segtree, newseg, where);
494 494
495 495 #ifdef VERIFY_SEGLIST
496 496 as_verify(as);
497 497 #endif
498 498 return (0);
499 499 }
500 500
501 501 struct seg *
502 502 as_removeseg(struct as *as, struct seg *seg)
503 503 {
504 504 avl_tree_t *t;
505 505
506 506 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
507 507
508 508 as->a_updatedir = 1; /* inform /proc */
509 509 gethrestime(&as->a_updatetime);
510 510
511 511 if (seg == NULL)
512 512 return (NULL);
513 513
514 514 t = &as->a_segtree;
515 515 if (as->a_seglast == seg)
516 516 as->a_seglast = NULL;
517 517 as->a_lastgaphl = NULL;
518 518
519 519 /*
520 520 * if this segment is at an address higher than
521 521 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
522 522 */
523 523 if (as->a_lastgap &&
524 524 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
525 525 as->a_lastgap = AVL_NEXT(t, seg);
526 526
527 527 /*
528 528 * remove the segment from the seg tree
529 529 */
530 530 avl_remove(t, seg);
531 531
532 532 #ifdef VERIFY_SEGLIST
533 533 as_verify(as);
534 534 #endif
535 535 return (seg);
536 536 }
537 537
538 538 /*
539 539 * Find a segment containing addr.
540 540 */
541 541 struct seg *
542 542 as_segat(struct as *as, caddr_t addr)
543 543 {
544 544 struct seg *seg = as->a_seglast;
545 545
546 546 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
547 547
548 548 if (seg != NULL && seg->s_base <= addr &&
549 549 addr < seg->s_base + seg->s_size)
550 550 return (seg);
551 551
552 552 seg = avl_find(&as->a_segtree, &addr, NULL);
553 553 return (seg);
554 554 }
555 555
556 556 /*
557 557 * Serialize all searches for holes in an address space to
558 558 * prevent two or more threads from allocating the same virtual
559 559 * address range. The address space must not be "read/write"
560 560 * locked by the caller since we may block.
561 561 */
562 562 void
563 563 as_rangelock(struct as *as)
564 564 {
565 565 mutex_enter(&as->a_contents);
566 566 while (AS_ISCLAIMGAP(as))
567 567 cv_wait(&as->a_cv, &as->a_contents);
568 568 AS_SETCLAIMGAP(as);
569 569 mutex_exit(&as->a_contents);
570 570 }
571 571
572 572 /*
573 573 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
574 574 */
575 575 void
576 576 as_rangeunlock(struct as *as)
577 577 {
578 578 mutex_enter(&as->a_contents);
579 579 AS_CLRCLAIMGAP(as);
580 580 cv_signal(&as->a_cv);
581 581 mutex_exit(&as->a_contents);
582 582 }
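
The usual consumer of this pair is a mapping path that picks an address and then maps it. A sketch of that pattern, assuming the usual map_addr()/segvn_create() helpers and a caller-prepared struct segvn_crargs; mydrv_choose_and_map is a hypothetical name:

	static int
	mydrv_choose_and_map(struct as *as, caddr_t *addrp, size_t len,
	    offset_t off, uint_t flags, struct segvn_crargs *vn_a)
	{
		int error;

		as_rangelock(as);		/* serialize hole searches */
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
		error = as_map(as, *addrp, len, segvn_create, vn_a);
		as_rangeunlock(as);
		return (error);
	}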
583 583
584 584 /*
585 585 * compare segments (or just an address) by segment address range
586 586 */
587 587 static int
588 588 as_segcompar(const void *x, const void *y)
589 589 {
590 590 struct seg *a = (struct seg *)x;
591 591 struct seg *b = (struct seg *)y;
592 592
593 593 if (a->s_base < b->s_base)
594 594 return (-1);
595 595 if (a->s_base >= b->s_base + b->s_size)
596 596 return (1);
597 597 return (0);
598 598 }
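
One subtlety worth noting: lookups such as as_segat() pass a bare caddr_t * as the search key. That is safe only because s_base is the first member of struct seg, so the comparator reads the key's address through a->s_base and never touches a->s_size on the key side. A sketch of the same trick (segtree_lookup_sketch is a hypothetical name):

	static struct seg *
	segtree_lookup_sketch(struct as *as, caddr_t addr)
	{
		/*
		 * &addr aliases a struct seg whose s_base is the first
		 * member; avl_find() passes the key as the first
		 * comparator argument.
		 */
		return (avl_find(&as->a_segtree, &addr, NULL));
	}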
599 599
600 600
601 601 void
602 602 as_avlinit(struct as *as)
603 603 {
604 604 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
605 605 offsetof(struct seg, s_tree));
606 606 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
607 607 offsetof(struct watched_page, wp_link));
608 608 }
609 609
610 610 /*ARGSUSED*/
611 611 static int
612 612 as_constructor(void *buf, void *cdrarg, int kmflags)
613 613 {
614 614 struct as *as = buf;
615 615
616 616 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
617 617 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
618 618 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
619 619 as_avlinit(as);
620 620 return (0);
621 621 }
622 622
623 623 /*ARGSUSED1*/
624 624 static void
625 625 as_destructor(void *buf, void *cdrarg)
626 626 {
627 627 struct as *as = buf;
628 628
629 629 avl_destroy(&as->a_segtree);
630 630 mutex_destroy(&as->a_contents);
631 631 cv_destroy(&as->a_cv);
632 632 rw_destroy(&as->a_lock);
633 633 }
634 634
635 635 void
636 636 as_init(void)
637 637 {
638 638 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
639 639 as_constructor, as_destructor, NULL, NULL, NULL, 0);
640 640 }
641 641
642 642 /*
643 643 * Allocate and initialize an address space data structure.
644 644 * We call hat_alloc to allow any machine dependent
645 645 * information in the hat structure to be initialized.
646 646 */
647 647 struct as *
648 648 as_alloc(void)
649 649 {
650 650 struct as *as;
651 651
652 652 as = kmem_cache_alloc(as_cache, KM_SLEEP);
653 653
654 654 as->a_flags = 0;
655 655 as->a_vbits = 0;
656 656 as->a_hrm = NULL;
657 657 as->a_seglast = NULL;
658 658 as->a_size = 0;
659 659 as->a_resvsize = 0;
660 660 as->a_updatedir = 0;
661 661 gethrestime(&as->a_updatetime);
662 662 as->a_objectdir = NULL;
663 663 as->a_sizedir = 0;
664 664 as->a_userlimit = (caddr_t)USERLIMIT;
665 665 as->a_lastgap = NULL;
666 666 as->a_lastgaphl = NULL;
667 667 as->a_callbacks = NULL;
668 668
669 669 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
670 670 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
671 671 AS_LOCK_EXIT(as, &as->a_lock);
672 672
673 673 as->a_xhat = NULL;
674 674
675 675 return (as);
676 676 }
677 677
678 678 /*
679 679 * Free an address space data structure.
680 680 * Need to free the hat first and then
681 681 * all the segments on this as and finally
682 682 * the space for the as struct itself.
683 683 */
684 684 void
685 685 as_free(struct as *as)
686 686 {
687 687 struct hat *hat = as->a_hat;
688 688 struct seg *seg, *next;
689 689 int called = 0;
690 690
691 691 top:
692 692 /*
693 693 * Invoke ALL callbacks. as_do_callbacks will do one callback
694 694 * per call, and not return (-1) until the callback has completed.
695 695 * When as_do_callbacks returns zero, all callbacks have completed.
696 696 */
697 697 mutex_enter(&as->a_contents);
698 698 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
699 699 ;
700 700
701 701 /* This will prevent new XHATs from attaching to as */
702 702 if (!called)
703 703 AS_SETBUSY(as);
704 704 mutex_exit(&as->a_contents);
705 705 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
706 706
707 707 if (!called) {
708 708 called = 1;
709 709 hat_free_start(hat);
710 710 if (as->a_xhat != NULL)
711 711 xhat_free_start_all(as);
712 712 }
713 713 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
714 714 int err;
715 715
716 716 next = AS_SEGNEXT(as, seg);
717 717 retry:
718 718 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
719 719 if (err == EAGAIN) {
720 720 mutex_enter(&as->a_contents);
721 721 if (as->a_callbacks) {
722 722 AS_LOCK_EXIT(as, &as->a_lock);
723 723 } else if (!AS_ISNOUNMAPWAIT(as)) {
724 724 /*
725 725 * Memory is currently locked. Wait for a
726 726 * cv_signal that it has been unlocked, then
727 727 * try the operation again.
728 728 */
729 729 if (AS_ISUNMAPWAIT(as) == 0)
730 730 cv_broadcast(&as->a_cv);
731 731 AS_SETUNMAPWAIT(as);
732 732 AS_LOCK_EXIT(as, &as->a_lock);
733 733 while (AS_ISUNMAPWAIT(as))
734 734 cv_wait(&as->a_cv, &as->a_contents);
735 735 } else {
736 736 /*
737 737 * We may have raced with
738 738 * segvn_reclaim()/segspt_reclaim(). In this
739 739 * case clean nounmapwait flag and retry since
740 740 * softlockcnt in this segment may be already
741 741 * 0. We don't drop as writer lock so our
742 742 * number of retries without sleeping should
743 743 * be very small. See segvn_reclaim() for
744 744 * more comments.
745 745 */
746 746 AS_CLRNOUNMAPWAIT(as);
747 747 mutex_exit(&as->a_contents);
748 748 goto retry;
749 749 }
750 750 mutex_exit(&as->a_contents);
751 751 goto top;
752 752 } else {
753 753 /*
754 754 * We do not expect any other error return at this
755 755 * time. This is similar to an ASSERT in seg_unmap()
756 756 */
757 757 ASSERT(err == 0);
758 758 }
759 759 }
760 760 hat_free_end(hat);
761 761 if (as->a_xhat != NULL)
762 762 xhat_free_end_all(as);
763 763 AS_LOCK_EXIT(as, &as->a_lock);
764 764
765 765 /* /proc stuff */
766 766 ASSERT(avl_numnodes(&as->a_wpage) == 0);
767 767 if (as->a_objectdir) {
768 768 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
769 769 as->a_objectdir = NULL;
770 770 as->a_sizedir = 0;
771 771 }
772 772
773 773 /*
774 774 * Free the struct as back to kmem. Assert it has no segments.
775 775 */
776 776 ASSERT(avl_numnodes(&as->a_segtree) == 0);
777 777 kmem_cache_free(as_cache, as);
778 778 }
779 779
780 780 int
781 781 as_dup(struct as *as, struct proc *forkedproc)
782 782 {
783 783 struct as *newas;
784 784 struct seg *seg, *newseg;
785 785 size_t purgesize = 0;
786 786 int error;
787 787
788 788 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
789 789 as_clearwatch(as);
790 790 newas = as_alloc();
791 791 newas->a_userlimit = as->a_userlimit;
792 792 newas->a_proc = forkedproc;
793 793
794 794 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
795 795
796 796 /* This will prevent new XHATs from attaching */
797 797 mutex_enter(&as->a_contents);
798 798 AS_SETBUSY(as);
799 799 mutex_exit(&as->a_contents);
800 800 mutex_enter(&newas->a_contents);
801 801 AS_SETBUSY(newas);
802 802 mutex_exit(&newas->a_contents);
803 803
804 804 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
805 805
806 806 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
807 807
808 808 if (seg->s_flags & S_PURGE) {
809 809 purgesize += seg->s_size;
810 810 continue;
811 811 }
812 812
813 813 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
814 814 if (newseg == NULL) {
815 815 AS_LOCK_EXIT(newas, &newas->a_lock);
816 816 as_setwatch(as);
817 817 mutex_enter(&as->a_contents);
818 818 AS_CLRBUSY(as);
819 819 mutex_exit(&as->a_contents);
820 820 AS_LOCK_EXIT(as, &as->a_lock);
821 821 as_free(newas);
822 822 return (-1);
823 823 }
824 824 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
825 825 /*
826 826 * We call seg_free() on the new seg
827 827 * because the segment is not set up
828 828 * completely; i.e. it has no ops.
829 829 */
830 830 as_setwatch(as);
831 831 mutex_enter(&as->a_contents);
832 832 AS_CLRBUSY(as);
833 833 mutex_exit(&as->a_contents);
834 834 AS_LOCK_EXIT(as, &as->a_lock);
835 835 seg_free(newseg);
836 836 AS_LOCK_EXIT(newas, &newas->a_lock);
837 837 as_free(newas);
838 838 return (error);
839 839 }
840 840 newas->a_size += seg->s_size;
841 841 }
842 842 newas->a_resvsize = as->a_resvsize - purgesize;
843 843
844 844 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
845 845 if (as->a_xhat != NULL)
846 846 error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
847 847
848 848 mutex_enter(&newas->a_contents);
849 849 AS_CLRBUSY(newas);
850 850 mutex_exit(&newas->a_contents);
851 851 AS_LOCK_EXIT(newas, &newas->a_lock);
852 852
853 853 as_setwatch(as);
854 854 mutex_enter(&as->a_contents);
855 855 AS_CLRBUSY(as);
856 856 mutex_exit(&as->a_contents);
857 857 AS_LOCK_EXIT(as, &as->a_lock);
858 858 if (error != 0) {
859 859 as_free(newas);
860 860 return (error);
861 861 }
862 862 forkedproc->p_as = newas;
863 863 return (0);
864 864 }
865 865
866 866 /*
867 867 * Handle a ``fault'' at addr for size bytes.
868 868 */
869 869 faultcode_t
870 870 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
871 871 enum fault_type type, enum seg_rw rw)
872 872 {
873 873 struct seg *seg;
874 874 caddr_t raddr; /* rounded down addr */
875 875 size_t rsize; /* rounded up size */
876 876 size_t ssize;
877 877 faultcode_t res = 0;
878 878 caddr_t addrsav;
879 879 struct seg *segsav;
880 880 int as_lock_held;
881 881 klwp_t *lwp = ttolwp(curthread);
882 882 int is_xhat = 0;
883 883 int holding_wpage = 0;
884 884 extern struct seg_ops segdev_ops;
885 885
886 886
887 887
888 888 if (as->a_hat != hat) {
889 889 /* This must be an XHAT then */
890 890 is_xhat = 1;
891 891
892 892 if ((type != F_INVAL) || (as == &kas))
893 893 return (FC_NOSUPPORT);
894 894 }
895 895
896 896 retry:
897 897 if (!is_xhat) {
898 898 /*
899 899 * Indicate that the lwp is not to be stopped while waiting
900 900 * for a pagefault. This is to avoid deadlock while debugging
901 901 * a process via /proc over NFS (in particular).
902 902 */
903 903 if (lwp != NULL)
904 904 lwp->lwp_nostop++;
905 905
906 906 /*
907 907 * same length must be used when we softlock and softunlock.
908 908 * We don't support softunlocking lengths less than
909 909 * the original length when there is largepage support.
910 910 * See seg_dev.c for more comments.
911 911 */
912 912 switch (type) {
913 913
914 914 case F_SOFTLOCK:
915 915 CPU_STATS_ADD_K(vm, softlock, 1);
916 916 break;
917 917
918 918 case F_SOFTUNLOCK:
919 919 break;
920 920
921 921 case F_PROT:
922 922 CPU_STATS_ADD_K(vm, prot_fault, 1);
923 923 break;
924 924
925 925 case F_INVAL:
926 926 CPU_STATS_ENTER_K();
927 927 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
928 928 if (as == &kas)
929 929 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
930 930 CPU_STATS_EXIT_K();
931 931 break;
932 932 }
933 933 }
934 934
935 935 /* Kernel probe */
936 936 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
937 937 tnf_opaque, address, addr,
938 938 tnf_fault_type, fault_type, type,
939 939 tnf_seg_access, access, rw);
940 940
941 941 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
942 942 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
943 943 (size_t)raddr;
944 944
945 945 /*
946 946 * XXX -- Don't grab the as lock for segkmap. We should grab it for
947 947 * correctness, but then we could be stuck holding this lock for
948 948 * a LONG time if the fault needs to be resolved on a slow
949 949 * filesystem, and then no-one will be able to exec new commands,
950 950 * as exec'ing requires the write lock on the as.
951 951 */
952 952 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
953 953 raddr + size < segkmap->s_base + segkmap->s_size) {
954 954 /*
955 955 * if (as==&kas), this can't be XHAT: we've already returned
956 956 * FC_NOSUPPORT.
957 957 */
958 958 seg = segkmap;
959 959 as_lock_held = 0;
960 960 } else {
961 961 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
962 962 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
963 963 /*
964 964 * Grab and hold the writers' lock on the as
965 965 * if the fault is to a watched page.
966 966 * This will keep CPUs from "peeking" at the
967 967 * address range while we're temporarily boosting
968 968 * the permissions for the XHAT device to
969 969 * resolve the fault in the segment layer.
970 970 *
971 971 * We could check whether faulted address
972 972 * is within a watched page and only then grab
973 973 * the writer lock, but this is simpler.
974 974 */
975 975 AS_LOCK_EXIT(as, &as->a_lock);
976 976 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
977 977 }
978 978
979 979 seg = as_segat(as, raddr);
980 980 if (seg == NULL) {
981 981 AS_LOCK_EXIT(as, &as->a_lock);
982 982 if ((lwp != NULL) && (!is_xhat))
983 983 lwp->lwp_nostop--;
984 984 return (FC_NOMAP);
985 985 }
986 986
987 987 as_lock_held = 1;
988 988 }
989 989
990 990 addrsav = raddr;
991 991 segsav = seg;
992 992
993 993 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
994 994 if (raddr >= seg->s_base + seg->s_size) {
995 995 seg = AS_SEGNEXT(as, seg);
996 996 if (seg == NULL || raddr != seg->s_base) {
997 997 res = FC_NOMAP;
998 998 break;
999 999 }
1000 1000 }
1001 1001 if (raddr + rsize > seg->s_base + seg->s_size)
1002 1002 ssize = seg->s_base + seg->s_size - raddr;
1003 1003 else
1004 1004 ssize = rsize;
1005 1005
1006 1006 if (!is_xhat || (seg->s_ops != &segdev_ops)) {
1007 1007
1008 1008 if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
1009 1009 pr_is_watchpage_as(raddr, rw, as)) {
1010 1010 /*
1011 1011 * Handle watch pages. If we're faulting on a
1012 1012 * watched page from an X-hat, we have to
1013 1013 * restore the original permissions while we
1014 1014 * handle the fault.
1015 1015 */
1016 1016 as_clearwatch(as);
1017 1017 holding_wpage = 1;
1018 1018 }
1019 1019
1020 1020 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
1021 1021
1022 1022 /* Restore watchpoints */
1023 1023 if (holding_wpage) {
1024 1024 as_setwatch(as);
1025 1025 holding_wpage = 0;
1026 1026 }
1027 1027
1028 1028 if (res != 0)
1029 1029 break;
1030 1030 } else {
1031 1031 /* XHAT does not support seg_dev */
1032 1032 res = FC_NOSUPPORT;
1033 1033 break;
1034 1034 }
1035 1035 }
1036 1036
1037 1037 /*
1038 1038 * If we were SOFTLOCKing and encountered a failure,
1039 1039 * we must SOFTUNLOCK the range we already did. (Maybe we
1040 1040 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
1041 1041 * right here...)
1042 1042 */
1043 1043 if (res != 0 && type == F_SOFTLOCK) {
1044 1044 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
1045 1045 if (addrsav >= seg->s_base + seg->s_size)
1046 1046 seg = AS_SEGNEXT(as, seg);
1047 1047 ASSERT(seg != NULL);
1048 1048 /*
1049 1049 * Now call the fault routine again to perform the
1050 1050 * unlock using S_OTHER instead of the rw variable
1051 1051 * since we never got a chance to touch the pages.
1052 1052 */
1053 1053 if (raddr > seg->s_base + seg->s_size)
1054 1054 ssize = seg->s_base + seg->s_size - addrsav;
1055 1055 else
1056 1056 ssize = raddr - addrsav;
1057 1057 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
1058 1058 F_SOFTUNLOCK, S_OTHER);
1059 1059 }
1060 1060 }
1061 1061 if (as_lock_held)
1062 1062 AS_LOCK_EXIT(as, &as->a_lock);
1063 1063 if ((lwp != NULL) && (!is_xhat))
1064 1064 lwp->lwp_nostop--;
1065 1065
1066 1066 /*
1067 1067 * If the lower levels returned EDEADLK for a fault,
1068 1068 * it means that we should retry the fault. Let's also wait
1069 1069 * a bit to let the deadlock-causing condition clear.
1070 1070 * This is part of a gross hack to work around a design flaw
1071 1071 * in the ufs/sds logging code and should go away when the
1072 1072 * logging code is re-designed to fix the problem. See bug
1073 1073 * 4125102 for details of the problem.
1074 1074 */
1075 1075 if (FC_ERRNO(res) == EDEADLK) {
1076 1076 delay(deadlk_wait);
1077 1077 res = 0;
1078 1078 goto retry;
1079 1079 }
1080 1080 return (res);
1081 1081 }
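
A sketch of the common F_SOFTLOCK/F_SOFTUNLOCK pairing around as_fault(), as used by physio-style callers; mydrv_lock_user_range is a hypothetical name, and mapping the fault code to an errno via FC_ERRNO() is an assumption here:

	static int
	mydrv_lock_user_range(struct as *as, caddr_t addr, size_t len)
	{
		faultcode_t res;

		res = as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE);
		if (res != 0)
			return (FC_ERRNO(res));	/* assumption: errno-coded failure */
		/* do I/O against the now-locked pages here */
		(void) as_fault(as->a_hat, as, addr, len, F_SOFTUNLOCK, S_WRITE);
		return (0);
	}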
1082 1082
1083 1083
1084 1084
1085 1085 /*
1086 1086 * Asynchronous ``fault'' at addr for size bytes.
1087 1087 */
1088 1088 faultcode_t
1089 1089 as_faulta(struct as *as, caddr_t addr, size_t size)
1090 1090 {
1091 1091 struct seg *seg;
1092 1092 caddr_t raddr; /* rounded down addr */
1093 1093 size_t rsize; /* rounded up size */
1094 1094 faultcode_t res = 0;
1095 1095 klwp_t *lwp = ttolwp(curthread);
1096 1096
1097 1097 retry:
1098 1098 /*
1099 1099 * Indicate that the lwp is not to be stopped while waiting
1100 1100 * for a pagefault. This is to avoid deadlock while debugging
1101 1101 * a process via /proc over NFS (in particular).
1102 1102 */
1103 1103 if (lwp != NULL)
1104 1104 lwp->lwp_nostop++;
1105 1105
1106 1106 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1107 1107 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1108 1108 (size_t)raddr;
1109 1109
1110 1110 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1111 1111 seg = as_segat(as, raddr);
1112 1112 if (seg == NULL) {
1113 1113 AS_LOCK_EXIT(as, &as->a_lock);
1114 1114 if (lwp != NULL)
1115 1115 lwp->lwp_nostop--;
1116 1116 return (FC_NOMAP);
1117 1117 }
1118 1118
1119 1119 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1120 1120 if (raddr >= seg->s_base + seg->s_size) {
1121 1121 seg = AS_SEGNEXT(as, seg);
1122 1122 if (seg == NULL || raddr != seg->s_base) {
1123 1123 res = FC_NOMAP;
1124 1124 break;
1125 1125 }
1126 1126 }
1127 1127 res = SEGOP_FAULTA(seg, raddr);
1128 1128 if (res != 0)
1129 1129 break;
1130 1130 }
1131 1131 AS_LOCK_EXIT(as, &as->a_lock);
1132 1132 if (lwp != NULL)
1133 1133 lwp->lwp_nostop--;
1134 1134 /*
1135 1135 * If the lower levels returned EDEADLK for a fault,
1136 1136 * it means that we should retry the fault. Let's also wait
1137 1137 * a bit to let the deadlock-causing condition clear.
1138 1138 * This is part of a gross hack to work around a design flaw
1139 1139 * in the ufs/sds logging code and should go away when the
1140 1140 * logging code is re-designed to fix the problem. See bug
1141 1141 * 4125102 for details of the problem.
1142 1142 */
1143 1143 if (FC_ERRNO(res) == EDEADLK) {
1144 1144 delay(deadlk_wait);
1145 1145 res = 0;
1146 1146 goto retry;
1147 1147 }
1148 1148 return (res);
1149 1149 }
1150 1150
1151 1151 /*
1152 1152 * Set the virtual mapping for the interval from [addr : addr + size)
1153 1153 * in address space `as' to have the specified protection.
1154 1154 * It is ok for the range to cross over several segments,
1155 1155 * as long as they are contiguous.
1156 1156 */
1157 1157 int
1158 1158 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1159 1159 {
1160 1160 struct seg *seg;
1161 1161 struct as_callback *cb;
1162 1162 size_t ssize;
1163 1163 caddr_t raddr; /* rounded down addr */
1164 1164 size_t rsize; /* rounded up size */
1165 1165 int error = 0, writer = 0;
1166 1166 caddr_t saveraddr;
1167 1167 size_t saversize;
1168 1168
1169 1169 setprot_top:
1170 1170 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1171 1171 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1172 1172 (size_t)raddr;
1173 1173
1174 1174 if (raddr + rsize < raddr) /* check for wraparound */
1175 1175 return (ENOMEM);
1176 1176
1177 1177 saveraddr = raddr;
1178 1178 saversize = rsize;
1179 1179
1180 1180 /*
1181 1181 * Normally we only lock the as as a reader. But
1182 1182 * if due to setprot the segment driver needs to split
1183 1183 * a segment it will return IE_RETRY. Therefore we re-acquire
1184 1184 * the as lock as a writer so the segment driver can change
1185 1185 * the seg list. Also the segment driver will return IE_RETRY
1186 1186 * after it has changed the segment list so we therefore keep
1187 1187 * locking as a writer. Since these operations should be rare,
1188 1188 * we want to only lock as a writer when necessary.
1189 1189 */
1190 1190 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1191 1191 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1192 1192 } else {
1193 1193 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1194 1194 }
1195 1195
1196 1196 as_clearwatchprot(as, raddr, rsize);
1197 1197 seg = as_segat(as, raddr);
1198 1198 if (seg == NULL) {
1199 1199 as_setwatch(as);
1200 1200 AS_LOCK_EXIT(as, &as->a_lock);
1201 1201 return (ENOMEM);
1202 1202 }
1203 1203
1204 1204 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1205 1205 if (raddr >= seg->s_base + seg->s_size) {
1206 1206 seg = AS_SEGNEXT(as, seg);
1207 1207 if (seg == NULL || raddr != seg->s_base) {
1208 1208 error = ENOMEM;
1209 1209 break;
1210 1210 }
1211 1211 }
1212 1212 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1213 1213 ssize = seg->s_base + seg->s_size - raddr;
1214 1214 else
1215 1215 ssize = rsize;
1216 1216 retry:
1217 1217 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1218 1218
1219 1219 if (error == IE_NOMEM) {
1220 1220 error = EAGAIN;
1221 1221 break;
1222 1222 }
1223 1223
1224 1224 if (error == IE_RETRY) {
1225 1225 AS_LOCK_EXIT(as, &as->a_lock);
1226 1226 writer = 1;
1227 1227 goto setprot_top;
1228 1228 }
1229 1229
1230 1230 if (error == EAGAIN) {
1231 1231 /*
1232 1232 * Make sure we have a_lock as writer.
1233 1233 */
1234 1234 if (writer == 0) {
1235 1235 AS_LOCK_EXIT(as, &as->a_lock);
1236 1236 writer = 1;
1237 1237 goto setprot_top;
1238 1238 }
1239 1239
1240 1240 /*
1241 1241 * Memory is currently locked. It must be unlocked
1242 1242 * before this operation can succeed through a retry.
1243 1243 * The possible reasons for locked memory and
1244 1244 * corresponding strategies for unlocking are:
1245 1245 * (1) Normal I/O
1246 1246 * wait for a signal that the I/O operation
1247 1247 * has completed and the memory is unlocked.
1248 1248 * (2) Asynchronous I/O
1249 1249 * The aio subsystem does not unlock pages when
1250 1250 * the I/O is completed. Those pages are unlocked
1251 1251 * when the application calls aiowait/aioerror.
1252 1252 * So, to prevent blocking forever, cv_broadcast()
1253 1253 * is done to wake up aio_cleanup_thread.
1254 1254 * Subsequently, segvn_reclaim will be called, and
1255 1255 * that will do AS_CLRUNMAPWAIT() and wake us up.
1256 1256 * (3) Long term page locking:
1257 1257 * Drivers intending to have pages locked for a
1258 1258 * period considerably longer than for normal I/O
1259 1259 * (essentially forever) may have registered for a
1260 1260 * callback so they may unlock these pages on
1261 1261 * request. This is needed to allow this operation
1262 1262 * to succeed. Each entry on the callback list is
1263 1263 * examined. If the event or address range pertains,
1264 1264 * the callback is invoked (unless it already is in
1265 1265 * progress). The a_contents lock must be dropped
1266 1266 * before the callback, so only one callback can
1267 1267 * be done at a time. Go to the top and do more
1268 1268 * until zero is returned. If zero is returned,
1269 1269 * either there were no callbacks for this event
1270 1270 * or they were already in progress.
1271 1271 */
1272 1272 mutex_enter(&as->a_contents);
1273 1273 if (as->a_callbacks &&
1274 1274 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1275 1275 seg->s_base, seg->s_size))) {
1276 1276 AS_LOCK_EXIT(as, &as->a_lock);
1277 1277 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1278 1278 } else if (!AS_ISNOUNMAPWAIT(as)) {
1279 1279 if (AS_ISUNMAPWAIT(as) == 0)
1280 1280 cv_broadcast(&as->a_cv);
1281 1281 AS_SETUNMAPWAIT(as);
1282 1282 AS_LOCK_EXIT(as, &as->a_lock);
1283 1283 while (AS_ISUNMAPWAIT(as))
1284 1284 cv_wait(&as->a_cv, &as->a_contents);
1285 1285 } else {
1286 1286 /*
1287 1287 * We may have raced with
1288 1288 * segvn_reclaim()/segspt_reclaim(). In this
1289 1289 * case clean nounmapwait flag and retry since
1290 1290 * softlockcnt in this segment may be already
1291 1291 * 0. We don't drop as writer lock so our
1292 1292 * number of retries without sleeping should
1293 1293 * be very small. See segvn_reclaim() for
1294 1294 * more comments.
1295 1295 */
1296 1296 AS_CLRNOUNMAPWAIT(as);
1297 1297 mutex_exit(&as->a_contents);
1298 1298 goto retry;
1299 1299 }
1300 1300 mutex_exit(&as->a_contents);
1301 1301 goto setprot_top;
1302 1302 } else if (error != 0)
1303 1303 break;
1304 1304 }
1305 1305 if (error != 0) {
1306 1306 as_setwatch(as);
1307 1307 } else {
1308 1308 as_setwatchprot(as, saveraddr, saversize, prot);
1309 1309 }
1310 1310 AS_LOCK_EXIT(as, &as->a_lock);
1311 1311 return (error);
1312 1312 }
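
For orientation, the mprotect(2) path reduces to a call of roughly this shape. This is a sketch only; the real syscall performs alignment and range validation first, and PROT_USER marks the mapping as user-accessible:

	static int
	mprotect_sketch(struct proc *p, caddr_t addr, size_t len, uint_t prot)
	{
		/* user mappings carry PROT_USER in addition to PROT_READ etc. */
		return (as_setprot(p->p_as, addr, len, prot | PROT_USER));
	}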
1313 1313
1314 1314 /*
1315 1315 * Check to make sure that the interval [addr, addr + size)
1316 1316 * in address space `as' has at least the specified protection.
1317 1317 * It is ok for the range to cross over several segments, as long
1318 1318 * as they are contiguous.
1319 1319 */
1320 1320 int
1321 1321 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1322 1322 {
1323 1323 struct seg *seg;
1324 1324 size_t ssize;
1325 1325 caddr_t raddr; /* rounded down addr */
1326 1326 size_t rsize; /* rounded up size */
1327 1327 int error = 0;
1328 1328
1329 1329 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1330 1330 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1331 1331 (size_t)raddr;
1332 1332
1333 1333 if (raddr + rsize < raddr) /* check for wraparound */
1334 1334 return (ENOMEM);
1335 1335
1336 1336 /*
1337 1337 * This is ugly as sin...
1338 1338 * Normally, we only acquire the address space readers lock.
1339 1339 * However, if the address space has watchpoints present,
1340 1340 * we must acquire the writer lock on the address space for
1341 1341 * the benefit of as_clearwatchprot() and as_setwatchprot().
1342 1342 */
1343 1343 if (avl_numnodes(&as->a_wpage) != 0)
1344 1344 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1345 1345 else
1346 1346 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1347 1347 as_clearwatchprot(as, raddr, rsize);
1348 1348 seg = as_segat(as, raddr);
1349 1349 if (seg == NULL) {
1350 1350 as_setwatch(as);
1351 1351 AS_LOCK_EXIT(as, &as->a_lock);
1352 1352 return (ENOMEM);
1353 1353 }
1354 1354
1355 1355 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1356 1356 if (raddr >= seg->s_base + seg->s_size) {
1357 1357 seg = AS_SEGNEXT(as, seg);
1358 1358 if (seg == NULL || raddr != seg->s_base) {
1359 1359 error = ENOMEM;
1360 1360 break;
1361 1361 }
1362 1362 }
1363 1363 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1364 1364 ssize = seg->s_base + seg->s_size - raddr;
1365 1365 else
1366 1366 ssize = rsize;
1367 1367
1368 1368 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1369 1369 if (error != 0)
1370 1370 break;
1371 1371 }
1372 1372 as_setwatch(as);
1373 1373 AS_LOCK_EXIT(as, &as->a_lock);
1374 1374 return (error);
1375 1375 }
1376 1376
1377 1377 int
1378 1378 as_unmap(struct as *as, caddr_t addr, size_t size)
1379 1379 {
1380 1380 struct seg *seg, *seg_next;
1381 1381 struct as_callback *cb;
1382 1382 caddr_t raddr, eaddr;
1383 1383 size_t ssize, rsize = 0;
1384 1384 int err;
1385 1385
1386 1386 top:
1387 1387 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1388 1388 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1389 1389 (uintptr_t)PAGEMASK);
1390 1390
1391 1391 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1392 1392
1393 1393 as->a_updatedir = 1; /* inform /proc */
1394 1394 gethrestime(&as->a_updatetime);
1395 1395
1396 1396 /*
1397 1397 * Use as_findseg to find the first segment in the range, then
1398 1398 * step through the segments in order, following s_next.
1399 1399 */
1400 1400 as_clearwatchprot(as, raddr, eaddr - raddr);
1401 1401
1402 1402 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1403 1403 if (eaddr <= seg->s_base)
1404 1404 break; /* eaddr was in a gap; all done */
1405 1405
1406 1406 /* this is implied by the test above */
1407 1407 ASSERT(raddr < eaddr);
1408 1408
1409 1409 if (raddr < seg->s_base)
1410 1410 raddr = seg->s_base; /* raddr was in a gap */
1411 1411
1412 1412 if (eaddr > (seg->s_base + seg->s_size))
1413 1413 ssize = seg->s_base + seg->s_size - raddr;
1414 1414 else
1415 1415 ssize = eaddr - raddr;
1416 1416
1417 1417 /*
1418 1418 * Save next segment pointer since seg can be
1419 1419 * destroyed during the segment unmap operation.
1420 1420 */
1421 1421 seg_next = AS_SEGNEXT(as, seg);
1422 1422
1423 1423 /*
1424 1424 * We didn't count /dev/null mappings, so ignore them here.
1425 1425 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1426 1426 * we have to do this check here while we have seg.)
1427 1427 */
1428 1428 rsize = 0;
1429 1429 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1430 1430 !SEG_IS_PARTIAL_RESV(seg))
1431 1431 rsize = ssize;
1432 1432
1433 1433 retry:
1434 1434 err = SEGOP_UNMAP(seg, raddr, ssize);
1435 1435 if (err == EAGAIN) {
1436 1436 /*
1437 1437 * Memory is currently locked. It must be unlocked
1438 1438 * before this operation can succeed through a retry.
1439 1439 * The possible reasons for locked memory and
1440 1440 * corresponding strategies for unlocking are:
1441 1441 * (1) Normal I/O
1442 1442 * wait for a signal that the I/O operation
1443 1443 * has completed and the memory is unlocked.
1444 1444 * (2) Asynchronous I/O
1445 1445 * The aio subsystem does not unlock pages when
1446 1446 * the I/O is completed. Those pages are unlocked
1447 1447 * when the application calls aiowait/aioerror.
1448 1448 * So, to prevent blocking forever, cv_broadcast()
1449 1449 * is done to wake up aio_cleanup_thread.
1450 1450 * Subsequently, segvn_reclaim will be called, and
1451 1451 * that will do AS_CLRUNMAPWAIT() and wake us up.
1452 1452 * (3) Long term page locking:
1453 1453 * Drivers intending to have pages locked for a
1454 1454 * period considerably longer than for normal I/O
1455 1455 * (essentially forever) may have registered for a
1456 1456 * callback so they may unlock these pages on
1457 1457 * request. This is needed to allow this operation
1458 1458 * to succeed. Each entry on the callback list is
1459 1459 * examined. If the event or address range pertains,
1460 1460 * the callback is invoked (unless it already is in
1461 1461 * progress). The a_contents lock must be dropped
1462 1462 * before the callback, so only one callback can
1463 1463 * be done at a time. Go to the top and do more
1464 1464 * until zero is returned. If zero is returned,
1465 1465 * either there were no callbacks for this event
1466 1466 * or they were already in progress.
1467 1467 */
1468 1468 mutex_enter(&as->a_contents);
1469 1469 if (as->a_callbacks &&
1470 1470 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1471 1471 seg->s_base, seg->s_size))) {
1472 1472 AS_LOCK_EXIT(as, &as->a_lock);
1473 1473 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1474 1474 } else if (!AS_ISNOUNMAPWAIT(as)) {
1475 1475 if (AS_ISUNMAPWAIT(as) == 0)
1476 1476 cv_broadcast(&as->a_cv);
1477 1477 AS_SETUNMAPWAIT(as);
1478 1478 AS_LOCK_EXIT(as, &as->a_lock);
1479 1479 while (AS_ISUNMAPWAIT(as))
1480 1480 cv_wait(&as->a_cv, &as->a_contents);
1481 1481 } else {
1482 1482 /*
1483 1483 * We may have raced with
1484 1484 * segvn_reclaim()/segspt_reclaim(). In this
1485 1485 * case clean nounmapwait flag and retry since
1486 1486 * softlockcnt in this segment may be already
1487 1487 * 0. We don't drop as writer lock so our
1488 1488 * number of retries without sleeping should
1489 1489 * be very small. See segvn_reclaim() for
1490 1490 * more comments.
1491 1491 */
1492 1492 AS_CLRNOUNMAPWAIT(as);
1493 1493 mutex_exit(&as->a_contents);
1494 1494 goto retry;
1495 1495 }
1496 1496 mutex_exit(&as->a_contents);
1497 1497 goto top;
1498 1498 } else if (err == IE_RETRY) {
1499 1499 AS_LOCK_EXIT(as, &as->a_lock);
1500 1500 goto top;
1501 1501 } else if (err) {
1502 1502 as_setwatch(as);
1503 1503 AS_LOCK_EXIT(as, &as->a_lock);
1504 1504 return (-1);
1505 1505 }
1506 1506
1507 1507 as->a_size -= ssize;
1508 1508 if (rsize)
1509 1509 as->a_resvsize -= rsize;
1510 1510 raddr += ssize;
1511 1511 }
1512 1512 AS_LOCK_EXIT(as, &as->a_lock);
1513 1513 return (0);
1514 1514 }
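
And a munmap(2)-style caller is essentially the following sketch; the -1 return above carries no errno, so the choice of EINVAL is an assumption:

	static int
	munmap_sketch(struct as *as, caddr_t addr, size_t len)
	{
		if (as_unmap(as, addr, len) != 0)
			return (EINVAL);	/* assumption: report failure as EINVAL */
		return (0);
	}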
1515 1515
1516 1516 static int
1517 1517 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1518 1518 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1519 1519 {
1520 1520 uint_t szc;
1521 1521 uint_t nszc;
1522 1522 int error;
1523 1523 caddr_t a;
1524 1524 caddr_t eaddr;
1525 1525 size_t segsize;
1526 1526 struct seg *seg;
1527 1527 size_t pgsz;
1528 1528 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1529 1529 uint_t save_szcvec;
1530 1530
1531 1531 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1532 1532 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1533 1533 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1534 1534 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1535 1535 if (!do_off) {
1536 1536 vn_a->offset = 0;
1537 1537 }
1538 1538
1539 1539 if (szcvec <= 1) {
1540 1540 seg = seg_alloc(as, addr, size);
1541 1541 if (seg == NULL) {
1542 1542 return (ENOMEM);
1543 1543 }
1544 1544 vn_a->szc = 0;
1545 1545 error = (*crfp)(seg, vn_a);
1546 1546 if (error != 0) {
1547 1547 seg_free(seg);
1548 1548 } else {
1549 1549 as->a_size += size;
1550 1550 as->a_resvsize += size;
1551 1551 }
1552 1552 return (error);
1553 1553 }
1554 1554
1555 1555 eaddr = addr + size;
1556 1556 save_szcvec = szcvec;
1557 1557 szcvec >>= 1;
1558 1558 szc = 0;
1559 1559 nszc = 0;
1560 1560 while (szcvec) {
1561 1561 if ((szcvec & 0x1) == 0) {
1562 1562 nszc++;
1563 1563 szcvec >>= 1;
1564 1564 continue;
1565 1565 }
1566 1566 nszc++;
1567 1567 pgsz = page_get_pagesize(nszc);
1568 1568 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1569 1569 if (a != addr) {
1570 1570 ASSERT(a < eaddr);
1571 1571 segsize = a - addr;
1572 1572 seg = seg_alloc(as, addr, segsize);
1573 1573 if (seg == NULL) {
1574 1574 return (ENOMEM);
1575 1575 }
1576 1576 vn_a->szc = szc;
1577 1577 error = (*crfp)(seg, vn_a);
1578 1578 if (error != 0) {
1579 1579 seg_free(seg);
1580 1580 return (error);
1581 1581 }
1582 1582 as->a_size += segsize;
1583 1583 as->a_resvsize += segsize;
1584 1584 *segcreated = 1;
1585 1585 if (do_off) {
1586 1586 vn_a->offset += segsize;
1587 1587 }
1588 1588 addr = a;
1589 1589 }
1590 1590 szc = nszc;
1591 1591 szcvec >>= 1;
1592 1592 }
1593 1593
1594 1594 ASSERT(addr < eaddr);
1595 1595 szcvec = save_szcvec | 1; /* add 8K pages */
1596 1596 while (szcvec) {
1597 1597 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1598 1598 ASSERT(a >= addr);
1599 1599 if (a != addr) {
1600 1600 segsize = a - addr;
1601 1601 seg = seg_alloc(as, addr, segsize);
1602 1602 if (seg == NULL) {
1603 1603 return (ENOMEM);
1604 1604 }
1605 1605 vn_a->szc = szc;
1606 1606 error = (*crfp)(seg, vn_a);
1607 1607 if (error != 0) {
1608 1608 seg_free(seg);
1609 1609 return (error);
1610 1610 }
1611 1611 as->a_size += segsize;
1612 1612 as->a_resvsize += segsize;
1613 1613 *segcreated = 1;
1614 1614 if (do_off) {
1615 1615 vn_a->offset += segsize;
1616 1616 }
1617 1617 addr = a;
1618 1618 }
1619 1619 szcvec &= ~(1 << szc);
1620 1620 if (szcvec) {
1621 1621 szc = highbit(szcvec) - 1;
1622 1622 pgsz = page_get_pagesize(szc);
1623 1623 }
1624 1624 }
1625 1625 ASSERT(addr == eaddr);
1626 1626
1627 1627 return (0);
1628 1628 }
1629 1629
1630 1630 static int
1631 1631 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1632 1632 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1633 1633 {
1634 1634 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1635 1635 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1636 1636 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1637 1637 type, 0);
1638 1638 int error;
1639 1639 struct seg *seg;
1640 1640 struct vattr va;
1641 1641 u_offset_t eoff;
1642 1642 size_t save_size = 0;
1643 1643 extern size_t textrepl_size_thresh;
1644 1644
1645 1645 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1646 1646 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1647 1647 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1648 1648 ASSERT(vn_a->vp != NULL);
1649 1649 ASSERT(vn_a->amp == NULL);
1650 1650
1651 1651 again:
1652 1652 if (szcvec <= 1) {
1653 1653 seg = seg_alloc(as, addr, size);
1654 1654 if (seg == NULL) {
1655 1655 return (ENOMEM);
1656 1656 }
1657 1657 vn_a->szc = 0;
1658 1658 error = (*crfp)(seg, vn_a);
1659 1659 if (error != 0) {
1660 1660 seg_free(seg);
1661 1661 } else {
1662 1662 as->a_size += size;
1663 1663 as->a_resvsize += size;
1664 1664 }
1665 1665 return (error);
1666 1666 }
1667 1667
1668 1668 va.va_mask = AT_SIZE;
1669 1669 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1670 1670 szcvec = 0;
1671 1671 goto again;
1672 1672 }
1673 1673 eoff = vn_a->offset & PAGEMASK;
1674 1674 if (eoff >= va.va_size) {
1675 1675 szcvec = 0;
1676 1676 goto again;
1677 1677 }
1678 1678 eoff += size;
1679 1679 if (btopr(va.va_size) < btopr(eoff)) {
1680 1680 save_size = size;
1681 1681 size = va.va_size - (vn_a->offset & PAGEMASK);
1682 1682 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1683 1683 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1684 1684 type, 0);
1685 1685 if (szcvec <= 1) {
1686 1686 size = save_size;
1687 1687 goto again;
1688 1688 }
1689 1689 }
1690 1690
1691 1691 if (size > textrepl_size_thresh) {
1692 1692 vn_a->flags |= _MAP_TEXTREPL;
1693 1693 }
1694 1694 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1695 1695 segcreated);
1696 1696 if (error != 0) {
1697 1697 return (error);
1698 1698 }
1699 1699 if (save_size) {
1700 1700 addr += size;
1701 1701 size = save_size - size;
1702 1702 szcvec = 0;
1703 1703 goto again;
1704 1704 }
1705 1705 return (0);
1706 1706 }
1707 1707
1708 1708 /*
1709 1709 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1710 1710  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1711 1711 */
1712 1712 static int
1713 1713 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1714 1714 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1715 1715 {
1716 1716 uint_t szcvec;
1717 1717 uchar_t type;
1718 1718
1719 1719 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1720 1720 if (vn_a->type == MAP_SHARED) {
1721 1721 type = MAPPGSZC_SHM;
1722 1722 } else if (vn_a->type == MAP_PRIVATE) {
1723 1723 if (vn_a->szc == AS_MAP_HEAP) {
1724 1724 type = MAPPGSZC_HEAP;
1725 1725 } else if (vn_a->szc == AS_MAP_STACK) {
1726 1726 type = MAPPGSZC_STACK;
1727 1727 } else {
1728 1728 type = MAPPGSZC_PRIVM;
1729 1729 }
1730 1730 }
1731 1731 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1732 1732 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1733 1733 (vn_a->flags & MAP_TEXT), type, 0);
1734 1734 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1735 1735 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1736 1736 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1737 1737 ASSERT(vn_a->vp == NULL);
1738 1738
1739 1739 return (as_map_segvn_segs(as, addr, size, szcvec,
1740 1740 crfp, vn_a, segcreated));
1741 1741 }
1742 1742
1743 1743 int
1744 1744 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1745 1745 {
1746 1746 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1747 1747 return (as_map_locked(as, addr, size, crfp, argsp));
1748 1748 }
1749 1749
1750 1750 int
1751 1751 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1752 1752 void *argsp)
1753 1753 {
1754 1754 struct seg *seg = NULL;
1755 1755 caddr_t raddr; /* rounded down addr */
1756 1756 size_t rsize; /* rounded up size */
1757 1757 int error;
1758 1758 int unmap = 0;
1759 1759 struct proc *p = curproc;
1760 1760 struct segvn_crargs crargs;
1761 1761
1762 1762 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1763 1763 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1764 1764 (size_t)raddr;
1765 1765
1766 1766 /*
1767 1767 * check for wrap around
1768 1768 */
1769 1769 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1770 1770 AS_LOCK_EXIT(as, &as->a_lock);
1771 1771 return (ENOMEM);
1772 1772 }
1773 1773
1774 1774 as->a_updatedir = 1; /* inform /proc */
1775 1775 gethrestime(&as->a_updatetime);
1776 1776
1777 1777 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1778 1778 AS_LOCK_EXIT(as, &as->a_lock);
1779 1779
1780 1780 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1781 1781 RCA_UNSAFE_ALL);
1782 1782
1783 1783 return (ENOMEM);
1784 1784 }
1785 1785
1786 1786 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1787 1787 crargs = *(struct segvn_crargs *)argsp;
1788 1788 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1789 1789 if (error != 0) {
1790 1790 AS_LOCK_EXIT(as, &as->a_lock);
1791 1791 if (unmap) {
1792 1792 (void) as_unmap(as, addr, size);
1793 1793 }
1794 1794 return (error);
1795 1795 }
1796 1796 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1797 1797 crargs = *(struct segvn_crargs *)argsp;
1798 1798 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1799 1799 if (error != 0) {
1800 1800 AS_LOCK_EXIT(as, &as->a_lock);
1801 1801 if (unmap) {
1802 1802 (void) as_unmap(as, addr, size);
1803 1803 }
1804 1804 return (error);
1805 1805 }
1806 1806 } else {
1807 1807 seg = seg_alloc(as, addr, size);
1808 1808 if (seg == NULL) {
1809 1809 AS_LOCK_EXIT(as, &as->a_lock);
1810 1810 return (ENOMEM);
1811 1811 }
1812 1812
1813 1813 error = (*crfp)(seg, argsp);
1814 1814 if (error != 0) {
1815 1815 seg_free(seg);
1816 1816 AS_LOCK_EXIT(as, &as->a_lock);
1817 1817 return (error);
1818 1818 }
1819 1819 /*
1820 1820 * Add size now so as_unmap will work if as_ctl fails.
1821 1821 */
1822 1822 as->a_size += rsize;
1823 1823 as->a_resvsize += rsize;
1824 1824 }
1825 1825
1826 1826 as_setwatch(as);
1827 1827
1828 1828 /*
1829 1829 * If the address space is locked,
1830 1830 * establish memory locks for the new segment.
1831 1831 */
1832 1832 mutex_enter(&as->a_contents);
1833 1833 if (AS_ISPGLCK(as)) {
1834 1834 mutex_exit(&as->a_contents);
1835 1835 AS_LOCK_EXIT(as, &as->a_lock);
1836 1836 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1837 1837 if (error != 0)
1838 1838 (void) as_unmap(as, addr, size);
1839 1839 } else {
1840 1840 mutex_exit(&as->a_contents);
1841 1841 AS_LOCK_EXIT(as, &as->a_lock);
1842 1842 }
1843 1843 return (error);
1844 1844 }
1845 1845
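The raddr/rsize normalization at the top of as_map_locked() (and repeated throughout this file) rounds the caller's range outward to page boundaries. A small user-space model, assuming an 8K PAGESIZE:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGESIZE	8192UL			/* assumed for the example */
	#define PAGEOFFSET	(PAGESIZE - 1)
	#define PAGEMASK	(~PAGEOFFSET)

	int
	main(void)
	{
		uintptr_t addr = 0x10003123;
		size_t size = 0x5000;

		uintptr_t raddr = addr & PAGEMASK;	/* round base down */
		size_t rsize = ((addr + size + PAGEOFFSET) & PAGEMASK) - raddr;

		/* prints raddr 0x10002000 rsize 0x8000 (four 8K pages) */
		printf("raddr %#lx rsize %#lx\n",
		    (unsigned long)raddr, (unsigned long)rsize);
		return (0);
	}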
1846 1846
1847 1847 /*
1848 1848 * Delete all segments in the address space marked with S_PURGE.
1849 1849 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1850 1850 * These segments are deleted as a first step before calls to as_gap(), so
1851 1851 * that they don't affect mmap() or shmat().
1852 1852 */
1853 1853 void
1854 1854 as_purge(struct as *as)
1855 1855 {
1856 1856 struct seg *seg;
1857 1857 struct seg *next_seg;
1858 1858
1859 1859 /*
1860 1860 	 * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1861 1861 * no need to grab a_contents mutex for this check
1862 1862 */
1863 1863 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1864 1864 return;
1865 1865
1866 1866 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1867 1867 next_seg = NULL;
1868 1868 seg = AS_SEGFIRST(as);
1869 1869 while (seg != NULL) {
1870 1870 next_seg = AS_SEGNEXT(as, seg);
1871 1871 if (seg->s_flags & S_PURGE)
1872 1872 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1873 1873 seg = next_seg;
1874 1874 }
1875 1875 AS_LOCK_EXIT(as, &as->a_lock);
1876 1876
1877 1877 mutex_enter(&as->a_contents);
1878 1878 as->a_flags &= ~AS_NEEDSPURGE;
1879 1879 mutex_exit(&as->a_contents);
1880 1880 }
1881 1881
1882 1882 /*
1883 1883 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1884 1884 * range of addresses at least "minlen" long, where the base of the range is
1885 1885 * at "off" phase from an "align" boundary and there is space for a
1886 1886  * "redzone"-sized redzone on either side of the range. Thus,
1887 1887 * if align was 4M and off was 16k, the user wants a hole which will start
1888 1888 * 16k into a 4M page.
1889 1889 *
1890 1890 * If flags specifies AH_HI, the hole will have the highest possible address
1891 1891 * in the range. We use the as->a_lastgap field to figure out where to
1892 1892 * start looking for a gap.
1893 1893 *
1894 1894 * Otherwise, the gap will have the lowest possible address.
1895 1895 *
1896 1896 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1897 1897 *
1898 1898 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1899 1899 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1900 1900 *
1901 1901 * NOTE: This routine is not correct when base+len overflows caddr_t.
1902 1902 */
1903 1903 int
1904 1904 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1905 1905 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1906 1906 {
1907 1907 caddr_t lobound = *basep;
1908 1908 caddr_t hibound = lobound + *lenp;
1909 1909 struct seg *lseg, *hseg;
1910 1910 caddr_t lo, hi;
1911 1911 int forward;
1912 1912 caddr_t save_base;
1913 1913 size_t save_len;
1914 1914 size_t save_minlen;
1915 1915 size_t save_redzone;
1916 1916 int fast_path = 1;
1917 1917
1918 1918 save_base = *basep;
1919 1919 save_len = *lenp;
1920 1920 save_minlen = minlen;
1921 1921 save_redzone = redzone;
1922 1922
1923 1923 /*
1924 1924 * For the first pass/fast_path, just add align and redzone into
1925 1925 * minlen since if we get an allocation, we can guarantee that it
1926 1926 * will fit the alignment and redzone requested.
1927 1927 * This increases the chance that hibound will be adjusted to
1928 1928 * a_lastgap->s_base which will likely allow us to find an
1929 1929 * acceptable hole in the address space quicker.
1930 1930 * If we can't find a hole with this fast_path, then we look for
1931 1931 * smaller holes in which the alignment and offset may allow
1932 1932 * the allocation to fit.
1933 1933 */
1934 1934 minlen += align;
1935 1935 minlen += 2 * redzone;
1936 1936 redzone = 0;
1937 1937
1938 1938 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1939 1939 if (AS_SEGFIRST(as) == NULL) {
1940 1940 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1941 1941 align, redzone, off)) {
1942 1942 AS_LOCK_EXIT(as, &as->a_lock);
1943 1943 return (0);
1944 1944 } else {
1945 1945 AS_LOCK_EXIT(as, &as->a_lock);
1946 1946 *basep = save_base;
1947 1947 *lenp = save_len;
1948 1948 return (-1);
1949 1949 }
1950 1950 }
1951 1951
1952 1952 retry:
1953 1953 /*
1954 1954 * Set up to iterate over all the inter-segment holes in the given
1955 1955 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1956 1956 * NULL for the highest-addressed hole. If moving backwards, we reset
1957 1957 	 * hseg to denote the highest-addressed segment.
1958 1958 */
1959 1959 forward = (flags & AH_DIR) == AH_LO;
1960 1960 if (forward) {
1961 1961 hseg = as_findseg(as, lobound, 1);
1962 1962 lseg = AS_SEGPREV(as, hseg);
1963 1963 } else {
1964 1964
1965 1965 /*
1966 1966 * If allocating at least as much as the last allocation,
1967 1967 * use a_lastgap's base as a better estimate of hibound.
1968 1968 */
1969 1969 if (as->a_lastgap &&
1970 1970 minlen >= as->a_lastgap->s_size &&
1971 1971 hibound >= as->a_lastgap->s_base)
1972 1972 hibound = as->a_lastgap->s_base;
1973 1973
1974 1974 hseg = as_findseg(as, hibound, 1);
1975 1975 if (hseg->s_base + hseg->s_size < hibound) {
1976 1976 lseg = hseg;
1977 1977 hseg = NULL;
1978 1978 } else {
1979 1979 lseg = AS_SEGPREV(as, hseg);
1980 1980 }
1981 1981 }
1982 1982
1983 1983 for (;;) {
1984 1984 /*
1985 1985 * Set lo and hi to the hole's boundaries. (We should really
1986 1986 * use MAXADDR in place of hibound in the expression below,
1987 1987 * but can't express it easily; using hibound in its place is
1988 1988 * harmless.)
1989 1989 */
1990 1990 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1991 1991 hi = (hseg == NULL) ? hibound : hseg->s_base;
1992 1992 /*
1993 1993 * If the iteration has moved past the interval from lobound
1994 1994 * to hibound it's pointless to continue.
1995 1995 */
1996 1996 if ((forward && lo > hibound) || (!forward && hi < lobound))
1997 1997 break;
1998 1998 else if (lo > hibound || hi < lobound)
1999 1999 goto cont;
2000 2000 /*
2001 2001 * Candidate hole lies at least partially within the allowable
2002 2002 * range. Restrict it to fall completely within that range,
2003 2003 * i.e., to [max(lo, lobound), min(hi, hibound)].
2004 2004 */
2005 2005 if (lo < lobound)
2006 2006 lo = lobound;
2007 2007 if (hi > hibound)
2008 2008 hi = hibound;
2009 2009 /*
2010 2010 * Verify that the candidate hole is big enough and meets
2011 2011 * hardware constraints. If the hole is too small, no need
2012 2012 * to do the further checks since they will fail.
2013 2013 */
2014 2014 *basep = lo;
2015 2015 *lenp = hi - lo;
2016 2016 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
2017 2017 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
2018 2018 ((flags & AH_CONTAIN) == 0 ||
2019 2019 (*basep <= addr && *basep + *lenp > addr))) {
2020 2020 if (!forward)
2021 2021 as->a_lastgap = hseg;
2022 2022 if (hseg != NULL)
2023 2023 as->a_lastgaphl = hseg;
2024 2024 else
2025 2025 as->a_lastgaphl = lseg;
2026 2026 AS_LOCK_EXIT(as, &as->a_lock);
2027 2027 return (0);
2028 2028 }
2029 2029 cont:
2030 2030 /*
2031 2031 * Move to the next hole.
2032 2032 */
2033 2033 if (forward) {
2034 2034 lseg = hseg;
2035 2035 if (lseg == NULL)
2036 2036 break;
2037 2037 hseg = AS_SEGNEXT(as, hseg);
2038 2038 } else {
2039 2039 hseg = lseg;
2040 2040 if (hseg == NULL)
2041 2041 break;
2042 2042 lseg = AS_SEGPREV(as, lseg);
2043 2043 }
2044 2044 }
2045 2045 if (fast_path && (align != 0 || save_redzone != 0)) {
2046 2046 fast_path = 0;
2047 2047 minlen = save_minlen;
2048 2048 redzone = save_redzone;
2049 2049 goto retry;
2050 2050 }
2051 2051 *basep = save_base;
2052 2052 *lenp = save_len;
2053 2053 AS_LOCK_EXIT(as, &as->a_lock);
2054 2054 return (-1);
2055 2055 }
2056 2056
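A usage sketch of the interface described above; the caller, the range, and all constants are hypothetical:

	/*
	 * Hypothetical caller (illustrative values): search [0x10000000,
	 * 0x10000000 + 1G) from the top down (AH_HI) for at least 1M that
	 * begins 16K into a 4M boundary, with an 8K redzone on either side.
	 */
	static int
	find_hole(struct as *as, caddr_t *basep, size_t *lenp)
	{
		*basep = (caddr_t)0x10000000;
		*lenp = 0x40000000;

		return (as_gap_aligned(as, 0x100000, basep, lenp, AH_HI,
		    NULL, 0x400000, 0x2000, 0x4000));
	}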
2057 2057 /*
2058 2058 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2059 2059 *
2060 2060 * If flags specifies AH_HI, the hole will have the highest possible address
2061 2061 * in the range. We use the as->a_lastgap field to figure out where to
2062 2062 * start looking for a gap.
2063 2063 *
2064 2064 * Otherwise, the gap will have the lowest possible address.
2065 2065 *
2066 2066 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2067 2067 *
2068 2068 * If an adequate hole is found, base and len are set to reflect the part of
2069 2069 * the hole that is within range, and 0 is returned, otherwise,
2070 2070 * -1 is returned.
2071 2071 *
2072 2072 * NOTE: This routine is not correct when base+len overflows caddr_t.
2073 2073 */
2074 2074 int
2075 2075 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2076 2076 caddr_t addr)
2077 2077 {
2078 2078
2079 2079 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2080 2080 }
2081 2081
2082 2082 /*
2083 2083 * Return the next range within [base, base + len) that is backed
2084 2084 * with "real memory". Skip holes and non-seg_vn segments.
2085 2085 * We're lazy and only return one segment at a time.
2086 2086 */
2087 2087 int
2088 2088 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2089 2089 {
2090 2090 extern struct seg_ops segspt_shmops; /* needs a header file */
2091 2091 struct seg *seg;
2092 2092 caddr_t addr, eaddr;
2093 2093 caddr_t segend;
2094 2094
2095 2095 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2096 2096
2097 2097 addr = *basep;
2098 2098 eaddr = addr + *lenp;
2099 2099
2100 2100 seg = as_findseg(as, addr, 0);
2101 2101 if (seg != NULL)
2102 2102 addr = MAX(seg->s_base, addr);
2103 2103
2104 2104 for (;;) {
2105 2105 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2106 2106 AS_LOCK_EXIT(as, &as->a_lock);
2107 2107 return (EINVAL);
2108 2108 }
2109 2109
2110 2110 if (seg->s_ops == &segvn_ops) {
2111 2111 segend = seg->s_base + seg->s_size;
2112 2112 break;
2113 2113 }
2114 2114
2115 2115 /*
2116 2116 * We do ISM by looking into the private data
2117 2117 * to determine the real size of the segment.
2118 2118 */
2119 2119 if (seg->s_ops == &segspt_shmops) {
2120 2120 segend = seg->s_base + spt_realsize(seg);
2121 2121 if (addr < segend)
2122 2122 break;
2123 2123 }
2124 2124
2125 2125 seg = AS_SEGNEXT(as, seg);
2126 2126
2127 2127 if (seg != NULL)
2128 2128 addr = seg->s_base;
2129 2129 }
2130 2130
2131 2131 *basep = addr;
2132 2132
2133 2133 if (segend > eaddr)
2134 2134 *lenp = eaddr - addr;
2135 2135 else
2136 2136 *lenp = segend - addr;
2137 2137
2138 2138 AS_LOCK_EXIT(as, &as->a_lock);
2139 2139 return (0);
2140 2140 }
2141 2141
2142 2142 /*
2143 - * Swap the pages associated with the address space as out to
2144 - * secondary storage, returning the number of bytes actually
2145 - * swapped.
2146 - *
2147 - * The value returned is intended to correlate well with the process's
2148 - * memory requirements. Its usefulness for this purpose depends on
2149 - * how well the segment-level routines do at returning accurate
2150 - * information.
2151 - */
2152 -size_t
2153 -as_swapout(struct as *as)
2154 -{
2155 - struct seg *seg;
2156 - size_t swpcnt = 0;
2157 -
2158 - /*
2159 - * Kernel-only processes have given up their address
2160 - * spaces. Of course, we shouldn't be attempting to
2161 - * swap out such processes in the first place...
2162 - */
2163 - if (as == NULL)
2164 - return (0);
2165 -
2166 - AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2167 -
2168 - /* Prevent XHATs from attaching */
2169 - mutex_enter(&as->a_contents);
2170 - AS_SETBUSY(as);
2171 - mutex_exit(&as->a_contents);
2172 -
2173 -
2174 - /*
2175 - * Free all mapping resources associated with the address
2176 - * space. The segment-level swapout routines capitalize
2177 - * on this unmapping by scavanging pages that have become
2178 - * unmapped here.
2179 - */
2180 - hat_swapout(as->a_hat);
2181 - if (as->a_xhat != NULL)
2182 - xhat_swapout_all(as);
2183 -
2184 - mutex_enter(&as->a_contents);
2185 - AS_CLRBUSY(as);
2186 - mutex_exit(&as->a_contents);
2187 -
2188 - /*
2189 - * Call the swapout routines of all segments in the address
2190 - * space to do the actual work, accumulating the amount of
2191 - * space reclaimed.
2192 - */
2193 - for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2194 - struct seg_ops *ov = seg->s_ops;
2195 -
2196 - /*
2197 - * We have to check to see if the seg has
2198 - * an ops vector because the seg may have
2199 - * been in the middle of being set up when
2200 - * the process was picked for swapout.
2201 - */
2202 - if ((ov != NULL) && (ov->swapout != NULL))
2203 - swpcnt += SEGOP_SWAPOUT(seg);
2204 - }
2205 - AS_LOCK_EXIT(as, &as->a_lock);
2206 - return (swpcnt);
2207 -}
2208 -
2209 -/*
2210 2143 * Determine whether data from the mappings in interval [addr, addr + size)
2211 2144 * are in the primary memory (core) cache.
2212 2145 */
2213 2146 int
2214 2147 as_incore(struct as *as, caddr_t addr,
2215 2148 size_t size, char *vec, size_t *sizep)
2216 2149 {
2217 2150 struct seg *seg;
2218 2151 size_t ssize;
2219 2152 caddr_t raddr; /* rounded down addr */
2220 2153 size_t rsize; /* rounded up size */
2221 2154 size_t isize; /* iteration size */
2222 2155 int error = 0; /* result, assume success */
2223 2156
2224 2157 *sizep = 0;
2225 2158 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2226 2159 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2227 2160 (size_t)raddr;
2228 2161
2229 2162 if (raddr + rsize < raddr) /* check for wraparound */
2230 2163 return (ENOMEM);
2231 2164
2232 2165 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2233 2166 seg = as_segat(as, raddr);
2234 2167 if (seg == NULL) {
2235 2168 AS_LOCK_EXIT(as, &as->a_lock);
2236 2169 return (-1);
2237 2170 }
2238 2171
2239 2172 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2240 2173 if (raddr >= seg->s_base + seg->s_size) {
2241 2174 seg = AS_SEGNEXT(as, seg);
2242 2175 if (seg == NULL || raddr != seg->s_base) {
2243 2176 error = -1;
2244 2177 break;
2245 2178 }
2246 2179 }
2247 2180 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2248 2181 ssize = seg->s_base + seg->s_size - raddr;
2249 2182 else
2250 2183 ssize = rsize;
2251 2184 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2252 2185 if (isize != ssize) {
2253 2186 error = -1;
2254 2187 break;
2255 2188 }
2256 2189 vec += btopr(ssize);
2257 2190 }
2258 2191 AS_LOCK_EXIT(as, &as->a_lock);
2259 2192 return (error);
2260 2193 }
2261 2194
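A sketch of a hypothetical as_incore() caller; the vec buffer is one byte per page of the rounded range, in the style of mincore(2), and the kmem_alloc() sizing is illustrative:

	static int
	probe_incore(struct as *as, caddr_t addr, size_t size)
	{
		size_t npg = btopr(size);
		char *vec = kmem_alloc(npg, KM_SLEEP);
		size_t incore_bytes;
		int err;

		err = as_incore(as, addr, size, vec, &incore_bytes);
		/* On success, vec[i] != 0 means page i of the range is resident. */
		kmem_free(vec, npg);
		return (err);
	}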
2262 2195 static void
2263 2196 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2264 2197 ulong_t *bitmap, size_t position, size_t npages)
2265 2198 {
2266 2199 caddr_t range_start;
2267 2200 size_t pos1 = position;
2268 2201 size_t pos2;
2269 2202 size_t size;
2270 2203 size_t end_pos = npages + position;
2271 2204
2272 2205 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2273 2206 size = ptob((pos2 - pos1));
2274 2207 range_start = (caddr_t)((uintptr_t)addr +
2275 2208 ptob(pos1 - position));
2276 2209
2277 2210 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2278 2211 (ulong_t *)NULL, (size_t)NULL);
2279 2212 pos1 = pos2;
2280 2213 }
2281 2214 }
2282 2215
2283 2216 static void
2284 2217 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2285 2218 caddr_t raddr, size_t rsize)
2286 2219 {
2287 2220 struct seg *seg = as_segat(as, raddr);
2288 2221 size_t ssize;
2289 2222
2290 2223 while (rsize != 0) {
2291 2224 if (raddr >= seg->s_base + seg->s_size)
2292 2225 seg = AS_SEGNEXT(as, seg);
2293 2226
2294 2227 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2295 2228 ssize = seg->s_base + seg->s_size - raddr;
2296 2229 else
2297 2230 ssize = rsize;
2298 2231
2299 2232 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2300 2233
2301 2234 rsize -= ssize;
2302 2235 raddr += ssize;
2303 2236 }
2304 2237 }
2305 2238
2306 2239 /*
2307 2240 * Cache control operations over the interval [addr, addr + size) in
2308 2241 * address space "as".
2309 2242 */
2310 2243 /*ARGSUSED*/
2311 2244 int
2312 2245 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2313 2246 uintptr_t arg, ulong_t *lock_map, size_t pos)
2314 2247 {
2315 2248 struct seg *seg; /* working segment */
2316 2249 caddr_t raddr; /* rounded down addr */
2317 2250 caddr_t initraddr; /* saved initial rounded down addr */
2318 2251 size_t rsize; /* rounded up size */
2319 2252 size_t initrsize; /* saved initial rounded up size */
2320 2253 size_t ssize; /* size of seg */
2321 2254 int error = 0; /* result */
2322 2255 size_t mlock_size; /* size of bitmap */
2323 2256 ulong_t *mlock_map; /* pointer to bitmap used */
2324 2257 /* to represent the locked */
2325 2258 /* pages. */
2326 2259 retry:
2327 2260 if (error == IE_RETRY)
2328 2261 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2329 2262 else
2330 2263 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2331 2264
2332 2265 /*
2333 2266 * If these are address space lock/unlock operations, loop over
2334 2267 * all segments in the address space, as appropriate.
2335 2268 */
2336 2269 if (func == MC_LOCKAS) {
2337 2270 size_t npages, idx;
2338 2271 size_t rlen = 0; /* rounded as length */
2339 2272
2340 2273 idx = pos;
2341 2274
2342 2275 if (arg & MCL_FUTURE) {
2343 2276 mutex_enter(&as->a_contents);
2344 2277 AS_SETPGLCK(as);
2345 2278 mutex_exit(&as->a_contents);
2346 2279 }
2347 2280 if ((arg & MCL_CURRENT) == 0) {
2348 2281 AS_LOCK_EXIT(as, &as->a_lock);
2349 2282 return (0);
2350 2283 }
2351 2284
2352 2285 seg = AS_SEGFIRST(as);
2353 2286 if (seg == NULL) {
2354 2287 AS_LOCK_EXIT(as, &as->a_lock);
2355 2288 return (0);
2356 2289 }
2357 2290
2358 2291 do {
2359 2292 raddr = (caddr_t)((uintptr_t)seg->s_base &
2360 2293 (uintptr_t)PAGEMASK);
2361 2294 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2362 2295 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2363 2296 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2364 2297
2365 2298 mlock_size = BT_BITOUL(btopr(rlen));
2366 2299 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2367 2300 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2368 2301 AS_LOCK_EXIT(as, &as->a_lock);
2369 2302 return (EAGAIN);
2370 2303 }
2371 2304
2372 2305 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2373 2306 error = SEGOP_LOCKOP(seg, seg->s_base,
2374 2307 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2375 2308 if (error != 0)
2376 2309 break;
2377 2310 pos += seg_pages(seg);
2378 2311 }
2379 2312
2380 2313 if (error) {
2381 2314 for (seg = AS_SEGFIRST(as); seg != NULL;
2382 2315 seg = AS_SEGNEXT(as, seg)) {
2383 2316
2384 2317 raddr = (caddr_t)((uintptr_t)seg->s_base &
2385 2318 (uintptr_t)PAGEMASK);
2386 2319 npages = seg_pages(seg);
2387 2320 as_segunlock(seg, raddr, attr, mlock_map,
2388 2321 idx, npages);
2389 2322 idx += npages;
2390 2323 }
2391 2324 }
2392 2325
2393 2326 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2394 2327 AS_LOCK_EXIT(as, &as->a_lock);
2395 2328 goto lockerr;
2396 2329 } else if (func == MC_UNLOCKAS) {
2397 2330 mutex_enter(&as->a_contents);
2398 2331 AS_CLRPGLCK(as);
2399 2332 mutex_exit(&as->a_contents);
2400 2333
2401 2334 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2402 2335 error = SEGOP_LOCKOP(seg, seg->s_base,
2403 2336 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2404 2337 if (error != 0)
2405 2338 break;
2406 2339 }
2407 2340
2408 2341 AS_LOCK_EXIT(as, &as->a_lock);
2409 2342 goto lockerr;
2410 2343 }
2411 2344
2412 2345 /*
2413 2346 * Normalize addresses and sizes.
2414 2347 */
2415 2348 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2416 2349 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2417 2350 (size_t)raddr;
2418 2351
2419 2352 if (raddr + rsize < raddr) { /* check for wraparound */
2420 2353 AS_LOCK_EXIT(as, &as->a_lock);
2421 2354 return (ENOMEM);
2422 2355 }
2423 2356
2424 2357 /*
2425 2358 * Get initial segment.
2426 2359 */
2427 2360 if ((seg = as_segat(as, raddr)) == NULL) {
2428 2361 AS_LOCK_EXIT(as, &as->a_lock);
2429 2362 return (ENOMEM);
2430 2363 }
2431 2364
2432 2365 if (func == MC_LOCK) {
2433 2366 mlock_size = BT_BITOUL(btopr(rsize));
2434 2367 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2435 2368 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2436 2369 AS_LOCK_EXIT(as, &as->a_lock);
2437 2370 return (EAGAIN);
2438 2371 }
2439 2372 }
2440 2373
2441 2374 /*
2442 2375 * Loop over all segments. If a hole in the address range is
2443 2376 * discovered, then fail. For each segment, perform the appropriate
2444 2377 * control operation.
2445 2378 */
2446 2379 while (rsize != 0) {
2447 2380
2448 2381 /*
2449 2382 * Make sure there's no hole, calculate the portion
2450 2383 * of the next segment to be operated over.
2451 2384 */
2452 2385 if (raddr >= seg->s_base + seg->s_size) {
2453 2386 seg = AS_SEGNEXT(as, seg);
2454 2387 if (seg == NULL || raddr != seg->s_base) {
2455 2388 if (func == MC_LOCK) {
2456 2389 as_unlockerr(as, attr, mlock_map,
2457 2390 initraddr, initrsize - rsize);
2458 2391 kmem_free(mlock_map,
2459 2392 mlock_size * sizeof (ulong_t));
2460 2393 }
2461 2394 AS_LOCK_EXIT(as, &as->a_lock);
2462 2395 return (ENOMEM);
2463 2396 }
2464 2397 }
2465 2398 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2466 2399 ssize = seg->s_base + seg->s_size - raddr;
2467 2400 else
2468 2401 ssize = rsize;
2469 2402
2470 2403 /*
2471 2404 * Dispatch on specific function.
2472 2405 */
2473 2406 switch (func) {
2474 2407
2475 2408 /*
2476 2409 * Synchronize cached data from mappings with backing
2477 2410 * objects.
2478 2411 */
2479 2412 case MC_SYNC:
2480 2413 if (error = SEGOP_SYNC(seg, raddr, ssize,
2481 2414 attr, (uint_t)arg)) {
2482 2415 AS_LOCK_EXIT(as, &as->a_lock);
2483 2416 return (error);
2484 2417 }
2485 2418 break;
2486 2419
2487 2420 /*
2488 2421 * Lock pages in memory.
2489 2422 */
2490 2423 case MC_LOCK:
2491 2424 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2492 2425 attr, func, mlock_map, pos)) {
2493 2426 as_unlockerr(as, attr, mlock_map, initraddr,
2494 2427 initrsize - rsize + ssize);
2495 2428 kmem_free(mlock_map, mlock_size *
2496 2429 sizeof (ulong_t));
2497 2430 AS_LOCK_EXIT(as, &as->a_lock);
2498 2431 goto lockerr;
2499 2432 }
2500 2433 break;
2501 2434
2502 2435 /*
2503 2436 * Unlock mapped pages.
2504 2437 */
2505 2438 case MC_UNLOCK:
2506 2439 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2507 2440 (ulong_t *)NULL, (size_t)NULL);
2508 2441 break;
2509 2442
2510 2443 /*
2511 2444 * Store VM advise for mapped pages in segment layer.
2512 2445 */
2513 2446 case MC_ADVISE:
2514 2447 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2515 2448
2516 2449 /*
2517 2450 * Check for regular errors and special retry error
2518 2451 */
2519 2452 if (error) {
2520 2453 if (error == IE_RETRY) {
2521 2454 /*
2522 2455 * Need to acquire writers lock, so
2523 2456 * have to drop readers lock and start
2524 2457 * all over again
2525 2458 */
2526 2459 AS_LOCK_EXIT(as, &as->a_lock);
2527 2460 goto retry;
2528 2461 } else if (error == IE_REATTACH) {
2529 2462 /*
2530 2463 * Find segment for current address
2531 2464 * because current segment just got
2532 2465 * split or concatenated
2533 2466 */
2534 2467 seg = as_segat(as, raddr);
2535 2468 if (seg == NULL) {
2536 2469 AS_LOCK_EXIT(as, &as->a_lock);
2537 2470 return (ENOMEM);
2538 2471 }
2539 2472 } else {
2540 2473 /*
2541 2474 * Regular error
2542 2475 */
2543 2476 AS_LOCK_EXIT(as, &as->a_lock);
2544 2477 return (error);
2545 2478 }
2546 2479 }
2547 2480 break;
2548 2481
2549 2482 /*
2550 2483 * Can't happen.
2551 2484 */
2552 2485 default:
2553 2486 panic("as_ctl: bad operation %d", func);
2554 2487 /*NOTREACHED*/
2555 2488 }
2556 2489
2557 2490 rsize -= ssize;
2558 2491 raddr += ssize;
2559 2492 }
2560 2493
2561 2494 if (func == MC_LOCK)
2562 2495 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2563 2496 AS_LOCK_EXIT(as, &as->a_lock);
2564 2497 return (0);
2565 2498 lockerr:
2566 2499
2567 2500 /*
2568 2501 * If the lower levels returned EDEADLK for a segment lockop,
2569 2502 * it means that we should retry the operation. Let's wait
2570 2503 * a bit also to let the deadlock causing condition clear.
2571 2504 * This is part of a gross hack to work around a design flaw
2572 2505 * in the ufs/sds logging code and should go away when the
2573 2506 * logging code is re-designed to fix the problem. See bug
2574 2507 * 4125102 for details of the problem.
2575 2508 */
2576 2509 if (error == EDEADLK) {
2577 2510 delay(deadlk_wait);
2578 2511 error = 0;
2579 2512 goto retry;
2580 2513 }
2581 2514 return (error);
2582 2515 }
2583 2516
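To make the MC_LOCK bitmap sizing concrete: BT_BITOUL(btopr(len)) reserves one bit per page, rounded up to whole ulong_t words. A user-space model assuming 8K pages and 64-bit longs:

	#include <stdio.h>

	#define PAGESHIFT	13			/* 8K pages (assumed) */
	#define NBBY		8
	#define BT_NBIPUL	(sizeof (unsigned long) * NBBY)
	#define btopr(x)	(((x) + (1UL << PAGESHIFT) - 1) >> PAGESHIFT)
	#define BT_BITOUL(n)	(((n) + BT_NBIPUL - 1) / BT_NBIPUL)

	int
	main(void)
	{
		size_t len = 10 * 1024 * 1024;		/* a 10M locked range */
		size_t pages = btopr(len);		/* 1280 pages */
		size_t words = BT_BITOUL(pages);	/* 20 64-bit words */

		printf("%zu pages -> %zu-word lock bitmap\n", pages, words);
		return (0);
	}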
2584 2517 int
2585 2518 fc_decode(faultcode_t fault_err)
2586 2519 {
2587 2520 int error = 0;
2588 2521
2589 2522 switch (FC_CODE(fault_err)) {
2590 2523 case FC_OBJERR:
2591 2524 error = FC_ERRNO(fault_err);
2592 2525 break;
2593 2526 case FC_PROT:
2594 2527 error = EACCES;
2595 2528 break;
2596 2529 default:
2597 2530 error = EFAULT;
2598 2531 break;
2599 2532 }
2600 2533 return (error);
2601 2534 }
2602 2535
2603 2536 /*
2604 2537 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2605 2538 * lists from each segment and copy them to one contiguous shadow list (plist)
2606 2539 * as expected by the caller. Save pointers to per segment shadow lists at
2607 2540 * the tail of plist so that they can be used during as_pageunlock().
2608 2541 */
2609 2542 static int
2610 2543 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2611 2544 caddr_t addr, size_t size, enum seg_rw rw)
2612 2545 {
2613 2546 caddr_t sv_addr = addr;
2614 2547 size_t sv_size = size;
2615 2548 struct seg *sv_seg = seg;
2616 2549 ulong_t segcnt = 1;
2617 2550 ulong_t cnt;
2618 2551 size_t ssize;
2619 2552 pgcnt_t npages = btop(size);
2620 2553 page_t **plist;
2621 2554 page_t **pl;
2622 2555 int error;
2623 2556 caddr_t eaddr;
2624 2557 faultcode_t fault_err = 0;
2625 2558 pgcnt_t pl_off;
2626 2559 extern struct seg_ops segspt_shmops;
2627 2560
2628 2561 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2629 2562 ASSERT(seg != NULL);
2630 2563 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2631 2564 ASSERT(addr + size > seg->s_base + seg->s_size);
2632 2565 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2633 2566 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2634 2567
2635 2568 /*
2636 2569 * Count the number of segments covered by the range we are about to
2637 2570 * lock. The segment count is used to size the shadow list we return
2638 2571 	 * lock. The segment count is used to size the shadow list we return
2639 2572 	 * to the caller.
2640 2573 for (; size != 0; size -= ssize, addr += ssize) {
2641 2574 if (addr >= seg->s_base + seg->s_size) {
2642 2575
2643 2576 seg = AS_SEGNEXT(as, seg);
2644 2577 if (seg == NULL || addr != seg->s_base) {
2645 2578 AS_LOCK_EXIT(as, &as->a_lock);
2646 2579 return (EFAULT);
2647 2580 }
2648 2581 /*
2649 2582 * Do a quick check if subsequent segments
2650 2583 * will most likely support pagelock.
2651 2584 */
2652 2585 if (seg->s_ops == &segvn_ops) {
2653 2586 vnode_t *vp;
2654 2587
2655 2588 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2656 2589 vp != NULL) {
2657 2590 AS_LOCK_EXIT(as, &as->a_lock);
2658 2591 goto slow;
2659 2592 }
2660 2593 } else if (seg->s_ops != &segspt_shmops) {
2661 2594 AS_LOCK_EXIT(as, &as->a_lock);
2662 2595 goto slow;
2663 2596 }
2664 2597 segcnt++;
2665 2598 }
2666 2599 if (addr + size > seg->s_base + seg->s_size) {
2667 2600 ssize = seg->s_base + seg->s_size - addr;
2668 2601 } else {
2669 2602 ssize = size;
2670 2603 }
2671 2604 }
2672 2605 ASSERT(segcnt > 1);
2673 2606
2674 2607 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2675 2608
2676 2609 addr = sv_addr;
2677 2610 size = sv_size;
2678 2611 seg = sv_seg;
2679 2612
2680 2613 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2681 2614 if (addr >= seg->s_base + seg->s_size) {
2682 2615 seg = AS_SEGNEXT(as, seg);
2683 2616 ASSERT(seg != NULL && addr == seg->s_base);
2684 2617 cnt++;
2685 2618 ASSERT(cnt < segcnt);
2686 2619 }
2687 2620 if (addr + size > seg->s_base + seg->s_size) {
2688 2621 ssize = seg->s_base + seg->s_size - addr;
2689 2622 } else {
2690 2623 ssize = size;
2691 2624 }
2692 2625 pl = &plist[npages + cnt];
2693 2626 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2694 2627 L_PAGELOCK, rw);
2695 2628 if (error) {
2696 2629 break;
2697 2630 }
2698 2631 ASSERT(plist[npages + cnt] != NULL);
2699 2632 ASSERT(pl_off + btop(ssize) <= npages);
2700 2633 bcopy(plist[npages + cnt], &plist[pl_off],
2701 2634 btop(ssize) * sizeof (page_t *));
2702 2635 pl_off += btop(ssize);
2703 2636 }
2704 2637
2705 2638 if (size == 0) {
2706 2639 AS_LOCK_EXIT(as, &as->a_lock);
2707 2640 ASSERT(cnt == segcnt - 1);
2708 2641 *ppp = plist;
2709 2642 return (0);
2710 2643 }
2711 2644
2712 2645 /*
2713 2646 	 * One of the pagelock calls failed. The error type is in the error variable.
2714 2647 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2715 2648 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2716 2649 * back to the caller.
2717 2650 */
2718 2651
2719 2652 eaddr = addr;
2720 2653 seg = sv_seg;
2721 2654
2722 2655 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2723 2656 if (addr >= seg->s_base + seg->s_size) {
2724 2657 seg = AS_SEGNEXT(as, seg);
2725 2658 ASSERT(seg != NULL && addr == seg->s_base);
2726 2659 cnt++;
2727 2660 ASSERT(cnt < segcnt);
2728 2661 }
2729 2662 if (eaddr > seg->s_base + seg->s_size) {
2730 2663 ssize = seg->s_base + seg->s_size - addr;
2731 2664 } else {
2732 2665 ssize = eaddr - addr;
2733 2666 }
2734 2667 pl = &plist[npages + cnt];
2735 2668 ASSERT(*pl != NULL);
2736 2669 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2737 2670 L_PAGEUNLOCK, rw);
2738 2671 }
2739 2672
2740 2673 AS_LOCK_EXIT(as, &as->a_lock);
2741 2674
2742 2675 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2743 2676
2744 2677 if (error != ENOTSUP && error != EFAULT) {
2745 2678 return (error);
2746 2679 }
2747 2680
2748 2681 slow:
2749 2682 /*
2750 2683 * If we are here because pagelock failed due to the need to cow fault
2751 2684 * in the pages we want to lock F_SOFTLOCK will do this job and in
2752 2685 	 * in the pages we want to lock, F_SOFTLOCK will do this job, and the
2753 2686 	 * next as_pagelock() call for this address range will
2754 2687 */
2755 2688 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2756 2689 if (fault_err != 0) {
2757 2690 return (fc_decode(fault_err));
2758 2691 }
2759 2692 *ppp = NULL;
2760 2693
2761 2694 return (0);
2762 2695 }
2763 2696
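The shadow list built by as_pagelock_segs() has a fixed layout, sketched here for a range spanning three segments (the count is illustrative):

	plist[0 .. npages-1]	flat page_t * array for the whole range,
				in address order (what the caller sees)
	plist[npages + 0]	segment 0's own shadow-list pointer
	plist[npages + 1]	segment 1's own shadow-list pointer
	plist[npages + 2]	segment 2's own shadow-list pointer

as_pageunlock_segs() later walks these tail entries to hand each segment back its own list before freeing plist.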
2764 2697 /*
2765 2698 * lock pages in a given address space. Return shadow list. If
2766 2699 * the list is NULL, the MMU mapping is also locked.
2767 2700 */
2768 2701 int
2769 2702 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2770 2703 size_t size, enum seg_rw rw)
2771 2704 {
2772 2705 size_t rsize;
2773 2706 caddr_t raddr;
2774 2707 faultcode_t fault_err;
2775 2708 struct seg *seg;
2776 2709 int err;
2777 2710
2778 2711 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2779 2712 "as_pagelock_start: addr %p size %ld", addr, size);
2780 2713
2781 2714 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2782 2715 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2783 2716 (size_t)raddr;
2784 2717
2785 2718 /*
2786 2719 	 * if the request crosses two or more segments, let
2787 2720 	 * as_pagelock_segs() handle it.
2788 2721 */
2789 2722 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2790 2723
2791 2724 seg = as_segat(as, raddr);
2792 2725 if (seg == NULL) {
2793 2726 AS_LOCK_EXIT(as, &as->a_lock);
2794 2727 return (EFAULT);
2795 2728 }
2796 2729 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2797 2730 if (raddr + rsize > seg->s_base + seg->s_size) {
2798 2731 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2799 2732 }
2800 2733 if (raddr + rsize <= raddr) {
2801 2734 AS_LOCK_EXIT(as, &as->a_lock);
2802 2735 return (EFAULT);
2803 2736 }
2804 2737
2805 2738 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2806 2739 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2807 2740
2808 2741 /*
2809 2742 * try to lock pages and pass back shadow list
2810 2743 */
2811 2744 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2812 2745
2813 2746 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2814 2747
2815 2748 AS_LOCK_EXIT(as, &as->a_lock);
2816 2749
2817 2750 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2818 2751 return (err);
2819 2752 }
2820 2753
2821 2754 /*
2822 2755 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2823 2756 * to no pagelock support for this segment or pages need to be cow
2824 2757 	 * faulted in. If a fault is needed, F_SOFTLOCK will do this job for
2825 2758 	 * this as_pagelock() call, and the next as_pagelock() call for the
2826 2759 	 * same address range will hopefully succeed.
2827 2760 */
2828 2761 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2829 2762 if (fault_err != 0) {
2830 2763 return (fc_decode(fault_err));
2831 2764 }
2832 2765 *ppp = NULL;
2833 2766
2834 2767 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2835 2768 return (0);
2836 2769 }
2837 2770
2838 2771 /*
2839 2772 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2840 2773 * lists from the end of plist and call pageunlock interface for each segment.
2841 2774 * Drop as lock and free plist.
2842 2775 */
2843 2776 static void
2844 2777 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2845 2778 struct page **plist, enum seg_rw rw)
2846 2779 {
2847 2780 ulong_t cnt;
2848 2781 caddr_t eaddr = addr + size;
2849 2782 pgcnt_t npages = btop(size);
2850 2783 size_t ssize;
2851 2784 page_t **pl;
2852 2785
2853 2786 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2854 2787 ASSERT(seg != NULL);
2855 2788 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2856 2789 ASSERT(addr + size > seg->s_base + seg->s_size);
2857 2790 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2858 2791 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2859 2792 ASSERT(plist != NULL);
2860 2793
2861 2794 for (cnt = 0; addr < eaddr; addr += ssize) {
2862 2795 if (addr >= seg->s_base + seg->s_size) {
2863 2796 seg = AS_SEGNEXT(as, seg);
2864 2797 ASSERT(seg != NULL && addr == seg->s_base);
2865 2798 cnt++;
2866 2799 }
2867 2800 if (eaddr > seg->s_base + seg->s_size) {
2868 2801 ssize = seg->s_base + seg->s_size - addr;
2869 2802 } else {
2870 2803 ssize = eaddr - addr;
2871 2804 }
2872 2805 pl = &plist[npages + cnt];
2873 2806 ASSERT(*pl != NULL);
2874 2807 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2875 2808 L_PAGEUNLOCK, rw);
2876 2809 }
2877 2810 ASSERT(cnt > 0);
2878 2811 AS_LOCK_EXIT(as, &as->a_lock);
2879 2812
2880 2813 cnt++;
2881 2814 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2882 2815 }
2883 2816
2884 2817 /*
2885 2818 * unlock pages in a given address range
2886 2819 */
2887 2820 void
2888 2821 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2889 2822 enum seg_rw rw)
2890 2823 {
2891 2824 struct seg *seg;
2892 2825 size_t rsize;
2893 2826 caddr_t raddr;
2894 2827
2895 2828 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2896 2829 "as_pageunlock_start: addr %p size %ld", addr, size);
2897 2830
2898 2831 /*
2899 2832 	 * if the shadow list is NULL, as_pagelock
2900 2833 	 * fell back to as_fault
2901 2834 */
2902 2835 if (pp == NULL) {
2903 2836 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2904 2837 return;
2905 2838 }
2906 2839
2907 2840 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2908 2841 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2909 2842 (size_t)raddr;
2910 2843
2911 2844 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2912 2845 seg = as_segat(as, raddr);
2913 2846 ASSERT(seg != NULL);
2914 2847
2915 2848 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2916 2849 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2917 2850
2918 2851 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2919 2852 if (raddr + rsize <= seg->s_base + seg->s_size) {
2920 2853 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2921 2854 } else {
2922 2855 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2923 2856 return;
2924 2857 }
2925 2858 AS_LOCK_EXIT(as, &as->a_lock);
2926 2859 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2927 2860 }
2928 2861
2929 2862 int
2930 2863 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2931 2864 boolean_t wait)
2932 2865 {
2933 2866 struct seg *seg;
2934 2867 size_t ssize;
2935 2868 caddr_t raddr; /* rounded down addr */
2936 2869 size_t rsize; /* rounded up size */
2937 2870 int error = 0;
2938 2871 size_t pgsz = page_get_pagesize(szc);
2939 2872
2940 2873 setpgsz_top:
2941 2874 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2942 2875 return (EINVAL);
2943 2876 }
2944 2877
2945 2878 raddr = addr;
2946 2879 rsize = size;
2947 2880
2948 2881 if (raddr + rsize < raddr) /* check for wraparound */
2949 2882 return (ENOMEM);
2950 2883
2951 2884 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2952 2885 as_clearwatchprot(as, raddr, rsize);
2953 2886 seg = as_segat(as, raddr);
2954 2887 if (seg == NULL) {
2955 2888 as_setwatch(as);
2956 2889 AS_LOCK_EXIT(as, &as->a_lock);
2957 2890 return (ENOMEM);
2958 2891 }
2959 2892
2960 2893 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2961 2894 if (raddr >= seg->s_base + seg->s_size) {
2962 2895 seg = AS_SEGNEXT(as, seg);
2963 2896 if (seg == NULL || raddr != seg->s_base) {
2964 2897 error = ENOMEM;
2965 2898 break;
2966 2899 }
2967 2900 }
2968 2901 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2969 2902 ssize = seg->s_base + seg->s_size - raddr;
2970 2903 } else {
2971 2904 ssize = rsize;
2972 2905 }
2973 2906
2974 2907 retry:
2975 2908 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2976 2909
2977 2910 if (error == IE_NOMEM) {
2978 2911 error = EAGAIN;
2979 2912 break;
2980 2913 }
2981 2914
2982 2915 if (error == IE_RETRY) {
2983 2916 AS_LOCK_EXIT(as, &as->a_lock);
2984 2917 goto setpgsz_top;
2985 2918 }
2986 2919
2987 2920 if (error == ENOTSUP) {
2988 2921 error = EINVAL;
2989 2922 break;
2990 2923 }
2991 2924
2992 2925 if (wait && (error == EAGAIN)) {
2993 2926 /*
2994 2927 * Memory is currently locked. It must be unlocked
2995 2928 * before this operation can succeed through a retry.
2996 2929 * The possible reasons for locked memory and
2997 2930 * corresponding strategies for unlocking are:
2998 2931 * (1) Normal I/O
2999 2932 * wait for a signal that the I/O operation
3000 2933 * has completed and the memory is unlocked.
3001 2934 * (2) Asynchronous I/O
3002 2935 * The aio subsystem does not unlock pages when
3003 2936 * the I/O is completed. Those pages are unlocked
3004 2937 * when the application calls aiowait/aioerror.
3005 2938 * So, to prevent blocking forever, cv_broadcast()
3006 2939 * is done to wake up aio_cleanup_thread.
3007 2940 * Subsequently, segvn_reclaim will be called, and
3008 2941 * that will do AS_CLRUNMAPWAIT() and wake us up.
3009 2942 * (3) Long term page locking:
3010 2943 * This is not relevant for as_setpagesize()
3011 2944 * because we cannot change the page size for
3012 2945 * driver memory. The attempt to do so will
3013 2946 * fail with a different error than EAGAIN so
3014 2947 * there's no need to trigger as callbacks like
3015 2948 * as_unmap, as_setprot or as_free would do.
3016 2949 */
3017 2950 mutex_enter(&as->a_contents);
3018 2951 if (!AS_ISNOUNMAPWAIT(as)) {
3019 2952 if (AS_ISUNMAPWAIT(as) == 0) {
3020 2953 cv_broadcast(&as->a_cv);
3021 2954 }
3022 2955 AS_SETUNMAPWAIT(as);
3023 2956 AS_LOCK_EXIT(as, &as->a_lock);
3024 2957 while (AS_ISUNMAPWAIT(as)) {
3025 2958 cv_wait(&as->a_cv, &as->a_contents);
3026 2959 }
3027 2960 } else {
3028 2961 /*
3029 2962 * We may have raced with
3030 2963 * segvn_reclaim()/segspt_reclaim(). In this
3031 2964 * case clean nounmapwait flag and retry since
3032 2965 * softlockcnt in this segment may be already
3033 2966 * 0. We don't drop as writer lock so our
3034 2967 * number of retries without sleeping should
3035 2968 * be very small. See segvn_reclaim() for
3036 2969 * more comments.
3037 2970 */
3038 2971 AS_CLRNOUNMAPWAIT(as);
3039 2972 mutex_exit(&as->a_contents);
3040 2973 goto retry;
3041 2974 }
3042 2975 mutex_exit(&as->a_contents);
3043 2976 goto setpgsz_top;
3044 2977 } else if (error != 0) {
3045 2978 break;
3046 2979 }
3047 2980 }
3048 2981 as_setwatch(as);
3049 2982 AS_LOCK_EXIT(as, &as->a_lock);
3050 2983 return (error);
3051 2984 }
3052 2985
3053 2986 /*
3054 2987 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3055 2988 * in its chunk where s_szc is less than the szc we want to set.
3056 2989 */
3057 2990 static int
3058 2991 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3059 2992 int *retry)
3060 2993 {
3061 2994 struct seg *seg;
3062 2995 size_t ssize;
3063 2996 int error;
3064 2997
3065 2998 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3066 2999
3067 3000 seg = as_segat(as, raddr);
3068 3001 if (seg == NULL) {
3069 3002 panic("as_iset3_default_lpsize: no seg");
3070 3003 }
3071 3004
3072 3005 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3073 3006 if (raddr >= seg->s_base + seg->s_size) {
3074 3007 seg = AS_SEGNEXT(as, seg);
3075 3008 if (seg == NULL || raddr != seg->s_base) {
3076 3009 panic("as_iset3_default_lpsize: as changed");
3077 3010 }
3078 3011 }
3079 3012 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3080 3013 ssize = seg->s_base + seg->s_size - raddr;
3081 3014 } else {
3082 3015 ssize = rsize;
3083 3016 }
3084 3017
3085 3018 if (szc > seg->s_szc) {
3086 3019 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3087 3020 /* Only retry on EINVAL segments that have no vnode. */
3088 3021 if (error == EINVAL) {
3089 3022 vnode_t *vp = NULL;
3090 3023 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3091 3024 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3092 3025 vp == NULL)) {
3093 3026 *retry = 1;
3094 3027 } else {
3095 3028 *retry = 0;
3096 3029 }
3097 3030 }
3098 3031 if (error) {
3099 3032 return (error);
3100 3033 }
3101 3034 }
3102 3035 }
3103 3036 return (0);
3104 3037 }
3105 3038
3106 3039 /*
3107 3040 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3108 3041 * pagesize on each segment in its range, but if any fails with EINVAL,
3109 3042 * then it reduces the pagesizes to the next size in the bitmap and
3110 3043 	 * retries as_iset3_default_lpsize(). The code retries smaller
3111 3044 	 * allowed sizes on EINVAL because (a) the anon offset may not
3112 3045 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3113 3046 * with) to pass to map_pgszcvec().
3114 3047 */
3115 3048 static int
3116 3049 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3117 3050 uint_t szcvec)
3118 3051 {
3119 3052 int error;
3120 3053 int retry;
3121 3054
3122 3055 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3123 3056
3124 3057 for (;;) {
3125 3058 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3126 3059 if (error == EINVAL && retry) {
3127 3060 szcvec &= ~(1 << szc);
3128 3061 if (szcvec <= 1) {
3129 3062 return (EINVAL);
3130 3063 }
3131 3064 szc = highbit(szcvec) - 1;
3132 3065 } else {
3133 3066 return (error);
3134 3067 }
3135 3068 }
3136 3069 }
3137 3070
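The EINVAL fallback above can be modeled in isolation: clear the failing size code from the bitmap and retry with the largest remaining one. In this sketch, try_set_pagesize() is a hypothetical stand-in for as_iset3_default_lpsize() and highbit() is modeled with a GCC builtin; the real code additionally retries only when the failing segment is MAP_SHARED with no vnode:

	static int
	set_lpsize_with_fallback(unsigned szcvec, unsigned szc)
	{
		int error;

		for (;;) {
			error = try_set_pagesize(szc);	/* hypothetical helper */
			if (error != EINVAL)
				return (error);		/* success or a hard error */
			szcvec &= ~(1u << szc);
			if (szcvec <= 1)
				return (EINVAL);	/* no smaller size left */
			szc = 31 - __builtin_clz(szcvec); /* highbit(szcvec) - 1 */
		}
	}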
3138 3071 /*
3139 3072 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3140 3073 * segments have a smaller szc than we want to set. For each such area,
3141 3074  * it calls as_iset2_default_lpsize().
3142 3075 */
3143 3076 static int
3144 3077 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3145 3078 uint_t szcvec)
3146 3079 {
3147 3080 struct seg *seg;
3148 3081 size_t ssize;
3149 3082 caddr_t setaddr = raddr;
3150 3083 size_t setsize = 0;
3151 3084 int set;
3152 3085 int error;
3153 3086
3154 3087 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3155 3088
3156 3089 seg = as_segat(as, raddr);
3157 3090 if (seg == NULL) {
3158 3091 panic("as_iset1_default_lpsize: no seg");
3159 3092 }
3160 3093 if (seg->s_szc < szc) {
3161 3094 set = 1;
3162 3095 } else {
3163 3096 set = 0;
3164 3097 }
3165 3098
3166 3099 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3167 3100 if (raddr >= seg->s_base + seg->s_size) {
3168 3101 seg = AS_SEGNEXT(as, seg);
3169 3102 if (seg == NULL || raddr != seg->s_base) {
3170 3103 panic("as_iset1_default_lpsize: as changed");
3171 3104 }
3172 3105 if (seg->s_szc >= szc && set) {
3173 3106 ASSERT(setsize != 0);
3174 3107 error = as_iset2_default_lpsize(as,
3175 3108 setaddr, setsize, szc, szcvec);
3176 3109 if (error) {
3177 3110 return (error);
3178 3111 }
3179 3112 set = 0;
3180 3113 } else if (seg->s_szc < szc && !set) {
3181 3114 setaddr = raddr;
3182 3115 setsize = 0;
3183 3116 set = 1;
3184 3117 }
3185 3118 }
3186 3119 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3187 3120 ssize = seg->s_base + seg->s_size - raddr;
3188 3121 } else {
3189 3122 ssize = rsize;
3190 3123 }
3191 3124 }
3192 3125 error = 0;
3193 3126 if (set) {
3194 3127 ASSERT(setsize != 0);
3195 3128 error = as_iset2_default_lpsize(as, setaddr, setsize,
3196 3129 szc, szcvec);
3197 3130 }
3198 3131 return (error);
3199 3132 }
3200 3133
3201 3134 /*
3202 3135 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3203 3136 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3204 3137 * chunk to as_iset1_default_lpsize().
3205 3138 */
3206 3139 static int
3207 3140 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3208 3141 int type)
3209 3142 {
3210 3143 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3211 3144 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3212 3145 flags, rtype, 1);
3213 3146 uint_t szc;
3214 3147 uint_t nszc;
3215 3148 int error;
3216 3149 caddr_t a;
3217 3150 caddr_t eaddr;
3218 3151 size_t segsize;
3219 3152 size_t pgsz;
3220 3153 uint_t save_szcvec;
3221 3154
3222 3155 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3223 3156 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3224 3157 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3225 3158
3226 3159 szcvec &= ~1;
3227 3160 if (szcvec <= 1) { /* skip if base page size */
3228 3161 return (0);
3229 3162 }
3230 3163
3231 3164 /* Get the pagesize of the first larger page size. */
3232 3165 szc = lowbit(szcvec) - 1;
3233 3166 pgsz = page_get_pagesize(szc);
3234 3167 eaddr = addr + size;
3235 3168 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3236 3169 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3237 3170
3238 3171 save_szcvec = szcvec;
3239 3172 szcvec >>= (szc + 1);
3240 3173 nszc = szc;
3241 3174 while (szcvec) {
3242 3175 if ((szcvec & 0x1) == 0) {
3243 3176 nszc++;
3244 3177 szcvec >>= 1;
3245 3178 continue;
3246 3179 }
3247 3180 nszc++;
3248 3181 pgsz = page_get_pagesize(nszc);
3249 3182 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3250 3183 if (a != addr) {
3251 3184 ASSERT(szc > 0);
3252 3185 ASSERT(a < eaddr);
3253 3186 segsize = a - addr;
3254 3187 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3255 3188 save_szcvec);
3256 3189 if (error) {
3257 3190 return (error);
3258 3191 }
3259 3192 addr = a;
3260 3193 }
3261 3194 szc = nszc;
3262 3195 szcvec >>= 1;
3263 3196 }
3264 3197
3265 3198 ASSERT(addr < eaddr);
3266 3199 szcvec = save_szcvec;
3267 3200 while (szcvec) {
3268 3201 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3269 3202 ASSERT(a >= addr);
3270 3203 if (a != addr) {
3271 3204 ASSERT(szc > 0);
3272 3205 segsize = a - addr;
3273 3206 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3274 3207 save_szcvec);
3275 3208 if (error) {
3276 3209 return (error);
3277 3210 }
3278 3211 addr = a;
3279 3212 }
3280 3213 szcvec &= ~(1 << szc);
3281 3214 if (szcvec) {
3282 3215 szc = highbit(szcvec) - 1;
3283 3216 pgsz = page_get_pagesize(szc);
3284 3217 }
3285 3218 }
3286 3219 ASSERT(addr == eaddr);
3287 3220
3288 3221 return (0);
3289 3222 }
3290 3223
3291 3224 /*
3292 3225 * Set the default large page size for the range. Called via memcntl with
3293 3226 * page size set to 0. as_set_default_lpsize breaks the range down into
3294 3227  * chunks with the same type/flags, ignores non-segvn segments, and passes
3295 3228 * each chunk to as_iset_default_lpsize().
3296 3229 */
3297 3230 int
3298 3231 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3299 3232 {
3300 3233 struct seg *seg;
3301 3234 caddr_t raddr;
3302 3235 size_t rsize;
3303 3236 size_t ssize;
3304 3237 int rtype, rflags;
3305 3238 int stype, sflags;
3306 3239 int error;
3307 3240 caddr_t setaddr;
3308 3241 size_t setsize;
3309 3242 int segvn;
3310 3243
3311 3244 if (size == 0)
3312 3245 return (0);
3313 3246
3314 3247 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3315 3248 again:
3316 3249 error = 0;
3317 3250
3318 3251 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3319 3252 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3320 3253 (size_t)raddr;
3321 3254
3322 3255 if (raddr + rsize < raddr) { /* check for wraparound */
3323 3256 AS_LOCK_EXIT(as, &as->a_lock);
3324 3257 return (ENOMEM);
3325 3258 }
3326 3259 as_clearwatchprot(as, raddr, rsize);
3327 3260 seg = as_segat(as, raddr);
3328 3261 if (seg == NULL) {
3329 3262 as_setwatch(as);
3330 3263 AS_LOCK_EXIT(as, &as->a_lock);
3331 3264 return (ENOMEM);
3332 3265 }
3333 3266 if (seg->s_ops == &segvn_ops) {
3334 3267 rtype = SEGOP_GETTYPE(seg, addr);
3335 3268 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3336 3269 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3337 3270 segvn = 1;
3338 3271 } else {
3339 3272 segvn = 0;
3340 3273 }
3341 3274 setaddr = raddr;
3342 3275 setsize = 0;
3343 3276
3344 3277 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3345 3278 if (raddr >= (seg->s_base + seg->s_size)) {
3346 3279 seg = AS_SEGNEXT(as, seg);
3347 3280 if (seg == NULL || raddr != seg->s_base) {
3348 3281 error = ENOMEM;
3349 3282 break;
3350 3283 }
3351 3284 if (seg->s_ops == &segvn_ops) {
3352 3285 stype = SEGOP_GETTYPE(seg, raddr);
3353 3286 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3354 3287 stype &= (MAP_SHARED | MAP_PRIVATE);
3355 3288 if (segvn && (rflags != sflags ||
3356 3289 rtype != stype)) {
3357 3290 /*
3358 3291 * The next segment is also segvn but
3359 3292 * has different flags and/or type.
3360 3293 */
3361 3294 ASSERT(setsize != 0);
3362 3295 error = as_iset_default_lpsize(as,
3363 3296 setaddr, setsize, rflags, rtype);
3364 3297 if (error) {
3365 3298 break;
3366 3299 }
3367 3300 rflags = sflags;
3368 3301 rtype = stype;
3369 3302 setaddr = raddr;
3370 3303 setsize = 0;
3371 3304 } else if (!segvn) {
3372 3305 rflags = sflags;
3373 3306 rtype = stype;
3374 3307 setaddr = raddr;
3375 3308 setsize = 0;
3376 3309 segvn = 1;
3377 3310 }
3378 3311 } else if (segvn) {
3379 3312 /* The next segment is not segvn. */
3380 3313 ASSERT(setsize != 0);
3381 3314 error = as_iset_default_lpsize(as,
3382 3315 setaddr, setsize, rflags, rtype);
3383 3316 if (error) {
3384 3317 break;
3385 3318 }
3386 3319 segvn = 0;
3387 3320 }
3388 3321 }
3389 3322 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3390 3323 ssize = seg->s_base + seg->s_size - raddr;
3391 3324 } else {
3392 3325 ssize = rsize;
3393 3326 }
3394 3327 }
3395 3328 if (error == 0 && segvn) {
3396 3329 /* The last chunk when rsize == 0. */
3397 3330 ASSERT(setsize != 0);
3398 3331 error = as_iset_default_lpsize(as, setaddr, setsize,
3399 3332 rflags, rtype);
3400 3333 }
3401 3334
3402 3335 if (error == IE_RETRY) {
3403 3336 goto again;
3404 3337 } else if (error == IE_NOMEM) {
3405 3338 error = EAGAIN;
3406 3339 } else if (error == ENOTSUP) {
3407 3340 error = EINVAL;
3408 3341 } else if (error == EAGAIN) {
3409 3342 mutex_enter(&as->a_contents);
3410 3343 if (!AS_ISNOUNMAPWAIT(as)) {
3411 3344 if (AS_ISUNMAPWAIT(as) == 0) {
3412 3345 cv_broadcast(&as->a_cv);
3413 3346 }
3414 3347 AS_SETUNMAPWAIT(as);
3415 3348 AS_LOCK_EXIT(as, &as->a_lock);
3416 3349 while (AS_ISUNMAPWAIT(as)) {
3417 3350 cv_wait(&as->a_cv, &as->a_contents);
3418 3351 }
3419 3352 mutex_exit(&as->a_contents);
3420 3353 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3421 3354 } else {
3422 3355 /*
3423 3356 * We may have raced with
3424 3357 * segvn_reclaim()/segspt_reclaim(). In this case
3425 3358 * clean nounmapwait flag and retry since softlockcnt
3426 3359 * in this segment may be already 0. We don't drop as
3427 3360 * writer lock so our number of retries without
3428 3361 * sleeping should be very small. See segvn_reclaim()
3429 3362 * for more comments.
3430 3363 */
3431 3364 AS_CLRNOUNMAPWAIT(as);
3432 3365 mutex_exit(&as->a_contents);
3433 3366 }
3434 3367 goto again;
3435 3368 }
3436 3369
3437 3370 as_setwatch(as);
3438 3371 AS_LOCK_EXIT(as, &as->a_lock);
3439 3372 return (error);
3440 3373 }
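
/*
 * Illustrative sketch: a hypothetical caller of as_set_default_lpsize().
 * The wrapper name is invented for illustration; the function above takes
 * as->a_lock as writer itself, so it must be entered with the lock dropped.
 */
static int
set_default_lpsize_example(struct as *as, caddr_t addr, size_t len)
{
	/*
	 * Possible return values, per the code above:
	 *   0      - policy applied to every segvn chunk in the range
	 *   ENOMEM - the range wraps around or contains a hole
	 *   EINVAL - a segment rejected the page size (mapped from ENOTSUP)
	 *   EAGAIN - softlock contention persisted (mapped from IE_NOMEM)
	 */
	return (as_set_default_lpsize(as, addr, len));
}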
3441 3374
3442 3375 /*
3443 3376 * Setup all of the uninitialized watched pages that we can.
3444 3377 */
3445 3378 void
3446 3379 as_setwatch(struct as *as)
3447 3380 {
3448 3381 struct watched_page *pwp;
3449 3382 struct seg *seg;
3450 3383 caddr_t vaddr;
3451 3384 uint_t prot;
3452 3385 int err, retrycnt;
3453 3386
3454 3387 if (avl_numnodes(&as->a_wpage) == 0)
3455 3388 return;
3456 3389
3457 3390 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3458 3391
3459 3392 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3460 3393 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3461 3394 retrycnt = 0;
3462 3395 retry:
3463 3396 vaddr = pwp->wp_vaddr;
3464 3397 if (pwp->wp_oprot != 0 || /* already set up */
3465 3398 (seg = as_segat(as, vaddr)) == NULL ||
3466 3399 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3467 3400 continue;
3468 3401
3469 3402 pwp->wp_oprot = prot;
3470 3403 if (pwp->wp_read)
3471 3404 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3472 3405 if (pwp->wp_write)
3473 3406 prot &= ~PROT_WRITE;
3474 3407 if (pwp->wp_exec)
3475 3408 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3476 3409 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3477 3410 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3478 3411 if (err == IE_RETRY) {
3479 3412 pwp->wp_oprot = 0;
3480 3413 ASSERT(retrycnt == 0);
3481 3414 retrycnt++;
3482 3415 goto retry;
3483 3416 }
3484 3417 }
3485 3418 pwp->wp_prot = prot;
3486 3419 }
3487 3420 }
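
/*
 * Illustrative sketch: the per-page protection arithmetic applied by
 * as_setwatch() above, pulled into a hypothetical helper.  A read or
 * execute watchpoint strips all access so that any reference to the
 * page faults and can be reported; a write watchpoint only needs
 * PROT_WRITE removed.
 */
static uint_t
watch_prot_example(uint_t oprot, int wp_read, int wp_write, int wp_exec)
{
	uint_t prot = oprot;

	if (wp_read)
		prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
	if (wp_write)
		prot &= ~PROT_WRITE;
	if (wp_exec)
		prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
	return (prot);
}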
3488 3421
3489 3422 /*
3490 3423 * Clear all of the watched pages in the address space.
3491 3424 */
3492 3425 void
3493 3426 as_clearwatch(struct as *as)
3494 3427 {
3495 3428 struct watched_page *pwp;
3496 3429 struct seg *seg;
3497 3430 caddr_t vaddr;
3498 3431 uint_t prot;
3499 3432 int err, retrycnt;
3500 3433
3501 3434 if (avl_numnodes(&as->a_wpage) == 0)
3502 3435 return;
3503 3436
3504 3437 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3505 3438
3506 3439 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3507 3440 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3508 3441 retrycnt = 0;
3509 3442 retry:
3510 3443 vaddr = pwp->wp_vaddr;
3511 3444 if (pwp->wp_oprot == 0 || /* not set up */
3512 3445 (seg = as_segat(as, vaddr)) == NULL)
3513 3446 continue;
3514 3447
3515 3448 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3516 3449 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3517 3450 if (err == IE_RETRY) {
3518 3451 ASSERT(retrycnt == 0);
3519 3452 retrycnt++;
3520 3453 goto retry;
3521 3454 }
3522 3455 }
3523 3456 pwp->wp_oprot = 0;
3524 3457 pwp->wp_prot = 0;
3525 3458 }
3526 3459 }
3527 3460
3528 3461 /*
3529 3462 * Force a new setup for all the watched pages in the range.
3530 3463 */
3531 3464 static void
3532 3465 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3533 3466 {
3534 3467 struct watched_page *pwp;
3535 3468 struct watched_page tpw;
3536 3469 caddr_t eaddr = addr + size;
3537 3470 caddr_t vaddr;
3538 3471 struct seg *seg;
3539 3472 int err, retrycnt;
3540 3473 uint_t wprot;
3541 3474 avl_index_t where;
3542 3475
3543 3476 if (avl_numnodes(&as->a_wpage) == 0)
3544 3477 return;
3545 3478
3546 3479 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3547 3480
3548 3481 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3549 3482 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3550 3483 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3551 3484
3552 3485 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3553 3486 retrycnt = 0;
3554 3487 vaddr = pwp->wp_vaddr;
3555 3488
3556 3489 wprot = prot;
3557 3490 if (pwp->wp_read)
3558 3491 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3559 3492 if (pwp->wp_write)
3560 3493 wprot &= ~PROT_WRITE;
3561 3494 if (pwp->wp_exec)
3562 3495 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3563 3496 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3564 3497 retry:
3565 3498 seg = as_segat(as, vaddr);
3566 3499 if (seg == NULL) {
3567 3500 panic("as_setwatchprot: no seg");
3568 3501 /*NOTREACHED*/
3569 3502 }
3570 3503 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3571 3504 if (err == IE_RETRY) {
3572 3505 ASSERT(retrycnt == 0);
3573 3506 retrycnt++;
3574 3507 goto retry;
3575 3508 }
3576 3509 }
3577 3510 pwp->wp_oprot = prot;
3578 3511 pwp->wp_prot = wprot;
3579 3512
3580 3513 pwp = AVL_NEXT(&as->a_wpage, pwp);
3581 3514 }
3582 3515 }
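
/*
 * Illustrative sketch: the AVL range-walk idiom shared by
 * as_setwatchprot() above and as_clearwatchprot() below.  avl_find()
 * looks up the page-aligned start of the range exactly; when no watched
 * page sits at that address, avl_nearest(..., AVL_AFTER) returns the
 * first one past it, and AVL_NEXT() then advances in address order
 * until the walk leaves the range.  The function name is hypothetical.
 */
static void
wpage_walk_example(struct as *as, caddr_t addr, size_t size)
{
	struct watched_page *pwp, tpw;
	caddr_t eaddr = addr + size;
	avl_index_t where;

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
		/* visit pwp here */
		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
}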
3583 3516
3584 3517 /*
3585 3518 * Clear all of the watched pages in the range.
3586 3519 */
3587 3520 static void
3588 3521 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3589 3522 {
3590 3523 caddr_t eaddr = addr + size;
3591 3524 struct watched_page *pwp;
3592 3525 struct watched_page tpw;
3593 3526 uint_t prot;
3594 3527 struct seg *seg;
3595 3528 int err, retrycnt;
3596 3529 avl_index_t where;
3597 3530
3598 3531 if (avl_numnodes(&as->a_wpage) == 0)
3599 3532 return;
3600 3533
3601 3534 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3602 3535 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3603 3536 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3604 3537
3605 3538 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3606 3539
3607 3540 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3608 3541
3609 3542 if ((prot = pwp->wp_oprot) != 0) {
3610 3543 retrycnt = 0;
3611 3544
3612 3545 if (prot != pwp->wp_prot) {
3613 3546 retry:
3614 3547 seg = as_segat(as, pwp->wp_vaddr);
3615 3548 if (seg == NULL)
3616 3549 continue;
3617 3550 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3618 3551 PAGESIZE, prot);
3619 3552 if (err == IE_RETRY) {
3620 3553 ASSERT(retrycnt == 0);
3621 3554 retrycnt++;
3622 3555 goto retry;
3623 3556
3624 3557 }
3625 3558 }
3626 3559 pwp->wp_oprot = 0;
3627 3560 pwp->wp_prot = 0;
3628 3561 }
3629 3562
3630 3563 pwp = AVL_NEXT(&as->a_wpage, pwp);
3631 3564 }
3632 3565 }
3633 3566
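/*
 * Post siginfo to every process whose address space is "as".  p_as is
 * tested twice: once under pidlock to find candidates cheaply, and
 * again under p_lock, since a process can acquire a new address space
 * (e.g. across exec) between the two checks.  sigaddq() is called with
 * KM_NOSLEEP because both locks are held and we must not block.
 */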
3634 3567 void
3635 3568 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3636 3569 {
3637 3570 struct proc *p;
3638 3571
3639 3572 mutex_enter(&pidlock);
3640 3573 for (p = practive; p; p = p->p_next) {
3641 3574 if (p->p_as == as) {
3642 3575 mutex_enter(&p->p_lock);
3643 3576 if (p->p_as == as)
3644 3577 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3645 3578 mutex_exit(&p->p_lock);
3646 3579 }
3647 3580 }
3648 3581 mutex_exit(&pidlock);
3649 3582 }
3650 3583
3651 3584 /*
3652 3585 * return memory object ID
3653 3586 */
3654 3587 int
3655 3588 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3656 3589 {
3657 3590 struct seg *seg;
3658 3591 int sts;
3659 3592
3660 3593 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3661 3594 seg = as_segat(as, addr);
3662 3595 if (seg == NULL) {
3663 3596 AS_LOCK_EXIT(as, &as->a_lock);
3664 3597 return (EFAULT);
3665 3598 }
3666 3599 /*
3667 3600 * catch old drivers which may not support getmemid
3668 3601 */
3669 3602 if (seg->s_ops->getmemid == NULL) {
3670 3603 AS_LOCK_EXIT(as, &as->a_lock);
3671 3604 return (ENODEV);
3672 3605 }
3673 3606
3674 3607 sts = SEGOP_GETMEMID(seg, addr, memidp);
3675 3608
3676 3609 AS_LOCK_EXIT(as, &as->a_lock);
3677 3610 return (sts);
3678 3611 }
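
/*
 * Illustrative sketch: a hypothetical driver-side caller of
 * as_getmemid().  EFAULT means no segment maps the address at all,
 * while ENODEV means the segment's driver predates the getmemid
 * entry point.
 */
static int
getmemid_example(struct as *as, caddr_t uaddr, memid_t *midp)
{
	int err;

	if ((err = as_getmemid(as, uaddr, midp)) != 0)
		return (err);	/* EFAULT, ENODEV, or a seg driver error */

	/* midp now identifies the backing memory object page */
	return (0);
}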
(1459 lines elided)