combined New usr/src/uts/common/vm/vm

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2015, Joyent, Inc.  All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 /*
  41  * VM - address spaces.
  42  */
  43 
  44 #include <sys/types.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/param.h>
  47 #include <sys/errno.h>
  48 #include <sys/systm.h>
  49 #include <sys/mman.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/cpuvar.h>
  52 #include <sys/sysinfo.h>
  53 #include <sys/kmem.h>
  54 #include <sys/vnode.h>
  55 #include <sys/vmsystm.h>
  56 #include <sys/cmn_err.h>
  57 #include <sys/debug.h>
  58 #include <sys/tnf_probe.h>
  59 #include <sys/vtrace.h>
  60 
  61 #include <vm/hat.h>
  62 #include <vm/as.h>
  63 #include <vm/seg.h>
  64 #include <vm/seg_vn.h>
  65 #include <vm/seg_dev.h>
  66 #include <vm/seg_kmem.h>
  67 #include <vm/seg_map.h>
  68 #include <vm/seg_spt.h>
  69 #include <vm/page.h>
  70 
  71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
  72 
  73 static struct kmem_cache *as_cache;
  74 
  75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
  76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
  77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
  78 
  79 
  80 /*
  81  * Verifying the segment lists is very time-consuming; it may not be
  82  * desirable always to define VERIFY_SEGLIST when DEBUG is set.
  83  */
  84 #ifdef DEBUG
  85 #define VERIFY_SEGLIST
  86 int do_as_verify = 0;
  87 #endif
  88 
  89 /*
  90  * Allocate a new callback data structure entry and fill in the events of
  91  * interest, the address range of interest, and the callback argument.
  92  * Link the entry on the as->a_callbacks list. A callback entry for the
  93  * entire address space may be specified with vaddr = 0 and size = -1.
  94  *
  95  * CALLERS RESPONSIBILITY: If not calling from within the process context for
  96  * the specified as, the caller must guarantee persistence of the specified as
  97  * for the duration of this function (eg. pages being locked within the as
  98  * will guarantee persistence).
  99  */
 100 int
 101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
 102                 caddr_t vaddr, size_t size, int sleepflag)
 103 {
 104         struct as_callback      *current_head, *cb;
 105         caddr_t                 saddr;
 106         size_t                  rsize;
 107 
 108         /* callback function and an event are mandatory */
 109         if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
 110                 return (EINVAL);
 111 
 112         /* Adding a callback after as_free has been called is not allowed */
 113         if (as == &kas)
 114                 return (ENOMEM);
 115 
 116         /*
 117          * vaddr = 0 and size = -1 is used to indicate that the callback range
 118          * is the entire address space so no rounding is done in that case.
 119          */
 120         if (size != -1) {
 121                 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
 122                 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
 123                     (size_t)saddr;
 124                 /* check for wraparound */
 125                 if (saddr + rsize < saddr)
 126                         return (ENOMEM);
 127         } else {
 128                 if (vaddr != 0)
 129                         return (EINVAL);
 130                 saddr = vaddr;
 131                 rsize = size;
 132         }
 133 
 134         /* Allocate and initialize a callback entry */
 135         cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
 136         if (cb == NULL)
 137                 return (EAGAIN);
 138 
 139         cb->ascb_func = cb_func;
 140         cb->ascb_arg = arg;
 141         cb->ascb_events = events;
 142         cb->ascb_saddr = saddr;
 143         cb->ascb_len = rsize;
 144 
 145         /* Add the entry to the list */
 146         mutex_enter(&as->a_contents);
 147         current_head = as->a_callbacks;
 148         as->a_callbacks = cb;
 149         cb->ascb_next = current_head;
 150 
 151         /*
 152          * The call to this function may lose in a race with
 153          * a pertinent event - eg. a thread does long term memory locking
 154          * but before the callback is added another thread executes as_unmap.
 155          * A broadcast here resolves that.
 156          */
 157         if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
 158                 AS_CLRUNMAPWAIT(as);
 159                 cv_broadcast(&as->a_cv);
 160         }
 161 
 162         mutex_exit(&as->a_contents);
 163         return (0);
 164 }
 165 
 166 /*
 167  * Search the callback list for an entry which pertains to arg.
 168  *
 169  * This is called from within the client upon completion of the callback.
 170  * RETURN VALUES:
 171  *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 172  *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 173  *      AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 174  *                      entry will be made in as_do_callbacks)
 175  *
 176  * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 177  * set, it indicates that as_do_callbacks is processing this entry.  The
 178  * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 179  * to unblock as_do_callbacks, in case it is blocked.
 180  *
 181  * CALLERS RESPONSIBILITY: If not calling from within the process context for
 182  * the specified as, the caller must guarantee persistence of the specified as
 183  * for the duration of this function (eg. pages being locked within the as
 184  * will guarantee persistence).
 185  */
 186 uint_t
 187 as_delete_callback(struct as *as, void *arg)
 188 {
 189         struct as_callback **prevcb = &as->a_callbacks;
 190         struct as_callback *cb;
 191         uint_t rc = AS_CALLBACK_NOTFOUND;
 192 
 193         mutex_enter(&as->a_contents);
 194         for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
 195                 if (cb->ascb_arg != arg)
 196                         continue;
 197 
 198                 /*
 199                  * If the events indicate AS_CALLBACK_CALLED, just clear
 200                  * AS_ALL_EVENT in the events field and wakeup the thread
 201                  * that may be waiting in as_do_callbacks.  as_do_callbacks
 202                  * will take care of removing this entry from the list.  In
 203                  * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
 204                  * (AS_CALLBACK_CALLED not set), just remove it from the
 205                  * list, return the memory and return AS_CALLBACK_DELETED.
 206                  */
 207                 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
 208                         /* leave AS_CALLBACK_CALLED */
 209                         cb->ascb_events &= ~AS_ALL_EVENT;
 210                         rc = AS_CALLBACK_DELETE_DEFERRED;
 211                         cv_broadcast(&as->a_cv);
 212                 } else {
 213                         *prevcb = cb->ascb_next;
 214                         kmem_free(cb, sizeof (struct as_callback));
 215                         rc = AS_CALLBACK_DELETED;
 216                 }
 217                 break;
 218         }
 219         mutex_exit(&as->a_contents);
 220         return (rc);
 221 }
 222 
 223 /*
 224  * Searches the as callback list for a matching entry.
 225  * Returns a pointer to the first matching callback, or NULL if
 226  * nothing is found.
 227  * This function never sleeps so it is ok to call it with more
 228  * locks held but the (required) a_contents mutex.
 229  *
 230  * See also comment on as_do_callbacks below.
 231  */
 232 static struct as_callback *
 233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
 234                         size_t event_len)
 235 {
 236         struct as_callback      *cb;
 237 
 238         ASSERT(MUTEX_HELD(&as->a_contents));
 239         for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
 240                 /*
 241                  * If the callback has not already been called, then
 242                  * check if events or address range pertains.  An event_len
 243                  * of zero means do an unconditional callback.
 244                  */
 245                 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
 246                     ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
 247                     (event_addr + event_len < cb->ascb_saddr) ||
 248                     (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
 249                         continue;
 250                 }
 251                 break;
 252         }
 253         return (cb);
 254 }
 255 
 256 /*
 257  * Executes a given callback and removes it from the callback list for
 258  * this address space.
 259  * This function may sleep so the caller must drop all locks except
 260  * a_contents before calling this func.
 261  *
 262  * See also comments on as_do_callbacks below.
 263  */
 264 static void
 265 as_execute_callback(struct as *as, struct as_callback *cb,
 266                                 uint_t events)
 267 {
 268         struct as_callback **prevcb;
 269         void    *cb_arg;
 270 
 271         ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
 272         cb->ascb_events |= AS_CALLBACK_CALLED;
 273         mutex_exit(&as->a_contents);
 274         (*cb->ascb_func)(as, cb->ascb_arg, events);
 275         mutex_enter(&as->a_contents);
 276         /*
 277          * the callback function is required to delete the callback
 278          * when the callback function determines it is OK for
 279          * this thread to continue. as_delete_callback will clear
 280          * the AS_ALL_EVENT in the events field when it is deleted.
 281          * If the callback function called as_delete_callback,
 282          * events will already be cleared and there will be no blocking.
 283          */
 284         while ((cb->ascb_events & events) != 0) {
 285                 cv_wait(&as->a_cv, &as->a_contents);
 286         }
 287         /*
 288          * This entry needs to be taken off the list. Normally, the
 289          * callback func itself does that, but unfortunately the list
 290          * may have changed while the callback was running because the
 291          * a_contents mutex was dropped and someone else other than the
 292          * callback func itself could have called as_delete_callback,
 293          * so we have to search to find this entry again.  The entry
 294          * must have AS_CALLBACK_CALLED, and have the same 'arg'.
 295          */
 296         cb_arg = cb->ascb_arg;
 297         prevcb = &as->a_callbacks;
 298         for (cb = as->a_callbacks; cb != NULL;
 299             prevcb = &cb->ascb_next, cb = *prevcb) {
 300                 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
 301                     (cb_arg != cb->ascb_arg)) {
 302                         continue;
 303                 }
 304                 *prevcb = cb->ascb_next;
 305                 kmem_free(cb, sizeof (struct as_callback));
 306                 break;
 307         }
 308 }
 309 
 310 /*
 311  * Check the callback list for a matching event and intersection of
 312  * address range. If there is a match invoke the callback.  Skip an entry if:
 313  *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 314  *    - not event of interest
 315  *    - not address range of interest
 316  *
 317  * An event_len of zero indicates a request for an unconditional callback
 318  * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 319  * a_contents lock must be dropped before a callback, so only one callback
 320  * can be done before returning. Return -1 (true) if a callback was
 321  * executed and removed from the list, else return 0 (false).
 322  *
 323  * The logically separate parts, i.e. finding a matching callback and
 324  * executing a given callback have been separated into two functions
 325  * so that they can be called with different sets of locks held beyond
 326  * the always-required a_contents. as_find_callback does not sleep so
 327  * it is ok to call it if more locks than a_contents (i.e. the a_lock
 328  * rwlock) are held. as_execute_callback on the other hand may sleep
 329  * so all locks beyond a_contents must be dropped by the caller if one
 330  * does not want to end comatose.
 331  */
 332 static int
 333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
 334                         size_t event_len)
 335 {
 336         struct as_callback *cb;
 337 
 338         if ((cb = as_find_callback(as, events, event_addr, event_len))) {
 339                 as_execute_callback(as, cb, events);
 340                 return (-1);
 341         }
 342         return (0);
 343 }
 344 
 345 /*
 346  * Search for the segment containing addr. If a segment containing addr
 347  * exists, that segment is returned.  If no such segment exists, and
 348  * the list spans addresses greater than addr, then the first segment
 349  * whose base is greater than addr is returned; otherwise, NULL is
 350  * returned unless tail is true, in which case the last element of the
 351  * list is returned.
 352  *
 353  * a_seglast is used to cache the last found segment for repeated
 354  * searches to the same addr (which happens frequently).
 355  */
 356 struct seg *
 357 as_findseg(struct as *as, caddr_t addr, int tail)
 358 {
 359         struct seg *seg = as->a_seglast;
 360         avl_index_t where;
 361 
 362         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
 363 
 364         if (seg != NULL &&
 365             seg->s_base <= addr &&
 366             addr < seg->s_base + seg->s_size)
 367                 return (seg);
 368 
 369         seg = avl_find(&as->a_segtree, &addr, &where);
 370         if (seg != NULL)
 371                 return (as->a_seglast = seg);
 372 
 373         seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 374         if (seg == NULL && tail)
 375                 seg = avl_last(&as->a_segtree);
 376         return (as->a_seglast = seg);
 377 }
 378 
 379 #ifdef VERIFY_SEGLIST
 380 /*
 381  * verify that the linked list is coherent
 382  */
 383 static void
 384 as_verify(struct as *as)
 385 {
 386         struct seg *seg, *seglast, *p, *n;
 387         uint_t nsegs = 0;
 388 
 389         if (do_as_verify == 0)
 390                 return;
 391 
 392         seglast = as->a_seglast;
 393 
 394         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 395                 ASSERT(seg->s_as == as);
 396                 p = AS_SEGPREV(as, seg);
 397                 n = AS_SEGNEXT(as, seg);
 398                 ASSERT(p == NULL || p->s_as == as);
 399                 ASSERT(p == NULL || p->s_base < seg->s_base);
 400                 ASSERT(n == NULL || n->s_base > seg->s_base);
 401                 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
 402                 if (seg == seglast)
 403                         seglast = NULL;
 404                 nsegs++;
 405         }
 406         ASSERT(seglast == NULL);
 407         ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
 408 }
 409 #endif /* VERIFY_SEGLIST */
 410 
 411 /*
 412  * Add a new segment to the address space. The avl_find()
 413  * may be expensive so we attempt to use last segment accessed
 414  * in as_gap() as an insertion point.
 415  */
 416 int
 417 as_addseg(struct as  *as, struct seg *newseg)
 418 {
 419         struct seg *seg;
 420         caddr_t addr;
 421         caddr_t eaddr;
 422         avl_index_t where;
 423 
 424         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
 425 
 426         as->a_updatedir = 1; /* inform /proc */
 427         gethrestime(&as->a_updatetime);
 428 
 429         if (as->a_lastgaphl != NULL) {
 430                 struct seg *hseg = NULL;
 431                 struct seg *lseg = NULL;
 432 
 433                 if (as->a_lastgaphl->s_base > newseg->s_base) {
 434                         hseg = as->a_lastgaphl;
 435                         lseg = AVL_PREV(&as->a_segtree, hseg);
 436                 } else {
 437                         lseg = as->a_lastgaphl;
 438                         hseg = AVL_NEXT(&as->a_segtree, lseg);
 439                 }
 440 
 441                 if (hseg && lseg && lseg->s_base < newseg->s_base &&
 442                     hseg->s_base > newseg->s_base) {
 443                         avl_insert_here(&as->a_segtree, newseg, lseg,
 444                             AVL_AFTER);
 445                         as->a_lastgaphl = NULL;
 446                         as->a_seglast = newseg;
 447                         return (0);
 448                 }
 449                 as->a_lastgaphl = NULL;
 450         }
 451 
 452         addr = newseg->s_base;
 453         eaddr = addr + newseg->s_size;
 454 again:
 455 
 456         seg = avl_find(&as->a_segtree, &addr, &where);
 457 
 458         if (seg == NULL)
 459                 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 460 
 461         if (seg == NULL)
 462                 seg = avl_last(&as->a_segtree);
 463 
 464         if (seg != NULL) {
 465                 caddr_t base = seg->s_base;
 466 
 467                 /*
 468                  * If top of seg is below the requested address, then
 469                  * the insertion point is at the end of the linked list,
 470                  * and seg points to the tail of the list.  Otherwise,
 471                  * the insertion point is immediately before seg.
 472                  */
 473                 if (base + seg->s_size > addr) {
 474                         if (addr >= base || eaddr > base) {
 475 #ifdef __sparc
 476                                 extern const struct seg_ops segnf_ops;
 477 
 478                                 /*
 479                                  * no-fault segs must disappear if overlaid.
 480                                  * XXX need new segment type so
 481                                  * we don't have to check s_ops
 482                                  */
 483                                 if (seg->s_ops == &segnf_ops) {
 484                                         seg_unmap(seg);
 485                                         goto again;
 486                                 }
 487 #endif
 488                                 return (-1);    /* overlapping segment */
 489                         }
 490                 }
 491         }
 492         as->a_seglast = newseg;
 493         avl_insert(&as->a_segtree, newseg, where);
 494 
 495 #ifdef VERIFY_SEGLIST
 496         as_verify(as);
 497 #endif
 498         return (0);
 499 }
 500 
 501 struct seg *
 502 as_removeseg(struct as *as, struct seg *seg)
 503 {
 504         avl_tree_t *t;
 505 
 506         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
 507 
 508         as->a_updatedir = 1; /* inform /proc */
 509         gethrestime(&as->a_updatetime);
 510 
 511         if (seg == NULL)
 512                 return (NULL);
 513 
 514         t = &as->a_segtree;
 515         if (as->a_seglast == seg)
 516                 as->a_seglast = NULL;
 517         as->a_lastgaphl = NULL;
 518 
 519         /*
 520          * if this segment is at an address higher than
 521          * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
 522          */
 523         if (as->a_lastgap &&
 524             (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
 525                 as->a_lastgap = AVL_NEXT(t, seg);
 526 
 527         /*
 528          * remove the segment from the seg tree
 529          */
 530         avl_remove(t, seg);
 531 
 532 #ifdef VERIFY_SEGLIST
 533         as_verify(as);
 534 #endif
 535         return (seg);
 536 }
 537 
 538 /*
 539  * Find a segment containing addr.
 540  */
 541 struct seg *
 542 as_segat(struct as *as, caddr_t addr)
 543 {
 544         struct seg *seg = as->a_seglast;
 545 
 546         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
 547 
 548         if (seg != NULL && seg->s_base <= addr &&
 549             addr < seg->s_base + seg->s_size)
 550                 return (seg);
 551 
 552         seg = avl_find(&as->a_segtree, &addr, NULL);
 553         return (seg);
 554 }
 555 
 556 /*
 557  * Serialize all searches for holes in an address space to
 558  * prevent two or more threads from allocating the same virtual
 559  * address range.  The address space must not be "read/write"
 560  * locked by the caller since we may block.
 561  */
 562 void
 563 as_rangelock(struct as *as)
 564 {
 565         mutex_enter(&as->a_contents);
 566         while (AS_ISCLAIMGAP(as))
 567                 cv_wait(&as->a_cv, &as->a_contents);
 568         AS_SETCLAIMGAP(as);
 569         mutex_exit(&as->a_contents);
 570 }
 571 
 572 /*
 573  * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 574  */
 575 void
 576 as_rangeunlock(struct as *as)
 577 {
 578         mutex_enter(&as->a_contents);
 579         AS_CLRCLAIMGAP(as);
 580         cv_signal(&as->a_cv);
 581         mutex_exit(&as->a_contents);
 582 }
 583 
 584 /*
 585  * compar segments (or just an address) by segment address range
 586  */
 587 static int
 588 as_segcompar(const void *x, const void *y)
 589 {
 590         struct seg *a = (struct seg *)x;
 591         struct seg *b = (struct seg *)y;
 592 
 593         if (a->s_base < b->s_base)
 594                 return (-1);
 595         if (a->s_base >= b->s_base + b->s_size)
 596                 return (1);
 597         return (0);
 598 }
 599 
 600 
 601 void
 602 as_avlinit(struct as *as)
 603 {
 604         avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
 605             offsetof(struct seg, s_tree));
 606         avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
 607             offsetof(struct watched_page, wp_link));
 608 }
 609 
 610 /*ARGSUSED*/
 611 static int
 612 as_constructor(void *buf, void *cdrarg, int kmflags)
 613 {
 614         struct as *as = buf;
 615 
 616         mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
 617         cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
 618         rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
 619         as_avlinit(as);
 620         return (0);
 621 }
 622 
 623 /*ARGSUSED1*/
 624 static void
 625 as_destructor(void *buf, void *cdrarg)
 626 {
 627         struct as *as = buf;
 628 
 629         avl_destroy(&as->a_segtree);
 630         mutex_destroy(&as->a_contents);
 631         cv_destroy(&as->a_cv);
 632         rw_destroy(&as->a_lock);
 633 }
 634 
 635 void
 636 as_init(void)
 637 {
 638         as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
 639             as_constructor, as_destructor, NULL, NULL, NULL, 0);
 640 }
 641 
 642 /*
 643  * Allocate and initialize an address space data structure.
 644  * We call hat_alloc to allow any machine dependent
 645  * information in the hat structure to be initialized.
 646  */
 647 struct as *
 648 as_alloc(void)
 649 {
 650         struct as *as;
 651 
 652         as = kmem_cache_alloc(as_cache, KM_SLEEP);
 653 
 654         as->a_flags          = 0;
 655         as->a_vbits          = 0;
 656         as->a_hrm            = NULL;
 657         as->a_seglast                = NULL;
 658         as->a_size           = 0;
 659         as->a_resvsize               = 0;
 660         as->a_updatedir              = 0;
 661         gethrestime(&as->a_updatetime);
 662         as->a_objectdir              = NULL;
 663         as->a_sizedir                = 0;
 664         as->a_userlimit              = (caddr_t)USERLIMIT;
 665         as->a_lastgap                = NULL;
 666         as->a_lastgaphl              = NULL;
 667         as->a_callbacks              = NULL;
 668 
 669         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 670         as->a_hat = hat_alloc(as);   /* create hat for default system mmu */
 671         AS_LOCK_EXIT(as, &as->a_lock);
 672 
 673         return (as);
 674 }
 675 
 676 /*
 677  * Free an address space data structure.
 678  * Need to free the hat first and then
 679  * all the segments on this as and finally
 680  * the space for the as struct itself.
 681  */
 682 void
 683 as_free(struct as *as)
 684 {
 685         struct hat *hat = as->a_hat;
 686         struct seg *seg, *next;
 687         boolean_t free_started = B_FALSE;
 688 
 689 top:
 690         /*
 691          * Invoke ALL callbacks. as_do_callbacks will do one callback
 692          * per call, and not return (-1) until the callback has completed.
 693          * When as_do_callbacks returns zero, all callbacks have completed.
 694          */
 695         mutex_enter(&as->a_contents);
 696         while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 697                 ;
 698 
 699         mutex_exit(&as->a_contents);
 700         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 701 
 702         if (!free_started) {
 703                 free_started = B_TRUE;
 704                 hat_free_start(hat);
 705         }
 706         for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
 707                 int err;
 708 
 709                 next = AS_SEGNEXT(as, seg);
 710 retry:
 711                 err = segop_unmap(seg, seg->s_base, seg->s_size);
 712                 if (err == EAGAIN) {
 713                         mutex_enter(&as->a_contents);
 714                         if (as->a_callbacks) {
 715                                 AS_LOCK_EXIT(as, &as->a_lock);
 716                         } else if (!AS_ISNOUNMAPWAIT(as)) {
 717                                 /*
 718                                  * Memory is currently locked. Wait for a
 719                                  * cv_signal that it has been unlocked, then
 720                                  * try the operation again.
 721                                  */
 722                                 if (AS_ISUNMAPWAIT(as) == 0)
 723                                         cv_broadcast(&as->a_cv);
 724                                 AS_SETUNMAPWAIT(as);
 725                                 AS_LOCK_EXIT(as, &as->a_lock);
 726                                 while (AS_ISUNMAPWAIT(as))
 727                                         cv_wait(&as->a_cv, &as->a_contents);
 728                         } else {
 729                                 /*
 730                                  * We may have raced with
 731                                  * segvn_reclaim()/segspt_reclaim(). In this
 732                                  * case clean nounmapwait flag and retry since
 733                                  * softlockcnt in this segment may be already
 734                                  * 0.  We don't drop as writer lock so our
 735                                  * number of retries without sleeping should
 736                                  * be very small. See segvn_reclaim() for
 737                                  * more comments.
 738                                  */
 739                                 AS_CLRNOUNMAPWAIT(as);
 740                                 mutex_exit(&as->a_contents);
 741                                 goto retry;
 742                         }
 743                         mutex_exit(&as->a_contents);
 744                         goto top;
 745                 } else {
 746                         /*
 747                          * We do not expect any other error return at this
 748                          * time. This is similar to an ASSERT in seg_unmap()
 749                          */
 750                         ASSERT(err == 0);
 751                 }
 752         }
 753         hat_free_end(hat);
 754         AS_LOCK_EXIT(as, &as->a_lock);
 755 
 756         /* /proc stuff */
 757         ASSERT(avl_numnodes(&as->a_wpage) == 0);
 758         if (as->a_objectdir) {
 759                 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
 760                 as->a_objectdir = NULL;
 761                 as->a_sizedir = 0;
 762         }
 763 
 764         /*
 765          * Free the struct as back to kmem.  Assert it has no segments.
 766          */
 767         ASSERT(avl_numnodes(&as->a_segtree) == 0);
 768         kmem_cache_free(as_cache, as);
 769 }
 770
     /*
      * Duplicate address space `as' into a newly allocated address space on
      * behalf of `forkedproc'.
      *
      * Both a_locks are taken as writer.  Each segment of `as' that is not
      * flagged S_PURGE is re-created at the same base and size in the new
      * address space with seg_alloc() and then filled in by segop_dup().
      * S_PURGE segments are skipped; their combined size is left out of the
      * a_resvsize given to the new address space.
      *
      * as_clearwatch() is called on `as' on entry and as_setwatch() is called
      * on it before every return.
      *
      * Returns 0 on success, after storing the new address space in
      * forkedproc->p_as.  Returns -1 if seg_alloc() fails, or the non-zero
      * value returned by segop_dup() or by the final hat_dup().  On every
      * failure the new address space is released with as_free() and
      * forkedproc->p_as is left untouched.
      */
 771 int
 772 as_dup(struct as *as, struct proc *forkedproc)
 773 {
 774         struct as *newas;
 775         struct seg *seg, *newseg;
 776         size_t  purgesize = 0;
 777         int error;
 778 
 779         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 780         as_clearwatch(as);
 781         newas = as_alloc();
 782         newas->a_userlimit = as->a_userlimit;
 783         newas->a_proc = forkedproc;
 784 
 785         AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
 786 
             /* The result of this first (HAT_DUP_SRD) pass is discarded. */
 787         (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
 788 
 789         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 790 
                     /*
                      * S_PURGE segments are not duplicated; only remember how
                      * much space they covered so it can be subtracted from
                      * a_resvsize once the loop is done.
                      */
 791                 if (seg->s_flags & S_PURGE) {
 792                         purgesize += seg->s_size;
 793                         continue;
 794                 }
 795 
 796                 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
 797                 if (newseg == NULL) {
 798                         AS_LOCK_EXIT(newas, &newas->a_lock);
 799                         as_setwatch(as);
 800                         AS_LOCK_EXIT(as, &as->a_lock);
 801                         as_free(newas);
 802                         return (-1);
 803                 }
 804                 if ((error = segop_dup(seg, newseg)) != 0) {
 805                         /*
 806                          * We call seg_free() on the new seg
 807                          * because the segment is not set up
 808                          * completely; i.e. it has no ops.
 809                          */
 810                         as_setwatch(as);
 811                         AS_LOCK_EXIT(as, &as->a_lock);
 812                         seg_free(newseg);
 813                         AS_LOCK_EXIT(newas, &newas->a_lock);
 814                         as_free(newas);
 815                         return (error);
 816                 }
 817                 newas->a_size += seg->s_size;
 818         }
 819         newas->a_resvsize = as->a_resvsize - purgesize;
 820 
             /* Second hat_dup() pass (HAT_DUP_ALL); its result is checked below. */
 821         error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
 822 
 823         AS_LOCK_EXIT(newas, &newas->a_lock);
 824 
 825         as_setwatch(as);
 826         AS_LOCK_EXIT(as, &as->a_lock);
 827         if (error != 0) {
 828                 as_free(newas);
 829                 return (error);
 830         }
 831         forkedproc->p_as = newas;
 832         return (0);
 833 }
 834 
 835 /*
 836  * Handle a ``fault'' at addr for size bytes.
      *
      * The request is widened to whole pages and then resolved one segment at
      * a time with segop_fault(), stopping at the first non-zero result.  The
      * segments covering the range must be contiguous: FC_NOMAP is returned if
      * no segment contains the rounded-down start address or if a gap is found
      * part way through the range.
      *
      * a_lock is held as reader for the walk, except for kas faults that fall
      * inside segkmap (see the XXX comment below).  lwp_nostop is raised on the
      * current lwp, if there is one, for the duration of each attempt.
      *
      * If an F_SOFTLOCK request fails part way, the portion that was already
      * processed is handed back to segop_fault() as F_SOFTUNLOCK.  A result
      * whose FC_ERRNO() is EDEADLK is never returned to the caller: the whole
      * fault is retried from the top after delay(deadlk_wait), which also
      * means the per-type statistics are bumped again on each retry.
      *
      * Returns 0 on success, otherwise FC_NOMAP as described above or the
      * non-zero faultcode_t produced by segop_fault().
 837  */
 838 faultcode_t
 839 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
 840         enum fault_type type, enum seg_rw rw)
 841 {
 842         struct seg *seg;
 843         caddr_t raddr;                  /* rounded down addr */
 844         size_t rsize;                   /* rounded up size */
 845         size_t ssize;
 846         faultcode_t res = 0;
 847         caddr_t addrsav;
 848         struct seg *segsav;
 849         int as_lock_held;
 850         klwp_t *lwp = ttolwp(curthread);
             /*
              * NOTE(review): holding_wpage is initialized to 0 and nothing in
              * this function ever sets it to a non-zero value, so the
              * "Restore watchpoints" branch in the loop below cannot run as
              * the code is written.
              */
 851         int holding_wpage = 0;
 852 
 853 
 854 
 855 retry:
 856         /*
 857          * Indicate that the lwp is not to be stopped while waiting for a
 858          * pagefault.  This is to avoid deadlock while debugging a process
 859          * via /proc over NFS (in particular).
 860          */
 861         if (lwp != NULL)
 862                 lwp->lwp_nostop++;
 863 
 864         /*
 865          * same length must be used when we softlock and softunlock.  We
 866          * don't support softunlocking lengths less than the original length
 867          * when there is largepage support.  See seg_dev.c for more
 868          * comments.
 869          */
 870         switch (type) {
 871 
 872         case F_SOFTLOCK:
 873                 CPU_STATS_ADD_K(vm, softlock, 1);
 874                 break;
 875 
 876         case F_SOFTUNLOCK:
 877                 break;
 878 
 879         case F_PROT:
 880                 CPU_STATS_ADD_K(vm, prot_fault, 1);
 881                 break;
 882 
 883         case F_INVAL:
 884                 CPU_STATS_ENTER_K();
 885                 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
 886                 if (as == &kas)
 887                         CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
 888                 CPU_STATS_EXIT_K();
 889                 break;
 890         }
 891 
 892         /* Kernel probe */
 893         TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
 894             tnf_opaque, address,        addr,
 895             tnf_fault_type,     fault_type,     type,
 896             tnf_seg_access,     access,         rw);
 897 
             /* Expand [addr, addr + size) outward to page boundaries. */
 898         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
 899         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
 900             (size_t)raddr;
 901 
 902         /*
 903          * XXX -- Don't grab the as lock for segkmap. We should grab it for
 904          * correctness, but then we could be stuck holding this lock for
 905          * a LONG time if the fault needs to be resolved on a slow
 906          * filesystem, and then no-one will be able to exec new commands,
 907          * as exec'ing requires the write lock on the as.
              *
              * NOTE(review): the upper bound in the test below is computed
              * from the caller's size, not from the page-rounded rsize --
              * confirm that this is intended.
 908          */
 909         if (as == &kas && segkmap && segkmap->s_base <= raddr &&
 910             raddr + size < segkmap->s_base + segkmap->s_size) {
 911                 seg = segkmap;
 912                 as_lock_held = 0;
 913         } else {
 914                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
 915 
 916                 seg = as_segat(as, raddr);
 917                 if (seg == NULL) {
 918                         AS_LOCK_EXIT(as, &as->a_lock);
 919                         if (lwp != NULL)
 920                                 lwp->lwp_nostop--;
 921                         return (FC_NOMAP);
 922                 }
 923 
 924                 as_lock_held = 1;
 925         }
 926 
             /* Remember where we started in case a SOFTLOCK must be undone. */
 927         addrsav = raddr;
 928         segsav = seg;
 929 
 930         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                     /*
                      * Past the end of the current segment: the next one must
                      * begin exactly at raddr, otherwise there is a hole.
                      */
 931                 if (raddr >= seg->s_base + seg->s_size) {
 932                         seg = AS_SEGNEXT(as, seg);
 933                         if (seg == NULL || raddr != seg->s_base) {
 934                                 res = FC_NOMAP;
 935                                 break;
 936                         }
 937                 }
                     /* Clip this pass to the end of the current segment. */
 938                 if (raddr + rsize > seg->s_base + seg->s_size)
 939                         ssize = seg->s_base + seg->s_size - raddr;
 940                 else
 941                         ssize = rsize;
 942 
 943                 res = segop_fault(hat, seg, raddr, ssize, type, rw);
 944 
 945                 /* Restore watchpoints */
 946                 if (holding_wpage) {
 947                         as_setwatch(as);
 948                         holding_wpage = 0;
 949                 }
 950 
 951                 if (res != 0)
 952                         break;
 953         }
 954 
 955         /*
 956          * If we were SOFTLOCKing and encountered a failure,
 957          * we must SOFTUNLOCK the range we already did. (Maybe we
 958          * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
 959          * right here...)
 960          */
 961         if (res != 0 && type == F_SOFTLOCK) {
                     /*
                      * Walk [addrsav, raddr) again from the first segment;
                      * raddr is the address at which the walk above stopped.
                      */
 962                 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
 963                         if (addrsav >= seg->s_base + seg->s_size)
 964                                 seg = AS_SEGNEXT(as, seg);
 965                         ASSERT(seg != NULL);
 966                         /*
 967                          * Now call the fault routine again to perform the
 968                          * unlock using S_OTHER instead of the rw variable
 969                          * since we never got a chance to touch the pages.
 970                          */
 971                         if (raddr > seg->s_base + seg->s_size)
 972                                 ssize = seg->s_base + seg->s_size - addrsav;
 973                         else
 974                                 ssize = raddr - addrsav;
 975                         (void) segop_fault(hat, seg, addrsav, ssize,
 976                             F_SOFTUNLOCK, S_OTHER);
 977                 }
 978         }
 979         if (as_lock_held)
 980                 AS_LOCK_EXIT(as, &as->a_lock);
 981         if (lwp != NULL)
 982                 lwp->lwp_nostop--;
 983 
 984         /*
 985          * If the lower levels returned EDEADLK for a fault,
 986          * It means that we should retry the fault.  Let's wait
 987          * a bit also to let the deadlock causing condition clear.
 988          * This is part of a gross hack to work around a design flaw
 989          * in the ufs/sds logging code and should go away when the
 990          * logging code is re-designed to fix the problem. See bug
 991          * 4125102 for details of the problem.
 992          */
 993         if (FC_ERRNO(res) == EDEADLK) {
 994                 delay(deadlk_wait);
 995                 res = 0;
 996                 goto retry;
 997         }
 998         return (res);
 999 }
1000 
1001 
1002 
1003 /*
1004  * Asynchronous ``fault'' at addr for size bytes.
      *
      * The range is rounded out to page boundaries and, with a_lock held as
      * reader, segop_faulta() is called once for every page in it.  The walk
      * stops at the first non-zero result.  The segments covering the range
      * must be contiguous.  lwp_nostop is raised on the current lwp, if there
      * is one, for the duration of each attempt.
      *
      * Returns 0 on success, FC_NOMAP if the rounded-down start address is not
      * in any segment or a gap is found in the range, or the non-zero value
      * returned by segop_faulta().  A result whose FC_ERRNO() is EDEADLK is
      * not returned; the whole operation is retried after delay(deadlk_wait).
1005  */
1006 faultcode_t
1007 as_faulta(struct as *as, caddr_t addr, size_t size)
1008 {
1009         struct seg *seg;
1010         caddr_t raddr;                  /* rounded down addr */
1011         size_t rsize;                   /* rounded up size */
1012         faultcode_t res = 0;
1013         klwp_t *lwp = ttolwp(curthread);
1014 
1015 retry:
1016         /*
1017          * Indicate that the lwp is not to be stopped while waiting
1018          * for a pagefault.  This is to avoid deadlock while debugging
1019          * a process via /proc over NFS (in particular).
1020          */
1021         if (lwp != NULL)
1022                 lwp->lwp_nostop++;
1023 
1024         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1025         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1026             (size_t)raddr;
1027 
1028         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1029         seg = as_segat(as, raddr);
1030         if (seg == NULL) {
1031                 AS_LOCK_EXIT(as, &as->a_lock);
1032                 if (lwp != NULL)
1033                         lwp->lwp_nostop--;
1034                 return (FC_NOMAP);
1035         }
1036 
             /* One segop_faulta() call per page, moving to the next seg as needed. */
1037         for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1038                 if (raddr >= seg->s_base + seg->s_size) {
1039                         seg = AS_SEGNEXT(as, seg);
                             /* The next segment must start exactly at raddr. */
1040                         if (seg == NULL || raddr != seg->s_base) {
1041                                 res = FC_NOMAP;
1042                                 break;
1043                         }
1044                 }
1045                 res = segop_faulta(seg, raddr);
1046                 if (res != 0)
1047                         break;
1048         }
1049         AS_LOCK_EXIT(as, &as->a_lock);
1050         if (lwp != NULL)
1051                 lwp->lwp_nostop--;
1052         /*
1053          * If the lower levels returned EDEADLK for a fault,
1054          * It means that we should retry the fault.  Let's wait
1055          * a bit also to let the deadlock causing condition clear.
1056          * This is part of a gross hack to work around a design flaw
1057          * in the ufs/sds logging code and should go away when the
1058          * logging code is re-designed to fix the problem. See bug
1059          * 4125102 for details of the problem.
1060          */
1061         if (FC_ERRNO(res) == EDEADLK) {
1062                 delay(deadlk_wait);
1063                 res = 0;
1064                 goto retry;
1065         }
1066         return (res);
1067 }
1068 
1069 /*
1070  * Set the virtual mapping for the interval from [addr : addr + size)
1071  * in address space `as' to have the specified protection.
1072  * It is ok for the range to cross over several segments,
1073  * as long as they are contiguous.
      *
      * Returns 0 on success.  Returns ENOMEM if the page-rounded range wraps
      * around, if no segment contains its start, or if a gap is found in it.
      * Returns EAGAIN if segop_setprot() reports IE_NOMEM.  Any other non-zero
      * value from segop_setprot(), apart from the IE_RETRY and EAGAIN cases
      * that are handled internally by retrying, is returned unchanged.
      *
      * On success as_setwatchprot() is applied to the whole rounded range;
      * on a failure detected after the lock is taken as_setwatch() is called
      * instead.
      *
      * NOTE(review): the paths that restart at setprot_top drop a_lock after
      * as_clearwatchprot() has run without first calling as_setwatch();
      * as_clearwatchprot() is then run again once the lock is re-acquired.
      * Confirm that leaving the lock dropped in that state is intended.
1074  */
1075 int
1076 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1077 {
1078         struct seg *seg;
1079         struct as_callback *cb;
1080         size_t ssize;
1081         caddr_t raddr;                  /* rounded down addr */
1082         size_t rsize;                   /* rounded up size */
1083         int error = 0, writer = 0;
1084         caddr_t saveraddr;
1085         size_t saversize;
1086 
1087 setprot_top:
1088         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1089         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1090             (size_t)raddr;
1091 
1092         if (raddr + rsize < raddr)           /* check for wraparound */
1093                 return (ENOMEM);
1094 
             /* The loop below consumes raddr/rsize; keep the full range. */
1095         saveraddr = raddr;
1096         saversize = rsize;
1097 
1098         /*
1099          * Normally we only lock the as as a reader. But
1100          * if due to setprot the segment driver needs to split
1101          * a segment it will return IE_RETRY. Therefore we re-acquire
1102          * the as lock as a writer so the segment driver can change
1103          * the seg list. Also the segment driver will return IE_RETRY
1104          * after it has changed the segment list so we therefore keep
1105          * locking as a writer. Since these operations should be rare
1106          * we want to only lock as a writer when necessary.
1107          */
1108         if (writer || avl_numnodes(&as->a_wpage) != 0) {
1109                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1110         } else {
1111                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1112         }
1113 
1114         as_clearwatchprot(as, raddr, rsize);
1115         seg = as_segat(as, raddr);
1116         if (seg == NULL) {
1117                 as_setwatch(as);
1118                 AS_LOCK_EXIT(as, &as->a_lock);
1119                 return (ENOMEM);
1120         }
1121 
1122         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1123                 if (raddr >= seg->s_base + seg->s_size) {
1124                         seg = AS_SEGNEXT(as, seg);
                             /* The next segment must start exactly at raddr. */
1125                         if (seg == NULL || raddr != seg->s_base) {
1126                                 error = ENOMEM;
1127                                 break;
1128                         }
1129                 }
                     /* Clip this pass to the end of the current segment. */
1130                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1131                         ssize = seg->s_base + seg->s_size - raddr;
1132                 else
1133                         ssize = rsize;
1134 retry:
1135                 error = segop_setprot(seg, raddr, ssize, prot);
1136 
1137                 if (error == IE_NOMEM) {
1138                         error = EAGAIN;
1139                         break;
1140                 }
1141 
                     /* Start over from the top holding a_lock as writer. */
1142                 if (error == IE_RETRY) {
1143                         AS_LOCK_EXIT(as, &as->a_lock);
1144                         writer = 1;
1145                         goto setprot_top;
1146                 }
1147 
1148                 if (error == EAGAIN) {
1149                         /*
1150                          * Make sure we have a_lock as writer.
1151                          */
1152                         if (writer == 0) {
1153                                 AS_LOCK_EXIT(as, &as->a_lock);
1154                                 writer = 1;
1155                                 goto setprot_top;
1156                         }
1157 
1158                         /*
1159                          * Memory is currently locked.  It must be unlocked
1160                          * before this operation can succeed through a retry.
1161                          * The possible reasons for locked memory and
1162                          * corresponding strategies for unlocking are:
1163                          * (1) Normal I/O
1164                          *      wait for a signal that the I/O operation
1165                          *      has completed and the memory is unlocked.
1166                          * (2) Asynchronous I/O
1167                          *      The aio subsystem does not unlock pages when
1168                          *      the I/O is completed. Those pages are unlocked
1169                          *      when the application calls aiowait/aioerror.
1170                          *      So, to prevent blocking forever, cv_broadcast()
1171                          *      is done to wake up aio_cleanup_thread.
1172                          *      Subsequently, segvn_reclaim will be called, and
1173                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1174                          * (3) Long term page locking:
1175                          *      Drivers intending to have pages locked for a
1176                          *      period considerably longer than for normal I/O
1177                          *      (essentially forever) may have registered for a
1178                          *      callback so they may unlock these pages on
1179                          *      request. This is needed to allow this operation
1180                          *      to succeed. Each entry on the callback list is
1181                          *      examined. If the event or address range pertains
1182                          *      the callback is invoked (unless it already is in
1183                          *      progress). The a_contents lock must be dropped
1184                          *      before the callback, so only one callback can
1185                          *      be done at a time. Go to the top and do more
1186                          *      until zero is returned. If zero is returned,
1187                          *      either there were no callbacks for this event
1188                          *      or they were already in progress.
1189                          */
1190                         mutex_enter(&as->a_contents);
                             /*
                              * In the first two arms below a_lock is dropped
                              * while a_contents is still held; a_contents is
                              * released after the if/else chain and the whole
                              * operation restarts at setprot_top.  The last
                              * arm keeps a_lock and retries this segment only.
                              */
1191                         if (as->a_callbacks &&
1192                             (cb = as_find_callback(as, AS_SETPROT_EVENT,
1193                             seg->s_base, seg->s_size))) {
1194                                 AS_LOCK_EXIT(as, &as->a_lock);
1195                                 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1196                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1197                                 if (AS_ISUNMAPWAIT(as) == 0)
1198                                         cv_broadcast(&as->a_cv);
1199                                 AS_SETUNMAPWAIT(as);
1200                                 AS_LOCK_EXIT(as, &as->a_lock);
1201                                 while (AS_ISUNMAPWAIT(as))
1202                                         cv_wait(&as->a_cv, &as->a_contents);
1203                         } else {
1204                                 /*
1205                                  * We may have raced with
1206                                  * segvn_reclaim()/segspt_reclaim(). In this
1207                                  * case clean nounmapwait flag and retry since
1208                                  * softlockcnt in this segment may be already
1209                                  * 0.  We don't drop as writer lock so our
1210                                  * number of retries without sleeping should
1211                                  * be very small. See segvn_reclaim() for
1212                                  * more comments.
1213                                  */
1214                                 AS_CLRNOUNMAPWAIT(as);
1215                                 mutex_exit(&as->a_contents);
1216                                 goto retry;
1217                         }
1218                         mutex_exit(&as->a_contents);
1219                         goto setprot_top;
1220                 } else if (error != 0)
1221                         break;
1222         }
1223         if (error != 0) {
1224                 as_setwatch(as);
1225         } else {
1226                 as_setwatchprot(as, saveraddr, saversize, prot);
1227         }
1228         AS_LOCK_EXIT(as, &as->a_lock);
1229         return (error);
1230 }
1231 
1232 /*
1233  * Check to make sure that the interval [addr, addr + size)
1234  * in address space `as' has at least the specified protection.
1235  * It is ok for the range to cross over several segments, as long
1236  * as they are contiguous.
      *
      * Returns 0 if segop_checkprot() returns 0 for every segment piece in
      * the page-rounded range.  Returns ENOMEM if the rounded range wraps
      * around, if no segment contains its start, or if a gap is found in it.
      * Otherwise returns the first non-zero value from segop_checkprot().
      *
      * as_clearwatchprot() is applied to the range once the lock is held and
      * as_setwatch() is called before every return that follows it.
1237  */
1238 int
1239 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1240 {
1241         struct seg *seg;
1242         size_t ssize;
1243         caddr_t raddr;                  /* rounded down addr */
1244         size_t rsize;                   /* rounded up size */
1245         int error = 0;
1246 
1247         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1248         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1249             (size_t)raddr;
1250 
1251         if (raddr + rsize < raddr)           /* check for wraparound */
1252                 return (ENOMEM);
1253 
1254         /*
1255          * This is ugly as sin...
1256          * Normally, we only acquire the address space readers lock.
1257          * However, if the address space has watchpoints present,
1258          * we must acquire the writer lock on the address space for
1259          * the benefit of as_clearwatchprot() and as_setwatchprot().
1260          */
1261         if (avl_numnodes(&as->a_wpage) != 0)
1262                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1263         else
1264                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1265         as_clearwatchprot(as, raddr, rsize);
1266         seg = as_segat(as, raddr);
1267         if (seg == NULL) {
1268                 as_setwatch(as);
1269                 AS_LOCK_EXIT(as, &as->a_lock);
1270                 return (ENOMEM);
1271         }
1272 
1273         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1274                 if (raddr >= seg->s_base + seg->s_size) {
1275                         seg = AS_SEGNEXT(as, seg);
                             /* The next segment must start exactly at raddr. */
1276                         if (seg == NULL || raddr != seg->s_base) {
1277                                 error = ENOMEM;
1278                                 break;
1279                         }
1280                 }
                     /* Clip this pass to the end of the current segment. */
1281                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1282                         ssize = seg->s_base + seg->s_size - raddr;
1283                 else
1284                         ssize = rsize;
1285 
1286                 error = segop_checkprot(seg, raddr, ssize, prot);
1287                 if (error != 0)
1288                         break;
1289         }
1290         as_setwatch(as);
1291         AS_LOCK_EXIT(as, &as->a_lock);
1292         return (error);
1293 }
1294
     /*
      * Unmap the page-rounded interval covering [addr, addr + size) from
      * address space `as'.
      *
      * a_lock is held as writer.  Starting from the segment located by
      * as_findseg(), segop_unmap() is called on the part of each segment that
      * overlaps the range.  Unlike as_setprot() and as_checkprot(), the range
      * does not have to be fully mapped: a start address that lies in a gap
      * is advanced to the next segment's base, and the walk simply ends when
      * the end address is at or below the next segment's base.
      *
      * For each piece that is unmapped, a_size is reduced by its length and
      * a_resvsize is reduced by the same amount unless the segment satisfies
      * SEG_IS_DEVNULL_MAPPING() or SEG_IS_PARTIAL_RESV().  a_updatedir and
      * a_updatetime are updated every time the lock is taken.
      *
      * EAGAIN and IE_RETRY from segop_unmap() are handled internally.  Other
      * than the one case that retries the same segment in place, they restart
      * at `top', which recomputes the range from addr and size.
      *
      * Returns 0 on success, or -1 if segop_unmap() returns any other
      * non-zero value.  as_setwatch() is called only on that error return.
      */
1295 int
1296 as_unmap(struct as *as, caddr_t addr, size_t size)
1297 {
1298         struct seg *seg, *seg_next;
1299         struct as_callback *cb;
1300         caddr_t raddr, eaddr;
1301         size_t ssize, rsize = 0;
1302         int err;
1303 
1304 top:
1305         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1306         eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1307             (uintptr_t)PAGEMASK);
1308 
1309         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1310 
1311         as->a_updatedir = 1; /* inform /proc */
1312         gethrestime(&as->a_updatetime);
1313 
1314         /*
1315          * Use as_findseg to find the first segment in the range, then
1316          * step through the segments in order, following s_next.
1317          */
1318         as_clearwatchprot(as, raddr, eaddr - raddr);
1319 
1320         for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1321                 if (eaddr <= seg->s_base)
1322                         break;          /* eaddr was in a gap; all done */
1323 
1324                 /* this is implied by the test above */
1325                 ASSERT(raddr < eaddr);
1326 
1327                 if (raddr < seg->s_base)
1328                         raddr = seg->s_base;         /* raddr was in a gap */
1329 
                     /* Clip this pass to the end of the current segment. */
1330                 if (eaddr > (seg->s_base + seg->s_size))
1331                         ssize = seg->s_base + seg->s_size - raddr;
1332                 else
1333                         ssize = eaddr - raddr;
1334 
1335                 /*
1336                  * Save next segment pointer since seg can be
1337                  * destroyed during the segment unmap operation.
1338                  */
1339                 seg_next = AS_SEGNEXT(as, seg);
1340 
1341                 /*
1342                  * We didn't count /dev/null mappings, so ignore them here.
1343                  * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1344                  * we have to do this check here while we have seg.)
1345                  */
1346                 rsize = 0;
1347                 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1348                     !SEG_IS_PARTIAL_RESV(seg))
1349                         rsize = ssize;
1350 
1351 retry:
1352                 err = segop_unmap(seg, raddr, ssize);
1353                 if (err == EAGAIN) {
1354                         /*
1355                          * Memory is currently locked.  It must be unlocked
1356                          * before this operation can succeed through a retry.
1357                          * The possible reasons for locked memory and
1358                          * corresponding strategies for unlocking are:
1359                          * (1) Normal I/O
1360                          *      wait for a signal that the I/O operation
1361                          *      has completed and the memory is unlocked.
1362                          * (2) Asynchronous I/O
1363                          *      The aio subsystem does not unlock pages when
1364                          *      the I/O is completed. Those pages are unlocked
1365                          *      when the application calls aiowait/aioerror.
1366                          *      So, to prevent blocking forever, cv_broadcast()
1367                          *      is done to wake up aio_cleanup_thread.
1368                          *      Subsequently, segvn_reclaim will be called, and
1369                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1370                          * (3) Long term page locking:
1371                          *      Drivers intending to have pages locked for a
1372                          *      period considerably longer than for normal I/O
1373                          *      (essentially forever) may have registered for a
1374                          *      callback so they may unlock these pages on
1375                          *      request. This is needed to allow this operation
1376                          *      to succeed. Each entry on the callback list is
1377                          *      examined. If the event or address range pertains
1378                          *      the callback is invoked (unless it already is in
1379                          *      progress). The a_contents lock must be dropped
1380                          *      before the callback, so only one callback can
1381                          *      be done at a time. Go to the top and do more
1382                          *      until zero is returned. If zero is returned,
1383                          *      either there were no callbacks for this event
1384                          *      or they were already in progress.
1385                          */
1386                         mutex_enter(&as->a_contents);
                             /*
                              * In the first two arms below a_lock is dropped
                              * while a_contents is still held; a_contents is
                              * released after the if/else chain and the whole
                              * operation restarts at top.  The last arm keeps
                              * a_lock and retries this segment only.
                              */
1387                         if (as->a_callbacks &&
1388                             (cb = as_find_callback(as, AS_UNMAP_EVENT,
1389                             seg->s_base, seg->s_size))) {
1390                                 AS_LOCK_EXIT(as, &as->a_lock);
1391                                 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1392                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1393                                 if (AS_ISUNMAPWAIT(as) == 0)
1394                                         cv_broadcast(&as->a_cv);
1395                                 AS_SETUNMAPWAIT(as);
1396                                 AS_LOCK_EXIT(as, &as->a_lock);
1397                                 while (AS_ISUNMAPWAIT(as))
1398                                         cv_wait(&as->a_cv, &as->a_contents);
1399                         } else {
1400                                 /*
1401                                  * We may have raced with
1402                                  * segvn_reclaim()/segspt_reclaim(). In this
1403                                  * case clean nounmapwait flag and retry since
1404                                  * softlockcnt in this segment may be already
1405                                  * 0.  We don't drop as writer lock so our
1406                                  * number of retries without sleeping should
1407                                  * be very small. See segvn_reclaim() for
1408                                  * more comments.
1409                                  */
1410                                 AS_CLRNOUNMAPWAIT(as);
1411                                 mutex_exit(&as->a_contents);
1412                                 goto retry;
1413                         }
1414                         mutex_exit(&as->a_contents);
1415                         goto top;
1416                 } else if (err == IE_RETRY) {
1417                         AS_LOCK_EXIT(as, &as->a_lock);
1418                         goto top;
1419                 } else if (err) {
1420                         as_setwatch(as);
1421                         AS_LOCK_EXIT(as, &as->a_lock);
1422                         return (-1);
1423                 }
1424 
                     /* This piece is gone; update the accounting and move on. */
1425                 as->a_size -= ssize;
1426                 if (rsize)
1427                         as->a_resvsize -= rsize;
1428                 raddr += ssize;
1429         }
1430         AS_LOCK_EXIT(as, &as->a_lock);
1431         return (0);
1432 }
1433
     /*
      * Create the segment(s) that map [addr, addr + size) in `as', using crfp
      * as the segment create routine and vn_a as its argument.  The caller
      * must hold a_lock as writer; addr and size must be page aligned.
      *
      * szcvec is a bit vector of usable page size codes (bit N stands for
      * szc N).  If it is 0 or 1 a single segment with szc 0 covers the whole
      * range.  Otherwise the range is carved up in two passes:
      *
      *  - walking the set bits from low to high, a leading segment is created
      *    each time addr is not yet aligned to the next enabled page size;
      *    it runs up to that alignment and uses the previous enabled szc;
      *
      *  - walking from the largest enabled szc back down to szc 0, a segment
      *    is created from addr up to eaddr aligned down to the current page
      *    size, whenever that span is not empty.
      *
      * For every segment created a_size and a_resvsize grow by its length.
      * When vn_a has a vp or an amp, vn_a->offset is advanced past each piece
      * created by the two passes; otherwise it is forced to 0 on entry.
      * *segcreated is set to 1 whenever one of the two passes creates a
      * segment; the single-segment path does not touch it.
      *
      * Returns 0 on success, ENOMEM if seg_alloc() fails, or the non-zero
      * value returned by crfp (after seg_free() on that segment).  Segments
      * created before a failure are not removed here.
      */
1434 static int
1435 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1436     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1437 {
1438         uint_t szc;
1439         uint_t nszc;
1440         int error;
1441         caddr_t a;
1442         caddr_t eaddr;
1443         size_t segsize;
1444         struct seg *seg;
1445         size_t pgsz;
1446         int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1447         uint_t save_szcvec;
1448 
1449         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1450         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1451         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1452         ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1453         if (!do_off) {
1454                 vn_a->offset = 0;
1455         }
1456 
             /* No page size code above 0 is enabled: one segment, szc 0. */
1457         if (szcvec <= 1) {
1458                 seg = seg_alloc(as, addr, size);
1459                 if (seg == NULL) {
1460                         return (ENOMEM);
1461                 }
1462                 vn_a->szc = 0;
1463                 error = (*crfp)(seg, vn_a);
1464                 if (error != 0) {
1465                         seg_free(seg);
1466                 } else {
1467                         as->a_size += size;
1468                         as->a_resvsize += size;
1469                 }
1470                 return (error);
1471         }
1472 
             /*
              * First pass: nszc tracks the bit being examined, szc the last
              * enabled code seen.  Bit 0 is shifted out before the loop.
              */
1473         eaddr = addr + size;
1474         save_szcvec = szcvec;
1475         szcvec >>= 1;
1476         szc = 0;
1477         nszc = 0;
1478         while (szcvec) {
1479                 if ((szcvec & 0x1) == 0) {
1480                         nszc++;
1481                         szcvec >>= 1;
1482                         continue;
1483                 }
1484                 nszc++;
1485                 pgsz = page_get_pagesize(nszc);
1486                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1487                 if (a != addr) {
1488                         ASSERT(a < eaddr);
1489                         segsize = a - addr;
1490                         seg = seg_alloc(as, addr, segsize);
1491                         if (seg == NULL) {
1492                                 return (ENOMEM);
1493                         }
1494                         vn_a->szc = szc;
1495                         error = (*crfp)(seg, vn_a);
1496                         if (error != 0) {
1497                                 seg_free(seg);
1498                                 return (error);
1499                         }
1500                         as->a_size += segsize;
1501                         as->a_resvsize += segsize;
1502                         *segcreated = 1;
1503                         if (do_off) {
1504                                 vn_a->offset += segsize;
1505                         }
1506                         addr = a;
1507                 }
1508                 szc = nszc;
1509                 szcvec >>= 1;
1510         }
1511 
             /*
              * Second pass: szc and pgsz still hold the largest enabled code
              * and its page size from the loop above.  Each iteration clears
              * the current szc bit and drops to the next lower enabled one.
              */
1512         ASSERT(addr < eaddr);
1513         szcvec = save_szcvec | 1; /* also enable szc 0 (bit 0) */
1514         while (szcvec) {
1515                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1516                 ASSERT(a >= addr);
1517                 if (a != addr) {
1518                         segsize = a - addr;
1519                         seg = seg_alloc(as, addr, segsize);
1520                         if (seg == NULL) {
1521                                 return (ENOMEM);
1522                         }
1523                         vn_a->szc = szc;
1524                         error = (*crfp)(seg, vn_a);
1525                         if (error != 0) {
1526                                 seg_free(seg);
1527                                 return (error);
1528                         }
1529                         as->a_size += segsize;
1530                         as->a_resvsize += segsize;
1531                         *segcreated = 1;
1532                         if (do_off) {
1533                                 vn_a->offset += segsize;
1534                         }
1535                         addr = a;
1536                 }
1537                 szcvec &= ~(1 << szc);
1538                 if (szcvec) {
1539                         szc = highbit(szcvec) - 1;
1540                         pgsz = page_get_pagesize(szc);
1541                 }
1542         }
1543         ASSERT(addr == eaddr);
1544 
1545         return (0);
1546 }
1547 
1548 static int
1549 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1550     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1551 {
1552         uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1553         int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1554         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1555             type, 0);
1556         int error;
1557         struct seg *seg;
1558         struct vattr va;
1559         u_offset_t eoff;
1560         size_t save_size = 0;
1561         extern size_t textrepl_size_thresh;
1562 
1563         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1564         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1565         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1566         ASSERT(vn_a->vp != NULL);
1567         ASSERT(vn_a->amp == NULL);
1568 
1569 again:
1570         if (szcvec <= 1) {
1571                 seg = seg_alloc(as, addr, size);
1572                 if (seg == NULL) {
1573                         return (ENOMEM);
1574                 }
1575                 vn_a->szc = 0;
1576                 error = (*crfp)(seg, vn_a);
1577                 if (error != 0) {
1578                         seg_free(seg);
1579                 } else {
1580                         as->a_size += size;
1581                         as->a_resvsize += size;
1582                 }
1583                 return (error);
1584         }
1585 
1586         va.va_mask = AT_SIZE;
1587         if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1588                 szcvec = 0;
1589                 goto again;
1590         }
1591         eoff = vn_a->offset & PAGEMASK;
1592         if (eoff >= va.va_size) {
1593                 szcvec = 0;
1594                 goto again;
1595         }
1596         eoff += size;
1597         if (btopr(va.va_size) < btopr(eoff)) {
1598                 save_size = size;
1599                 size = va.va_size - (vn_a->offset & PAGEMASK);
1600                 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1601                 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1602                     type, 0);
1603                 if (szcvec <= 1) {
1604                         size = save_size;
1605                         goto again;
1606                 }
1607         }
1608 
1609         if (size > textrepl_size_thresh) {
1610                 vn_a->flags |= _MAP_TEXTREPL;
1611         }
1612         error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1613             segcreated);
1614         if (error != 0) {
1615                 return (error);
1616         }
1617         if (save_size) {
1618                 addr += size;
1619                 size = save_size - size;
1620                 szcvec = 0;
1621                 goto again;
1622         }
1623         return (0);
1624 }
1625 
1626 /*
1627  * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1628  * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1629  */
1630 static int
1631 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1632     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1633 {
1634         uint_t szcvec;
1635         uchar_t type;
1636 
1637         ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1638         if (vn_a->type == MAP_SHARED) {
1639                 type = MAPPGSZC_SHM;
1640         } else if (vn_a->type == MAP_PRIVATE) {
1641                 if (vn_a->szc == AS_MAP_HEAP) {
1642                         type = MAPPGSZC_HEAP;
1643                 } else if (vn_a->szc == AS_MAP_STACK) {
1644                         type = MAPPGSZC_STACK;
1645                 } else {
1646                         type = MAPPGSZC_PRIVM;
1647                 }
1648         }
1649         szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1650             (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1651             (vn_a->flags & MAP_TEXT), type, 0);
1652         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1653         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1654         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1655         ASSERT(vn_a->vp == NULL);
1656 
1657         return (as_map_segvn_segs(as, addr, size, szcvec,
1658             crfp, vn_a, segcreated));
1659 }
1660 
1661 int
1662 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1663 {
1664         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1665         return (as_map_locked(as, addr, size, crfp, argsp));
1666 }
1667 
1668 int
1669 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1670                 void *argsp)
1671 {
1672         struct seg *seg = NULL;
1673         caddr_t raddr;                  /* rounded down addr */
1674         size_t rsize;                   /* rounded up size */
1675         int error;
1676         int unmap = 0;
1677         struct proc *p = curproc;
1678         struct segvn_crargs crargs;
1679 
1680         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1681         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1682             (size_t)raddr;
1683 
1684         /*
1685          * check for wrap around
1686          */
1687         if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1688                 AS_LOCK_EXIT(as, &as->a_lock);
1689                 return (ENOMEM);
1690         }
1691 
1692         as->a_updatedir = 1; /* inform /proc */
1693         gethrestime(&as->a_updatetime);
1694 
1695         if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1696                 AS_LOCK_EXIT(as, &as->a_lock);
1697 
1698                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1699                     RCA_UNSAFE_ALL);
1700 
1701                 return (ENOMEM);
1702         }
1703 
1704         if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1705                 crargs = *(struct segvn_crargs *)argsp;
1706                 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1707                 if (error != 0) {
1708                         AS_LOCK_EXIT(as, &as->a_lock);
1709                         if (unmap) {
1710                                 (void) as_unmap(as, addr, size);
1711                         }
1712                         return (error);
1713                 }
1714         } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1715                 crargs = *(struct segvn_crargs *)argsp;
1716                 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1717                 if (error != 0) {
1718                         AS_LOCK_EXIT(as, &as->a_lock);
1719                         if (unmap) {
1720                                 (void) as_unmap(as, addr, size);
1721                         }
1722                         return (error);
1723                 }
1724         } else {
1725                 seg = seg_alloc(as, addr, size);
1726                 if (seg == NULL) {
1727                         AS_LOCK_EXIT(as, &as->a_lock);
1728                         return (ENOMEM);
1729                 }
1730 
1731                 error = (*crfp)(seg, argsp);
1732                 if (error != 0) {
1733                         seg_free(seg);
1734                         AS_LOCK_EXIT(as, &as->a_lock);
1735                         return (error);
1736                 }
1737                 /*
1738                  * Add size now so as_unmap will work if as_ctl fails.
1739                  */
1740                 as->a_size += rsize;
1741                 as->a_resvsize += rsize;
1742         }
1743 
1744         as_setwatch(as);
1745 
1746         /*
1747          * If the address space is locked,
1748          * establish memory locks for the new segment.
1749          */
1750         mutex_enter(&as->a_contents);
1751         if (AS_ISPGLCK(as)) {
1752                 mutex_exit(&as->a_contents);
1753                 AS_LOCK_EXIT(as, &as->a_lock);
1754                 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1755                 if (error != 0)
1756                         (void) as_unmap(as, addr, size);
1757         } else {
1758                 mutex_exit(&as->a_contents);
1759                 AS_LOCK_EXIT(as, &as->a_lock);
1760         }
1761         return (error);
1762 }
1763 
1764 
1765 /*
1766  * Delete all segments in the address space marked with S_PURGE.
1767  * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1768  * These segments are deleted as a first step before calls to as_gap(), so
1769  * that they don't affect mmap() or shmat().
1770  */
1771 void
1772 as_purge(struct as *as)
1773 {
1774         struct seg *seg;
1775         struct seg *next_seg;
1776 
1777         /*
1778          * the setting of NEEDSPURGE is protect by as_rangelock(), so
1779          * no need to grab a_contents mutex for this check
1780          */
1781         if ((as->a_flags & AS_NEEDSPURGE) == 0)
1782                 return;
1783 
1784         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1785         next_seg = NULL;
1786         seg = AS_SEGFIRST(as);
1787         while (seg != NULL) {
1788                 next_seg = AS_SEGNEXT(as, seg);
1789                 if (seg->s_flags & S_PURGE)
1790                         segop_unmap(seg, seg->s_base, seg->s_size);
1791                 seg = next_seg;
1792         }
1793         AS_LOCK_EXIT(as, &as->a_lock);
1794 
1795         mutex_enter(&as->a_contents);
1796         as->a_flags &= ~AS_NEEDSPURGE;
1797         mutex_exit(&as->a_contents);
1798 }
1799 
1800 /*
1801  * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1802  * range of addresses at least "minlen" long, where the base of the range is
1803  * at "off" phase from an "align" boundary and there is space for a
1804  * "redzone"-sized redzone on eithe rside of the range.  Thus,
1805  * if align was 4M and off was 16k, the user wants a hole which will start
1806  * 16k into a 4M page.
1807  *
1808  * If flags specifies AH_HI, the hole will have the highest possible address
1809  * in the range.  We use the as->a_lastgap field to figure out where to
1810  * start looking for a gap.
1811  *
1812  * Otherwise, the gap will have the lowest possible address.
1813  *
1814  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1815  *
1816  * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1817  * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1818  *
1819  * NOTE: This routine is not correct when base+len overflows caddr_t.
1820  */
1821 int
1822 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1823     uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1824 {
1825         caddr_t lobound = *basep;
1826         caddr_t hibound = lobound + *lenp;
1827         struct seg *lseg, *hseg;
1828         caddr_t lo, hi;
1829         int forward;
1830         caddr_t save_base;
1831         size_t save_len;
1832         size_t save_minlen;
1833         size_t save_redzone;
1834         int fast_path = 1;
1835 
1836         save_base = *basep;
1837         save_len = *lenp;
1838         save_minlen = minlen;
1839         save_redzone = redzone;
1840 
1841         /*
1842          * For the first pass/fast_path, just add align and redzone into
1843          * minlen since if we get an allocation, we can guarantee that it
1844          * will fit the alignment and redzone requested.
1845          * This increases the chance that hibound will be adjusted to
1846          * a_lastgap->s_base which will likely allow us to find an
1847          * acceptable hole in the address space quicker.
1848          * If we can't find a hole with this fast_path, then we look for
1849          * smaller holes in which the alignment and offset may allow
1850          * the allocation to fit.
1851          */
1852         minlen += align;
1853         minlen += 2 * redzone;
1854         redzone = 0;
1855 
1856         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1857         if (AS_SEGFIRST(as) == NULL) {
1858                 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1859                     align, redzone, off)) {
1860                         AS_LOCK_EXIT(as, &as->a_lock);
1861                         return (0);
1862                 } else {
1863                         AS_LOCK_EXIT(as, &as->a_lock);
1864                         *basep = save_base;
1865                         *lenp = save_len;
1866                         return (-1);
1867                 }
1868         }
1869 
1870 retry:
1871         /*
1872          * Set up to iterate over all the inter-segment holes in the given
1873          * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1874          * NULL for the highest-addressed hole.  If moving backwards, we reset
1875          * sseg to denote the highest-addressed segment.
1876          */
1877         forward = (flags & AH_DIR) == AH_LO;
1878         if (forward) {
1879                 hseg = as_findseg(as, lobound, 1);
1880                 lseg = AS_SEGPREV(as, hseg);
1881         } else {
1882 
1883                 /*
1884                  * If allocating at least as much as the last allocation,
1885                  * use a_lastgap's base as a better estimate of hibound.
1886                  */
1887                 if (as->a_lastgap &&
1888                     minlen >= as->a_lastgap->s_size &&
1889                     hibound >= as->a_lastgap->s_base)
1890                         hibound = as->a_lastgap->s_base;
1891 
1892                 hseg = as_findseg(as, hibound, 1);
1893                 if (hseg->s_base + hseg->s_size < hibound) {
1894                         lseg = hseg;
1895                         hseg = NULL;
1896                 } else {
1897                         lseg = AS_SEGPREV(as, hseg);
1898                 }
1899         }
1900 
1901         for (;;) {
1902                 /*
1903                  * Set lo and hi to the hole's boundaries.  (We should really
1904                  * use MAXADDR in place of hibound in the expression below,
1905                  * but can't express it easily; using hibound in its place is
1906                  * harmless.)
1907                  */
1908                 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1909                 hi = (hseg == NULL) ? hibound : hseg->s_base;
1910                 /*
1911                  * If the iteration has moved past the interval from lobound
1912                  * to hibound it's pointless to continue.
1913                  */
1914                 if ((forward && lo > hibound) || (!forward && hi < lobound))
1915                         break;
1916                 else if (lo > hibound || hi < lobound)
1917                         goto cont;
1918                 /*
1919                  * Candidate hole lies at least partially within the allowable
1920                  * range.  Restrict it to fall completely within that range,
1921                  * i.e., to [max(lo, lobound), min(hi, hibound)].
1922                  */
1923                 if (lo < lobound)
1924                         lo = lobound;
1925                 if (hi > hibound)
1926                         hi = hibound;
1927                 /*
1928                  * Verify that the candidate hole is big enough and meets
1929                  * hardware constraints.  If the hole is too small, no need
1930                  * to do the further checks since they will fail.
1931                  */
1932                 *basep = lo;
1933                 *lenp = hi - lo;
1934                 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1935                     minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1936                     ((flags & AH_CONTAIN) == 0 ||
1937                     (*basep <= addr && *basep + *lenp > addr))) {
1938                         if (!forward)
1939                                 as->a_lastgap = hseg;
1940                         if (hseg != NULL)
1941                                 as->a_lastgaphl = hseg;
1942                         else
1943                                 as->a_lastgaphl = lseg;
1944                         AS_LOCK_EXIT(as, &as->a_lock);
1945                         return (0);
1946                 }
1947         cont:
1948                 /*
1949                  * Move to the next hole.
1950                  */
1951                 if (forward) {
1952                         lseg = hseg;
1953                         if (lseg == NULL)
1954                                 break;
1955                         hseg = AS_SEGNEXT(as, hseg);
1956                 } else {
1957                         hseg = lseg;
1958                         if (hseg == NULL)
1959                                 break;
1960                         lseg = AS_SEGPREV(as, lseg);
1961                 }
1962         }
1963         if (fast_path && (align != 0 || save_redzone != 0)) {
1964                 fast_path = 0;
1965                 minlen = save_minlen;
1966                 redzone = save_redzone;
1967                 goto retry;
1968         }
1969         *basep = save_base;
1970         *lenp = save_len;
1971         AS_LOCK_EXIT(as, &as->a_lock);
1972         return (-1);
1973 }
1974 
1975 /*
1976  * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1977  *
1978  * If flags specifies AH_HI, the hole will have the highest possible address
1979  * in the range.  We use the as->a_lastgap field to figure out where to
1980  * start looking for a gap.
1981  *
1982  * Otherwise, the gap will have the lowest possible address.
1983  *
1984  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1985  *
1986  * If an adequate hole is found, base and len are set to reflect the part of
1987  * the hole that is within range, and 0 is returned, otherwise,
1988  * -1 is returned.
1989  *
1990  * NOTE: This routine is not correct when base+len overflows caddr_t.
1991  */
1992 int
1993 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1994     caddr_t addr)
1995 {
1996 
1997         return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1998 }
1999 
2000 /*
2001  * Return the next range within [base, base + len) that is backed
2002  * with "real memory".  Skip holes and non-seg_vn segments.
2003  * We're lazy and only return one segment at a time.
2004  */
2005 int
2006 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2007 {
2008         extern const struct seg_ops segspt_shmops;      /* needs a header file */
2009         struct seg *seg;
2010         caddr_t addr, eaddr;
2011         caddr_t segend;
2012 
2013         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2014 
2015         addr = *basep;
2016         eaddr = addr + *lenp;
2017 
2018         seg = as_findseg(as, addr, 0);
2019         if (seg != NULL)
2020                 addr = MAX(seg->s_base, addr);
2021 
2022         for (;;) {
2023                 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2024                         AS_LOCK_EXIT(as, &as->a_lock);
2025                         return (EINVAL);
2026                 }
2027 
2028                 if (seg->s_ops == &segvn_ops) {
2029                         segend = seg->s_base + seg->s_size;
2030                         break;
2031                 }
2032 
2033                 /*
2034                  * We do ISM by looking into the private data
2035                  * to determine the real size of the segment.
2036                  */
2037                 if (seg->s_ops == &segspt_shmops) {
2038                         segend = seg->s_base + spt_realsize(seg);
2039                         if (addr < segend)
2040                                 break;
2041                 }
2042 
2043                 seg = AS_SEGNEXT(as, seg);
2044 
2045                 if (seg != NULL)
2046                         addr = seg->s_base;
2047         }
2048 
2049         *basep = addr;
2050 
2051         if (segend > eaddr)
2052                 *lenp = eaddr - addr;
2053         else
2054                 *lenp = segend - addr;
2055 
2056         AS_LOCK_EXIT(as, &as->a_lock);
2057         return (0);
2058 }
2059 
2060 /*
2061  * Determine whether data from the mappings in interval [addr, addr + size)
2062  * are in the primary memory (core) cache.
2063  */
2064 int
2065 as_incore(struct as *as, caddr_t addr,
2066     size_t size, char *vec, size_t *sizep)
2067 {
2068         struct seg *seg;
2069         size_t ssize;
2070         caddr_t raddr;          /* rounded down addr */
2071         size_t rsize;           /* rounded up size */
2072         size_t isize;                   /* iteration size */
2073         int error = 0;          /* result, assume success */
2074 
2075         *sizep = 0;
2076         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2077         rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2078             (size_t)raddr;
2079 
2080         if (raddr + rsize < raddr)           /* check for wraparound */
2081                 return (ENOMEM);
2082 
2083         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2084         seg = as_segat(as, raddr);
2085         if (seg == NULL) {
2086                 AS_LOCK_EXIT(as, &as->a_lock);
2087                 return (-1);
2088         }
2089 
2090         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2091                 if (raddr >= seg->s_base + seg->s_size) {
2092                         seg = AS_SEGNEXT(as, seg);
2093                         if (seg == NULL || raddr != seg->s_base) {
2094                                 error = -1;
2095                                 break;
2096                         }
2097                 }
2098                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2099                         ssize = seg->s_base + seg->s_size - raddr;
2100                 else
2101                         ssize = rsize;
2102                 *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2103                 if (isize != ssize) {
2104                         error = -1;
2105                         break;
2106                 }
2107                 vec += btopr(ssize);
2108         }
2109         AS_LOCK_EXIT(as, &as->a_lock);
2110         return (error);
2111 }
2112 
2113 static void
2114 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2115         ulong_t *bitmap, size_t position, size_t npages)
2116 {
2117         caddr_t range_start;
2118         size_t  pos1 = position;
2119         size_t  pos2;
2120         size_t  size;
2121         size_t  end_pos = npages + position;
2122 
2123         while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2124                 size = ptob((pos2 - pos1));
2125                 range_start = (caddr_t)((uintptr_t)addr +
2126                     ptob(pos1 - position));
2127 
2128                 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2129                     (ulong_t *)NULL, (size_t)NULL);
2130                 pos1 = pos2;
2131         }
2132 }
2133 
2134 static void
2135 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2136         caddr_t raddr, size_t rsize)
2137 {
2138         struct seg *seg = as_segat(as, raddr);
2139         size_t ssize;
2140 
2141         while (rsize != 0) {
2142                 if (raddr >= seg->s_base + seg->s_size)
2143                         seg = AS_SEGNEXT(as, seg);
2144 
2145                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2146                         ssize = seg->s_base + seg->s_size - raddr;
2147                 else
2148                         ssize = rsize;
2149 
2150                 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2151 
2152                 rsize -= ssize;
2153                 raddr += ssize;
2154         }
2155 }
2156 
2157 /*
2158  * Cache control operations over the interval [addr, addr + size) in
2159  * address space "as".
2160  */
2161 /*ARGSUSED*/
2162 int
2163 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2164     uintptr_t arg, ulong_t *lock_map, size_t pos)
2165 {
2166         struct seg *seg;        /* working segment */
2167         caddr_t raddr;          /* rounded down addr */
2168         caddr_t initraddr;      /* saved initial rounded down addr */
2169         size_t rsize;           /* rounded up size */
2170         size_t initrsize;       /* saved initial rounded up size */
2171         size_t ssize;           /* size of seg */
2172         int error = 0;                  /* result */
2173         size_t mlock_size;      /* size of bitmap */
2174         ulong_t *mlock_map;     /* pointer to bitmap used */
2175                                 /* to represent the locked */
2176                                 /* pages. */
2177 retry:
2178         if (error == IE_RETRY)
2179                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2180         else
2181                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2182 
2183         /*
2184          * If these are address space lock/unlock operations, loop over
2185          * all segments in the address space, as appropriate.
2186          */
2187         if (func == MC_LOCKAS) {
2188                 size_t npages, idx;
2189                 size_t rlen = 0;        /* rounded as length */
2190 
2191                 idx = pos;
2192 
2193                 if (arg & MCL_FUTURE) {
2194                         mutex_enter(&as->a_contents);
2195                         AS_SETPGLCK(as);
2196                         mutex_exit(&as->a_contents);
2197                 }
2198                 if ((arg & MCL_CURRENT) == 0) {
2199                         AS_LOCK_EXIT(as, &as->a_lock);
2200                         return (0);
2201                 }
2202 
2203                 seg = AS_SEGFIRST(as);
2204                 if (seg == NULL) {
2205                         AS_LOCK_EXIT(as, &as->a_lock);
2206                         return (0);
2207                 }
2208 
2209                 do {
2210                         raddr = (caddr_t)((uintptr_t)seg->s_base &
2211                             (uintptr_t)PAGEMASK);
2212                         rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2213                             PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2214                 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2215 
2216                 mlock_size = BT_BITOUL(btopr(rlen));
2217                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2218                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2219                                 AS_LOCK_EXIT(as, &as->a_lock);
2220                                 return (EAGAIN);
2221                 }
2222 
2223                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2224                         error = segop_lockop(seg, seg->s_base,
2225                             seg->s_size, attr, MC_LOCK, mlock_map, pos);
2226                         if (error != 0)
2227                                 break;
2228                         pos += seg_pages(seg);
2229                 }
2230 
2231                 if (error) {
2232                         for (seg = AS_SEGFIRST(as); seg != NULL;
2233                             seg = AS_SEGNEXT(as, seg)) {
2234 
2235                                 raddr = (caddr_t)((uintptr_t)seg->s_base &
2236                                     (uintptr_t)PAGEMASK);
2237                                 npages = seg_pages(seg);
2238                                 as_segunlock(seg, raddr, attr, mlock_map,
2239                                     idx, npages);
2240                                 idx += npages;
2241                         }
2242                 }
2243 
2244                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2245                 AS_LOCK_EXIT(as, &as->a_lock);
2246                 goto lockerr;
2247         } else if (func == MC_UNLOCKAS) {
2248                 mutex_enter(&as->a_contents);
2249                 AS_CLRPGLCK(as);
2250                 mutex_exit(&as->a_contents);
2251 
2252                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2253                         error = segop_lockop(seg, seg->s_base,
2254                             seg->s_size, attr, MC_UNLOCK, NULL, 0);
2255                         if (error != 0)
2256                                 break;
2257                 }
2258 
2259                 AS_LOCK_EXIT(as, &as->a_lock);
2260                 goto lockerr;
2261         }
2262 
2263         /*
2264          * Normalize addresses and sizes.
2265          */
2266         initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2267         initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2268             (size_t)raddr;
2269 
2270         if (raddr + rsize < raddr) {         /* check for wraparound */
2271                 AS_LOCK_EXIT(as, &as->a_lock);
2272                 return (ENOMEM);
2273         }
2274 
2275         /*
2276          * Get initial segment.
2277          */
2278         if ((seg = as_segat(as, raddr)) == NULL) {
2279                 AS_LOCK_EXIT(as, &as->a_lock);
2280                 return (ENOMEM);
2281         }
2282 
2283         if (func == MC_LOCK) {
2284                 mlock_size = BT_BITOUL(btopr(rsize));
2285                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2286                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2287                                 AS_LOCK_EXIT(as, &as->a_lock);
2288                                 return (EAGAIN);
2289                 }
2290         }
2291 
2292         /*
2293          * Loop over all segments.  If a hole in the address range is
2294          * discovered, then fail.  For each segment, perform the appropriate
2295          * control operation.
2296          */
2297         while (rsize != 0) {
2298 
2299                 /*
2300                  * Make sure there's no hole, calculate the portion
2301                  * of the next segment to be operated over.
2302                  */
2303                 if (raddr >= seg->s_base + seg->s_size) {
2304                         seg = AS_SEGNEXT(as, seg);
2305                         if (seg == NULL || raddr != seg->s_base) {
2306                                 if (func == MC_LOCK) {
2307                                         as_unlockerr(as, attr, mlock_map,
2308                                             initraddr, initrsize - rsize);
2309                                         kmem_free(mlock_map,
2310                                             mlock_size * sizeof (ulong_t));
2311                                 }
2312                                 AS_LOCK_EXIT(as, &as->a_lock);
2313                                 return (ENOMEM);
2314                         }
2315                 }
2316                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2317                         ssize = seg->s_base + seg->s_size - raddr;
2318                 else
2319                         ssize = rsize;
2320 
2321                 /*
2322                  * Dispatch on specific function.
2323                  */
2324                 switch (func) {
2325 
2326                 /*
2327                  * Synchronize cached data from mappings with backing
2328                  * objects.
2329                  */
2330                 case MC_SYNC:
2331                         if (error = segop_sync(seg, raddr, ssize,
2332                             attr, (uint_t)arg)) {
2333                                 AS_LOCK_EXIT(as, &as->a_lock);
2334                                 return (error);
2335                         }
2336                         break;
2337 
2338                 /*
2339                  * Lock pages in memory.
2340                  */
2341                 case MC_LOCK:
2342                         if (error = segop_lockop(seg, raddr, ssize,
2343                             attr, func, mlock_map, pos)) {
2344                                 as_unlockerr(as, attr, mlock_map, initraddr,
2345                                     initrsize - rsize + ssize);
2346                                 kmem_free(mlock_map, mlock_size *
2347                                     sizeof (ulong_t));
2348                                 AS_LOCK_EXIT(as, &as->a_lock);
2349                                 goto lockerr;
2350                         }
2351                         break;
2352 
2353                 /*
2354                  * Unlock mapped pages.
2355                  */
2356                 case MC_UNLOCK:
2357                         (void) segop_lockop(seg, raddr, ssize, attr, func,
2358                             (ulong_t *)NULL, (size_t)NULL);
2359                         break;
2360 
2361                 /*
2362                  * Store VM advise for mapped pages in segment layer.
2363                  */
2364                 case MC_ADVISE:
2365                         error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2366 
2367                         /*
2368                          * Check for regular errors and special retry error
2369                          */
2370                         if (error) {
2371                                 if (error == IE_RETRY) {
2372                                         /*
2373                                          * Need to acquire writers lock, so
2374                                          * have to drop readers lock and start
2375                                          * all over again
2376                                          */
2377                                         AS_LOCK_EXIT(as, &as->a_lock);
2378                                         goto retry;
2379                                 } else if (error == IE_REATTACH) {
2380                                         /*
2381                                          * Find segment for current address
2382                                          * because current segment just got
2383                                          * split or concatenated
2384                                          */
2385                                         seg = as_segat(as, raddr);
2386                                         if (seg == NULL) {
2387                                                 AS_LOCK_EXIT(as, &as->a_lock);
2388                                                 return (ENOMEM);
2389                                         }
2390                                 } else {
2391                                         /*
2392                                          * Regular error
2393                                          */
2394                                         AS_LOCK_EXIT(as, &as->a_lock);
2395                                         return (error);
2396                                 }
2397                         }
2398                         break;
2399 
2400                 case MC_INHERIT_ZERO:
2401                         error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2402                         if (error != 0) {
2403                                 AS_LOCK_EXIT(as, &as->a_lock);
2404                                 return (error);
2405                         }
2406                         break;
2407 
2408                 /*
2409                  * Can't happen.
2410                  */
2411                 default:
2412                         panic("as_ctl: bad operation %d", func);
2413                         /*NOTREACHED*/
2414                 }
2415 
2416                 rsize -= ssize;
2417                 raddr += ssize;
2418         }
2419 
2420         if (func == MC_LOCK)
2421                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2422         AS_LOCK_EXIT(as, &as->a_lock);
2423         return (0);
2424 lockerr:
2425 
2426         /*
2427          * If the lower levels returned EDEADLK for a segment lockop,
2428          * it means that we should retry the operation.  Let's wait
2429          * a bit also to let the deadlock causing condition clear.
2430          * This is part of a gross hack to work around a design flaw
2431          * in the ufs/sds logging code and should go away when the
2432          * logging code is re-designed to fix the problem. See bug
2433          * 4125102 for details of the problem.
2434          */
2435         if (error == EDEADLK) {
2436                 delay(deadlk_wait);
2437                 error = 0;
2438                 goto retry;
2439         }
2440         return (error);
2441 }
2442 
2443 int
2444 fc_decode(faultcode_t fault_err)
2445 {
2446         int error = 0;
2447 
2448         switch (FC_CODE(fault_err)) {
2449         case FC_OBJERR:
2450                 error = FC_ERRNO(fault_err);
2451                 break;
2452         case FC_PROT:
2453                 error = EACCES;
2454                 break;
2455         default:
2456                 error = EFAULT;
2457                 break;
2458         }
2459         return (error);
2460 }
2461 
2462 /*
2463  * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2464  * lists from each segment and copy them to one contiguous shadow list (plist)
2465  * as expected by the caller.  Save pointers to per segment shadow lists at
2466  * the tail of plist so that they can be used during as_pageunlock().
2467  */
2468 static int
2469 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2470     caddr_t addr, size_t size, enum seg_rw rw)
2471 {
2472         caddr_t sv_addr = addr;
2473         size_t sv_size = size;
2474         struct seg *sv_seg = seg;
2475         ulong_t segcnt = 1;
2476         ulong_t cnt;
2477         size_t ssize;
2478         pgcnt_t npages = btop(size);
2479         page_t **plist;
2480         page_t **pl;
2481         int error;
2482         caddr_t eaddr;
2483         faultcode_t fault_err = 0;
2484         pgcnt_t pl_off;
2485         extern const struct seg_ops segspt_shmops;
2486 
2487         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2488         ASSERT(seg != NULL);
2489         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2490         ASSERT(addr + size > seg->s_base + seg->s_size);
2491         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2492         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2493 
2494         /*
2495          * Count the number of segments covered by the range we are about to
2496          * lock. The segment count is used to size the shadow list we return
2497          * back to the caller.
2498          */
2499         for (; size != 0; size -= ssize, addr += ssize) {
2500                 if (addr >= seg->s_base + seg->s_size) {
2501 
2502                         seg = AS_SEGNEXT(as, seg);
2503                         if (seg == NULL || addr != seg->s_base) {
2504                                 AS_LOCK_EXIT(as, &as->a_lock);
2505                                 return (EFAULT);
2506                         }
2507                         /*
2508                          * Do a quick check if subsequent segments
2509                          * will most likely support pagelock.
2510                          */
2511                         if (seg->s_ops == &segvn_ops) {
2512                                 vnode_t *vp;
2513 
2514                                 if (segop_getvp(seg, addr, &vp) != 0 ||
2515                                     vp != NULL) {
2516                                         AS_LOCK_EXIT(as, &as->a_lock);
2517                                         goto slow;
2518                                 }
2519                         } else if (seg->s_ops != &segspt_shmops) {
2520                                 AS_LOCK_EXIT(as, &as->a_lock);
2521                                 goto slow;
2522                         }
2523                         segcnt++;
2524                 }
2525                 if (addr + size > seg->s_base + seg->s_size) {
2526                         ssize = seg->s_base + seg->s_size - addr;
2527                 } else {
2528                         ssize = size;
2529                 }
2530         }
2531         ASSERT(segcnt > 1);
2532 
2533         plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2534 
2535         addr = sv_addr;
2536         size = sv_size;
2537         seg = sv_seg;
2538 
2539         for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2540                 if (addr >= seg->s_base + seg->s_size) {
2541                         seg = AS_SEGNEXT(as, seg);
2542                         ASSERT(seg != NULL && addr == seg->s_base);
2543                         cnt++;
2544                         ASSERT(cnt < segcnt);
2545                 }
2546                 if (addr + size > seg->s_base + seg->s_size) {
2547                         ssize = seg->s_base + seg->s_size - addr;
2548                 } else {
2549                         ssize = size;
2550                 }
2551                 pl = &plist[npages + cnt];
2552                 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2553                     L_PAGELOCK, rw);
2554                 if (error) {
2555                         break;
2556                 }
2557                 ASSERT(plist[npages + cnt] != NULL);
2558                 ASSERT(pl_off + btop(ssize) <= npages);
2559                 bcopy(plist[npages + cnt], &plist[pl_off],
2560                     btop(ssize) * sizeof (page_t *));
2561                 pl_off += btop(ssize);
2562         }
2563 
2564         if (size == 0) {
2565                 AS_LOCK_EXIT(as, &as->a_lock);
2566                 ASSERT(cnt == segcnt - 1);
2567                 *ppp = plist;
2568                 return (0);
2569         }
2570 
2571         /*
2572          * one of pagelock calls failed. The error type is in error variable.
2573          * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2574          * type is either EFAULT or ENOTSUP. Otherwise just return the error
2575          * back to the caller.
2576          */
2577 
2578         eaddr = addr;
2579         seg = sv_seg;
2580 
2581         for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2582                 if (addr >= seg->s_base + seg->s_size) {
2583                         seg = AS_SEGNEXT(as, seg);
2584                         ASSERT(seg != NULL && addr == seg->s_base);
2585                         cnt++;
2586                         ASSERT(cnt < segcnt);
2587                 }
2588                 if (eaddr > seg->s_base + seg->s_size) {
2589                         ssize = seg->s_base + seg->s_size - addr;
2590                 } else {
2591                         ssize = eaddr - addr;
2592                 }
2593                 pl = &plist[npages + cnt];
2594                 ASSERT(*pl != NULL);
2595                 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2596                     L_PAGEUNLOCK, rw);
2597         }
2598 
2599         AS_LOCK_EXIT(as, &as->a_lock);
2600 
2601         kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2602 
2603         if (error != ENOTSUP && error != EFAULT) {
2604                 return (error);
2605         }
2606 
2607 slow:
2608         /*
2609          * If we are here because pagelock failed due to the need to cow fault
2610          * in the pages we want to lock F_SOFTLOCK will do this job and in
2611          * next as_pagelock() call for this address range pagelock will
2612          * hopefully succeed.
2613          */
2614         fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2615         if (fault_err != 0) {
2616                 return (fc_decode(fault_err));
2617         }
2618         *ppp = NULL;
2619 
2620         return (0);
2621 }
2622 
2623 /*
2624  * lock pages in a given address space. Return shadow list. If
2625  * the list is NULL, the MMU mapping is also locked.
2626  */
2627 int
2628 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2629     size_t size, enum seg_rw rw)
2630 {
2631         size_t rsize;
2632         caddr_t raddr;
2633         faultcode_t fault_err;
2634         struct seg *seg;
2635         int err;
2636 
2637         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2638             "as_pagelock_start: addr %p size %ld", addr, size);
2639 
2640         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2641         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2642             (size_t)raddr;
2643 
2644         /*
2645          * if the request crosses two segments let
2646          * as_fault handle it.
2647          */
2648         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2649 
2650         seg = as_segat(as, raddr);
2651         if (seg == NULL) {
2652                 AS_LOCK_EXIT(as, &as->a_lock);
2653                 return (EFAULT);
2654         }
2655         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2656         if (raddr + rsize > seg->s_base + seg->s_size) {
2657                 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2658         }
2659         if (raddr + rsize <= raddr) {
2660                 AS_LOCK_EXIT(as, &as->a_lock);
2661                 return (EFAULT);
2662         }
2663 
2664         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2665             "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2666 
2667         /*
2668          * try to lock pages and pass back shadow list
2669          */
2670         err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2671 
2672         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2673 
2674         AS_LOCK_EXIT(as, &as->a_lock);
2675 
2676         if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2677                 return (err);
2678         }
2679 
2680         /*
2681          * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2682          * to no pagelock support for this segment or pages need to be cow
2683          * faulted in. If fault is needed F_SOFTLOCK will do this job for
2684          * this as_pagelock() call and in the next as_pagelock() call for the
2685          * same address range pagelock call will hopefull succeed.
2686          */
2687         fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2688         if (fault_err != 0) {
2689                 return (fc_decode(fault_err));
2690         }
2691         *ppp = NULL;
2692 
2693         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2694         return (0);
2695 }
2696 
2697 /*
2698  * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
2699  * lists from the end of plist and call pageunlock interface for each segment.
2700  * Drop as lock and free plist.
2701  */
2702 static void
2703 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2704     struct page **plist, enum seg_rw rw)
2705 {
2706         ulong_t cnt;
2707         caddr_t eaddr = addr + size;
2708         pgcnt_t npages = btop(size);
2709         size_t ssize;
2710         page_t **pl;
2711 
2712         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2713         ASSERT(seg != NULL);
2714         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2715         ASSERT(addr + size > seg->s_base + seg->s_size);
2716         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2717         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2718         ASSERT(plist != NULL);
2719 
2720         for (cnt = 0; addr < eaddr; addr += ssize) {
2721                 if (addr >= seg->s_base + seg->s_size) {
2722                         seg = AS_SEGNEXT(as, seg);
2723                         ASSERT(seg != NULL && addr == seg->s_base);
2724                         cnt++;
2725                 }
2726                 if (eaddr > seg->s_base + seg->s_size) {
2727                         ssize = seg->s_base + seg->s_size - addr;
2728                 } else {
2729                         ssize = eaddr - addr;
2730                 }
2731                 pl = &plist[npages + cnt];
2732                 ASSERT(*pl != NULL);
2733                 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2734                     L_PAGEUNLOCK, rw);
2735         }
2736         ASSERT(cnt > 0);
2737         AS_LOCK_EXIT(as, &as->a_lock);
2738 
2739         cnt++;
2740         kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2741 }
2742 
2743 /*
2744  * unlock pages in a given address range
2745  */
2746 void
2747 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2748     enum seg_rw rw)
2749 {
2750         struct seg *seg;
2751         size_t rsize;
2752         caddr_t raddr;
2753 
2754         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2755             "as_pageunlock_start: addr %p size %ld", addr, size);
2756 
2757         /*
2758          * if the shadow list is NULL, as_pagelock was
2759          * falling back to as_fault
2760          */
2761         if (pp == NULL) {
2762                 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2763                 return;
2764         }
2765 
2766         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2767         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2768             (size_t)raddr;
2769 
2770         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2771         seg = as_segat(as, raddr);
2772         ASSERT(seg != NULL);
2773 
2774         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2775             "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2776 
2777         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2778         if (raddr + rsize <= seg->s_base + seg->s_size) {
2779                 segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2780         } else {
2781                 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2782                 return;
2783         }
2784         AS_LOCK_EXIT(as, &as->a_lock);
2785         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2786 }
2787 
2788 int
2789 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2790     boolean_t wait)
2791 {
2792         struct seg *seg;
2793         size_t ssize;
2794         caddr_t raddr;                  /* rounded down addr */
2795         size_t rsize;                   /* rounded up size */
2796         int error = 0;
2797         size_t pgsz = page_get_pagesize(szc);
2798 
2799 setpgsz_top:
2800         if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2801                 return (EINVAL);
2802         }
2803 
2804         raddr = addr;
2805         rsize = size;
2806 
2807         if (raddr + rsize < raddr)           /* check for wraparound */
2808                 return (ENOMEM);
2809 
2810         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2811         as_clearwatchprot(as, raddr, rsize);
2812         seg = as_segat(as, raddr);
2813         if (seg == NULL) {
2814                 as_setwatch(as);
2815                 AS_LOCK_EXIT(as, &as->a_lock);
2816                 return (ENOMEM);
2817         }
2818 
2819         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2820                 if (raddr >= seg->s_base + seg->s_size) {
2821                         seg = AS_SEGNEXT(as, seg);
2822                         if (seg == NULL || raddr != seg->s_base) {
2823                                 error = ENOMEM;
2824                                 break;
2825                         }
2826                 }
2827                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2828                         ssize = seg->s_base + seg->s_size - raddr;
2829                 } else {
2830                         ssize = rsize;
2831                 }
2832 
2833 retry:
2834                 error = segop_setpagesize(seg, raddr, ssize, szc);
2835 
2836                 if (error == IE_NOMEM) {
2837                         error = EAGAIN;
2838                         break;
2839                 }
2840 
2841                 if (error == IE_RETRY) {
2842                         AS_LOCK_EXIT(as, &as->a_lock);
2843                         goto setpgsz_top;
2844                 }
2845 
2846                 if (error == ENOTSUP) {
2847                         error = EINVAL;
2848                         break;
2849                 }
2850 
2851                 if (wait && (error == EAGAIN)) {
2852                         /*
2853                          * Memory is currently locked.  It must be unlocked
2854                          * before this operation can succeed through a retry.
2855                          * The possible reasons for locked memory and
2856                          * corresponding strategies for unlocking are:
2857                          * (1) Normal I/O
2858                          *      wait for a signal that the I/O operation
2859                          *      has completed and the memory is unlocked.
2860                          * (2) Asynchronous I/O
2861                          *      The aio subsystem does not unlock pages when
2862                          *      the I/O is completed. Those pages are unlocked
2863                          *      when the application calls aiowait/aioerror.
2864                          *      So, to prevent blocking forever, cv_broadcast()
2865                          *      is done to wake up aio_cleanup_thread.
2866                          *      Subsequently, segvn_reclaim will be called, and
2867                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
2868                          * (3) Long term page locking:
2869                          *      This is not relevant for as_setpagesize()
2870                          *      because we cannot change the page size for
2871                          *      driver memory. The attempt to do so will
2872                          *      fail with a different error than EAGAIN so
2873                          *      there's no need to trigger as callbacks like
2874                          *      as_unmap, as_setprot or as_free would do.
2875                          */
2876                         mutex_enter(&as->a_contents);
2877                         if (!AS_ISNOUNMAPWAIT(as)) {
2878                                 if (AS_ISUNMAPWAIT(as) == 0) {
2879                                         cv_broadcast(&as->a_cv);
2880                                 }
2881                                 AS_SETUNMAPWAIT(as);
2882                                 AS_LOCK_EXIT(as, &as->a_lock);
2883                                 while (AS_ISUNMAPWAIT(as)) {
2884                                         cv_wait(&as->a_cv, &as->a_contents);
2885                                 }
2886                         } else {
2887                                 /*
2888                                  * We may have raced with
2889                                  * segvn_reclaim()/segspt_reclaim(). In this
2890                                  * case clean nounmapwait flag and retry since
2891                                  * softlockcnt in this segment may be already
2892                                  * 0.  We don't drop as writer lock so our
2893                                  * number of retries without sleeping should
2894                                  * be very small. See segvn_reclaim() for
2895                                  * more comments.
2896                                  */
2897                                 AS_CLRNOUNMAPWAIT(as);
2898                                 mutex_exit(&as->a_contents);
2899                                 goto retry;
2900                         }
2901                         mutex_exit(&as->a_contents);
2902                         goto setpgsz_top;
2903                 } else if (error != 0) {
2904                         break;
2905                 }
2906         }
2907         as_setwatch(as);
2908         AS_LOCK_EXIT(as, &as->a_lock);
2909         return (error);
2910 }
2911 
2912 /*
2913  * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
2914  * in its chunk where s_szc is less than the szc we want to set.
2915  */
2916 static int
2917 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2918     int *retry)
2919 {
2920         struct seg *seg;
2921         size_t ssize;
2922         int error;
2923 
2924         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
2925 
2926         seg = as_segat(as, raddr);
2927         if (seg == NULL) {
2928                 panic("as_iset3_default_lpsize: no seg");
2929         }
2930 
2931         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2932                 if (raddr >= seg->s_base + seg->s_size) {
2933                         seg = AS_SEGNEXT(as, seg);
2934                         if (seg == NULL || raddr != seg->s_base) {
2935                                 panic("as_iset3_default_lpsize: as changed");
2936                         }
2937                 }
2938                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2939                         ssize = seg->s_base + seg->s_size - raddr;
2940                 } else {
2941                         ssize = rsize;
2942                 }
2943 
2944                 if (szc > seg->s_szc) {
2945                         error = segop_setpagesize(seg, raddr, ssize, szc);
2946                         /* Only retry on EINVAL segments that have no vnode. */
2947                         if (error == EINVAL) {
2948                                 vnode_t *vp = NULL;
2949                                 if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
2950                                     (segop_getvp(seg, raddr, &vp) != 0 ||
2951                                     vp == NULL)) {
2952                                         *retry = 1;
2953                                 } else {
2954                                         *retry = 0;
2955                                 }
2956                         }
2957                         if (error) {
2958                                 return (error);
2959                         }
2960                 }
2961         }
2962         return (0);
2963 }
2964 
2965 /*
2966  * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
2967  * pagesize on each segment in its range, but if any fails with EINVAL,
2968  * then it reduces the pagesizes to the next size in the bitmap and
2969  * retries as_iset3_default_lpsize(). The reason why the code retries
2970  * smaller allowed sizes on EINVAL is because (a) the anon offset may not
2971  * match the bigger sizes, and (b) it's hard to get this offset (to begin
2972  * with) to pass to map_pgszcvec().
2973  */
2974 static int
2975 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2976     uint_t szcvec)
2977 {
2978         int error;
2979         int retry;
2980 
2981         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
2982 
2983         for (;;) {
2984                 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
2985                 if (error == EINVAL && retry) {
2986                         szcvec &= ~(1 << szc);
2987                         if (szcvec <= 1) {
2988                                 return (EINVAL);
2989                         }
2990                         szc = highbit(szcvec) - 1;
2991                 } else {
2992                         return (error);
2993                 }
2994         }
2995 }
2996 
2997 /*
2998  * as_iset1_default_lpsize() breaks its chunk into areas where existing
2999  * segments have a smaller szc than we want to set. For each such area,
3000  * it calls as_iset2_default_lpsize()
3001  */
3002 static int
3003 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3004     uint_t szcvec)
3005 {
3006         struct seg *seg;
3007         size_t ssize;
3008         caddr_t setaddr = raddr;
3009         size_t setsize = 0;
3010         int set;
3011         int error;
3012 
3013         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3014 
3015         seg = as_segat(as, raddr);
3016         if (seg == NULL) {
3017                 panic("as_iset1_default_lpsize: no seg");
3018         }
3019         if (seg->s_szc < szc) {
3020                 set = 1;
3021         } else {
3022                 set = 0;
3023         }
3024 
3025         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3026                 if (raddr >= seg->s_base + seg->s_size) {
3027                         seg = AS_SEGNEXT(as, seg);
3028                         if (seg == NULL || raddr != seg->s_base) {
3029                                 panic("as_iset1_default_lpsize: as changed");
3030                         }
3031                         if (seg->s_szc >= szc && set) {
3032                                 ASSERT(setsize != 0);
3033                                 error = as_iset2_default_lpsize(as,
3034                                     setaddr, setsize, szc, szcvec);
3035                                 if (error) {
3036                                         return (error);
3037                                 }
3038                                 set = 0;
3039                         } else if (seg->s_szc < szc && !set) {
3040                                 setaddr = raddr;
3041                                 setsize = 0;
3042                                 set = 1;
3043                         }
3044                 }
3045                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3046                         ssize = seg->s_base + seg->s_size - raddr;
3047                 } else {
3048                         ssize = rsize;
3049                 }
3050         }
3051         error = 0;
3052         if (set) {
3053                 ASSERT(setsize != 0);
3054                 error = as_iset2_default_lpsize(as, setaddr, setsize,
3055                     szc, szcvec);
3056         }
3057         return (error);
3058 }
3059 
3060 /*
3061  * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3062  * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3063  * chunk to as_iset1_default_lpsize().
3064  */
3065 static int
3066 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3067     int type)
3068 {
3069         int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3070         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3071             flags, rtype, 1);
3072         uint_t szc;
3073         uint_t nszc;
3074         int error;
3075         caddr_t a;
3076         caddr_t eaddr;
3077         size_t segsize;
3078         size_t pgsz;
3079         uint_t save_szcvec;
3080 
3081         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3082         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3083         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3084 
3085         szcvec &= ~1;
3086         if (szcvec <= 1) {   /* skip if base page size */
3087                 return (0);
3088         }
3089 
3090         /* Get the pagesize of the first larger page size. */
3091         szc = lowbit(szcvec) - 1;
3092         pgsz = page_get_pagesize(szc);
3093         eaddr = addr + size;
3094         addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3095         eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3096 
3097         save_szcvec = szcvec;
3098         szcvec >>= (szc + 1);
3099         nszc = szc;
3100         while (szcvec) {
3101                 if ((szcvec & 0x1) == 0) {
3102                         nszc++;
3103                         szcvec >>= 1;
3104                         continue;
3105                 }
3106                 nszc++;
3107                 pgsz = page_get_pagesize(nszc);
3108                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3109                 if (a != addr) {
3110                         ASSERT(szc > 0);
3111                         ASSERT(a < eaddr);
3112                         segsize = a - addr;
3113                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3114                             save_szcvec);
3115                         if (error) {
3116                                 return (error);
3117                         }
3118                         addr = a;
3119                 }
3120                 szc = nszc;
3121                 szcvec >>= 1;
3122         }
3123 
3124         ASSERT(addr < eaddr);
3125         szcvec = save_szcvec;
3126         while (szcvec) {
3127                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3128                 ASSERT(a >= addr);
3129                 if (a != addr) {
3130                         ASSERT(szc > 0);
3131                         segsize = a - addr;
3132                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3133                             save_szcvec);
3134                         if (error) {
3135                                 return (error);
3136                         }
3137                         addr = a;
3138                 }
3139                 szcvec &= ~(1 << szc);
3140                 if (szcvec) {
3141                         szc = highbit(szcvec) - 1;
3142                         pgsz = page_get_pagesize(szc);
3143                 }
3144         }
3145         ASSERT(addr == eaddr);
3146 
3147         return (0);
3148 }
3149 
3150 /*
3151  * Set the default large page size for the range. Called via memcntl with
3152  * page size set to 0. as_set_default_lpsize breaks the range down into
3153  * chunks with the same type/flags, ignores-non segvn segments, and passes
3154  * each chunk to as_iset_default_lpsize().
3155  */
3156 int
3157 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3158 {
3159         struct seg *seg;
3160         caddr_t raddr;
3161         size_t rsize;
3162         size_t ssize;
3163         int rtype, rflags;
3164         int stype, sflags;
3165         int error;
3166         caddr_t setaddr;
3167         size_t setsize;
3168         int segvn;
3169 
3170         if (size == 0)
3171                 return (0);
3172 
3173         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3174 again:
3175         error = 0;
3176 
3177         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3178         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3179             (size_t)raddr;
3180 
3181         if (raddr + rsize < raddr) {         /* check for wraparound */
3182                 AS_LOCK_EXIT(as, &as->a_lock);
3183                 return (ENOMEM);
3184         }
3185         as_clearwatchprot(as, raddr, rsize);
3186         seg = as_segat(as, raddr);
3187         if (seg == NULL) {
3188                 as_setwatch(as);
3189                 AS_LOCK_EXIT(as, &as->a_lock);
3190                 return (ENOMEM);
3191         }
3192         if (seg->s_ops == &segvn_ops) {
3193                 rtype = segop_gettype(seg, addr);
3194                 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3195                 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3196                 segvn = 1;
3197         } else {
3198                 segvn = 0;
3199         }
3200         setaddr = raddr;
3201         setsize = 0;
3202 
3203         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3204                 if (raddr >= (seg->s_base + seg->s_size)) {
3205                         seg = AS_SEGNEXT(as, seg);
3206                         if (seg == NULL || raddr != seg->s_base) {
3207                                 error = ENOMEM;
3208                                 break;
3209                         }
3210                         if (seg->s_ops == &segvn_ops) {
3211                                 stype = segop_gettype(seg, raddr);
3212                                 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3213                                 stype &= (MAP_SHARED | MAP_PRIVATE);
3214                                 if (segvn && (rflags != sflags ||
3215                                     rtype != stype)) {
3216                                         /*
3217                                          * The next segment is also segvn but
3218                                          * has different flags and/or type.
3219                                          */
3220                                         ASSERT(setsize != 0);
3221                                         error = as_iset_default_lpsize(as,
3222                                             setaddr, setsize, rflags, rtype);
3223                                         if (error) {
3224                                                 break;
3225                                         }
3226                                         rflags = sflags;
3227                                         rtype = stype;
3228                                         setaddr = raddr;
3229                                         setsize = 0;
3230                                 } else if (!segvn) {
3231                                         rflags = sflags;
3232                                         rtype = stype;
3233                                         setaddr = raddr;
3234                                         setsize = 0;
3235                                         segvn = 1;
3236                                 }
3237                         } else if (segvn) {
3238                                 /* The next segment is not segvn. */
3239                                 ASSERT(setsize != 0);
3240                                 error = as_iset_default_lpsize(as,
3241                                     setaddr, setsize, rflags, rtype);
3242                                 if (error) {
3243                                         break;
3244                                 }
3245                                 segvn = 0;
3246                         }
3247                 }
3248                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3249                         ssize = seg->s_base + seg->s_size - raddr;
3250                 } else {
3251                         ssize = rsize;
3252                 }
3253         }
3254         if (error == 0 && segvn) {
3255                 /* The last chunk when rsize == 0. */
3256                 ASSERT(setsize != 0);
3257                 error = as_iset_default_lpsize(as, setaddr, setsize,
3258                     rflags, rtype);
3259         }
3260 
3261         if (error == IE_RETRY) {
3262                 goto again;
3263         } else if (error == IE_NOMEM) {
3264                 error = EAGAIN;
3265         } else if (error == ENOTSUP) {
3266                 error = EINVAL;
3267         } else if (error == EAGAIN) {
3268                 mutex_enter(&as->a_contents);
3269                 if (!AS_ISNOUNMAPWAIT(as)) {
3270                         if (AS_ISUNMAPWAIT(as) == 0) {
3271                                 cv_broadcast(&as->a_cv);
3272                         }
3273                         AS_SETUNMAPWAIT(as);
3274                         AS_LOCK_EXIT(as, &as->a_lock);
3275                         while (AS_ISUNMAPWAIT(as)) {
3276                                 cv_wait(&as->a_cv, &as->a_contents);
3277                         }
3278                         mutex_exit(&as->a_contents);
3279                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3280                 } else {
3281                         /*
3282                          * We may have raced with
3283                          * segvn_reclaim()/segspt_reclaim(). In this case
3284                          * clean nounmapwait flag and retry since softlockcnt
3285                          * in this segment may be already 0.  We don't drop as
3286                          * writer lock so our number of retries without
3287                          * sleeping should be very small. See segvn_reclaim()
3288                          * for more comments.
3289                          */
3290                         AS_CLRNOUNMAPWAIT(as);
3291                         mutex_exit(&as->a_contents);
3292                 }
3293                 goto again;
3294         }
3295 
3296         as_setwatch(as);
3297         AS_LOCK_EXIT(as, &as->a_lock);
3298         return (error);
3299 }
3300 
3301 /*
3302  * Setup all of the uninitialized watched pages that we can.
3303  */
3304 void
3305 as_setwatch(struct as *as)
3306 {
3307         struct watched_page *pwp;
3308         struct seg *seg;
3309         caddr_t vaddr;
3310         uint_t prot;
3311         int  err, retrycnt;
3312 
3313         if (avl_numnodes(&as->a_wpage) == 0)
3314                 return;
3315 
3316         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3317 
3318         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3319             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3320                 retrycnt = 0;
3321         retry:
3322                 vaddr = pwp->wp_vaddr;
3323                 if (pwp->wp_oprot != 0 ||    /* already set up */
3324                     (seg = as_segat(as, vaddr)) == NULL ||
3325                     segop_getprot(seg, vaddr, 0, &prot) != 0)
3326                         continue;
3327 
3328                 pwp->wp_oprot = prot;
3329                 if (pwp->wp_read)
3330                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3331                 if (pwp->wp_write)
3332                         prot &= ~PROT_WRITE;
3333                 if (pwp->wp_exec)
3334                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3335                 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3336                         err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3337                         if (err == IE_RETRY) {
3338                                 pwp->wp_oprot = 0;
3339                                 ASSERT(retrycnt == 0);
3340                                 retrycnt++;
3341                                 goto retry;
3342                         }
3343                 }
3344                 pwp->wp_prot = prot;
3345         }
3346 }
3347 
3348 /*
3349  * Clear all of the watched pages in the address space.
3350  */
3351 void
3352 as_clearwatch(struct as *as)
3353 {
3354         struct watched_page *pwp;
3355         struct seg *seg;
3356         caddr_t vaddr;
3357         uint_t prot;
3358         int err, retrycnt;
3359 
3360         if (avl_numnodes(&as->a_wpage) == 0)
3361                 return;
3362 
3363         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3364 
3365         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3366             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3367                 retrycnt = 0;
3368         retry:
3369                 vaddr = pwp->wp_vaddr;
3370                 if (pwp->wp_oprot == 0 ||    /* not set up */
3371                     (seg = as_segat(as, vaddr)) == NULL)
3372                         continue;
3373 
3374                 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3375                         err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3376                         if (err == IE_RETRY) {
3377                                 ASSERT(retrycnt == 0);
3378                                 retrycnt++;
3379                                 goto retry;
3380                         }
3381                 }
3382                 pwp->wp_oprot = 0;
3383                 pwp->wp_prot = 0;
3384         }
3385 }
3386 
3387 /*
3388  * Force a new setup for all the watched pages in the range.
3389  */
3390 static void
3391 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3392 {
3393         struct watched_page *pwp;
3394         struct watched_page tpw;
3395         caddr_t eaddr = addr + size;
3396         caddr_t vaddr;
3397         struct seg *seg;
3398         int err, retrycnt;
3399         uint_t  wprot;
3400         avl_index_t where;
3401 
3402         if (avl_numnodes(&as->a_wpage) == 0)
3403                 return;
3404 
3405         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3406 
3407         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3408         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3409                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3410 
3411         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3412                 retrycnt = 0;
3413                 vaddr = pwp->wp_vaddr;
3414 
3415                 wprot = prot;
3416                 if (pwp->wp_read)
3417                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3418                 if (pwp->wp_write)
3419                         wprot &= ~PROT_WRITE;
3420                 if (pwp->wp_exec)
3421                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3422                 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3423                 retry:
3424                         seg = as_segat(as, vaddr);
3425                         if (seg == NULL) {
3426                                 panic("as_setwatchprot: no seg");
3427                                 /*NOTREACHED*/
3428                         }
3429                         err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3430                         if (err == IE_RETRY) {
3431                                 ASSERT(retrycnt == 0);
3432                                 retrycnt++;
3433                                 goto retry;
3434                         }
3435                 }
3436                 pwp->wp_oprot = prot;
3437                 pwp->wp_prot = wprot;
3438 
3439                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3440         }
3441 }
3442 
3443 /*
3444  * Clear all of the watched pages in the range.
3445  */
3446 static void
3447 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3448 {
3449         caddr_t eaddr = addr + size;
3450         struct watched_page *pwp;
3451         struct watched_page tpw;
3452         uint_t prot;
3453         struct seg *seg;
3454         int err, retrycnt;
3455         avl_index_t where;
3456 
3457         if (avl_numnodes(&as->a_wpage) == 0)
3458                 return;
3459 
3460         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3461         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3462                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3463 
3464         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3465 
3466         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3467 
3468                 if ((prot = pwp->wp_oprot) != 0) {
3469                         retrycnt = 0;
3470 
3471                         if (prot != pwp->wp_prot) {
3472                         retry:
3473                                 seg = as_segat(as, pwp->wp_vaddr);
3474                                 if (seg == NULL)
3475                                         continue;
3476                                 err = segop_setprot(seg, pwp->wp_vaddr,
3477                                     PAGESIZE, prot);
3478                                 if (err == IE_RETRY) {
3479                                         ASSERT(retrycnt == 0);
3480                                         retrycnt++;
3481                                         goto retry;
3482 
3483                                 }
3484                         }
3485                         pwp->wp_oprot = 0;
3486                         pwp->wp_prot = 0;
3487                 }
3488 
3489                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3490         }
3491 }
3492 
3493 void
3494 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3495 {
3496         struct proc *p;
3497 
3498         mutex_enter(&pidlock);
3499         for (p = practive; p; p = p->p_next) {
3500                 if (p->p_as == as) {
3501                         mutex_enter(&p->p_lock);
3502                         if (p->p_as == as)
3503                                 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3504                         mutex_exit(&p->p_lock);
3505                 }
3506         }
3507         mutex_exit(&pidlock);
3508 }
3509 
3510 /*
3511  * return memory object ID
3512  */
3513 int
3514 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3515 {
3516         struct seg      *seg;
3517         int             sts;
3518 
3519         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3520         seg = as_segat(as, addr);
3521         if (seg == NULL) {
3522                 AS_LOCK_EXIT(as, &as->a_lock);
3523                 return (EFAULT);
3524         }
3525 
3526         sts = segop_getmemid(seg, addr, memidp);
3527 
3528         AS_LOCK_EXIT(as, &as->a_lock);
3529         return (sts);
3530 }