remove whole-process swapping
Long before Unix supported paging, it used whole-process swapping to
reclaim memory. The code is still there and, in theory, runs when the
system gets *extremely* low on memory. In practice it never runs,
because the definition of low-on-memory is antiquated. (XXX: define
what antiquated means)
You can check the number of swapin/swapout events with kstat(1M):
$ kstat -p ::vm:swapin ::vm:swapout
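
On a system where the swapper never kicks in, both counters stay at
zero. A hypothetical example of the output (the per-CPU vm kstat
instances will vary from machine to machine):

$ kstat -p ::vm:swapin ::vm:swapout
cpu:0:vm:swapin	0
cpu:0:vm:swapout	0
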
--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 32 * The Regents of the University of California
33 33 * All Rights Reserved
34 34 *
35 35 * University Acknowledgment- Portions of this document are derived from
36 36 * software developed by the University of California, Berkeley, and its
37 37 * contributors.
38 38 */
39 39
40 40 /*
41 41 * VM - address spaces.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/t_lock.h>
46 46 #include <sys/param.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/mman.h>
50 50 #include <sys/sysmacros.h>
51 51 #include <sys/cpuvar.h>
52 52 #include <sys/sysinfo.h>
53 53 #include <sys/kmem.h>
54 54 #include <sys/vnode.h>
55 55 #include <sys/vmsystm.h>
56 56 #include <sys/cmn_err.h>
57 57 #include <sys/debug.h>
58 58 #include <sys/tnf_probe.h>
59 59 #include <sys/vtrace.h>
60 60
61 61 #include <vm/hat.h>
62 62 #include <vm/as.h>
63 63 #include <vm/seg.h>
64 64 #include <vm/seg_vn.h>
65 65 #include <vm/seg_dev.h>
66 66 #include <vm/seg_kmem.h>
67 67 #include <vm/seg_map.h>
68 68 #include <vm/seg_spt.h>
69 69 #include <vm/page.h>
70 70
71 71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
72 72
73 73 static struct kmem_cache *as_cache;
74 74
75 75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
76 76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
77 77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
78 78
79 79
80 80 /*
81 81 * Verifying the segment lists is very time-consuming; it may not be
82 82 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
83 83 */
84 84 #ifdef DEBUG
85 85 #define VERIFY_SEGLIST
86 86 int do_as_verify = 0;
87 87 #endif
88 88
89 89 /*
90 90 * Allocate a new callback data structure entry and fill in the events of
91 91 * interest, the address range of interest, and the callback argument.
92 92 * Link the entry on the as->a_callbacks list. A callback entry for the
93 93 * entire address space may be specified with vaddr = 0 and size = -1.
94 94 *
95 95 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
96 96 * the specified as, the caller must guarantee persistence of the specified as
97 97 * for the duration of this function (e.g. pages being locked within the as
98 98 * will guarantee persistence).
99 99 */
100 100 int
101 101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
102 102 caddr_t vaddr, size_t size, int sleepflag)
103 103 {
104 104 struct as_callback *current_head, *cb;
105 105 caddr_t saddr;
106 106 size_t rsize;
107 107
108 108 /* callback function and an event are mandatory */
109 109 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
110 110 return (EINVAL);
111 111
112 112 /* Adding a callback after as_free has been called is not allowed */
113 113 if (as == &kas)
114 114 return (ENOMEM);
115 115
116 116 /*
117 117 * vaddr = 0 and size = -1 is used to indicate that the callback range
118 118 * is the entire address space so no rounding is done in that case.
119 119 */
120 120 if (size != -1) {
121 121 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
122 122 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
123 123 (size_t)saddr;
124 124 /* check for wraparound */
125 125 if (saddr + rsize < saddr)
126 126 return (ENOMEM);
127 127 } else {
128 128 if (vaddr != 0)
129 129 return (EINVAL);
130 130 saddr = vaddr;
131 131 rsize = size;
132 132 }
133 133
134 134 /* Allocate and initialize a callback entry */
135 135 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
136 136 if (cb == NULL)
137 137 return (EAGAIN);
138 138
139 139 cb->ascb_func = cb_func;
140 140 cb->ascb_arg = arg;
141 141 cb->ascb_events = events;
142 142 cb->ascb_saddr = saddr;
143 143 cb->ascb_len = rsize;
144 144
145 145 /* Add the entry to the list */
146 146 mutex_enter(&as->a_contents);
147 147 current_head = as->a_callbacks;
148 148 as->a_callbacks = cb;
149 149 cb->ascb_next = current_head;
150 150
151 151 /*
152 152 * The call to this function may lose in a race with
153 153 * a pertinent event - e.g. a thread does long-term memory locking
154 154 * but before the callback is added another thread executes as_unmap.
155 155 * A broadcast here resolves that.
156 156 */
157 157 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
158 158 AS_CLRUNMAPWAIT(as);
159 159 cv_broadcast(&as->a_cv);
160 160 }
161 161
162 162 mutex_exit(&as->a_contents);
163 163 return (0);
164 164 }
165 165
166 166 /*
167 167 * Search the callback list for an entry which pertains to arg.
168 168 *
169 169 * This is called from within the client upon completion of the callback.
170 170 * RETURN VALUES:
171 171 * AS_CALLBACK_DELETED (callback entry found and deleted)
172 172 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
173 173 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
174 174 * entry will be made in as_do_callbacks)
175 175 *
176 176 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
177 177 * set, it indicates that as_do_callbacks is processing this entry. The
178 178 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
179 179 * to unblock as_do_callbacks, in case it is blocked.
180 180 *
181 181 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
182 182 * the specified as, the caller must guarantee persistence of the specified as
183 183 * for the duration of this function (e.g. pages being locked within the as
184 184 * will guarantee persistence).
185 185 */
186 186 uint_t
187 187 as_delete_callback(struct as *as, void *arg)
188 188 {
189 189 struct as_callback **prevcb = &as->a_callbacks;
190 190 struct as_callback *cb;
191 191 uint_t rc = AS_CALLBACK_NOTFOUND;
192 192
193 193 mutex_enter(&as->a_contents);
194 194 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
195 195 if (cb->ascb_arg != arg)
196 196 continue;
197 197
198 198 /*
199 199 * If the events indicate AS_CALLBACK_CALLED, just clear
200 200 * AS_ALL_EVENT in the events field and wakeup the thread
201 201 * that may be waiting in as_do_callbacks. as_do_callbacks
202 202 * will take care of removing this entry from the list. In
203 203 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
204 204 * (AS_CALLBACK_CALLED not set), just remove it from the
205 205 * list, return the memory and return AS_CALLBACK_DELETED.
206 206 */
207 207 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
208 208 /* leave AS_CALLBACK_CALLED */
209 209 cb->ascb_events &= ~AS_ALL_EVENT;
210 210 rc = AS_CALLBACK_DELETE_DEFERRED;
211 211 cv_broadcast(&as->a_cv);
212 212 } else {
213 213 *prevcb = cb->ascb_next;
214 214 kmem_free(cb, sizeof (struct as_callback));
215 215 rc = AS_CALLBACK_DELETED;
216 216 }
217 217 break;
218 218 }
219 219 mutex_exit(&as->a_contents);
220 220 return (rc);
221 221 }
222 222
223 223 /*
224 224 * Searches the as callback list for a matching entry.
225 225 * Returns a pointer to the first matching callback, or NULL if
226 226 * nothing is found.
227 227 * This function never sleeps so it is ok to call it with more
228 228 * locks held than the (required) a_contents mutex.
229 229 *
230 230 * See also comment on as_do_callbacks below.
231 231 */
232 232 static struct as_callback *
233 233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
234 234 size_t event_len)
235 235 {
236 236 struct as_callback *cb;
237 237
238 238 ASSERT(MUTEX_HELD(&as->a_contents));
239 239 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
240 240 /*
241 241 * If the callback has not already been called, then
242 242 * check if events or address range pertains. An event_len
243 243 * of zero means do an unconditional callback.
244 244 */
245 245 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
246 246 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
247 247 (event_addr + event_len < cb->ascb_saddr) ||
248 248 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
249 249 continue;
250 250 }
251 251 break;
252 252 }
253 253 return (cb);
254 254 }
255 255
256 256 /*
257 257 * Executes a given callback and removes it from the callback list for
258 258 * this address space.
259 259 * This function may sleep so the caller must drop all locks except
260 260 * a_contents before calling this func.
261 261 *
262 262 * See also comments on as_do_callbacks below.
263 263 */
264 264 static void
265 265 as_execute_callback(struct as *as, struct as_callback *cb,
266 266 uint_t events)
267 267 {
268 268 struct as_callback **prevcb;
269 269 void *cb_arg;
270 270
271 271 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
272 272 cb->ascb_events |= AS_CALLBACK_CALLED;
273 273 mutex_exit(&as->a_contents);
274 274 (*cb->ascb_func)(as, cb->ascb_arg, events);
275 275 mutex_enter(&as->a_contents);
276 276 /*
277 277 * the callback function is required to delete the callback
278 278 * when the callback function determines it is OK for
279 279 * this thread to continue. as_delete_callback will clear
280 280 * the AS_ALL_EVENT in the events field when it is deleted.
281 281 * If the callback function called as_delete_callback,
282 282 * events will already be cleared and there will be no blocking.
283 283 */
284 284 while ((cb->ascb_events & events) != 0) {
285 285 cv_wait(&as->a_cv, &as->a_contents);
286 286 }
287 287 /*
288 288 * This entry needs to be taken off the list. Normally, the
289 289 * callback func itself does that, but unfortunately the list
290 290 * may have changed while the callback was running because the
291 291 * a_contents mutex was dropped and someone else other than the
292 292 * callback func itself could have called as_delete_callback,
293 293 * so we have to search to find this entry again. The entry
294 294 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
295 295 */
296 296 cb_arg = cb->ascb_arg;
297 297 prevcb = &as->a_callbacks;
298 298 for (cb = as->a_callbacks; cb != NULL;
299 299 prevcb = &cb->ascb_next, cb = *prevcb) {
300 300 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
301 301 (cb_arg != cb->ascb_arg)) {
302 302 continue;
303 303 }
304 304 *prevcb = cb->ascb_next;
305 305 kmem_free(cb, sizeof (struct as_callback));
306 306 break;
307 307 }
308 308 }
309 309
310 310 /*
311 311 * Check the callback list for a matching event and intersection of
312 312 * address range. If there is a match invoke the callback. Skip an entry if:
313 313 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
314 314 * - not an event of interest
315 315 * - not an address range of interest
316 316 *
317 317 * An event_len of zero indicates a request for an unconditional callback
318 318 * (regardless of event); only the AS_CALLBACK_CALLED flag is checked. The
319 319 * a_contents lock must be dropped before a callback, so only one callback
320 320 * can be done before returning. Return -1 (true) if a callback was
321 321 * executed and removed from the list, else return 0 (false).
322 322 *
323 323 * The logically separate parts, i.e. finding a matching callback and
324 324 * executing a given callback have been separated into two functions
325 325 * so that they can be called with different sets of locks held beyond
326 326 * the always-required a_contents. as_find_callback does not sleep so
327 327 * it is ok to call it if more locks than a_contents (i.e. the a_lock
328 328 * rwlock) are held. as_execute_callback on the other hand may sleep
329 329 * so all locks beyond a_contents must be dropped by the caller if one
330 330 * does not want to end up comatose.
331 331 */
332 332 static int
333 333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
334 334 size_t event_len)
335 335 {
336 336 struct as_callback *cb;
337 337
338 338 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
339 339 as_execute_callback(as, cb, events);
340 340 return (-1);
341 341 }
342 342 return (0);
343 343 }
344 344
345 345 /*
346 346 * Search for the segment containing addr. If a segment containing addr
347 347 * exists, that segment is returned. If no such segment exists, and
348 348 * the list spans addresses greater than addr, then the first segment
349 349 * whose base is greater than addr is returned; otherwise, NULL is
350 350 * returned unless tail is true, in which case the last element of the
351 351 * list is returned.
352 352 *
353 353 * a_seglast is used to cache the last found segment for repeated
354 354 * searches to the same addr (which happens frequently).
355 355 */
356 356 struct seg *
357 357 as_findseg(struct as *as, caddr_t addr, int tail)
358 358 {
359 359 struct seg *seg = as->a_seglast;
360 360 avl_index_t where;
361 361
362 362 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
363 363
364 364 if (seg != NULL &&
365 365 seg->s_base <= addr &&
366 366 addr < seg->s_base + seg->s_size)
367 367 return (seg);
368 368
369 369 seg = avl_find(&as->a_segtree, &addr, &where);
370 370 if (seg != NULL)
371 371 return (as->a_seglast = seg);
372 372
373 373 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
374 374 if (seg == NULL && tail)
375 375 seg = avl_last(&as->a_segtree);
376 376 return (as->a_seglast = seg);
377 377 }
378 378
379 379 #ifdef VERIFY_SEGLIST
380 380 /*
381 381 * verify that the linked list is coherent
382 382 */
383 383 static void
384 384 as_verify(struct as *as)
385 385 {
386 386 struct seg *seg, *seglast, *p, *n;
387 387 uint_t nsegs = 0;
388 388
389 389 if (do_as_verify == 0)
390 390 return;
391 391
392 392 seglast = as->a_seglast;
393 393
394 394 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
395 395 ASSERT(seg->s_as == as);
396 396 p = AS_SEGPREV(as, seg);
397 397 n = AS_SEGNEXT(as, seg);
398 398 ASSERT(p == NULL || p->s_as == as);
399 399 ASSERT(p == NULL || p->s_base < seg->s_base);
400 400 ASSERT(n == NULL || n->s_base > seg->s_base);
401 401 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
402 402 if (seg == seglast)
403 403 seglast = NULL;
404 404 nsegs++;
405 405 }
406 406 ASSERT(seglast == NULL);
407 407 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
408 408 }
409 409 #endif /* VERIFY_SEGLIST */
410 410
411 411 /*
412 412 * Add a new segment to the address space. The avl_find()
413 413 * may be expensive, so we attempt to use the last segment accessed
414 414 * in as_gap() as an insertion point.
415 415 */
416 416 int
417 417 as_addseg(struct as *as, struct seg *newseg)
418 418 {
419 419 struct seg *seg;
420 420 caddr_t addr;
421 421 caddr_t eaddr;
422 422 avl_index_t where;
423 423
424 424 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
425 425
426 426 as->a_updatedir = 1; /* inform /proc */
427 427 gethrestime(&as->a_updatetime);
428 428
429 429 if (as->a_lastgaphl != NULL) {
430 430 struct seg *hseg = NULL;
431 431 struct seg *lseg = NULL;
432 432
433 433 if (as->a_lastgaphl->s_base > newseg->s_base) {
434 434 hseg = as->a_lastgaphl;
435 435 lseg = AVL_PREV(&as->a_segtree, hseg);
436 436 } else {
437 437 lseg = as->a_lastgaphl;
438 438 hseg = AVL_NEXT(&as->a_segtree, lseg);
439 439 }
440 440
441 441 if (hseg && lseg && lseg->s_base < newseg->s_base &&
442 442 hseg->s_base > newseg->s_base) {
443 443 avl_insert_here(&as->a_segtree, newseg, lseg,
444 444 AVL_AFTER);
445 445 as->a_lastgaphl = NULL;
446 446 as->a_seglast = newseg;
447 447 return (0);
448 448 }
449 449 as->a_lastgaphl = NULL;
450 450 }
451 451
452 452 addr = newseg->s_base;
453 453 eaddr = addr + newseg->s_size;
454 454 again:
455 455
456 456 seg = avl_find(&as->a_segtree, &addr, &where);
457 457
458 458 if (seg == NULL)
459 459 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
460 460
461 461 if (seg == NULL)
462 462 seg = avl_last(&as->a_segtree);
463 463
464 464 if (seg != NULL) {
465 465 caddr_t base = seg->s_base;
466 466
467 467 /*
468 468 * If top of seg is below the requested address, then
469 469 * the insertion point is at the end of the linked list,
470 470 * and seg points to the tail of the list. Otherwise,
471 471 * the insertion point is immediately before seg.
472 472 */
473 473 if (base + seg->s_size > addr) {
474 474 if (addr >= base || eaddr > base) {
475 475 #ifdef __sparc
476 476 extern struct seg_ops segnf_ops;
477 477
478 478 /*
479 479 * no-fault segs must disappear if overlaid.
480 480 * XXX need new segment type so
481 481 * we don't have to check s_ops
482 482 */
483 483 if (seg->s_ops == &segnf_ops) {
484 484 seg_unmap(seg);
485 485 goto again;
486 486 }
487 487 #endif
488 488 return (-1); /* overlapping segment */
489 489 }
490 490 }
491 491 }
492 492 as->a_seglast = newseg;
493 493 avl_insert(&as->a_segtree, newseg, where);
494 494
495 495 #ifdef VERIFY_SEGLIST
496 496 as_verify(as);
497 497 #endif
498 498 return (0);
499 499 }
500 500
501 501 struct seg *
502 502 as_removeseg(struct as *as, struct seg *seg)
503 503 {
504 504 avl_tree_t *t;
505 505
506 506 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
507 507
508 508 as->a_updatedir = 1; /* inform /proc */
509 509 gethrestime(&as->a_updatetime);
510 510
511 511 if (seg == NULL)
512 512 return (NULL);
513 513
514 514 t = &as->a_segtree;
515 515 if (as->a_seglast == seg)
516 516 as->a_seglast = NULL;
517 517 as->a_lastgaphl = NULL;
518 518
519 519 /*
520 520 * if this segment is at an address higher than
521 521 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
522 522 */
523 523 if (as->a_lastgap &&
524 524 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
525 525 as->a_lastgap = AVL_NEXT(t, seg);
526 526
527 527 /*
528 528 * remove the segment from the seg tree
529 529 */
530 530 avl_remove(t, seg);
531 531
532 532 #ifdef VERIFY_SEGLIST
533 533 as_verify(as);
534 534 #endif
535 535 return (seg);
536 536 }
537 537
538 538 /*
539 539 * Find a segment containing addr.
540 540 */
541 541 struct seg *
542 542 as_segat(struct as *as, caddr_t addr)
543 543 {
544 544 struct seg *seg = as->a_seglast;
545 545
546 546 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
547 547
548 548 if (seg != NULL && seg->s_base <= addr &&
549 549 addr < seg->s_base + seg->s_size)
550 550 return (seg);
551 551
552 552 seg = avl_find(&as->a_segtree, &addr, NULL);
553 553 return (seg);
554 554 }
555 555
556 556 /*
557 557 * Serialize all searches for holes in an address space to
558 558 * prevent two or more threads from allocating the same virtual
559 559 * address range. The address space must not be "read/write"
560 560 * locked by the caller since we may block.
561 561 */
562 562 void
563 563 as_rangelock(struct as *as)
564 564 {
565 565 mutex_enter(&as->a_contents);
566 566 while (AS_ISCLAIMGAP(as))
567 567 cv_wait(&as->a_cv, &as->a_contents);
568 568 AS_SETCLAIMGAP(as);
569 569 mutex_exit(&as->a_contents);
570 570 }
571 571
572 572 /*
573 573 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
574 574 */
575 575 void
576 576 as_rangeunlock(struct as *as)
577 577 {
578 578 mutex_enter(&as->a_contents);
579 579 AS_CLRCLAIMGAP(as);
580 580 cv_signal(&as->a_cv);
581 581 mutex_exit(&as->a_contents);
582 582 }
583 583
584 584 /*
585 585 * compare segments (or just an address) by segment address range
586 586 */
587 587 static int
588 588 as_segcompar(const void *x, const void *y)
589 589 {
590 590 struct seg *a = (struct seg *)x;
591 591 struct seg *b = (struct seg *)y;
592 592
593 593 if (a->s_base < b->s_base)
594 594 return (-1);
595 595 if (a->s_base >= b->s_base + b->s_size)
596 596 return (1);
597 597 return (0);
598 598 }
599 599
600 600
601 601 void
602 602 as_avlinit(struct as *as)
603 603 {
604 604 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
605 605 offsetof(struct seg, s_tree));
606 606 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
607 607 offsetof(struct watched_page, wp_link));
608 608 }
609 609
610 610 /*ARGSUSED*/
611 611 static int
612 612 as_constructor(void *buf, void *cdrarg, int kmflags)
613 613 {
614 614 struct as *as = buf;
615 615
616 616 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
617 617 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
618 618 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
619 619 as_avlinit(as);
620 620 return (0);
621 621 }
622 622
623 623 /*ARGSUSED1*/
624 624 static void
625 625 as_destructor(void *buf, void *cdrarg)
626 626 {
627 627 struct as *as = buf;
628 628
629 629 avl_destroy(&as->a_segtree);
630 630 mutex_destroy(&as->a_contents);
631 631 cv_destroy(&as->a_cv);
632 632 rw_destroy(&as->a_lock);
633 633 }
634 634
635 635 void
636 636 as_init(void)
637 637 {
638 638 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
639 639 as_constructor, as_destructor, NULL, NULL, NULL, 0);
640 640 }
641 641
642 642 /*
643 643 * Allocate and initialize an address space data structure.
644 644 * We call hat_alloc to allow any machine dependent
645 645 * information in the hat structure to be initialized.
646 646 */
647 647 struct as *
648 648 as_alloc(void)
649 649 {
650 650 struct as *as;
651 651
652 652 as = kmem_cache_alloc(as_cache, KM_SLEEP);
653 653
654 654 as->a_flags = 0;
655 655 as->a_vbits = 0;
656 656 as->a_hrm = NULL;
657 657 as->a_seglast = NULL;
658 658 as->a_size = 0;
659 659 as->a_resvsize = 0;
660 660 as->a_updatedir = 0;
661 661 gethrestime(&as->a_updatetime);
662 662 as->a_objectdir = NULL;
663 663 as->a_sizedir = 0;
664 664 as->a_userlimit = (caddr_t)USERLIMIT;
665 665 as->a_lastgap = NULL;
666 666 as->a_lastgaphl = NULL;
667 667 as->a_callbacks = NULL;
668 668
669 669 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
670 670 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
671 671 AS_LOCK_EXIT(as, &as->a_lock);
672 672
673 673 return (as);
674 674 }
675 675
676 676 /*
677 677 * Free an address space data structure.
678 678 * Need to free the hat first and then
679 679 * all the segments on this as and finally
680 680 * the space for the as struct itself.
681 681 */
682 682 void
683 683 as_free(struct as *as)
684 684 {
685 685 struct hat *hat = as->a_hat;
686 686 struct seg *seg, *next;
687 687 boolean_t free_started = B_FALSE;
688 688
689 689 top:
690 690 /*
691 691 * Invoke ALL callbacks. as_do_callbacks will do one callback
692 692 * per call, and not return (-1) until the callback has completed.
693 693 * When as_do_callbacks returns zero, all callbacks have completed.
694 694 */
695 695 mutex_enter(&as->a_contents);
696 696 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
697 697 ;
698 698
699 699 mutex_exit(&as->a_contents);
700 700 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
701 701
702 702 if (!free_started) {
703 703 free_started = B_TRUE;
704 704 hat_free_start(hat);
705 705 }
706 706 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
707 707 int err;
708 708
709 709 next = AS_SEGNEXT(as, seg);
710 710 retry:
711 711 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
712 712 if (err == EAGAIN) {
713 713 mutex_enter(&as->a_contents);
714 714 if (as->a_callbacks) {
715 715 AS_LOCK_EXIT(as, &as->a_lock);
716 716 } else if (!AS_ISNOUNMAPWAIT(as)) {
717 717 /*
718 718 * Memory is currently locked. Wait for a
719 719 * cv_signal that it has been unlocked, then
720 720 * try the operation again.
721 721 */
722 722 if (AS_ISUNMAPWAIT(as) == 0)
723 723 cv_broadcast(&as->a_cv);
724 724 AS_SETUNMAPWAIT(as);
725 725 AS_LOCK_EXIT(as, &as->a_lock);
726 726 while (AS_ISUNMAPWAIT(as))
727 727 cv_wait(&as->a_cv, &as->a_contents);
728 728 } else {
729 729 /*
730 730 * We may have raced with
731 731 * segvn_reclaim()/segspt_reclaim(). In this
732 732 * case clean nounmapwait flag and retry since
733 733 * softlockcnt in this segment may be already
734 734 * 0. We don't drop as writer lock so our
735 735 * number of retries without sleeping should
736 736 * be very small. See segvn_reclaim() for
737 737 * more comments.
738 738 */
739 739 AS_CLRNOUNMAPWAIT(as);
740 740 mutex_exit(&as->a_contents);
741 741 goto retry;
742 742 }
743 743 mutex_exit(&as->a_contents);
744 744 goto top;
745 745 } else {
746 746 /*
747 747 * We do not expect any other error return at this
748 748 * time. This is similar to an ASSERT in seg_unmap()
749 749 */
750 750 ASSERT(err == 0);
751 751 }
752 752 }
753 753 hat_free_end(hat);
754 754 AS_LOCK_EXIT(as, &as->a_lock);
755 755
756 756 /* /proc stuff */
757 757 ASSERT(avl_numnodes(&as->a_wpage) == 0);
758 758 if (as->a_objectdir) {
759 759 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
760 760 as->a_objectdir = NULL;
761 761 as->a_sizedir = 0;
762 762 }
763 763
764 764 /*
765 765 * Free the struct as back to kmem. Assert it has no segments.
766 766 */
767 767 ASSERT(avl_numnodes(&as->a_segtree) == 0);
768 768 kmem_cache_free(as_cache, as);
769 769 }
770 770
771 771 int
772 772 as_dup(struct as *as, struct proc *forkedproc)
773 773 {
774 774 struct as *newas;
775 775 struct seg *seg, *newseg;
776 776 size_t purgesize = 0;
777 777 int error;
778 778
779 779 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
780 780 as_clearwatch(as);
781 781 newas = as_alloc();
782 782 newas->a_userlimit = as->a_userlimit;
783 783 newas->a_proc = forkedproc;
784 784
785 785 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
786 786
787 787 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
788 788
789 789 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
790 790
791 791 if (seg->s_flags & S_PURGE) {
792 792 purgesize += seg->s_size;
793 793 continue;
794 794 }
795 795
796 796 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
797 797 if (newseg == NULL) {
798 798 AS_LOCK_EXIT(newas, &newas->a_lock);
799 799 as_setwatch(as);
800 800 AS_LOCK_EXIT(as, &as->a_lock);
801 801 as_free(newas);
802 802 return (-1);
803 803 }
804 804 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
805 805 /*
806 806 * We call seg_free() on the new seg
807 807 * because the segment is not set up
808 808 * completely; i.e. it has no ops.
809 809 */
810 810 as_setwatch(as);
811 811 AS_LOCK_EXIT(as, &as->a_lock);
812 812 seg_free(newseg);
813 813 AS_LOCK_EXIT(newas, &newas->a_lock);
814 814 as_free(newas);
815 815 return (error);
816 816 }
817 817 newas->a_size += seg->s_size;
818 818 }
819 819 newas->a_resvsize = as->a_resvsize - purgesize;
820 820
821 821 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
822 822
823 823 AS_LOCK_EXIT(newas, &newas->a_lock);
824 824
825 825 as_setwatch(as);
826 826 AS_LOCK_EXIT(as, &as->a_lock);
827 827 if (error != 0) {
828 828 as_free(newas);
829 829 return (error);
830 830 }
831 831 forkedproc->p_as = newas;
832 832 return (0);
833 833 }
834 834
835 835 /*
836 836 * Handle a ``fault'' at addr for size bytes.
837 837 */
838 838 faultcode_t
839 839 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
840 840 enum fault_type type, enum seg_rw rw)
841 841 {
842 842 struct seg *seg;
843 843 caddr_t raddr; /* rounded down addr */
844 844 size_t rsize; /* rounded up size */
845 845 size_t ssize;
846 846 faultcode_t res = 0;
847 847 caddr_t addrsav;
848 848 struct seg *segsav;
849 849 int as_lock_held;
850 850 klwp_t *lwp = ttolwp(curthread);
851 851 int holding_wpage = 0;
852 852
853 853
854 854
855 855 retry:
856 856 /*
857 857 * Indicate that the lwp is not to be stopped while waiting for a
858 858 * pagefault. This is to avoid deadlock while debugging a process
859 859 * via /proc over NFS (in particular).
860 860 */
861 861 if (lwp != NULL)
862 862 lwp->lwp_nostop++;
863 863
864 864 /*
865 865 * same length must be used when we softlock and softunlock. We
866 866 * don't support softunlocking lengths less than the original length
867 867 * when there is largepage support. See seg_dev.c for more
868 868 * comments.
869 869 */
870 870 switch (type) {
871 871
872 872 case F_SOFTLOCK:
873 873 CPU_STATS_ADD_K(vm, softlock, 1);
874 874 break;
875 875
876 876 case F_SOFTUNLOCK:
877 877 break;
878 878
879 879 case F_PROT:
880 880 CPU_STATS_ADD_K(vm, prot_fault, 1);
881 881 break;
882 882
883 883 case F_INVAL:
884 884 CPU_STATS_ENTER_K();
885 885 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
886 886 if (as == &kas)
887 887 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
888 888 CPU_STATS_EXIT_K();
889 889 break;
890 890 }
891 891
892 892 /* Kernel probe */
893 893 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
894 894 tnf_opaque, address, addr,
895 895 tnf_fault_type, fault_type, type,
896 896 tnf_seg_access, access, rw);
897 897
898 898 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
899 899 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
900 900 (size_t)raddr;
901 901
902 902 /*
903 903 * XXX -- Don't grab the as lock for segkmap. We should grab it for
904 904 * correctness, but then we could be stuck holding this lock for
905 905 * a LONG time if the fault needs to be resolved on a slow
906 906 * filesystem, and then no-one will be able to exec new commands,
907 907 * as exec'ing requires the write lock on the as.
908 908 */
909 909 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
910 910 raddr + size < segkmap->s_base + segkmap->s_size) {
911 911 seg = segkmap;
912 912 as_lock_held = 0;
913 913 } else {
914 914 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
915 915
916 916 seg = as_segat(as, raddr);
917 917 if (seg == NULL) {
918 918 AS_LOCK_EXIT(as, &as->a_lock);
919 919 if (lwp != NULL)
920 920 lwp->lwp_nostop--;
921 921 return (FC_NOMAP);
922 922 }
923 923
924 924 as_lock_held = 1;
925 925 }
926 926
927 927 addrsav = raddr;
928 928 segsav = seg;
929 929
930 930 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
931 931 if (raddr >= seg->s_base + seg->s_size) {
932 932 seg = AS_SEGNEXT(as, seg);
933 933 if (seg == NULL || raddr != seg->s_base) {
934 934 res = FC_NOMAP;
935 935 break;
936 936 }
937 937 }
938 938 if (raddr + rsize > seg->s_base + seg->s_size)
939 939 ssize = seg->s_base + seg->s_size - raddr;
940 940 else
941 941 ssize = rsize;
942 942
943 943 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
944 944
945 945 /* Restore watchpoints */
946 946 if (holding_wpage) {
947 947 as_setwatch(as);
948 948 holding_wpage = 0;
949 949 }
950 950
951 951 if (res != 0)
952 952 break;
953 953 }
954 954
955 955 /*
956 956 * If we were SOFTLOCKing and encountered a failure,
957 957 * we must SOFTUNLOCK the range we already did. (Maybe we
958 958 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
959 959 * right here...)
960 960 */
961 961 if (res != 0 && type == F_SOFTLOCK) {
962 962 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
963 963 if (addrsav >= seg->s_base + seg->s_size)
964 964 seg = AS_SEGNEXT(as, seg);
965 965 ASSERT(seg != NULL);
966 966 /*
967 967 * Now call the fault routine again to perform the
968 968 * unlock using S_OTHER instead of the rw variable
969 969 * since we never got a chance to touch the pages.
970 970 */
971 971 if (raddr > seg->s_base + seg->s_size)
972 972 ssize = seg->s_base + seg->s_size - addrsav;
973 973 else
974 974 ssize = raddr - addrsav;
975 975 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
976 976 F_SOFTUNLOCK, S_OTHER);
977 977 }
978 978 }
979 979 if (as_lock_held)
980 980 AS_LOCK_EXIT(as, &as->a_lock);
981 981 if (lwp != NULL)
982 982 lwp->lwp_nostop--;
983 983
984 984 /*
985 985 * If the lower levels returned EDEADLK for a fault,
986 986 * it means that we should retry the fault. Let's also wait
987 987 * a bit to let the deadlock-causing condition clear.
988 988 * This is part of a gross hack to work around a design flaw
989 989 * in the ufs/sds logging code and should go away when the
990 990 * logging code is re-designed to fix the problem. See bug
991 991 * 4125102 for details of the problem.
992 992 */
993 993 if (FC_ERRNO(res) == EDEADLK) {
994 994 delay(deadlk_wait);
995 995 res = 0;
996 996 goto retry;
997 997 }
998 998 return (res);
999 999 }
1000 1000
1001 1001
1002 1002
1003 1003 /*
1004 1004 * Asynchronous ``fault'' at addr for size bytes.
1005 1005 */
1006 1006 faultcode_t
1007 1007 as_faulta(struct as *as, caddr_t addr, size_t size)
1008 1008 {
1009 1009 struct seg *seg;
1010 1010 caddr_t raddr; /* rounded down addr */
1011 1011 size_t rsize; /* rounded up size */
1012 1012 faultcode_t res = 0;
1013 1013 klwp_t *lwp = ttolwp(curthread);
1014 1014
1015 1015 retry:
1016 1016 /*
1017 1017 * Indicate that the lwp is not to be stopped while waiting
1018 1018 * for a pagefault. This is to avoid deadlock while debugging
1019 1019 * a process via /proc over NFS (in particular).
1020 1020 */
1021 1021 if (lwp != NULL)
1022 1022 lwp->lwp_nostop++;
1023 1023
1024 1024 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1025 1025 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1026 1026 (size_t)raddr;
1027 1027
1028 1028 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1029 1029 seg = as_segat(as, raddr);
1030 1030 if (seg == NULL) {
1031 1031 AS_LOCK_EXIT(as, &as->a_lock);
1032 1032 if (lwp != NULL)
1033 1033 lwp->lwp_nostop--;
1034 1034 return (FC_NOMAP);
1035 1035 }
1036 1036
1037 1037 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1038 1038 if (raddr >= seg->s_base + seg->s_size) {
1039 1039 seg = AS_SEGNEXT(as, seg);
1040 1040 if (seg == NULL || raddr != seg->s_base) {
1041 1041 res = FC_NOMAP;
1042 1042 break;
1043 1043 }
1044 1044 }
1045 1045 res = SEGOP_FAULTA(seg, raddr);
1046 1046 if (res != 0)
1047 1047 break;
1048 1048 }
1049 1049 AS_LOCK_EXIT(as, &as->a_lock);
1050 1050 if (lwp != NULL)
1051 1051 lwp->lwp_nostop--;
1052 1052 /*
1053 1053 * If the lower levels returned EDEADLK for a fault,
1054 1054 * it means that we should retry the fault. Let's also wait
1055 1055 * a bit to let the deadlock-causing condition clear.
1056 1056 * This is part of a gross hack to work around a design flaw
1057 1057 * in the ufs/sds logging code and should go away when the
1058 1058 * logging code is re-designed to fix the problem. See bug
1059 1059 * 4125102 for details of the problem.
1060 1060 */
1061 1061 if (FC_ERRNO(res) == EDEADLK) {
1062 1062 delay(deadlk_wait);
1063 1063 res = 0;
1064 1064 goto retry;
1065 1065 }
1066 1066 return (res);
1067 1067 }
1068 1068
1069 1069 /*
1070 1070 * Set the virtual mapping for the interval from [addr : addr + size)
1071 1071 * in address space `as' to have the specified protection.
1072 1072 * It is ok for the range to cross over several segments,
1073 1073 * as long as they are contiguous.
1074 1074 */
1075 1075 int
1076 1076 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1077 1077 {
1078 1078 struct seg *seg;
1079 1079 struct as_callback *cb;
1080 1080 size_t ssize;
1081 1081 caddr_t raddr; /* rounded down addr */
1082 1082 size_t rsize; /* rounded up size */
1083 1083 int error = 0, writer = 0;
1084 1084 caddr_t saveraddr;
1085 1085 size_t saversize;
1086 1086
1087 1087 setprot_top:
1088 1088 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1089 1089 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1090 1090 (size_t)raddr;
1091 1091
1092 1092 if (raddr + rsize < raddr) /* check for wraparound */
1093 1093 return (ENOMEM);
1094 1094
1095 1095 saveraddr = raddr;
1096 1096 saversize = rsize;
1097 1097
1098 1098 /*
1099 1099 * Normally we only lock the as as a reader. But
1100 1100 * if due to setprot the segment driver needs to split
1101 1101 * a segment it will return IE_RETRY. Therefore we re-acquire
1102 1102 * the as lock as a writer so the segment driver can change
1103 1103 * the seg list. Also the segment driver will return IE_RETRY
1104 1104 * after it has changed the segment list, so we keep
1105 1105 * locking as a writer. Since these operations should be rare,
1106 1106 * we want to only lock as a writer when necessary.
1107 1107 */
1108 1108 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1109 1109 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1110 1110 } else {
1111 1111 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1112 1112 }
1113 1113
1114 1114 as_clearwatchprot(as, raddr, rsize);
1115 1115 seg = as_segat(as, raddr);
1116 1116 if (seg == NULL) {
1117 1117 as_setwatch(as);
1118 1118 AS_LOCK_EXIT(as, &as->a_lock);
1119 1119 return (ENOMEM);
1120 1120 }
1121 1121
1122 1122 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1123 1123 if (raddr >= seg->s_base + seg->s_size) {
1124 1124 seg = AS_SEGNEXT(as, seg);
1125 1125 if (seg == NULL || raddr != seg->s_base) {
1126 1126 error = ENOMEM;
1127 1127 break;
1128 1128 }
1129 1129 }
1130 1130 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1131 1131 ssize = seg->s_base + seg->s_size - raddr;
1132 1132 else
1133 1133 ssize = rsize;
1134 1134 retry:
1135 1135 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1136 1136
1137 1137 if (error == IE_NOMEM) {
1138 1138 error = EAGAIN;
1139 1139 break;
1140 1140 }
1141 1141
1142 1142 if (error == IE_RETRY) {
1143 1143 AS_LOCK_EXIT(as, &as->a_lock);
1144 1144 writer = 1;
1145 1145 goto setprot_top;
1146 1146 }
1147 1147
1148 1148 if (error == EAGAIN) {
1149 1149 /*
1150 1150 * Make sure we have a_lock as writer.
1151 1151 */
1152 1152 if (writer == 0) {
1153 1153 AS_LOCK_EXIT(as, &as->a_lock);
1154 1154 writer = 1;
1155 1155 goto setprot_top;
1156 1156 }
1157 1157
1158 1158 /*
1159 1159 * Memory is currently locked. It must be unlocked
1160 1160 * before this operation can succeed through a retry.
1161 1161 * The possible reasons for locked memory and
1162 1162 * corresponding strategies for unlocking are:
1163 1163 * (1) Normal I/O
1164 1164 * wait for a signal that the I/O operation
1165 1165 * has completed and the memory is unlocked.
1166 1166 * (2) Asynchronous I/O
1167 1167 * The aio subsystem does not unlock pages when
1168 1168 * the I/O is completed. Those pages are unlocked
1169 1169 * when the application calls aiowait/aioerror.
1170 1170 * So, to prevent blocking forever, cv_broadcast()
1171 1171 * is done to wake up aio_cleanup_thread.
1172 1172 * Subsequently, segvn_reclaim will be called, and
1173 1173 * that will do AS_CLRUNMAPWAIT() and wake us up.
1174 1174 * (3) Long term page locking:
1175 1175 * Drivers intending to have pages locked for a
1176 1176 * period considerably longer than for normal I/O
1177 1177 * (essentially forever) may have registered for a
1178 1178 * callback so they may unlock these pages on
1179 1179 * request. This is needed to allow this operation
1180 1180 * to succeed. Each entry on the callback list is
1181 1181 * examined. If the event or address range pertains
1182 1182 * the callback is invoked (unless it already is in
1183 1183 * progress). The a_contents lock must be dropped
1184 1184 * before the callback, so only one callback can
1185 1185 * be done at a time. Go to the top and do more
1186 1186 * until zero is returned. If zero is returned,
1187 1187 * either there were no callbacks for this event
1188 1188 * or they were already in progress.
1189 1189 */
1190 1190 mutex_enter(&as->a_contents);
1191 1191 if (as->a_callbacks &&
1192 1192 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1193 1193 seg->s_base, seg->s_size))) {
1194 1194 AS_LOCK_EXIT(as, &as->a_lock);
1195 1195 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1196 1196 } else if (!AS_ISNOUNMAPWAIT(as)) {
1197 1197 if (AS_ISUNMAPWAIT(as) == 0)
1198 1198 cv_broadcast(&as->a_cv);
1199 1199 AS_SETUNMAPWAIT(as);
1200 1200 AS_LOCK_EXIT(as, &as->a_lock);
1201 1201 while (AS_ISUNMAPWAIT(as))
1202 1202 cv_wait(&as->a_cv, &as->a_contents);
1203 1203 } else {
1204 1204 /*
1205 1205 * We may have raced with
1206 1206 * segvn_reclaim()/segspt_reclaim(). In this
1207 1207 * case clean nounmapwait flag and retry since
1208 1208 * softlockcnt in this segment may be already
1209 1209 * 0. We don't drop as writer lock so our
1210 1210 * number of retries without sleeping should
1211 1211 * be very small. See segvn_reclaim() for
1212 1212 * more comments.
1213 1213 */
1214 1214 AS_CLRNOUNMAPWAIT(as);
1215 1215 mutex_exit(&as->a_contents);
1216 1216 goto retry;
1217 1217 }
1218 1218 mutex_exit(&as->a_contents);
1219 1219 goto setprot_top;
1220 1220 } else if (error != 0)
1221 1221 break;
1222 1222 }
1223 1223 if (error != 0) {
1224 1224 as_setwatch(as);
1225 1225 } else {
1226 1226 as_setwatchprot(as, saveraddr, saversize, prot);
1227 1227 }
1228 1228 AS_LOCK_EXIT(as, &as->a_lock);
1229 1229 return (error);
1230 1230 }
1231 1231
1232 1232 /*
1233 1233 * Check to make sure that the interval [addr, addr + size)
1234 1234 * in address space `as' has at least the specified protection.
1235 1235 * It is ok for the range to cross over several segments, as long
1236 1236 * as they are contiguous.
1237 1237 */
1238 1238 int
1239 1239 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1240 1240 {
1241 1241 struct seg *seg;
1242 1242 size_t ssize;
1243 1243 caddr_t raddr; /* rounded down addr */
1244 1244 size_t rsize; /* rounded up size */
1245 1245 int error = 0;
1246 1246
1247 1247 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1248 1248 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1249 1249 (size_t)raddr;
1250 1250
1251 1251 if (raddr + rsize < raddr) /* check for wraparound */
1252 1252 return (ENOMEM);
1253 1253
1254 1254 /*
1255 1255 * This is ugly as sin...
1256 1256 * Normally, we only acquire the address space readers lock.
1257 1257 * However, if the address space has watchpoints present,
1258 1258 * we must acquire the writer lock on the address space for
1259 1259 * the benefit of as_clearwatchprot() and as_setwatchprot().
1260 1260 */
1261 1261 if (avl_numnodes(&as->a_wpage) != 0)
1262 1262 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1263 1263 else
1264 1264 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1265 1265 as_clearwatchprot(as, raddr, rsize);
1266 1266 seg = as_segat(as, raddr);
1267 1267 if (seg == NULL) {
1268 1268 as_setwatch(as);
1269 1269 AS_LOCK_EXIT(as, &as->a_lock);
1270 1270 return (ENOMEM);
1271 1271 }
1272 1272
1273 1273 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1274 1274 if (raddr >= seg->s_base + seg->s_size) {
1275 1275 seg = AS_SEGNEXT(as, seg);
1276 1276 if (seg == NULL || raddr != seg->s_base) {
1277 1277 error = ENOMEM;
1278 1278 break;
1279 1279 }
1280 1280 }
1281 1281 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1282 1282 ssize = seg->s_base + seg->s_size - raddr;
1283 1283 else
1284 1284 ssize = rsize;
1285 1285
1286 1286 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1287 1287 if (error != 0)
1288 1288 break;
1289 1289 }
1290 1290 as_setwatch(as);
1291 1291 AS_LOCK_EXIT(as, &as->a_lock);
1292 1292 return (error);
1293 1293 }
1294 1294
1295 1295 int
1296 1296 as_unmap(struct as *as, caddr_t addr, size_t size)
1297 1297 {
1298 1298 struct seg *seg, *seg_next;
1299 1299 struct as_callback *cb;
1300 1300 caddr_t raddr, eaddr;
1301 1301 size_t ssize, rsize = 0;
1302 1302 int err;
1303 1303
1304 1304 top:
1305 1305 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1306 1306 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1307 1307 (uintptr_t)PAGEMASK);
1308 1308
1309 1309 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1310 1310
1311 1311 as->a_updatedir = 1; /* inform /proc */
1312 1312 gethrestime(&as->a_updatetime);
1313 1313
1314 1314 /*
1315 1315 * Use as_findseg to find the first segment in the range, then
1316 1316 * step through the segments in order, following s_next.
1317 1317 */
1318 1318 as_clearwatchprot(as, raddr, eaddr - raddr);
1319 1319
1320 1320 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1321 1321 if (eaddr <= seg->s_base)
1322 1322 break; /* eaddr was in a gap; all done */
1323 1323
1324 1324 /* this is implied by the test above */
1325 1325 ASSERT(raddr < eaddr);
1326 1326
1327 1327 if (raddr < seg->s_base)
1328 1328 raddr = seg->s_base; /* raddr was in a gap */
1329 1329
1330 1330 if (eaddr > (seg->s_base + seg->s_size))
1331 1331 ssize = seg->s_base + seg->s_size - raddr;
1332 1332 else
1333 1333 ssize = eaddr - raddr;
1334 1334
1335 1335 /*
1336 1336 * Save next segment pointer since seg can be
1337 1337 * destroyed during the segment unmap operation.
1338 1338 */
1339 1339 seg_next = AS_SEGNEXT(as, seg);
1340 1340
1341 1341 /*
1342 1342 * We didn't count /dev/null mappings, so ignore them here.
1343 1343 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1344 1344 * we have to do this check here while we have seg.)
1345 1345 */
1346 1346 rsize = 0;
1347 1347 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1348 1348 !SEG_IS_PARTIAL_RESV(seg))
1349 1349 rsize = ssize;
1350 1350
1351 1351 retry:
1352 1352 err = SEGOP_UNMAP(seg, raddr, ssize);
1353 1353 if (err == EAGAIN) {
1354 1354 /*
1355 1355 * Memory is currently locked. It must be unlocked
1356 1356 * before this operation can succeed through a retry.
1357 1357 * The possible reasons for locked memory and
1358 1358 * corresponding strategies for unlocking are:
1359 1359 * (1) Normal I/O
1360 1360 * wait for a signal that the I/O operation
1361 1361 * has completed and the memory is unlocked.
1362 1362 * (2) Asynchronous I/O
1363 1363 * The aio subsystem does not unlock pages when
1364 1364 * the I/O is completed. Those pages are unlocked
1365 1365 * when the application calls aiowait/aioerror.
1366 1366 * So, to prevent blocking forever, cv_broadcast()
1367 1367 * is done to wake up aio_cleanup_thread.
1368 1368 * Subsequently, segvn_reclaim will be called, and
1369 1369 * that will do AS_CLRUNMAPWAIT() and wake us up.
1370 1370 * (3) Long term page locking:
1371 1371 * Drivers intending to have pages locked for a
1372 1372 * period considerably longer than for normal I/O
1373 1373 * (essentially forever) may have registered for a
1374 1374 * callback so they may unlock these pages on
1375 1375 * request. This is needed to allow this operation
1376 1376 * to succeed. Each entry on the callback list is
1377 1377 * examined. If the event or address range pertains
1378 1378 * the callback is invoked (unless it already is in
1379 1379 * progress). The a_contents lock must be dropped
1380 1380 * before the callback, so only one callback can
1381 1381 * be done at a time. Go to the top and do more
1382 1382 * until zero is returned. If zero is returned,
1383 1383 * either there were no callbacks for this event
1384 1384 * or they were already in progress.
1385 1385 */
1386 1386 mutex_enter(&as->a_contents);
1387 1387 if (as->a_callbacks &&
1388 1388 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1389 1389 seg->s_base, seg->s_size))) {
1390 1390 AS_LOCK_EXIT(as, &as->a_lock);
1391 1391 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1392 1392 } else if (!AS_ISNOUNMAPWAIT(as)) {
1393 1393 if (AS_ISUNMAPWAIT(as) == 0)
1394 1394 cv_broadcast(&as->a_cv);
1395 1395 AS_SETUNMAPWAIT(as);
1396 1396 AS_LOCK_EXIT(as, &as->a_lock);
1397 1397 while (AS_ISUNMAPWAIT(as))
1398 1398 cv_wait(&as->a_cv, &as->a_contents);
1399 1399 } else {
1400 1400 /*
1401 1401 * We may have raced with
1402 1402 * segvn_reclaim()/segspt_reclaim(). In this
1403 1403 * case clean nounmapwait flag and retry since
1404 1404 * softlockcnt in this segment may be already
1405 1405 * 0. We don't drop as writer lock so our
1406 1406 * number of retries without sleeping should
1407 1407 * be very small. See segvn_reclaim() for
1408 1408 * more comments.
1409 1409 */
1410 1410 AS_CLRNOUNMAPWAIT(as);
1411 1411 mutex_exit(&as->a_contents);
1412 1412 goto retry;
1413 1413 }
1414 1414 mutex_exit(&as->a_contents);
1415 1415 goto top;
1416 1416 } else if (err == IE_RETRY) {
1417 1417 AS_LOCK_EXIT(as, &as->a_lock);
1418 1418 goto top;
1419 1419 } else if (err) {
1420 1420 as_setwatch(as);
1421 1421 AS_LOCK_EXIT(as, &as->a_lock);
1422 1422 return (-1);
1423 1423 }
1424 1424
1425 1425 as->a_size -= ssize;
1426 1426 if (rsize)
1427 1427 as->a_resvsize -= rsize;
1428 1428 raddr += ssize;
1429 1429 }
1430 1430 AS_LOCK_EXIT(as, &as->a_lock);
1431 1431 return (0);
1432 1432 }
1433 1433
1434 1434 static int
1435 1435 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1436 1436 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1437 1437 {
1438 1438 uint_t szc;
1439 1439 uint_t nszc;
1440 1440 int error;
1441 1441 caddr_t a;
1442 1442 caddr_t eaddr;
1443 1443 size_t segsize;
1444 1444 struct seg *seg;
1445 1445 size_t pgsz;
1446 1446 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1447 1447 uint_t save_szcvec;
1448 1448
1449 1449 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1450 1450 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1451 1451 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1452 1452 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1453 1453 if (!do_off) {
1454 1454 vn_a->offset = 0;
1455 1455 }
1456 1456
1457 1457 if (szcvec <= 1) {
1458 1458 seg = seg_alloc(as, addr, size);
1459 1459 if (seg == NULL) {
1460 1460 return (ENOMEM);
1461 1461 }
1462 1462 vn_a->szc = 0;
1463 1463 error = (*crfp)(seg, vn_a);
1464 1464 if (error != 0) {
1465 1465 seg_free(seg);
1466 1466 } else {
1467 1467 as->a_size += size;
1468 1468 as->a_resvsize += size;
1469 1469 }
1470 1470 return (error);
1471 1471 }
1472 1472
1473 1473 eaddr = addr + size;
1474 1474 save_szcvec = szcvec;
1475 1475 szcvec >>= 1;
1476 1476 szc = 0;
1477 1477 nszc = 0;
1478 1478 while (szcvec) {
1479 1479 if ((szcvec & 0x1) == 0) {
1480 1480 nszc++;
1481 1481 szcvec >>= 1;
1482 1482 continue;
1483 1483 }
1484 1484 nszc++;
1485 1485 pgsz = page_get_pagesize(nszc);
1486 1486 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1487 1487 if (a != addr) {
1488 1488 ASSERT(a < eaddr);
1489 1489 segsize = a - addr;
1490 1490 seg = seg_alloc(as, addr, segsize);
1491 1491 if (seg == NULL) {
1492 1492 return (ENOMEM);
1493 1493 }
1494 1494 vn_a->szc = szc;
1495 1495 error = (*crfp)(seg, vn_a);
1496 1496 if (error != 0) {
1497 1497 seg_free(seg);
1498 1498 return (error);
1499 1499 }
1500 1500 as->a_size += segsize;
1501 1501 as->a_resvsize += segsize;
1502 1502 *segcreated = 1;
1503 1503 if (do_off) {
1504 1504 vn_a->offset += segsize;
1505 1505 }
1506 1506 addr = a;
1507 1507 }
1508 1508 szc = nszc;
1509 1509 szcvec >>= 1;
1510 1510 }
1511 1511
1512 1512 ASSERT(addr < eaddr);
1513 1513 szcvec = save_szcvec | 1; /* add 8K pages */
1514 1514 while (szcvec) {
1515 1515 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1516 1516 ASSERT(a >= addr);
1517 1517 if (a != addr) {
1518 1518 segsize = a - addr;
1519 1519 seg = seg_alloc(as, addr, segsize);
1520 1520 if (seg == NULL) {
1521 1521 return (ENOMEM);
1522 1522 }
1523 1523 vn_a->szc = szc;
1524 1524 error = (*crfp)(seg, vn_a);
1525 1525 if (error != 0) {
1526 1526 seg_free(seg);
1527 1527 return (error);
1528 1528 }
1529 1529 as->a_size += segsize;
1530 1530 as->a_resvsize += segsize;
1531 1531 *segcreated = 1;
1532 1532 if (do_off) {
1533 1533 vn_a->offset += segsize;
1534 1534 }
1535 1535 addr = a;
1536 1536 }
1537 1537 szcvec &= ~(1 << szc);
1538 1538 if (szcvec) {
1539 1539 szc = highbit(szcvec) - 1;
1540 1540 pgsz = page_get_pagesize(szc);
1541 1541 }
1542 1542 }
1543 1543 ASSERT(addr == eaddr);
1544 1544
1545 1545 return (0);
1546 1546 }
1547 1547
1548 1548 static int
1549 1549 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1550 1550 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1551 1551 {
1552 1552 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1553 1553 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1554 1554 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1555 1555 type, 0);
1556 1556 int error;
1557 1557 struct seg *seg;
1558 1558 struct vattr va;
1559 1559 u_offset_t eoff;
1560 1560 size_t save_size = 0;
1561 1561 extern size_t textrepl_size_thresh;
1562 1562
1563 1563 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1564 1564 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1565 1565 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1566 1566 ASSERT(vn_a->vp != NULL);
1567 1567 ASSERT(vn_a->amp == NULL);
1568 1568
1569 1569 again:
1570 1570 if (szcvec <= 1) {
1571 1571 seg = seg_alloc(as, addr, size);
1572 1572 if (seg == NULL) {
1573 1573 return (ENOMEM);
1574 1574 }
1575 1575 vn_a->szc = 0;
1576 1576 error = (*crfp)(seg, vn_a);
1577 1577 if (error != 0) {
1578 1578 seg_free(seg);
1579 1579 } else {
1580 1580 as->a_size += size;
1581 1581 as->a_resvsize += size;
1582 1582 }
1583 1583 return (error);
1584 1584 }
1585 1585
1586 1586 va.va_mask = AT_SIZE;
1587 1587 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1588 1588 szcvec = 0;
1589 1589 goto again;
1590 1590 }
1591 1591 eoff = vn_a->offset & PAGEMASK;
1592 1592 if (eoff >= va.va_size) {
1593 1593 szcvec = 0;
1594 1594 goto again;
1595 1595 }
1596 1596 eoff += size;
1597 1597 if (btopr(va.va_size) < btopr(eoff)) {
1598 1598 save_size = size;
1599 1599 size = va.va_size - (vn_a->offset & PAGEMASK);
1600 1600 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1601 1601 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1602 1602 type, 0);
1603 1603 if (szcvec <= 1) {
1604 1604 size = save_size;
1605 1605 goto again;
1606 1606 }
1607 1607 }
1608 1608
1609 1609 if (size > textrepl_size_thresh) {
1610 1610 vn_a->flags |= _MAP_TEXTREPL;
1611 1611 }
1612 1612 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1613 1613 segcreated);
1614 1614 if (error != 0) {
1615 1615 return (error);
1616 1616 }
1617 1617 if (save_size) {
1618 1618 addr += size;
1619 1619 size = save_size - size;
1620 1620 szcvec = 0;
1621 1621 goto again;
1622 1622 }
1623 1623 return (0);
1624 1624 }
1625 1625
1626 1626 /*
1627 1627 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1628 1628 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1629 1629 */
1630 1630 static int
1631 1631 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1632 1632 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1633 1633 {
1634 1634 uint_t szcvec;
1635 1635 uchar_t type;
1636 1636
1637 1637 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1638 1638 if (vn_a->type == MAP_SHARED) {
1639 1639 type = MAPPGSZC_SHM;
1640 1640 } else if (vn_a->type == MAP_PRIVATE) {
1641 1641 if (vn_a->szc == AS_MAP_HEAP) {
1642 1642 type = MAPPGSZC_HEAP;
1643 1643 } else if (vn_a->szc == AS_MAP_STACK) {
1644 1644 type = MAPPGSZC_STACK;
1645 1645 } else {
1646 1646 type = MAPPGSZC_PRIVM;
1647 1647 }
1648 1648 }
1649 1649 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1650 1650 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1651 1651 (vn_a->flags & MAP_TEXT), type, 0);
1652 1652 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1653 1653 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1654 1654 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1655 1655 ASSERT(vn_a->vp == NULL);
1656 1656
1657 1657 return (as_map_segvn_segs(as, addr, size, szcvec,
1658 1658 crfp, vn_a, segcreated));
1659 1659 }
1660 1660
1661 1661 int
1662 1662 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1663 1663 {
1664 1664 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1665 1665 return (as_map_locked(as, addr, size, crfp, argsp));
1666 1666 }
1667 1667
1668 1668 int
1669 1669 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1670 1670 void *argsp)
1671 1671 {
1672 1672 struct seg *seg = NULL;
1673 1673 caddr_t raddr; /* rounded down addr */
1674 1674 size_t rsize; /* rounded up size */
1675 1675 int error;
1676 1676 int unmap = 0;
1677 1677 struct proc *p = curproc;
1678 1678 struct segvn_crargs crargs;
1679 1679
1680 1680 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1681 1681 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1682 1682 (size_t)raddr;
1683 1683
1684 1684 /*
1685 1685 	 * check for wraparound
1686 1686 */
1687 1687 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1688 1688 AS_LOCK_EXIT(as, &as->a_lock);
1689 1689 return (ENOMEM);
1690 1690 }
1691 1691
1692 1692 as->a_updatedir = 1; /* inform /proc */
1693 1693 gethrestime(&as->a_updatetime);
1694 1694
1695 1695 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1696 1696 AS_LOCK_EXIT(as, &as->a_lock);
1697 1697
1698 1698 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1699 1699 RCA_UNSAFE_ALL);
1700 1700
1701 1701 return (ENOMEM);
1702 1702 }
1703 1703
1704 1704 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1705 1705 crargs = *(struct segvn_crargs *)argsp;
1706 1706 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1707 1707 if (error != 0) {
1708 1708 AS_LOCK_EXIT(as, &as->a_lock);
1709 1709 if (unmap) {
1710 1710 (void) as_unmap(as, addr, size);
1711 1711 }
1712 1712 return (error);
1713 1713 }
1714 1714 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1715 1715 crargs = *(struct segvn_crargs *)argsp;
1716 1716 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1717 1717 if (error != 0) {
1718 1718 AS_LOCK_EXIT(as, &as->a_lock);
1719 1719 if (unmap) {
1720 1720 (void) as_unmap(as, addr, size);
1721 1721 }
1722 1722 return (error);
1723 1723 }
1724 1724 } else {
1725 1725 seg = seg_alloc(as, addr, size);
1726 1726 if (seg == NULL) {
1727 1727 AS_LOCK_EXIT(as, &as->a_lock);
1728 1728 return (ENOMEM);
1729 1729 }
1730 1730
1731 1731 error = (*crfp)(seg, argsp);
1732 1732 if (error != 0) {
1733 1733 seg_free(seg);
1734 1734 AS_LOCK_EXIT(as, &as->a_lock);
1735 1735 return (error);
1736 1736 }
1737 1737 /*
1738 1738 * Add size now so as_unmap will work if as_ctl fails.
1739 1739 */
1740 1740 as->a_size += rsize;
1741 1741 as->a_resvsize += rsize;
1742 1742 }
1743 1743
1744 1744 as_setwatch(as);
1745 1745
1746 1746 /*
1747 1747 * If the address space is locked,
1748 1748 * establish memory locks for the new segment.
1749 1749 */
1750 1750 mutex_enter(&as->a_contents);
1751 1751 if (AS_ISPGLCK(as)) {
1752 1752 mutex_exit(&as->a_contents);
1753 1753 AS_LOCK_EXIT(as, &as->a_lock);
1754 1754 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1755 1755 if (error != 0)
1756 1756 (void) as_unmap(as, addr, size);
1757 1757 } else {
1758 1758 mutex_exit(&as->a_contents);
1759 1759 AS_LOCK_EXIT(as, &as->a_lock);
1760 1760 }
1761 1761 return (error);
1762 1762 }
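The raddr/rsize computation in as_map_locked() is the page-rounding idiom used throughout this file: the base is rounded down to a page boundary and the end is rounded up. A minimal standalone sketch, with a 4K page size assumed for illustration:

    #include <stdint.h>
    #include <stddef.h>

    #define	PAGESIZE	4096UL		/* assumed for illustration */
    #define	PAGEOFFSET	(PAGESIZE - 1)
    #define	PAGEMASK	(~PAGEOFFSET)

    /* Round [addr, addr + size) outward to whole pages. */
    static void
    round_to_pages(uintptr_t addr, size_t size, uintptr_t *raddr,
        size_t *rsize)
    {
    	*raddr = addr & PAGEMASK;	/* base rounded down */
    	/* end rounded up, minus the rounded base */
    	*rsize = ((addr + size + PAGEOFFSET) & PAGEMASK) - *raddr;
    }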
1763 1763
1764 1764
1765 1765 /*
1766 1766 * Delete all segments in the address space marked with S_PURGE.
1767 1767 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1768 1768 * These segments are deleted as a first step before calls to as_gap(), so
1769 1769 * that they don't affect mmap() or shmat().
1770 1770 */
1771 1771 void
1772 1772 as_purge(struct as *as)
1773 1773 {
1774 1774 struct seg *seg;
1775 1775 struct seg *next_seg;
1776 1776
1777 1777 /*
1778 1778 	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
1779 1779 * no need to grab a_contents mutex for this check
1780 1780 */
1781 1781 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1782 1782 return;
1783 1783
1784 1784 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1785 1785 next_seg = NULL;
1786 1786 seg = AS_SEGFIRST(as);
1787 1787 while (seg != NULL) {
1788 1788 next_seg = AS_SEGNEXT(as, seg);
1789 1789 if (seg->s_flags & S_PURGE)
1790 1790 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1791 1791 seg = next_seg;
1792 1792 }
1793 1793 AS_LOCK_EXIT(as, &as->a_lock);
1794 1794
1795 1795 mutex_enter(&as->a_contents);
1796 1796 as->a_flags &= ~AS_NEEDSPURGE;
1797 1797 mutex_exit(&as->a_contents);
1798 1798 }
1799 1799
1800 1800 /*
1801 1801 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1802 1802 * range of addresses at least "minlen" long, where the base of the range is
1803 1803 * at "off" phase from an "align" boundary and there is space for a
1804 1804  * "redzone"-sized redzone on either side of the range. Thus,
1805 1805 * if align was 4M and off was 16k, the user wants a hole which will start
1806 1806 * 16k into a 4M page.
1807 1807 *
1808 1808 * If flags specifies AH_HI, the hole will have the highest possible address
1809 1809 * in the range. We use the as->a_lastgap field to figure out where to
1810 1810 * start looking for a gap.
1811 1811 *
1812 1812 * Otherwise, the gap will have the lowest possible address.
1813 1813 *
1814 1814 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1815 1815 *
1816 1816 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1817 1817 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1818 1818 *
1819 1819 * NOTE: This routine is not correct when base+len overflows caddr_t.
1820 1820 */
1821 1821 int
1822 1822 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1823 1823 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1824 1824 {
1825 1825 caddr_t lobound = *basep;
1826 1826 caddr_t hibound = lobound + *lenp;
1827 1827 struct seg *lseg, *hseg;
1828 1828 caddr_t lo, hi;
1829 1829 int forward;
1830 1830 caddr_t save_base;
1831 1831 size_t save_len;
1832 1832 size_t save_minlen;
1833 1833 size_t save_redzone;
1834 1834 int fast_path = 1;
1835 1835
1836 1836 save_base = *basep;
1837 1837 save_len = *lenp;
1838 1838 save_minlen = minlen;
1839 1839 save_redzone = redzone;
1840 1840
1841 1841 /*
1842 1842 	 * For the first pass/fast_path, just fold align and both redzones
1843 1843 	 * into minlen: if we find a hole that large, we can guarantee that
1844 1844 	 * the allocation will fit the alignment and redzone requested.
1845 1845 	 * This increases the chance that hibound will be adjusted to
1846 1846 	 * a_lastgap->s_base, which will likely allow us to find an
1847 1847 	 * acceptable hole in the address space more quickly.
1848 1848 * If we can't find a hole with this fast_path, then we look for
1849 1849 * smaller holes in which the alignment and offset may allow
1850 1850 * the allocation to fit.
1851 1851 */
1852 1852 minlen += align;
1853 1853 minlen += 2 * redzone;
1854 1854 redzone = 0;
1855 1855
1856 1856 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1857 1857 if (AS_SEGFIRST(as) == NULL) {
1858 1858 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1859 1859 align, redzone, off)) {
1860 1860 AS_LOCK_EXIT(as, &as->a_lock);
1861 1861 return (0);
1862 1862 } else {
1863 1863 AS_LOCK_EXIT(as, &as->a_lock);
1864 1864 *basep = save_base;
1865 1865 *lenp = save_len;
1866 1866 return (-1);
1867 1867 }
1868 1868 }
1869 1869
1870 1870 retry:
1871 1871 /*
1872 1872 * Set up to iterate over all the inter-segment holes in the given
1873 1873 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1874 1874 * NULL for the highest-addressed hole. If moving backwards, we reset
1875 1875 	 * hseg to denote the highest-addressed segment.
1876 1876 */
1877 1877 forward = (flags & AH_DIR) == AH_LO;
1878 1878 if (forward) {
1879 1879 hseg = as_findseg(as, lobound, 1);
1880 1880 lseg = AS_SEGPREV(as, hseg);
1881 1881 } else {
1882 1882
1883 1883 /*
1884 1884 * If allocating at least as much as the last allocation,
1885 1885 * use a_lastgap's base as a better estimate of hibound.
1886 1886 */
1887 1887 if (as->a_lastgap &&
1888 1888 minlen >= as->a_lastgap->s_size &&
1889 1889 hibound >= as->a_lastgap->s_base)
1890 1890 hibound = as->a_lastgap->s_base;
1891 1891
1892 1892 hseg = as_findseg(as, hibound, 1);
1893 1893 if (hseg->s_base + hseg->s_size < hibound) {
1894 1894 lseg = hseg;
1895 1895 hseg = NULL;
1896 1896 } else {
1897 1897 lseg = AS_SEGPREV(as, hseg);
1898 1898 }
1899 1899 }
1900 1900
1901 1901 for (;;) {
1902 1902 /*
1903 1903 * Set lo and hi to the hole's boundaries. (We should really
1904 1904 * use MAXADDR in place of hibound in the expression below,
1905 1905 * but can't express it easily; using hibound in its place is
1906 1906 * harmless.)
1907 1907 */
1908 1908 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1909 1909 hi = (hseg == NULL) ? hibound : hseg->s_base;
1910 1910 /*
1911 1911 * If the iteration has moved past the interval from lobound
1912 1912 * to hibound it's pointless to continue.
1913 1913 */
1914 1914 if ((forward && lo > hibound) || (!forward && hi < lobound))
1915 1915 break;
1916 1916 else if (lo > hibound || hi < lobound)
1917 1917 goto cont;
1918 1918 /*
1919 1919 * Candidate hole lies at least partially within the allowable
1920 1920 * range. Restrict it to fall completely within that range,
1921 1921 * i.e., to [max(lo, lobound), min(hi, hibound)].
1922 1922 */
1923 1923 if (lo < lobound)
1924 1924 lo = lobound;
1925 1925 if (hi > hibound)
1926 1926 hi = hibound;
1927 1927 /*
1928 1928 * Verify that the candidate hole is big enough and meets
1929 1929 * hardware constraints. If the hole is too small, no need
1930 1930 * to do the further checks since they will fail.
1931 1931 */
1932 1932 *basep = lo;
1933 1933 *lenp = hi - lo;
1934 1934 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1935 1935 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1936 1936 ((flags & AH_CONTAIN) == 0 ||
1937 1937 (*basep <= addr && *basep + *lenp > addr))) {
1938 1938 if (!forward)
1939 1939 as->a_lastgap = hseg;
1940 1940 if (hseg != NULL)
1941 1941 as->a_lastgaphl = hseg;
1942 1942 else
1943 1943 as->a_lastgaphl = lseg;
1944 1944 AS_LOCK_EXIT(as, &as->a_lock);
1945 1945 return (0);
1946 1946 }
1947 1947 cont:
1948 1948 /*
1949 1949 * Move to the next hole.
1950 1950 */
1951 1951 if (forward) {
1952 1952 lseg = hseg;
1953 1953 if (lseg == NULL)
1954 1954 break;
1955 1955 hseg = AS_SEGNEXT(as, hseg);
1956 1956 } else {
1957 1957 hseg = lseg;
1958 1958 if (hseg == NULL)
1959 1959 break;
1960 1960 lseg = AS_SEGPREV(as, lseg);
1961 1961 }
1962 1962 }
1963 1963 if (fast_path && (align != 0 || save_redzone != 0)) {
1964 1964 fast_path = 0;
1965 1965 minlen = save_minlen;
1966 1966 redzone = save_redzone;
1967 1967 goto retry;
1968 1968 }
1969 1969 *basep = save_base;
1970 1970 *lenp = save_len;
1971 1971 AS_LOCK_EXIT(as, &as->a_lock);
1972 1972 return (-1);
1973 1973 }
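The fast path in as_gap_aligned() relies on a simple geometric fact: any hole of at least minlen + align + 2 * redzone bytes can host a minlen-byte allocation at the requested phase with both redzones intact, which is why the first pass just inflates minlen and zeroes redzone. A minimal sketch of the underlying check, assuming align is a nonzero power of two (as valid_va_range_aligned() requires) and off <= lo + redzone:

    #include <stdint.h>
    #include <stddef.h>

    #define	P2ROUNDUP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

    /* Can [lo, hi) hold minlen bytes at phase off from align, redzoned? */
    static int
    hole_fits(uintptr_t lo, uintptr_t hi, size_t minlen, size_t align,
        size_t redzone, size_t off)
    {
    	/* First properly-phased address past the low redzone. */
    	uintptr_t base = P2ROUNDUP(lo + redzone - off, align) + off;

    	return (base >= lo + redzone && base + minlen + redzone <= hi);
    }

Since P2ROUNDUP adds less than align and each redzone adds at most redzone, base + minlen + redzone never exceeds lo + minlen + align + 2 * redzone, so a hole that long always passes.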
1974 1974
1975 1975 /*
1976 1976 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1977 1977 *
1978 1978 * If flags specifies AH_HI, the hole will have the highest possible address
1979 1979 * in the range. We use the as->a_lastgap field to figure out where to
1980 1980 * start looking for a gap.
1981 1981 *
1982 1982 * Otherwise, the gap will have the lowest possible address.
1983 1983 *
1984 1984 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1985 1985 *
1986 1986 * If an adequate hole is found, base and len are set to reflect the part of
1987 1987 * the hole that is within range, and 0 is returned, otherwise,
1988 1988 * -1 is returned.
1989 1989 *
1990 1990 * NOTE: This routine is not correct when base+len overflows caddr_t.
1991 1991 */
1992 1992 int
1993 1993 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1994 1994 caddr_t addr)
1995 1995 {
1996 1996
1997 1997 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1998 1998 }
1999 1999
2000 2000 /*
2001 2001 * Return the next range within [base, base + len) that is backed
2002 2002 * with "real memory". Skip holes and non-seg_vn segments.
2003 2003 * We're lazy and only return one segment at a time.
2004 2004 */
2005 2005 int
2006 2006 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2007 2007 {
2008 2008 extern struct seg_ops segspt_shmops; /* needs a header file */
2009 2009 struct seg *seg;
2010 2010 caddr_t addr, eaddr;
2011 2011 caddr_t segend;
2012 2012
2013 2013 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2014 2014
2015 2015 addr = *basep;
2016 2016 eaddr = addr + *lenp;
2017 2017
2018 2018 seg = as_findseg(as, addr, 0);
2019 2019 if (seg != NULL)
2020 2020 addr = MAX(seg->s_base, addr);
2021 2021
2022 2022 for (;;) {
2023 2023 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2024 2024 AS_LOCK_EXIT(as, &as->a_lock);
2025 2025 return (EINVAL);
2026 2026 }
2027 2027
2028 2028 if (seg->s_ops == &segvn_ops) {
2029 2029 segend = seg->s_base + seg->s_size;
2030 2030 break;
2031 2031 }
2032 2032
2033 2033 /*
2034 2034 * We do ISM by looking into the private data
2035 2035 * to determine the real size of the segment.
2036 2036 */
2037 2037 if (seg->s_ops == &segspt_shmops) {
2038 2038 segend = seg->s_base + spt_realsize(seg);
2039 2039 if (addr < segend)
2040 2040 break;
2041 2041 }
2042 2042
2043 2043 seg = AS_SEGNEXT(as, seg);
2044 2044
2045 2045 if (seg != NULL)
2046 2046 addr = seg->s_base;
2047 2047 }
2048 2048
2049 2049 *basep = addr;
2050 2050
2051 2051 if (segend > eaddr)
2052 2052 *lenp = eaddr - addr;
2053 2053 else
2054 2054 *lenp = segend - addr;
2055 2055
2056 2056 AS_LOCK_EXIT(as, &as->a_lock);
2057 2057 return (0);
2058 2058 }
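Because as_memory() returns only one segment per call, callers loop, shrinking the window past each returned piece until the routine reports EINVAL. A minimal sketch of that pattern, kernel context assumed:

    /* Walk [base, base + len) one real-memory piece at a time. */
    static void
    walk_real_memory(struct as *as, caddr_t base, size_t len)
    {
    	caddr_t a = base;
    	size_t l;

    	for (;;) {
    		l = (size_t)((base + len) - a);	/* remaining window */
    		if (l == 0 || as_memory(as, &a, &l) != 0)
    			break;
    		/* [a, a + l) is backed by real memory; process it here. */
    		a += l;
    	}
    }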
2059 2059
2060 2060 /*
2061 - * Swap the pages associated with the address space as out to
2062 - * secondary storage, returning the number of bytes actually
2063 - * swapped.
2064 - *
2065 - * The value returned is intended to correlate well with the process's
2066 - * memory requirements. Its usefulness for this purpose depends on
2067 - * how well the segment-level routines do at returning accurate
2068 - * information.
2069 - */
2070 -size_t
2071 -as_swapout(struct as *as)
2072 -{
2073 - struct seg *seg;
2074 - size_t swpcnt = 0;
2075 -
2076 - /*
2077 - * Kernel-only processes have given up their address
2078 - * spaces. Of course, we shouldn't be attempting to
2079 - * swap out such processes in the first place...
2080 - */
2081 - if (as == NULL)
2082 - return (0);
2083 -
2084 - AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2085 -
2086 - /*
2087 - * Free all mapping resources associated with the address
2088 - * space. The segment-level swapout routines capitalize
2089 - 	 * on this unmapping by scavenging pages that have become
2090 - * unmapped here.
2091 - */
2092 - hat_swapout(as->a_hat);
2093 -
2094 - /*
2095 - * Call the swapout routines of all segments in the address
2096 - * space to do the actual work, accumulating the amount of
2097 - * space reclaimed.
2098 - */
2099 - for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2100 - struct seg_ops *ov = seg->s_ops;
2101 -
2102 - /*
2103 - * We have to check to see if the seg has
2104 - * an ops vector because the seg may have
2105 - * been in the middle of being set up when
2106 - * the process was picked for swapout.
2107 - */
2108 - if ((ov != NULL) && (ov->swapout != NULL))
2109 - swpcnt += SEGOP_SWAPOUT(seg);
2110 - }
2111 - AS_LOCK_EXIT(as, &as->a_lock);
2112 - return (swpcnt);
2113 -}
2114 -
2115 -/*
2116 2061 * Determine whether data from the mappings in interval [addr, addr + size)
2117 2062 * are in the primary memory (core) cache.
2118 2063 */
2119 2064 int
2120 2065 as_incore(struct as *as, caddr_t addr,
2121 2066 size_t size, char *vec, size_t *sizep)
2122 2067 {
2123 2068 struct seg *seg;
2124 2069 size_t ssize;
2125 2070 caddr_t raddr; /* rounded down addr */
2126 2071 size_t rsize; /* rounded up size */
2127 2072 size_t isize; /* iteration size */
2128 2073 int error = 0; /* result, assume success */
2129 2074
2130 2075 *sizep = 0;
2131 2076 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2132 2077 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2133 2078 (size_t)raddr;
2134 2079
2135 2080 if (raddr + rsize < raddr) /* check for wraparound */
2136 2081 return (ENOMEM);
2137 2082
2138 2083 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2139 2084 seg = as_segat(as, raddr);
2140 2085 if (seg == NULL) {
2141 2086 AS_LOCK_EXIT(as, &as->a_lock);
2142 2087 return (-1);
2143 2088 }
2144 2089
2145 2090 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2146 2091 if (raddr >= seg->s_base + seg->s_size) {
2147 2092 seg = AS_SEGNEXT(as, seg);
2148 2093 if (seg == NULL || raddr != seg->s_base) {
2149 2094 error = -1;
2150 2095 break;
2151 2096 }
2152 2097 }
2153 2098 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2154 2099 ssize = seg->s_base + seg->s_size - raddr;
2155 2100 else
2156 2101 ssize = rsize;
2157 2102 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2158 2103 if (isize != ssize) {
2159 2104 error = -1;
2160 2105 break;
2161 2106 }
2162 2107 vec += btopr(ssize);
2163 2108 }
2164 2109 AS_LOCK_EXIT(as, &as->a_lock);
2165 2110 return (error);
2166 2111 }
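as_incore() backs the mincore(2) system call, which reports one status char per page with the low bit set for resident pages. A minimal userland sketch:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Count resident pages in [addr, addr + len); -1 on error. */
    static int
    resident_pages(void *addr, size_t len)
    {
    	long pgsz = sysconf(_SC_PAGESIZE);
    	size_t npages = (len + pgsz - 1) / pgsz;
    	char *vec = malloc(npages);	/* one status char per page */
    	int n = 0;

    	if (vec == NULL || mincore((caddr_t)addr, len, vec) != 0) {
    		free(vec);
    		return (-1);
    	}
    	for (size_t i = 0; i < npages; i++)
    		if (vec[i] & 1)		/* low bit == resident */
    			n++;
    	free(vec);
    	return (n);
    }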
2167 2112
2168 2113 static void
2169 2114 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2170 2115 ulong_t *bitmap, size_t position, size_t npages)
2171 2116 {
2172 2117 caddr_t range_start;
2173 2118 size_t pos1 = position;
2174 2119 size_t pos2;
2175 2120 size_t size;
2176 2121 size_t end_pos = npages + position;
2177 2122
2178 2123 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2179 2124 size = ptob((pos2 - pos1));
2180 2125 range_start = (caddr_t)((uintptr_t)addr +
2181 2126 ptob(pos1 - position));
2182 2127
2183 2128 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2184 2129 (ulong_t *)NULL, (size_t)NULL);
2185 2130 pos1 = pos2;
2186 2131 }
2187 2132 }
2188 2133
2189 2134 static void
2190 2135 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2191 2136 caddr_t raddr, size_t rsize)
2192 2137 {
2193 2138 struct seg *seg = as_segat(as, raddr);
2194 2139 size_t ssize;
2195 2140
2196 2141 while (rsize != 0) {
2197 2142 if (raddr >= seg->s_base + seg->s_size)
2198 2143 seg = AS_SEGNEXT(as, seg);
2199 2144
2200 2145 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2201 2146 ssize = seg->s_base + seg->s_size - raddr;
2202 2147 else
2203 2148 ssize = rsize;
2204 2149
2205 2150 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2206 2151
2207 2152 rsize -= ssize;
2208 2153 raddr += ssize;
2209 2154 }
2210 2155 }
2211 2156
2212 2157 /*
2213 2158 * Cache control operations over the interval [addr, addr + size) in
2214 2159 * address space "as".
2215 2160 */
2216 2161 /*ARGSUSED*/
2217 2162 int
2218 2163 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2219 2164 uintptr_t arg, ulong_t *lock_map, size_t pos)
2220 2165 {
2221 2166 struct seg *seg; /* working segment */
2222 2167 caddr_t raddr; /* rounded down addr */
2223 2168 caddr_t initraddr; /* saved initial rounded down addr */
2224 2169 size_t rsize; /* rounded up size */
2225 2170 size_t initrsize; /* saved initial rounded up size */
2226 2171 size_t ssize; /* size of seg */
2227 2172 int error = 0; /* result */
2228 2173 size_t mlock_size; /* size of bitmap */
2229 2174 ulong_t *mlock_map; /* pointer to bitmap used */
2230 2175 /* to represent the locked */
2231 2176 /* pages. */
2232 2177 retry:
2233 2178 if (error == IE_RETRY)
2234 2179 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2235 2180 else
2236 2181 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2237 2182
2238 2183 /*
2239 2184 * If these are address space lock/unlock operations, loop over
2240 2185 * all segments in the address space, as appropriate.
2241 2186 */
2242 2187 if (func == MC_LOCKAS) {
2243 2188 size_t npages, idx;
2244 2189 size_t rlen = 0; /* rounded as length */
2245 2190
2246 2191 idx = pos;
2247 2192
2248 2193 if (arg & MCL_FUTURE) {
2249 2194 mutex_enter(&as->a_contents);
2250 2195 AS_SETPGLCK(as);
2251 2196 mutex_exit(&as->a_contents);
2252 2197 }
2253 2198 if ((arg & MCL_CURRENT) == 0) {
2254 2199 AS_LOCK_EXIT(as, &as->a_lock);
2255 2200 return (0);
2256 2201 }
2257 2202
2258 2203 seg = AS_SEGFIRST(as);
2259 2204 if (seg == NULL) {
2260 2205 AS_LOCK_EXIT(as, &as->a_lock);
2261 2206 return (0);
2262 2207 }
2263 2208
2264 2209 do {
2265 2210 raddr = (caddr_t)((uintptr_t)seg->s_base &
2266 2211 (uintptr_t)PAGEMASK);
2267 2212 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2268 2213 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2269 2214 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2270 2215
2271 2216 mlock_size = BT_BITOUL(btopr(rlen));
2272 2217 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2273 2218 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2274 2219 AS_LOCK_EXIT(as, &as->a_lock);
2275 2220 return (EAGAIN);
2276 2221 }
2277 2222
2278 2223 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2279 2224 error = SEGOP_LOCKOP(seg, seg->s_base,
2280 2225 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2281 2226 if (error != 0)
2282 2227 break;
2283 2228 pos += seg_pages(seg);
2284 2229 }
2285 2230
2286 2231 if (error) {
2287 2232 for (seg = AS_SEGFIRST(as); seg != NULL;
2288 2233 seg = AS_SEGNEXT(as, seg)) {
2289 2234
2290 2235 raddr = (caddr_t)((uintptr_t)seg->s_base &
2291 2236 (uintptr_t)PAGEMASK);
2292 2237 npages = seg_pages(seg);
2293 2238 as_segunlock(seg, raddr, attr, mlock_map,
2294 2239 idx, npages);
2295 2240 idx += npages;
2296 2241 }
2297 2242 }
2298 2243
2299 2244 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2300 2245 AS_LOCK_EXIT(as, &as->a_lock);
2301 2246 goto lockerr;
2302 2247 } else if (func == MC_UNLOCKAS) {
2303 2248 mutex_enter(&as->a_contents);
2304 2249 AS_CLRPGLCK(as);
2305 2250 mutex_exit(&as->a_contents);
2306 2251
2307 2252 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2308 2253 error = SEGOP_LOCKOP(seg, seg->s_base,
2309 2254 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2310 2255 if (error != 0)
2311 2256 break;
2312 2257 }
2313 2258
2314 2259 AS_LOCK_EXIT(as, &as->a_lock);
2315 2260 goto lockerr;
2316 2261 }
2317 2262
2318 2263 /*
2319 2264 * Normalize addresses and sizes.
2320 2265 */
2321 2266 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2322 2267 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2323 2268 (size_t)raddr;
2324 2269
2325 2270 if (raddr + rsize < raddr) { /* check for wraparound */
2326 2271 AS_LOCK_EXIT(as, &as->a_lock);
2327 2272 return (ENOMEM);
2328 2273 }
2329 2274
2330 2275 /*
2331 2276 * Get initial segment.
2332 2277 */
2333 2278 if ((seg = as_segat(as, raddr)) == NULL) {
2334 2279 AS_LOCK_EXIT(as, &as->a_lock);
2335 2280 return (ENOMEM);
2336 2281 }
2337 2282
2338 2283 if (func == MC_LOCK) {
2339 2284 mlock_size = BT_BITOUL(btopr(rsize));
2340 2285 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2341 2286 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2342 2287 AS_LOCK_EXIT(as, &as->a_lock);
2343 2288 return (EAGAIN);
2344 2289 }
2345 2290 }
2346 2291
2347 2292 /*
2348 2293 * Loop over all segments. If a hole in the address range is
2349 2294 * discovered, then fail. For each segment, perform the appropriate
2350 2295 * control operation.
2351 2296 */
2352 2297 while (rsize != 0) {
2353 2298
2354 2299 /*
2355 2300 * Make sure there's no hole, calculate the portion
2356 2301 * of the next segment to be operated over.
2357 2302 */
2358 2303 if (raddr >= seg->s_base + seg->s_size) {
2359 2304 seg = AS_SEGNEXT(as, seg);
2360 2305 if (seg == NULL || raddr != seg->s_base) {
2361 2306 if (func == MC_LOCK) {
2362 2307 as_unlockerr(as, attr, mlock_map,
2363 2308 initraddr, initrsize - rsize);
2364 2309 kmem_free(mlock_map,
2365 2310 mlock_size * sizeof (ulong_t));
2366 2311 }
2367 2312 AS_LOCK_EXIT(as, &as->a_lock);
2368 2313 return (ENOMEM);
2369 2314 }
2370 2315 }
2371 2316 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2372 2317 ssize = seg->s_base + seg->s_size - raddr;
2373 2318 else
2374 2319 ssize = rsize;
2375 2320
2376 2321 /*
2377 2322 * Dispatch on specific function.
2378 2323 */
2379 2324 switch (func) {
2380 2325
2381 2326 /*
2382 2327 * Synchronize cached data from mappings with backing
2383 2328 * objects.
2384 2329 */
2385 2330 case MC_SYNC:
2386 2331 if (error = SEGOP_SYNC(seg, raddr, ssize,
2387 2332 attr, (uint_t)arg)) {
2388 2333 AS_LOCK_EXIT(as, &as->a_lock);
2389 2334 return (error);
2390 2335 }
2391 2336 break;
2392 2337
2393 2338 /*
2394 2339 * Lock pages in memory.
2395 2340 */
2396 2341 case MC_LOCK:
2397 2342 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2398 2343 attr, func, mlock_map, pos)) {
2399 2344 as_unlockerr(as, attr, mlock_map, initraddr,
2400 2345 initrsize - rsize + ssize);
2401 2346 kmem_free(mlock_map, mlock_size *
2402 2347 sizeof (ulong_t));
2403 2348 AS_LOCK_EXIT(as, &as->a_lock);
2404 2349 goto lockerr;
2405 2350 }
2406 2351 break;
2407 2352
2408 2353 /*
2409 2354 * Unlock mapped pages.
2410 2355 */
2411 2356 case MC_UNLOCK:
2412 2357 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2413 2358 (ulong_t *)NULL, (size_t)NULL);
2414 2359 break;
2415 2360
2416 2361 /*
2417 2362 * Store VM advise for mapped pages in segment layer.
2418 2363 */
2419 2364 case MC_ADVISE:
2420 2365 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2421 2366
2422 2367 /*
2423 2368 * Check for regular errors and special retry error
2424 2369 */
2425 2370 if (error) {
2426 2371 if (error == IE_RETRY) {
2427 2372 /*
2428 2373 * Need to acquire writers lock, so
2429 2374 * have to drop readers lock and start
2430 2375 * all over again
2431 2376 */
2432 2377 AS_LOCK_EXIT(as, &as->a_lock);
2433 2378 goto retry;
2434 2379 } else if (error == IE_REATTACH) {
2435 2380 /*
2436 2381 * Find segment for current address
2437 2382 * because current segment just got
2438 2383 * split or concatenated
2439 2384 */
2440 2385 seg = as_segat(as, raddr);
2441 2386 if (seg == NULL) {
2442 2387 AS_LOCK_EXIT(as, &as->a_lock);
2443 2388 return (ENOMEM);
2444 2389 }
2445 2390 } else {
2446 2391 /*
2447 2392 * Regular error
2448 2393 */
2449 2394 AS_LOCK_EXIT(as, &as->a_lock);
2450 2395 return (error);
2451 2396 }
2452 2397 }
2453 2398 break;
2454 2399
2455 2400 case MC_INHERIT_ZERO:
2456 2401 if (seg->s_ops->inherit == NULL) {
2457 2402 error = ENOTSUP;
2458 2403 } else {
2459 2404 error = SEGOP_INHERIT(seg, raddr, ssize,
2460 2405 SEGP_INH_ZERO);
2461 2406 }
2462 2407 if (error != 0) {
2463 2408 AS_LOCK_EXIT(as, &as->a_lock);
2464 2409 return (error);
2465 2410 }
2466 2411 break;
2467 2412
2468 2413 /*
2469 2414 * Can't happen.
2470 2415 */
2471 2416 default:
2472 2417 panic("as_ctl: bad operation %d", func);
2473 2418 /*NOTREACHED*/
2474 2419 }
2475 2420
2476 2421 rsize -= ssize;
2477 2422 raddr += ssize;
2478 2423 }
2479 2424
2480 2425 if (func == MC_LOCK)
2481 2426 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2482 2427 AS_LOCK_EXIT(as, &as->a_lock);
2483 2428 return (0);
2484 2429 lockerr:
2485 2430
2486 2431 /*
2487 2432 * If the lower levels returned EDEADLK for a segment lockop,
2488 2433 * it means that we should retry the operation. Let's wait
2489 2434 * a bit also to let the deadlock causing condition clear.
2490 2435 * This is part of a gross hack to work around a design flaw
2491 2436 * in the ufs/sds logging code and should go away when the
2492 2437 * logging code is re-designed to fix the problem. See bug
2493 2438 * 4125102 for details of the problem.
2494 2439 */
2495 2440 if (error == EDEADLK) {
2496 2441 delay(deadlk_wait);
2497 2442 error = 0;
2498 2443 goto retry;
2499 2444 }
2500 2445 return (error);
2501 2446 }
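The MC_LOCK/MC_UNLOCK and MC_LOCKAS/MC_UNLOCKAS cases above are what mlock(3C), munlock(3C), and mlockall(3C) reach through memcntl(). A minimal userland sketch of the per-range path:

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <stdio.h>

    /* Pin a range of pages, use it, then release the lock. */
    static int
    with_locked_range(void *addr, size_t len)
    {
    	/* Reaches as_ctl(..., MC_LOCK, ...) for this range. */
    	if (mlock((caddr_t)addr, len) != 0) {
    		perror("mlock");
    		return (-1);
    	}
    	/* ... touch the memory; it stays resident while locked ... */

    	/* Reaches as_ctl(..., MC_UNLOCK, ...). */
    	(void) munlock((caddr_t)addr, len);
    	return (0);
    }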
2502 2447
2503 2448 int
2504 2449 fc_decode(faultcode_t fault_err)
2505 2450 {
2506 2451 int error = 0;
2507 2452
2508 2453 switch (FC_CODE(fault_err)) {
2509 2454 case FC_OBJERR:
2510 2455 error = FC_ERRNO(fault_err);
2511 2456 break;
2512 2457 case FC_PROT:
2513 2458 error = EACCES;
2514 2459 break;
2515 2460 default:
2516 2461 error = EFAULT;
2517 2462 break;
2518 2463 }
2519 2464 return (error);
2520 2465 }
2521 2466
2522 2467 /*
2523 2468 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2524 2469 * lists from each segment and copy them to one contiguous shadow list (plist)
2525 2470 * as expected by the caller. Save pointers to per segment shadow lists at
2526 2471 * the tail of plist so that they can be used during as_pageunlock().
2527 2472 */
2528 2473 static int
2529 2474 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2530 2475 caddr_t addr, size_t size, enum seg_rw rw)
2531 2476 {
2532 2477 caddr_t sv_addr = addr;
2533 2478 size_t sv_size = size;
2534 2479 struct seg *sv_seg = seg;
2535 2480 ulong_t segcnt = 1;
2536 2481 ulong_t cnt;
2537 2482 size_t ssize;
2538 2483 pgcnt_t npages = btop(size);
2539 2484 page_t **plist;
2540 2485 page_t **pl;
2541 2486 int error;
2542 2487 caddr_t eaddr;
2543 2488 faultcode_t fault_err = 0;
2544 2489 pgcnt_t pl_off;
2545 2490 extern struct seg_ops segspt_shmops;
2546 2491
2547 2492 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2548 2493 ASSERT(seg != NULL);
2549 2494 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2550 2495 ASSERT(addr + size > seg->s_base + seg->s_size);
2551 2496 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2552 2497 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2553 2498
2554 2499 /*
2555 2500 * Count the number of segments covered by the range we are about to
2556 2501 * lock. The segment count is used to size the shadow list we return
2557 2502 * back to the caller.
2558 2503 */
2559 2504 for (; size != 0; size -= ssize, addr += ssize) {
2560 2505 if (addr >= seg->s_base + seg->s_size) {
2561 2506
2562 2507 seg = AS_SEGNEXT(as, seg);
2563 2508 if (seg == NULL || addr != seg->s_base) {
2564 2509 AS_LOCK_EXIT(as, &as->a_lock);
2565 2510 return (EFAULT);
2566 2511 }
2567 2512 /*
2568 2513 * Do a quick check if subsequent segments
2569 2514 * will most likely support pagelock.
2570 2515 */
2571 2516 if (seg->s_ops == &segvn_ops) {
2572 2517 vnode_t *vp;
2573 2518
2574 2519 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2575 2520 vp != NULL) {
2576 2521 AS_LOCK_EXIT(as, &as->a_lock);
2577 2522 goto slow;
2578 2523 }
2579 2524 } else if (seg->s_ops != &segspt_shmops) {
2580 2525 AS_LOCK_EXIT(as, &as->a_lock);
2581 2526 goto slow;
2582 2527 }
2583 2528 segcnt++;
2584 2529 }
2585 2530 if (addr + size > seg->s_base + seg->s_size) {
2586 2531 ssize = seg->s_base + seg->s_size - addr;
2587 2532 } else {
2588 2533 ssize = size;
2589 2534 }
2590 2535 }
2591 2536 ASSERT(segcnt > 1);
2592 2537
2593 2538 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2594 2539
2595 2540 addr = sv_addr;
2596 2541 size = sv_size;
2597 2542 seg = sv_seg;
2598 2543
2599 2544 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2600 2545 if (addr >= seg->s_base + seg->s_size) {
2601 2546 seg = AS_SEGNEXT(as, seg);
2602 2547 ASSERT(seg != NULL && addr == seg->s_base);
2603 2548 cnt++;
2604 2549 ASSERT(cnt < segcnt);
2605 2550 }
2606 2551 if (addr + size > seg->s_base + seg->s_size) {
2607 2552 ssize = seg->s_base + seg->s_size - addr;
2608 2553 } else {
2609 2554 ssize = size;
2610 2555 }
2611 2556 pl = &plist[npages + cnt];
2612 2557 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2613 2558 L_PAGELOCK, rw);
2614 2559 if (error) {
2615 2560 break;
2616 2561 }
2617 2562 ASSERT(plist[npages + cnt] != NULL);
2618 2563 ASSERT(pl_off + btop(ssize) <= npages);
2619 2564 bcopy(plist[npages + cnt], &plist[pl_off],
2620 2565 btop(ssize) * sizeof (page_t *));
2621 2566 pl_off += btop(ssize);
2622 2567 }
2623 2568
2624 2569 if (size == 0) {
2625 2570 AS_LOCK_EXIT(as, &as->a_lock);
2626 2571 ASSERT(cnt == segcnt - 1);
2627 2572 *ppp = plist;
2628 2573 return (0);
2629 2574 }
2630 2575
2631 2576 /*
2632 2577 	 * One of the pagelock calls failed.  The error type is in the error variable.
2633 2578 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2634 2579 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2635 2580 * back to the caller.
2636 2581 */
2637 2582
2638 2583 eaddr = addr;
2639 2584 seg = sv_seg;
2640 2585
2641 2586 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2642 2587 if (addr >= seg->s_base + seg->s_size) {
2643 2588 seg = AS_SEGNEXT(as, seg);
2644 2589 ASSERT(seg != NULL && addr == seg->s_base);
2645 2590 cnt++;
2646 2591 ASSERT(cnt < segcnt);
2647 2592 }
2648 2593 if (eaddr > seg->s_base + seg->s_size) {
2649 2594 ssize = seg->s_base + seg->s_size - addr;
2650 2595 } else {
2651 2596 ssize = eaddr - addr;
2652 2597 }
2653 2598 pl = &plist[npages + cnt];
2654 2599 ASSERT(*pl != NULL);
2655 2600 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2656 2601 L_PAGEUNLOCK, rw);
2657 2602 }
2658 2603
2659 2604 AS_LOCK_EXIT(as, &as->a_lock);
2660 2605
2661 2606 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2662 2607
2663 2608 if (error != ENOTSUP && error != EFAULT) {
2664 2609 return (error);
2665 2610 }
2666 2611
2667 2612 slow:
2668 2613 /*
2669 2614 	 * If we are here because pagelock failed due to the need to cow-fault
2670 2615 	 * in the pages we want to lock, F_SOFTLOCK will do that job, and the
2671 2616 	 * next as_pagelock() call for this address range will hopefully
2672 2617 	 * succeed.
2673 2618 */
2674 2619 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2675 2620 if (fault_err != 0) {
2676 2621 return (fc_decode(fault_err));
2677 2622 }
2678 2623 *ppp = NULL;
2679 2624
2680 2625 return (0);
2681 2626 }
2682 2627
2683 2628 /*
2684 2629 * lock pages in a given address space. Return shadow list. If
2685 2630 * the list is NULL, the MMU mapping is also locked.
2686 2631 */
2687 2632 int
2688 2633 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2689 2634 size_t size, enum seg_rw rw)
2690 2635 {
2691 2636 size_t rsize;
2692 2637 caddr_t raddr;
2693 2638 faultcode_t fault_err;
2694 2639 struct seg *seg;
2695 2640 int err;
2696 2641
2697 2642 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2698 2643 "as_pagelock_start: addr %p size %ld", addr, size);
2699 2644
2700 2645 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2701 2646 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2702 2647 (size_t)raddr;
2703 2648
2704 2649 /*
2705 2650 * if the request crosses two segments let
2706 2651 * as_fault handle it.
2707 2652 */
2708 2653 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2709 2654
2710 2655 seg = as_segat(as, raddr);
2711 2656 if (seg == NULL) {
2712 2657 AS_LOCK_EXIT(as, &as->a_lock);
2713 2658 return (EFAULT);
2714 2659 }
2715 2660 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2716 2661 if (raddr + rsize > seg->s_base + seg->s_size) {
2717 2662 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2718 2663 }
2719 2664 if (raddr + rsize <= raddr) {
2720 2665 AS_LOCK_EXIT(as, &as->a_lock);
2721 2666 return (EFAULT);
2722 2667 }
2723 2668
2724 2669 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2725 2670 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2726 2671
2727 2672 /*
2728 2673 * try to lock pages and pass back shadow list
2729 2674 */
2730 2675 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2731 2676
2732 2677 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2733 2678
2734 2679 AS_LOCK_EXIT(as, &as->a_lock);
2735 2680
2736 2681 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2737 2682 return (err);
2738 2683 }
2739 2684
2740 2685 /*
2741 2686 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2742 2687 	 * to no pagelock support for this segment or because pages need to be
2743 2688 	 * cow-faulted in.  If a fault is needed, F_SOFTLOCK will do this job
2744 2689 	 * for this as_pagelock() call, and in the next as_pagelock() call for
2745 2690 	 * the same address range the pagelock call will hopefully succeed.
2746 2691 */
2747 2692 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2748 2693 if (fault_err != 0) {
2749 2694 return (fc_decode(fault_err));
2750 2695 }
2751 2696 *ppp = NULL;
2752 2697
2753 2698 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2754 2699 return (0);
2755 2700 }
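A caller must hand the shadow list back to as_pageunlock() unchanged; a NULL list is legal and means the pages were soft-locked via as_fault() instead, which as_pageunlock() undoes with F_SOFTUNLOCK. A minimal sketch of the contract, kernel context assumed:

    static int
    with_locked_pages(struct as *as, caddr_t addr, size_t len)
    {
    	struct page **pplist;
    	int err;

    	err = as_pagelock(as, &pplist, addr, len, S_WRITE);
    	if (err != 0)
    		return (err);
    	/*
    	 * [addr, addr + len) is now held.  pplist may be NULL (the
    	 * F_SOFTLOCK fallback); either way the same pointer must be
    	 * passed back below.
    	 */
    	as_pageunlock(as, pplist, addr, len, S_WRITE);
    	return (0);
    }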
2756 2701
2757 2702 /*
2758 2703 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2759 2704 * lists from the end of plist and call pageunlock interface for each segment.
2760 2705 * Drop as lock and free plist.
2761 2706 */
2762 2707 static void
2763 2708 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2764 2709 struct page **plist, enum seg_rw rw)
2765 2710 {
2766 2711 ulong_t cnt;
2767 2712 caddr_t eaddr = addr + size;
2768 2713 pgcnt_t npages = btop(size);
2769 2714 size_t ssize;
2770 2715 page_t **pl;
2771 2716
2772 2717 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2773 2718 ASSERT(seg != NULL);
2774 2719 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2775 2720 ASSERT(addr + size > seg->s_base + seg->s_size);
2776 2721 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2777 2722 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2778 2723 ASSERT(plist != NULL);
2779 2724
2780 2725 for (cnt = 0; addr < eaddr; addr += ssize) {
2781 2726 if (addr >= seg->s_base + seg->s_size) {
2782 2727 seg = AS_SEGNEXT(as, seg);
2783 2728 ASSERT(seg != NULL && addr == seg->s_base);
2784 2729 cnt++;
2785 2730 }
2786 2731 if (eaddr > seg->s_base + seg->s_size) {
2787 2732 ssize = seg->s_base + seg->s_size - addr;
2788 2733 } else {
2789 2734 ssize = eaddr - addr;
2790 2735 }
2791 2736 pl = &plist[npages + cnt];
2792 2737 ASSERT(*pl != NULL);
2793 2738 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2794 2739 L_PAGEUNLOCK, rw);
2795 2740 }
2796 2741 ASSERT(cnt > 0);
2797 2742 AS_LOCK_EXIT(as, &as->a_lock);
2798 2743
2799 2744 cnt++;
2800 2745 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2801 2746 }
2802 2747
2803 2748 /*
2804 2749 * unlock pages in a given address range
2805 2750 */
2806 2751 void
2807 2752 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2808 2753 enum seg_rw rw)
2809 2754 {
2810 2755 struct seg *seg;
2811 2756 size_t rsize;
2812 2757 caddr_t raddr;
2813 2758
2814 2759 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2815 2760 "as_pageunlock_start: addr %p size %ld", addr, size);
2816 2761
2817 2762 /*
2818 2763 	 * if the shadow list is NULL, as_pagelock fell
2819 2764 	 * back to as_fault
2820 2765 */
2821 2766 if (pp == NULL) {
2822 2767 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2823 2768 return;
2824 2769 }
2825 2770
2826 2771 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2827 2772 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2828 2773 (size_t)raddr;
2829 2774
2830 2775 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2831 2776 seg = as_segat(as, raddr);
2832 2777 ASSERT(seg != NULL);
2833 2778
2834 2779 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2835 2780 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2836 2781
2837 2782 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2838 2783 if (raddr + rsize <= seg->s_base + seg->s_size) {
2839 2784 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2840 2785 } else {
2841 2786 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2842 2787 return;
2843 2788 }
2844 2789 AS_LOCK_EXIT(as, &as->a_lock);
2845 2790 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2846 2791 }
2847 2792
2848 2793 int
2849 2794 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2850 2795 boolean_t wait)
2851 2796 {
2852 2797 struct seg *seg;
2853 2798 size_t ssize;
2854 2799 caddr_t raddr; /* rounded down addr */
2855 2800 size_t rsize; /* rounded up size */
2856 2801 int error = 0;
2857 2802 size_t pgsz = page_get_pagesize(szc);
2858 2803
2859 2804 setpgsz_top:
2860 2805 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2861 2806 return (EINVAL);
2862 2807 }
2863 2808
2864 2809 raddr = addr;
2865 2810 rsize = size;
2866 2811
2867 2812 if (raddr + rsize < raddr) /* check for wraparound */
2868 2813 return (ENOMEM);
2869 2814
2870 2815 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2871 2816 as_clearwatchprot(as, raddr, rsize);
2872 2817 seg = as_segat(as, raddr);
2873 2818 if (seg == NULL) {
2874 2819 as_setwatch(as);
2875 2820 AS_LOCK_EXIT(as, &as->a_lock);
2876 2821 return (ENOMEM);
2877 2822 }
2878 2823
2879 2824 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2880 2825 if (raddr >= seg->s_base + seg->s_size) {
2881 2826 seg = AS_SEGNEXT(as, seg);
2882 2827 if (seg == NULL || raddr != seg->s_base) {
2883 2828 error = ENOMEM;
2884 2829 break;
2885 2830 }
2886 2831 }
2887 2832 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2888 2833 ssize = seg->s_base + seg->s_size - raddr;
2889 2834 } else {
2890 2835 ssize = rsize;
2891 2836 }
2892 2837
2893 2838 retry:
2894 2839 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2895 2840
2896 2841 if (error == IE_NOMEM) {
2897 2842 error = EAGAIN;
2898 2843 break;
2899 2844 }
2900 2845
2901 2846 if (error == IE_RETRY) {
2902 2847 AS_LOCK_EXIT(as, &as->a_lock);
2903 2848 goto setpgsz_top;
2904 2849 }
2905 2850
2906 2851 if (error == ENOTSUP) {
2907 2852 error = EINVAL;
2908 2853 break;
2909 2854 }
2910 2855
2911 2856 if (wait && (error == EAGAIN)) {
2912 2857 /*
2913 2858 * Memory is currently locked. It must be unlocked
2914 2859 * before this operation can succeed through a retry.
2915 2860 * The possible reasons for locked memory and
2916 2861 * corresponding strategies for unlocking are:
2917 2862 * (1) Normal I/O
2918 2863 * wait for a signal that the I/O operation
2919 2864 * has completed and the memory is unlocked.
2920 2865 * (2) Asynchronous I/O
2921 2866 * The aio subsystem does not unlock pages when
2922 2867 * the I/O is completed. Those pages are unlocked
2923 2868 * when the application calls aiowait/aioerror.
2924 2869 * So, to prevent blocking forever, cv_broadcast()
2925 2870 * is done to wake up aio_cleanup_thread.
2926 2871 * Subsequently, segvn_reclaim will be called, and
2927 2872 * that will do AS_CLRUNMAPWAIT() and wake us up.
2928 2873 * (3) Long term page locking:
2929 2874 * This is not relevant for as_setpagesize()
2930 2875 * because we cannot change the page size for
2931 2876 * driver memory. The attempt to do so will
2932 2877 * fail with a different error than EAGAIN so
2933 2878 * there's no need to trigger as callbacks like
2934 2879 * as_unmap, as_setprot or as_free would do.
2935 2880 */
2936 2881 mutex_enter(&as->a_contents);
2937 2882 if (!AS_ISNOUNMAPWAIT(as)) {
2938 2883 if (AS_ISUNMAPWAIT(as) == 0) {
2939 2884 cv_broadcast(&as->a_cv);
2940 2885 }
2941 2886 AS_SETUNMAPWAIT(as);
2942 2887 AS_LOCK_EXIT(as, &as->a_lock);
2943 2888 while (AS_ISUNMAPWAIT(as)) {
2944 2889 cv_wait(&as->a_cv, &as->a_contents);
2945 2890 }
2946 2891 } else {
2947 2892 /*
2948 2893 * We may have raced with
2949 2894 * segvn_reclaim()/segspt_reclaim(). In this
2950 2895 * case clean nounmapwait flag and retry since
2951 2896 * softlockcnt in this segment may be already
2952 2897 * 0. We don't drop as writer lock so our
2953 2898 * number of retries without sleeping should
2954 2899 * be very small. See segvn_reclaim() for
2955 2900 * more comments.
2956 2901 */
2957 2902 AS_CLRNOUNMAPWAIT(as);
2958 2903 mutex_exit(&as->a_contents);
2959 2904 goto retry;
2960 2905 }
2961 2906 mutex_exit(&as->a_contents);
2962 2907 goto setpgsz_top;
2963 2908 } else if (error != 0) {
2964 2909 break;
2965 2910 }
2966 2911 }
2967 2912 as_setwatch(as);
2968 2913 AS_LOCK_EXIT(as, &as->a_lock);
2969 2914 return (error);
2970 2915 }
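as_setpagesize() is reached from userland through memcntl(2) with MC_HAT_ADVISE; passing a page size of 0 instead lands in as_set_default_lpsize() below. A minimal sketch requesting 2M pages, assuming that size is supported on the platform:

    #include <sys/types.h>
    #include <sys/mman.h>

    /* Advise the kernel to back [addr, addr + len) with 2M pages. */
    static int
    request_large_pages(caddr_t addr, size_t len)
    {
    	struct memcntl_mha mha;

    	mha.mha_cmd = MHA_MAPSIZE_VA;
    	mha.mha_flags = 0;
    	mha.mha_pagesize = 2 * 1024 * 1024;	/* must be a supported size */

    	return (memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0));
    }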
2971 2916
2972 2917 /*
2973 2918 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2974 2919 * in its chunk where s_szc is less than the szc we want to set.
2975 2920 */
2976 2921 static int
2977 2922 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2978 2923 int *retry)
2979 2924 {
2980 2925 struct seg *seg;
2981 2926 size_t ssize;
2982 2927 int error;
2983 2928
2984 2929 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
2985 2930
2986 2931 seg = as_segat(as, raddr);
2987 2932 if (seg == NULL) {
2988 2933 panic("as_iset3_default_lpsize: no seg");
2989 2934 }
2990 2935
2991 2936 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2992 2937 if (raddr >= seg->s_base + seg->s_size) {
2993 2938 seg = AS_SEGNEXT(as, seg);
2994 2939 if (seg == NULL || raddr != seg->s_base) {
2995 2940 panic("as_iset3_default_lpsize: as changed");
2996 2941 }
2997 2942 }
2998 2943 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2999 2944 ssize = seg->s_base + seg->s_size - raddr;
3000 2945 } else {
3001 2946 ssize = rsize;
3002 2947 }
3003 2948
3004 2949 if (szc > seg->s_szc) {
3005 2950 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3006 2951 /* Only retry on EINVAL segments that have no vnode. */
3007 2952 if (error == EINVAL) {
3008 2953 vnode_t *vp = NULL;
3009 2954 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3010 2955 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3011 2956 vp == NULL)) {
3012 2957 *retry = 1;
3013 2958 } else {
3014 2959 *retry = 0;
3015 2960 }
3016 2961 }
3017 2962 if (error) {
3018 2963 return (error);
3019 2964 }
3020 2965 }
3021 2966 }
3022 2967 return (0);
3023 2968 }
3024 2969
3025 2970 /*
3026 2971 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3027 2972 * pagesize on each segment in its range, but if any fails with EINVAL,
3028 2973 * then it reduces the pagesizes to the next size in the bitmap and
3029 2974  * retries as_iset3_default_lpsize(). The code retries smaller allowed
3030 2975  * sizes on EINVAL because (a) the anon offset may not match the bigger
3031 2976  * sizes, and (b) it's hard to get this offset (to begin with) to pass
3032 2977  * to map_pgszcvec().
3033 2978 */
3034 2979 static int
3035 2980 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3036 2981 uint_t szcvec)
3037 2982 {
3038 2983 int error;
3039 2984 int retry;
3040 2985
3041 2986 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3042 2987
3043 2988 for (;;) {
3044 2989 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3045 2990 if (error == EINVAL && retry) {
3046 2991 szcvec &= ~(1 << szc);
3047 2992 if (szcvec <= 1) {
3048 2993 return (EINVAL);
3049 2994 }
3050 2995 szc = highbit(szcvec) - 1;
3051 2996 } else {
3052 2997 return (error);
3053 2998 }
3054 2999 }
3055 3000 }
3056 3001
3057 3002 /*
3058 3003 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3059 3004 * segments have a smaller szc than we want to set. For each such area,
3060 3005  * it calls as_iset2_default_lpsize().
3061 3006 */
3062 3007 static int
3063 3008 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3064 3009 uint_t szcvec)
3065 3010 {
3066 3011 struct seg *seg;
3067 3012 size_t ssize;
3068 3013 caddr_t setaddr = raddr;
3069 3014 size_t setsize = 0;
3070 3015 int set;
3071 3016 int error;
3072 3017
3073 3018 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3074 3019
3075 3020 seg = as_segat(as, raddr);
3076 3021 if (seg == NULL) {
3077 3022 panic("as_iset1_default_lpsize: no seg");
3078 3023 }
3079 3024 if (seg->s_szc < szc) {
3080 3025 set = 1;
3081 3026 } else {
3082 3027 set = 0;
3083 3028 }
3084 3029
3085 3030 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3086 3031 if (raddr >= seg->s_base + seg->s_size) {
3087 3032 seg = AS_SEGNEXT(as, seg);
3088 3033 if (seg == NULL || raddr != seg->s_base) {
3089 3034 panic("as_iset1_default_lpsize: as changed");
3090 3035 }
3091 3036 if (seg->s_szc >= szc && set) {
3092 3037 ASSERT(setsize != 0);
3093 3038 error = as_iset2_default_lpsize(as,
3094 3039 setaddr, setsize, szc, szcvec);
3095 3040 if (error) {
3096 3041 return (error);
3097 3042 }
3098 3043 set = 0;
3099 3044 } else if (seg->s_szc < szc && !set) {
3100 3045 setaddr = raddr;
3101 3046 setsize = 0;
3102 3047 set = 1;
3103 3048 }
3104 3049 }
3105 3050 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3106 3051 ssize = seg->s_base + seg->s_size - raddr;
3107 3052 } else {
3108 3053 ssize = rsize;
3109 3054 }
3110 3055 }
3111 3056 error = 0;
3112 3057 if (set) {
3113 3058 ASSERT(setsize != 0);
3114 3059 error = as_iset2_default_lpsize(as, setaddr, setsize,
3115 3060 szc, szcvec);
3116 3061 }
3117 3062 return (error);
3118 3063 }
3119 3064
3120 3065 /*
3121 3066 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3122 3067 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3123 3068 * chunk to as_iset1_default_lpsize().
3124 3069 */
3125 3070 static int
3126 3071 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3127 3072 int type)
3128 3073 {
3129 3074 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3130 3075 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3131 3076 flags, rtype, 1);
3132 3077 uint_t szc;
3133 3078 uint_t nszc;
3134 3079 int error;
3135 3080 caddr_t a;
3136 3081 caddr_t eaddr;
3137 3082 size_t segsize;
3138 3083 size_t pgsz;
3139 3084 uint_t save_szcvec;
3140 3085
3141 3086 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3142 3087 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3143 3088 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3144 3089
3145 3090 szcvec &= ~1;
3146 3091 if (szcvec <= 1) { /* skip if base page size */
3147 3092 return (0);
3148 3093 }
3149 3094
3150 3095 /* Get the pagesize of the first larger page size. */
3151 3096 szc = lowbit(szcvec) - 1;
3152 3097 pgsz = page_get_pagesize(szc);
3153 3098 eaddr = addr + size;
3154 3099 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3155 3100 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3156 3101
3157 3102 save_szcvec = szcvec;
3158 3103 szcvec >>= (szc + 1);
3159 3104 nszc = szc;
3160 3105 while (szcvec) {
3161 3106 if ((szcvec & 0x1) == 0) {
3162 3107 nszc++;
3163 3108 szcvec >>= 1;
3164 3109 continue;
3165 3110 }
3166 3111 nszc++;
3167 3112 pgsz = page_get_pagesize(nszc);
3168 3113 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3169 3114 if (a != addr) {
3170 3115 ASSERT(szc > 0);
3171 3116 ASSERT(a < eaddr);
3172 3117 segsize = a - addr;
3173 3118 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3174 3119 save_szcvec);
3175 3120 if (error) {
3176 3121 return (error);
3177 3122 }
3178 3123 addr = a;
3179 3124 }
3180 3125 szc = nszc;
3181 3126 szcvec >>= 1;
3182 3127 }
3183 3128
3184 3129 ASSERT(addr < eaddr);
3185 3130 szcvec = save_szcvec;
3186 3131 while (szcvec) {
3187 3132 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3188 3133 ASSERT(a >= addr);
3189 3134 if (a != addr) {
3190 3135 ASSERT(szc > 0);
3191 3136 segsize = a - addr;
3192 3137 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3193 3138 save_szcvec);
3194 3139 if (error) {
3195 3140 return (error);
3196 3141 }
3197 3142 addr = a;
3198 3143 }
3199 3144 szcvec &= ~(1 << szc);
3200 3145 if (szcvec) {
3201 3146 szc = highbit(szcvec) - 1;
3202 3147 pgsz = page_get_pagesize(szc);
3203 3148 }
3204 3149 }
3205 3150 ASSERT(addr == eaddr);
3206 3151
3207 3152 return (0);
3208 3153 }
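The ascending half of the walk above peels off a leading chunk at each size until addr reaches the alignment of the next larger size in szcvec; the second loop does the mirror image from the tail. A minimal sketch of the ascending half only, with a hypothetical size table standing in for page_get_pagesize():

    #include <stdint.h>
    #include <stddef.h>

    #define	P2ROUNDUP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

    /* Hypothetical, illustrative size table (small to large). */
    static const size_t pgsz[] = { 4096, 2097152, 1073741824 };
    #define	NSZ	(sizeof (pgsz) / sizeof (pgsz[0]))

    static void
    carve_ascending(uintptr_t addr, uintptr_t eaddr)
    {
    	for (size_t i = 0; i + 1 < NSZ && addr < eaddr; i++) {
    		/* Boundary where the next larger size becomes usable. */
    		uintptr_t next = P2ROUNDUP(addr, pgsz[i + 1]);

    		if (next > eaddr)
    			next = eaddr;
    		if (next != addr) {
    			/* [addr, next) would be set to size pgsz[i]. */
    			addr = next;
    		}
    	}
    }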
3209 3154
3210 3155 /*
3211 3156 * Set the default large page size for the range. Called via memcntl with
3212 3157 * page size set to 0. as_set_default_lpsize breaks the range down into
3213 3158  * chunks with the same type/flags, ignores non-segvn segments, and passes
3214 3159 * each chunk to as_iset_default_lpsize().
3215 3160 */
3216 3161 int
3217 3162 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3218 3163 {
3219 3164 struct seg *seg;
3220 3165 caddr_t raddr;
3221 3166 size_t rsize;
3222 3167 size_t ssize;
3223 3168 int rtype, rflags;
3224 3169 int stype, sflags;
3225 3170 int error;
3226 3171 caddr_t setaddr;
3227 3172 size_t setsize;
3228 3173 int segvn;
3229 3174
3230 3175 if (size == 0)
3231 3176 return (0);
3232 3177
3233 3178 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3234 3179 again:
3235 3180 error = 0;
3236 3181
3237 3182 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3238 3183 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3239 3184 (size_t)raddr;
3240 3185
3241 3186 if (raddr + rsize < raddr) { /* check for wraparound */
3242 3187 AS_LOCK_EXIT(as, &as->a_lock);
3243 3188 return (ENOMEM);
3244 3189 }
3245 3190 as_clearwatchprot(as, raddr, rsize);
3246 3191 seg = as_segat(as, raddr);
3247 3192 if (seg == NULL) {
3248 3193 as_setwatch(as);
3249 3194 AS_LOCK_EXIT(as, &as->a_lock);
3250 3195 return (ENOMEM);
3251 3196 }
3252 3197 if (seg->s_ops == &segvn_ops) {
3253 3198 rtype = SEGOP_GETTYPE(seg, addr);
3254 3199 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3255 3200 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3256 3201 segvn = 1;
3257 3202 } else {
3258 3203 segvn = 0;
3259 3204 }
3260 3205 setaddr = raddr;
3261 3206 setsize = 0;
3262 3207
3263 3208 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3264 3209 if (raddr >= (seg->s_base + seg->s_size)) {
3265 3210 seg = AS_SEGNEXT(as, seg);
3266 3211 if (seg == NULL || raddr != seg->s_base) {
3267 3212 error = ENOMEM;
3268 3213 break;
3269 3214 }
3270 3215 if (seg->s_ops == &segvn_ops) {
3271 3216 stype = SEGOP_GETTYPE(seg, raddr);
3272 3217 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3273 3218 stype &= (MAP_SHARED | MAP_PRIVATE);
3274 3219 if (segvn && (rflags != sflags ||
3275 3220 rtype != stype)) {
3276 3221 /*
3277 3222 * The next segment is also segvn but
3278 3223 * has different flags and/or type.
3279 3224 */
3280 3225 ASSERT(setsize != 0);
3281 3226 error = as_iset_default_lpsize(as,
3282 3227 setaddr, setsize, rflags, rtype);
3283 3228 if (error) {
3284 3229 break;
3285 3230 }
3286 3231 rflags = sflags;
3287 3232 rtype = stype;
3288 3233 setaddr = raddr;
3289 3234 setsize = 0;
3290 3235 } else if (!segvn) {
3291 3236 rflags = sflags;
3292 3237 rtype = stype;
3293 3238 setaddr = raddr;
3294 3239 setsize = 0;
3295 3240 segvn = 1;
3296 3241 }
3297 3242 } else if (segvn) {
3298 3243 /* The next segment is not segvn. */
3299 3244 ASSERT(setsize != 0);
3300 3245 error = as_iset_default_lpsize(as,
3301 3246 setaddr, setsize, rflags, rtype);
3302 3247 if (error) {
3303 3248 break;
3304 3249 }
3305 3250 segvn = 0;
3306 3251 }
3307 3252 }
3308 3253 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3309 3254 ssize = seg->s_base + seg->s_size - raddr;
3310 3255 } else {
3311 3256 ssize = rsize;
3312 3257 }
3313 3258 }
3314 3259 if (error == 0 && segvn) {
3315 3260 /* The last chunk when rsize == 0. */
3316 3261 ASSERT(setsize != 0);
3317 3262 error = as_iset_default_lpsize(as, setaddr, setsize,
3318 3263 rflags, rtype);
3319 3264 }
3320 3265
3321 3266 if (error == IE_RETRY) {
3322 3267 goto again;
3323 3268 } else if (error == IE_NOMEM) {
3324 3269 error = EAGAIN;
3325 3270 } else if (error == ENOTSUP) {
3326 3271 error = EINVAL;
3327 3272 } else if (error == EAGAIN) {
3328 3273 mutex_enter(&as->a_contents);
3329 3274 if (!AS_ISNOUNMAPWAIT(as)) {
3330 3275 if (AS_ISUNMAPWAIT(as) == 0) {
3331 3276 cv_broadcast(&as->a_cv);
3332 3277 }
3333 3278 AS_SETUNMAPWAIT(as);
3334 3279 AS_LOCK_EXIT(as, &as->a_lock);
3335 3280 while (AS_ISUNMAPWAIT(as)) {
3336 3281 cv_wait(&as->a_cv, &as->a_contents);
3337 3282 }
3338 3283 mutex_exit(&as->a_contents);
3339 3284 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3340 3285 } else {
3341 3286 /*
3342 3287 * We may have raced with
3343 3288 * segvn_reclaim()/segspt_reclaim(). In this case
3344 3289 * clean nounmapwait flag and retry since softlockcnt
3345 3290 * in this segment may be already 0. We don't drop as
3346 3291 * writer lock so our number of retries without
3347 3292 * sleeping should be very small. See segvn_reclaim()
3348 3293 * for more comments.
3349 3294 */
3350 3295 AS_CLRNOUNMAPWAIT(as);
3351 3296 mutex_exit(&as->a_contents);
3352 3297 }
3353 3298 goto again;
3354 3299 }
3355 3300
3356 3301 as_setwatch(as);
3357 3302 AS_LOCK_EXIT(as, &as->a_lock);
3358 3303 return (error);
3359 3304 }
3360 3305
3361 3306 /*
3362 3307 * Setup all of the uninitialized watched pages that we can.
3363 3308 */
3364 3309 void
3365 3310 as_setwatch(struct as *as)
3366 3311 {
3367 3312 struct watched_page *pwp;
3368 3313 struct seg *seg;
3369 3314 caddr_t vaddr;
3370 3315 uint_t prot;
3371 3316 int err, retrycnt;
3372 3317
3373 3318 if (avl_numnodes(&as->a_wpage) == 0)
3374 3319 return;
3375 3320
3376 3321 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3377 3322
3378 3323 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3379 3324 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3380 3325 retrycnt = 0;
3381 3326 retry:
3382 3327 vaddr = pwp->wp_vaddr;
3383 3328 if (pwp->wp_oprot != 0 || /* already set up */
3384 3329 (seg = as_segat(as, vaddr)) == NULL ||
3385 3330 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3386 3331 continue;
3387 3332
3388 3333 pwp->wp_oprot = prot;
3389 3334 if (pwp->wp_read)
3390 3335 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3391 3336 if (pwp->wp_write)
3392 3337 prot &= ~PROT_WRITE;
3393 3338 if (pwp->wp_exec)
3394 3339 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3395 3340 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3396 3341 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3397 3342 if (err == IE_RETRY) {
3398 3343 pwp->wp_oprot = 0;
3399 3344 ASSERT(retrycnt == 0);
3400 3345 retrycnt++;
3401 3346 goto retry;
3402 3347 }
3403 3348 }
3404 3349 pwp->wp_prot = prot;
3405 3350 }
3406 3351 }
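
The mask arithmetic in as_setwatch() is the core of the watchpoint mechanism: start from the segment's real protection (stashed in wp_oprot so it can be restored later) and knock out whatever must fault. A read watch has to observe every touch, so it clears read, write, and execute; a write watch needs only write faults; an exec watch also clears all three, since an instruction fetch can be satisfied through a readable mapping. A self-contained sketch of just that computation, using plain flags rather than the kernel's watched_page fields:

    #include <sys/mman.h>       /* PROT_READ, PROT_WRITE, PROT_EXEC */

    /* Effective page protection for a watchpoint (sketch). */
    static unsigned int
    watch_prot(unsigned int oprot, int wa_read, int wa_write, int wa_exec)
    {
            unsigned int prot = oprot;

            if (wa_read)        /* must fault on every access */
                    prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
            if (wa_write)       /* write faults are enough */
                    prot &= ~PROT_WRITE;
            if (wa_exec)        /* fetches may come through a read mapping */
                    prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);

            return (prot);
    }
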
3407 3352
3408 3353 /*
3409 3354 * Clear all of the watched pages in the address space.
3410 3355 */
3411 3356 void
3412 3357 as_clearwatch(struct as *as)
3413 3358 {
3414 3359 struct watched_page *pwp;
3415 3360 struct seg *seg;
3416 3361 caddr_t vaddr;
3417 3362 uint_t prot;
3418 3363 int err, retrycnt;
3419 3364
3420 3365 if (avl_numnodes(&as->a_wpage) == 0)
3421 3366 return;
3422 3367
3423 3368 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3424 3369
3425 3370 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3426 3371 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3427 3372 retrycnt = 0;
3428 3373 retry:
3429 3374 vaddr = pwp->wp_vaddr;
3430 3375 if (pwp->wp_oprot == 0 || /* not set up */
3431 3376 (seg = as_segat(as, vaddr)) == NULL)
3432 3377 continue;
3433 3378
3434 3379 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3435 3380 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3436 3381 if (err == IE_RETRY) {
3437 3382 ASSERT(retrycnt == 0);
3438 3383 retrycnt++;
3439 3384 goto retry;
3440 3385 }
3441 3386 }
3442 3387 pwp->wp_oprot = 0;
3443 3388 pwp->wp_prot = 0;
3444 3389 }
3445 3390 }
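
as_clearwatch() is the exact inverse: reinstall the protection saved in wp_oprot, then zero both fields so the entry reads as "not set up" to a later as_setwatch(). The same bookkeeping in sketch form, with the two fields pulled out into a plain struct:

    /* Per-page watch state, reduced to the two protection fields (sketch). */
    struct wp_state {
            unsigned int wp_prot;   /* protection currently installed */
            unsigned int wp_oprot;  /* saved original; 0 means not set up */
    };

    /* Return the protection to reinstall, and forget the watch state. */
    static unsigned int
    watch_restore(struct wp_state *wp)
    {
            unsigned int prot = wp->wp_oprot;

            wp->wp_oprot = 0;
            wp->wp_prot = 0;
            return (prot);
    }
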
3446 3391
3447 3392 /*
3448 3393 * Force a new setup for all the watched pages in the range.
3449 3394 */
3450 3395 static void
3451 3396 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3452 3397 {
3453 3398 struct watched_page *pwp;
3454 3399 struct watched_page tpw;
3455 3400 caddr_t eaddr = addr + size;
3456 3401 caddr_t vaddr;
3457 3402 struct seg *seg;
3458 3403 int err, retrycnt;
3459 3404 uint_t wprot;
3460 3405 avl_index_t where;
3461 3406
3462 3407 if (avl_numnodes(&as->a_wpage) == 0)
3463 3408 return;
3464 3409
3465 3410 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3466 3411
3467 3412 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3468 3413 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3469 3414 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3470 3415
3471 3416 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3472 3417 retrycnt = 0;
3473 3418 vaddr = pwp->wp_vaddr;
3474 3419
3475 3420 wprot = prot;
3476 3421 if (pwp->wp_read)
3477 3422 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3478 3423 if (pwp->wp_write)
3479 3424 wprot &= ~PROT_WRITE;
3480 3425 if (pwp->wp_exec)
3481 3426 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3482 3427 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3483 3428 retry:
3484 3429 seg = as_segat(as, vaddr);
3485 3430 if (seg == NULL) {
3486 3431 panic("as_setwatchprot: no seg");
3487 3432 /*NOTREACHED*/
3488 3433 }
3489 3434 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3490 3435 if (err == IE_RETRY) {
3491 3436 ASSERT(retrycnt == 0);
3492 3437 retrycnt++;
3493 3438 goto retry;
3494 3439 }
3495 3440 }
3496 3441 pwp->wp_oprot = prot;
3497 3442 pwp->wp_prot = wprot;
3498 3443
3499 3444 pwp = AVL_NEXT(&as->a_wpage, pwp);
3500 3445 }
3501 3446 }
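
Both range-based variants open their walk the same way: probe the AVL tree for the page-aligned start address, fall back to the nearest node after that address when there is no exact hit, then follow AVL_NEXT until the next node leaves the range. A generic sketch of the find-or-nearest idiom against the illumos AVL interfaces (the wpage struct, comparator setup, and visit callback are illustrative, not the kernel's):

    #include <sys/avl.h>

    struct wpage {
            avl_node_t      wp_node;
            uintptr_t       wp_vaddr;
    };

    /*
     * Visit every node with addr <= wp_vaddr < eaddr. Assumes 'tree' was
     * created with avl_create() and a comparator ordering on wp_vaddr.
     */
    static void
    walk_range(avl_tree_t *tree, uintptr_t addr, uintptr_t eaddr,
        void (*visit)(struct wpage *))
    {
            struct wpage key, *wp;
            avl_index_t where;

            key.wp_vaddr = addr;
            if ((wp = avl_find(tree, &key, &where)) == NULL)
                    wp = avl_nearest(tree, where, AVL_AFTER);

            while (wp != NULL && wp->wp_vaddr < eaddr) {
                    visit(wp);
                    wp = AVL_NEXT(tree, wp);
            }
    }
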
3502 3447
3503 3448 /*
3504 3449 * Clear all of the watched pages in the range.
3505 3450 */
3506 3451 static void
3507 3452 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3508 3453 {
3509 3454 caddr_t eaddr = addr + size;
3510 3455 struct watched_page *pwp;
3511 3456 struct watched_page tpw;
3512 3457 uint_t prot;
3513 3458 struct seg *seg;
3514 3459 int err, retrycnt;
3515 3460 avl_index_t where;
3516 3461
3517 3462 if (avl_numnodes(&as->a_wpage) == 0)
3518 3463 return;
3519 3464
3520 3465 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3521 3466 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3522 3467 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3523 3468
3524 3469 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3525 3470
3526 3471 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3527 3472
3528 3473 if ((prot = pwp->wp_oprot) != 0) {
3529 3474 retrycnt = 0;
3530 3475
3531 3476 if (prot != pwp->wp_prot) {
3532 3477 retry:
3533 3478 seg = as_segat(as, pwp->wp_vaddr);
3534 3479 if (seg == NULL)
3535 3480 continue;
3536 3481 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3537 3482 PAGESIZE, prot);
3538 3483 if (err == IE_RETRY) {
3539 3484 ASSERT(retrycnt == 0);
3540 3485 retrycnt++;
3541 3486 goto retry;
3542 3487
3543 3488 }
3544 3489 }
3545 3490 pwp->wp_oprot = 0;
3546 3491 pwp->wp_prot = 0;
3547 3492 }
3548 3493
3549 3494 pwp = AVL_NEXT(&as->a_wpage, pwp);
3550 3495 }
3551 3496 }
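
One hazard in this last walk: the advance to the next node happens only at the bottom of the loop, so the continue taken when as_segat() returns NULL re-tests the condition with the same pwp and would spin forever. It is harmless only as long as every watched page in the range still has a segment backing it. A defensive rewrite of the loop, sketched against the surrounding code (it also resets the stale entry, which the original leaves untouched, and elides the IE_RETRY handling):

    /* Sketch: clearing walk that always advances, even on a skip. */
    while (pwp != NULL && pwp->wp_vaddr < eaddr) {
            struct watched_page *next = AVL_NEXT(&as->a_wpage, pwp);

            if (pwp->wp_oprot != 0) {
                    if (pwp->wp_oprot != pwp->wp_prot &&
                        (seg = as_segat(as, pwp->wp_vaddr)) != NULL)
                            (void) SEGOP_SETPROT(seg, pwp->wp_vaddr,
                                PAGESIZE, pwp->wp_oprot);
                    pwp->wp_oprot = 0;
                    pwp->wp_prot = 0;
            }
            pwp = next;     /* always advance, even when the seg is gone */
    }
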
3552 3497
3553 3498 void
3554 3499 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3555 3500 {
3556 3501 struct proc *p;
3557 3502
3558 3503 mutex_enter(&pidlock);
3559 3504 for (p = practive; p; p = p->p_next) {
3560 3505 if (p->p_as == as) {
3561 3506 mutex_enter(&p->p_lock);
3562 3507 if (p->p_as == as)
3563 3508 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3564 3509 mutex_exit(&p->p_lock);
3565 3510 }
3566 3511 }
3567 3512 mutex_exit(&pidlock);
3568 3513 }
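
as_signal_proc() checks p_as twice on purpose: the first test, under pidlock alone, is a cheap filter over the whole process list; the second, after taking p_lock, is the authoritative one, since a process may change address spaces between the two. sigaddq() is passed KM_NOSLEEP because a sleeping allocation while holding p_lock would be unsafe. The same double-checked shape in a self-contained sketch, with pthread mutexes standing in for pidlock and p_lock:

    #include <pthread.h>
    #include <stddef.h>

    struct proc_s {
            pthread_mutex_t p_lock;
            void            *p_as;
            struct proc_s   *p_next;
    };

    static pthread_mutex_t pidlock = PTHREAD_MUTEX_INITIALIZER;
    static struct proc_s *practive;         /* head of the active list */

    /* Apply cb() to every process whose address space is 'as'. */
    static void
    for_each_proc_with_as(void *as, void (*cb)(struct proc_s *))
    {
            struct proc_s *p;

            pthread_mutex_lock(&pidlock);
            for (p = practive; p != NULL; p = p->p_next) {
                    if (p->p_as != as)              /* cheap filter */
                            continue;
                    pthread_mutex_lock(&p->p_lock);
                    if (p->p_as == as)              /* authoritative re-check */
                            cb(p);
                    pthread_mutex_unlock(&p->p_lock);
            }
            pthread_mutex_unlock(&pidlock);
    }
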
3569 3514
3570 3515 /*
3571 3516  * Return the memory object ID for the segment containing addr.
3572 3517 */
3573 3518 int
3574 3519 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3575 3520 {
3576 3521 struct seg *seg;
3577 3522 int sts;
3578 3523
3579 3524 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3580 3525 seg = as_segat(as, addr);
3581 3526 if (seg == NULL) {
3582 3527 AS_LOCK_EXIT(as, &as->a_lock);
3583 3528 return (EFAULT);
3584 3529 }
3585 3530 /*
3586 3531 * catch old drivers which may not support getmemid
3587 3532 */
3588 3533 if (seg->s_ops->getmemid == NULL) {
3589 3534 AS_LOCK_EXIT(as, &as->a_lock);
3590 3535 return (ENODEV);
3591 3536 }
3592 3537
3593 3538 sts = SEGOP_GETMEMID(seg, addr, memidp);
3594 3539
3595 3540 AS_LOCK_EXIT(as, &as->a_lock);
3596 3541 return (sts);
3597 3542 }
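
The NULL test on the getmemid slot is the usual way to tolerate segment drivers built before an entry point existed: probe the function pointer in the ops vector and return ENODEV instead of jumping through NULL. The same guard for a hypothetical ops table (my_ops and getid are illustrative names, not a real driver interface):

    #include <errno.h>
    #include <stddef.h>

    struct my_ops {
            int     (*getid)(void *obj, unsigned long long *idp);
    };

    /* Dispatch through an ops vector, tolerating drivers without getid. */
    static int
    obj_getid(const struct my_ops *ops, void *obj, unsigned long long *idp)
    {
            if (ops->getid == NULL)
                    return (ENODEV);        /* old driver: entry point absent */
            return (ops->getid(obj, idp));
    }
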
(1472 lines elided)