patch remove-as_swapout
--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29 /*
30 30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 31 * The Regents of the University of California
32 32 * All Rights Reserved
33 33 *
34 34 * University Acknowledgment- Portions of this document are derived from
35 35 * software developed by the University of California, Berkeley, and its
36 36 * contributors.
37 37 */
38 38
39 39 /*
40 40 * VM - address spaces.
41 41 */
42 42
43 43 #include <sys/types.h>
44 44 #include <sys/t_lock.h>
45 45 #include <sys/param.h>
46 46 #include <sys/errno.h>
47 47 #include <sys/systm.h>
48 48 #include <sys/mman.h>
49 49 #include <sys/sysmacros.h>
50 50 #include <sys/cpuvar.h>
51 51 #include <sys/sysinfo.h>
52 52 #include <sys/kmem.h>
53 53 #include <sys/vnode.h>
54 54 #include <sys/vmsystm.h>
55 55 #include <sys/cmn_err.h>
56 56 #include <sys/debug.h>
57 57 #include <sys/tnf_probe.h>
58 58 #include <sys/vtrace.h>
59 59
60 60 #include <vm/hat.h>
61 61 #include <vm/xhat.h>
62 62 #include <vm/as.h>
63 63 #include <vm/seg.h>
64 64 #include <vm/seg_vn.h>
65 65 #include <vm/seg_dev.h>
66 66 #include <vm/seg_kmem.h>
67 67 #include <vm/seg_map.h>
68 68 #include <vm/seg_spt.h>
69 69 #include <vm/page.h>
70 70
71 71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
72 72
73 73 static struct kmem_cache *as_cache;
74 74
75 75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
76 76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
77 77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
78 78
79 79
80 80 /*
81 81 * Verifying the segment lists is very time-consuming; it may not be
82 82 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
83 83 */
84 84 #ifdef DEBUG
85 85 #define VERIFY_SEGLIST
86 86 int do_as_verify = 0;
87 87 #endif
88 88
89 89 /*
90 90 * Allocate a new callback data structure entry and fill in the events of
91 91 * interest, the address range of interest, and the callback argument.
92 92 * Link the entry on the as->a_callbacks list. A callback entry for the
93 93 * entire address space may be specified with vaddr = 0 and size = -1.
94 94 *
95 95 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
96 96 * the specified as, the caller must guarantee persistence of the specified as
97 97 * for the duration of this function (e.g., pages being locked within the as
98 98 * will guarantee persistence).
99 99 */
100 100 int
101 101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
102 102 caddr_t vaddr, size_t size, int sleepflag)
103 103 {
104 104 struct as_callback *current_head, *cb;
105 105 caddr_t saddr;
106 106 size_t rsize;
107 107
108 108 /* callback function and an event are mandatory */
109 109 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
110 110 return (EINVAL);
111 111
112 112 /* Adding a callback after as_free has been called is not allowed */
113 113 if (as == &kas)
114 114 return (ENOMEM);
115 115
116 116 /*
117 117 * vaddr = 0 and size = -1 are used to indicate that the callback range
118 118 * is the entire address space so no rounding is done in that case.
119 119 */
120 120 if (size != -1) {
121 121 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
122 122 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
123 123 (size_t)saddr;
124 124 /* check for wraparound */
125 125 if (saddr + rsize < saddr)
126 126 return (ENOMEM);
127 127 } else {
128 128 if (vaddr != 0)
129 129 return (EINVAL);
130 130 saddr = vaddr;
131 131 rsize = size;
132 132 }
133 133
134 134 /* Allocate and initialize a callback entry */
135 135 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
136 136 if (cb == NULL)
137 137 return (EAGAIN);
138 138
139 139 cb->ascb_func = cb_func;
140 140 cb->ascb_arg = arg;
141 141 cb->ascb_events = events;
142 142 cb->ascb_saddr = saddr;
143 143 cb->ascb_len = rsize;
144 144
145 145 /* Add the entry to the list */
146 146 mutex_enter(&as->a_contents);
147 147 current_head = as->a_callbacks;
148 148 as->a_callbacks = cb;
149 149 cb->ascb_next = current_head;
150 150
151 151 /*
152 152 * The call to this function may lose in a race with
153 153 * a pertinent event - e.g., a thread does long-term memory locking
154 154 * but before the callback is added another thread executes as_unmap.
155 155 * A broadcast here resolves that.
156 156 */
157 157 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
158 158 AS_CLRUNMAPWAIT(as);
159 159 cv_broadcast(&as->a_cv);
160 160 }
161 161
162 162 mutex_exit(&as->a_contents);
163 163 return (0);
164 164 }
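
For context, a minimal caller-side sketch of the callback API above; mydrv_unlock_cb and mydrv_watch_range are hypothetical names, and a real driver would also handle the AS_CALLBACK_DELETE_DEFERRED return from as_delete_callback():

	static void
	mydrv_unlock_cb(struct as *as, void *arg, uint_t events)
	{
		/* release the driver's long-term locked pages for this range */
	}

	static int
	mydrv_watch_range(struct as *as, caddr_t addr, size_t len, void *arg)
	{
		/* have mydrv_unlock_cb run if the range is unmapped or reprotected */
		return (as_add_callback(as, mydrv_unlock_cb, arg,
		    AS_UNMAP_EVENT | AS_SETPROT_EVENT, addr, len, KM_SLEEP));
	}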
165 165
166 166 /*
167 167 * Search the callback list for an entry which pertains to arg.
168 168 *
169 169 * This is called from within the client upon completion of the callback.
170 170 * RETURN VALUES:
171 171 * AS_CALLBACK_DELETED (callback entry found and deleted)
172 172 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
173 173 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
174 174 * entry will be made in as_do_callbacks)
175 175 *
176 176 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
177 177 * set, it indicates that as_do_callbacks is processing this entry. The
178 178 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
179 179 * to unblock as_do_callbacks, in case it is blocked.
180 180 *
181 181 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
182 182 * the specified as, the caller must guarantee persistence of the specified as
183 183 * for the duration of this function (e.g., pages being locked within the as
184 184 * will guarantee persistence).
185 185 */
186 186 uint_t
187 187 as_delete_callback(struct as *as, void *arg)
188 188 {
189 189 struct as_callback **prevcb = &as->a_callbacks;
190 190 struct as_callback *cb;
191 191 uint_t rc = AS_CALLBACK_NOTFOUND;
192 192
193 193 mutex_enter(&as->a_contents);
194 194 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
195 195 if (cb->ascb_arg != arg)
196 196 continue;
197 197
198 198 /*
199 199 * If the events indicate AS_CALLBACK_CALLED, just clear
200 200 * AS_ALL_EVENT in the events field and wake up the thread
201 201 * that may be waiting in as_do_callbacks. as_do_callbacks
202 202 * will take care of removing this entry from the list. In
203 203 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
204 204 * (AS_CALLBACK_CALLED not set), just remove it from the
205 205 * list, return the memory and return AS_CALLBACK_DELETED.
206 206 */
207 207 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
208 208 /* leave AS_CALLBACK_CALLED */
209 209 cb->ascb_events &= ~AS_ALL_EVENT;
210 210 rc = AS_CALLBACK_DELETE_DEFERRED;
211 211 cv_broadcast(&as->a_cv);
212 212 } else {
213 213 *prevcb = cb->ascb_next;
214 214 kmem_free(cb, sizeof (struct as_callback));
215 215 rc = AS_CALLBACK_DELETED;
216 216 }
217 217 break;
218 218 }
219 219 mutex_exit(&as->a_contents);
220 220 return (rc);
221 221 }
222 222
223 223 /*
224 224 * Searches the as callback list for a matching entry.
225 225 * Returns a pointer to the first matching callback, or NULL if
226 226 * nothing is found.
227 227 * This function never sleeps, so it is ok to call it with more
228 228 * locks held than the (required) a_contents mutex.
229 229 *
230 230 * See also comment on as_do_callbacks below.
231 231 */
232 232 static struct as_callback *
233 233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
234 234 size_t event_len)
235 235 {
236 236 struct as_callback *cb;
237 237
238 238 ASSERT(MUTEX_HELD(&as->a_contents));
239 239 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
240 240 /*
241 241 * If the callback has not already been called, then
242 242 * check if events or address range pertains. An event_len
243 243 * of zero means do an unconditional callback.
244 244 */
245 245 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
246 246 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
247 247 (event_addr + event_len < cb->ascb_saddr) ||
248 248 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
249 249 continue;
250 250 }
251 251 break;
252 252 }
253 253 return (cb);
254 254 }
255 255
256 256 /*
257 257 * Executes a given callback and removes it from the callback list for
258 258 * this address space.
259 259 * This function may sleep so the caller must drop all locks except
260 260 * a_contents before calling this func.
261 261 *
262 262 * See also comments on as_do_callbacks below.
263 263 */
264 264 static void
265 265 as_execute_callback(struct as *as, struct as_callback *cb,
266 266 uint_t events)
267 267 {
268 268 struct as_callback **prevcb;
269 269 void *cb_arg;
270 270
271 271 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
272 272 cb->ascb_events |= AS_CALLBACK_CALLED;
273 273 mutex_exit(&as->a_contents);
274 274 (*cb->ascb_func)(as, cb->ascb_arg, events);
275 275 mutex_enter(&as->a_contents);
276 276 /*
277 277 * the callback function is required to delete the callback
278 278 * when the callback function determines it is OK for
279 279 * this thread to continue. as_delete_callback will clear
280 280 * the AS_ALL_EVENT in the events field when it is deleted.
281 281 * If the callback function called as_delete_callback,
282 282 * events will already be cleared and there will be no blocking.
283 283 */
284 284 while ((cb->ascb_events & events) != 0) {
285 285 cv_wait(&as->a_cv, &as->a_contents);
286 286 }
287 287 /*
288 288 * This entry needs to be taken off the list. Normally, the
289 289 * callback func itself does that, but unfortunately the list
290 290 * may have changed while the callback was running because the
291 291 * a_contents mutex was dropped and someone other than the
292 292 * callback func itself could have called as_delete_callback,
293 293 * so we have to search to find this entry again. The entry
294 294 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
295 295 */
296 296 cb_arg = cb->ascb_arg;
297 297 prevcb = &as->a_callbacks;
298 298 for (cb = as->a_callbacks; cb != NULL;
299 299 prevcb = &cb->ascb_next, cb = *prevcb) {
300 300 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
301 301 (cb_arg != cb->ascb_arg)) {
302 302 continue;
303 303 }
304 304 *prevcb = cb->ascb_next;
305 305 kmem_free(cb, sizeof (struct as_callback));
306 306 break;
307 307 }
308 308 }
309 309
310 310 /*
311 311 * Check the callback list for a matching event and intersection of
312 312 * address range. If there is a match invoke the callback. Skip an entry if:
313 313 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
314 314 * - no event of interest
315 315 * - no address range of interest
316 316 *
317 317 * An event_len of zero indicates a request for an unconditional callback
318 318 * (regardless of event); only the AS_CALLBACK_CALLED flag is checked. The
319 319 * a_contents lock must be dropped before a callback, so only one callback
320 320 * can be done before returning. Return -1 (true) if a callback was
321 321 * executed and removed from the list, else return 0 (false).
322 322 *
323 323 * The logically separate parts, i.e. finding a matching callback and
324 324 * executing a given callback have been separated into two functions
325 325 * so that they can be called with different sets of locks held beyond
326 326 * the always-required a_contents. as_find_callback does not sleep so
327 327 * it is ok to call it if more locks than a_contents (i.e. the a_lock
328 328 * rwlock) are held. as_execute_callback on the other hand may sleep
329 329 * so all locks beyond a_contents must be dropped by the caller if one
330 330 * does not want to end up comatose.
331 331 */
332 332 static int
333 333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
334 334 size_t event_len)
335 335 {
336 336 struct as_callback *cb;
337 337
338 338 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
339 339 as_execute_callback(as, cb, events);
340 340 return (-1);
341 341 }
342 342 return (0);
343 343 }
344 344
345 345 /*
346 346 * Search for the segment containing addr. If a segment containing addr
347 347 * exists, that segment is returned. If no such segment exists, and
348 348 * the list spans addresses greater than addr, then the first segment
349 349 * whose base is greater than addr is returned; otherwise, NULL is
350 350 * returned unless tail is true, in which case the last element of the
351 351 * list is returned.
352 352 *
353 353 * a_seglast is used to cache the last found segment for repeated
354 354 * searches to the same addr (which happens frequently).
355 355 */
356 356 struct seg *
357 357 as_findseg(struct as *as, caddr_t addr, int tail)
358 358 {
359 359 struct seg *seg = as->a_seglast;
360 360 avl_index_t where;
361 361
362 362 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
363 363
364 364 if (seg != NULL &&
365 365 seg->s_base <= addr &&
366 366 addr < seg->s_base + seg->s_size)
367 367 return (seg);
368 368
369 369 seg = avl_find(&as->a_segtree, &addr, &where);
370 370 if (seg != NULL)
371 371 return (as->a_seglast = seg);
372 372
373 373 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
374 374 if (seg == NULL && tail)
375 375 seg = avl_last(&as->a_segtree);
376 376 return (as->a_seglast = seg);
377 377 }
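
A sketch of how a caller might use as_findseg() under the required lock; find_first_seg_at is a hypothetical wrapper using the AS_LOCK_* macros seen throughout this file:

	static struct seg *
	find_first_seg_at(struct as *as, caddr_t addr)
	{
		struct seg *seg;

		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		seg = as_findseg(as, addr, 0);	/* containing or next-higher seg */
		AS_LOCK_EXIT(as, &as->a_lock);
		return (seg);	/* only meaningful while the as cannot change */
	}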
378 378
379 379 #ifdef VERIFY_SEGLIST
380 380 /*
381 381 * verify that the linked list is coherent
382 382 */
383 383 static void
384 384 as_verify(struct as *as)
385 385 {
386 386 struct seg *seg, *seglast, *p, *n;
387 387 uint_t nsegs = 0;
388 388
389 389 if (do_as_verify == 0)
390 390 return;
391 391
392 392 seglast = as->a_seglast;
393 393
394 394 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
395 395 ASSERT(seg->s_as == as);
396 396 p = AS_SEGPREV(as, seg);
397 397 n = AS_SEGNEXT(as, seg);
398 398 ASSERT(p == NULL || p->s_as == as);
399 399 ASSERT(p == NULL || p->s_base < seg->s_base);
400 400 ASSERT(n == NULL || n->s_base > seg->s_base);
401 401 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
402 402 if (seg == seglast)
403 403 seglast = NULL;
404 404 nsegs++;
405 405 }
406 406 ASSERT(seglast == NULL);
407 407 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
408 408 }
409 409 #endif /* VERIFY_SEGLIST */
410 410
411 411 /*
412 412 * Add a new segment to the address space. The avl_find()
413 413 * may be expensive, so we attempt to use the last segment accessed
414 414 * in as_gap() as an insertion point.
415 415 */
416 416 int
417 417 as_addseg(struct as *as, struct seg *newseg)
418 418 {
419 419 struct seg *seg;
420 420 caddr_t addr;
421 421 caddr_t eaddr;
422 422 avl_index_t where;
423 423
424 424 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
425 425
426 426 as->a_updatedir = 1; /* inform /proc */
427 427 gethrestime(&as->a_updatetime);
428 428
429 429 if (as->a_lastgaphl != NULL) {
430 430 struct seg *hseg = NULL;
431 431 struct seg *lseg = NULL;
432 432
433 433 if (as->a_lastgaphl->s_base > newseg->s_base) {
434 434 hseg = as->a_lastgaphl;
435 435 lseg = AVL_PREV(&as->a_segtree, hseg);
436 436 } else {
437 437 lseg = as->a_lastgaphl;
438 438 hseg = AVL_NEXT(&as->a_segtree, lseg);
439 439 }
440 440
441 441 if (hseg && lseg && lseg->s_base < newseg->s_base &&
442 442 hseg->s_base > newseg->s_base) {
443 443 avl_insert_here(&as->a_segtree, newseg, lseg,
444 444 AVL_AFTER);
445 445 as->a_lastgaphl = NULL;
446 446 as->a_seglast = newseg;
447 447 return (0);
448 448 }
449 449 as->a_lastgaphl = NULL;
450 450 }
451 451
452 452 addr = newseg->s_base;
453 453 eaddr = addr + newseg->s_size;
454 454 again:
455 455
456 456 seg = avl_find(&as->a_segtree, &addr, &where);
457 457
458 458 if (seg == NULL)
459 459 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
460 460
461 461 if (seg == NULL)
462 462 seg = avl_last(&as->a_segtree);
463 463
464 464 if (seg != NULL) {
465 465 caddr_t base = seg->s_base;
466 466
467 467 /*
468 468 * If top of seg is below the requested address, then
469 469 * the insertion point is at the end of the linked list,
470 470 * and seg points to the tail of the list. Otherwise,
471 471 * the insertion point is immediately before seg.
472 472 */
473 473 if (base + seg->s_size > addr) {
474 474 if (addr >= base || eaddr > base) {
475 475 #ifdef __sparc
476 476 extern struct seg_ops segnf_ops;
477 477
478 478 /*
479 479 * no-fault segs must disappear if overlaid.
480 480 * XXX need new segment type so
481 481 * we don't have to check s_ops
482 482 */
483 483 if (seg->s_ops == &segnf_ops) {
484 484 seg_unmap(seg);
485 485 goto again;
486 486 }
487 487 #endif
488 488 return (-1); /* overlapping segment */
489 489 }
490 490 }
491 491 }
492 492 as->a_seglast = newseg;
493 493 avl_insert(&as->a_segtree, newseg, where);
494 494
495 495 #ifdef VERIFY_SEGLIST
496 496 as_verify(as);
497 497 #endif
498 498 return (0);
499 499 }
500 500
501 501 struct seg *
502 502 as_removeseg(struct as *as, struct seg *seg)
503 503 {
504 504 avl_tree_t *t;
505 505
506 506 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
507 507
508 508 as->a_updatedir = 1; /* inform /proc */
509 509 gethrestime(&as->a_updatetime);
510 510
511 511 if (seg == NULL)
512 512 return (NULL);
513 513
514 514 t = &as->a_segtree;
515 515 if (as->a_seglast == seg)
516 516 as->a_seglast = NULL;
517 517 as->a_lastgaphl = NULL;
518 518
519 519 /*
520 520 * if this segment is at an address higher than
521 521 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
522 522 */
523 523 if (as->a_lastgap &&
524 524 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
525 525 as->a_lastgap = AVL_NEXT(t, seg);
526 526
527 527 /*
528 528 * remove the segment from the seg tree
529 529 */
530 530 avl_remove(t, seg);
531 531
532 532 #ifdef VERIFY_SEGLIST
533 533 as_verify(as);
534 534 #endif
535 535 return (seg);
536 536 }
537 537
538 538 /*
539 539 * Find a segment containing addr.
540 540 */
541 541 struct seg *
542 542 as_segat(struct as *as, caddr_t addr)
543 543 {
544 544 struct seg *seg = as->a_seglast;
545 545
546 546 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
547 547
548 548 if (seg != NULL && seg->s_base <= addr &&
549 549 addr < seg->s_base + seg->s_size)
550 550 return (seg);
551 551
552 552 seg = avl_find(&as->a_segtree, &addr, NULL);
553 553 return (seg);
554 554 }
555 555
556 556 /*
557 557 * Serialize all searches for holes in an address space to
558 558 * prevent two or more threads from allocating the same virtual
559 559 * address range. The address space must not be "read/write"
560 560 * locked by the caller since we may block.
561 561 */
562 562 void
563 563 as_rangelock(struct as *as)
564 564 {
565 565 mutex_enter(&as->a_contents);
566 566 while (AS_ISCLAIMGAP(as))
567 567 cv_wait(&as->a_cv, &as->a_contents);
568 568 AS_SETCLAIMGAP(as);
569 569 mutex_exit(&as->a_contents);
570 570 }
571 571
572 572 /*
573 573 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
574 574 */
575 575 void
576 576 as_rangeunlock(struct as *as)
577 577 {
578 578 mutex_enter(&as->a_contents);
579 579 AS_CLRCLAIMGAP(as);
580 580 cv_signal(&as->a_cv);
581 581 mutex_exit(&as->a_contents);
582 582 }
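
The usual consumer of this pair is a mapping path that picks an address and then maps it. A sketch of that pattern, assuming the usual map_addr()/segvn_create() helpers and a caller-prepared struct segvn_crargs; mydrv_choose_and_map is a hypothetical name:

	static int
	mydrv_choose_and_map(struct as *as, caddr_t *addrp, size_t len,
	    offset_t off, uint_t flags, struct segvn_crargs *vn_a)
	{
		int error;

		as_rangelock(as);		/* serialize hole searches */
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
		error = as_map(as, *addrp, len, segvn_create, vn_a);
		as_rangeunlock(as);
		return (error);
	}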
583 583
584 584 /*
585 585 * compare segments (or just an address) by segment address range
586 586 */
587 587 static int
588 588 as_segcompar(const void *x, const void *y)
589 589 {
590 590 struct seg *a = (struct seg *)x;
591 591 struct seg *b = (struct seg *)y;
592 592
593 593 if (a->s_base < b->s_base)
594 594 return (-1);
595 595 if (a->s_base >= b->s_base + b->s_size)
596 596 return (1);
597 597 return (0);
598 598 }
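
One subtlety worth noting: lookups such as as_segat() pass a bare caddr_t * as the search key. That is safe only because s_base is the first member of struct seg, so the comparator reads the key's address through a->s_base and never touches a->s_size on the key side. A sketch of the same trick (segtree_lookup_sketch is a hypothetical name):

	static struct seg *
	segtree_lookup_sketch(struct as *as, caddr_t addr)
	{
		/*
		 * &addr aliases a struct seg whose s_base is the first
		 * member; avl_find() passes the key as the first
		 * comparator argument.
		 */
		return (avl_find(&as->a_segtree, &addr, NULL));
	}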
599 599
600 600
601 601 void
602 602 as_avlinit(struct as *as)
603 603 {
604 604 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
605 605 offsetof(struct seg, s_tree));
606 606 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
607 607 offsetof(struct watched_page, wp_link));
608 608 }
609 609
610 610 /*ARGSUSED*/
611 611 static int
612 612 as_constructor(void *buf, void *cdrarg, int kmflags)
613 613 {
614 614 struct as *as = buf;
615 615
616 616 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
617 617 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
618 618 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
619 619 as_avlinit(as);
620 620 return (0);
621 621 }
622 622
623 623 /*ARGSUSED1*/
624 624 static void
625 625 as_destructor(void *buf, void *cdrarg)
626 626 {
627 627 struct as *as = buf;
628 628
629 629 avl_destroy(&as->a_segtree);
630 630 mutex_destroy(&as->a_contents);
631 631 cv_destroy(&as->a_cv);
632 632 rw_destroy(&as->a_lock);
633 633 }
634 634
635 635 void
636 636 as_init(void)
637 637 {
638 638 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
639 639 as_constructor, as_destructor, NULL, NULL, NULL, 0);
640 640 }
641 641
642 642 /*
643 643 * Allocate and initialize an address space data structure.
644 644 * We call hat_alloc to allow any machine dependent
645 645 * information in the hat structure to be initialized.
646 646 */
647 647 struct as *
648 648 as_alloc(void)
649 649 {
650 650 struct as *as;
651 651
652 652 as = kmem_cache_alloc(as_cache, KM_SLEEP);
653 653
654 654 as->a_flags = 0;
655 655 as->a_vbits = 0;
656 656 as->a_hrm = NULL;
657 657 as->a_seglast = NULL;
658 658 as->a_size = 0;
659 659 as->a_resvsize = 0;
660 660 as->a_updatedir = 0;
661 661 gethrestime(&as->a_updatetime);
662 662 as->a_objectdir = NULL;
663 663 as->a_sizedir = 0;
664 664 as->a_userlimit = (caddr_t)USERLIMIT;
665 665 as->a_lastgap = NULL;
666 666 as->a_lastgaphl = NULL;
667 667 as->a_callbacks = NULL;
668 668
669 669 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
670 670 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
671 671 AS_LOCK_EXIT(as, &as->a_lock);
672 672
673 673 as->a_xhat = NULL;
674 674
675 675 return (as);
676 676 }
677 677
678 678 /*
679 679 * Free an address space data structure.
680 680 * Need to free the hat first and then
681 681 * all the segments on this as and finally
682 682 * the space for the as struct itself.
683 683 */
684 684 void
685 685 as_free(struct as *as)
686 686 {
687 687 struct hat *hat = as->a_hat;
688 688 struct seg *seg, *next;
689 689 int called = 0;
690 690
691 691 top:
692 692 /*
693 693 * Invoke ALL callbacks. as_do_callbacks will do one callback
694 694 * per call, and not return (-1) until the callback has completed.
695 695 * When as_do_callbacks returns zero, all callbacks have completed.
696 696 */
697 697 mutex_enter(&as->a_contents);
698 698 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
699 699 ;
700 700
701 701 /* This will prevent new XHATs from attaching to as */
702 702 if (!called)
703 703 AS_SETBUSY(as);
704 704 mutex_exit(&as->a_contents);
705 705 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
706 706
707 707 if (!called) {
708 708 called = 1;
709 709 hat_free_start(hat);
710 710 if (as->a_xhat != NULL)
711 711 xhat_free_start_all(as);
712 712 }
713 713 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
714 714 int err;
715 715
716 716 next = AS_SEGNEXT(as, seg);
717 717 retry:
718 718 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
719 719 if (err == EAGAIN) {
720 720 mutex_enter(&as->a_contents);
721 721 if (as->a_callbacks) {
722 722 AS_LOCK_EXIT(as, &as->a_lock);
723 723 } else if (!AS_ISNOUNMAPWAIT(as)) {
724 724 /*
725 725 * Memory is currently locked. Wait for a
726 726 * cv_signal that it has been unlocked, then
727 727 * try the operation again.
728 728 */
729 729 if (AS_ISUNMAPWAIT(as) == 0)
730 730 cv_broadcast(&as->a_cv);
731 731 AS_SETUNMAPWAIT(as);
732 732 AS_LOCK_EXIT(as, &as->a_lock);
733 733 while (AS_ISUNMAPWAIT(as))
734 734 cv_wait(&as->a_cv, &as->a_contents);
735 735 } else {
736 736 /*
737 737 * We may have raced with
738 738 * segvn_reclaim()/segspt_reclaim(). In this
739 739 * case clean nounmapwait flag and retry since
740 740 * softlockcnt in this segment may be already
741 741 * 0. We don't drop as writer lock so our
742 742 * number of retries without sleeping should
743 743 * be very small. See segvn_reclaim() for
744 744 * more comments.
745 745 */
746 746 AS_CLRNOUNMAPWAIT(as);
747 747 mutex_exit(&as->a_contents);
748 748 goto retry;
749 749 }
750 750 mutex_exit(&as->a_contents);
751 751 goto top;
752 752 } else {
753 753 /*
754 754 * We do not expect any other error return at this
755 755 * time. This is similar to an ASSERT in seg_unmap()
756 756 */
757 757 ASSERT(err == 0);
758 758 }
759 759 }
760 760 hat_free_end(hat);
761 761 if (as->a_xhat != NULL)
762 762 xhat_free_end_all(as);
763 763 AS_LOCK_EXIT(as, &as->a_lock);
764 764
765 765 /* /proc stuff */
766 766 ASSERT(avl_numnodes(&as->a_wpage) == 0);
767 767 if (as->a_objectdir) {
768 768 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
769 769 as->a_objectdir = NULL;
770 770 as->a_sizedir = 0;
771 771 }
772 772
773 773 /*
774 774 * Free the struct as back to kmem. Assert it has no segments.
775 775 */
776 776 ASSERT(avl_numnodes(&as->a_segtree) == 0);
777 777 kmem_cache_free(as_cache, as);
778 778 }
779 779
780 780 int
781 781 as_dup(struct as *as, struct proc *forkedproc)
782 782 {
783 783 struct as *newas;
784 784 struct seg *seg, *newseg;
785 785 size_t purgesize = 0;
786 786 int error;
787 787
788 788 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
789 789 as_clearwatch(as);
790 790 newas = as_alloc();
791 791 newas->a_userlimit = as->a_userlimit;
792 792 newas->a_proc = forkedproc;
793 793
794 794 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
795 795
796 796 /* This will prevent new XHATs from attaching */
797 797 mutex_enter(&as->a_contents);
798 798 AS_SETBUSY(as);
799 799 mutex_exit(&as->a_contents);
800 800 mutex_enter(&newas->a_contents);
801 801 AS_SETBUSY(newas);
802 802 mutex_exit(&newas->a_contents);
803 803
804 804 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
805 805
806 806 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
807 807
808 808 if (seg->s_flags & S_PURGE) {
809 809 purgesize += seg->s_size;
810 810 continue;
811 811 }
812 812
813 813 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
814 814 if (newseg == NULL) {
815 815 AS_LOCK_EXIT(newas, &newas->a_lock);
816 816 as_setwatch(as);
817 817 mutex_enter(&as->a_contents);
818 818 AS_CLRBUSY(as);
819 819 mutex_exit(&as->a_contents);
820 820 AS_LOCK_EXIT(as, &as->a_lock);
821 821 as_free(newas);
822 822 return (-1);
823 823 }
824 824 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
825 825 /*
826 826 * We call seg_free() on the new seg
827 827 * because the segment is not set up
828 828 * completely; i.e. it has no ops.
829 829 */
830 830 as_setwatch(as);
831 831 mutex_enter(&as->a_contents);
832 832 AS_CLRBUSY(as);
833 833 mutex_exit(&as->a_contents);
834 834 AS_LOCK_EXIT(as, &as->a_lock);
835 835 seg_free(newseg);
836 836 AS_LOCK_EXIT(newas, &newas->a_lock);
837 837 as_free(newas);
838 838 return (error);
839 839 }
840 840 newas->a_size += seg->s_size;
841 841 }
842 842 newas->a_resvsize = as->a_resvsize - purgesize;
843 843
844 844 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
845 845 if (as->a_xhat != NULL)
846 846 error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
847 847
848 848 mutex_enter(&newas->a_contents);
849 849 AS_CLRBUSY(newas);
850 850 mutex_exit(&newas->a_contents);
851 851 AS_LOCK_EXIT(newas, &newas->a_lock);
852 852
853 853 as_setwatch(as);
854 854 mutex_enter(&as->a_contents);
855 855 AS_CLRBUSY(as);
856 856 mutex_exit(&as->a_contents);
857 857 AS_LOCK_EXIT(as, &as->a_lock);
858 858 if (error != 0) {
859 859 as_free(newas);
860 860 return (error);
861 861 }
862 862 forkedproc->p_as = newas;
863 863 return (0);
864 864 }
865 865
866 866 /*
867 867 * Handle a ``fault'' at addr for size bytes.
868 868 */
869 869 faultcode_t
870 870 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
871 871 enum fault_type type, enum seg_rw rw)
872 872 {
873 873 struct seg *seg;
874 874 caddr_t raddr; /* rounded down addr */
875 875 size_t rsize; /* rounded up size */
876 876 size_t ssize;
877 877 faultcode_t res = 0;
878 878 caddr_t addrsav;
879 879 struct seg *segsav;
880 880 int as_lock_held;
881 881 klwp_t *lwp = ttolwp(curthread);
882 882 int is_xhat = 0;
883 883 int holding_wpage = 0;
884 884 extern struct seg_ops segdev_ops;
885 885
886 886
887 887
888 888 if (as->a_hat != hat) {
889 889 /* This must be an XHAT then */
890 890 is_xhat = 1;
891 891
892 892 if ((type != F_INVAL) || (as == &kas))
893 893 return (FC_NOSUPPORT);
894 894 }
895 895
896 896 retry:
897 897 if (!is_xhat) {
898 898 /*
899 899 * Indicate that the lwp is not to be stopped while waiting
900 900 * for a pagefault. This is to avoid deadlock while debugging
901 901 * a process via /proc over NFS (in particular).
902 902 */
903 903 if (lwp != NULL)
904 904 lwp->lwp_nostop++;
905 905
906 906 /*
907 907 * same length must be used when we softlock and softunlock.
908 908 * We don't support softunlocking lengths less than
909 909 * the original length when there is largepage support.
910 910 * See seg_dev.c for more comments.
911 911 */
912 912 switch (type) {
913 913
914 914 case F_SOFTLOCK:
915 915 CPU_STATS_ADD_K(vm, softlock, 1);
916 916 break;
917 917
918 918 case F_SOFTUNLOCK:
919 919 break;
920 920
921 921 case F_PROT:
922 922 CPU_STATS_ADD_K(vm, prot_fault, 1);
923 923 break;
924 924
925 925 case F_INVAL:
926 926 CPU_STATS_ENTER_K();
927 927 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
928 928 if (as == &kas)
929 929 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
930 930 CPU_STATS_EXIT_K();
931 931 break;
932 932 }
933 933 }
934 934
935 935 /* Kernel probe */
936 936 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
937 937 tnf_opaque, address, addr,
938 938 tnf_fault_type, fault_type, type,
939 939 tnf_seg_access, access, rw);
940 940
941 941 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
942 942 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
943 943 (size_t)raddr;
944 944
945 945 /*
946 946 * XXX -- Don't grab the as lock for segkmap. We should grab it for
947 947 * correctness, but then we could be stuck holding this lock for
948 948 * a LONG time if the fault needs to be resolved on a slow
949 949 * filesystem, and then no-one will be able to exec new commands,
950 950 * as exec'ing requires the write lock on the as.
951 951 */
952 952 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
953 953 raddr + size < segkmap->s_base + segkmap->s_size) {
954 954 /*
955 955 * if (as==&kas), this can't be XHAT: we've already returned
956 956 * FC_NOSUPPORT.
957 957 */
958 958 seg = segkmap;
959 959 as_lock_held = 0;
960 960 } else {
961 961 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
962 962 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
963 963 /*
964 964 * Grab and hold the writers' lock on the as
965 965 * if the fault is to a watched page.
966 966 * This will keep CPUs from "peeking" at the
967 967 * address range while we're temporarily boosting
968 968 * the permissions for the XHAT device to
969 969 * resolve the fault in the segment layer.
970 970 *
971 971 * We could check whether faulted address
972 972 * is within a watched page and only then grab
973 973 * the writer lock, but this is simpler.
974 974 */
975 975 AS_LOCK_EXIT(as, &as->a_lock);
976 976 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
977 977 }
978 978
979 979 seg = as_segat(as, raddr);
980 980 if (seg == NULL) {
981 981 AS_LOCK_EXIT(as, &as->a_lock);
982 982 if ((lwp != NULL) && (!is_xhat))
983 983 lwp->lwp_nostop--;
984 984 return (FC_NOMAP);
985 985 }
986 986
987 987 as_lock_held = 1;
988 988 }
989 989
990 990 addrsav = raddr;
991 991 segsav = seg;
992 992
993 993 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
994 994 if (raddr >= seg->s_base + seg->s_size) {
995 995 seg = AS_SEGNEXT(as, seg);
996 996 if (seg == NULL || raddr != seg->s_base) {
997 997 res = FC_NOMAP;
998 998 break;
999 999 }
1000 1000 }
1001 1001 if (raddr + rsize > seg->s_base + seg->s_size)
1002 1002 ssize = seg->s_base + seg->s_size - raddr;
1003 1003 else
1004 1004 ssize = rsize;
1005 1005
1006 1006 if (!is_xhat || (seg->s_ops != &segdev_ops)) {
1007 1007
1008 1008 if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
1009 1009 pr_is_watchpage_as(raddr, rw, as)) {
1010 1010 /*
1011 1011 * Handle watch pages. If we're faulting on a
1012 1012 * watched page from an X-hat, we have to
1013 1013 * restore the original permissions while we
1014 1014 * handle the fault.
1015 1015 */
1016 1016 as_clearwatch(as);
1017 1017 holding_wpage = 1;
1018 1018 }
1019 1019
1020 1020 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
1021 1021
1022 1022 /* Restore watchpoints */
1023 1023 if (holding_wpage) {
1024 1024 as_setwatch(as);
1025 1025 holding_wpage = 0;
1026 1026 }
1027 1027
1028 1028 if (res != 0)
1029 1029 break;
1030 1030 } else {
1031 1031 /* XHAT does not support seg_dev */
1032 1032 res = FC_NOSUPPORT;
1033 1033 break;
1034 1034 }
1035 1035 }
1036 1036
1037 1037 /*
1038 1038 * If we were SOFTLOCKing and encountered a failure,
1039 1039 * we must SOFTUNLOCK the range we already did. (Maybe we
1040 1040 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
1041 1041 * right here...)
1042 1042 */
1043 1043 if (res != 0 && type == F_SOFTLOCK) {
1044 1044 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
1045 1045 if (addrsav >= seg->s_base + seg->s_size)
1046 1046 seg = AS_SEGNEXT(as, seg);
1047 1047 ASSERT(seg != NULL);
1048 1048 /*
1049 1049 * Now call the fault routine again to perform the
1050 1050 * unlock using S_OTHER instead of the rw variable
1051 1051 * since we never got a chance to touch the pages.
1052 1052 */
1053 1053 if (raddr > seg->s_base + seg->s_size)
1054 1054 ssize = seg->s_base + seg->s_size - addrsav;
1055 1055 else
1056 1056 ssize = raddr - addrsav;
1057 1057 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
1058 1058 F_SOFTUNLOCK, S_OTHER);
1059 1059 }
1060 1060 }
1061 1061 if (as_lock_held)
1062 1062 AS_LOCK_EXIT(as, &as->a_lock);
1063 1063 if ((lwp != NULL) && (!is_xhat))
1064 1064 lwp->lwp_nostop--;
1065 1065
1066 1066 /*
1067 1067 * If the lower levels returned EDEADLK for a fault,
1068 1068 * it means that we should retry the fault. Let's also wait
1069 1069 * a bit to let the deadlock-causing condition clear.
1070 1070 * This is part of a gross hack to work around a design flaw
1071 1071 * in the ufs/sds logging code and should go away when the
1072 1072 * logging code is re-designed to fix the problem. See bug
1073 1073 * 4125102 for details of the problem.
1074 1074 */
1075 1075 if (FC_ERRNO(res) == EDEADLK) {
1076 1076 delay(deadlk_wait);
1077 1077 res = 0;
1078 1078 goto retry;
1079 1079 }
1080 1080 return (res);
1081 1081 }
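
A sketch of the common F_SOFTLOCK/F_SOFTUNLOCK pairing around as_fault(), as used by physio-style callers; mydrv_lock_user_range is a hypothetical name, and mapping the fault code to an errno via FC_ERRNO() is an assumption here:

	static int
	mydrv_lock_user_range(struct as *as, caddr_t addr, size_t len)
	{
		faultcode_t res;

		res = as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE);
		if (res != 0)
			return (FC_ERRNO(res));	/* assumption: errno-coded failure */
		/* do I/O against the now-locked pages here */
		(void) as_fault(as->a_hat, as, addr, len, F_SOFTUNLOCK, S_WRITE);
		return (0);
	}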
1082 1082
1083 1083
1084 1084
1085 1085 /*
1086 1086 * Asynchronous ``fault'' at addr for size bytes.
1087 1087 */
1088 1088 faultcode_t
1089 1089 as_faulta(struct as *as, caddr_t addr, size_t size)
1090 1090 {
1091 1091 struct seg *seg;
1092 1092 caddr_t raddr; /* rounded down addr */
1093 1093 size_t rsize; /* rounded up size */
1094 1094 faultcode_t res = 0;
1095 1095 klwp_t *lwp = ttolwp(curthread);
1096 1096
1097 1097 retry:
1098 1098 /*
1099 1099 * Indicate that the lwp is not to be stopped while waiting
1100 1100 * for a pagefault. This is to avoid deadlock while debugging
1101 1101 * a process via /proc over NFS (in particular).
1102 1102 */
1103 1103 if (lwp != NULL)
1104 1104 lwp->lwp_nostop++;
1105 1105
1106 1106 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1107 1107 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1108 1108 (size_t)raddr;
1109 1109
1110 1110 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1111 1111 seg = as_segat(as, raddr);
1112 1112 if (seg == NULL) {
1113 1113 AS_LOCK_EXIT(as, &as->a_lock);
1114 1114 if (lwp != NULL)
1115 1115 lwp->lwp_nostop--;
1116 1116 return (FC_NOMAP);
1117 1117 }
1118 1118
1119 1119 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1120 1120 if (raddr >= seg->s_base + seg->s_size) {
1121 1121 seg = AS_SEGNEXT(as, seg);
1122 1122 if (seg == NULL || raddr != seg->s_base) {
1123 1123 res = FC_NOMAP;
1124 1124 break;
1125 1125 }
1126 1126 }
1127 1127 res = SEGOP_FAULTA(seg, raddr);
1128 1128 if (res != 0)
1129 1129 break;
1130 1130 }
1131 1131 AS_LOCK_EXIT(as, &as->a_lock);
1132 1132 if (lwp != NULL)
1133 1133 lwp->lwp_nostop--;
1134 1134 /*
1135 1135 * If the lower levels returned EDEADLK for a fault,
1136 1136 * it means that we should retry the fault. Let's also wait
1137 1137 * a bit to let the deadlock-causing condition clear.
1138 1138 * This is part of a gross hack to work around a design flaw
1139 1139 * in the ufs/sds logging code and should go away when the
1140 1140 * logging code is re-designed to fix the problem. See bug
1141 1141 * 4125102 for details of the problem.
1142 1142 */
1143 1143 if (FC_ERRNO(res) == EDEADLK) {
1144 1144 delay(deadlk_wait);
1145 1145 res = 0;
1146 1146 goto retry;
1147 1147 }
1148 1148 return (res);
1149 1149 }
1150 1150
1151 1151 /*
1152 1152 * Set the virtual mapping for the interval from [addr : addr + size)
1153 1153 * in address space `as' to have the specified protection.
1154 1154 * It is ok for the range to cross over several segments,
1155 1155 * as long as they are contiguous.
1156 1156 */
1157 1157 int
1158 1158 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1159 1159 {
1160 1160 struct seg *seg;
1161 1161 struct as_callback *cb;
1162 1162 size_t ssize;
1163 1163 caddr_t raddr; /* rounded down addr */
1164 1164 size_t rsize; /* rounded up size */
1165 1165 int error = 0, writer = 0;
1166 1166 caddr_t saveraddr;
1167 1167 size_t saversize;
1168 1168
1169 1169 setprot_top:
1170 1170 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1171 1171 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1172 1172 (size_t)raddr;
1173 1173
1174 1174 if (raddr + rsize < raddr) /* check for wraparound */
1175 1175 return (ENOMEM);
1176 1176
1177 1177 saveraddr = raddr;
1178 1178 saversize = rsize;
1179 1179
1180 1180 /*
1181 1181 * Normally we only lock the as as a reader. But
1182 1182 * if due to setprot the segment driver needs to split
1183 1183 * a segment it will return IE_RETRY. Therefore we re-acquire
1184 1184 * the as lock as a writer so the segment driver can change
1185 1185 * the seg list. Also the segment driver will return IE_RETRY
1186 1186 * after it has changed the segment list so we therefore keep
1187 1187 * locking as a writer. Since these operations should be rare,
1188 1188 * we want to only lock as a writer when necessary.
1189 1189 */
1190 1190 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1191 1191 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1192 1192 } else {
1193 1193 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1194 1194 }
1195 1195
1196 1196 as_clearwatchprot(as, raddr, rsize);
1197 1197 seg = as_segat(as, raddr);
1198 1198 if (seg == NULL) {
1199 1199 as_setwatch(as);
1200 1200 AS_LOCK_EXIT(as, &as->a_lock);
1201 1201 return (ENOMEM);
1202 1202 }
1203 1203
1204 1204 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1205 1205 if (raddr >= seg->s_base + seg->s_size) {
1206 1206 seg = AS_SEGNEXT(as, seg);
1207 1207 if (seg == NULL || raddr != seg->s_base) {
1208 1208 error = ENOMEM;
1209 1209 break;
1210 1210 }
1211 1211 }
1212 1212 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1213 1213 ssize = seg->s_base + seg->s_size - raddr;
1214 1214 else
1215 1215 ssize = rsize;
1216 1216 retry:
1217 1217 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1218 1218
1219 1219 if (error == IE_NOMEM) {
1220 1220 error = EAGAIN;
1221 1221 break;
1222 1222 }
1223 1223
1224 1224 if (error == IE_RETRY) {
1225 1225 AS_LOCK_EXIT(as, &as->a_lock);
1226 1226 writer = 1;
1227 1227 goto setprot_top;
1228 1228 }
1229 1229
1230 1230 if (error == EAGAIN) {
1231 1231 /*
1232 1232 * Make sure we have a_lock as writer.
1233 1233 */
1234 1234 if (writer == 0) {
1235 1235 AS_LOCK_EXIT(as, &as->a_lock);
1236 1236 writer = 1;
1237 1237 goto setprot_top;
1238 1238 }
1239 1239
1240 1240 /*
1241 1241 * Memory is currently locked. It must be unlocked
1242 1242 * before this operation can succeed through a retry.
1243 1243 * The possible reasons for locked memory and
1244 1244 * corresponding strategies for unlocking are:
1245 1245 * (1) Normal I/O
1246 1246 * wait for a signal that the I/O operation
1247 1247 * has completed and the memory is unlocked.
1248 1248 * (2) Asynchronous I/O
1249 1249 * The aio subsystem does not unlock pages when
1250 1250 * the I/O is completed. Those pages are unlocked
1251 1251 * when the application calls aiowait/aioerror.
1252 1252 * So, to prevent blocking forever, cv_broadcast()
1253 1253 * is done to wake up aio_cleanup_thread.
1254 1254 * Subsequently, segvn_reclaim will be called, and
1255 1255 * that will do AS_CLRUNMAPWAIT() and wake us up.
1256 1256 * (3) Long term page locking:
1257 1257 * Drivers intending to have pages locked for a
1258 1258 * period considerably longer than for normal I/O
1259 1259 * (essentially forever) may have registered for a
1260 1260 * callback so they may unlock these pages on
1261 1261 * request. This is needed to allow this operation
1262 1262 * to succeed. Each entry on the callback list is
1263 1263 * examined. If the event or address range pertains,
1264 1264 * the callback is invoked (unless it already is in
1265 1265 * progress). The a_contents lock must be dropped
1266 1266 * before the callback, so only one callback can
1267 1267 * be done at a time. Go to the top and do more
1268 1268 * until zero is returned. If zero is returned,
1269 1269 * either there were no callbacks for this event
1270 1270 * or they were already in progress.
1271 1271 */
1272 1272 mutex_enter(&as->a_contents);
1273 1273 if (as->a_callbacks &&
1274 1274 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1275 1275 seg->s_base, seg->s_size))) {
1276 1276 AS_LOCK_EXIT(as, &as->a_lock);
1277 1277 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1278 1278 } else if (!AS_ISNOUNMAPWAIT(as)) {
1279 1279 if (AS_ISUNMAPWAIT(as) == 0)
1280 1280 cv_broadcast(&as->a_cv);
1281 1281 AS_SETUNMAPWAIT(as);
1282 1282 AS_LOCK_EXIT(as, &as->a_lock);
1283 1283 while (AS_ISUNMAPWAIT(as))
1284 1284 cv_wait(&as->a_cv, &as->a_contents);
1285 1285 } else {
1286 1286 /*
1287 1287 * We may have raced with
1288 1288 * segvn_reclaim()/segspt_reclaim(). In this
1289 1289 * case clean nounmapwait flag and retry since
1290 1290 * softlockcnt in this segment may be already
1291 1291 * 0. We don't drop as writer lock so our
1292 1292 * number of retries without sleeping should
1293 1293 * be very small. See segvn_reclaim() for
1294 1294 * more comments.
1295 1295 */
1296 1296 AS_CLRNOUNMAPWAIT(as);
1297 1297 mutex_exit(&as->a_contents);
1298 1298 goto retry;
1299 1299 }
1300 1300 mutex_exit(&as->a_contents);
1301 1301 goto setprot_top;
1302 1302 } else if (error != 0)
1303 1303 break;
1304 1304 }
1305 1305 if (error != 0) {
1306 1306 as_setwatch(as);
1307 1307 } else {
1308 1308 as_setwatchprot(as, saveraddr, saversize, prot);
1309 1309 }
1310 1310 AS_LOCK_EXIT(as, &as->a_lock);
1311 1311 return (error);
1312 1312 }
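
For orientation, the mprotect(2) path reduces to a call of roughly this shape. This is a sketch only; the real syscall performs alignment and range validation first, and PROT_USER marks the mapping as user-accessible:

	static int
	mprotect_sketch(struct proc *p, caddr_t addr, size_t len, uint_t prot)
	{
		/* user mappings carry PROT_USER in addition to PROT_READ etc. */
		return (as_setprot(p->p_as, addr, len, prot | PROT_USER));
	}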
1313 1313
1314 1314 /*
1315 1315 * Check to make sure that the interval [addr, addr + size)
1316 1316 * in address space `as' has at least the specified protection.
1317 1317 * It is ok for the range to cross over several segments, as long
1318 1318 * as they are contiguous.
1319 1319 */
1320 1320 int
1321 1321 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1322 1322 {
1323 1323 struct seg *seg;
1324 1324 size_t ssize;
1325 1325 caddr_t raddr; /* rounded down addr */
1326 1326 size_t rsize; /* rounded up size */
1327 1327 int error = 0;
1328 1328
1329 1329 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1330 1330 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1331 1331 (size_t)raddr;
1332 1332
1333 1333 if (raddr + rsize < raddr) /* check for wraparound */
1334 1334 return (ENOMEM);
1335 1335
1336 1336 /*
1337 1337 * This is ugly as sin...
1338 1338 * Normally, we only acquire the address space readers lock.
1339 1339 * However, if the address space has watchpoints present,
1340 1340 * we must acquire the writer lock on the address space for
1341 1341 * the benefit of as_clearwatchprot() and as_setwatchprot().
1342 1342 */
1343 1343 if (avl_numnodes(&as->a_wpage) != 0)
1344 1344 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1345 1345 else
1346 1346 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1347 1347 as_clearwatchprot(as, raddr, rsize);
1348 1348 seg = as_segat(as, raddr);
1349 1349 if (seg == NULL) {
1350 1350 as_setwatch(as);
1351 1351 AS_LOCK_EXIT(as, &as->a_lock);
1352 1352 return (ENOMEM);
1353 1353 }
1354 1354
1355 1355 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1356 1356 if (raddr >= seg->s_base + seg->s_size) {
1357 1357 seg = AS_SEGNEXT(as, seg);
1358 1358 if (seg == NULL || raddr != seg->s_base) {
1359 1359 error = ENOMEM;
1360 1360 break;
1361 1361 }
1362 1362 }
1363 1363 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1364 1364 ssize = seg->s_base + seg->s_size - raddr;
1365 1365 else
1366 1366 ssize = rsize;
1367 1367
1368 1368 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1369 1369 if (error != 0)
1370 1370 break;
1371 1371 }
1372 1372 as_setwatch(as);
1373 1373 AS_LOCK_EXIT(as, &as->a_lock);
1374 1374 return (error);
1375 1375 }
1376 1376
1377 1377 int
1378 1378 as_unmap(struct as *as, caddr_t addr, size_t size)
1379 1379 {
1380 1380 struct seg *seg, *seg_next;
1381 1381 struct as_callback *cb;
1382 1382 caddr_t raddr, eaddr;
1383 1383 size_t ssize, rsize = 0;
1384 1384 int err;
1385 1385
1386 1386 top:
1387 1387 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1388 1388 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1389 1389 (uintptr_t)PAGEMASK);
1390 1390
1391 1391 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1392 1392
1393 1393 as->a_updatedir = 1; /* inform /proc */
1394 1394 gethrestime(&as->a_updatetime);
1395 1395
1396 1396 /*
1397 1397 * Use as_findseg to find the first segment in the range, then
1398 1398 * step through the segments in order, following s_next.
1399 1399 */
1400 1400 as_clearwatchprot(as, raddr, eaddr - raddr);
1401 1401
1402 1402 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1403 1403 if (eaddr <= seg->s_base)
1404 1404 break; /* eaddr was in a gap; all done */
1405 1405
1406 1406 /* this is implied by the test above */
1407 1407 ASSERT(raddr < eaddr);
1408 1408
1409 1409 if (raddr < seg->s_base)
1410 1410 raddr = seg->s_base; /* raddr was in a gap */
1411 1411
1412 1412 if (eaddr > (seg->s_base + seg->s_size))
1413 1413 ssize = seg->s_base + seg->s_size - raddr;
1414 1414 else
1415 1415 ssize = eaddr - raddr;
1416 1416
1417 1417 /*
1418 1418 * Save next segment pointer since seg can be
1419 1419 * destroyed during the segment unmap operation.
1420 1420 */
1421 1421 seg_next = AS_SEGNEXT(as, seg);
1422 1422
1423 1423 /*
1424 1424 * We didn't count /dev/null mappings, so ignore them here.
1425 1425 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1426 1426 * we have to do this check here while we have seg.)
1427 1427 */
1428 1428 rsize = 0;
1429 1429 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1430 1430 !SEG_IS_PARTIAL_RESV(seg))
1431 1431 rsize = ssize;
1432 1432
1433 1433 retry:
1434 1434 err = SEGOP_UNMAP(seg, raddr, ssize);
1435 1435 if (err == EAGAIN) {
1436 1436 /*
1437 1437 * Memory is currently locked. It must be unlocked
1438 1438 * before this operation can succeed through a retry.
1439 1439 * The possible reasons for locked memory and
1440 1440 * corresponding strategies for unlocking are:
1441 1441 * (1) Normal I/O
1442 1442 * wait for a signal that the I/O operation
1443 1443 * has completed and the memory is unlocked.
1444 1444 * (2) Asynchronous I/O
1445 1445 * The aio subsystem does not unlock pages when
1446 1446 * the I/O is completed. Those pages are unlocked
1447 1447 * when the application calls aiowait/aioerror.
1448 1448 * So, to prevent blocking forever, cv_broadcast()
1449 1449 * is done to wake up aio_cleanup_thread.
1450 1450 * Subsequently, segvn_reclaim will be called, and
1451 1451 * that will do AS_CLRUNMAPWAIT() and wake us up.
1452 1452 * (3) Long term page locking:
1453 1453 * Drivers intending to have pages locked for a
1454 1454 * period considerably longer than for normal I/O
1455 1455 * (essentially forever) may have registered for a
1456 1456 * callback so they may unlock these pages on
1457 1457 * request. This is needed to allow this operation
1458 1458 * to succeed. Each entry on the callback list is
1459 1459 * examined. If the event or address range pertains,
1460 1460 * the callback is invoked (unless it already is in
1461 1461 * progress). The a_contents lock must be dropped
1462 1462 * before the callback, so only one callback can
1463 1463 * be done at a time. Go to the top and do more
1464 1464 * until zero is returned. If zero is returned,
1465 1465 * either there were no callbacks for this event
1466 1466 * or they were already in progress.
1467 1467 */
1468 1468 mutex_enter(&as->a_contents);
1469 1469 if (as->a_callbacks &&
1470 1470 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1471 1471 seg->s_base, seg->s_size))) {
1472 1472 AS_LOCK_EXIT(as, &as->a_lock);
1473 1473 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1474 1474 } else if (!AS_ISNOUNMAPWAIT(as)) {
1475 1475 if (AS_ISUNMAPWAIT(as) == 0)
1476 1476 cv_broadcast(&as->a_cv);
1477 1477 AS_SETUNMAPWAIT(as);
1478 1478 AS_LOCK_EXIT(as, &as->a_lock);
1479 1479 while (AS_ISUNMAPWAIT(as))
1480 1480 cv_wait(&as->a_cv, &as->a_contents);
1481 1481 } else {
1482 1482 /*
1483 1483 * We may have raced with
1484 1484 * segvn_reclaim()/segspt_reclaim(). In this
1485 1485 * case clean nounmapwait flag and retry since
1486 1486 * softlockcnt in this segment may be already
1487 1487 * 0. We don't drop as writer lock so our
1488 1488 * number of retries without sleeping should
1489 1489 * be very small. See segvn_reclaim() for
1490 1490 * more comments.
1491 1491 */
1492 1492 AS_CLRNOUNMAPWAIT(as);
1493 1493 mutex_exit(&as->a_contents);
1494 1494 goto retry;
1495 1495 }
1496 1496 mutex_exit(&as->a_contents);
1497 1497 goto top;
1498 1498 } else if (err == IE_RETRY) {
1499 1499 AS_LOCK_EXIT(as, &as->a_lock);
1500 1500 goto top;
1501 1501 } else if (err) {
1502 1502 as_setwatch(as);
1503 1503 AS_LOCK_EXIT(as, &as->a_lock);
1504 1504 return (-1);
1505 1505 }
1506 1506
1507 1507 as->a_size -= ssize;
1508 1508 if (rsize)
1509 1509 as->a_resvsize -= rsize;
1510 1510 raddr += ssize;
1511 1511 }
1512 1512 AS_LOCK_EXIT(as, &as->a_lock);
1513 1513 return (0);
1514 1514 }
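
And a munmap(2)-style caller is essentially the following sketch; the -1 return above carries no errno, so the choice of EINVAL is an assumption:

	static int
	munmap_sketch(struct as *as, caddr_t addr, size_t len)
	{
		if (as_unmap(as, addr, len) != 0)
			return (EINVAL);	/* assumption: report failure as EINVAL */
		return (0);
	}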
1515 1515
1516 1516 static int
1517 1517 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1518 1518 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1519 1519 {
1520 1520 uint_t szc;
1521 1521 uint_t nszc;
1522 1522 int error;
1523 1523 caddr_t a;
1524 1524 caddr_t eaddr;
1525 1525 size_t segsize;
1526 1526 struct seg *seg;
1527 1527 size_t pgsz;
1528 1528 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1529 1529 uint_t save_szcvec;
1530 1530
1531 1531 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1532 1532 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1533 1533 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1534 1534 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1535 1535 if (!do_off) {
1536 1536 vn_a->offset = 0;
1537 1537 }
1538 1538
1539 1539 if (szcvec <= 1) {
1540 1540 seg = seg_alloc(as, addr, size);
1541 1541 if (seg == NULL) {
1542 1542 return (ENOMEM);
1543 1543 }
1544 1544 vn_a->szc = 0;
1545 1545 error = (*crfp)(seg, vn_a);
1546 1546 if (error != 0) {
1547 1547 seg_free(seg);
1548 1548 } else {
1549 1549 as->a_size += size;
1550 1550 as->a_resvsize += size;
1551 1551 }
1552 1552 return (error);
1553 1553 }
1554 1554
1555 1555 eaddr = addr + size;
1556 1556 save_szcvec = szcvec;
1557 1557 szcvec >>= 1;
1558 1558 szc = 0;
1559 1559 nszc = 0;
1560 1560 while (szcvec) {
1561 1561 if ((szcvec & 0x1) == 0) {
1562 1562 nszc++;
1563 1563 szcvec >>= 1;
1564 1564 continue;
1565 1565 }
1566 1566 nszc++;
1567 1567 pgsz = page_get_pagesize(nszc);
1568 1568 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1569 1569 if (a != addr) {
1570 1570 ASSERT(a < eaddr);
1571 1571 segsize = a - addr;
1572 1572 seg = seg_alloc(as, addr, segsize);
1573 1573 if (seg == NULL) {
1574 1574 return (ENOMEM);
1575 1575 }
1576 1576 vn_a->szc = szc;
1577 1577 error = (*crfp)(seg, vn_a);
1578 1578 if (error != 0) {
1579 1579 seg_free(seg);
1580 1580 return (error);
1581 1581 }
1582 1582 as->a_size += segsize;
1583 1583 as->a_resvsize += segsize;
1584 1584 *segcreated = 1;
1585 1585 if (do_off) {
1586 1586 vn_a->offset += segsize;
1587 1587 }
1588 1588 addr = a;
1589 1589 }
1590 1590 szc = nszc;
1591 1591 szcvec >>= 1;
1592 1592 }
1593 1593
1594 1594 ASSERT(addr < eaddr);
1595 1595 szcvec = save_szcvec | 1; /* add 8K pages */
1596 1596 while (szcvec) {
1597 1597 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1598 1598 ASSERT(a >= addr);
1599 1599 if (a != addr) {
1600 1600 segsize = a - addr;
1601 1601 seg = seg_alloc(as, addr, segsize);
1602 1602 if (seg == NULL) {
1603 1603 return (ENOMEM);
1604 1604 }
1605 1605 vn_a->szc = szc;
1606 1606 error = (*crfp)(seg, vn_a);
1607 1607 if (error != 0) {
1608 1608 seg_free(seg);
1609 1609 return (error);
1610 1610 }
1611 1611 as->a_size += segsize;
1612 1612 as->a_resvsize += segsize;
1613 1613 *segcreated = 1;
1614 1614 if (do_off) {
1615 1615 vn_a->offset += segsize;
1616 1616 }
1617 1617 addr = a;
1618 1618 }
1619 1619 szcvec &= ~(1 << szc);
1620 1620 if (szcvec) {
1621 1621 szc = highbit(szcvec) - 1;
1622 1622 pgsz = page_get_pagesize(szc);
1623 1623 }
1624 1624 }
1625 1625 ASSERT(addr == eaddr);
1626 1626
1627 1627 return (0);
1628 1628 }
1629 1629
1630 1630 static int
1631 1631 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1632 1632 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1633 1633 {
1634 1634 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1635 1635 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1636 1636 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1637 1637 type, 0);
1638 1638 int error;
1639 1639 struct seg *seg;
1640 1640 struct vattr va;
1641 1641 u_offset_t eoff;
1642 1642 size_t save_size = 0;
1643 1643 extern size_t textrepl_size_thresh;
1644 1644
1645 1645 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1646 1646 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1647 1647 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1648 1648 ASSERT(vn_a->vp != NULL);
1649 1649 ASSERT(vn_a->amp == NULL);
1650 1650
1651 1651 again:
1652 1652 if (szcvec <= 1) {
1653 1653 seg = seg_alloc(as, addr, size);
1654 1654 if (seg == NULL) {
1655 1655 return (ENOMEM);
1656 1656 }
1657 1657 vn_a->szc = 0;
1658 1658 error = (*crfp)(seg, vn_a);
1659 1659 if (error != 0) {
1660 1660 seg_free(seg);
1661 1661 } else {
1662 1662 as->a_size += size;
1663 1663 as->a_resvsize += size;
1664 1664 }
1665 1665 return (error);
1666 1666 }
1667 1667
1668 1668 va.va_mask = AT_SIZE;
1669 1669 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1670 1670 szcvec = 0;
1671 1671 goto again;
1672 1672 }
1673 1673 eoff = vn_a->offset & PAGEMASK;
1674 1674 if (eoff >= va.va_size) {
1675 1675 szcvec = 0;
1676 1676 goto again;
1677 1677 }
1678 1678 eoff += size;
1679 1679 if (btopr(va.va_size) < btopr(eoff)) {
1680 1680 save_size = size;
1681 1681 size = va.va_size - (vn_a->offset & PAGEMASK);
1682 1682 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1683 1683 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1684 1684 type, 0);
1685 1685 if (szcvec <= 1) {
1686 1686 size = save_size;
1687 1687 goto again;
1688 1688 }
1689 1689 }
1690 1690
1691 1691 if (size > textrepl_size_thresh) {
1692 1692 vn_a->flags |= _MAP_TEXTREPL;
1693 1693 }
1694 1694 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1695 1695 segcreated);
1696 1696 if (error != 0) {
1697 1697 return (error);
1698 1698 }
1699 1699 if (save_size) {
1700 1700 addr += size;
1701 1701 size = save_size - size;
1702 1702 szcvec = 0;
1703 1703 goto again;
1704 1704 }
1705 1705 return (0);
1706 1706 }
1707 1707
1708 1708 /*
1709 1709 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1710 1710  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1711 1711 */
1712 1712 static int
1713 1713 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1714 1714 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1715 1715 {
1716 1716 uint_t szcvec;
1717 1717 uchar_t type;
1718 1718
1719 1719 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1720 1720 if (vn_a->type == MAP_SHARED) {
1721 1721 type = MAPPGSZC_SHM;
1722 1722 } else if (vn_a->type == MAP_PRIVATE) {
1723 1723 if (vn_a->szc == AS_MAP_HEAP) {
1724 1724 type = MAPPGSZC_HEAP;
1725 1725 } else if (vn_a->szc == AS_MAP_STACK) {
1726 1726 type = MAPPGSZC_STACK;
1727 1727 } else {
1728 1728 type = MAPPGSZC_PRIVM;
1729 1729 }
1730 1730 }
1731 1731 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1732 1732 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1733 1733 (vn_a->flags & MAP_TEXT), type, 0);
1734 1734 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1735 1735 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1736 1736 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1737 1737 ASSERT(vn_a->vp == NULL);
1738 1738
1739 1739 return (as_map_segvn_segs(as, addr, size, szcvec,
1740 1740 crfp, vn_a, segcreated));
1741 1741 }
1742 1742
1743 1743 int
1744 1744 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1745 1745 {
1746 1746 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1747 1747 return (as_map_locked(as, addr, size, crfp, argsp));
1748 1748 }
1749 1749
1750 1750 int
1751 1751 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1752 1752 void *argsp)
1753 1753 {
1754 1754 struct seg *seg = NULL;
1755 1755 caddr_t raddr; /* rounded down addr */
1756 1756 size_t rsize; /* rounded up size */
1757 1757 int error;
1758 1758 int unmap = 0;
1759 1759 struct proc *p = curproc;
1760 1760 struct segvn_crargs crargs;
1761 1761
1762 1762 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1763 1763 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1764 1764 (size_t)raddr;
1765 1765
1766 1766 /*
1767 1767 * check for wrap around
1768 1768 */
1769 1769 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1770 1770 AS_LOCK_EXIT(as, &as->a_lock);
1771 1771 return (ENOMEM);
1772 1772 }
1773 1773
1774 1774 as->a_updatedir = 1; /* inform /proc */
1775 1775 gethrestime(&as->a_updatetime);
1776 1776
1777 1777 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1778 1778 AS_LOCK_EXIT(as, &as->a_lock);
1779 1779
1780 1780 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1781 1781 RCA_UNSAFE_ALL);
1782 1782
1783 1783 return (ENOMEM);
1784 1784 }
1785 1785
1786 1786 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1787 1787 crargs = *(struct segvn_crargs *)argsp;
1788 1788 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1789 1789 if (error != 0) {
1790 1790 AS_LOCK_EXIT(as, &as->a_lock);
1791 1791 if (unmap) {
1792 1792 (void) as_unmap(as, addr, size);
1793 1793 }
1794 1794 return (error);
1795 1795 }
1796 1796 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1797 1797 crargs = *(struct segvn_crargs *)argsp;
1798 1798 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1799 1799 if (error != 0) {
1800 1800 AS_LOCK_EXIT(as, &as->a_lock);
1801 1801 if (unmap) {
1802 1802 (void) as_unmap(as, addr, size);
1803 1803 }
1804 1804 return (error);
1805 1805 }
1806 1806 } else {
1807 1807 seg = seg_alloc(as, addr, size);
1808 1808 if (seg == NULL) {
1809 1809 AS_LOCK_EXIT(as, &as->a_lock);
1810 1810 return (ENOMEM);
1811 1811 }
1812 1812
1813 1813 error = (*crfp)(seg, argsp);
1814 1814 if (error != 0) {
1815 1815 seg_free(seg);
1816 1816 AS_LOCK_EXIT(as, &as->a_lock);
1817 1817 return (error);
1818 1818 }
1819 1819 /*
1820 1820 * Add size now so as_unmap will work if as_ctl fails.
1821 1821 */
1822 1822 as->a_size += rsize;
1823 1823 as->a_resvsize += rsize;
1824 1824 }
1825 1825
1826 1826 as_setwatch(as);
1827 1827
1828 1828 /*
1829 1829 * If the address space is locked,
1830 1830 * establish memory locks for the new segment.
1831 1831 */
1832 1832 mutex_enter(&as->a_contents);
1833 1833 if (AS_ISPGLCK(as)) {
1834 1834 mutex_exit(&as->a_contents);
1835 1835 AS_LOCK_EXIT(as, &as->a_lock);
1836 1836 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1837 1837 if (error != 0)
1838 1838 (void) as_unmap(as, addr, size);
1839 1839 } else {
1840 1840 mutex_exit(&as->a_contents);
1841 1841 AS_LOCK_EXIT(as, &as->a_lock);
1842 1842 }
1843 1843 return (error);
1844 1844 }
1845 1845
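The raddr/rsize normalization at the top of as_map_locked() (and repeated throughout this file) rounds the caller's range outward to page boundaries. A small user-space model, assuming an 8K PAGESIZE:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGESIZE	8192UL			/* assumed for the example */
	#define PAGEOFFSET	(PAGESIZE - 1)
	#define PAGEMASK	(~PAGEOFFSET)

	int
	main(void)
	{
		uintptr_t addr = 0x10003123;
		size_t size = 0x5000;

		uintptr_t raddr = addr & PAGEMASK;	/* round base down */
		size_t rsize = ((addr + size + PAGEOFFSET) & PAGEMASK) - raddr;

		/* prints raddr 0x10002000 rsize 0x8000 (four 8K pages) */
		printf("raddr %#lx rsize %#lx\n",
		    (unsigned long)raddr, (unsigned long)rsize);
		return (0);
	}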
1846 1846
1847 1847 /*
1848 1848 * Delete all segments in the address space marked with S_PURGE.
1849 1849 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1850 1850 * These segments are deleted as a first step before calls to as_gap(), so
1851 1851 * that they don't affect mmap() or shmat().
1852 1852 */
1853 1853 void
1854 1854 as_purge(struct as *as)
1855 1855 {
1856 1856 struct seg *seg;
1857 1857 struct seg *next_seg;
1858 1858
1859 1859 /*
1860 1860 	 * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1861 1861 * no need to grab a_contents mutex for this check
1862 1862 */
1863 1863 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1864 1864 return;
1865 1865
1866 1866 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1867 1867 next_seg = NULL;
1868 1868 seg = AS_SEGFIRST(as);
1869 1869 while (seg != NULL) {
1870 1870 next_seg = AS_SEGNEXT(as, seg);
1871 1871 if (seg->s_flags & S_PURGE)
1872 1872 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1873 1873 seg = next_seg;
1874 1874 }
1875 1875 AS_LOCK_EXIT(as, &as->a_lock);
1876 1876
1877 1877 mutex_enter(&as->a_contents);
1878 1878 as->a_flags &= ~AS_NEEDSPURGE;
1879 1879 mutex_exit(&as->a_contents);
1880 1880 }
1881 1881
1882 1882 /*
1883 1883 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1884 1884 * range of addresses at least "minlen" long, where the base of the range is
1885 1885 * at "off" phase from an "align" boundary and there is space for a
1886 1886  * "redzone"-sized redzone on either side of the range. Thus,
1887 1887 * if align was 4M and off was 16k, the user wants a hole which will start
1888 1888 * 16k into a 4M page.
1889 1889 *
1890 1890 * If flags specifies AH_HI, the hole will have the highest possible address
1891 1891 * in the range. We use the as->a_lastgap field to figure out where to
1892 1892 * start looking for a gap.
1893 1893 *
1894 1894 * Otherwise, the gap will have the lowest possible address.
1895 1895 *
1896 1896 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1897 1897 *
1898 1898 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1899 1899 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1900 1900 *
1901 1901 * NOTE: This routine is not correct when base+len overflows caddr_t.
1902 1902 */
1903 1903 int
1904 1904 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1905 1905 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1906 1906 {
1907 1907 caddr_t lobound = *basep;
1908 1908 caddr_t hibound = lobound + *lenp;
1909 1909 struct seg *lseg, *hseg;
1910 1910 caddr_t lo, hi;
1911 1911 int forward;
1912 1912 caddr_t save_base;
1913 1913 size_t save_len;
1914 1914 size_t save_minlen;
1915 1915 size_t save_redzone;
1916 1916 int fast_path = 1;
1917 1917
1918 1918 save_base = *basep;
1919 1919 save_len = *lenp;
1920 1920 save_minlen = minlen;
1921 1921 save_redzone = redzone;
1922 1922
1923 1923 /*
1924 1924 * For the first pass/fast_path, just add align and redzone into
1925 1925 * minlen since if we get an allocation, we can guarantee that it
1926 1926 * will fit the alignment and redzone requested.
1927 1927 * This increases the chance that hibound will be adjusted to
1928 1928 * a_lastgap->s_base which will likely allow us to find an
1929 1929 * acceptable hole in the address space quicker.
1930 1930 * If we can't find a hole with this fast_path, then we look for
1931 1931 * smaller holes in which the alignment and offset may allow
1932 1932 * the allocation to fit.
1933 1933 */
1934 1934 minlen += align;
1935 1935 minlen += 2 * redzone;
1936 1936 redzone = 0;
1937 1937
1938 1938 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1939 1939 if (AS_SEGFIRST(as) == NULL) {
1940 1940 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1941 1941 align, redzone, off)) {
1942 1942 AS_LOCK_EXIT(as, &as->a_lock);
1943 1943 return (0);
1944 1944 } else {
1945 1945 AS_LOCK_EXIT(as, &as->a_lock);
1946 1946 *basep = save_base;
1947 1947 *lenp = save_len;
1948 1948 return (-1);
1949 1949 }
1950 1950 }
1951 1951
1952 1952 retry:
1953 1953 /*
1954 1954 * Set up to iterate over all the inter-segment holes in the given
1955 1955 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1956 1956 * NULL for the highest-addressed hole. If moving backwards, we reset
1957 1957 	 * hseg to denote the highest-addressed segment.
1958 1958 */
1959 1959 forward = (flags & AH_DIR) == AH_LO;
1960 1960 if (forward) {
1961 1961 hseg = as_findseg(as, lobound, 1);
1962 1962 lseg = AS_SEGPREV(as, hseg);
1963 1963 } else {
1964 1964
1965 1965 /*
1966 1966 * If allocating at least as much as the last allocation,
1967 1967 * use a_lastgap's base as a better estimate of hibound.
1968 1968 */
1969 1969 if (as->a_lastgap &&
1970 1970 minlen >= as->a_lastgap->s_size &&
1971 1971 hibound >= as->a_lastgap->s_base)
1972 1972 hibound = as->a_lastgap->s_base;
1973 1973
1974 1974 hseg = as_findseg(as, hibound, 1);
1975 1975 if (hseg->s_base + hseg->s_size < hibound) {
1976 1976 lseg = hseg;
1977 1977 hseg = NULL;
1978 1978 } else {
1979 1979 lseg = AS_SEGPREV(as, hseg);
1980 1980 }
1981 1981 }
1982 1982
1983 1983 for (;;) {
1984 1984 /*
1985 1985 * Set lo and hi to the hole's boundaries. (We should really
1986 1986 * use MAXADDR in place of hibound in the expression below,
1987 1987 * but can't express it easily; using hibound in its place is
1988 1988 * harmless.)
1989 1989 */
1990 1990 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1991 1991 hi = (hseg == NULL) ? hibound : hseg->s_base;
1992 1992 /*
1993 1993 * If the iteration has moved past the interval from lobound
1994 1994 * to hibound it's pointless to continue.
1995 1995 */
1996 1996 if ((forward && lo > hibound) || (!forward && hi < lobound))
1997 1997 break;
1998 1998 else if (lo > hibound || hi < lobound)
1999 1999 goto cont;
2000 2000 /*
2001 2001 * Candidate hole lies at least partially within the allowable
2002 2002 * range. Restrict it to fall completely within that range,
2003 2003 * i.e., to [max(lo, lobound), min(hi, hibound)].
2004 2004 */
2005 2005 if (lo < lobound)
2006 2006 lo = lobound;
2007 2007 if (hi > hibound)
2008 2008 hi = hibound;
2009 2009 /*
2010 2010 * Verify that the candidate hole is big enough and meets
2011 2011 * hardware constraints. If the hole is too small, no need
2012 2012 * to do the further checks since they will fail.
2013 2013 */
2014 2014 *basep = lo;
2015 2015 *lenp = hi - lo;
2016 2016 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
2017 2017 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
2018 2018 ((flags & AH_CONTAIN) == 0 ||
2019 2019 (*basep <= addr && *basep + *lenp > addr))) {
2020 2020 if (!forward)
2021 2021 as->a_lastgap = hseg;
2022 2022 if (hseg != NULL)
2023 2023 as->a_lastgaphl = hseg;
2024 2024 else
2025 2025 as->a_lastgaphl = lseg;
2026 2026 AS_LOCK_EXIT(as, &as->a_lock);
2027 2027 return (0);
2028 2028 }
2029 2029 cont:
2030 2030 /*
2031 2031 * Move to the next hole.
2032 2032 */
2033 2033 if (forward) {
2034 2034 lseg = hseg;
2035 2035 if (lseg == NULL)
2036 2036 break;
2037 2037 hseg = AS_SEGNEXT(as, hseg);
2038 2038 } else {
2039 2039 hseg = lseg;
2040 2040 if (hseg == NULL)
2041 2041 break;
2042 2042 lseg = AS_SEGPREV(as, lseg);
2043 2043 }
2044 2044 }
2045 2045 if (fast_path && (align != 0 || save_redzone != 0)) {
2046 2046 fast_path = 0;
2047 2047 minlen = save_minlen;
2048 2048 redzone = save_redzone;
2049 2049 goto retry;
2050 2050 }
2051 2051 *basep = save_base;
2052 2052 *lenp = save_len;
2053 2053 AS_LOCK_EXIT(as, &as->a_lock);
2054 2054 return (-1);
2055 2055 }
2056 2056
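A usage sketch of the interface described above; the caller, the range, and all constants are hypothetical:

	/*
	 * Hypothetical caller (illustrative values): search [0x10000000,
	 * 0x10000000 + 1G) from the top down (AH_HI) for at least 1M that
	 * begins 16K into a 4M boundary, with an 8K redzone on either side.
	 */
	static int
	find_hole(struct as *as, caddr_t *basep, size_t *lenp)
	{
		*basep = (caddr_t)0x10000000;
		*lenp = 0x40000000;

		return (as_gap_aligned(as, 0x100000, basep, lenp, AH_HI,
		    NULL, 0x400000, 0x2000, 0x4000));
	}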
2057 2057 /*
2058 2058 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2059 2059 *
2060 2060 * If flags specifies AH_HI, the hole will have the highest possible address
2061 2061 * in the range. We use the as->a_lastgap field to figure out where to
2062 2062 * start looking for a gap.
2063 2063 *
2064 2064 * Otherwise, the gap will have the lowest possible address.
2065 2065 *
2066 2066 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2067 2067 *
2068 2068 * If an adequate hole is found, base and len are set to reflect the part of
2069 2069 * the hole that is within range, and 0 is returned, otherwise,
2070 2070 * -1 is returned.
2071 2071 *
2072 2072 * NOTE: This routine is not correct when base+len overflows caddr_t.
2073 2073 */
2074 2074 int
2075 2075 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2076 2076 caddr_t addr)
2077 2077 {
2078 2078
2079 2079 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2080 2080 }
2081 2081
2082 2082 /*
2083 2083 * Return the next range within [base, base + len) that is backed
2084 2084 * with "real memory". Skip holes and non-seg_vn segments.
2085 2085 * We're lazy and only return one segment at a time.
2086 2086 */
2087 2087 int
2088 2088 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2089 2089 {
2090 2090 extern struct seg_ops segspt_shmops; /* needs a header file */
2091 2091 struct seg *seg;
2092 2092 caddr_t addr, eaddr;
2093 2093 caddr_t segend;
2094 2094
2095 2095 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2096 2096
2097 2097 addr = *basep;
2098 2098 eaddr = addr + *lenp;
2099 2099
2100 2100 seg = as_findseg(as, addr, 0);
2101 2101 if (seg != NULL)
2102 2102 addr = MAX(seg->s_base, addr);
2103 2103
2104 2104 for (;;) {
2105 2105 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2106 2106 AS_LOCK_EXIT(as, &as->a_lock);
2107 2107 return (EINVAL);
2108 2108 }
2109 2109
2110 2110 if (seg->s_ops == &segvn_ops) {
2111 2111 segend = seg->s_base + seg->s_size;
2112 2112 break;
2113 2113 }
2114 2114
2115 2115 /*
2116 2116 * We do ISM by looking into the private data
2117 2117 * to determine the real size of the segment.
2118 2118 */
2119 2119 if (seg->s_ops == &segspt_shmops) {
2120 2120 segend = seg->s_base + spt_realsize(seg);
2121 2121 if (addr < segend)
2122 2122 break;
2123 2123 }
2124 2124
2125 2125 seg = AS_SEGNEXT(as, seg);
2126 2126
2127 2127 if (seg != NULL)
2128 2128 addr = seg->s_base;
2129 2129 }
2130 2130
2131 2131 *basep = addr;
2132 2132
2133 2133 if (segend > eaddr)
2134 2134 *lenp = eaddr - addr;
2135 2135 else
2136 2136 *lenp = segend - addr;
2137 2137
2138 2138 AS_LOCK_EXIT(as, &as->a_lock);
2139 2139 return (0);
2140 2140 }
2141 2141
2142 2142 /*
2143 - * Swap the pages associated with the address space as out to
2144 - * secondary storage, returning the number of bytes actually
2145 - * swapped.
2146 - *
2147 - * The value returned is intended to correlate well with the process's
2148 - * memory requirements. Its usefulness for this purpose depends on
2149 - * how well the segment-level routines do at returning accurate
2150 - * information.
2151 - */
2152 -size_t
2153 -as_swapout(struct as *as)
2154 -{
2155 - struct seg *seg;
2156 - size_t swpcnt = 0;
2157 -
2158 - /*
2159 - * Kernel-only processes have given up their address
2160 - * spaces. Of course, we shouldn't be attempting to
2161 - * swap out such processes in the first place...
2162 - */
2163 - if (as == NULL)
2164 - return (0);
2165 -
2166 - AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2167 -
2168 - /* Prevent XHATs from attaching */
2169 - mutex_enter(&as->a_contents);
2170 - AS_SETBUSY(as);
2171 - mutex_exit(&as->a_contents);
2172 -
2173 -
2174 - /*
2175 - * Free all mapping resources associated with the address
2176 - * space. The segment-level swapout routines capitalize
2177 - * on this unmapping by scavanging pages that have become
2178 - * unmapped here.
2179 - */
2180 - hat_swapout(as->a_hat);
2181 - if (as->a_xhat != NULL)
2182 - xhat_swapout_all(as);
2183 -
2184 - mutex_enter(&as->a_contents);
2185 - AS_CLRBUSY(as);
2186 - mutex_exit(&as->a_contents);
2187 -
2188 - /*
2189 - * Call the swapout routines of all segments in the address
2190 - * space to do the actual work, accumulating the amount of
2191 - * space reclaimed.
2192 - */
2193 - for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2194 - struct seg_ops *ov = seg->s_ops;
2195 -
2196 - /*
2197 - * We have to check to see if the seg has
2198 - * an ops vector because the seg may have
2199 - * been in the middle of being set up when
2200 - * the process was picked for swapout.
2201 - */
2202 - if ((ov != NULL) && (ov->swapout != NULL))
2203 - swpcnt += SEGOP_SWAPOUT(seg);
2204 - }
2205 - AS_LOCK_EXIT(as, &as->a_lock);
2206 - return (swpcnt);
2207 -}
2208 -
2209 -/*
2210 2143 * Determine whether data from the mappings in interval [addr, addr + size)
2211 2144 * are in the primary memory (core) cache.
2212 2145 */
2213 2146 int
2214 2147 as_incore(struct as *as, caddr_t addr,
2215 2148 size_t size, char *vec, size_t *sizep)
2216 2149 {
2217 2150 struct seg *seg;
2218 2151 size_t ssize;
2219 2152 caddr_t raddr; /* rounded down addr */
2220 2153 size_t rsize; /* rounded up size */
2221 2154 size_t isize; /* iteration size */
2222 2155 int error = 0; /* result, assume success */
2223 2156
2224 2157 *sizep = 0;
2225 2158 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2226 2159 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2227 2160 (size_t)raddr;
2228 2161
2229 2162 if (raddr + rsize < raddr) /* check for wraparound */
2230 2163 return (ENOMEM);
2231 2164
2232 2165 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2233 2166 seg = as_segat(as, raddr);
2234 2167 if (seg == NULL) {
2235 2168 AS_LOCK_EXIT(as, &as->a_lock);
2236 2169 return (-1);
2237 2170 }
2238 2171
2239 2172 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2240 2173 if (raddr >= seg->s_base + seg->s_size) {
2241 2174 seg = AS_SEGNEXT(as, seg);
2242 2175 if (seg == NULL || raddr != seg->s_base) {
2243 2176 error = -1;
2244 2177 break;
2245 2178 }
2246 2179 }
2247 2180 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2248 2181 ssize = seg->s_base + seg->s_size - raddr;
2249 2182 else
2250 2183 ssize = rsize;
2251 2184 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2252 2185 if (isize != ssize) {
2253 2186 error = -1;
2254 2187 break;
2255 2188 }
2256 2189 vec += btopr(ssize);
2257 2190 }
2258 2191 AS_LOCK_EXIT(as, &as->a_lock);
2259 2192 return (error);
2260 2193 }
2261 2194
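A sketch of a hypothetical as_incore() caller; the vec buffer is one byte per page of the rounded range, in the style of mincore(2), and the kmem_alloc() sizing is illustrative:

	static int
	probe_incore(struct as *as, caddr_t addr, size_t size)
	{
		size_t npg = btopr(size);
		char *vec = kmem_alloc(npg, KM_SLEEP);
		size_t incore_bytes;
		int err;

		err = as_incore(as, addr, size, vec, &incore_bytes);
		/* On success, vec[i] != 0 means page i of the range is resident. */
		kmem_free(vec, npg);
		return (err);
	}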
2262 2195 static void
2263 2196 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2264 2197 ulong_t *bitmap, size_t position, size_t npages)
2265 2198 {
2266 2199 caddr_t range_start;
2267 2200 size_t pos1 = position;
2268 2201 size_t pos2;
2269 2202 size_t size;
2270 2203 size_t end_pos = npages + position;
2271 2204
2272 2205 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2273 2206 size = ptob((pos2 - pos1));
2274 2207 range_start = (caddr_t)((uintptr_t)addr +
2275 2208 ptob(pos1 - position));
2276 2209
2277 2210 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2278 2211 (ulong_t *)NULL, (size_t)NULL);
2279 2212 pos1 = pos2;
2280 2213 }
2281 2214 }
2282 2215
2283 2216 static void
2284 2217 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2285 2218 caddr_t raddr, size_t rsize)
2286 2219 {
2287 2220 struct seg *seg = as_segat(as, raddr);
2288 2221 size_t ssize;
2289 2222
2290 2223 while (rsize != 0) {
2291 2224 if (raddr >= seg->s_base + seg->s_size)
2292 2225 seg = AS_SEGNEXT(as, seg);
2293 2226
2294 2227 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2295 2228 ssize = seg->s_base + seg->s_size - raddr;
2296 2229 else
2297 2230 ssize = rsize;
2298 2231
2299 2232 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2300 2233
2301 2234 rsize -= ssize;
2302 2235 raddr += ssize;
2303 2236 }
2304 2237 }
2305 2238
2306 2239 /*
2307 2240 * Cache control operations over the interval [addr, addr + size) in
2308 2241 * address space "as".
2309 2242 */
2310 2243 /*ARGSUSED*/
2311 2244 int
2312 2245 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2313 2246 uintptr_t arg, ulong_t *lock_map, size_t pos)
2314 2247 {
2315 2248 struct seg *seg; /* working segment */
2316 2249 caddr_t raddr; /* rounded down addr */
2317 2250 caddr_t initraddr; /* saved initial rounded down addr */
2318 2251 size_t rsize; /* rounded up size */
2319 2252 size_t initrsize; /* saved initial rounded up size */
2320 2253 size_t ssize; /* size of seg */
2321 2254 int error = 0; /* result */
2322 2255 size_t mlock_size; /* size of bitmap */
2323 2256 ulong_t *mlock_map; /* pointer to bitmap used */
2324 2257 /* to represent the locked */
2325 2258 /* pages. */
2326 2259 retry:
2327 2260 if (error == IE_RETRY)
2328 2261 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2329 2262 else
2330 2263 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2331 2264
2332 2265 /*
2333 2266 * If these are address space lock/unlock operations, loop over
2334 2267 * all segments in the address space, as appropriate.
2335 2268 */
2336 2269 if (func == MC_LOCKAS) {
2337 2270 size_t npages, idx;
2338 2271 size_t rlen = 0; /* rounded as length */
2339 2272
2340 2273 idx = pos;
2341 2274
2342 2275 if (arg & MCL_FUTURE) {
2343 2276 mutex_enter(&as->a_contents);
2344 2277 AS_SETPGLCK(as);
2345 2278 mutex_exit(&as->a_contents);
2346 2279 }
2347 2280 if ((arg & MCL_CURRENT) == 0) {
2348 2281 AS_LOCK_EXIT(as, &as->a_lock);
2349 2282 return (0);
2350 2283 }
2351 2284
2352 2285 seg = AS_SEGFIRST(as);
2353 2286 if (seg == NULL) {
2354 2287 AS_LOCK_EXIT(as, &as->a_lock);
2355 2288 return (0);
2356 2289 }
2357 2290
2358 2291 do {
2359 2292 raddr = (caddr_t)((uintptr_t)seg->s_base &
2360 2293 (uintptr_t)PAGEMASK);
2361 2294 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2362 2295 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2363 2296 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2364 2297
2365 2298 mlock_size = BT_BITOUL(btopr(rlen));
2366 2299 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2367 2300 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2368 2301 AS_LOCK_EXIT(as, &as->a_lock);
2369 2302 return (EAGAIN);
2370 2303 }
2371 2304
2372 2305 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2373 2306 error = SEGOP_LOCKOP(seg, seg->s_base,
2374 2307 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2375 2308 if (error != 0)
2376 2309 break;
2377 2310 pos += seg_pages(seg);
2378 2311 }
2379 2312
2380 2313 if (error) {
2381 2314 for (seg = AS_SEGFIRST(as); seg != NULL;
2382 2315 seg = AS_SEGNEXT(as, seg)) {
2383 2316
2384 2317 raddr = (caddr_t)((uintptr_t)seg->s_base &
2385 2318 (uintptr_t)PAGEMASK);
2386 2319 npages = seg_pages(seg);
2387 2320 as_segunlock(seg, raddr, attr, mlock_map,
2388 2321 idx, npages);
2389 2322 idx += npages;
2390 2323 }
2391 2324 }
2392 2325
2393 2326 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2394 2327 AS_LOCK_EXIT(as, &as->a_lock);
2395 2328 goto lockerr;
2396 2329 } else if (func == MC_UNLOCKAS) {
2397 2330 mutex_enter(&as->a_contents);
2398 2331 AS_CLRPGLCK(as);
2399 2332 mutex_exit(&as->a_contents);
2400 2333
2401 2334 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2402 2335 error = SEGOP_LOCKOP(seg, seg->s_base,
2403 2336 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2404 2337 if (error != 0)
2405 2338 break;
2406 2339 }
2407 2340
2408 2341 AS_LOCK_EXIT(as, &as->a_lock);
2409 2342 goto lockerr;
2410 2343 }
2411 2344
2412 2345 /*
2413 2346 * Normalize addresses and sizes.
2414 2347 */
2415 2348 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2416 2349 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2417 2350 (size_t)raddr;
2418 2351
2419 2352 if (raddr + rsize < raddr) { /* check for wraparound */
2420 2353 AS_LOCK_EXIT(as, &as->a_lock);
2421 2354 return (ENOMEM);
2422 2355 }
2423 2356
2424 2357 /*
2425 2358 * Get initial segment.
2426 2359 */
2427 2360 if ((seg = as_segat(as, raddr)) == NULL) {
2428 2361 AS_LOCK_EXIT(as, &as->a_lock);
2429 2362 return (ENOMEM);
2430 2363 }
2431 2364
2432 2365 if (func == MC_LOCK) {
2433 2366 mlock_size = BT_BITOUL(btopr(rsize));
2434 2367 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2435 2368 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2436 2369 AS_LOCK_EXIT(as, &as->a_lock);
2437 2370 return (EAGAIN);
2438 2371 }
2439 2372 }
2440 2373
2441 2374 /*
2442 2375 * Loop over all segments. If a hole in the address range is
2443 2376 * discovered, then fail. For each segment, perform the appropriate
2444 2377 * control operation.
2445 2378 */
2446 2379 while (rsize != 0) {
2447 2380
2448 2381 /*
2449 2382 * Make sure there's no hole, calculate the portion
2450 2383 * of the next segment to be operated over.
2451 2384 */
2452 2385 if (raddr >= seg->s_base + seg->s_size) {
2453 2386 seg = AS_SEGNEXT(as, seg);
2454 2387 if (seg == NULL || raddr != seg->s_base) {
2455 2388 if (func == MC_LOCK) {
2456 2389 as_unlockerr(as, attr, mlock_map,
2457 2390 initraddr, initrsize - rsize);
2458 2391 kmem_free(mlock_map,
2459 2392 mlock_size * sizeof (ulong_t));
2460 2393 }
2461 2394 AS_LOCK_EXIT(as, &as->a_lock);
2462 2395 return (ENOMEM);
2463 2396 }
2464 2397 }
2465 2398 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2466 2399 ssize = seg->s_base + seg->s_size - raddr;
2467 2400 else
2468 2401 ssize = rsize;
2469 2402
2470 2403 /*
2471 2404 * Dispatch on specific function.
2472 2405 */
2473 2406 switch (func) {
2474 2407
2475 2408 /*
2476 2409 * Synchronize cached data from mappings with backing
2477 2410 * objects.
2478 2411 */
2479 2412 case MC_SYNC:
2480 2413 if (error = SEGOP_SYNC(seg, raddr, ssize,
2481 2414 attr, (uint_t)arg)) {
2482 2415 AS_LOCK_EXIT(as, &as->a_lock);
2483 2416 return (error);
2484 2417 }
2485 2418 break;
2486 2419
2487 2420 /*
2488 2421 * Lock pages in memory.
2489 2422 */
2490 2423 case MC_LOCK:
2491 2424 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2492 2425 attr, func, mlock_map, pos)) {
2493 2426 as_unlockerr(as, attr, mlock_map, initraddr,
2494 2427 initrsize - rsize + ssize);
2495 2428 kmem_free(mlock_map, mlock_size *
2496 2429 sizeof (ulong_t));
2497 2430 AS_LOCK_EXIT(as, &as->a_lock);
2498 2431 goto lockerr;
2499 2432 }
2500 2433 break;
2501 2434
2502 2435 /*
2503 2436 * Unlock mapped pages.
2504 2437 */
2505 2438 case MC_UNLOCK:
2506 2439 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2507 2440 (ulong_t *)NULL, (size_t)NULL);
2508 2441 break;
2509 2442
2510 2443 /*
2511 2444 * Store VM advise for mapped pages in segment layer.
2512 2445 */
2513 2446 case MC_ADVISE:
2514 2447 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2515 2448
2516 2449 /*
2517 2450 * Check for regular errors and special retry error
2518 2451 */
2519 2452 if (error) {
2520 2453 if (error == IE_RETRY) {
2521 2454 /*
2522 2455 * Need to acquire writers lock, so
2523 2456 * have to drop readers lock and start
2524 2457 * all over again
2525 2458 */
2526 2459 AS_LOCK_EXIT(as, &as->a_lock);
2527 2460 goto retry;
2528 2461 } else if (error == IE_REATTACH) {
2529 2462 /*
2530 2463 * Find segment for current address
2531 2464 * because current segment just got
2532 2465 * split or concatenated
2533 2466 */
2534 2467 seg = as_segat(as, raddr);
2535 2468 if (seg == NULL) {
2536 2469 AS_LOCK_EXIT(as, &as->a_lock);
2537 2470 return (ENOMEM);
2538 2471 }
2539 2472 } else {
2540 2473 /*
2541 2474 * Regular error
2542 2475 */
2543 2476 AS_LOCK_EXIT(as, &as->a_lock);
2544 2477 return (error);
2545 2478 }
2546 2479 }
2547 2480 break;
2548 2481
2549 2482 /*
2550 2483 * Can't happen.
2551 2484 */
2552 2485 default:
2553 2486 panic("as_ctl: bad operation %d", func);
2554 2487 /*NOTREACHED*/
2555 2488 }
2556 2489
2557 2490 rsize -= ssize;
2558 2491 raddr += ssize;
2559 2492 }
2560 2493
2561 2494 if (func == MC_LOCK)
2562 2495 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2563 2496 AS_LOCK_EXIT(as, &as->a_lock);
2564 2497 return (0);
2565 2498 lockerr:
2566 2499
2567 2500 /*
2568 2501 * If the lower levels returned EDEADLK for a segment lockop,
2569 2502 * it means that we should retry the operation. Let's wait
2570 2503 * a bit also to let the deadlock causing condition clear.
2571 2504 * This is part of a gross hack to work around a design flaw
2572 2505 * in the ufs/sds logging code and should go away when the
2573 2506 * logging code is re-designed to fix the problem. See bug
2574 2507 * 4125102 for details of the problem.
2575 2508 */
2576 2509 if (error == EDEADLK) {
2577 2510 delay(deadlk_wait);
2578 2511 error = 0;
2579 2512 goto retry;
2580 2513 }
2581 2514 return (error);
2582 2515 }
2583 2516
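To make the MC_LOCK bitmap sizing concrete: BT_BITOUL(btopr(len)) reserves one bit per page, rounded up to whole ulong_t words. A user-space model assuming 8K pages and 64-bit longs:

	#include <stdio.h>

	#define PAGESHIFT	13			/* 8K pages (assumed) */
	#define NBBY		8
	#define BT_NBIPUL	(sizeof (unsigned long) * NBBY)
	#define btopr(x)	(((x) + (1UL << PAGESHIFT) - 1) >> PAGESHIFT)
	#define BT_BITOUL(n)	(((n) + BT_NBIPUL - 1) / BT_NBIPUL)

	int
	main(void)
	{
		size_t len = 10 * 1024 * 1024;		/* a 10M locked range */
		size_t pages = btopr(len);		/* 1280 pages */
		size_t words = BT_BITOUL(pages);	/* 20 64-bit words */

		printf("%zu pages -> %zu-word lock bitmap\n", pages, words);
		return (0);
	}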
2584 2517 int
2585 2518 fc_decode(faultcode_t fault_err)
2586 2519 {
2587 2520 int error = 0;
2588 2521
2589 2522 switch (FC_CODE(fault_err)) {
2590 2523 case FC_OBJERR:
2591 2524 error = FC_ERRNO(fault_err);
2592 2525 break;
2593 2526 case FC_PROT:
2594 2527 error = EACCES;
2595 2528 break;
2596 2529 default:
2597 2530 error = EFAULT;
2598 2531 break;
2599 2532 }
2600 2533 return (error);
2601 2534 }
2602 2535
2603 2536 /*
2604 2537 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2605 2538 * lists from each segment and copy them to one contiguous shadow list (plist)
2606 2539 * as expected by the caller. Save pointers to per segment shadow lists at
2607 2540 * the tail of plist so that they can be used during as_pageunlock().
2608 2541 */
2609 2542 static int
2610 2543 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2611 2544 caddr_t addr, size_t size, enum seg_rw rw)
2612 2545 {
2613 2546 caddr_t sv_addr = addr;
2614 2547 size_t sv_size = size;
2615 2548 struct seg *sv_seg = seg;
2616 2549 ulong_t segcnt = 1;
2617 2550 ulong_t cnt;
2618 2551 size_t ssize;
2619 2552 pgcnt_t npages = btop(size);
2620 2553 page_t **plist;
2621 2554 page_t **pl;
2622 2555 int error;
2623 2556 caddr_t eaddr;
2624 2557 faultcode_t fault_err = 0;
2625 2558 pgcnt_t pl_off;
2626 2559 extern struct seg_ops segspt_shmops;
2627 2560
2628 2561 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2629 2562 ASSERT(seg != NULL);
2630 2563 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2631 2564 ASSERT(addr + size > seg->s_base + seg->s_size);
2632 2565 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2633 2566 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2634 2567
2635 2568 /*
2636 2569 * Count the number of segments covered by the range we are about to
2637 2570 * lock. The segment count is used to size the shadow list we return
2638 2571 	 * lock. The segment count is used to size the shadow list we return
2639 2572 	 * to the caller.
2640 2573 for (; size != 0; size -= ssize, addr += ssize) {
2641 2574 if (addr >= seg->s_base + seg->s_size) {
2642 2575
2643 2576 seg = AS_SEGNEXT(as, seg);
2644 2577 if (seg == NULL || addr != seg->s_base) {
2645 2578 AS_LOCK_EXIT(as, &as->a_lock);
2646 2579 return (EFAULT);
2647 2580 }
2648 2581 /*
2649 2582 * Do a quick check if subsequent segments
2650 2583 * will most likely support pagelock.
2651 2584 */
2652 2585 if (seg->s_ops == &segvn_ops) {
2653 2586 vnode_t *vp;
2654 2587
2655 2588 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2656 2589 vp != NULL) {
2657 2590 AS_LOCK_EXIT(as, &as->a_lock);
2658 2591 goto slow;
2659 2592 }
2660 2593 } else if (seg->s_ops != &segspt_shmops) {
2661 2594 AS_LOCK_EXIT(as, &as->a_lock);
2662 2595 goto slow;
2663 2596 }
2664 2597 segcnt++;
2665 2598 }
2666 2599 if (addr + size > seg->s_base + seg->s_size) {
2667 2600 ssize = seg->s_base + seg->s_size - addr;
2668 2601 } else {
2669 2602 ssize = size;
2670 2603 }
2671 2604 }
2672 2605 ASSERT(segcnt > 1);
2673 2606
2674 2607 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2675 2608
2676 2609 addr = sv_addr;
2677 2610 size = sv_size;
2678 2611 seg = sv_seg;
2679 2612
2680 2613 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2681 2614 if (addr >= seg->s_base + seg->s_size) {
2682 2615 seg = AS_SEGNEXT(as, seg);
2683 2616 ASSERT(seg != NULL && addr == seg->s_base);
2684 2617 cnt++;
2685 2618 ASSERT(cnt < segcnt);
2686 2619 }
2687 2620 if (addr + size > seg->s_base + seg->s_size) {
2688 2621 ssize = seg->s_base + seg->s_size - addr;
2689 2622 } else {
2690 2623 ssize = size;
2691 2624 }
2692 2625 pl = &plist[npages + cnt];
2693 2626 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2694 2627 L_PAGELOCK, rw);
2695 2628 if (error) {
2696 2629 break;
2697 2630 }
2698 2631 ASSERT(plist[npages + cnt] != NULL);
2699 2632 ASSERT(pl_off + btop(ssize) <= npages);
2700 2633 bcopy(plist[npages + cnt], &plist[pl_off],
2701 2634 btop(ssize) * sizeof (page_t *));
2702 2635 pl_off += btop(ssize);
2703 2636 }
2704 2637
2705 2638 if (size == 0) {
2706 2639 AS_LOCK_EXIT(as, &as->a_lock);
2707 2640 ASSERT(cnt == segcnt - 1);
2708 2641 *ppp = plist;
2709 2642 return (0);
2710 2643 }
2711 2644
2712 2645 /*
2713 2646 	 * One of the pagelock calls failed. The error type is in the error variable.
2714 2647 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2715 2648 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2716 2649 * back to the caller.
2717 2650 */
2718 2651
2719 2652 eaddr = addr;
2720 2653 seg = sv_seg;
2721 2654
2722 2655 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2723 2656 if (addr >= seg->s_base + seg->s_size) {
2724 2657 seg = AS_SEGNEXT(as, seg);
2725 2658 ASSERT(seg != NULL && addr == seg->s_base);
2726 2659 cnt++;
2727 2660 ASSERT(cnt < segcnt);
2728 2661 }
2729 2662 if (eaddr > seg->s_base + seg->s_size) {
2730 2663 ssize = seg->s_base + seg->s_size - addr;
2731 2664 } else {
2732 2665 ssize = eaddr - addr;
2733 2666 }
2734 2667 pl = &plist[npages + cnt];
2735 2668 ASSERT(*pl != NULL);
2736 2669 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2737 2670 L_PAGEUNLOCK, rw);
2738 2671 }
2739 2672
2740 2673 AS_LOCK_EXIT(as, &as->a_lock);
2741 2674
2742 2675 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2743 2676
2744 2677 if (error != ENOTSUP && error != EFAULT) {
2745 2678 return (error);
2746 2679 }
2747 2680
2748 2681 slow:
2749 2682 /*
2750 2683 * If we are here because pagelock failed due to the need to cow fault
2751 2684 * in the pages we want to lock F_SOFTLOCK will do this job and in
2752 2685 	 * in the pages we want to lock, F_SOFTLOCK will do this job, and the
2753 2686 	 * next as_pagelock() call for this address range will
2754 2687 */
2755 2688 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2756 2689 if (fault_err != 0) {
2757 2690 return (fc_decode(fault_err));
2758 2691 }
2759 2692 *ppp = NULL;
2760 2693
2761 2694 return (0);
2762 2695 }
2763 2696
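The shadow list built by as_pagelock_segs() has a fixed layout, sketched here for a range spanning three segments (the count is illustrative):

	plist[0 .. npages-1]	flat page_t * array for the whole range,
				in address order (what the caller sees)
	plist[npages + 0]	segment 0's own shadow-list pointer
	plist[npages + 1]	segment 1's own shadow-list pointer
	plist[npages + 2]	segment 2's own shadow-list pointer

as_pageunlock_segs() later walks these tail entries to hand each segment back its own list before freeing plist.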
2764 2697 /*
2765 2698 * lock pages in a given address space. Return shadow list. If
2766 2699 * the list is NULL, the MMU mapping is also locked.
2767 2700 */
2768 2701 int
2769 2702 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2770 2703 size_t size, enum seg_rw rw)
2771 2704 {
2772 2705 size_t rsize;
2773 2706 caddr_t raddr;
2774 2707 faultcode_t fault_err;
2775 2708 struct seg *seg;
2776 2709 int err;
2777 2710
2778 2711 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2779 2712 "as_pagelock_start: addr %p size %ld", addr, size);
2780 2713
2781 2714 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2782 2715 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2783 2716 (size_t)raddr;
2784 2717
2785 2718 /*
2786 2719 	 * if the request crosses two or more segments, let
2787 2720 	 * as_pagelock_segs() handle it.
2788 2721 */
2789 2722 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2790 2723
2791 2724 seg = as_segat(as, raddr);
2792 2725 if (seg == NULL) {
2793 2726 AS_LOCK_EXIT(as, &as->a_lock);
2794 2727 return (EFAULT);
2795 2728 }
2796 2729 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2797 2730 if (raddr + rsize > seg->s_base + seg->s_size) {
2798 2731 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2799 2732 }
2800 2733 if (raddr + rsize <= raddr) {
2801 2734 AS_LOCK_EXIT(as, &as->a_lock);
2802 2735 return (EFAULT);
2803 2736 }
2804 2737
2805 2738 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2806 2739 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2807 2740
2808 2741 /*
2809 2742 * try to lock pages and pass back shadow list
2810 2743 */
2811 2744 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2812 2745
2813 2746 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2814 2747
2815 2748 AS_LOCK_EXIT(as, &as->a_lock);
2816 2749
2817 2750 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2818 2751 return (err);
2819 2752 }
2820 2753
2821 2754 /*
2822 2755 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2823 2756 * to no pagelock support for this segment or pages need to be cow
2824 2757 	 * faulted in. If a fault is needed, F_SOFTLOCK will do this job for
2825 2758 	 * this as_pagelock() call, and the next as_pagelock() call for the
2826 2759 	 * same address range will hopefully succeed.
2827 2760 */
2828 2761 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2829 2762 if (fault_err != 0) {
2830 2763 return (fc_decode(fault_err));
2831 2764 }
2832 2765 *ppp = NULL;
2833 2766
2834 2767 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2835 2768 return (0);
2836 2769 }
2837 2770
2838 2771 /*
2839 2772 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2840 2773 * lists from the end of plist and call pageunlock interface for each segment.
2841 2774 * Drop as lock and free plist.
2842 2775 */
2843 2776 static void
2844 2777 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2845 2778 struct page **plist, enum seg_rw rw)
2846 2779 {
2847 2780 ulong_t cnt;
2848 2781 caddr_t eaddr = addr + size;
2849 2782 pgcnt_t npages = btop(size);
2850 2783 size_t ssize;
2851 2784 page_t **pl;
2852 2785
2853 2786 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2854 2787 ASSERT(seg != NULL);
2855 2788 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2856 2789 ASSERT(addr + size > seg->s_base + seg->s_size);
2857 2790 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2858 2791 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2859 2792 ASSERT(plist != NULL);
2860 2793
2861 2794 for (cnt = 0; addr < eaddr; addr += ssize) {
2862 2795 if (addr >= seg->s_base + seg->s_size) {
2863 2796 seg = AS_SEGNEXT(as, seg);
2864 2797 ASSERT(seg != NULL && addr == seg->s_base);
2865 2798 cnt++;
2866 2799 }
2867 2800 if (eaddr > seg->s_base + seg->s_size) {
2868 2801 ssize = seg->s_base + seg->s_size - addr;
2869 2802 } else {
2870 2803 ssize = eaddr - addr;
2871 2804 }
2872 2805 pl = &plist[npages + cnt];
2873 2806 ASSERT(*pl != NULL);
2874 2807 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2875 2808 L_PAGEUNLOCK, rw);
2876 2809 }
2877 2810 ASSERT(cnt > 0);
2878 2811 AS_LOCK_EXIT(as, &as->a_lock);
2879 2812
2880 2813 cnt++;
2881 2814 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2882 2815 }
2883 2816
2884 2817 /*
2885 2818 * unlock pages in a given address range
2886 2819 */
2887 2820 void
2888 2821 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2889 2822 enum seg_rw rw)
2890 2823 {
2891 2824 struct seg *seg;
2892 2825 size_t rsize;
2893 2826 caddr_t raddr;
2894 2827
2895 2828 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2896 2829 "as_pageunlock_start: addr %p size %ld", addr, size);
2897 2830
2898 2831 /*
2899 2832 	 * if the shadow list is NULL, as_pagelock
2900 2833 	 * fell back to as_fault
2901 2834 */
2902 2835 if (pp == NULL) {
2903 2836 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2904 2837 return;
2905 2838 }
2906 2839
2907 2840 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2908 2841 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2909 2842 (size_t)raddr;
2910 2843
2911 2844 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2912 2845 seg = as_segat(as, raddr);
2913 2846 ASSERT(seg != NULL);
2914 2847
2915 2848 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2916 2849 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2917 2850
2918 2851 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2919 2852 if (raddr + rsize <= seg->s_base + seg->s_size) {
2920 2853 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2921 2854 } else {
2922 2855 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2923 2856 return;
2924 2857 }
2925 2858 AS_LOCK_EXIT(as, &as->a_lock);
2926 2859 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2927 2860 }
2928 2861
2929 2862 int
2930 2863 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2931 2864 boolean_t wait)
2932 2865 {
2933 2866 struct seg *seg;
2934 2867 size_t ssize;
2935 2868 caddr_t raddr; /* rounded down addr */
2936 2869 size_t rsize; /* rounded up size */
2937 2870 int error = 0;
2938 2871 size_t pgsz = page_get_pagesize(szc);
2939 2872
2940 2873 setpgsz_top:
2941 2874 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2942 2875 return (EINVAL);
2943 2876 }
2944 2877
2945 2878 raddr = addr;
2946 2879 rsize = size;
2947 2880
2948 2881 if (raddr + rsize < raddr) /* check for wraparound */
2949 2882 return (ENOMEM);
2950 2883
2951 2884 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2952 2885 as_clearwatchprot(as, raddr, rsize);
2953 2886 seg = as_segat(as, raddr);
2954 2887 if (seg == NULL) {
2955 2888 as_setwatch(as);
2956 2889 AS_LOCK_EXIT(as, &as->a_lock);
2957 2890 return (ENOMEM);
2958 2891 }
2959 2892
2960 2893 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2961 2894 if (raddr >= seg->s_base + seg->s_size) {
2962 2895 seg = AS_SEGNEXT(as, seg);
2963 2896 if (seg == NULL || raddr != seg->s_base) {
2964 2897 error = ENOMEM;
2965 2898 break;
2966 2899 }
2967 2900 }
2968 2901 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2969 2902 ssize = seg->s_base + seg->s_size - raddr;
2970 2903 } else {
2971 2904 ssize = rsize;
2972 2905 }
2973 2906
2974 2907 retry:
2975 2908 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2976 2909
2977 2910 if (error == IE_NOMEM) {
2978 2911 error = EAGAIN;
2979 2912 break;
2980 2913 }
2981 2914
2982 2915 if (error == IE_RETRY) {
2983 2916 AS_LOCK_EXIT(as, &as->a_lock);
2984 2917 goto setpgsz_top;
2985 2918 }
2986 2919
2987 2920 if (error == ENOTSUP) {
2988 2921 error = EINVAL;
2989 2922 break;
2990 2923 }
2991 2924
2992 2925 if (wait && (error == EAGAIN)) {
2993 2926 /*
2994 2927 * Memory is currently locked. It must be unlocked
2995 2928 * before this operation can succeed through a retry.
2996 2929 * The possible reasons for locked memory and
2997 2930 * corresponding strategies for unlocking are:
2998 2931 * (1) Normal I/O
2999 2932 * wait for a signal that the I/O operation
3000 2933 * has completed and the memory is unlocked.
3001 2934 * (2) Asynchronous I/O
3002 2935 * The aio subsystem does not unlock pages when
3003 2936 * the I/O is completed. Those pages are unlocked
3004 2937 * when the application calls aiowait/aioerror.
3005 2938 * So, to prevent blocking forever, cv_broadcast()
3006 2939 * is done to wake up aio_cleanup_thread.
3007 2940 * Subsequently, segvn_reclaim will be called, and
3008 2941 * that will do AS_CLRUNMAPWAIT() and wake us up.
3009 2942 * (3) Long term page locking:
3010 2943 * This is not relevant for as_setpagesize()
3011 2944 * because we cannot change the page size for
3012 2945 * driver memory. The attempt to do so will
3013 2946 * fail with a different error than EAGAIN so
3014 2947 * there's no need to trigger as callbacks like
3015 2948 * as_unmap, as_setprot or as_free would do.
3016 2949 */
3017 2950 mutex_enter(&as->a_contents);
3018 2951 if (!AS_ISNOUNMAPWAIT(as)) {
3019 2952 if (AS_ISUNMAPWAIT(as) == 0) {
3020 2953 cv_broadcast(&as->a_cv);
3021 2954 }
3022 2955 AS_SETUNMAPWAIT(as);
3023 2956 AS_LOCK_EXIT(as, &as->a_lock);
3024 2957 while (AS_ISUNMAPWAIT(as)) {
3025 2958 cv_wait(&as->a_cv, &as->a_contents);
3026 2959 }
3027 2960 } else {
3028 2961 /*
3029 2962 * We may have raced with
3030 2963 * segvn_reclaim()/segspt_reclaim(). In this
3031 2964 * case clean nounmapwait flag and retry since
3032 2965 * softlockcnt in this segment may be already
3033 2966 * 0. We don't drop as writer lock so our
3034 2967 * number of retries without sleeping should
3035 2968 * be very small. See segvn_reclaim() for
3036 2969 * more comments.
3037 2970 */
3038 2971 AS_CLRNOUNMAPWAIT(as);
3039 2972 mutex_exit(&as->a_contents);
3040 2973 goto retry;
3041 2974 }
3042 2975 mutex_exit(&as->a_contents);
3043 2976 goto setpgsz_top;
3044 2977 } else if (error != 0) {
3045 2978 break;
3046 2979 }
3047 2980 }
3048 2981 as_setwatch(as);
3049 2982 AS_LOCK_EXIT(as, &as->a_lock);
3050 2983 return (error);
3051 2984 }
3052 2985
3053 2986 /*
3054 2987 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3055 2988 * in its chunk where s_szc is less than the szc we want to set.
3056 2989 */
3057 2990 static int
3058 2991 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3059 2992 int *retry)
3060 2993 {
3061 2994 struct seg *seg;
3062 2995 size_t ssize;
3063 2996 int error;
3064 2997
3065 2998 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3066 2999
3067 3000 seg = as_segat(as, raddr);
3068 3001 if (seg == NULL) {
3069 3002 panic("as_iset3_default_lpsize: no seg");
3070 3003 }
3071 3004
3072 3005 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3073 3006 if (raddr >= seg->s_base + seg->s_size) {
3074 3007 seg = AS_SEGNEXT(as, seg);
3075 3008 if (seg == NULL || raddr != seg->s_base) {
3076 3009 panic("as_iset3_default_lpsize: as changed");
3077 3010 }
3078 3011 }
3079 3012 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3080 3013 ssize = seg->s_base + seg->s_size - raddr;
3081 3014 } else {
3082 3015 ssize = rsize;
3083 3016 }
3084 3017
3085 3018 if (szc > seg->s_szc) {
3086 3019 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3087 3020 /* Only retry on EINVAL segments that have no vnode. */
3088 3021 if (error == EINVAL) {
3089 3022 vnode_t *vp = NULL;
3090 3023 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3091 3024 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3092 3025 vp == NULL)) {
3093 3026 *retry = 1;
3094 3027 } else {
3095 3028 *retry = 0;
3096 3029 }
3097 3030 }
3098 3031 if (error) {
3099 3032 return (error);
3100 3033 }
3101 3034 }
3102 3035 }
3103 3036 return (0);
3104 3037 }
3105 3038
3106 3039 /*
3107 3040 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3108 3041 * pagesize on each segment in its range, but if any fails with EINVAL,
3109 3042 * then it reduces the pagesizes to the next size in the bitmap and
3110 3043 	 * retries as_iset3_default_lpsize(). The code retries smaller
3111 3044 	 * allowed sizes on EINVAL because (a) the anon offset may not
3112 3045 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3113 3046 * with) to pass to map_pgszcvec().
3114 3047 */
3115 3048 static int
3116 3049 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3117 3050 uint_t szcvec)
3118 3051 {
3119 3052 int error;
3120 3053 int retry;
3121 3054
3122 3055 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3123 3056
3124 3057 for (;;) {
3125 3058 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3126 3059 if (error == EINVAL && retry) {
3127 3060 szcvec &= ~(1 << szc);
3128 3061 if (szcvec <= 1) {
3129 3062 return (EINVAL);
3130 3063 }
3131 3064 szc = highbit(szcvec) - 1;
3132 3065 } else {
3133 3066 return (error);
3134 3067 }
3135 3068 }
3136 3069 }
3137 3070
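The EINVAL fallback above can be modeled in isolation: clear the failing size code from the bitmap and retry with the largest remaining one. In this sketch, try_set_pagesize() is a hypothetical stand-in for as_iset3_default_lpsize() and highbit() is modeled with a GCC builtin; the real code additionally retries only when the failing segment is MAP_SHARED with no vnode:

	static int
	set_lpsize_with_fallback(unsigned szcvec, unsigned szc)
	{
		int error;

		for (;;) {
			error = try_set_pagesize(szc);	/* hypothetical helper */
			if (error != EINVAL)
				return (error);		/* success or a hard error */
			szcvec &= ~(1u << szc);
			if (szcvec <= 1)
				return (EINVAL);	/* no smaller size left */
			szc = 31 - __builtin_clz(szcvec); /* highbit(szcvec) - 1 */
		}
	}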
3138 3071 /*
3139 3072 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3140 3073 * segments have a smaller szc than we want to set. For each such area,
3141 3074  * it calls as_iset2_default_lpsize().
3142 3075 */
3143 3076 static int
3144 3077 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3145 3078 uint_t szcvec)
3146 3079 {
3147 3080 struct seg *seg;
3148 3081 size_t ssize;
3149 3082 caddr_t setaddr = raddr;
3150 3083 size_t setsize = 0;
3151 3084 int set;
3152 3085 int error;
3153 3086
3154 3087 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3155 3088
3156 3089 seg = as_segat(as, raddr);
3157 3090 if (seg == NULL) {
3158 3091 panic("as_iset1_default_lpsize: no seg");
3159 3092 }
3160 3093 if (seg->s_szc < szc) {
3161 3094 set = 1;
3162 3095 } else {
3163 3096 set = 0;
3164 3097 }
3165 3098
3166 3099 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3167 3100 if (raddr >= seg->s_base + seg->s_size) {
3168 3101 seg = AS_SEGNEXT(as, seg);
3169 3102 if (seg == NULL || raddr != seg->s_base) {
3170 3103 panic("as_iset1_default_lpsize: as changed");
3171 3104 }
3172 3105 if (seg->s_szc >= szc && set) {
3173 3106 ASSERT(setsize != 0);
3174 3107 error = as_iset2_default_lpsize(as,
3175 3108 setaddr, setsize, szc, szcvec);
3176 3109 if (error) {
3177 3110 return (error);
3178 3111 }
3179 3112 set = 0;
3180 3113 } else if (seg->s_szc < szc && !set) {
3181 3114 setaddr = raddr;
3182 3115 setsize = 0;
3183 3116 set = 1;
3184 3117 }
3185 3118 }
3186 3119 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3187 3120 ssize = seg->s_base + seg->s_size - raddr;
3188 3121 } else {
3189 3122 ssize = rsize;
3190 3123 }
3191 3124 }
3192 3125 error = 0;
3193 3126 if (set) {
3194 3127 ASSERT(setsize != 0);
3195 3128 error = as_iset2_default_lpsize(as, setaddr, setsize,
3196 3129 szc, szcvec);
3197 3130 }
3198 3131 return (error);
3199 3132 }
3200 3133
3201 3134 /*
3202 3135 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3203 3136 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3204 3137 * chunk to as_iset1_default_lpsize().
3205 3138 */
3206 3139 static int
3207 3140 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3208 3141 int type)
3209 3142 {
3210 3143 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3211 3144 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3212 3145 flags, rtype, 1);
3213 3146 uint_t szc;
3214 3147 uint_t nszc;
3215 3148 int error;
3216 3149 caddr_t a;
3217 3150 caddr_t eaddr;
3218 3151 size_t segsize;
3219 3152 size_t pgsz;
3220 3153 uint_t save_szcvec;
3221 3154
3222 3155 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3223 3156 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3224 3157 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3225 3158
3226 3159 szcvec &= ~1;
3227 3160 if (szcvec <= 1) { /* skip if base page size */
3228 3161 return (0);
3229 3162 }
3230 3163
3231 3164 /* Get the pagesize of the first larger page size. */
3232 3165 szc = lowbit(szcvec) - 1;
3233 3166 pgsz = page_get_pagesize(szc);
3234 3167 eaddr = addr + size;
3235 3168 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3236 3169 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3237 3170
3238 3171 save_szcvec = szcvec;
3239 3172 szcvec >>= (szc + 1);
3240 3173 nszc = szc;
3241 3174 while (szcvec) {
3242 3175 if ((szcvec & 0x1) == 0) {
3243 3176 nszc++;
3244 3177 szcvec >>= 1;
3245 3178 continue;
3246 3179 }
3247 3180 nszc++;
3248 3181 pgsz = page_get_pagesize(nszc);
3249 3182 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3250 3183 if (a != addr) {
3251 3184 ASSERT(szc > 0);
3252 3185 ASSERT(a < eaddr);
3253 3186 segsize = a - addr;
3254 3187 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3255 3188 save_szcvec);
3256 3189 if (error) {
3257 3190 return (error);
3258 3191 }
3259 3192 addr = a;
3260 3193 }
3261 3194 szc = nszc;
3262 3195 szcvec >>= 1;
3263 3196 }
3264 3197
3265 3198 ASSERT(addr < eaddr);
3266 3199 szcvec = save_szcvec;
3267 3200 while (szcvec) {
3268 3201 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3269 3202 ASSERT(a >= addr);
3270 3203 if (a != addr) {
3271 3204 ASSERT(szc > 0);
3272 3205 segsize = a - addr;
3273 3206 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3274 3207 save_szcvec);
3275 3208 if (error) {
3276 3209 return (error);
3277 3210 }
3278 3211 addr = a;
3279 3212 }
3280 3213 szcvec &= ~(1 << szc);
3281 3214 if (szcvec) {
3282 3215 szc = highbit(szcvec) - 1;
3283 3216 pgsz = page_get_pagesize(szc);
3284 3217 }
3285 3218 }
3286 3219 ASSERT(addr == eaddr);
3287 3220
3288 3221 return (0);
3289 3222 }
3290 3223
3291 3224 /*
3292 3225 * Set the default large page size for the range. Called via memcntl with
3293 3226 * page size set to 0. as_set_default_lpsize breaks the range down into
3294 3227  * chunks with the same type/flags, ignores non-segvn segments, and passes
3295 3228 * each chunk to as_iset_default_lpsize().
3296 3229 */
3297 3230 int
3298 3231 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3299 3232 {
3300 3233 struct seg *seg;
3301 3234 caddr_t raddr;
3302 3235 size_t rsize;
3303 3236 size_t ssize;
3304 3237 int rtype, rflags;
3305 3238 int stype, sflags;
3306 3239 int error;
3307 3240 caddr_t setaddr;
3308 3241 size_t setsize;
3309 3242 int segvn;
3310 3243
3311 3244 if (size == 0)
3312 3245 return (0);
3313 3246
3314 3247 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3315 3248 again:
3316 3249 error = 0;
3317 3250
3318 3251 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3319 3252 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3320 3253 (size_t)raddr;
3321 3254
3322 3255 if (raddr + rsize < raddr) { /* check for wraparound */
3323 3256 AS_LOCK_EXIT(as, &as->a_lock);
3324 3257 return (ENOMEM);
3325 3258 }
3326 3259 as_clearwatchprot(as, raddr, rsize);
3327 3260 seg = as_segat(as, raddr);
3328 3261 if (seg == NULL) {
3329 3262 as_setwatch(as);
3330 3263 AS_LOCK_EXIT(as, &as->a_lock);
3331 3264 return (ENOMEM);
3332 3265 }
3333 3266 if (seg->s_ops == &segvn_ops) {
3334 3267 rtype = SEGOP_GETTYPE(seg, addr);
3335 3268 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3336 3269 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3337 3270 segvn = 1;
3338 3271 } else {
3339 3272 segvn = 0;
3340 3273 }
3341 3274 setaddr = raddr;
3342 3275 setsize = 0;
3343 3276
3344 3277 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3345 3278 if (raddr >= (seg->s_base + seg->s_size)) {
3346 3279 seg = AS_SEGNEXT(as, seg);
3347 3280 if (seg == NULL || raddr != seg->s_base) {
3348 3281 error = ENOMEM;
3349 3282 break;
3350 3283 }
3351 3284 if (seg->s_ops == &segvn_ops) {
3352 3285 stype = SEGOP_GETTYPE(seg, raddr);
3353 3286 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3354 3287 stype &= (MAP_SHARED | MAP_PRIVATE);
3355 3288 if (segvn && (rflags != sflags ||
3356 3289 rtype != stype)) {
3357 3290 /*
3358 3291 * The next segment is also segvn but
3359 3292 * has different flags and/or type.
3360 3293 */
3361 3294 ASSERT(setsize != 0);
3362 3295 error = as_iset_default_lpsize(as,
3363 3296 setaddr, setsize, rflags, rtype);
3364 3297 if (error) {
3365 3298 break;
3366 3299 }
3367 3300 rflags = sflags;
3368 3301 rtype = stype;
3369 3302 setaddr = raddr;
3370 3303 setsize = 0;
3371 3304 } else if (!segvn) {
3372 3305 rflags = sflags;
3373 3306 rtype = stype;
3374 3307 setaddr = raddr;
3375 3308 setsize = 0;
3376 3309 segvn = 1;
3377 3310 }
3378 3311 } else if (segvn) {
3379 3312 /* The next segment is not segvn. */
3380 3313 ASSERT(setsize != 0);
3381 3314 error = as_iset_default_lpsize(as,
3382 3315 setaddr, setsize, rflags, rtype);
3383 3316 if (error) {
3384 3317 break;
3385 3318 }
3386 3319 segvn = 0;
3387 3320 }
3388 3321 }
3389 3322 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3390 3323 ssize = seg->s_base + seg->s_size - raddr;
3391 3324 } else {
3392 3325 ssize = rsize;
3393 3326 }
3394 3327 }
3395 3328 if (error == 0 && segvn) {
3396 3329 /* The last chunk when rsize == 0. */
3397 3330 ASSERT(setsize != 0);
3398 3331 error = as_iset_default_lpsize(as, setaddr, setsize,
3399 3332 rflags, rtype);
3400 3333 }
3401 3334
3402 3335 if (error == IE_RETRY) {
3403 3336 goto again;
3404 3337 } else if (error == IE_NOMEM) {
3405 3338 error = EAGAIN;
3406 3339 } else if (error == ENOTSUP) {
3407 3340 error = EINVAL;
3408 3341 } else if (error == EAGAIN) {
3409 3342 mutex_enter(&as->a_contents);
3410 3343 if (!AS_ISNOUNMAPWAIT(as)) {
3411 3344 if (AS_ISUNMAPWAIT(as) == 0) {
3412 3345 cv_broadcast(&as->a_cv);
3413 3346 }
3414 3347 AS_SETUNMAPWAIT(as);
3415 3348 AS_LOCK_EXIT(as, &as->a_lock);
3416 3349 while (AS_ISUNMAPWAIT(as)) {
3417 3350 cv_wait(&as->a_cv, &as->a_contents);
3418 3351 }
3419 3352 mutex_exit(&as->a_contents);
3420 3353 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3421 3354 } else {
3422 3355 /*
3423 3356 * We may have raced with
3424 3357 * segvn_reclaim()/segspt_reclaim(). In this case
3425 3358 * clean nounmapwait flag and retry since softlockcnt
3426 3359 * in this segment may be already 0. We don't drop as
3427 3360 * writer lock so our number of retries without
3428 3361 * sleeping should be very small. See segvn_reclaim()
3429 3362 * for more comments.
3430 3363 */
3431 3364 AS_CLRNOUNMAPWAIT(as);
3432 3365 mutex_exit(&as->a_contents);
3433 3366 }
3434 3367 goto again;
3435 3368 }
3436 3369
3437 3370 as_setwatch(as);
3438 3371 AS_LOCK_EXIT(as, &as->a_lock);
3439 3372 return (error);
3440 3373 }
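
/*
 * Illustrative sketch: a hypothetical caller of as_set_default_lpsize().
 * The wrapper name is invented for illustration; the function above takes
 * as->a_lock as writer itself, so it must be entered with the lock dropped.
 */
static int
set_default_lpsize_example(struct as *as, caddr_t addr, size_t len)
{
	/*
	 * Possible return values, per the code above:
	 *   0      - policy applied to every segvn chunk in the range
	 *   ENOMEM - the range wraps around or contains a hole
	 *   EINVAL - a segment rejected the page size (mapped from ENOTSUP)
	 *   EAGAIN - softlock contention persisted (mapped from IE_NOMEM)
	 */
	return (as_set_default_lpsize(as, addr, len));
}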
3441 3374
3442 3375 /*
3443 3376 * Setup all of the uninitialized watched pages that we can.
3444 3377 */
3445 3378 void
3446 3379 as_setwatch(struct as *as)
3447 3380 {
3448 3381 struct watched_page *pwp;
3449 3382 struct seg *seg;
3450 3383 caddr_t vaddr;
3451 3384 uint_t prot;
3452 3385 int err, retrycnt;
3453 3386
3454 3387 if (avl_numnodes(&as->a_wpage) == 0)
3455 3388 return;
3456 3389
3457 3390 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3458 3391
3459 3392 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3460 3393 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3461 3394 retrycnt = 0;
3462 3395 retry:
3463 3396 vaddr = pwp->wp_vaddr;
3464 3397 if (pwp->wp_oprot != 0 || /* already set up */
3465 3398 (seg = as_segat(as, vaddr)) == NULL ||
3466 3399 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3467 3400 continue;
3468 3401
3469 3402 pwp->wp_oprot = prot;
3470 3403 if (pwp->wp_read)
3471 3404 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3472 3405 if (pwp->wp_write)
3473 3406 prot &= ~PROT_WRITE;
3474 3407 if (pwp->wp_exec)
3475 3408 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3476 3409 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3477 3410 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3478 3411 if (err == IE_RETRY) {
3479 3412 pwp->wp_oprot = 0;
3480 3413 ASSERT(retrycnt == 0);
3481 3414 retrycnt++;
3482 3415 goto retry;
3483 3416 }
3484 3417 }
3485 3418 pwp->wp_prot = prot;
3486 3419 }
3487 3420 }
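
/*
 * Illustrative sketch: the per-page protection arithmetic applied by
 * as_setwatch() above, pulled into a hypothetical helper.  A read or
 * execute watchpoint strips all access so that any reference to the
 * page faults and can be reported; a write watchpoint only needs
 * PROT_WRITE removed.
 */
static uint_t
watch_prot_example(uint_t oprot, int wp_read, int wp_write, int wp_exec)
{
	uint_t prot = oprot;

	if (wp_read)
		prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
	if (wp_write)
		prot &= ~PROT_WRITE;
	if (wp_exec)
		prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
	return (prot);
}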
3488 3421
3489 3422 /*
3490 3423 * Clear all of the watched pages in the address space.
3491 3424 */
3492 3425 void
3493 3426 as_clearwatch(struct as *as)
3494 3427 {
3495 3428 struct watched_page *pwp;
3496 3429 struct seg *seg;
3497 3430 caddr_t vaddr;
3498 3431 uint_t prot;
3499 3432 int err, retrycnt;
3500 3433
3501 3434 if (avl_numnodes(&as->a_wpage) == 0)
3502 3435 return;
3503 3436
3504 3437 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3505 3438
3506 3439 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3507 3440 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3508 3441 retrycnt = 0;
3509 3442 retry:
3510 3443 vaddr = pwp->wp_vaddr;
3511 3444 if (pwp->wp_oprot == 0 || /* not set up */
3512 3445 (seg = as_segat(as, vaddr)) == NULL)
3513 3446 continue;
3514 3447
3515 3448 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3516 3449 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3517 3450 if (err == IE_RETRY) {
3518 3451 ASSERT(retrycnt == 0);
3519 3452 retrycnt++;
3520 3453 goto retry;
3521 3454 }
3522 3455 }
3523 3456 pwp->wp_oprot = 0;
3524 3457 pwp->wp_prot = 0;
3525 3458 }
3526 3459 }
3527 3460
3528 3461 /*
3529 3462 * Force a new setup for all the watched pages in the range.
3530 3463 */
3531 3464 static void
3532 3465 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3533 3466 {
3534 3467 struct watched_page *pwp;
3535 3468 struct watched_page tpw;
3536 3469 caddr_t eaddr = addr + size;
3537 3470 caddr_t vaddr;
3538 3471 struct seg *seg;
3539 3472 int err, retrycnt;
3540 3473 uint_t wprot;
3541 3474 avl_index_t where;
3542 3475
3543 3476 if (avl_numnodes(&as->a_wpage) == 0)
3544 3477 return;
3545 3478
3546 3479 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3547 3480
3548 3481 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3549 3482 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3550 3483 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3551 3484
3552 3485 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3553 3486 retrycnt = 0;
3554 3487 vaddr = pwp->wp_vaddr;
3555 3488
3556 3489 wprot = prot;
3557 3490 if (pwp->wp_read)
3558 3491 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3559 3492 if (pwp->wp_write)
3560 3493 wprot &= ~PROT_WRITE;
3561 3494 if (pwp->wp_exec)
3562 3495 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3563 3496 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3564 3497 retry:
3565 3498 seg = as_segat(as, vaddr);
3566 3499 if (seg == NULL) {
3567 3500 panic("as_setwatchprot: no seg");
3568 3501 /*NOTREACHED*/
3569 3502 }
3570 3503 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3571 3504 if (err == IE_RETRY) {
3572 3505 ASSERT(retrycnt == 0);
3573 3506 retrycnt++;
3574 3507 goto retry;
3575 3508 }
3576 3509 }
3577 3510 pwp->wp_oprot = prot;
3578 3511 pwp->wp_prot = wprot;
3579 3512
3580 3513 pwp = AVL_NEXT(&as->a_wpage, pwp);
3581 3514 }
3582 3515 }
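
/*
 * Illustrative sketch: the AVL range-walk idiom shared by
 * as_setwatchprot() above and as_clearwatchprot() below.  avl_find()
 * looks up the page-aligned start of the range exactly; when no watched
 * page sits at that address, avl_nearest(..., AVL_AFTER) returns the
 * first one past it, and AVL_NEXT() then advances in address order
 * until the walk leaves the range.  The function name is hypothetical.
 */
static void
wpage_walk_example(struct as *as, caddr_t addr, size_t size)
{
	struct watched_page *pwp, tpw;
	caddr_t eaddr = addr + size;
	avl_index_t where;

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
		/* visit pwp here */
		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
}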
3583 3516
3584 3517 /*
3585 3518 * Clear all of the watched pages in the range.
3586 3519 */
3587 3520 static void
3588 3521 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3589 3522 {
3590 3523 caddr_t eaddr = addr + size;
3591 3524 struct watched_page *pwp;
3592 3525 struct watched_page tpw;
3593 3526 uint_t prot;
3594 3527 struct seg *seg;
3595 3528 int err, retrycnt;
3596 3529 avl_index_t where;
3597 3530
3598 3531 if (avl_numnodes(&as->a_wpage) == 0)
3599 3532 return;
3600 3533
3601 3534 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3602 3535 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3603 3536 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3604 3537
3605 3538 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3606 3539
3607 3540 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3608 3541
3609 3542 if ((prot = pwp->wp_oprot) != 0) {
3610 3543 retrycnt = 0;
3611 3544
3612 3545 if (prot != pwp->wp_prot) {
3613 3546 retry:
3614 3547 seg = as_segat(as, pwp->wp_vaddr);
3615 3548 if (seg == NULL)
3616 3549 continue;
3617 3550 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3618 3551 PAGESIZE, prot);
3619 3552 if (err == IE_RETRY) {
3620 3553 ASSERT(retrycnt == 0);
3621 3554 retrycnt++;
3622 3555 goto retry;
3623 3556
3624 3557 }
3625 3558 }
3626 3559 pwp->wp_oprot = 0;
3627 3560 pwp->wp_prot = 0;
3628 3561 }
3629 3562
3630 3563 pwp = AVL_NEXT(&as->a_wpage, pwp);
3631 3564 }
3632 3565 }
3633 3566
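/*
 * Post siginfo to every process whose address space is "as".  p_as is
 * tested twice: once under pidlock to find candidates cheaply, and
 * again under p_lock, since a process can acquire a new address space
 * (e.g. across exec) between the two checks.  sigaddq() is called with
 * KM_NOSLEEP because both locks are held and we must not block.
 */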
3634 3567 void
3635 3568 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3636 3569 {
3637 3570 struct proc *p;
3638 3571
3639 3572 mutex_enter(&pidlock);
3640 3573 for (p = practive; p; p = p->p_next) {
3641 3574 if (p->p_as == as) {
3642 3575 mutex_enter(&p->p_lock);
3643 3576 if (p->p_as == as)
3644 3577 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3645 3578 mutex_exit(&p->p_lock);
3646 3579 }
3647 3580 }
3648 3581 mutex_exit(&pidlock);
3649 3582 }
3650 3583
3651 3584 /*
3652 3585 * return memory object ID
3653 3586 */
3654 3587 int
3655 3588 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3656 3589 {
3657 3590 struct seg *seg;
3658 3591 int sts;
3659 3592
3660 3593 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3661 3594 seg = as_segat(as, addr);
3662 3595 if (seg == NULL) {
3663 3596 AS_LOCK_EXIT(as, &as->a_lock);
3664 3597 return (EFAULT);
3665 3598 }
3666 3599 /*
3667 3600 * catch old drivers which may not support getmemid
3668 3601 */
3669 3602 if (seg->s_ops->getmemid == NULL) {
3670 3603 AS_LOCK_EXIT(as, &as->a_lock);
3671 3604 return (ENODEV);
3672 3605 }
3673 3606
3674 3607 sts = SEGOP_GETMEMID(seg, addr, memidp);
3675 3608
3676 3609 AS_LOCK_EXIT(as, &as->a_lock);
3677 3610 return (sts);
3678 3611 }
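
/*
 * Illustrative sketch: a hypothetical driver-side caller of
 * as_getmemid().  EFAULT means no segment maps the address at all,
 * while ENODEV means the segment's driver predates the getmemid
 * entry point.
 */
static int
getmemid_example(struct as *as, caddr_t uaddr, memid_t *midp)
{
	int err;

	if ((err = as_getmemid(as, uaddr, midp)) != 0)
		return (err);	/* EFAULT, ENODEV, or a seg driver error */

	/* midp now identifies the backing memory object page */
	return (0);
}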
(1459 lines elided)