use-NULL-getmemid-segop-as-a-shorthand-for-ENODEV Wdiff usr/src/uts/common/vm/vm_as.c

Print this page

use NULL getmemid segop as a shorthand for ENODEV
Instead of forcing every segment driver to implement a dummy function to
return (hopefully) ENODEV, handle NULL getmemid segop function pointer as
"return ENODEV" shorthand.

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/vm/vm_as.c
          +++ new/usr/src/uts/common/vm/vm_as.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   * Copyright 2015, Joyent, Inc.  All rights reserved.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28   28  /*        All Rights Reserved   */
  29   29  
  30   30  /*
  31   31   * University Copyright- Copyright (c) 1982, 1986, 1988
  32   32   * The Regents of the University of California
  33   33   * All Rights Reserved
  34   34   *
  35   35   * University Acknowledgment- Portions of this document are derived from
  36   36   * software developed by the University of California, Berkeley, and its
  37   37   * contributors.
  38   38   */
  39   39  
  40   40  /*
  41   41   * VM - address spaces.
  42   42   */
  43   43  
  44   44  #include <sys/types.h>
  45   45  #include <sys/t_lock.h>
  46   46  #include <sys/param.h>
  47   47  #include <sys/errno.h>
  48   48  #include <sys/systm.h>
  49   49  #include <sys/mman.h>
  50   50  #include <sys/sysmacros.h>
  51   51  #include <sys/cpuvar.h>
  52   52  #include <sys/sysinfo.h>
  53   53  #include <sys/kmem.h>
  54   54  #include <sys/vnode.h>
  55   55  #include <sys/vmsystm.h>
  56   56  #include <sys/cmn_err.h>
  57   57  #include <sys/debug.h>
  58   58  #include <sys/tnf_probe.h>
  59   59  #include <sys/vtrace.h>
  60   60  
  61   61  #include <vm/hat.h>
  62   62  #include <vm/as.h>
  63   63  #include <vm/seg.h>
  64   64  #include <vm/seg_vn.h>
  65   65  #include <vm/seg_dev.h>
  66   66  #include <vm/seg_kmem.h>
  67   67  #include <vm/seg_map.h>
  68   68  #include <vm/seg_spt.h>
  69   69  #include <vm/page.h>
  70   70  
  71   71  clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
  72   72  
  73   73  static struct kmem_cache *as_cache;
  74   74  
  75   75  static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
  76   76  static void as_clearwatchprot(struct as *, caddr_t, size_t);
  77   77  int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
  78   78  
  79   79  
  80   80  /*
  81   81   * Verifying the segment lists is very time-consuming; it may not be
  82   82   * desirable always to define VERIFY_SEGLIST when DEBUG is set.
  83   83   */
  84   84  #ifdef DEBUG
  85   85  #define VERIFY_SEGLIST
  86   86  int do_as_verify = 0;
  87   87  #endif
  88   88  
  89   89  /*
  90   90   * Allocate a new callback data structure entry and fill in the events of
  91   91   * interest, the address range of interest, and the callback argument.
  92   92   * Link the entry on the as->a_callbacks list. A callback entry for the
  93   93   * entire address space may be specified with vaddr = 0 and size = -1.
  94   94   *
  95   95   * CALLERS RESPONSIBILITY: If not calling from within the process context for
  96   96   * the specified as, the caller must guarantee persistence of the specified as
  97   97   * for the duration of this function (eg. pages being locked within the as
  98   98   * will guarantee persistence).
  99   99   */
 100  100  int
 101  101  as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
 102  102                  caddr_t vaddr, size_t size, int sleepflag)
 103  103  {
 104  104          struct as_callback      *current_head, *cb;
 105  105          caddr_t                 saddr;
 106  106          size_t                  rsize;
 107  107  
 108  108          /* callback function and an event are mandatory */
 109  109          if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
 110  110                  return (EINVAL);
 111  111  
 112  112          /* Adding a callback after as_free has been called is not allowed */
 113  113          if (as == &kas)
 114  114                  return (ENOMEM);
 115  115  
 116  116          /*
 117  117           * vaddr = 0 and size = -1 is used to indicate that the callback range
 118  118           * is the entire address space so no rounding is done in that case.
 119  119           */
 120  120          if (size != -1) {
 121  121                  saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
 122  122                  rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
 123  123                      (size_t)saddr;
 124  124                  /* check for wraparound */
 125  125                  if (saddr + rsize < saddr)
 126  126                          return (ENOMEM);
 127  127          } else {
 128  128                  if (vaddr != 0)
 129  129                          return (EINVAL);
 130  130                  saddr = vaddr;
 131  131                  rsize = size;
 132  132          }
 133  133  
 134  134          /* Allocate and initialize a callback entry */
 135  135          cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
 136  136          if (cb == NULL)
 137  137                  return (EAGAIN);
 138  138  
 139  139          cb->ascb_func = cb_func;
 140  140          cb->ascb_arg = arg;
 141  141          cb->ascb_events = events;
 142  142          cb->ascb_saddr = saddr;
 143  143          cb->ascb_len = rsize;
 144  144  
 145  145          /* Add the entry to the list */
 146  146          mutex_enter(&as->a_contents);
 147  147          current_head = as->a_callbacks;
 148  148          as->a_callbacks = cb;
 149  149          cb->ascb_next = current_head;
 150  150  
 151  151          /*
 152  152           * The call to this function may lose in a race with
 153  153           * a pertinent event - eg. a thread does long term memory locking
 154  154           * but before the callback is added another thread executes as_unmap.
 155  155           * A broadcast here resolves that.
 156  156           */
 157  157          if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
 158  158                  AS_CLRUNMAPWAIT(as);
 159  159                  cv_broadcast(&as->a_cv);
 160  160          }
 161  161  
 162  162          mutex_exit(&as->a_contents);
 163  163          return (0);
 164  164  }
 165  165  
 166  166  /*
 167  167   * Search the callback list for an entry which pertains to arg.
 168  168   *
 169  169   * This is called from within the client upon completion of the callback.
 170  170   * RETURN VALUES:
 171  171   *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 172  172   *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 173  173   *      AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 174  174   *                      entry will be made in as_do_callbacks)
 175  175   *
 176  176   * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 177  177   * set, it indicates that as_do_callbacks is processing this entry.  The
 178  178   * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 179  179   * to unblock as_do_callbacks, in case it is blocked.
 180  180   *
 181  181   * CALLERS RESPONSIBILITY: If not calling from within the process context for
 182  182   * the specified as, the caller must guarantee persistence of the specified as
 183  183   * for the duration of this function (eg. pages being locked within the as
 184  184   * will guarantee persistence).
 185  185   */
 186  186  uint_t
 187  187  as_delete_callback(struct as *as, void *arg)
 188  188  {
 189  189          struct as_callback **prevcb = &as->a_callbacks;
 190  190          struct as_callback *cb;
 191  191          uint_t rc = AS_CALLBACK_NOTFOUND;
 192  192  
 193  193          mutex_enter(&as->a_contents);
 194  194          for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
 195  195                  if (cb->ascb_arg != arg)
 196  196                          continue;
 197  197  
 198  198                  /*
 199  199                   * If the events indicate AS_CALLBACK_CALLED, just clear
 200  200                   * AS_ALL_EVENT in the events field and wakeup the thread
 201  201                   * that may be waiting in as_do_callbacks.  as_do_callbacks
 202  202                   * will take care of removing this entry from the list.  In
 203  203                   * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
 204  204                   * (AS_CALLBACK_CALLED not set), just remove it from the
 205  205                   * list, return the memory and return AS_CALLBACK_DELETED.
 206  206                   */
 207  207                  if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
 208  208                          /* leave AS_CALLBACK_CALLED */
 209  209                          cb->ascb_events &= ~AS_ALL_EVENT;
 210  210                          rc = AS_CALLBACK_DELETE_DEFERRED;
 211  211                          cv_broadcast(&as->a_cv);
 212  212                  } else {
 213  213                          *prevcb = cb->ascb_next;
 214  214                          kmem_free(cb, sizeof (struct as_callback));
 215  215                          rc = AS_CALLBACK_DELETED;
 216  216                  }
 217  217                  break;
 218  218          }
 219  219          mutex_exit(&as->a_contents);
 220  220          return (rc);
 221  221  }
 222  222  
 223  223  /*
 224  224   * Searches the as callback list for a matching entry.
 225  225   * Returns a pointer to the first matching callback, or NULL if
 226  226   * nothing is found.
 227  227   * This function never sleeps so it is ok to call it with more
 228  228   * locks held but the (required) a_contents mutex.
 229  229   *
 230  230   * See also comment on as_do_callbacks below.
 231  231   */
 232  232  static struct as_callback *
 233  233  as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
 234  234                          size_t event_len)
 235  235  {
 236  236          struct as_callback      *cb;
 237  237  
 238  238          ASSERT(MUTEX_HELD(&as->a_contents));
 239  239          for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
 240  240                  /*
 241  241                   * If the callback has not already been called, then
 242  242                   * check if events or address range pertains.  An event_len
 243  243                   * of zero means do an unconditional callback.
 244  244                   */
 245  245                  if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
 246  246                      ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
 247  247                      (event_addr + event_len < cb->ascb_saddr) ||
 248  248                      (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
 249  249                          continue;
 250  250                  }
 251  251                  break;
 252  252          }
 253  253          return (cb);
 254  254  }
 255  255  
 256  256  /*
 257  257   * Executes a given callback and removes it from the callback list for
 258  258   * this address space.
 259  259   * This function may sleep so the caller must drop all locks except
 260  260   * a_contents before calling this func.
 261  261   *
 262  262   * See also comments on as_do_callbacks below.
 263  263   */
 264  264  static void
 265  265  as_execute_callback(struct as *as, struct as_callback *cb,
 266  266                                  uint_t events)
 267  267  {
 268  268          struct as_callback **prevcb;
 269  269          void    *cb_arg;
 270  270  
 271  271          ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
 272  272          cb->ascb_events |= AS_CALLBACK_CALLED;
 273  273          mutex_exit(&as->a_contents);
 274  274          (*cb->ascb_func)(as, cb->ascb_arg, events);
 275  275          mutex_enter(&as->a_contents);
 276  276          /*
 277  277           * the callback function is required to delete the callback
 278  278           * when the callback function determines it is OK for
 279  279           * this thread to continue. as_delete_callback will clear
 280  280           * the AS_ALL_EVENT in the events field when it is deleted.
 281  281           * If the callback function called as_delete_callback,
 282  282           * events will already be cleared and there will be no blocking.
 283  283           */
 284  284          while ((cb->ascb_events & events) != 0) {
 285  285                  cv_wait(&as->a_cv, &as->a_contents);
 286  286          }
 287  287          /*
 288  288           * This entry needs to be taken off the list. Normally, the
 289  289           * callback func itself does that, but unfortunately the list
 290  290           * may have changed while the callback was running because the
 291  291           * a_contents mutex was dropped and someone else other than the
 292  292           * callback func itself could have called as_delete_callback,
 293  293           * so we have to search to find this entry again.  The entry
 294  294           * must have AS_CALLBACK_CALLED, and have the same 'arg'.
 295  295           */
 296  296          cb_arg = cb->ascb_arg;
 297  297          prevcb = &as->a_callbacks;
 298  298          for (cb = as->a_callbacks; cb != NULL;
 299  299              prevcb = &cb->ascb_next, cb = *prevcb) {
 300  300                  if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
 301  301                      (cb_arg != cb->ascb_arg)) {
 302  302                          continue;
 303  303                  }
 304  304                  *prevcb = cb->ascb_next;
 305  305                  kmem_free(cb, sizeof (struct as_callback));
 306  306                  break;
 307  307          }
 308  308  }
 309  309  
 310  310  /*
 311  311   * Check the callback list for a matching event and intersection of
 312  312   * address range. If there is a match invoke the callback.  Skip an entry if:
 313  313   *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 314  314   *    - not event of interest
 315  315   *    - not address range of interest
 316  316   *
 317  317   * An event_len of zero indicates a request for an unconditional callback
 318  318   * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 319  319   * a_contents lock must be dropped before a callback, so only one callback
 320  320   * can be done before returning. Return -1 (true) if a callback was
 321  321   * executed and removed from the list, else return 0 (false).
 322  322   *
 323  323   * The logically separate parts, i.e. finding a matching callback and
 324  324   * executing a given callback have been separated into two functions
 325  325   * so that they can be called with different sets of locks held beyond
 326  326   * the always-required a_contents. as_find_callback does not sleep so
 327  327   * it is ok to call it if more locks than a_contents (i.e. the a_lock
 328  328   * rwlock) are held. as_execute_callback on the other hand may sleep
 329  329   * so all locks beyond a_contents must be dropped by the caller if one
 330  330   * does not want to end comatose.
 331  331   */
 332  332  static int
 333  333  as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
 334  334                          size_t event_len)
 335  335  {
 336  336          struct as_callback *cb;
 337  337  
 338  338          if ((cb = as_find_callback(as, events, event_addr, event_len))) {
 339  339                  as_execute_callback(as, cb, events);
 340  340                  return (-1);
 341  341          }
 342  342          return (0);
 343  343  }
 344  344  
 345  345  /*
 346  346   * Search for the segment containing addr. If a segment containing addr
 347  347   * exists, that segment is returned.  If no such segment exists, and
 348  348   * the list spans addresses greater than addr, then the first segment
 349  349   * whose base is greater than addr is returned; otherwise, NULL is
 350  350   * returned unless tail is true, in which case the last element of the
 351  351   * list is returned.
 352  352   *
 353  353   * a_seglast is used to cache the last found segment for repeated
 354  354   * searches to the same addr (which happens frequently).
 355  355   */
 356  356  struct seg *
 357  357  as_findseg(struct as *as, caddr_t addr, int tail)
 358  358  {
 359  359          struct seg *seg = as->a_seglast;
 360  360          avl_index_t where;
 361  361  
 362  362          ASSERT(AS_LOCK_HELD(as, &as->a_lock));
 363  363  
 364  364          if (seg != NULL &&
 365  365              seg->s_base <= addr &&
 366  366              addr < seg->s_base + seg->s_size)
 367  367                  return (seg);
 368  368  
 369  369          seg = avl_find(&as->a_segtree, &addr, &where);
 370  370          if (seg != NULL)
 371  371                  return (as->a_seglast = seg);
 372  372  
 373  373          seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 374  374          if (seg == NULL && tail)
 375  375                  seg = avl_last(&as->a_segtree);
 376  376          return (as->a_seglast = seg);
 377  377  }
 378  378  
 379  379  #ifdef VERIFY_SEGLIST
 380  380  /*
 381  381   * verify that the linked list is coherent
 382  382   */
 383  383  static void
 384  384  as_verify(struct as *as)
 385  385  {
 386  386          struct seg *seg, *seglast, *p, *n;
 387  387          uint_t nsegs = 0;
 388  388  
 389  389          if (do_as_verify == 0)
 390  390                  return;
 391  391  
 392  392          seglast = as->a_seglast;
 393  393  
 394  394          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 395  395                  ASSERT(seg->s_as == as);
 396  396                  p = AS_SEGPREV(as, seg);
 397  397                  n = AS_SEGNEXT(as, seg);
 398  398                  ASSERT(p == NULL || p->s_as == as);
 399  399                  ASSERT(p == NULL || p->s_base < seg->s_base);
 400  400                  ASSERT(n == NULL || n->s_base > seg->s_base);
 401  401                  ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
 402  402                  if (seg == seglast)
 403  403                          seglast = NULL;
 404  404                  nsegs++;
 405  405          }
 406  406          ASSERT(seglast == NULL);
 407  407          ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
 408  408  }
 409  409  #endif /* VERIFY_SEGLIST */
 410  410  
 411  411  /*
 412  412   * Add a new segment to the address space. The avl_find()
 413  413   * may be expensive so we attempt to use last segment accessed
 414  414   * in as_gap() as an insertion point.
 415  415   */
 416  416  int
 417  417  as_addseg(struct as  *as, struct seg *newseg)
 418  418  {
 419  419          struct seg *seg;
 420  420          caddr_t addr;
 421  421          caddr_t eaddr;
 422  422          avl_index_t where;
 423  423  
 424  424          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
 425  425  
 426  426          as->a_updatedir = 1;    /* inform /proc */
 427  427          gethrestime(&as->a_updatetime);
 428  428  
 429  429          if (as->a_lastgaphl != NULL) {
 430  430                  struct seg *hseg = NULL;
 431  431                  struct seg *lseg = NULL;
 432  432  
 433  433                  if (as->a_lastgaphl->s_base > newseg->s_base) {
 434  434                          hseg = as->a_lastgaphl;
 435  435                          lseg = AVL_PREV(&as->a_segtree, hseg);
 436  436                  } else {
 437  437                          lseg = as->a_lastgaphl;
 438  438                          hseg = AVL_NEXT(&as->a_segtree, lseg);
 439  439                  }
 440  440  
 441  441                  if (hseg && lseg && lseg->s_base < newseg->s_base &&
 442  442                      hseg->s_base > newseg->s_base) {
 443  443                          avl_insert_here(&as->a_segtree, newseg, lseg,
 444  444                              AVL_AFTER);
 445  445                          as->a_lastgaphl = NULL;
 446  446                          as->a_seglast = newseg;
 447  447                          return (0);
 448  448                  }
 449  449                  as->a_lastgaphl = NULL;
 450  450          }
 451  451  
 452  452          addr = newseg->s_base;
 453  453          eaddr = addr + newseg->s_size;
 454  454  again:
 455  455  
 456  456          seg = avl_find(&as->a_segtree, &addr, &where);
 457  457  
 458  458          if (seg == NULL)
 459  459                  seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 460  460  
 461  461          if (seg == NULL)
 462  462                  seg = avl_last(&as->a_segtree);
 463  463  
 464  464          if (seg != NULL) {
 465  465                  caddr_t base = seg->s_base;
 466  466  
 467  467                  /*
 468  468                   * If top of seg is below the requested address, then
 469  469                   * the insertion point is at the end of the linked list,
 470  470                   * and seg points to the tail of the list.  Otherwise,
 471  471                   * the insertion point is immediately before seg.
 472  472                   */
 473  473                  if (base + seg->s_size > addr) {
 474  474                          if (addr >= base || eaddr > base) {
 475  475  #ifdef __sparc
 476  476                                  extern struct seg_ops segnf_ops;
 477  477  
 478  478                                  /*
 479  479                                   * no-fault segs must disappear if overlaid.
 480  480                                   * XXX need new segment type so
 481  481                                   * we don't have to check s_ops
 482  482                                   */
 483  483                                  if (seg->s_ops == &segnf_ops) {
 484  484                                          seg_unmap(seg);
 485  485                                          goto again;
 486  486                                  }
 487  487  #endif
 488  488                                  return (-1);    /* overlapping segment */
 489  489                          }
 490  490                  }
 491  491          }
 492  492          as->a_seglast = newseg;
 493  493          avl_insert(&as->a_segtree, newseg, where);
 494  494  
 495  495  #ifdef VERIFY_SEGLIST
 496  496          as_verify(as);
 497  497  #endif
 498  498          return (0);
 499  499  }
 500  500  
 501  501  struct seg *
 502  502  as_removeseg(struct as *as, struct seg *seg)
 503  503  {
 504  504          avl_tree_t *t;
 505  505  
 506  506          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
 507  507  
 508  508          as->a_updatedir = 1;    /* inform /proc */
 509  509          gethrestime(&as->a_updatetime);
 510  510  
 511  511          if (seg == NULL)
 512  512                  return (NULL);
 513  513  
 514  514          t = &as->a_segtree;
 515  515          if (as->a_seglast == seg)
 516  516                  as->a_seglast = NULL;
 517  517          as->a_lastgaphl = NULL;
 518  518  
 519  519          /*
 520  520           * if this segment is at an address higher than
 521  521           * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
 522  522           */
 523  523          if (as->a_lastgap &&
 524  524              (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
 525  525                  as->a_lastgap = AVL_NEXT(t, seg);
 526  526  
 527  527          /*
 528  528           * remove the segment from the seg tree
 529  529           */
 530  530          avl_remove(t, seg);
 531  531  
 532  532  #ifdef VERIFY_SEGLIST
 533  533          as_verify(as);
 534  534  #endif
 535  535          return (seg);
 536  536  }
 537  537  
 538  538  /*
 539  539   * Find a segment containing addr.
 540  540   */
 541  541  struct seg *
 542  542  as_segat(struct as *as, caddr_t addr)
 543  543  {
 544  544          struct seg *seg = as->a_seglast;
 545  545  
 546  546          ASSERT(AS_LOCK_HELD(as, &as->a_lock));
 547  547  
 548  548          if (seg != NULL && seg->s_base <= addr &&
 549  549              addr < seg->s_base + seg->s_size)
 550  550                  return (seg);
 551  551  
 552  552          seg = avl_find(&as->a_segtree, &addr, NULL);
 553  553          return (seg);
 554  554  }
 555  555  
 556  556  /*
 557  557   * Serialize all searches for holes in an address space to
 558  558   * prevent two or more threads from allocating the same virtual
 559  559   * address range.  The address space must not be "read/write"
 560  560   * locked by the caller since we may block.
 561  561   */
 562  562  void
 563  563  as_rangelock(struct as *as)
 564  564  {
 565  565          mutex_enter(&as->a_contents);
 566  566          while (AS_ISCLAIMGAP(as))
 567  567                  cv_wait(&as->a_cv, &as->a_contents);
 568  568          AS_SETCLAIMGAP(as);
 569  569          mutex_exit(&as->a_contents);
 570  570  }
 571  571  
 572  572  /*
 573  573   * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 574  574   */
 575  575  void
 576  576  as_rangeunlock(struct as *as)
 577  577  {
 578  578          mutex_enter(&as->a_contents);
 579  579          AS_CLRCLAIMGAP(as);
 580  580          cv_signal(&as->a_cv);
 581  581          mutex_exit(&as->a_contents);
 582  582  }
 583  583  
 584  584  /*
 585  585   * compar segments (or just an address) by segment address range
 586  586   */
 587  587  static int
 588  588  as_segcompar(const void *x, const void *y)
 589  589  {
 590  590          struct seg *a = (struct seg *)x;
 591  591          struct seg *b = (struct seg *)y;
 592  592  
 593  593          if (a->s_base < b->s_base)
 594  594                  return (-1);
 595  595          if (a->s_base >= b->s_base + b->s_size)
 596  596                  return (1);
 597  597          return (0);
 598  598  }
 599  599  
 600  600  
 601  601  void
 602  602  as_avlinit(struct as *as)
 603  603  {
 604  604          avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
 605  605              offsetof(struct seg, s_tree));
 606  606          avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
 607  607              offsetof(struct watched_page, wp_link));
 608  608  }
 609  609  
 610  610  /*ARGSUSED*/
 611  611  static int
 612  612  as_constructor(void *buf, void *cdrarg, int kmflags)
 613  613  {
 614  614          struct as *as = buf;
 615  615  
 616  616          mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
 617  617          cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
 618  618          rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
 619  619          as_avlinit(as);
 620  620          return (0);
 621  621  }
 622  622  
 623  623  /*ARGSUSED1*/
 624  624  static void
 625  625  as_destructor(void *buf, void *cdrarg)
 626  626  {
 627  627          struct as *as = buf;
 628  628  
 629  629          avl_destroy(&as->a_segtree);
 630  630          mutex_destroy(&as->a_contents);
 631  631          cv_destroy(&as->a_cv);
 632  632          rw_destroy(&as->a_lock);
 633  633  }
 634  634  
 635  635  void
 636  636  as_init(void)
 637  637  {
 638  638          as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
 639  639              as_constructor, as_destructor, NULL, NULL, NULL, 0);
 640  640  }
 641  641  
 642  642  /*
 643  643   * Allocate and initialize an address space data structure.
 644  644   * We call hat_alloc to allow any machine dependent
 645  645   * information in the hat structure to be initialized.
 646  646   */
 647  647  struct as *
 648  648  as_alloc(void)
 649  649  {
 650  650          struct as *as;
 651  651  
 652  652          as = kmem_cache_alloc(as_cache, KM_SLEEP);
 653  653  
 654  654          as->a_flags             = 0;
 655  655          as->a_vbits             = 0;
 656  656          as->a_hrm               = NULL;
 657  657          as->a_seglast           = NULL;
 658  658          as->a_size              = 0;
 659  659          as->a_resvsize          = 0;
 660  660          as->a_updatedir         = 0;
 661  661          gethrestime(&as->a_updatetime);
 662  662          as->a_objectdir         = NULL;
 663  663          as->a_sizedir           = 0;
 664  664          as->a_userlimit         = (caddr_t)USERLIMIT;
 665  665          as->a_lastgap           = NULL;
 666  666          as->a_lastgaphl         = NULL;
 667  667          as->a_callbacks         = NULL;
 668  668  
 669  669          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 670  670          as->a_hat = hat_alloc(as);      /* create hat for default system mmu */
 671  671          AS_LOCK_EXIT(as, &as->a_lock);
 672  672  
 673  673          return (as);
 674  674  }
 675  675  
 676  676  /*
 677  677   * Free an address space data structure.
 678  678   * Need to free the hat first and then
 679  679   * all the segments on this as and finally
 680  680   * the space for the as struct itself.
            *
            * Parameters:
            *   as - the address space to tear down; its storage is handed back
            *        to as_cache before this function returns.
            *
            * The function jumps back to `top' whenever a segment unmap reports
            * EAGAIN and a_lock had to be dropped.  In that case the callback
            * drain and the a_lock acquisition are repeated, while
            * hat_free_start() is guarded by free_started and runs only once.
 681  681   */
 682  682  void
 683  683  as_free(struct as *as)
 684  684  {
 685  685          struct hat *hat = as->a_hat;
 686  686          struct seg *seg, *next;
 687  687          boolean_t free_started = B_FALSE;
 688  688  
 689  689  top:
 690  690          /*
 691  691           * Invoke ALL callbacks. as_do_callbacks will do one callback
 692  692           * per call, and not return (-1) until the callback has completed.
 693  693           * When as_do_callbacks returns zero, all callbacks have completed.
 694  694           */
 695  695          mutex_enter(&as->a_contents);
 696  696          while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 697  697                  ;
 698  698  
 699  699          mutex_exit(&as->a_contents);
 700  700          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 701  701  
                   /*
                    * We can get here more than once (see the `goto top' below),
                    * but hat_free_start() is only to be called the first time.
                    */
 702  702          if (!free_started) {
 703  703                  free_started = B_TRUE;
 704  704                  hat_free_start(hat);
 705  705          }
 706  706          for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
 707  707                  int err;
 708  708  
                           /*
                            * Pick up the successor before attempting the unmap;
                            * as_unmap() notes that a segment can be destroyed by
                            * the unmap operation.
                            */
 709  709                  next = AS_SEGNEXT(as, seg);
 710  710  retry:
                           /* Unmap the segment in its entirety. */
 711  711                  err = segop_unmap(seg, seg->s_base, seg->s_size);
 712  712                  if (err == EAGAIN) {
 713  713                          mutex_enter(&as->a_contents);
 714  714                          if (as->a_callbacks) {
                                           /*
                                            * Callbacks are registered: drop a_lock
                                            * and go back to top, where they are run.
                                            */
 715  715                                  AS_LOCK_EXIT(as, &as->a_lock);
 716  716                          } else if (!AS_ISNOUNMAPWAIT(as)) {
 717  717                                  /*
 718  718                                   * Memory is currently locked. Wait for a
 719  719                                   * cv_signal that it has been unlocked, then
 720  720                                   * try the operation again.
 721  721                                   */
 722  722                                  if (AS_ISUNMAPWAIT(as) == 0)
 723  723                                          cv_broadcast(&as->a_cv);
 724  724                                  AS_SETUNMAPWAIT(as);
 725  725                                  AS_LOCK_EXIT(as, &as->a_lock);
 726  726                                  while (AS_ISUNMAPWAIT(as))
 727  727                                          cv_wait(&as->a_cv, &as->a_contents);
 728  728                          } else {
 729  729                                  /*
 730  730                                   * We may have raced with
 731  731                                   * segvn_reclaim()/segspt_reclaim(). In this
 732  732                                   * case clean nounmapwait flag and retry since
 733  733                                   * softlockcnt in this segment may be already
 734  734                                   * 0.  We don't drop as writer lock so our
 735  735                                   * number of retries without sleeping should
 736  736                                   * be very small. See segvn_reclaim() for
 737  737                                   * more comments.
 738  738                                   */
 739  739                                  AS_CLRNOUNMAPWAIT(as);
 740  740                                  mutex_exit(&as->a_contents);
 741  741                                  goto retry;
 742  742                          }
                                   /*
                                    * Both branches that reach this point have
                                    * released a_lock, so start over from the top.
                                    */
 743  743                          mutex_exit(&as->a_contents);
 744  744                          goto top;
 745  745                  } else {
 746  746                          /*
 747  747                           * We do not expect any other error return at this
 748  748                           * time. This is similar to an ASSERT in seg_unmap()
 749  749                           */
 750  750                          ASSERT(err == 0);
 751  751                  }
 752  752          }
 753  753          hat_free_end(hat);
 754  754          AS_LOCK_EXIT(as, &as->a_lock);
 755  755  
 756  756          /* /proc stuff */
 757  757          ASSERT(avl_numnodes(&as->a_wpage) == 0);
 758  758          if (as->a_objectdir) {
 759  759                  kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
 760  760                  as->a_objectdir = NULL;
 761  761                  as->a_sizedir = 0;
 762  762          }
 763  763  
 764  764          /*
 765  765           * Free the struct as back to kmem.  Assert it has no segments.
 766  766           */
 767  767          ASSERT(avl_numnodes(&as->a_segtree) == 0);
 768  768          kmem_cache_free(as_cache, as);
 769  769  }
 770  770  
           /*
            * Duplicate address space `as' on behalf of forkedproc.
            *
            * A new address space is obtained from as_alloc(); it inherits
            * a_userlimit from `as' and has a_proc set to forkedproc.  For every
            * segment of `as' that is not flagged S_PURGE, a segment with the same
            * base and size is allocated in the new address space and filled in
            * by segop_dup().  S_PURGE segments are skipped and their combined
            * size is left out of the a_resvsize given to the new address space.
            *
            * `as' is held as writer for the whole operation.  as_clearwatch(as)
            * is called on entry and as_setwatch(as) on every exit path.
            *
            * Returns:
            *   0       success; forkedproc->p_as points at the new address space.
            *   -1      seg_alloc() returned NULL.
            *   other   the non-zero value returned by segop_dup() or by the
            *           final hat_dup() call.
            * On any failure the new address space is released with as_free()
            * and forkedproc->p_as is left untouched.
            */
 771  771  int
 772  772  as_dup(struct as *as, struct proc *forkedproc)
 773  773  {
 774  774          struct as *newas;
 775  775          struct seg *seg, *newseg;
 776  776          size_t  purgesize = 0;  /* total size of skipped S_PURGE segments */
 777  777          int error;
 778  778  
 779  779          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 780  780          as_clearwatch(as);
 781  781          newas = as_alloc();
 782  782          newas->a_userlimit = as->a_userlimit;
 783  783          newas->a_proc = forkedproc;
 784  784  
 785  785          AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
 786  786  
                   /* First hat_dup() pass (HAT_DUP_SRD); its result is ignored. */
 787  787          (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
 788  788  
 789  789          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 790  790  
                           /* S_PURGE segments are not duplicated, only counted. */
 791  791                  if (seg->s_flags & S_PURGE) {
 792  792                          purgesize += seg->s_size;
 793  793                          continue;
 794  794                  }
 795  795  
 796  796                  newseg = seg_alloc(newas, seg->s_base, seg->s_size);
 797  797                  if (newseg == NULL) {
 798  798                          AS_LOCK_EXIT(newas, &newas->a_lock);
 799  799                          as_setwatch(as);
 800  800                          AS_LOCK_EXIT(as, &as->a_lock);
 801  801                          as_free(newas);
 802  802                          return (-1);
 803  803                  }
 804  804                  if ((error = segop_dup(seg, newseg)) != 0) {
 805  805                          /*
 806  806                           * We call seg_free() on the new seg
 807  807                           * because the segment is not set up
 808  808                           * completely; i.e. it has no ops.
 809  809                           */
 810  810                          as_setwatch(as);
 811  811                          AS_LOCK_EXIT(as, &as->a_lock);
 812  812                          seg_free(newseg);
 813  813                          AS_LOCK_EXIT(newas, &newas->a_lock);
 814  814                          as_free(newas);
 815  815                          return (error);
 816  816                  }
 817  817                  newas->a_size += seg->s_size;
 818  818          }
                   /* Reserved size of the copy excludes the skipped segments. */
 819  819          newas->a_resvsize = as->a_resvsize - purgesize;
 820  820  
                   /* Second hat_dup() pass (HAT_DUP_ALL); this one can fail. */
 821  821          error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
 822  822  
 823  823          AS_LOCK_EXIT(newas, &newas->a_lock);
 824  824  
 825  825          as_setwatch(as);
 826  826          AS_LOCK_EXIT(as, &as->a_lock);
 827  827          if (error != 0) {
 828  828                  as_free(newas);
 829  829                  return (error);
 830  830          }
 831  831          forkedproc->p_as = newas;
 832  832          return (0);
 833  833  }
 834  834  
 835  835  /*
 836  836   * Handle a ``fault'' at addr for size bytes.
            *
            * The range is widened to page boundaries and then handed, one segment
            * at a time, to segop_fault().
            *
            * Parameters:
            *   hat   - passed through unchanged to segop_fault().
            *   as    - address space in which the fault occurred.
            *   addr  - start of the faulting range (rounded down to a page).
            *   size  - length of the faulting range (end rounded up to a page).
            *   type  - kind of fault; also selects which statistic is bumped.
            *   rw    - access type, passed through to segop_fault().
            *
            * Returns:
            *   0         the whole range was handled.
            *   FC_NOMAP  no segment contains the start of the range, or the
            *             range runs into a gap between segments.
            *   other     the first non-zero value returned by segop_fault().
            * A result whose FC_ERRNO() is EDEADLK is never returned: the
            * function sleeps for deadlk_wait and starts over instead.
            *
            * While the fault is being handled lwp_nostop of the current lwp (if
            * there is one) is held incremented.
 837  837   */
 838  838  faultcode_t
 839  839  as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
 840  840          enum fault_type type, enum seg_rw rw)
 841  841  {
 842  842          struct seg *seg;
 843  843          caddr_t raddr;                  /* rounded down addr */
 844  844          size_t rsize;                   /* rounded up size */
 845  845          size_t ssize;
 846  846          faultcode_t res = 0;
 847  847          caddr_t addrsav;
 848  848          struct seg *segsav;
 849  849          int as_lock_held;
 850  850          klwp_t *lwp = ttolwp(curthread);
                   /*
                    * NOTE(review): holding_wpage is initialised to 0 and never set
                    * to a non-zero value in this function, so the "Restore
                    * watchpoints" branch in the loop below is not taken as the
                    * code stands.
                    */
 851  851          int holding_wpage = 0;
 852  852  
 853  853  
 854  854  
 855  855  retry:
 856  856          /*
 857  857           * Indicate that the lwp is not to be stopped while waiting for a
 858  858           * pagefault.  This is to avoid deadlock while debugging a process
 859  859           * via /proc over NFS (in particular).
 860  860           */
 861  861          if (lwp != NULL)
 862  862                  lwp->lwp_nostop++;
 863  863  
 864  864          /*
 865  865           * same length must be used when we softlock and softunlock.  We
 866  866           * don't support softunlocking lengths less than the original length
 867  867           * when there is largepage support.  See seg_dev.c for more
 868  868           * comments.
 869  869           */
                   /*
                    * Account for the fault.  F_SOFTUNLOCK is not counted, and there
                    * is no default case, so any other type is not counted either.
                    */
 870  870          switch (type) {
 871  871  
 872  872          case F_SOFTLOCK:
 873  873                  CPU_STATS_ADD_K(vm, softlock, 1);
 874  874                  break;
 875  875  
 876  876          case F_SOFTUNLOCK:
 877  877                  break;
 878  878  
 879  879          case F_PROT:
 880  880                  CPU_STATS_ADD_K(vm, prot_fault, 1);
 881  881                  break;
 882  882  
 883  883          case F_INVAL:
 884  884                  CPU_STATS_ENTER_K();
 885  885                  CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
 886  886                  if (as == &kas)
 887  887                          CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
 888  888                  CPU_STATS_EXIT_K();
 889  889                  break;
 890  890          }
 891  891  
 892  892          /* Kernel probe */
 893  893          TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
 894  894              tnf_opaque, address,        addr,
 895  895              tnf_fault_type,     fault_type,     type,
 896  896              tnf_seg_access,     access,         rw);
 897  897  
                   /* Widen [addr, addr + size) to whole pages. */
 898  898          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
 899  899          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
 900  900              (size_t)raddr;
 901  901  
 902  902          /*
 903  903           * XXX -- Don't grab the as lock for segkmap. We should grab it for
 904  904           * correctness, but then we could be stuck holding this lock for
 905  905           * a LONG time if the fault needs to be resolved on a slow
 906  906           * filesystem, and then no-one will be able to exec new commands,
 907  907           * as exec'ing requires the write lock on the as.
 908  908           */
 909  909          if (as == &kas && segkmap && segkmap->s_base <= raddr &&
 910  910              raddr + size < segkmap->s_base + segkmap->s_size) {
 911  911                  seg = segkmap;
 912  912                  as_lock_held = 0;
 913  913          } else {
 914  914                  AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
 915  915  
 916  916                  seg = as_segat(as, raddr);
 917  917                  if (seg == NULL) {
                                   /* Nothing is mapped at the start of the range. */
 918  918                          AS_LOCK_EXIT(as, &as->a_lock);
 919  919                          if (lwp != NULL)
 920  920                                  lwp->lwp_nostop--;
 921  921                          return (FC_NOMAP);
 922  922                  }
 923  923  
 924  924                  as_lock_held = 1;
 925  925          }
 926  926  
                   /* Remember where we started in case a softlock must be undone. */
 927  927          addrsav = raddr;
 928  928          segsav = seg;
 929  929  
 930  930          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                           /*
                            * Once raddr is past the current segment move on to the
                            * next one; it has to start exactly at raddr, otherwise
                            * the range contains a hole.
                            */
 931  931                  if (raddr >= seg->s_base + seg->s_size) {
 932  932                          seg = AS_SEGNEXT(as, seg);
 933  933                          if (seg == NULL || raddr != seg->s_base) {
 934  934                                  res = FC_NOMAP;
 935  935                                  break;
 936  936                          }
 937  937                  }
                           /* Never hand a segment more than it covers. */
 938  938                  if (raddr + rsize > seg->s_base + seg->s_size)
 939  939                          ssize = seg->s_base + seg->s_size - raddr;
 940  940                  else
 941  941                          ssize = rsize;
 942  942  
 943  943                  res = segop_fault(hat, seg, raddr, ssize, type, rw);
 944  944  
 945  945                  /* Restore watchpoints */
 946  946                  if (holding_wpage) {
 947  947                          as_setwatch(as);
 948  948                          holding_wpage = 0;
 949  949                  }
 950  950  
 951  951                  if (res != 0)
 952  952                          break;
 953  953          }
 954  954  
 955  955          /*
 956  956           * If we were SOFTLOCKing and encountered a failure,
 957  957           * we must SOFTUNLOCK the range we already did. (Maybe we
 958  958           * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
 959  959           * right here...)
 960  960           */
 961  961          if (res != 0 && type == F_SOFTLOCK) {
                           /*
                            * Walk [addrsav, raddr) again, segment by segment,
                            * starting from the segment the fault began in.
                            */
 962  962                  for (seg = segsav; addrsav < raddr; addrsav += ssize) {
 963  963                          if (addrsav >= seg->s_base + seg->s_size)
 964  964                                  seg = AS_SEGNEXT(as, seg);
 965  965                          ASSERT(seg != NULL);
 966  966                          /*
 967  967                           * Now call the fault routine again to perform the
 968  968                           * unlock using S_OTHER instead of the rw variable
 969  969                           * since we never got a chance to touch the pages.
 970  970                           */
 971  971                          if (raddr > seg->s_base + seg->s_size)
 972  972                                  ssize = seg->s_base + seg->s_size - addrsav;
 973  973                          else
 974  974                                  ssize = raddr - addrsav;
 975  975                          (void) segop_fault(hat, seg, addrsav, ssize,
 976  976                              F_SOFTUNLOCK, S_OTHER);
 977  977                  }
 978  978          }
 979  979          if (as_lock_held)
 980  980                  AS_LOCK_EXIT(as, &as->a_lock);
 981  981          if (lwp != NULL)
 982  982                  lwp->lwp_nostop--;
 983  983  
 984  984          /*
 985  985           * If the lower levels returned EDEADLK for a fault,
 986  986           * It means that we should retry the fault.  Let's wait
 987  987           * a bit also to let the deadlock causing condition clear.
 988  988           * This is part of a gross hack to work around a design flaw
 989  989           * in the ufs/sds logging code and should go away when the
 990  990           * logging code is re-designed to fix the problem. See bug
 991  991           * 4125102 for details of the problem.
 992  992           */
 993  993          if (FC_ERRNO(res) == EDEADLK) {
 994  994                  delay(deadlk_wait);
 995  995                  res = 0;
 996  996                  goto retry;
 997  997          }
 998  998          return (res);
 999  999  }
1000 1000  
1001 1001  
1002 1002  
1003 1003  /*
1004 1004   * Asynchronous ``fault'' at addr for size bytes.
            *
            * The range is widened to page boundaries and segop_faulta() is called
            * once for every page in it, with a_lock held as reader.
            *
            * Parameters:
            *   as    - address space the range belongs to.
            *   addr  - start of the range (rounded down to a page).
            *   size  - length of the range (end rounded up to a page).
            *
            * Returns:
            *   0         segop_faulta() returned 0 for every page.
            *   FC_NOMAP  no segment contains the start of the range, or the
            *             range runs into a gap between segments.
            *   other     the first non-zero value returned by segop_faulta().
            * A result whose FC_ERRNO() is EDEADLK is never returned: the
            * function sleeps for deadlk_wait and starts over instead.
1005 1005   */
1006 1006  faultcode_t
1007 1007  as_faulta(struct as *as, caddr_t addr, size_t size)
1008 1008  {
1009 1009          struct seg *seg;
1010 1010          caddr_t raddr;                  /* rounded down addr */
1011 1011          size_t rsize;                   /* rounded up size */
1012 1012          faultcode_t res = 0;
1013 1013          klwp_t *lwp = ttolwp(curthread);
1014 1014  
1015 1015  retry:
1016 1016          /*
1017 1017           * Indicate that the lwp is not to be stopped while waiting
1018 1018           * for a pagefault.  This is to avoid deadlock while debugging
1019 1019           * a process via /proc over NFS (in particular).
1020 1020           */
1021 1021          if (lwp != NULL)
1022 1022                  lwp->lwp_nostop++;
1023 1023  
                   /* Widen [addr, addr + size) to whole pages. */
1024 1024          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1025 1025          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1026 1026              (size_t)raddr;
1027 1027  
1028 1028          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1029 1029          seg = as_segat(as, raddr);
1030 1030          if (seg == NULL) {
                           /* Nothing is mapped at the start of the range. */
1031 1031                  AS_LOCK_EXIT(as, &as->a_lock);
1032 1032                  if (lwp != NULL)
1033 1033                          lwp->lwp_nostop--;
1034 1034                  return (FC_NOMAP);
1035 1035          }
1036 1036  
                   /* One segop_faulta() call per page. */
1037 1037          for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
                           /*
                            * Once raddr is past the current segment move on to the
                            * next one; it has to start exactly at raddr, otherwise
                            * the range contains a hole.
                            */
1038 1038                  if (raddr >= seg->s_base + seg->s_size) {
1039 1039                          seg = AS_SEGNEXT(as, seg);
1040 1040                          if (seg == NULL || raddr != seg->s_base) {
1041 1041                                  res = FC_NOMAP;
1042 1042                                  break;
1043 1043                          }
1044 1044                  }
1045 1045                  res = segop_faulta(seg, raddr);
1046 1046                  if (res != 0)
1047 1047                          break;
1048 1048          }
1049 1049          AS_LOCK_EXIT(as, &as->a_lock);
1050 1050          if (lwp != NULL)
1051 1051                  lwp->lwp_nostop--;
1052 1052          /*
1053 1053           * If the lower levels returned EDEADLK for a fault,
1054 1054           * It means that we should retry the fault.  Let's wait
1055 1055           * a bit also to let the deadlock causing condition clear.
1056 1056           * This is part of a gross hack to work around a design flaw
1057 1057           * in the ufs/sds logging code and should go away when the
1058 1058           * logging code is re-designed to fix the problem. See bug
1059 1059           * 4125102 for details of the problem.
1060 1060           */
1061 1061          if (FC_ERRNO(res) == EDEADLK) {
1062 1062                  delay(deadlk_wait);
1063 1063                  res = 0;
1064 1064                  goto retry;
1065 1065          }
1066 1066          return (res);
1067 1067  }
1068 1068  
1069 1069  /*
1070 1070   * Set the virtual mapping for the interval from [addr : addr + size)
1071 1071   * in address space `as' to have the specified protection.
1072 1072   * It is ok for the range to cross over several segments,
1073 1073   * as long as they are contiguous.
            *
            * Parameters:
            *   as    - address space to operate on.
            *   addr  - start of the interval (rounded down to a page).
            *   size  - length of the interval (end rounded up to a page).
            *   prot  - protection passed to segop_setprot() for every segment.
            *
            * Returns:
            *   0       the protection was set on the whole interval.
            *   ENOMEM  the rounded interval wraps around, no segment contains
            *           its start, or it runs into a gap between segments.
            *   EAGAIN  segop_setprot() returned IE_NOMEM.
            *   other   any other non-zero value returned by segop_setprot(),
            *           except IE_RETRY and EAGAIN which are handled internally
            *           by restarting or retrying.
1074 1074   */
1075 1075  int
1076 1076  as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1077 1077  {
1078 1078          struct seg *seg;
1079 1079          struct as_callback *cb;
1080 1080          size_t ssize;
1081 1081          caddr_t raddr;                  /* rounded down addr */
1082 1082          size_t rsize;                   /* rounded up size */
1083 1083          int error = 0, writer = 0;
1084 1084          caddr_t saveraddr;
1085 1085          size_t saversize;
1086 1086  
1087 1087  setprot_top:
                   /*
                    * Every restart recomputes the rounded interval from the
                    * caller's addr/size, so the whole range is processed again.
                    */
1088 1088          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1089 1089          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1090 1090              (size_t)raddr;
1091 1091  
1092 1092          if (raddr + rsize < raddr)              /* check for wraparound */
1093 1093                  return (ENOMEM);
1094 1094  
                   /* The loop below consumes raddr/rsize; keep the originals. */
1095 1095          saveraddr = raddr;
1096 1096          saversize = rsize;
1097 1097  
1098 1098          /*
1099 1099           * Normally we only lock the as as a reader. But
1100 1100           * if due to setprot the segment driver needs to split
1101 1101           * a segment it will return IE_RETRY. Therefore we re-acquire
1102 1102           * the as lock as a writer so the segment driver can change
1103 1103           * the seg list. Also the segment driver will return IE_RETRY
1104 1104           * after it has changed the segment list so we therefore keep
1105 1105           * locking as a writer. Since these operations should be rare
1106 1106           * we want to lock as a writer only when necessary.
                    *
                    * The writer lock is also taken whenever the address space
                    * has watched pages (a_wpage is not empty).
1107 1107           */
1108 1108          if (writer || avl_numnodes(&as->a_wpage) != 0) {
1109 1109                  AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1110 1110          } else {
1111 1111                  AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1112 1112          }
1113 1113  
1114 1114          as_clearwatchprot(as, raddr, rsize);
1115 1115          seg = as_segat(as, raddr);
1116 1116          if (seg == NULL) {
1117 1117                  as_setwatch(as);
1118 1118                  AS_LOCK_EXIT(as, &as->a_lock);
1119 1119                  return (ENOMEM);
1120 1120          }
1121 1121  
1122 1122          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                           /*
                            * Once raddr is past the current segment move on to the
                            * next one; it has to start exactly at raddr, otherwise
                            * the interval contains a hole.
                            */
1123 1123                  if (raddr >= seg->s_base + seg->s_size) {
1124 1124                          seg = AS_SEGNEXT(as, seg);
1125 1125                          if (seg == NULL || raddr != seg->s_base) {
1126 1126                                  error = ENOMEM;
1127 1127                                  break;
1128 1128                          }
1129 1129                  }
                           /* Never hand a segment more than it covers. */
1130 1130                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
1131 1131                          ssize = seg->s_base + seg->s_size - raddr;
1132 1132                  else
1133 1133                          ssize = rsize;
1134 1134  retry:
1135 1135                  error = segop_setprot(seg, raddr, ssize, prot);
1136 1136  
1137 1137                  if (error == IE_NOMEM) {
1138 1138                          error = EAGAIN;
1139 1139                          break;
1140 1140                  }
1141 1141  
                           /* Start over holding a_lock as writer (see above). */
1142 1142                  if (error == IE_RETRY) {
1143 1143                          AS_LOCK_EXIT(as, &as->a_lock);
1144 1144                          writer = 1;
1145 1145                          goto setprot_top;
1146 1146                  }
1147 1147  
1148 1148                  if (error == EAGAIN) {
1149 1149                          /*
1150 1150                           * Make sure we have a_lock as writer.
1151 1151                           */
1152 1152                          if (writer == 0) {
1153 1153                                  AS_LOCK_EXIT(as, &as->a_lock);
1154 1154                                  writer = 1;
1155 1155                                  goto setprot_top;
1156 1156                          }
1157 1157  
1158 1158                          /*
1159 1159                           * Memory is currently locked.  It must be unlocked
1160 1160                           * before this operation can succeed through a retry.
1161 1161                           * The possible reasons for locked memory and
1162 1162                           * corresponding strategies for unlocking are:
1163 1163                           * (1) Normal I/O
1164 1164                           *      wait for a signal that the I/O operation
1165 1165                           *      has completed and the memory is unlocked.
1166 1166                           * (2) Asynchronous I/O
1167 1167                           *      The aio subsystem does not unlock pages when
1168 1168                           *      the I/O is completed. Those pages are unlocked
1169 1169                           *      when the application calls aiowait/aioerror.
1170 1170                           *      So, to prevent blocking forever, cv_broadcast()
1171 1171                           *      is done to wake up aio_cleanup_thread.
1172 1172                           *      Subsequently, segvn_reclaim will be called, and
1173 1173                           *      that will do AS_CLRUNMAPWAIT() and wake us up.
1174 1174                           * (3) Long term page locking:
1175 1175                           *      Drivers intending to have pages locked for a
1176 1176                           *      period considerably longer than for normal I/O
1177 1177                           *      (essentially forever) may have registered for a
1178 1178                           *      callback so they may unlock these pages on
1179 1179                           *      request. This is needed to allow this operation
1180 1180                           *      to succeed. Each entry on the callback list is
1181 1181                           *      examined. If the event or address range pertains
1182 1182                           *      the callback is invoked (unless it already is in
1183 1183                           *      progress). The a_contents lock must be dropped
1184 1184                           *      before the callback, so only one callback can
1185 1185                           *      be done at a time. Go to the top and do more
1186 1186                           *      until zero is returned. If zero is returned,
1187 1187                           *      either there were no callbacks for this event
1188 1188                           *      or they were already in progress.
1189 1189                           */
1190 1190                          mutex_enter(&as->a_contents);
1191 1191                          if (as->a_callbacks &&
1192 1192                              (cb = as_find_callback(as, AS_SETPROT_EVENT,
1193 1193                              seg->s_base, seg->s_size))) {
1194 1194                                  AS_LOCK_EXIT(as, &as->a_lock);
1195 1195                                  as_execute_callback(as, cb, AS_SETPROT_EVENT);
1196 1196                          } else if (!AS_ISNOUNMAPWAIT(as)) {
1197 1197                                  if (AS_ISUNMAPWAIT(as) == 0)
1198 1198                                          cv_broadcast(&as->a_cv);
1199 1199                                  AS_SETUNMAPWAIT(as);
1200 1200                                  AS_LOCK_EXIT(as, &as->a_lock);
1201 1201                                  while (AS_ISUNMAPWAIT(as))
1202 1202                                          cv_wait(&as->a_cv, &as->a_contents);
1203 1203                          } else {
1204 1204                                  /*
1205 1205                                   * We may have raced with
1206 1206                                   * segvn_reclaim()/segspt_reclaim(). In this
1207 1207                                   * case clean nounmapwait flag and retry since
1208 1208                                   * softlockcnt in this segment may be already
1209 1209                                   * 0.  We don't drop as writer lock so our
1210 1210                                   * number of retries without sleeping should
1211 1211                                   * be very small. See segvn_reclaim() for
1212 1212                                   * more comments.
1213 1213                                   */
1214 1214                                  AS_CLRNOUNMAPWAIT(as);
1215 1215                                  mutex_exit(&as->a_contents);
1216 1216                                  goto retry;
1217 1217                          }
                                   /*
                                    * Both branches that reach this point have
                                    * released a_lock, so start over from the top.
                                    */
1218 1218                          mutex_exit(&as->a_contents);
1219 1219                          goto setprot_top;
1220 1220                  } else if (error != 0)
1221 1221                          break;
1222 1222          }
                   /*
                    * On failure as_setwatch() is called; on success
                    * as_setwatchprot() is applied to the saved (full) interval
                    * with the new protection.
                    */
1223 1223          if (error != 0) {
1224 1224                  as_setwatch(as);
1225 1225          } else {
1226 1226                  as_setwatchprot(as, saveraddr, saversize, prot);
1227 1227          }
1228 1228          AS_LOCK_EXIT(as, &as->a_lock);
1229 1229          return (error);
1230 1230  }
1231 1231  
1232 1232  /*
1233 1233   * Check to make sure that the interval [addr, addr + size)
1234 1234   * in address space `as' has at least the specified protection.
1235 1235   * It is ok for the range to cross over several segments, as long
1236 1236   * as they are contiguous.
            *
            * Parameters:
            *   as    - address space to examine.
            *   addr  - start of the interval (rounded down to a page).
            *   size  - length of the interval (end rounded up to a page).
            *   prot  - protection passed to segop_checkprot() for every segment.
            *
            * Returns:
            *   0       segop_checkprot() returned 0 for every segment piece.
            *   ENOMEM  the rounded interval wraps around, no segment contains
            *           its start, or it runs into a gap between segments.
            *   other   the first non-zero value returned by segop_checkprot().
1237 1237   */
1238 1238  int
1239 1239  as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1240 1240  {
1241 1241          struct seg *seg;
1242 1242          size_t ssize;
1243 1243          caddr_t raddr;                  /* rounded down addr */
1244 1244          size_t rsize;                   /* rounded up size */
1245 1245          int error = 0;
1246 1246  
                   /* Widen [addr, addr + size) to whole pages. */
1247 1247          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1248 1248          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1249 1249              (size_t)raddr;
1250 1250  
1251 1251          if (raddr + rsize < raddr)              /* check for wraparound */
1252 1252                  return (ENOMEM);
1253 1253  
1254 1254          /*
1255 1255           * This is ugly as sin...
1256 1256           * Normally, we only acquire the address space readers lock.
1257 1257           * However, if the address space has watchpoints present,
1258 1258           * we must acquire the writer lock on the address space for
1259 1259           * the benefit of as_clearwatchprot() and as_setwatchprot().
1260 1260           */
1261 1261          if (avl_numnodes(&as->a_wpage) != 0)
1262 1262                  AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1263 1263          else
1264 1264                  AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1265 1265          as_clearwatchprot(as, raddr, rsize);
1266 1266          seg = as_segat(as, raddr);
1267 1267          if (seg == NULL) {
                           /* Nothing is mapped at the start of the interval. */
1268 1268                  as_setwatch(as);
1269 1269                  AS_LOCK_EXIT(as, &as->a_lock);
1270 1270                  return (ENOMEM);
1271 1271          }
1272 1272  
1273 1273          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                           /*
                            * Once raddr is past the current segment move on to the
                            * next one; it has to start exactly at raddr, otherwise
                            * the interval contains a hole.
                            */
1274 1274                  if (raddr >= seg->s_base + seg->s_size) {
1275 1275                          seg = AS_SEGNEXT(as, seg);
1276 1276                          if (seg == NULL || raddr != seg->s_base) {
1277 1277                                  error = ENOMEM;
1278 1278                                  break;
1279 1279                          }
1280 1280                  }
                           /* Never hand a segment more than it covers. */
1281 1281                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
1282 1282                          ssize = seg->s_base + seg->s_size - raddr;
1283 1283                  else
1284 1284                          ssize = rsize;
1285 1285  
1286 1286                  error = segop_checkprot(seg, raddr, ssize, prot);
1287 1287                  if (error != 0)
1288 1288                          break;
1289 1289          }
                   /* as_setwatch() is called on success and on failure alike. */
1290 1290          as_setwatch(as);
1291 1291          AS_LOCK_EXIT(as, &as->a_lock);
1292 1292          return (error);
1293 1293  }
1294 1294  
1295 1295  int
1296 1296  as_unmap(struct as *as, caddr_t addr, size_t size)
1297 1297  {
1298 1298          struct seg *seg, *seg_next;
1299 1299          struct as_callback *cb;
1300 1300          caddr_t raddr, eaddr;
1301 1301          size_t ssize, rsize = 0;
1302 1302          int err;
1303 1303  
1304 1304  top:
1305 1305          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1306 1306          eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1307 1307              (uintptr_t)PAGEMASK);
1308 1308  
1309 1309          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1310 1310  
1311 1311          as->a_updatedir = 1;    /* inform /proc */
1312 1312          gethrestime(&as->a_updatetime);
1313 1313  
1314 1314          /*
1315 1315           * Use as_findseg to find the first segment in the range, then
1316 1316           * step through the segments in order, following s_next.
1317 1317           */
1318 1318          as_clearwatchprot(as, raddr, eaddr - raddr);
1319 1319  
1320 1320          for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1321 1321                  if (eaddr <= seg->s_base)
1322 1322                          break;          /* eaddr was in a gap; all done */
1323 1323  
1324 1324                  /* this is implied by the test above */
1325 1325                  ASSERT(raddr < eaddr);
1326 1326  
1327 1327                  if (raddr < seg->s_base)
1328 1328                          raddr = seg->s_base;    /* raddr was in a gap */
1329 1329  
1330 1330                  if (eaddr > (seg->s_base + seg->s_size))
1331 1331                          ssize = seg->s_base + seg->s_size - raddr;
1332 1332                  else
1333 1333                          ssize = eaddr - raddr;
1334 1334  
1335 1335                  /*
1336 1336                   * Save next segment pointer since seg can be
1337 1337                   * destroyed during the segment unmap operation.
1338 1338                   */
1339 1339                  seg_next = AS_SEGNEXT(as, seg);
1340 1340  
1341 1341                  /*
1342 1342                   * We didn't count /dev/null mappings, so ignore them here.
1343 1343                   * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1344 1344                   * we have to do this check here while we have seg.)
1345 1345                   */
1346 1346                  rsize = 0;
1347 1347                  if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1348 1348                      !SEG_IS_PARTIAL_RESV(seg))
1349 1349                          rsize = ssize;
1350 1350  
1351 1351  retry:
1352 1352                  err = segop_unmap(seg, raddr, ssize);
1353 1353                  if (err == EAGAIN) {
1354 1354                          /*
1355 1355                           * Memory is currently locked.  It must be unlocked
1356 1356                           * before this operation can succeed through a retry.
1357 1357                           * The possible reasons for locked memory and
1358 1358                           * corresponding strategies for unlocking are:
1359 1359                           * (1) Normal I/O
1360 1360                           *      wait for a signal that the I/O operation
1361 1361                           *      has completed and the memory is unlocked.
1362 1362                           * (2) Asynchronous I/O
1363 1363                           *      The aio subsystem does not unlock pages when
1364 1364                           *      the I/O is completed. Those pages are unlocked
1365 1365                           *      when the application calls aiowait/aioerror.
1366 1366                           *      So, to prevent blocking forever, cv_broadcast()
1367 1367                           *      is done to wake up aio_cleanup_thread.
1368 1368                           *      Subsequently, segvn_reclaim will be called, and
1369 1369                           *      that will do AS_CLRUNMAPWAIT() and wake us up.
1370 1370                           * (3) Long term page locking:
1371 1371                           *      Drivers intending to have pages locked for a
1372 1372                           *      period considerably longer than for normal I/O
1373 1373                           *      (essentially forever) may have registered for a
1374 1374                           *      callback so they may unlock these pages on
1375 1375                           *      request. This is needed to allow this operation
1376 1376                           *      to succeed. Each entry on the callback list is
1377 1377                           *      examined. If the event or address range pertains
1378 1378                           *      the callback is invoked (unless it already is in
1379 1379                           *      progress). The a_contents lock must be dropped
1380 1380                           *      before the callback, so only one callback can
1381 1381                           *      be done at a time. Go to the top and do more
1382 1382                           *      until zero is returned. If zero is returned,
1383 1383                           *      either there were no callbacks for this event
1384 1384                           *      or they were already in progress.
1385 1385                           */
1386 1386                          mutex_enter(&as->a_contents);
1387 1387                          if (as->a_callbacks &&
1388 1388                              (cb = as_find_callback(as, AS_UNMAP_EVENT,
1389 1389                              seg->s_base, seg->s_size))) {
1390 1390                                  AS_LOCK_EXIT(as, &as->a_lock);
1391 1391                                  as_execute_callback(as, cb, AS_UNMAP_EVENT);
1392 1392                          } else if (!AS_ISNOUNMAPWAIT(as)) {
1393 1393                                  if (AS_ISUNMAPWAIT(as) == 0)
1394 1394                                          cv_broadcast(&as->a_cv);
1395 1395                                  AS_SETUNMAPWAIT(as);
1396 1396                                  AS_LOCK_EXIT(as, &as->a_lock);
1397 1397                                  while (AS_ISUNMAPWAIT(as))
1398 1398                                          cv_wait(&as->a_cv, &as->a_contents);
1399 1399                          } else {
1400 1400                                  /*
1401 1401                                   * We may have raced with
1402 1402                                   * segvn_reclaim()/segspt_reclaim(). In this
1403 1403                                   * case clean nounmapwait flag and retry since
1404 1404                                   * softlockcnt in this segment may be already
1405 1405                                   * 0.  We don't drop as writer lock so our
1406 1406                                   * number of retries without sleeping should
1407 1407                                   * be very small. See segvn_reclaim() for
1408 1408                                   * more comments.
1409 1409                                   */
1410 1410                                  AS_CLRNOUNMAPWAIT(as);
1411 1411                                  mutex_exit(&as->a_contents);
1412 1412                                  goto retry;
1413 1413                          }
1414 1414                          mutex_exit(&as->a_contents);
1415 1415                          goto top;
1416 1416                  } else if (err == IE_RETRY) {
1417 1417                          AS_LOCK_EXIT(as, &as->a_lock);
1418 1418                          goto top;
1419 1419                  } else if (err) {
1420 1420                          as_setwatch(as);
1421 1421                          AS_LOCK_EXIT(as, &as->a_lock);
1422 1422                          return (-1);
1423 1423                  }
1424 1424  
1425 1425                  as->a_size -= ssize;
1426 1426                  if (rsize)
1427 1427                          as->a_resvsize -= rsize;
1428 1428                  raddr += ssize;
1429 1429          }
1430 1430          AS_LOCK_EXIT(as, &as->a_lock);
1431 1431          return (0);
1432 1432  }
1433 1433  
           /*
            * Create the segment(s) backing [addr, addr + size) by calling crfp on
            * segments obtained from seg_alloc(), using the page size codes whose
            * bits are set in szcvec.
            *
            * If no size code above 0 is enabled (szcvec <= 1) a single segment with
            * vn_a->szc = 0 covers the whole range.  Otherwise the range is carved
            * up in two passes so that each piece starts on a boundary of the page
            * size it is created with: the first pass walks the enabled size codes
            * upwards and fills in the pieces in front of each larger boundary, the
            * second pass walks back down from the largest size towards eaddr.
            *
            * For every segment created, a_size and a_resvsize are increased by the
            * segment size.  In the multi-segment path *segcreated is set to 1 once
            * at least one segment exists, and vn_a->offset is advanced by the
            * segment size when a vnode or amp is supplied (it is forced to 0
            * otherwise).
            *
            * The caller must hold the address space lock as writer, and addr and
            * size must be PAGESIZE aligned (see the ASSERTs below).
            *
            * Returns 0 on success, ENOMEM if seg_alloc() fails, or the error
            * returned by crfp.  Segments created before a failure are left in
            * place; as_map_locked() unmaps the range when *segcreated is set.
            */
1434 1434  static int
1435 1435  as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1436 1436      int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1437 1437  {
1438 1438          uint_t szc;
1439 1439          uint_t nszc;
1440 1440          int error;
1441 1441          caddr_t a;
1442 1442          caddr_t eaddr;
1443 1443          size_t segsize;
1444 1444          struct seg *seg;
1445 1445          size_t pgsz;
1446 1446          int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1447 1447          uint_t save_szcvec;
1448 1448  
1449 1449          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1450 1450          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1451 1451          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1452 1452          ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1453 1453          if (!do_off) {
1454 1454                  vn_a->offset = 0;
1455 1455          }
1456 1456  
                   /* No size code above 0 is enabled: one szc 0 segment suffices. */
1457 1457          if (szcvec <= 1) {
1458 1458                  seg = seg_alloc(as, addr, size);
1459 1459                  if (seg == NULL) {
1460 1460                          return (ENOMEM);
1461 1461                  }
1462 1462                  vn_a->szc = 0;
1463 1463                  error = (*crfp)(seg, vn_a);
1464 1464                  if (error != 0) {
1465 1465                          seg_free(seg);
1466 1466                  } else {
1467 1467                          as->a_size += size;
1468 1468                          as->a_resvsize += size;
1469 1469                  }
1470 1470                  return (error);
1471 1471          }
1472 1472  
                   /*
                    * First pass, smallest to largest enabled size code.  nszc is
                    * the size code of the bit being examined and szc the previous
                    * enabled one.  If addr is not aligned to nszc's page size, the
                    * piece up to the next such boundary is created with szc.
                    */
1473 1473          eaddr = addr + size;
1474 1474          save_szcvec = szcvec;
1475 1475          szcvec >>= 1;
1476 1476          szc = 0;
1477 1477          nszc = 0;
1478 1478          while (szcvec) {
1479 1479                  if ((szcvec & 0x1) == 0) {
1480 1480                          nszc++;
1481 1481                          szcvec >>= 1;
1482 1482                          continue;
1483 1483                  }
1484 1484                  nszc++;
1485 1485                  pgsz = page_get_pagesize(nszc);
1486 1486                  a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1487 1487                  if (a != addr) {
1488 1488                          ASSERT(a < eaddr);
1489 1489                          segsize = a - addr;
1490 1490                          seg = seg_alloc(as, addr, segsize);
1491 1491                          if (seg == NULL) {
1492 1492                                  return (ENOMEM);
1493 1493                          }
1494 1494                          vn_a->szc = szc;
1495 1495                          error = (*crfp)(seg, vn_a);
1496 1496                          if (error != 0) {
1497 1497                                  seg_free(seg);
1498 1498                                  return (error);
1499 1499                          }
1500 1500                          as->a_size += segsize;
1501 1501                          as->a_resvsize += segsize;
1502 1502                          *segcreated = 1;
1503 1503                          if (do_off) {
1504 1504                                  vn_a->offset += segsize;
1505 1505                          }
1506 1506                          addr = a;
1507 1507                  }
1508 1508                  szc = nszc;
1509 1509                  szcvec >>= 1;
1510 1510          }
1511 1511  
                   /*
                    * Second pass, largest to smallest.  szc and pgsz still hold
                    * the largest enabled size from the first pass.  Each round
                    * creates a segment up to eaddr aligned down to pgsz, clears
                    * that size code from szcvec and picks the next one with
                    * highbit() on what is left.  Bit 0 is forced on so that the
                    * tail is always covered by base pages.
                    */
1512 1512          ASSERT(addr < eaddr);
1513 1513          szcvec = save_szcvec | 1; /* add 8K pages */
1514 1514          while (szcvec) {
1515 1515                  a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1516 1516                  ASSERT(a >= addr);
1517 1517                  if (a != addr) {
1518 1518                          segsize = a - addr;
1519 1519                          seg = seg_alloc(as, addr, segsize);
1520 1520                          if (seg == NULL) {
1521 1521                                  return (ENOMEM);
1522 1522                          }
1523 1523                          vn_a->szc = szc;
1524 1524                          error = (*crfp)(seg, vn_a);
1525 1525                          if (error != 0) {
1526 1526                                  seg_free(seg);
1527 1527                                  return (error);
1528 1528                          }
1529 1529                          as->a_size += segsize;
1530 1530                          as->a_resvsize += segsize;
1531 1531                          *segcreated = 1;
1532 1532                          if (do_off) {
1533 1533                                  vn_a->offset += segsize;
1534 1534                          }
1535 1535                          addr = a;
1536 1536                  }
1537 1537                  szcvec &= ~(1 << szc);
1538 1538                  if (szcvec) {
1539 1539                          szc = highbit(szcvec) - 1;
1540 1540                          pgsz = page_get_pagesize(szc);
1541 1541                  }
1542 1542          }
1543 1543          ASSERT(addr == eaddr);
1544 1544  
1545 1545          return (0);
1546 1546  }
1547 1547  
           /*
            * Create a vnode-backed mapping of [addr, addr + size), using large
            * pages where map_pgszcvec() allows them.
            *
            * If no size code above 0 is enabled, or the file size cannot be
            * obtained with VOP_GETATTR(), or the page-aligned mapping offset is at
            * or beyond the file size, a single segment with szc 0 is created for
            * the whole range.
            *
            * If the mapping extends past the last page of the file, only the part
            * up to the page-rounded end of file is handed to as_map_segvn_segs()
            * (with a szcvec recomputed for that shorter size); the remainder is
            * then created as one szc 0 segment by looping back to "again".
            *
            * vn_a->flags gets _MAP_TEXTREPL set when the size passed to
            * as_map_segvn_segs() exceeds textrepl_size_thresh.
            *
            * The caller must hold the address space lock as writer; addr and size
            * must be PAGESIZE aligned; vn_a->vp must be set and vn_a->amp NULL
            * (see the ASSERTs below).
            *
            * Returns 0 on success, ENOMEM if seg_alloc() fails, or the error from
            * crfp / as_map_segvn_segs().
            */
1548 1548  static int
1549 1549  as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1550 1550      int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1551 1551  {
1552 1552          uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1553 1553          int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1554 1554          uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1555 1555              type, 0);
1556 1556          int error;
1557 1557          struct seg *seg;
1558 1558          struct vattr va;
1559 1559          u_offset_t eoff;
1560 1560          size_t save_size = 0;
1561 1561          extern size_t textrepl_size_thresh;
1562 1562  
1563 1563          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1564 1564          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1565 1565          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1566 1566          ASSERT(vn_a->vp != NULL);
1567 1567          ASSERT(vn_a->amp == NULL);
1568 1568  
1569 1569  again:
                   /*
                    * Single-segment path.  Reached directly when no large page
                    * size is enabled, and via "goto again" with szcvec forced to 0
                    * for the fallback cases and for the tail past end of file.
                    */
1570 1570          if (szcvec <= 1) {
1571 1571                  seg = seg_alloc(as, addr, size);
1572 1572                  if (seg == NULL) {
1573 1573                          return (ENOMEM);
1574 1574                  }
1575 1575                  vn_a->szc = 0;
1576 1576                  error = (*crfp)(seg, vn_a);
1577 1577                  if (error != 0) {
1578 1578                          seg_free(seg);
1579 1579                  } else {
1580 1580                          as->a_size += size;
1581 1581                          as->a_resvsize += size;
1582 1582                  }
1583 1583                  return (error);
1584 1584          }
1585 1585  
1586 1586          va.va_mask = AT_SIZE;
1587 1587          if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1588 1588                  szcvec = 0;
1589 1589                  goto again;
1590 1590          }
1591 1591          eoff = vn_a->offset & PAGEMASK;
1592 1592          if (eoff >= va.va_size) {
1593 1593                  szcvec = 0;
1594 1594                  goto again;
1595 1595          }
1596 1596          eoff += size;
                   /*
                    * The mapping ends on a later page than the file does: shrink
                    * size to the page-rounded in-file part and remember the full
                    * size in save_size so the tail can be mapped afterwards.  If
                    * the shorter range no longer qualifies for large pages, map
                    * the whole original range as one segment instead.
                    */
1597 1597          if (btopr(va.va_size) < btopr(eoff)) {
1598 1598                  save_size = size;
1599 1599                  size = va.va_size - (vn_a->offset & PAGEMASK);
1600 1600                  size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1601 1601                  szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1602 1602                      type, 0);
1603 1603                  if (szcvec <= 1) {
1604 1604                          size = save_size;
1605 1605                          goto again;
1606 1606                  }
1607 1607          }
1608 1608  
1609 1609          if (size > textrepl_size_thresh) {
1610 1610                  vn_a->flags |= _MAP_TEXTREPL;
1611 1611          }
1612 1612          error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1613 1613              segcreated);
1614 1614          if (error != 0) {
1615 1615                  return (error);
1616 1616          }
                   /*
                    * Map what is left beyond end of file.  vn_a->offset has
                    * already been advanced by as_map_segvn_segs() for the part
                    * mapped above.
                    */
1617 1617          if (save_size) {
1618 1618                  addr += size;
1619 1619                  size = save_size - size;
1620 1620                  szcvec = 0;
1621 1621                  goto again;
1622 1622          }
1623 1623          return (0);
1624 1624  }
1625 1625  
1626 1626  /*
1627 1627   * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1628 1628   * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
            *
            * The map_pgszcvec() type is chosen from vn_a->type and, for private
            * mappings, from vn_a->szc (AS_MAP_HEAP and AS_MAP_STACK select the heap
            * and stack types).  The resulting page size vector is then handed to
            * as_map_segvn_segs(), whose return value is returned unchanged.
            *
            * The caller must hold the address space lock as writer; addr and size
            * must be PAGESIZE aligned and vn_a->vp must be NULL (see the ASSERTs
            * below).
1629 1629   */
1630 1630  static int
1631 1631  as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1632 1632      int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1633 1633  {
1634 1634          uint_t szcvec;
1635 1635          uchar_t type;
1636 1636  
                   /*
                    * NOTE(review): type is assigned only for MAP_SHARED and
                    * MAP_PRIVATE; the ASSERT is the only guard against any other
                    * vn_a->type reaching map_pgszcvec() with type unset.
                    */
1637 1637          ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1638 1638          if (vn_a->type == MAP_SHARED) {
1639 1639                  type = MAPPGSZC_SHM;
1640 1640          } else if (vn_a->type == MAP_PRIVATE) {
1641 1641                  if (vn_a->szc == AS_MAP_HEAP) {
1642 1642                          type = MAPPGSZC_HEAP;
1643 1643                  } else if (vn_a->szc == AS_MAP_STACK) {
1644 1644                          type = MAPPGSZC_STACK;
1645 1645                  } else {
1646 1646                          type = MAPPGSZC_PRIVM;
1647 1647                  }
1648 1648          }
                   /*
                    * Without an amp the third argument is the mapping address
                    * itself; with one it is the page-rounded amp offset.
                    */
1649 1649          szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1650 1650              (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1651 1651              (vn_a->flags & MAP_TEXT), type, 0);
1652 1652          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1653 1653          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1654 1654          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1655 1655          ASSERT(vn_a->vp == NULL);
1656 1656  
1657 1657          return (as_map_segvn_segs(as, addr, size, szcvec,
1658 1658              crfp, vn_a, segcreated));
1659 1659  }
1660 1660  
           /*
            * Map [addr, addr + size) into as using the segment create routine crfp
            * and its arguments argsp.
            *
            * Takes the address space lock as writer and hands off to
            * as_map_locked().  The lock is not released here: as_map_locked() drops
            * it on every one of its return paths.  Returns whatever
            * as_map_locked() returns.
            */
1661 1661  int
1662 1662  as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1663 1663  {
1664 1664          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1665 1665          return (as_map_locked(as, addr, size, crfp, argsp));
1666 1666  }
1667 1667  
           /*
            * Map [addr, addr + size) into as by creating segment(s) through the
            * segment create routine crfp with arguments argsp.
            *
            * The address space lock is released on every return path of this
            * function; as_map() acquires it as writer before calling here.
            *
            * The range is page-rounded into raddr/rsize.  ENOMEM is returned if
            * the rounded range wraps, or, for any as other than kas, if
            * a_size + rsize would exceed curproc's p_vmem_ctl (rctl_action() is
            * invoked for RLIMIT_VMEM in that case).
            *
            * When AS_MAP_CHECK_VNODE_LPOOB() or AS_MAP_CHECK_ANON_LPOOB() accepts
            * (crfp, argsp), a private copy of the segvn_crargs is passed to
            * as_map_vnsegs() or as_map_ansegs() respectively.  If either fails
            * after having created segments (unmap set), the range is unmapped
            * again.  Otherwise a single segment is allocated and crfp is called on
            * it directly.
            *
            * If the address space has AS_ISPGLCK set, as_ctl(MC_LOCK) is applied
            * to the new range and the range is unmapped if that fails.
            *
            * Returns 0 on success or an errno value.
            */
1668 1668  int
1669 1669  as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1670 1670                  void *argsp)
1671 1671  {
1672 1672          struct seg *seg = NULL;
1673 1673          caddr_t raddr;                  /* rounded down addr */
1674 1674          size_t rsize;                   /* rounded up size */
1675 1675          int error;
1676 1676          int unmap = 0;
1677 1677          struct proc *p = curproc;
1678 1678          struct segvn_crargs crargs;
1679 1679  
1680 1680          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1681 1681          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1682 1682              (size_t)raddr;
1683 1683  
1684 1684          /*
1685 1685           * check for wrap around
                    *
                    * NOTE(review): the a_size overflow test uses the caller's
                    * size, while the amount later added to a_size is the rounded
                    * rsize.
1686 1686           */
1687 1687          if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1688 1688                  AS_LOCK_EXIT(as, &as->a_lock);
1689 1689                  return (ENOMEM);
1690 1690          }
1691 1691  
1692 1692          as->a_updatedir = 1;    /* inform /proc */
1693 1693          gethrestime(&as->a_updatetime);
1694 1694  
1695 1695          if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1696 1696                  AS_LOCK_EXIT(as, &as->a_lock);
1697 1697  
1698 1698                  (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1699 1699                      RCA_UNSAFE_ALL);
1700 1700  
1701 1701                  return (ENOMEM);
1702 1702          }
1703 1703  
                   /*
                    * In the two large-page branches the helpers account for
                    * a_size/a_resvsize themselves and work on a local copy of the
                    * create args, so the caller's argsp is left untouched.
                    */
1704 1704          if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1705 1705                  crargs = *(struct segvn_crargs *)argsp;
1706 1706                  error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1707 1707                  if (error != 0) {
1708 1708                          AS_LOCK_EXIT(as, &as->a_lock);
1709 1709                          if (unmap) {
1710 1710                                  (void) as_unmap(as, addr, size);
1711 1711                          }
1712 1712                          return (error);
1713 1713                  }
1714 1714          } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1715 1715                  crargs = *(struct segvn_crargs *)argsp;
1716 1716                  error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1717 1717                  if (error != 0) {
1718 1718                          AS_LOCK_EXIT(as, &as->a_lock);
1719 1719                          if (unmap) {
1720 1720                                  (void) as_unmap(as, addr, size);
1721 1721                          }
1722 1722                          return (error);
1723 1723                  }
1724 1724          } else {
1725 1725                  seg = seg_alloc(as, addr, size);
1726 1726                  if (seg == NULL) {
1727 1727                          AS_LOCK_EXIT(as, &as->a_lock);
1728 1728                          return (ENOMEM);
1729 1729                  }
1730 1730  
1731 1731                  error = (*crfp)(seg, argsp);
1732 1732                  if (error != 0) {
1733 1733                          seg_free(seg);
1734 1734                          AS_LOCK_EXIT(as, &as->a_lock);
1735 1735                          return (error);
1736 1736                  }
1737 1737                  /*
1738 1738                   * Add size now so as_unmap will work if as_ctl fails.
1739 1739                   */
1740 1740                  as->a_size += rsize;
1741 1741                  as->a_resvsize += rsize;
1742 1742          }
1743 1743  
1744 1744          as_setwatch(as);
1745 1745  
1746 1746          /*
1747 1747           * If the address space is locked,
1748 1748           * establish memory locks for the new segment.
                    *
                    * error is 0 on entry to this point (every failing branch above
                    * has already returned), so the else arm returns success.  Both
                    * a_contents and the address space lock are dropped before
                    * as_ctl()/as_unmap() are called.
1749 1749           */
1750 1750          mutex_enter(&as->a_contents);
1751 1751          if (AS_ISPGLCK(as)) {
1752 1752                  mutex_exit(&as->a_contents);
1753 1753                  AS_LOCK_EXIT(as, &as->a_lock);
1754 1754                  error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1755 1755                  if (error != 0)
1756 1756                          (void) as_unmap(as, addr, size);
1757 1757          } else {
1758 1758                  mutex_exit(&as->a_contents);
1759 1759                  AS_LOCK_EXIT(as, &as->a_lock);
1760 1760          }
1761 1761          return (error);
1762 1762  }
1763 1763  
1764 1764  
1765 1765  /*
1766 1766   * Delete all segments in the address space marked with S_PURGE.
1767 1767   * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1768 1768   * These segments are deleted as a first step before calls to as_gap(), so
1769 1769   * that they don't affect mmap() or shmat().
            *
            * Does nothing unless AS_NEEDSPURGE is set in a_flags.  Otherwise the
            * segment list is walked with the address space lock held as writer,
            * and AS_NEEDSPURGE is cleared under a_contents once the walk is done.
1770 1770   */
1771 1771  void
1772 1772  as_purge(struct as *as)
1773 1773  {
1774 1774          struct seg *seg;
1775 1775          struct seg *next_seg;
1776 1776  
1777 1777          /*
1778 1778           * the setting of NEEDSPURGE is protected by as_rangelock(), so
1779 1779           * no need to grab a_contents mutex for this check
1780 1780           */
1781 1781          if ((as->a_flags & AS_NEEDSPURGE) == 0)
1782 1782                  return;
1783 1783  
1784 1784          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1785 1785          next_seg = NULL;
1786 1786          seg = AS_SEGFIRST(as);
                   /*
                    * The successor is fetched before unmapping because the unmap
                    * can destroy seg (as_unmap() does the same).  The result of
                    * segop_unmap() is not checked here.
                    */
1787 1787          while (seg != NULL) {
1788 1788                  next_seg = AS_SEGNEXT(as, seg);
1789 1789                  if (seg->s_flags & S_PURGE)
1790 1790                          segop_unmap(seg, seg->s_base, seg->s_size);
1791 1791                  seg = next_seg;
1792 1792          }
1793 1793          AS_LOCK_EXIT(as, &as->a_lock);
1794 1794  
1795 1795          mutex_enter(&as->a_contents);
1796 1796          as->a_flags &= ~AS_NEEDSPURGE;
1797 1797          mutex_exit(&as->a_contents);
1798 1798  }
1799 1799  
1800 1800  /*
1801 1801   * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1802 1802   * range of addresses at least "minlen" long, where the base of the range is
1803 1803   * at "off" phase from an "align" boundary and there is space for a
1804 1804   * "redzone"-sized redzone on either side of the range.  Thus,
1805 1805   * if align was 4M and off was 16k, the user wants a hole which will start
1806 1806   * 16k into a 4M page.
1807 1807   *
1808 1808   * If flags specifies AH_HI, the hole will have the highest possible address
1809 1809   * in the range.  We use the as->a_lastgap field to figure out where to
1810 1810   * start looking for a gap.
1811 1811   *
1812 1812   * Otherwise, the gap will have the lowest possible address.
1813 1813   *
1814 1814   * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1815 1815   *
1816 1816   * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1817 1817   * the hole that is within range, and 0 is returned. On failure, -1 is returned.
            * On failure *basep and *lenp are restored to the values they had on
            * entry.
            *
            * The address space lock is taken as reader for the duration of the
            * search and released before returning.
1818 1818   *
1819 1819   * NOTE: This routine is not correct when base+len overflows caddr_t.
1820 1820   */
1821 1821  int
1822 1822  as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1823 1823      uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1824 1824  {
1825 1825          caddr_t lobound = *basep;
1826 1826          caddr_t hibound = lobound + *lenp;
1827 1827          struct seg *lseg, *hseg;
1828 1828          caddr_t lo, hi;
1829 1829          int forward;
1830 1830          caddr_t save_base;
1831 1831          size_t save_len;
1832 1832          size_t save_minlen;
1833 1833          size_t save_redzone;
1834 1834          int fast_path = 1;
1835 1835  
1836 1836          save_base = *basep;
1837 1837          save_len = *lenp;
1838 1838          save_minlen = minlen;
1839 1839          save_redzone = redzone;
1840 1840  
1841 1841          /*
1842 1842           * For the first pass/fast_path, just add align and redzone into
1843 1843           * minlen since if we get an allocation, we can guarantee that it
1844 1844           * will fit the alignment and redzone requested.
1845 1845           * This increases the chance that hibound will be adjusted to
1846 1846           * a_lastgap->s_base which will likely allow us to find an
1847 1847           * acceptable hole in the address space quicker.
1848 1848           * If we can't find a hole with this fast_path, then we look for
1849 1849           * smaller holes in which the alignment and offset may allow
1850 1850           * the allocation to fit.
1851 1851           */
1852 1852          minlen += align;
1853 1853          minlen += 2 * redzone;
1854 1854          redzone = 0;
1855 1855  
1856 1856          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
                   /*
                    * With no segments at all the whole requested range is one
                    * hole; only the range validity check is needed.
                    */
1857 1857          if (AS_SEGFIRST(as) == NULL) {
1858 1858                  if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1859 1859                      align, redzone, off)) {
1860 1860                          AS_LOCK_EXIT(as, &as->a_lock);
1861 1861                          return (0);
1862 1862                  } else {
1863 1863                          AS_LOCK_EXIT(as, &as->a_lock);
1864 1864                          *basep = save_base;
1865 1865                          *lenp = save_len;
1866 1866                          return (-1);
1867 1867                  }
1868 1868          }
1869 1869  
1870 1870  retry:
1871 1871          /*
1872 1872           * Set up to iterate over all the inter-segment holes in the given
1873 1873           * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1874 1874           * NULL for the highest-addressed hole.  If moving backwards, we reset
1875 1875           * sseg to denote the highest-addressed segment.
                    *
                    * NOTE(review): no variable named sseg exists in this function;
                    * the backward case below initializes hseg/lseg from
                    * as_findseg(as, hibound, 1).
1876 1876           */
1877 1877          forward = (flags & AH_DIR) == AH_LO;
1878 1878          if (forward) {
1879 1879                  hseg = as_findseg(as, lobound, 1);
1880 1880                  lseg = AS_SEGPREV(as, hseg);
1881 1881          } else {
1882 1882  
1883 1883                  /*
1884 1884                   * If allocating at least as much as the last allocation,
1885 1885                   * use a_lastgap's base as a better estimate of hibound.
1886 1886                   */
1887 1887                  if (as->a_lastgap &&
1888 1888                      minlen >= as->a_lastgap->s_size &&
1889 1889                      hibound >= as->a_lastgap->s_base)
1890 1890                          hibound = as->a_lastgap->s_base;
1891 1891  
                           /*
                            * If the segment found ends below hibound, the first
                            * hole to look at is the one above it (hseg == NULL).
                            */
1892 1892                  hseg = as_findseg(as, hibound, 1);
1893 1893                  if (hseg->s_base + hseg->s_size < hibound) {
1894 1894                          lseg = hseg;
1895 1895                          hseg = NULL;
1896 1896                  } else {
1897 1897                          lseg = AS_SEGPREV(as, hseg);
1898 1898                  }
1899 1899          }
1900 1900  
1901 1901          for (;;) {
1902 1902                  /*
1903 1903                   * Set lo and hi to the hole's boundaries.  (We should really
1904 1904                   * use MAXADDR in place of hibound in the expression below,
1905 1905                   * but can't express it easily; using hibound in its place is
1906 1906                   * harmless.)
1907 1907                   */
1908 1908                  lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1909 1909                  hi = (hseg == NULL) ? hibound : hseg->s_base;
1910 1910                  /*
1911 1911                   * If the iteration has moved past the interval from lobound
1912 1912                   * to hibound it's pointless to continue.
1913 1913                   */
1914 1914                  if ((forward && lo > hibound) || (!forward && hi < lobound))
1915 1915                          break;
1916 1916                  else if (lo > hibound || hi < lobound)
1917 1917                          goto cont;
1918 1918                  /*
1919 1919                   * Candidate hole lies at least partially within the allowable
1920 1920                   * range.  Restrict it to fall completely within that range,
1921 1921                   * i.e., to [max(lo, lobound), min(hi, hibound)].
1922 1922                   */
1923 1923                  if (lo < lobound)
1924 1924                          lo = lobound;
1925 1925                  if (hi > hibound)
1926 1926                          hi = hibound;
1927 1927                  /*
1928 1928                   * Verify that the candidate hole is big enough and meets
1929 1929                   * hardware constraints.  If the hole is too small, no need
1930 1930                   * to do the further checks since they will fail.
                            *
                            * On success the search hints a_lastgap (backward
                            * searches only) and a_lastgaphl are updated.
                            * NOTE(review): these fields are written with a_lock
                            * held only as reader; confirm that callers serialize
                            * by other means.
1931 1931                   */
1932 1932                  *basep = lo;
1933 1933                  *lenp = hi - lo;
1934 1934                  if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1935 1935                      minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1936 1936                      ((flags & AH_CONTAIN) == 0 ||
1937 1937                      (*basep <= addr && *basep + *lenp > addr))) {
1938 1938                          if (!forward)
1939 1939                                  as->a_lastgap = hseg;
1940 1940                          if (hseg != NULL)
1941 1941                                  as->a_lastgaphl = hseg;
1942 1942                          else
1943 1943                                  as->a_lastgaphl = lseg;
1944 1944                          AS_LOCK_EXIT(as, &as->a_lock);
1945 1945                          return (0);
1946 1946                  }
1947 1947          cont:
1948 1948                  /*
1949 1949                   * Move to the next hole.
1950 1950                   */
1951 1951                  if (forward) {
1952 1952                          lseg = hseg;
1953 1953                          if (lseg == NULL)
1954 1954                                  break;
1955 1955                          hseg = AS_SEGNEXT(as, hseg);
1956 1956                  } else {
1957 1957                          hseg = lseg;
1958 1958                          if (hseg == NULL)
1959 1959                                  break;
1960 1960                          lseg = AS_SEGPREV(as, lseg);
1961 1961                  }
1962 1962          }
                   /*
                    * The fast path (minlen padded with align and both redzones)
                    * found nothing.  If padding was actually requested, search
                    * once more with the caller's original minlen and redzone.
                    */
1963 1963          if (fast_path && (align != 0 || save_redzone != 0)) {
1964 1964                  fast_path = 0;
1965 1965                  minlen = save_minlen;
1966 1966                  redzone = save_redzone;
1967 1967                  goto retry;
1968 1968          }
1969 1969          *basep = save_base;
1970 1970          *lenp = save_len;
1971 1971          AS_LOCK_EXIT(as, &as->a_lock);
1972 1972          return (-1);
1973 1973  }
1974 1974  
1975 1975  /*
1976 1976   * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1977 1977   *
1978 1978   * If flags specifies AH_HI, the hole will have the highest possible address
1979 1979   * in the range.  We use the as->a_lastgap field to figure out where to
1980 1980   * start looking for a gap.
1981 1981   *
1982 1982   * Otherwise, the gap will have the lowest possible address.
1983 1983   *
1984 1984   * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1985 1985   *
1986 1986   * If an adequate hole is found, base and len are set to reflect the part of
1987 1987   * the hole that is within range, and 0 is returned, otherwise,
1988 1988   * -1 is returned.
            *
            * This is as_gap_aligned() with align, redzone and off all passed as 0.
1989 1989   *
1990 1990   * NOTE: This routine is not correct when base+len overflows caddr_t.
1991 1991   */
1992 1992  int
1993 1993  as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1994 1994      caddr_t addr)
1995 1995  {
1996 1996  
1997 1997          return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1998 1998  }
1999 1999  
2000 2000  /*
2001 2001   * Return the next range within [base, base + len) that is backed
2002 2002   * with "real memory".  Skip holes and non-seg_vn segments.
2003 2003   * We're lazy and only return one segment at a time.
            *
            * Besides segvn segments, segments using segspt_shmops are also
            * accepted, limited to the size reported by spt_realsize().
            *
            * On success 0 is returned and *basep / *lenp describe the part of the
            * first qualifying segment that lies inside the requested range.  If no
            * qualifying segment is found before the end of the range, EINVAL is
            * returned and *basep / *lenp are left untouched.
            *
            * The address space lock is held as reader during the search.
2004 2004   */
2005 2005  int
2006 2006  as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2007 2007  {
2008 2008          extern struct seg_ops segspt_shmops;    /* needs a header file */
2009 2009          struct seg *seg;
2010 2010          caddr_t addr, eaddr;
2011 2011          caddr_t segend;
2012 2012  
2013 2013          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2014 2014  
2015 2015          addr = *basep;
2016 2016          eaddr = addr + *lenp;
2017 2017  
                   /* Start at the caller's base, or at the segment base if in a gap. */
2018 2018          seg = as_findseg(as, addr, 0);
2019 2019          if (seg != NULL)
2020 2020                  addr = MAX(seg->s_base, addr);
2021 2021  
2022 2022          for (;;) {
                           /* Ran out of segments or walked past the requested range. */
2023 2023                  if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2024 2024                          AS_LOCK_EXIT(as, &as->a_lock);
2025 2025                          return (EINVAL);
2026 2026                  }
2027 2027  
2028 2028                  if (seg->s_ops == &segvn_ops) {
2029 2029                          segend = seg->s_base + seg->s_size;
2030 2030                          break;
2031 2031                  }
2032 2032  
2033 2033                  /*
2034 2034                   * We do ISM by looking into the private data
2035 2035                   * to determine the real size of the segment.
2036 2036                   */
2037 2037                  if (seg->s_ops == &segspt_shmops) {
2038 2038                          segend = seg->s_base + spt_realsize(seg);
2039 2039                          if (addr < segend)
2040 2040                                  break;
2041 2041                  }
2042 2042  
2043 2043                  seg = AS_SEGNEXT(as, seg);
2044 2044  
2045 2045                  if (seg != NULL)
2046 2046                          addr = seg->s_base;
2047 2047          }
2048 2048  
2049 2049          *basep = addr;
2050 2050  
                   /* Clip the returned length to the end of the requested range. */
2051 2051          if (segend > eaddr)
2052 2052                  *lenp = eaddr - addr;
2053 2053          else
2054 2054                  *lenp = segend - addr;
2055 2055  
2056 2056          AS_LOCK_EXIT(as, &as->a_lock);
2057 2057          return (0);
2058 2058  }
2059 2059  
2060 2060  /*
2061 2061   * Determine whether data from the mappings in interval [addr, addr + size)
2062 2062   * are in the primary memory (core) cache.
2063 2063   */
2064 2064  int
2065 2065  as_incore(struct as *as, caddr_t addr,
2066 2066      size_t size, char *vec, size_t *sizep)
2067 2067  {
2068 2068          struct seg *seg;
2069 2069          size_t ssize;
2070 2070          caddr_t raddr;          /* rounded down addr */
2071 2071          size_t rsize;           /* rounded up size */
2072 2072          size_t isize;                   /* iteration size */
2073 2073          int error = 0;          /* result, assume success */
2074 2074  
2075 2075          *sizep = 0;
2076 2076          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2077 2077          rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2078 2078              (size_t)raddr;
2079 2079  
2080 2080          if (raddr + rsize < raddr)              /* check for wraparound */
2081 2081                  return (ENOMEM);
2082 2082  
2083 2083          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2084 2084          seg = as_segat(as, raddr);
2085 2085          if (seg == NULL) {
2086 2086                  AS_LOCK_EXIT(as, &as->a_lock);
2087 2087                  return (-1);
2088 2088          }
2089 2089  
2090 2090          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2091 2091                  if (raddr >= seg->s_base + seg->s_size) {
2092 2092                          seg = AS_SEGNEXT(as, seg);
2093 2093                          if (seg == NULL || raddr != seg->s_base) {
2094 2094                                  error = -1;
2095 2095                                  break;
2096 2096                          }
2097 2097                  }
2098 2098                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
2099 2099                          ssize = seg->s_base + seg->s_size - raddr;
2100 2100                  else
2101 2101                          ssize = rsize;
2102 2102                  *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2103 2103                  if (isize != ssize) {
2104 2104                          error = -1;
2105 2105                          break;
2106 2106                  }
2107 2107                  vec += btopr(ssize);
2108 2108          }
2109 2109          AS_LOCK_EXIT(as, &as->a_lock);
2110 2110          return (error);
2111 2111  }
2112 2112  
2113 2113  static void
2114 2114  as_segunlock(struct seg *seg, caddr_t addr, int attr,
2115 2115          ulong_t *bitmap, size_t position, size_t npages)
2116 2116  {
2117 2117          caddr_t range_start;
2118 2118          size_t  pos1 = position;
2119 2119          size_t  pos2;
2120 2120          size_t  size;
2121 2121          size_t  end_pos = npages + position;
2122 2122  
2123 2123          while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2124 2124                  size = ptob((pos2 - pos1));
2125 2125                  range_start = (caddr_t)((uintptr_t)addr +
2126 2126                      ptob(pos1 - position));
2127 2127  
2128 2128                  (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2129 2129                      (ulong_t *)NULL, (size_t)NULL);
2130 2130                  pos1 = pos2;
2131 2131          }
2132 2132  }
2133 2133  
2134 2134  static void
2135 2135  as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2136 2136          caddr_t raddr, size_t rsize)
2137 2137  {
2138 2138          struct seg *seg = as_segat(as, raddr);
2139 2139          size_t ssize;
2140 2140  
2141 2141          while (rsize != 0) {
2142 2142                  if (raddr >= seg->s_base + seg->s_size)
2143 2143                          seg = AS_SEGNEXT(as, seg);
2144 2144  
2145 2145                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
2146 2146                          ssize = seg->s_base + seg->s_size - raddr;
2147 2147                  else
2148 2148                          ssize = rsize;
2149 2149  
2150 2150                  as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2151 2151  
2152 2152                  rsize -= ssize;
2153 2153                  raddr += ssize;
2154 2154          }
2155 2155  }
2156 2156  
2157 2157  /*
2158 2158   * Cache control operations over the interval [addr, addr + size) in
2159 2159   * address space "as".
2160 2160   */
2161 2161  /*ARGSUSED*/
2162 2162  int
2163 2163  as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2164 2164      uintptr_t arg, ulong_t *lock_map, size_t pos)
2165 2165  {
2166 2166          struct seg *seg;        /* working segment */
2167 2167          caddr_t raddr;          /* rounded down addr */
2168 2168          caddr_t initraddr;      /* saved initial rounded down addr */
2169 2169          size_t rsize;           /* rounded up size */
2170 2170          size_t initrsize;       /* saved initial rounded up size */
2171 2171          size_t ssize;           /* size of seg */
2172 2172          int error = 0;                  /* result */
2173 2173          size_t mlock_size;      /* size of bitmap */
2174 2174          ulong_t *mlock_map;     /* pointer to bitmap used */
2175 2175                                  /* to represent the locked */
2176 2176                                  /* pages. */
2177 2177  retry:
2178 2178          if (error == IE_RETRY)
2179 2179                  AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2180 2180          else
2181 2181                  AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2182 2182  
2183 2183          /*
2184 2184           * If these are address space lock/unlock operations, loop over
2185 2185           * all segments in the address space, as appropriate.
2186 2186           */
2187 2187          if (func == MC_LOCKAS) {
2188 2188                  size_t npages, idx;
2189 2189                  size_t rlen = 0;        /* rounded as length */
2190 2190  
2191 2191                  idx = pos;
2192 2192  
2193 2193                  if (arg & MCL_FUTURE) {
2194 2194                          mutex_enter(&as->a_contents);
2195 2195                          AS_SETPGLCK(as);
2196 2196                          mutex_exit(&as->a_contents);
2197 2197                  }
2198 2198                  if ((arg & MCL_CURRENT) == 0) {
2199 2199                          AS_LOCK_EXIT(as, &as->a_lock);
2200 2200                          return (0);
2201 2201                  }
2202 2202  
2203 2203                  seg = AS_SEGFIRST(as);
2204 2204                  if (seg == NULL) {
2205 2205                          AS_LOCK_EXIT(as, &as->a_lock);
2206 2206                          return (0);
2207 2207                  }
2208 2208  
2209 2209                  do {
2210 2210                          raddr = (caddr_t)((uintptr_t)seg->s_base &
2211 2211                              (uintptr_t)PAGEMASK);
2212 2212                          rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2213 2213                              PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2214 2214                  } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2215 2215  
2216 2216                  mlock_size = BT_BITOUL(btopr(rlen));
2217 2217                  if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2218 2218                      sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2219 2219                                  AS_LOCK_EXIT(as, &as->a_lock);
2220 2220                                  return (EAGAIN);
2221 2221                  }
2222 2222  
2223 2223                  for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2224 2224                          error = segop_lockop(seg, seg->s_base,
2225 2225                              seg->s_size, attr, MC_LOCK, mlock_map, pos);
2226 2226                          if (error != 0)
2227 2227                                  break;
2228 2228                          pos += seg_pages(seg);
2229 2229                  }
2230 2230  
2231 2231                  if (error) {
2232 2232                          for (seg = AS_SEGFIRST(as); seg != NULL;
2233 2233                              seg = AS_SEGNEXT(as, seg)) {
2234 2234  
2235 2235                                  raddr = (caddr_t)((uintptr_t)seg->s_base &
2236 2236                                      (uintptr_t)PAGEMASK);
2237 2237                                  npages = seg_pages(seg);
2238 2238                                  as_segunlock(seg, raddr, attr, mlock_map,
2239 2239                                      idx, npages);
2240 2240                                  idx += npages;
2241 2241                          }
2242 2242                  }
2243 2243  
2244 2244                  kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2245 2245                  AS_LOCK_EXIT(as, &as->a_lock);
2246 2246                  goto lockerr;
2247 2247          } else if (func == MC_UNLOCKAS) {
2248 2248                  mutex_enter(&as->a_contents);
2249 2249                  AS_CLRPGLCK(as);
2250 2250                  mutex_exit(&as->a_contents);
2251 2251  
2252 2252                  for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2253 2253                          error = segop_lockop(seg, seg->s_base,
2254 2254                              seg->s_size, attr, MC_UNLOCK, NULL, 0);
2255 2255                          if (error != 0)
2256 2256                                  break;
2257 2257                  }
2258 2258  
2259 2259                  AS_LOCK_EXIT(as, &as->a_lock);
2260 2260                  goto lockerr;
2261 2261          }
2262 2262  
2263 2263          /*
2264 2264           * Normalize addresses and sizes.
2265 2265           */
2266 2266          initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2267 2267          initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2268 2268              (size_t)raddr;
2269 2269  
2270 2270          if (raddr + rsize < raddr) {            /* check for wraparound */
2271 2271                  AS_LOCK_EXIT(as, &as->a_lock);
2272 2272                  return (ENOMEM);
2273 2273          }
2274 2274  
2275 2275          /*
2276 2276           * Get initial segment.
2277 2277           */
2278 2278          if ((seg = as_segat(as, raddr)) == NULL) {
2279 2279                  AS_LOCK_EXIT(as, &as->a_lock);
2280 2280                  return (ENOMEM);
2281 2281          }
2282 2282  
2283 2283          if (func == MC_LOCK) {
2284 2284                  mlock_size = BT_BITOUL(btopr(rsize));
2285 2285                  if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2286 2286                      sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2287 2287                                  AS_LOCK_EXIT(as, &as->a_lock);
2288 2288                                  return (EAGAIN);
2289 2289                  }
2290 2290          }
2291 2291  
2292 2292          /*
2293 2293           * Loop over all segments.  If a hole in the address range is
2294 2294           * discovered, then fail.  For each segment, perform the appropriate
2295 2295           * control operation.
2296 2296           */
2297 2297          while (rsize != 0) {
2298 2298  
2299 2299                  /*
2300 2300                   * Make sure there's no hole, calculate the portion
2301 2301                   * of the next segment to be operated over.
2302 2302                   */
2303 2303                  if (raddr >= seg->s_base + seg->s_size) {
2304 2304                          seg = AS_SEGNEXT(as, seg);
2305 2305                          if (seg == NULL || raddr != seg->s_base) {
2306 2306                                  if (func == MC_LOCK) {
2307 2307                                          as_unlockerr(as, attr, mlock_map,
2308 2308                                              initraddr, initrsize - rsize);
2309 2309                                          kmem_free(mlock_map,
2310 2310                                              mlock_size * sizeof (ulong_t));
2311 2311                                  }
2312 2312                                  AS_LOCK_EXIT(as, &as->a_lock);
2313 2313                                  return (ENOMEM);
2314 2314                          }
2315 2315                  }
2316 2316                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
2317 2317                          ssize = seg->s_base + seg->s_size - raddr;
2318 2318                  else
2319 2319                          ssize = rsize;
2320 2320  
2321 2321                  /*
2322 2322                   * Dispatch on specific function.
2323 2323                   */
2324 2324                  switch (func) {
2325 2325  
2326 2326                  /*
2327 2327                   * Synchronize cached data from mappings with backing
2328 2328                   * objects.
2329 2329                   */
2330 2330                  case MC_SYNC:
2331 2331                          if (error = segop_sync(seg, raddr, ssize,
2332 2332                              attr, (uint_t)arg)) {
2333 2333                                  AS_LOCK_EXIT(as, &as->a_lock);
2334 2334                                  return (error);
2335 2335                          }
2336 2336                          break;
2337 2337  
2338 2338                  /*
2339 2339                   * Lock pages in memory.
2340 2340                   */
2341 2341                  case MC_LOCK:
2342 2342                          if (error = segop_lockop(seg, raddr, ssize,
2343 2343                              attr, func, mlock_map, pos)) {
2344 2344                                  as_unlockerr(as, attr, mlock_map, initraddr,
2345 2345                                      initrsize - rsize + ssize);
2346 2346                                  kmem_free(mlock_map, mlock_size *
2347 2347                                      sizeof (ulong_t));
2348 2348                                  AS_LOCK_EXIT(as, &as->a_lock);
2349 2349                                  goto lockerr;
2350 2350                          }
2351 2351                          break;
2352 2352  
2353 2353                  /*
2354 2354                   * Unlock mapped pages.
2355 2355                   */
2356 2356                  case MC_UNLOCK:
2357 2357                          (void) segop_lockop(seg, raddr, ssize, attr, func,
2358 2358                              (ulong_t *)NULL, (size_t)NULL);
2359 2359                          break;
2360 2360  
2361 2361                  /*
2362 2362                   * Store VM advise for mapped pages in segment layer.
2363 2363                   */
2364 2364                  case MC_ADVISE:
2365 2365                          error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2366 2366  
2367 2367                          /*
2368 2368                           * Check for regular errors and special retry error
2369 2369                           */
2370 2370                          if (error) {
2371 2371                                  if (error == IE_RETRY) {
2372 2372                                          /*
2373 2373                                           * Need to acquire writers lock, so
2374 2374                                           * have to drop readers lock and start
2375 2375                                           * all over again
2376 2376                                           */
2377 2377                                          AS_LOCK_EXIT(as, &as->a_lock);
2378 2378                                          goto retry;
2379 2379                                  } else if (error == IE_REATTACH) {
2380 2380                                          /*
2381 2381                                           * Find segment for current address
2382 2382                                           * because current segment just got
2383 2383                                           * split or concatenated
2384 2384                                           */
2385 2385                                          seg = as_segat(as, raddr);
2386 2386                                          if (seg == NULL) {
2387 2387                                                  AS_LOCK_EXIT(as, &as->a_lock);
2388 2388                                                  return (ENOMEM);
2389 2389                                          }
2390 2390                                  } else {
2391 2391                                          /*
2392 2392                                           * Regular error
2393 2393                                           */
2394 2394                                          AS_LOCK_EXIT(as, &as->a_lock);
2395 2395                                          return (error);
2396 2396                                  }
2397 2397                          }
2398 2398                          break;
2399 2399  
2400 2400                  case MC_INHERIT_ZERO:
2401 2401                          error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2402 2402                          if (error != 0) {
2403 2403                                  AS_LOCK_EXIT(as, &as->a_lock);
2404 2404                                  return (error);
2405 2405                          }
2406 2406                          break;
2407 2407  
2408 2408                  /*
2409 2409                   * Can't happen.
2410 2410                   */
2411 2411                  default:
2412 2412                          panic("as_ctl: bad operation %d", func);
2413 2413                          /*NOTREACHED*/
2414 2414                  }
2415 2415  
2416 2416                  rsize -= ssize;
2417 2417                  raddr += ssize;
2418 2418          }
2419 2419  
2420 2420          if (func == MC_LOCK)
2421 2421                  kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2422 2422          AS_LOCK_EXIT(as, &as->a_lock);
2423 2423          return (0);
2424 2424  lockerr:
2425 2425  
2426 2426          /*
2427 2427           * If the lower levels returned EDEADLK for a segment lockop,
2428 2428           * it means that we should retry the operation.  Let's wait
2429 2429           * a bit also to let the deadlock causing condition clear.
2430 2430           * This is part of a gross hack to work around a design flaw
2431 2431           * in the ufs/sds logging code and should go away when the
2432 2432           * logging code is re-designed to fix the problem. See bug
2433 2433           * 4125102 for details of the problem.
2434 2434           */
2435 2435          if (error == EDEADLK) {
2436 2436                  delay(deadlk_wait);
2437 2437                  error = 0;
2438 2438                  goto retry;
2439 2439          }
2440 2440          return (error);
2441 2441  }
2442 2442  
2443 2443  int
2444 2444  fc_decode(faultcode_t fault_err)
2445 2445  {
2446 2446          int error = 0;
2447 2447  
2448 2448          switch (FC_CODE(fault_err)) {
2449 2449          case FC_OBJERR:
2450 2450                  error = FC_ERRNO(fault_err);
2451 2451                  break;
2452 2452          case FC_PROT:
2453 2453                  error = EACCES;
2454 2454                  break;
2455 2455          default:
2456 2456                  error = EFAULT;
2457 2457                  break;
2458 2458          }
2459 2459          return (error);
2460 2460  }
2461 2461  
2462 2462  /*
2463 2463   * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2464 2464   * lists from each segment and copy them to one contiguous shadow list (plist)
2465 2465   * as expected by the caller.  Save pointers to per segment shadow lists at
2466 2466   * the tail of plist so that they can be used during as_pageunlock().
2467 2467   */
2468 2468  static int
2469 2469  as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2470 2470      caddr_t addr, size_t size, enum seg_rw rw)
2471 2471  {
2472 2472          caddr_t sv_addr = addr;
2473 2473          size_t sv_size = size;
2474 2474          struct seg *sv_seg = seg;
2475 2475          ulong_t segcnt = 1;
2476 2476          ulong_t cnt;
2477 2477          size_t ssize;
2478 2478          pgcnt_t npages = btop(size);
2479 2479          page_t **plist;
2480 2480          page_t **pl;
2481 2481          int error;
2482 2482          caddr_t eaddr;
2483 2483          faultcode_t fault_err = 0;
2484 2484          pgcnt_t pl_off;
2485 2485          extern struct seg_ops segspt_shmops;
2486 2486  
2487 2487          ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2488 2488          ASSERT(seg != NULL);
2489 2489          ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2490 2490          ASSERT(addr + size > seg->s_base + seg->s_size);
2491 2491          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2492 2492          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2493 2493  
2494 2494          /*
2495 2495           * Count the number of segments covered by the range we are about to
2496 2496           * lock. The segment count is used to size the shadow list we return
2497 2497           * back to the caller.
2498 2498           */
2499 2499          for (; size != 0; size -= ssize, addr += ssize) {
2500 2500                  if (addr >= seg->s_base + seg->s_size) {
2501 2501  
2502 2502                          seg = AS_SEGNEXT(as, seg);
2503 2503                          if (seg == NULL || addr != seg->s_base) {
2504 2504                                  AS_LOCK_EXIT(as, &as->a_lock);
2505 2505                                  return (EFAULT);
2506 2506                          }
2507 2507                          /*
2508 2508                           * Do a quick check if subsequent segments
2509 2509                           * will most likely support pagelock.
2510 2510                           */
2511 2511                          if (seg->s_ops == &segvn_ops) {
2512 2512                                  vnode_t *vp;
2513 2513  
2514 2514                                  if (segop_getvp(seg, addr, &vp) != 0 ||
2515 2515                                      vp != NULL) {
2516 2516                                          AS_LOCK_EXIT(as, &as->a_lock);
2517 2517                                          goto slow;
2518 2518                                  }
2519 2519                          } else if (seg->s_ops != &segspt_shmops) {
2520 2520                                  AS_LOCK_EXIT(as, &as->a_lock);
2521 2521                                  goto slow;
2522 2522                          }
2523 2523                          segcnt++;
2524 2524                  }
2525 2525                  if (addr + size > seg->s_base + seg->s_size) {
2526 2526                          ssize = seg->s_base + seg->s_size - addr;
2527 2527                  } else {
2528 2528                          ssize = size;
2529 2529                  }
2530 2530          }
2531 2531          ASSERT(segcnt > 1);
2532 2532  
2533 2533          plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2534 2534  
2535 2535          addr = sv_addr;
2536 2536          size = sv_size;
2537 2537          seg = sv_seg;
2538 2538  
2539 2539          for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2540 2540                  if (addr >= seg->s_base + seg->s_size) {
2541 2541                          seg = AS_SEGNEXT(as, seg);
2542 2542                          ASSERT(seg != NULL && addr == seg->s_base);
2543 2543                          cnt++;
2544 2544                          ASSERT(cnt < segcnt);
2545 2545                  }
2546 2546                  if (addr + size > seg->s_base + seg->s_size) {
2547 2547                          ssize = seg->s_base + seg->s_size - addr;
2548 2548                  } else {
2549 2549                          ssize = size;
2550 2550                  }
2551 2551                  pl = &plist[npages + cnt];
2552 2552                  error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2553 2553                      L_PAGELOCK, rw);
2554 2554                  if (error) {
2555 2555                          break;
2556 2556                  }
2557 2557                  ASSERT(plist[npages + cnt] != NULL);
2558 2558                  ASSERT(pl_off + btop(ssize) <= npages);
2559 2559                  bcopy(plist[npages + cnt], &plist[pl_off],
2560 2560                      btop(ssize) * sizeof (page_t *));
2561 2561                  pl_off += btop(ssize);
2562 2562          }
2563 2563  
2564 2564          if (size == 0) {
2565 2565                  AS_LOCK_EXIT(as, &as->a_lock);
2566 2566                  ASSERT(cnt == segcnt - 1);
2567 2567                  *ppp = plist;
2568 2568                  return (0);
2569 2569          }
2570 2570  
2571 2571          /*
2572 2572           * one of pagelock calls failed. The error type is in error variable.
2573 2573           * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2574 2574           * type is either EFAULT or ENOTSUP. Otherwise just return the error
2575 2575           * back to the caller.
2576 2576           */
2577 2577  
2578 2578          eaddr = addr;
2579 2579          seg = sv_seg;
2580 2580  
2581 2581          for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2582 2582                  if (addr >= seg->s_base + seg->s_size) {
2583 2583                          seg = AS_SEGNEXT(as, seg);
2584 2584                          ASSERT(seg != NULL && addr == seg->s_base);
2585 2585                          cnt++;
2586 2586                          ASSERT(cnt < segcnt);
2587 2587                  }
2588 2588                  if (eaddr > seg->s_base + seg->s_size) {
2589 2589                          ssize = seg->s_base + seg->s_size - addr;
2590 2590                  } else {
2591 2591                          ssize = eaddr - addr;
2592 2592                  }
2593 2593                  pl = &plist[npages + cnt];
2594 2594                  ASSERT(*pl != NULL);
2595 2595                  (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2596 2596                      L_PAGEUNLOCK, rw);
2597 2597          }
2598 2598  
2599 2599          AS_LOCK_EXIT(as, &as->a_lock);
2600 2600  
2601 2601          kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2602 2602  
2603 2603          if (error != ENOTSUP && error != EFAULT) {
2604 2604                  return (error);
2605 2605          }
2606 2606  
2607 2607  slow:
2608 2608          /*
2609 2609           * If we are here because pagelock failed due to the need to cow fault
2610 2610           * in the pages we want to lock F_SOFTLOCK will do this job and in
2611 2611           * next as_pagelock() call for this address range pagelock will
2612 2612           * hopefully succeed.
2613 2613           */
2614 2614          fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2615 2615          if (fault_err != 0) {
2616 2616                  return (fc_decode(fault_err));
2617 2617          }
2618 2618          *ppp = NULL;
2619 2619  
2620 2620          return (0);
2621 2621  }
2622 2622  
2623 2623  /*
2624 2624   * lock pages in a given address space. Return shadow list. If
2625 2625   * the list is NULL, the MMU mapping is also locked.
2626 2626   */
2627 2627  int
2628 2628  as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2629 2629      size_t size, enum seg_rw rw)
2630 2630  {
2631 2631          size_t rsize;
2632 2632          caddr_t raddr;
2633 2633          faultcode_t fault_err;
2634 2634          struct seg *seg;
2635 2635          int err;
2636 2636  
2637 2637          TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2638 2638              "as_pagelock_start: addr %p size %ld", addr, size);
2639 2639  
2640 2640          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2641 2641          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2642 2642              (size_t)raddr;
2643 2643  
2644 2644          /*
2645 2645           * if the request crosses two segments let
2646 2646           * as_fault handle it.
2647 2647           */
2648 2648          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2649 2649  
2650 2650          seg = as_segat(as, raddr);
2651 2651          if (seg == NULL) {
2652 2652                  AS_LOCK_EXIT(as, &as->a_lock);
2653 2653                  return (EFAULT);
2654 2654          }
2655 2655          ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2656 2656          if (raddr + rsize > seg->s_base + seg->s_size) {
2657 2657                  return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2658 2658          }
2659 2659          if (raddr + rsize <= raddr) {
2660 2660                  AS_LOCK_EXIT(as, &as->a_lock);
2661 2661                  return (EFAULT);
2662 2662          }
2663 2663  
2664 2664          TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2665 2665              "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2666 2666  
2667 2667          /*
2668 2668           * try to lock pages and pass back shadow list
2669 2669           */
2670 2670          err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2671 2671  
2672 2672          TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2673 2673  
2674 2674          AS_LOCK_EXIT(as, &as->a_lock);
2675 2675  
2676 2676          if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2677 2677                  return (err);
2678 2678          }
2679 2679  
2680 2680          /*
2681 2681           * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2682 2682           * to no pagelock support for this segment or pages need to be cow
2683 2683           * faulted in. If fault is needed F_SOFTLOCK will do this job for
2684 2684           * this as_pagelock() call and in the next as_pagelock() call for the
2685 2685           * same address range pagelock call will hopefull succeed.
2686 2686           */
2687 2687          fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2688 2688          if (fault_err != 0) {
2689 2689                  return (fc_decode(fault_err));
2690 2690          }
2691 2691          *ppp = NULL;
2692 2692  
2693 2693          TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2694 2694          return (0);
2695 2695  }
2696 2696  
2697 2697  /*
2698 2698   * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
2699 2699   * lists from the end of plist and call pageunlock interface for each segment.
2700 2700   * Drop as lock and free plist.
            *
            * The caller must hold the as lock; it is released here before plist is
            * freed.  addr and size must be page aligned, and the range must start
            * inside seg and extend beyond its end (all checked by ASSERTs).
            *
            * plist is treated as an array of page pointers in which the per-segment
            * entries start at index btop(size): the address of entry npages + i is
            * passed to segop_pagelock(L_PAGEUNLOCK) for the i-th segment touched by
            * the range.  The return value of that call is ignored.
2701 2701   */
2702 2702  static void
2703 2703  as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2704 2704      struct page **plist, enum seg_rw rw)
2705 2705  {
2706 2706          ulong_t cnt;
2707 2707          caddr_t eaddr = addr + size;
2708 2708          pgcnt_t npages = btop(size);
2709 2709          size_t ssize;
2710 2710          page_t **pl;
2711 2711  
2712 2712          ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2713 2713          ASSERT(seg != NULL);
2714 2714          ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2715 2715          ASSERT(addr + size > seg->s_base + seg->s_size);
2716 2716          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2717 2717          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2718 2718          ASSERT(plist != NULL);
2719 2719  
                   /* cnt is the index of the segment currently being unlocked. */
2720 2720          for (cnt = 0; addr < eaddr; addr += ssize) {
                           /* Moved past seg: the next one must start exactly at addr. */
2721 2721                  if (addr >= seg->s_base + seg->s_size) {
2722 2722                          seg = AS_SEGNEXT(as, seg);
2723 2723                          ASSERT(seg != NULL && addr == seg->s_base);
2724 2724                          cnt++;
2725 2725                  }
                           /* ssize is the part of the remaining range that lies in seg. */
2726 2726                  if (eaddr > seg->s_base + seg->s_size) {
2727 2727                          ssize = seg->s_base + seg->s_size - addr;
2728 2728                  } else {
2729 2729                          ssize = eaddr - addr;
2730 2730                  }
2731 2731                  pl = &plist[npages + cnt];
2732 2732                  ASSERT(*pl != NULL);
2733 2733                  (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2734 2734                      L_PAGEUNLOCK, rw);
2735 2735          }
2736 2736          ASSERT(cnt > 0);
2737 2737          AS_LOCK_EXIT(as, &as->a_lock);
2738 2738  
                   /*
                    * cnt was the index of the last segment; bump it to the number of
                    * segments so the size freed covers npages + cnt entries.
                    */
2739 2739          cnt++;
2740 2740          kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2741 2741  }
2742 2742  
2743 2743  /*
2744 2744   * unlock pages in a given address range
            *
            * pp is the shadow list belonging to the locked range.  A NULL pp means
            * the pages were locked through as_fault (see the comment in the body),
            * so they are released with an F_SOFTUNLOCK fault and nothing else is
            * done.  Otherwise the range is rounded out to page boundaries and
            * unlocked with segop_pagelock(L_PAGEUNLOCK): directly when the rounded
            * range lies within a single segment, or through as_pageunlock_segs()
            * when it extends past the end of the first segment.
            *
            * The TR_PHYSIO_AS_UNLOCK_END trace point is only reached on the
            * single-segment path.
2745 2745   */
2746 2746  void
2747 2747  as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2748 2748      enum seg_rw rw)
2749 2749  {
2750 2750          struct seg *seg;
2751 2751          size_t rsize;
2752 2752          caddr_t raddr;
2753 2753  
2754 2754          TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2755 2755              "as_pageunlock_start: addr %p size %ld", addr, size);
2756 2756  
2757 2757          /*
2758 2758           * if the shadow list is NULL, as_pagelock was
2759 2759           * falling back to as_fault
2760 2760           */
2761 2761          if (pp == NULL) {
2762 2762                  (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2763 2763                  return;
2764 2764          }
2765 2765  
                   /* Round addr down and addr + size up to page boundaries. */
2766 2766          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2767 2767          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2768 2768              (size_t)raddr;
2769 2769  
2770 2770          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2771 2771          seg = as_segat(as, raddr);
2772 2772          ASSERT(seg != NULL);
2773 2773  
2774 2774          TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2775 2775              "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2776 2776  
2777 2777          ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2778 2778          if (raddr + rsize <= seg->s_base + seg->s_size) {
2779 2779                  segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2780 2780          } else {
                           /*
                            * as_pageunlock_segs() drops the as lock itself, so return
                            * here instead of falling through to AS_LOCK_EXIT below.
                            */
2781 2781                  as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2782 2782                  return;
2783 2783          }
2784 2784          AS_LOCK_EXIT(as, &as->a_lock);
2785 2785          TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2786 2786  }
2787 2787  
           /*
            * Ask the segment drivers covering [addr, addr + size) to use page size
            * code szc, by calling segop_setpagesize() on each segment's share of the
            * range while holding the as lock as writer.
            *
            * addr and size must both be aligned to page_get_pagesize(szc), otherwise
            * EINVAL is returned.  ENOMEM is returned if addr + size wraps, if no
            * segment contains addr, or if the segments in the range are not
            * contiguous.  Driver results are translated as follows: IE_NOMEM becomes
            * EAGAIN, ENOTSUP becomes EINVAL, and IE_RETRY restarts the whole
            * operation.  If wait is set and the driver returns EAGAIN, the call
            * either retries immediately or sleeps on a_cv and then restarts (see the
            * comment in the body); with wait clear, EAGAIN is returned to the caller.
            * Any other non-zero driver error is returned as is.
            *
            * NOTE(review): the IE_RETRY path and the sleeping path drop the as lock
            * and restart without calling as_setwatch(); as_clearwatchprot() is then
            * called again from the top.  Confirm that this is intended.
            */
2788 2788  int
2789 2789  as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2790 2790      boolean_t wait)
2791 2791  {
2792 2792          struct seg *seg;
2793 2793          size_t ssize;
2794 2794          caddr_t raddr;                  /* rounded down addr */
2795 2795          size_t rsize;                   /* rounded up size */
2796 2796          int error = 0;
2797 2797          size_t pgsz = page_get_pagesize(szc);
2798 2798  
2799 2799  setpgsz_top:
2800 2800          if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2801 2801                  return (EINVAL);
2802 2802          }
2803 2803  
2804 2804          raddr = addr;
2805 2805          rsize = size;
2806 2806  
2807 2807          if (raddr + rsize < raddr)              /* check for wraparound */
2808 2808                  return (ENOMEM);
2809 2809  
2810 2810          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2811 2811          as_clearwatchprot(as, raddr, rsize);
2812 2812          seg = as_segat(as, raddr);
2813 2813          if (seg == NULL) {
2814 2814                  as_setwatch(as);
2815 2815                  AS_LOCK_EXIT(as, &as->a_lock);
2816 2816                  return (ENOMEM);
2817 2817          }
2818 2818  
                   /* Walk the range one segment at a time; ssize is the piece in seg. */
2819 2819          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2820 2820                  if (raddr >= seg->s_base + seg->s_size) {
2821 2821                          seg = AS_SEGNEXT(as, seg);
                                   /* A missing or non-adjacent segment is a hole. */
2822 2822                          if (seg == NULL || raddr != seg->s_base) {
2823 2823                                  error = ENOMEM;
2824 2824                                  break;
2825 2825                          }
2826 2826                  }
2827 2827                  if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2828 2828                          ssize = seg->s_base + seg->s_size - raddr;
2829 2829                  } else {
2830 2830                          ssize = rsize;
2831 2831                  }
2832 2832  
2833 2833  retry:
2834 2834                  error = segop_setpagesize(seg, raddr, ssize, szc);
2835 2835  
2836 2836                  if (error == IE_NOMEM) {
2837 2837                          error = EAGAIN;
2838 2838                          break;
2839 2839                  }
2840 2840  
                           /* Restart from the top with the original addr and size. */
2841 2841                  if (error == IE_RETRY) {
2842 2842                          AS_LOCK_EXIT(as, &as->a_lock);
2843 2843                          goto setpgsz_top;
2844 2844                  }
2845 2845  
2846 2846                  if (error == ENOTSUP) {
2847 2847                          error = EINVAL;
2848 2848                          break;
2849 2849                  }
2850 2850  
2851 2851                  if (wait && (error == EAGAIN)) {
2852 2852                          /*
2853 2853                           * Memory is currently locked.  It must be unlocked
2854 2854                           * before this operation can succeed through a retry.
2855 2855                           * The possible reasons for locked memory and
2856 2856                           * corresponding strategies for unlocking are:
2857 2857                           * (1) Normal I/O
2858 2858                           *      wait for a signal that the I/O operation
2859 2859                           *      has completed and the memory is unlocked.
2860 2860                           * (2) Asynchronous I/O
2861 2861                           *      The aio subsystem does not unlock pages when
2862 2862                           *      the I/O is completed. Those pages are unlocked
2863 2863                           *      when the application calls aiowait/aioerror.
2864 2864                           *      So, to prevent blocking forever, cv_broadcast()
2865 2865                           *      is done to wake up aio_cleanup_thread.
2866 2866                           *      Subsequently, segvn_reclaim will be called, and
2867 2867                           *      that will do AS_CLRUNMAPWAIT() and wake us up.
2868 2868                           * (3) Long term page locking:
2869 2869                           *      This is not relevant for as_setpagesize()
2870 2870                           *      because we cannot change the page size for
2871 2871                           *      driver memory. The attempt to do so will
2872 2872                           *      fail with a different error than EAGAIN so
2873 2873                           *      there's no need to trigger as callbacks like
2874 2874                           *      as_unmap, as_setprot or as_free would do.
2875 2875                           */
2876 2876                          mutex_enter(&as->a_contents);
2877 2877                          if (!AS_ISNOUNMAPWAIT(as)) {
2878 2878                                  if (AS_ISUNMAPWAIT(as) == 0) {
2879 2879                                          cv_broadcast(&as->a_cv);
2880 2880                                  }
2881 2881                                  AS_SETUNMAPWAIT(as);
                                           /*
                                            * Drop the as lock before sleeping; only
                                            * a_contents is held across cv_wait().
                                            */
2882 2882                                  AS_LOCK_EXIT(as, &as->a_lock);
2883 2883                                  while (AS_ISUNMAPWAIT(as)) {
2884 2884                                          cv_wait(&as->a_cv, &as->a_contents);
2885 2885                                  }
2886 2886                          } else {
2887 2887                                  /*
2888 2888                                   * We may have raced with
2889 2889                                   * segvn_reclaim()/segspt_reclaim(). In this
2890 2890                                   * case clean nounmapwait flag and retry since
2891 2891                                   * softlockcnt in this segment may be already
2892 2892                                   * 0.  We don't drop as writer lock so our
2893 2893                                   * number of retries without sleeping should
2894 2894                                   * be very small. See segvn_reclaim() for
2895 2895                                   * more comments.
2896 2896                                   */
2897 2897                                  AS_CLRNOUNMAPWAIT(as);
2898 2898                                  mutex_exit(&as->a_contents);
2899 2899                                  goto retry;
2900 2900                          }
                                   /*
                                    * Only the sleeping branch gets here, and it has
                                    * already released the as lock: start over.
                                    */
2901 2901                          mutex_exit(&as->a_contents);
2902 2902                          goto setpgsz_top;
2903 2903                  } else if (error != 0) {
2904 2904                          break;
2905 2905                  }
2906 2906          }
2907 2907          as_setwatch(as);
2908 2908          AS_LOCK_EXIT(as, &as->a_lock);
2909 2909          return (error);
2910 2910  }
2911 2911  
2912 2912  /*
2913 2913   * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
2914 2914   * in its chunk where s_szc is less than the szc we want to set.
            *
            * The as lock must be held as writer.  The chunk must be fully covered by
            * contiguous segments; a missing segment or a gap causes a panic.
            *
            * Returns 0 if every call succeeded (or none was needed), otherwise the
            * first non-zero value returned by segop_setpagesize().  *retry is written
            * only when that value is EINVAL: it is set to 1 if the failing segment
            * is MAP_SHARED and has no vnode (segop_getvp() fails or yields NULL),
            * and to 0 otherwise.  For any other outcome *retry is left untouched.
2915 2915   */
2916 2916  static int
2917 2917  as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2918 2918      int *retry)
2919 2919  {
2920 2920          struct seg *seg;
2921 2921          size_t ssize;
2922 2922          int error;
2923 2923  
2924 2924          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
2925 2925  
2926 2926          seg = as_segat(as, raddr);
2927 2927          if (seg == NULL) {
2928 2928                  panic("as_iset3_default_lpsize: no seg");
2929 2929          }
2930 2930  
                   /* Walk the chunk segment by segment; ssize is the piece in seg. */
2931 2931          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2932 2932                  if (raddr >= seg->s_base + seg->s_size) {
2933 2933                          seg = AS_SEGNEXT(as, seg);
2934 2934                          if (seg == NULL || raddr != seg->s_base) {
2935 2935                                  panic("as_iset3_default_lpsize: as changed");
2936 2936                          }
2937 2937                  }
2938 2938                  if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2939 2939                          ssize = seg->s_base + seg->s_size - raddr;
2940 2940                  } else {
2941 2941                          ssize = rsize;
2942 2942                  }
2943 2943  
                           /* Segments already at szc or larger are left alone. */
2944 2944                  if (szc > seg->s_szc) {
2945 2945                          error = segop_setpagesize(seg, raddr, ssize, szc);
2946 2946                          /* Only retry on EINVAL segments that have no vnode. */
2947 2947                          if (error == EINVAL) {
2948 2948                                  vnode_t *vp = NULL;
2949 2949                                  if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
2950 2950                                      (segop_getvp(seg, raddr, &vp) != 0 ||
2951 2951                                      vp == NULL)) {
2952 2952                                          *retry = 1;
2953 2953                                  } else {
2954 2954                                          *retry = 0;
2955 2955                                  }
2956 2956                          }
2957 2957                          if (error) {
2958 2958                                  return (error);
2959 2959                          }
2960 2960                  }
2961 2961          }
2962 2962          return (0);
2963 2963  }
2964 2964  
2965 2965  /*
2966 2966   * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
2967 2967   * pagesize on each segment in its range, but if any fails with EINVAL,
2968 2968   * then it reduces the pagesizes to the next size in the bitmap and
2969 2969   * retries as_iset3_default_lpsize(). The reason why the code retries
2970 2970   * smaller allowed sizes on EINVAL is because (a) the anon offset may not
2971 2971   * match the bigger sizes, and (b) it's hard to get this offset (to begin
2972 2972   * with) to pass to map_pgszcvec().
            *
            * The as lock must be held as writer.  szcvec is used as a bitmap in which
            * bit n stands for size code n.  Returns the result of the last
            * as_iset3_default_lpsize() call, or EINVAL once no size code above 0 is
            * left in szcvec to fall back to.
2973 2973   */
2974 2974  static int
2975 2975  as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2976 2976      uint_t szcvec)
2977 2977  {
2978 2978          int error;
2979 2979          int retry;
2980 2980  
2981 2981          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
2982 2982  
2983 2983          for (;;) {
2984 2984                  error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
                           /*
                            * retry is only read when error is EINVAL, which is the one
                            * case in which as_iset3_default_lpsize() writes it.
                            */
2985 2985                  if (error == EINVAL && retry) {
                                   /* Remove the size that just failed from the bitmap. */
2986 2986                          szcvec &= ~(1 << szc);
                                   /* Only bit 0 (or nothing) left: nothing to try. */
2987 2987                          if (szcvec <= 1) {
2988 2988                                  return (EINVAL);
2989 2989                          }
                                   /*
                                    * Pick the largest remaining size code; presumably
                                    * highbit() returns a 1-based bit index -- confirm.
                                    */
2990 2990                          szc = highbit(szcvec) - 1;
2991 2991                  } else {
2992 2992                          return (error);
2993 2993                  }
2994 2994          }
2995 2995  }
2996 2996  
2997 2997  /*
2998 2998   * as_iset1_default_lpsize() breaks its chunk into areas where existing
2999 2999   * segments have a smaller szc than we want to set. For each such area,
3000 3000   * it calls as_iset2_default_lpsize()
            *
            * The as lock must be held as writer, and the chunk must be fully covered
            * by contiguous segments (a missing segment or a gap causes a panic).
            *
            * While walking the segments, "set" is non-zero when a run of segments
            * with s_szc < szc is being accumulated; setaddr and setsize describe
            * that run.  The run is flushed to as_iset2_default_lpsize() when a
            * segment with s_szc >= szc is reached and once more at the end of the
            * chunk.  Returns 0 or the first error from as_iset2_default_lpsize().
3001 3001   */
3002 3002  static int
3003 3003  as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3004 3004      uint_t szcvec)
3005 3005  {
3006 3006          struct seg *seg;
3007 3007          size_t ssize;
3008 3008          caddr_t setaddr = raddr;
3009 3009          size_t setsize = 0;
3010 3010          int set;
3011 3011          int error;
3012 3012  
3013 3013          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3014 3014  
3015 3015          seg = as_segat(as, raddr);
3016 3016          if (seg == NULL) {
3017 3017                  panic("as_iset1_default_lpsize: no seg");
3018 3018          }
                   /* The first segment decides whether a run starts at raddr. */
3019 3019          if (seg->s_szc < szc) {
3020 3020                  set = 1;
3021 3021          } else {
3022 3022                  set = 0;
3023 3023          }
3024 3024  
                   /*
                    * setsize also grows while no run is active; that is harmless
                    * because it is reset to 0 whenever a new run begins.
                    */
3025 3025          for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3026 3026                  if (raddr >= seg->s_base + seg->s_size) {
3027 3027                          seg = AS_SEGNEXT(as, seg);
3028 3028                          if (seg == NULL || raddr != seg->s_base) {
3029 3029                                  panic("as_iset1_default_lpsize: as changed");
3030 3030                          }
3031 3031                          if (seg->s_szc >= szc && set) {
                                           /* Run ended at this segment: flush it. */
3032 3032                                  ASSERT(setsize != 0);
3033 3033                                  error = as_iset2_default_lpsize(as,
3034 3034                                      setaddr, setsize, szc, szcvec);
3035 3035                                  if (error) {
3036 3036                                          return (error);
3037 3037                                  }
3038 3038                                  set = 0;
3039 3039                          } else if (seg->s_szc < szc && !set) {
                                           /* A new run starts at this segment. */
3040 3040                                  setaddr = raddr;
3041 3041                                  setsize = 0;
3042 3042                                  set = 1;
3043 3043                          }
3044 3044                  }
                           /* ssize is the part of the remaining chunk that lies in seg. */
3045 3045                  if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3046 3046                          ssize = seg->s_base + seg->s_size - raddr;
3047 3047                  } else {
3048 3048                          ssize = rsize;
3049 3049                  }
3050 3050          }
3051 3051          error = 0;
                   /* Flush the run that was still open when the chunk ended. */
3052 3052          if (set) {
3053 3053                  ASSERT(setsize != 0);
3054 3054                  error = as_iset2_default_lpsize(as, setaddr, setsize,
3055 3055                      szc, szcvec);
3056 3056          }
3057 3057          return (error);
3058 3058  }
3059 3059  
3060 3060  /*
3061 3061   * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3062 3062   * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3063 3063   * chunk to as_iset1_default_lpsize().
            *
            * The as lock must be held as writer; addr and size must be page aligned.
            * type selects MAPPGSZC_SHM (MAP_SHARED set) or MAPPGSZC_PRIVM for the
            * map_pgszcvec() query.  Returns 0 when the bitmap offers no size code
            * above 0 or when every sub-chunk was processed, otherwise the first error
            * from as_iset1_default_lpsize().
            *
            * The range is first trimmed inward to the alignment of the smallest
            * available large page size.  It is then covered in two passes: the
            * first works upward through the available sizes, using each size for
            * the piece that leads up to the next larger alignment; the second
            * works downward from the largest size until the trimmed end is
            * reached.
3064 3064   */
3065 3065  static int
3066 3066  as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3067 3067      int type)
3068 3068  {
3069 3069          int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3070 3070          uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3071 3071              flags, rtype, 1);
3072 3072          uint_t szc;
3073 3073          uint_t nszc;
3074 3074          int error;
3075 3075          caddr_t a;
3076 3076          caddr_t eaddr;
3077 3077          size_t segsize;
3078 3078          size_t pgsz;
3079 3079          uint_t save_szcvec;
3080 3080  
3081 3081          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3082 3082          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3083 3083          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3084 3084  
3085 3085          szcvec &= ~1;
3086 3086          if (szcvec <= 1) {      /* skip if base page size */
3087 3087                  return (0);
3088 3088          }
3089 3089  
3090 3090          /* Get the pagesize of the first larger page size. */
3091 3091          szc = lowbit(szcvec) - 1;
3092 3092          pgsz = page_get_pagesize(szc);
3093 3093          eaddr = addr + size;
                   /* Trim the range inward to pgsz alignment at both ends. */
3094 3094          addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3095 3095          eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3096 3096  
                   /*
                    * First pass: step through the size codes above szc.  nszc tracks
                    * the bit position being examined.  For each available size, the
                    * piece from addr up to that size's next alignment boundary is
                    * set using the current, smaller szc; then szc moves up to nszc.
                    */
3097 3097          save_szcvec = szcvec;
3098 3098          szcvec >>= (szc + 1);
3099 3099          nszc = szc;
3100 3100          while (szcvec) {
3101 3101                  if ((szcvec & 0x1) == 0) {
3102 3102                          nszc++;
3103 3103                          szcvec >>= 1;
3104 3104                          continue;
3105 3105                  }
3106 3106                  nszc++;
3107 3107                  pgsz = page_get_pagesize(nszc);
3108 3108                  a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3109 3109                  if (a != addr) {
3110 3110                          ASSERT(szc > 0);
3111 3111                          ASSERT(a < eaddr);
3112 3112                          segsize = a - addr;
3113 3113                          error = as_iset1_default_lpsize(as, addr, segsize, szc,
3114 3114                              save_szcvec);
3115 3115                          if (error) {
3116 3116                                  return (error);
3117 3117                          }
3118 3118                          addr = a;
3119 3119                  }
3120 3120                  szc = nszc;
3121 3121                  szcvec >>= 1;
3122 3122          }
3123 3123  
                   /*
                    * Second pass: szc and pgsz now describe the largest size reached
                    * above.  Set everything up to the last pgsz-aligned address
                    * below eaddr, then drop to the next smaller available size and
                    * repeat until the bitmap is exhausted.
                    */
3124 3124          ASSERT(addr < eaddr);
3125 3125          szcvec = save_szcvec;
3126 3126          while (szcvec) {
3127 3127                  a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3128 3128                  ASSERT(a >= addr);
3129 3129                  if (a != addr) {
3130 3130                          ASSERT(szc > 0);
3131 3131                          segsize = a - addr;
3132 3132                          error = as_iset1_default_lpsize(as, addr, segsize, szc,
3133 3133                              save_szcvec);
3134 3134                          if (error) {
3135 3135                                  return (error);
3136 3136                          }
3137 3137                          addr = a;
3138 3138                  }
3139 3139                  szcvec &= ~(1 << szc);
3140 3140                  if (szcvec) {
3141 3141                          szc = highbit(szcvec) - 1;
3142 3142                          pgsz = page_get_pagesize(szc);
3143 3143                  }
3144 3144          }
3145 3145          ASSERT(addr == eaddr);
3146 3146  
3147 3147          return (0);
3148 3148  }
3149 3149  
3150 3150  /*
3151 3151   * Set the default large page size for the range. Called via memcntl with
3152 3152   * page size set to 0. as_set_default_lpsize breaks the range down into
3153 3153   * chunks with the same type/flags, ignores non-segvn segments, and passes
3154 3154   * each chunk to as_iset_default_lpsize().
            *
            * Returns 0 immediately when size is 0.  Otherwise the as lock is taken as
            * writer and the range is rounded out to page boundaries.  ENOMEM is
            * returned if the rounded range wraps, if no segment contains its start,
            * or if the segments in it are not contiguous.  Errors from
            * as_iset_default_lpsize() are handled as follows: IE_RETRY restarts the
            * operation, IE_NOMEM is returned as EAGAIN, ENOTSUP is returned as
            * EINVAL, and EAGAIN leads to a wait and/or retry (see the end of the
            * body).  Any other value is returned unchanged.
            *
            * While walking the segments, "segvn" is non-zero when a run of segvn
            * segments with identical type (MAP_SHARED/MAP_PRIVATE) and flags
            * (MAP_TEXT/MAP_INITDATA) is being accumulated in setaddr/setsize;
            * rtype and rflags hold the type and flags of that run.
3155 3155   */
3156 3156  int
3157 3157  as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3158 3158  {
3159 3159          struct seg *seg;
3160 3160          caddr_t raddr;
3161 3161          size_t rsize;
3162 3162          size_t ssize;
3163 3163          int rtype, rflags;
3164 3164          int stype, sflags;
3165 3165          int error;
3166 3166          caddr_t setaddr;
3167 3167          size_t setsize;
3168 3168          int segvn;
3169 3169  
3170 3170          if (size == 0)
3171 3171                  return (0);
3172 3172  
3173 3173          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
           /* Every "goto again" below arrives here with the as lock held as writer. */
3174 3174  again:
3175 3175          error = 0;
3176 3176  
3177 3177          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3178 3178          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3179 3179              (size_t)raddr;
3180 3180  
3181 3181          if (raddr + rsize < raddr) {            /* check for wraparound */
3182 3182                  AS_LOCK_EXIT(as, &as->a_lock);
3183 3183                  return (ENOMEM);
3184 3184          }
3185 3185          as_clearwatchprot(as, raddr, rsize);
3186 3186          seg = as_segat(as, raddr);
3187 3187          if (seg == NULL) {
3188 3188                  as_setwatch(as);
3189 3189                  AS_LOCK_EXIT(as, &as->a_lock);
3190 3190                  return (ENOMEM);
3191 3191          }
                   /*
                    * Start a run if the first segment is segvn.
                    * NOTE(review): the type is queried with the unrounded addr here,
                    * while the loop below uses raddr -- confirm this is equivalent.
                    */
3192 3192          if (seg->s_ops == &segvn_ops) {
3193 3193                  rtype = segop_gettype(seg, addr);
3194 3194                  rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3195 3195                  rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3196 3196                  segvn = 1;
3197 3197          } else {
3198 3198                  segvn = 0;
3199 3199          }
3200 3200          setaddr = raddr;
3201 3201          setsize = 0;
3202 3202  
3203 3203          for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3204 3204                  if (raddr >= (seg->s_base + seg->s_size)) {
3205 3205                          seg = AS_SEGNEXT(as, seg);
3206 3206                          if (seg == NULL || raddr != seg->s_base) {
3207 3207                                  error = ENOMEM;
3208 3208                                  break;
3209 3209                          }
3210 3210                          if (seg->s_ops == &segvn_ops) {
3211 3211                                  stype = segop_gettype(seg, raddr);
3212 3212                                  sflags = stype & (MAP_TEXT | MAP_INITDATA);
3213 3213                                  stype &= (MAP_SHARED | MAP_PRIVATE);
3214 3214                                  if (segvn && (rflags != sflags ||
3215 3215                                      rtype != stype)) {
3216 3216                                          /*
3217 3217                                           * The next segment is also segvn but
3218 3218                                           * has different flags and/or type.
3219 3219                                           */
3220 3220                                          ASSERT(setsize != 0);
3221 3221                                          error = as_iset_default_lpsize(as,
3222 3222                                              setaddr, setsize, rflags, rtype);
3223 3223                                          if (error) {
3224 3224                                                  break;
3225 3225                                          }
3226 3226                                          rflags = sflags;
3227 3227                                          rtype = stype;
3228 3228                                          setaddr = raddr;
3229 3229                                          setsize = 0;
3230 3230                                  } else if (!segvn) {
                                                   /*
                                                    * First segvn segment after a
                                                    * non-segvn one: begin a new run.
                                                    */
3231 3231                                          rflags = sflags;
3232 3232                                          rtype = stype;
3233 3233                                          setaddr = raddr;
3234 3234                                          setsize = 0;
3235 3235                                          segvn = 1;
3236 3236                                  }
3237 3237                          } else if (segvn) {
3238 3238                                  /* The next segment is not segvn. */
3239 3239                                  ASSERT(setsize != 0);
3240 3240                                  error = as_iset_default_lpsize(as,
3241 3241                                      setaddr, setsize, rflags, rtype);
3242 3242                                  if (error) {
3243 3243                                          break;
3244 3244                                  }
3245 3245                                  segvn = 0;
3246 3246                          }
3247 3247                  }
                           /* ssize is the part of the remaining range that lies in seg. */
3248 3248                  if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3249 3249                          ssize = seg->s_base + seg->s_size - raddr;
3250 3250                  } else {
3251 3251                          ssize = rsize;
3252 3252                  }
3253 3253          }
3254 3254          if (error == 0 && segvn) {
3255 3255                  /* The last chunk when rsize == 0. */
3256 3256                  ASSERT(setsize != 0);
3257 3257                  error = as_iset_default_lpsize(as, setaddr, setsize,
3258 3258                      rflags, rtype);
3259 3259          }
3260 3260  
3261 3261          if (error == IE_RETRY) {
3262 3262                  goto again;
3263 3263          } else if (error == IE_NOMEM) {
3264 3264                  error = EAGAIN;
3265 3265          } else if (error == ENOTSUP) {
3266 3266                  error = EINVAL;
3267 3267          } else if (error == EAGAIN) {
3268 3268                  mutex_enter(&as->a_contents);
3269 3269                  if (!AS_ISNOUNMAPWAIT(as)) {
3270 3270                          if (AS_ISUNMAPWAIT(as) == 0) {
3271 3271                                  cv_broadcast(&as->a_cv);
3272 3272                          }
3273 3273                          AS_SETUNMAPWAIT(as);
                                   /*
                                    * Sleep without the as lock, then take it again as
                                    * writer before restarting.
                                    */
3274 3274                          AS_LOCK_EXIT(as, &as->a_lock);
3275 3275                          while (AS_ISUNMAPWAIT(as)) {
3276 3276                                  cv_wait(&as->a_cv, &as->a_contents);
3277 3277                          }
3278 3278                          mutex_exit(&as->a_contents);
3279 3279                          AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3280 3280                  } else {
3281 3281                          /*
3282 3282                           * We may have raced with
3283 3283                           * segvn_reclaim()/segspt_reclaim(). In this case
3284 3284                           * clean nounmapwait flag and retry since softlockcnt
3285 3285                           * in this segment may be already 0.  We don't drop as
3286 3286                           * writer lock so our number of retries without
3287 3287                           * sleeping should be very small. See segvn_reclaim()
3288 3288                           * for more comments.
3289 3289                           */
3290 3290                          AS_CLRNOUNMAPWAIT(as);
3291 3291                          mutex_exit(&as->a_contents);
3292 3292                  }
3293 3293                  goto again;
3294 3294          }
3295 3295  
3296 3296          as_setwatch(as);
3297 3297          AS_LOCK_EXIT(as, &as->a_lock);
3298 3298          return (error);
3299 3299  }
3300 3300  
3301 3301  /*
3302 3302   * Setup all of the uninitialized watched pages that we can.
            *
            * Returns at once if the a_wpage tree is empty; otherwise the as lock must
            * be held as writer.  A watched page is skipped if it is already set up
            * (wp_oprot != 0), if no segment contains it, or if segop_getprot() fails.
            * For the rest, the current protections are saved in wp_oprot, the
            * permissions implied by wp_read/wp_write/wp_exec are removed, and the
            * result is applied with segop_setprot() unless WP_NOWATCH is set or
            * nothing changed.  The resulting value is recorded in wp_prot.
3303 3303   */
3304 3304  void
3305 3305  as_setwatch(struct as *as)
3306 3306  {
3307 3307          struct watched_page *pwp;
3308 3308          struct seg *seg;
3309 3309          caddr_t vaddr;
3310 3310          uint_t prot;
3311 3311          int  err, retrycnt;
3312 3312  
3313 3313          if (avl_numnodes(&as->a_wpage) == 0)
3314 3314                  return;
3315 3315  
3316 3316          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3317 3317  
3318 3318          for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3319 3319              pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3320 3320                  retrycnt = 0;
3321 3321          retry:
3322 3322                  vaddr = pwp->wp_vaddr;
3323 3323                  if (pwp->wp_oprot != 0 ||       /* already set up */
3324 3324                      (seg = as_segat(as, vaddr)) == NULL ||
3325 3325                      segop_getprot(seg, vaddr, 0, &prot) != 0)
3326 3326                          continue;
3327 3327  
3328 3328                  pwp->wp_oprot = prot;
                           /*
                            * Read and exec watches strip all access; a write watch
                            * strips only PROT_WRITE.
                            */
3329 3329                  if (pwp->wp_read)
3330 3330                          prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3331 3331                  if (pwp->wp_write)
3332 3332                          prot &= ~PROT_WRITE;
3333 3333                  if (pwp->wp_exec)
3334 3334                          prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3335 3335                  if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3336 3336                          err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3337 3337                          if (err == IE_RETRY) {
                                           /*
                                            * Mark the page as not set up again so
                                            * the check after the retry label does
                                            * not skip it; at most one retry is
                                            * expected (see ASSERT).
                                            */
3338 3338                                  pwp->wp_oprot = 0;
3339 3339                                  ASSERT(retrycnt == 0);
3340 3340                                  retrycnt++;
3341 3341                                  goto retry;
3342 3342                          }
3343 3343                  }
3344 3344                  pwp->wp_prot = prot;
3345 3345          }
3346 3346  }
3347 3347  
3348 3348  /*
3349 3349   * Clear all of the watched pages in the address space.
            *
            * Returns at once if the a_wpage tree is empty; otherwise the as lock must
            * be held as writer.  A watched page is skipped if it is not set up
            * (wp_oprot == 0) or if no segment contains it.  For the rest, the saved
            * protections in wp_oprot are put back with segop_setprot() when they
            * differ from wp_prot, and then wp_oprot and wp_prot are both reset to 0.
3350 3350   */
3351 3351  void
3352 3352  as_clearwatch(struct as *as)
3353 3353  {
3354 3354          struct watched_page *pwp;
3355 3355          struct seg *seg;
3356 3356          caddr_t vaddr;
3357 3357          uint_t prot;
3358 3358          int err, retrycnt;
3359 3359  
3360 3360          if (avl_numnodes(&as->a_wpage) == 0)
3361 3361                  return;
3362 3362  
3363 3363          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3364 3364  
3365 3365          for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3366 3366              pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3367 3367                  retrycnt = 0;
3368 3368          retry:
3369 3369                  vaddr = pwp->wp_vaddr;
3370 3370                  if (pwp->wp_oprot == 0 ||       /* not set up */
3371 3371                      (seg = as_segat(as, vaddr)) == NULL)
3372 3372                          continue;
3373 3373  
                           /* Restore the saved protections only if they were changed. */
3374 3374                  if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3375 3375                          err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3376 3376                          if (err == IE_RETRY) {
                                           /* At most one retry is expected (see ASSERT). */
3377 3377                                  ASSERT(retrycnt == 0);
3378 3378                                  retrycnt++;
3379 3379                                  goto retry;
3380 3380                          }
3381 3381                  }
3382 3382                  pwp->wp_oprot = 0;
3383 3383                  pwp->wp_prot = 0;
3384 3384          }
3385 3385  }
3386 3386  
3387 3387  /*
3388 3388   * Force a new setup for all the watched pages in the range.
3389 3389   */
3390 3390  static void
3391 3391  as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3392 3392  {
3393 3393          struct watched_page *pwp;
3394 3394          struct watched_page tpw;
3395 3395          caddr_t eaddr = addr + size;
3396 3396          caddr_t vaddr;
3397 3397          struct seg *seg;
3398 3398          int err, retrycnt;
3399 3399          uint_t  wprot;
3400 3400          avl_index_t where;
3401 3401  
3402 3402          if (avl_numnodes(&as->a_wpage) == 0)
3403 3403                  return;
3404 3404  
3405 3405          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3406 3406  
3407 3407          tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3408 3408          if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3409 3409                  pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3410 3410  
3411 3411          while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3412 3412                  retrycnt = 0;
3413 3413                  vaddr = pwp->wp_vaddr;
3414 3414  
3415 3415                  wprot = prot;
3416 3416                  if (pwp->wp_read)
3417 3417                          wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3418 3418                  if (pwp->wp_write)
3419 3419                          wprot &= ~PROT_WRITE;
3420 3420                  if (pwp->wp_exec)
3421 3421                          wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3422 3422                  if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3423 3423                  retry:
3424 3424                          seg = as_segat(as, vaddr);
3425 3425                          if (seg == NULL) {
3426 3426                                  panic("as_setwatchprot: no seg");
3427 3427                                  /*NOTREACHED*/
3428 3428                          }
3429 3429                          err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3430 3430                          if (err == IE_RETRY) {
3431 3431                                  ASSERT(retrycnt == 0);
3432 3432                                  retrycnt++;
3433 3433                                  goto retry;
3434 3434                          }
3435 3435                  }
3436 3436                  pwp->wp_oprot = prot;
3437 3437                  pwp->wp_prot = wprot;
3438 3438  
3439 3439                  pwp = AVL_NEXT(&as->a_wpage, pwp);
3440 3440          }
3441 3441  }
3442 3442  
3443 3443  /*
3444 3444   * Clear all of the watched pages in the range.
3445 3445   */
3446 3446  static void
3447 3447  as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3448 3448  {
3449 3449          caddr_t eaddr = addr + size;
3450 3450          struct watched_page *pwp;
3451 3451          struct watched_page tpw;
3452 3452          uint_t prot;
3453 3453          struct seg *seg;
3454 3454          int err, retrycnt;
3455 3455          avl_index_t where;
3456 3456  
3457 3457          if (avl_numnodes(&as->a_wpage) == 0)
3458 3458                  return;
3459 3459  
3460 3460          tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3461 3461          if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3462 3462                  pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3463 3463  
3464 3464          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3465 3465  
3466 3466          while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3467 3467  
3468 3468                  if ((prot = pwp->wp_oprot) != 0) {
3469 3469                          retrycnt = 0;
3470 3470  
3471 3471                          if (prot != pwp->wp_prot) {
3472 3472                          retry:
3473 3473                                  seg = as_segat(as, pwp->wp_vaddr);
3474 3474                                  if (seg == NULL)
3475 3475                                          continue;
3476 3476                                  err = segop_setprot(seg, pwp->wp_vaddr,
3477 3477                                      PAGESIZE, prot);
3478 3478                                  if (err == IE_RETRY) {
3479 3479                                          ASSERT(retrycnt == 0);
3480 3480                                          retrycnt++;
3481 3481                                          goto retry;
3482 3482  
3483 3483                                  }
3484 3484                          }
3485 3485                          pwp->wp_oprot = 0;
3486 3486                          pwp->wp_prot = 0;
3487 3487                  }
3488 3488  
3489 3489                  pwp = AVL_NEXT(&as->a_wpage, pwp);
3490 3490          }
3491 3491  }
3492 3492  
3493 3493  void
3494 3494  as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3495 3495  {
3496 3496          struct proc *p;
3497 3497  
3498 3498          mutex_enter(&pidlock);
3499 3499          for (p = practive; p; p = p->p_next) {
3500 3500                  if (p->p_as == as) {
3501 3501                          mutex_enter(&p->p_lock);
3502 3502                          if (p->p_as == as)
3503 3503                                  sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3504 3504                          mutex_exit(&p->p_lock);
3505 3505                  }
3506 3506          }
3507 3507          mutex_exit(&pidlock);
3508 3508  }
3509 3509  
3510 3510  /*
3511 3511   * return memory object ID
3512 3512   */
3513 3513  int
3514 3514  as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)

↓ open down ↓

3514 lines elided

↑ open up ↑

3515 3515  {
3516 3516          struct seg      *seg;
3517 3517          int             sts;
3518 3518  
3519 3519          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3520 3520          seg = as_segat(as, addr);
3521 3521          if (seg == NULL) {
3522 3522                  AS_LOCK_EXIT(as, &as->a_lock);
3523 3523                  return (EFAULT);
3524 3524          }
3525      -        /*
3526      -         * catch old drivers which may not support getmemid
3527      -         */
3528      -        if (seg->s_ops->getmemid == NULL) {
3529      -                AS_LOCK_EXIT(as, &as->a_lock);
3530      -                return (ENODEV);
3531      -        }
3532 3525  
3533 3526          sts = segop_getmemid(seg, addr, memidp);
3534 3527  
3535 3528          AS_LOCK_EXIT(as, &as->a_lock);
3536 3529          return (sts);
3537 3530  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX