remove xhat
The xhat infrastructure was added to support hardware such as the Zulu
graphics card, i.e. hardware with its own on-board MMU. The VM used the
xhat code to keep the CPU's and Zulu's page tables in sync. Since the
only xhat user was Zulu (which is gone), we can safely remove xhat,
simplifying the whole VM subsystem.
Assorted notes:
- The AS_BUSY flag was used solely by xhat, so its AS_SETBUSY/AS_CLRBUSY uses go away as well
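
For review orientation, here is a condensed sketch of the pattern this change
deletes, using as_free() as the example. It is illustrative only, not a
compilable excerpt: the callback drain and the segment-unmap loop are elided,
and the authoritative change is the full diff below.

    /* Before: as_free() had to fence off and tear down attached XHATs. */
    mutex_enter(&as->a_contents);
    AS_SETBUSY(as);                 /* keep new XHATs from attaching mid-free */
    mutex_exit(&as->a_contents);

    hat_free_start(hat);
    if (as->a_xhat != NULL)
            xhat_free_start_all(as);    /* mirror teardown into device MMUs */
    /* ... unmap all segments ... */
    hat_free_end(hat);
    if (as->a_xhat != NULL)
            xhat_free_end_all(as);

    /* After: exactly one HAT per address space, so no fencing is needed. */
    hat_free_start(hat);
    /* ... unmap all segments ... */
    hat_free_end(hat);

The same AS_SETBUSY/AS_CLRBUSY bracketing and xhat_*_all mirroring disappear
from as_dup() and as_fault() in the diff below.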
--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 32 * The Regents of the University of California
33 33 * All Rights Reserved
34 34 *
35 35 * University Acknowledgment- Portions of this document are derived from
36 36 * software developed by the University of California, Berkeley, and its
37 37 * contributors.
38 38 */
39 39
40 40 /*
41 41 * VM - address spaces.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/t_lock.h>
46 46 #include <sys/param.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/mman.h>
50 50 #include <sys/sysmacros.h>
51 51 #include <sys/cpuvar.h>
52 52 #include <sys/sysinfo.h>
53 53 #include <sys/kmem.h>
54 54 #include <sys/vnode.h>
55 55 #include <sys/vmsystm.h>
56 56 #include <sys/cmn_err.h>
57 57 #include <sys/debug.h>
58 58 #include <sys/tnf_probe.h>
59 59 #include <sys/vtrace.h>
60 60
61 61 #include <vm/hat.h>
62 -#include <vm/xhat.h>
63 62 #include <vm/as.h>
64 63 #include <vm/seg.h>
65 64 #include <vm/seg_vn.h>
66 65 #include <vm/seg_dev.h>
67 66 #include <vm/seg_kmem.h>
68 67 #include <vm/seg_map.h>
69 68 #include <vm/seg_spt.h>
70 69 #include <vm/page.h>
71 70
72 71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
73 72
74 73 static struct kmem_cache *as_cache;
75 74
76 75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
77 76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
78 77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
79 78
80 79
81 80 /*
82 81 * Verifying the segment lists is very time-consuming; it may not be
83 82 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
84 83 */
85 84 #ifdef DEBUG
86 85 #define VERIFY_SEGLIST
87 86 int do_as_verify = 0;
88 87 #endif
89 88
90 89 /*
91 90 * Allocate a new callback data structure entry and fill in the events of
92 91 * interest, the address range of interest, and the callback argument.
93 92 * Link the entry on the as->a_callbacks list. A callback entry for the
94 93 * entire address space may be specified with vaddr = 0 and size = -1.
95 94 *
96 95 * CALLERS RESPONSIBILITY: If not calling from within the process context for
97 96 * the specified as, the caller must guarantee persistence of the specified as
98 97 * for the duration of this function (eg. pages being locked within the as
99 98 * will guarantee persistence).
100 99 */
101 100 int
102 101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103 102 caddr_t vaddr, size_t size, int sleepflag)
104 103 {
105 104 struct as_callback *current_head, *cb;
106 105 caddr_t saddr;
107 106 size_t rsize;
108 107
109 108 /* callback function and an event are mandatory */
110 109 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 110 return (EINVAL);
112 111
113 112 /* Adding a callback after as_free has been called is not allowed */
114 113 if (as == &kas)
115 114 return (ENOMEM);
116 115
117 116 /*
118 117 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 118 * is the entire address space so no rounding is done in that case.
120 119 */
121 120 if (size != -1) {
122 121 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 122 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 123 (size_t)saddr;
125 124 /* check for wraparound */
126 125 if (saddr + rsize < saddr)
127 126 return (ENOMEM);
128 127 } else {
129 128 if (vaddr != 0)
130 129 return (EINVAL);
131 130 saddr = vaddr;
132 131 rsize = size;
133 132 }
134 133
135 134 /* Allocate and initialize a callback entry */
136 135 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 136 if (cb == NULL)
138 137 return (EAGAIN);
139 138
140 139 cb->ascb_func = cb_func;
141 140 cb->ascb_arg = arg;
142 141 cb->ascb_events = events;
143 142 cb->ascb_saddr = saddr;
144 143 cb->ascb_len = rsize;
145 144
146 145 /* Add the entry to the list */
147 146 mutex_enter(&as->a_contents);
148 147 current_head = as->a_callbacks;
149 148 as->a_callbacks = cb;
150 149 cb->ascb_next = current_head;
151 150
152 151 /*
153 152 * The call to this function may lose in a race with
154 153 * a pertinent event - eg. a thread does long term memory locking
155 154 * but before the callback is added another thread executes as_unmap.
156 155 * A broadcast here resolves that.
157 156 */
158 157 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 158 AS_CLRUNMAPWAIT(as);
160 159 cv_broadcast(&as->a_cv);
161 160 }
162 161
163 162 mutex_exit(&as->a_contents);
164 163 return (0);
165 164 }
166 165
167 166 /*
168 167 * Search the callback list for an entry which pertains to arg.
169 168 *
170 169 * This is called from within the client upon completion of the callback.
171 170 * RETURN VALUES:
172 171 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 172 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 173 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 174 * entry will be made in as_do_callbacks)
176 175 *
177 176 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 177 * set, it indicates that as_do_callbacks is processing this entry. The
179 178 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 179 * to unblock as_do_callbacks, in case it is blocked.
181 180 *
182 181 * CALLERS RESPONSIBILITY: If not calling from within the process context for
183 182 * the specified as, the caller must guarantee persistence of the specified as
184 183 * for the duration of this function (eg. pages being locked within the as
185 184 * will guarantee persistence).
186 185 */
187 186 uint_t
188 187 as_delete_callback(struct as *as, void *arg)
189 188 {
190 189 struct as_callback **prevcb = &as->a_callbacks;
191 190 struct as_callback *cb;
192 191 uint_t rc = AS_CALLBACK_NOTFOUND;
193 192
194 193 mutex_enter(&as->a_contents);
195 194 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 195 if (cb->ascb_arg != arg)
197 196 continue;
198 197
199 198 /*
200 199 * If the events indicate AS_CALLBACK_CALLED, just clear
201 200 * AS_ALL_EVENT in the events field and wakeup the thread
202 201 * that may be waiting in as_do_callbacks. as_do_callbacks
203 202 * will take care of removing this entry from the list. In
204 203 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 204 * (AS_CALLBACK_CALLED not set), just remove it from the
206 205 * list, return the memory and return AS_CALLBACK_DELETED.
207 206 */
208 207 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 208 /* leave AS_CALLBACK_CALLED */
210 209 cb->ascb_events &= ~AS_ALL_EVENT;
211 210 rc = AS_CALLBACK_DELETE_DEFERRED;
212 211 cv_broadcast(&as->a_cv);
213 212 } else {
214 213 *prevcb = cb->ascb_next;
215 214 kmem_free(cb, sizeof (struct as_callback));
216 215 rc = AS_CALLBACK_DELETED;
217 216 }
218 217 break;
219 218 }
220 219 mutex_exit(&as->a_contents);
221 220 return (rc);
222 221 }
223 222
224 223 /*
225 224 * Searches the as callback list for a matching entry.
226 225 * Returns a pointer to the first matching callback, or NULL if
227 226 * nothing is found.
228 227 	 * This function never sleeps so it is ok to call it with locks
229 228 	 * held in addition to the (required) a_contents mutex.
230 229 *
231 230 * See also comment on as_do_callbacks below.
232 231 */
233 232 static struct as_callback *
234 233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 234 size_t event_len)
236 235 {
237 236 struct as_callback *cb;
238 237
239 238 ASSERT(MUTEX_HELD(&as->a_contents));
240 239 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
241 240 /*
242 241 * If the callback has not already been called, then
243 242 * check if events or address range pertains. An event_len
244 243 * of zero means do an unconditional callback.
245 244 */
246 245 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 246 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 247 (event_addr + event_len < cb->ascb_saddr) ||
249 248 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 249 continue;
251 250 }
252 251 break;
253 252 }
254 253 return (cb);
255 254 }
256 255
257 256 /*
258 257 * Executes a given callback and removes it from the callback list for
259 258 * this address space.
260 259 * This function may sleep so the caller must drop all locks except
261 260 * a_contents before calling this func.
262 261 *
263 262 * See also comments on as_do_callbacks below.
264 263 */
265 264 static void
266 265 as_execute_callback(struct as *as, struct as_callback *cb,
267 266 uint_t events)
268 267 {
269 268 struct as_callback **prevcb;
270 269 void *cb_arg;
271 270
272 271 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 272 cb->ascb_events |= AS_CALLBACK_CALLED;
274 273 mutex_exit(&as->a_contents);
275 274 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 275 mutex_enter(&as->a_contents);
277 276 /*
278 277 * the callback function is required to delete the callback
279 278 * when the callback function determines it is OK for
280 279 * this thread to continue. as_delete_callback will clear
281 280 * the AS_ALL_EVENT in the events field when it is deleted.
282 281 * If the callback function called as_delete_callback,
283 282 * events will already be cleared and there will be no blocking.
284 283 */
285 284 while ((cb->ascb_events & events) != 0) {
286 285 cv_wait(&as->a_cv, &as->a_contents);
287 286 }
288 287 /*
289 288 * This entry needs to be taken off the list. Normally, the
290 289 * callback func itself does that, but unfortunately the list
291 290 * may have changed while the callback was running because the
292 291 * a_contents mutex was dropped and someone else other than the
293 292 * callback func itself could have called as_delete_callback,
294 293 * so we have to search to find this entry again. The entry
295 294 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
296 295 */
297 296 cb_arg = cb->ascb_arg;
298 297 prevcb = &as->a_callbacks;
299 298 for (cb = as->a_callbacks; cb != NULL;
300 299 prevcb = &cb->ascb_next, cb = *prevcb) {
301 300 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 301 (cb_arg != cb->ascb_arg)) {
303 302 continue;
304 303 }
305 304 *prevcb = cb->ascb_next;
306 305 kmem_free(cb, sizeof (struct as_callback));
307 306 break;
308 307 }
309 308 }
310 309
311 310 /*
312 311 * Check the callback list for a matching event and intersection of
313 312 * address range. If there is a match invoke the callback. Skip an entry if:
314 313 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315 314 * - not event of interest
316 315 * - not address range of interest
317 316 *
318 317 * An event_len of zero indicates a request for an unconditional callback
319 318 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
320 319 * a_contents lock must be dropped before a callback, so only one callback
321 320 * can be done before returning. Return -1 (true) if a callback was
322 321 * executed and removed from the list, else return 0 (false).
323 322 *
324 323 * The logically separate parts, i.e. finding a matching callback and
325 324 * executing a given callback have been separated into two functions
326 325 * so that they can be called with different sets of locks held beyond
327 326 * the always-required a_contents. as_find_callback does not sleep so
328 327 * it is ok to call it if more locks than a_contents (i.e. the a_lock
329 328 * rwlock) are held. as_execute_callback on the other hand may sleep
330 329 * so all locks beyond a_contents must be dropped by the caller if one
331 330 	 * does not want to end up comatose.
332 331 */
333 332 static int
334 333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 334 size_t event_len)
336 335 {
337 336 struct as_callback *cb;
338 337
339 338 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 339 as_execute_callback(as, cb, events);
341 340 return (-1);
342 341 }
343 342 return (0);
344 343 }
345 344
346 345 /*
347 346 * Search for the segment containing addr. If a segment containing addr
348 347 * exists, that segment is returned. If no such segment exists, and
349 348 * the list spans addresses greater than addr, then the first segment
350 349 * whose base is greater than addr is returned; otherwise, NULL is
351 350 * returned unless tail is true, in which case the last element of the
352 351 * list is returned.
353 352 *
354 353 * a_seglast is used to cache the last found segment for repeated
355 354 * searches to the same addr (which happens frequently).
356 355 */
357 356 struct seg *
358 357 as_findseg(struct as *as, caddr_t addr, int tail)
359 358 {
360 359 struct seg *seg = as->a_seglast;
361 360 avl_index_t where;
362 361
363 362 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
364 363
365 364 if (seg != NULL &&
366 365 seg->s_base <= addr &&
367 366 addr < seg->s_base + seg->s_size)
368 367 return (seg);
369 368
370 369 seg = avl_find(&as->a_segtree, &addr, &where);
371 370 if (seg != NULL)
372 371 return (as->a_seglast = seg);
373 372
374 373 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 374 if (seg == NULL && tail)
376 375 seg = avl_last(&as->a_segtree);
377 376 return (as->a_seglast = seg);
378 377 }
379 378
380 379 #ifdef VERIFY_SEGLIST
381 380 /*
382 381 * verify that the linked list is coherent
383 382 */
384 383 static void
385 384 as_verify(struct as *as)
386 385 {
387 386 struct seg *seg, *seglast, *p, *n;
388 387 uint_t nsegs = 0;
389 388
390 389 if (do_as_verify == 0)
391 390 return;
392 391
393 392 seglast = as->a_seglast;
394 393
395 394 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 395 ASSERT(seg->s_as == as);
397 396 p = AS_SEGPREV(as, seg);
398 397 n = AS_SEGNEXT(as, seg);
399 398 ASSERT(p == NULL || p->s_as == as);
400 399 ASSERT(p == NULL || p->s_base < seg->s_base);
401 400 ASSERT(n == NULL || n->s_base > seg->s_base);
402 401 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 402 if (seg == seglast)
404 403 seglast = NULL;
405 404 nsegs++;
406 405 }
407 406 ASSERT(seglast == NULL);
408 407 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409 408 }
410 409 #endif /* VERIFY_SEGLIST */
411 410
412 411 /*
413 412 * Add a new segment to the address space. The avl_find()
414 413 * may be expensive so we attempt to use last segment accessed
415 414 * in as_gap() as an insertion point.
416 415 */
417 416 int
418 417 as_addseg(struct as *as, struct seg *newseg)
419 418 {
420 419 struct seg *seg;
421 420 caddr_t addr;
422 421 caddr_t eaddr;
423 422 avl_index_t where;
424 423
425 424 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
426 425
427 426 as->a_updatedir = 1; /* inform /proc */
428 427 gethrestime(&as->a_updatetime);
429 428
430 429 if (as->a_lastgaphl != NULL) {
431 430 struct seg *hseg = NULL;
432 431 struct seg *lseg = NULL;
433 432
434 433 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 434 hseg = as->a_lastgaphl;
436 435 lseg = AVL_PREV(&as->a_segtree, hseg);
437 436 } else {
438 437 lseg = as->a_lastgaphl;
439 438 hseg = AVL_NEXT(&as->a_segtree, lseg);
440 439 }
441 440
442 441 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 442 hseg->s_base > newseg->s_base) {
444 443 avl_insert_here(&as->a_segtree, newseg, lseg,
445 444 AVL_AFTER);
446 445 as->a_lastgaphl = NULL;
447 446 as->a_seglast = newseg;
448 447 return (0);
449 448 }
450 449 as->a_lastgaphl = NULL;
451 450 }
452 451
453 452 addr = newseg->s_base;
454 453 eaddr = addr + newseg->s_size;
455 454 again:
456 455
457 456 seg = avl_find(&as->a_segtree, &addr, &where);
458 457
459 458 if (seg == NULL)
460 459 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
461 460
462 461 if (seg == NULL)
463 462 seg = avl_last(&as->a_segtree);
464 463
465 464 if (seg != NULL) {
466 465 caddr_t base = seg->s_base;
467 466
468 467 /*
469 468 * If top of seg is below the requested address, then
470 469 * the insertion point is at the end of the linked list,
471 470 * and seg points to the tail of the list. Otherwise,
472 471 * the insertion point is immediately before seg.
473 472 */
474 473 if (base + seg->s_size > addr) {
475 474 if (addr >= base || eaddr > base) {
476 475 #ifdef __sparc
477 476 extern struct seg_ops segnf_ops;
478 477
479 478 /*
480 479 * no-fault segs must disappear if overlaid.
481 480 * XXX need new segment type so
482 481 * we don't have to check s_ops
483 482 */
484 483 if (seg->s_ops == &segnf_ops) {
485 484 seg_unmap(seg);
486 485 goto again;
487 486 }
488 487 #endif
489 488 return (-1); /* overlapping segment */
490 489 }
491 490 }
492 491 }
493 492 as->a_seglast = newseg;
494 493 avl_insert(&as->a_segtree, newseg, where);
495 494
496 495 #ifdef VERIFY_SEGLIST
497 496 as_verify(as);
498 497 #endif
499 498 return (0);
500 499 }
501 500
502 501 struct seg *
503 502 as_removeseg(struct as *as, struct seg *seg)
504 503 {
505 504 avl_tree_t *t;
506 505
507 506 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
508 507
509 508 as->a_updatedir = 1; /* inform /proc */
510 509 gethrestime(&as->a_updatetime);
511 510
512 511 if (seg == NULL)
513 512 return (NULL);
514 513
515 514 t = &as->a_segtree;
516 515 if (as->a_seglast == seg)
517 516 as->a_seglast = NULL;
518 517 as->a_lastgaphl = NULL;
519 518
520 519 /*
521 520 * if this segment is at an address higher than
522 521 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523 522 */
524 523 if (as->a_lastgap &&
525 524 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 525 as->a_lastgap = AVL_NEXT(t, seg);
527 526
528 527 /*
529 528 * remove the segment from the seg tree
530 529 */
531 530 avl_remove(t, seg);
532 531
533 532 #ifdef VERIFY_SEGLIST
534 533 as_verify(as);
535 534 #endif
536 535 return (seg);
537 536 }
538 537
539 538 /*
540 539 * Find a segment containing addr.
541 540 */
542 541 struct seg *
543 542 as_segat(struct as *as, caddr_t addr)
544 543 {
545 544 struct seg *seg = as->a_seglast;
546 545
547 546 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
548 547
549 548 if (seg != NULL && seg->s_base <= addr &&
550 549 addr < seg->s_base + seg->s_size)
551 550 return (seg);
552 551
553 552 seg = avl_find(&as->a_segtree, &addr, NULL);
554 553 return (seg);
555 554 }
556 555
557 556 /*
558 557 * Serialize all searches for holes in an address space to
559 558 * prevent two or more threads from allocating the same virtual
560 559 * address range. The address space must not be "read/write"
561 560 * locked by the caller since we may block.
562 561 */
563 562 void
564 563 as_rangelock(struct as *as)
565 564 {
566 565 mutex_enter(&as->a_contents);
567 566 while (AS_ISCLAIMGAP(as))
568 567 cv_wait(&as->a_cv, &as->a_contents);
569 568 AS_SETCLAIMGAP(as);
570 569 mutex_exit(&as->a_contents);
571 570 }
572 571
573 572 /*
574 573 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
575 574 */
576 575 void
577 576 as_rangeunlock(struct as *as)
578 577 {
579 578 mutex_enter(&as->a_contents);
580 579 AS_CLRCLAIMGAP(as);
581 580 cv_signal(&as->a_cv);
582 581 mutex_exit(&as->a_contents);
583 582 }
584 583
585 584 /*
586 585 	 * compare segments (or just an address) by segment address range
587 586 */
588 587 static int
589 588 as_segcompar(const void *x, const void *y)
590 589 {
591 590 struct seg *a = (struct seg *)x;
592 591 struct seg *b = (struct seg *)y;
593 592
594 593 if (a->s_base < b->s_base)
595 594 return (-1);
596 595 if (a->s_base >= b->s_base + b->s_size)
597 596 return (1);
598 597 return (0);
599 598 }
600 599
601 600
602 601 void
603 602 as_avlinit(struct as *as)
604 603 {
605 604 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
606 605 offsetof(struct seg, s_tree));
607 606 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
608 607 offsetof(struct watched_page, wp_link));
609 608 }
610 609
611 610 /*ARGSUSED*/
612 611 static int
613 612 as_constructor(void *buf, void *cdrarg, int kmflags)
614 613 {
615 614 struct as *as = buf;
616 615
617 616 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
618 617 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
619 618 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
620 619 as_avlinit(as);
621 620 return (0);
622 621 }
623 622
624 623 /*ARGSUSED1*/
625 624 static void
626 625 as_destructor(void *buf, void *cdrarg)
627 626 {
628 627 struct as *as = buf;
629 628
630 629 avl_destroy(&as->a_segtree);
631 630 mutex_destroy(&as->a_contents);
632 631 cv_destroy(&as->a_cv);
633 632 rw_destroy(&as->a_lock);
634 633 }
635 634
636 635 void
637 636 as_init(void)
638 637 {
639 638 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
640 639 as_constructor, as_destructor, NULL, NULL, NULL, 0);
641 640 }
642 641
643 642 /*
644 643 * Allocate and initialize an address space data structure.
645 644 * We call hat_alloc to allow any machine dependent
646 645 * information in the hat structure to be initialized.
647 646 */
648 647 struct as *
649 648 as_alloc(void)
650 649 {
651 650 struct as *as;
652 651
653 652 as = kmem_cache_alloc(as_cache, KM_SLEEP);
654 653
655 654 as->a_flags = 0;
656 655 as->a_vbits = 0;
657 656 as->a_hrm = NULL;
658 657 as->a_seglast = NULL;
659 658 as->a_size = 0;
660 659 as->a_resvsize = 0;
661 660 as->a_updatedir = 0;
662 661 gethrestime(&as->a_updatetime);
663 662 as->a_objectdir = NULL;
664 663 as->a_sizedir = 0;
665 664 as->a_userlimit = (caddr_t)USERLIMIT;
666 665 as->a_lastgap = NULL;
667 666 as->a_lastgaphl = NULL;
668 667 as->a_callbacks = NULL;
669 668
670 669 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
671 670 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
672 671 AS_LOCK_EXIT(as, &as->a_lock);
673 672
674 - as->a_xhat = NULL;
675 -
676 673 return (as);
677 674 }
678 675
679 676 /*
680 677 * Free an address space data structure.
681 678 * Need to free the hat first and then
682 679 * all the segments on this as and finally
683 680 * the space for the as struct itself.
684 681 */
685 682 void
686 683 as_free(struct as *as)
687 684 {
688 685 struct hat *hat = as->a_hat;
689 686 struct seg *seg, *next;
690 - int called = 0;
687 + boolean_t free_started = B_FALSE;
691 688
692 689 top:
693 690 /*
694 691 * Invoke ALL callbacks. as_do_callbacks will do one callback
695 692 * per call, and not return (-1) until the callback has completed.
696 693 * When as_do_callbacks returns zero, all callbacks have completed.
697 694 */
698 695 mutex_enter(&as->a_contents);
699 696 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
700 697 ;
701 698
702 - /* This will prevent new XHATs from attaching to as */
703 - if (!called)
704 - AS_SETBUSY(as);
705 699 mutex_exit(&as->a_contents);
706 700 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
707 701
708 - if (!called) {
709 - called = 1;
702 + if (!free_started) {
703 + free_started = B_TRUE;
710 704 hat_free_start(hat);
711 - if (as->a_xhat != NULL)
712 - xhat_free_start_all(as);
713 705 }
714 706 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
715 707 int err;
716 708
717 709 next = AS_SEGNEXT(as, seg);
718 710 retry:
719 711 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
720 712 if (err == EAGAIN) {
721 713 mutex_enter(&as->a_contents);
722 714 if (as->a_callbacks) {
723 715 AS_LOCK_EXIT(as, &as->a_lock);
724 716 } else if (!AS_ISNOUNMAPWAIT(as)) {
725 717 /*
726 718 * Memory is currently locked. Wait for a
727 719 * cv_signal that it has been unlocked, then
728 720 * try the operation again.
729 721 */
730 722 if (AS_ISUNMAPWAIT(as) == 0)
731 723 cv_broadcast(&as->a_cv);
732 724 AS_SETUNMAPWAIT(as);
733 725 AS_LOCK_EXIT(as, &as->a_lock);
734 726 while (AS_ISUNMAPWAIT(as))
735 727 cv_wait(&as->a_cv, &as->a_contents);
736 728 } else {
737 729 /*
738 730 * We may have raced with
739 731 * segvn_reclaim()/segspt_reclaim(). In this
740 732 * case clean nounmapwait flag and retry since
741 733 * softlockcnt in this segment may be already
742 734 * 0. We don't drop as writer lock so our
743 735 * number of retries without sleeping should
744 736 * be very small. See segvn_reclaim() for
745 737 * more comments.
746 738 */
747 739 AS_CLRNOUNMAPWAIT(as);
748 740 mutex_exit(&as->a_contents);
749 741 goto retry;
750 742 }
751 743 mutex_exit(&as->a_contents);
752 744 goto top;
753 745 } else {
754 746 /*
755 747 * We do not expect any other error return at this
756 748 * time. This is similar to an ASSERT in seg_unmap()
757 749 */
758 750 ASSERT(err == 0);
759 751 }
760 752 }
761 753 hat_free_end(hat);
762 - if (as->a_xhat != NULL)
763 - xhat_free_end_all(as);
764 754 AS_LOCK_EXIT(as, &as->a_lock);
765 755
766 756 /* /proc stuff */
767 757 ASSERT(avl_numnodes(&as->a_wpage) == 0);
768 758 if (as->a_objectdir) {
769 759 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
770 760 as->a_objectdir = NULL;
771 761 as->a_sizedir = 0;
772 762 }
773 763
774 764 /*
775 765 * Free the struct as back to kmem. Assert it has no segments.
776 766 */
777 767 ASSERT(avl_numnodes(&as->a_segtree) == 0);
778 768 kmem_cache_free(as_cache, as);
779 769 }
780 770
781 771 int
782 772 as_dup(struct as *as, struct proc *forkedproc)
783 773 {
784 774 struct as *newas;
785 775 struct seg *seg, *newseg;
786 776 size_t purgesize = 0;
787 777 int error;
788 778
789 779 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
790 780 as_clearwatch(as);
791 781 newas = as_alloc();
792 782 newas->a_userlimit = as->a_userlimit;
793 783 newas->a_proc = forkedproc;
794 784
795 785 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
796 786
797 - /* This will prevent new XHATs from attaching */
798 - mutex_enter(&as->a_contents);
799 - AS_SETBUSY(as);
800 - mutex_exit(&as->a_contents);
801 - mutex_enter(&newas->a_contents);
802 - AS_SETBUSY(newas);
803 - mutex_exit(&newas->a_contents);
804 -
805 787 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
806 788
807 789 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
808 790
809 791 if (seg->s_flags & S_PURGE) {
810 792 purgesize += seg->s_size;
811 793 continue;
812 794 }
813 795
814 796 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
815 797 if (newseg == NULL) {
816 798 AS_LOCK_EXIT(newas, &newas->a_lock);
817 799 as_setwatch(as);
818 - mutex_enter(&as->a_contents);
819 - AS_CLRBUSY(as);
820 - mutex_exit(&as->a_contents);
821 800 AS_LOCK_EXIT(as, &as->a_lock);
822 801 as_free(newas);
823 802 return (-1);
824 803 }
825 804 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
826 805 /*
827 806 * We call seg_free() on the new seg
828 807 * because the segment is not set up
829 808 * completely; i.e. it has no ops.
830 809 */
831 810 as_setwatch(as);
832 - mutex_enter(&as->a_contents);
833 - AS_CLRBUSY(as);
834 - mutex_exit(&as->a_contents);
835 811 AS_LOCK_EXIT(as, &as->a_lock);
836 812 seg_free(newseg);
837 813 AS_LOCK_EXIT(newas, &newas->a_lock);
838 814 as_free(newas);
839 815 return (error);
840 816 }
841 817 newas->a_size += seg->s_size;
842 818 }
843 819 newas->a_resvsize = as->a_resvsize - purgesize;
844 820
845 821 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
846 - if (as->a_xhat != NULL)
847 - error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
848 822
849 - mutex_enter(&newas->a_contents);
850 - AS_CLRBUSY(newas);
851 - mutex_exit(&newas->a_contents);
852 823 AS_LOCK_EXIT(newas, &newas->a_lock);
853 824
854 825 as_setwatch(as);
855 - mutex_enter(&as->a_contents);
856 - AS_CLRBUSY(as);
857 - mutex_exit(&as->a_contents);
858 826 AS_LOCK_EXIT(as, &as->a_lock);
859 827 if (error != 0) {
860 828 as_free(newas);
861 829 return (error);
862 830 }
863 831 forkedproc->p_as = newas;
864 832 return (0);
865 833 }
866 834
867 835 /*
868 836 * Handle a ``fault'' at addr for size bytes.
869 837 */
870 838 faultcode_t
871 839 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
872 840 enum fault_type type, enum seg_rw rw)
873 841 {
874 842 struct seg *seg;
875 843 caddr_t raddr; /* rounded down addr */
876 844 size_t rsize; /* rounded up size */
877 845 size_t ssize;
878 846 faultcode_t res = 0;
879 847 caddr_t addrsav;
880 848 struct seg *segsav;
881 849 int as_lock_held;
882 850 klwp_t *lwp = ttolwp(curthread);
883 - int is_xhat = 0;
884 851 int holding_wpage = 0;
885 - extern struct seg_ops segdev_ops;
886 852
887 853
888 854
889 - if (as->a_hat != hat) {
890 - /* This must be an XHAT then */
891 - is_xhat = 1;
892 -
893 - if ((type != F_INVAL) || (as == &kas))
894 - return (FC_NOSUPPORT);
895 - }
896 -
897 855 retry:
898 - if (!is_xhat) {
899 - /*
900 - * Indicate that the lwp is not to be stopped while waiting
901 - * for a pagefault. This is to avoid deadlock while debugging
902 - * a process via /proc over NFS (in particular).
903 - */
904 - if (lwp != NULL)
905 - lwp->lwp_nostop++;
856 + /*
857 + * Indicate that the lwp is not to be stopped while waiting for a
858 + * pagefault. This is to avoid deadlock while debugging a process
859 + * via /proc over NFS (in particular).
860 + */
861 + if (lwp != NULL)
862 + lwp->lwp_nostop++;
906 863
907 - /*
908 - * same length must be used when we softlock and softunlock.
909 - * We don't support softunlocking lengths less than
910 - * the original length when there is largepage support.
911 - * See seg_dev.c for more comments.
912 - */
913 - switch (type) {
864 + /*
865 + * same length must be used when we softlock and softunlock. We
866 + * don't support softunlocking lengths less than the original length
867 + * when there is largepage support. See seg_dev.c for more
868 + * comments.
869 + */
870 + switch (type) {
914 871
915 - case F_SOFTLOCK:
916 - CPU_STATS_ADD_K(vm, softlock, 1);
917 - break;
872 + case F_SOFTLOCK:
873 + CPU_STATS_ADD_K(vm, softlock, 1);
874 + break;
918 875
919 - case F_SOFTUNLOCK:
920 - break;
876 + case F_SOFTUNLOCK:
877 + break;
921 878
922 - case F_PROT:
923 - CPU_STATS_ADD_K(vm, prot_fault, 1);
924 - break;
879 + case F_PROT:
880 + CPU_STATS_ADD_K(vm, prot_fault, 1);
881 + break;
925 882
926 - case F_INVAL:
927 - CPU_STATS_ENTER_K();
928 - CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
929 - if (as == &kas)
930 - CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
931 - CPU_STATS_EXIT_K();
932 - break;
933 - }
883 + case F_INVAL:
884 + CPU_STATS_ENTER_K();
885 + CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
886 + if (as == &kas)
887 + CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
888 + CPU_STATS_EXIT_K();
889 + break;
934 890 }
935 891
936 892 /* Kernel probe */
937 893 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
938 894 tnf_opaque, address, addr,
939 895 tnf_fault_type, fault_type, type,
940 896 tnf_seg_access, access, rw);
941 897
942 898 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
943 899 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
944 900 (size_t)raddr;
945 901
946 902 /*
947 903 * XXX -- Don't grab the as lock for segkmap. We should grab it for
948 904 * correctness, but then we could be stuck holding this lock for
949 905 * a LONG time if the fault needs to be resolved on a slow
950 906 * filesystem, and then no-one will be able to exec new commands,
951 907 * as exec'ing requires the write lock on the as.
952 908 */
953 909 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
954 910 raddr + size < segkmap->s_base + segkmap->s_size) {
955 - /*
956 - * if (as==&kas), this can't be XHAT: we've already returned
957 - * FC_NOSUPPORT.
958 - */
959 911 seg = segkmap;
960 912 as_lock_held = 0;
961 913 } else {
962 914 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
963 - if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
964 - /*
965 - * Grab and hold the writers' lock on the as
966 - * if the fault is to a watched page.
967 - * This will keep CPUs from "peeking" at the
968 - * address range while we're temporarily boosting
969 - * the permissions for the XHAT device to
970 - * resolve the fault in the segment layer.
971 - *
972 - * We could check whether faulted address
973 - * is within a watched page and only then grab
974 - * the writer lock, but this is simpler.
975 - */
976 - AS_LOCK_EXIT(as, &as->a_lock);
977 - AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
978 - }
979 915
980 916 seg = as_segat(as, raddr);
981 917 if (seg == NULL) {
982 918 AS_LOCK_EXIT(as, &as->a_lock);
983 - if ((lwp != NULL) && (!is_xhat))
919 + if (lwp != NULL)
984 920 lwp->lwp_nostop--;
985 921 return (FC_NOMAP);
986 922 }
987 923
988 924 as_lock_held = 1;
989 925 }
990 926
991 927 addrsav = raddr;
992 928 segsav = seg;
993 929
994 930 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
995 931 if (raddr >= seg->s_base + seg->s_size) {
996 932 seg = AS_SEGNEXT(as, seg);
997 933 if (seg == NULL || raddr != seg->s_base) {
998 934 res = FC_NOMAP;
999 935 break;
1000 936 }
1001 937 }
1002 938 if (raddr + rsize > seg->s_base + seg->s_size)
1003 939 ssize = seg->s_base + seg->s_size - raddr;
1004 940 else
1005 941 ssize = rsize;
1006 942
1007 - if (!is_xhat || (seg->s_ops != &segdev_ops)) {
943 + res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
1008 944
1009 - if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
1010 - pr_is_watchpage_as(raddr, rw, as)) {
1011 - /*
1012 - * Handle watch pages. If we're faulting on a
1013 - * watched page from an X-hat, we have to
1014 - * restore the original permissions while we
1015 - * handle the fault.
1016 - */
1017 - as_clearwatch(as);
1018 - holding_wpage = 1;
1019 - }
1020 -
1021 - res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
1022 -
1023 - /* Restore watchpoints */
1024 - if (holding_wpage) {
1025 - as_setwatch(as);
1026 - holding_wpage = 0;
1027 - }
945 + /* Restore watchpoints */
946 + if (holding_wpage) {
947 + as_setwatch(as);
948 + holding_wpage = 0;
949 + }
1028 950
1029 - if (res != 0)
1030 - break;
1031 - } else {
1032 - /* XHAT does not support seg_dev */
1033 - res = FC_NOSUPPORT;
951 + if (res != 0)
1034 952 break;
1035 - }
1036 953 }
1037 954
1038 955 /*
1039 956 * If we were SOFTLOCKing and encountered a failure,
1040 957 * we must SOFTUNLOCK the range we already did. (Maybe we
1041 958 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
1042 959 * right here...)
1043 960 */
1044 961 if (res != 0 && type == F_SOFTLOCK) {
1045 962 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
1046 963 if (addrsav >= seg->s_base + seg->s_size)
1047 964 seg = AS_SEGNEXT(as, seg);
1048 965 ASSERT(seg != NULL);
1049 966 /*
1050 967 * Now call the fault routine again to perform the
1051 968 * unlock using S_OTHER instead of the rw variable
1052 969 * since we never got a chance to touch the pages.
1053 970 */
1054 971 if (raddr > seg->s_base + seg->s_size)
1055 972 ssize = seg->s_base + seg->s_size - addrsav;
1056 973 else
1057 974 ssize = raddr - addrsav;
1058 975 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
1059 976 F_SOFTUNLOCK, S_OTHER);
1060 977 }
1061 978 }
1062 979 if (as_lock_held)
1063 980 AS_LOCK_EXIT(as, &as->a_lock);
1064 - if ((lwp != NULL) && (!is_xhat))
981 + if (lwp != NULL)
1065 982 lwp->lwp_nostop--;
1066 983
1067 984 /*
1068 985 * If the lower levels returned EDEADLK for a fault,
1069 986 	 * it means that we should retry the fault. Let's wait
1070 987 * a bit also to let the deadlock causing condition clear.
1071 988 * This is part of a gross hack to work around a design flaw
1072 989 * in the ufs/sds logging code and should go away when the
1073 990 * logging code is re-designed to fix the problem. See bug
1074 991 * 4125102 for details of the problem.
1075 992 */
1076 993 if (FC_ERRNO(res) == EDEADLK) {
1077 994 delay(deadlk_wait);
1078 995 res = 0;
1079 996 goto retry;
1080 997 }
1081 998 return (res);
1082 999 }
1083 1000
1084 1001
1085 1002
1086 1003 /*
1087 1004 * Asynchronous ``fault'' at addr for size bytes.
1088 1005 */
1089 1006 faultcode_t
1090 1007 as_faulta(struct as *as, caddr_t addr, size_t size)
1091 1008 {
1092 1009 struct seg *seg;
1093 1010 caddr_t raddr; /* rounded down addr */
1094 1011 size_t rsize; /* rounded up size */
1095 1012 faultcode_t res = 0;
1096 1013 klwp_t *lwp = ttolwp(curthread);
1097 1014
1098 1015 retry:
1099 1016 /*
1100 1017 * Indicate that the lwp is not to be stopped while waiting
1101 1018 * for a pagefault. This is to avoid deadlock while debugging
1102 1019 * a process via /proc over NFS (in particular).
1103 1020 */
1104 1021 if (lwp != NULL)
1105 1022 lwp->lwp_nostop++;
1106 1023
1107 1024 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1108 1025 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1109 1026 (size_t)raddr;
1110 1027
1111 1028 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1112 1029 seg = as_segat(as, raddr);
1113 1030 if (seg == NULL) {
1114 1031 AS_LOCK_EXIT(as, &as->a_lock);
1115 1032 if (lwp != NULL)
1116 1033 lwp->lwp_nostop--;
1117 1034 return (FC_NOMAP);
1118 1035 }
1119 1036
1120 1037 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1121 1038 if (raddr >= seg->s_base + seg->s_size) {
1122 1039 seg = AS_SEGNEXT(as, seg);
1123 1040 if (seg == NULL || raddr != seg->s_base) {
1124 1041 res = FC_NOMAP;
1125 1042 break;
1126 1043 }
1127 1044 }
1128 1045 res = SEGOP_FAULTA(seg, raddr);
1129 1046 if (res != 0)
1130 1047 break;
1131 1048 }
1132 1049 AS_LOCK_EXIT(as, &as->a_lock);
1133 1050 if (lwp != NULL)
1134 1051 lwp->lwp_nostop--;
1135 1052 /*
1136 1053 * If the lower levels returned EDEADLK for a fault,
1137 1054 	 * it means that we should retry the fault. Let's wait
1138 1055 * a bit also to let the deadlock causing condition clear.
1139 1056 * This is part of a gross hack to work around a design flaw
1140 1057 * in the ufs/sds logging code and should go away when the
1141 1058 * logging code is re-designed to fix the problem. See bug
1142 1059 * 4125102 for details of the problem.
1143 1060 */
1144 1061 if (FC_ERRNO(res) == EDEADLK) {
1145 1062 delay(deadlk_wait);
1146 1063 res = 0;
1147 1064 goto retry;
1148 1065 }
1149 1066 return (res);
1150 1067 }
1151 1068
1152 1069 /*
1153 1070 * Set the virtual mapping for the interval from [addr : addr + size)
1154 1071 * in address space `as' to have the specified protection.
1155 1072 * It is ok for the range to cross over several segments,
1156 1073 * as long as they are contiguous.
1157 1074 */
1158 1075 int
1159 1076 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1160 1077 {
1161 1078 struct seg *seg;
1162 1079 struct as_callback *cb;
1163 1080 size_t ssize;
1164 1081 caddr_t raddr; /* rounded down addr */
1165 1082 size_t rsize; /* rounded up size */
1166 1083 int error = 0, writer = 0;
1167 1084 caddr_t saveraddr;
1168 1085 size_t saversize;
1169 1086
1170 1087 setprot_top:
1171 1088 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1172 1089 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1173 1090 (size_t)raddr;
1174 1091
1175 1092 if (raddr + rsize < raddr) /* check for wraparound */
1176 1093 return (ENOMEM);
1177 1094
1178 1095 saveraddr = raddr;
1179 1096 saversize = rsize;
1180 1097
1181 1098 /*
1182 1099 * Normally we only lock the as as a reader. But
1183 1100 * if due to setprot the segment driver needs to split
1184 1101 * a segment it will return IE_RETRY. Therefore we re-acquire
1185 1102 * the as lock as a writer so the segment driver can change
1186 1103 * the seg list. Also the segment driver will return IE_RETRY
1187 1104 * after it has changed the segment list so we therefore keep
1188 1105 	 * locking as a writer. Since these operations should be rare we
1189 1106 	 * want to lock as a writer only when necessary.
1190 1107 */
1191 1108 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1192 1109 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1193 1110 } else {
1194 1111 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1195 1112 }
1196 1113
1197 1114 as_clearwatchprot(as, raddr, rsize);
1198 1115 seg = as_segat(as, raddr);
1199 1116 if (seg == NULL) {
1200 1117 as_setwatch(as);
1201 1118 AS_LOCK_EXIT(as, &as->a_lock);
1202 1119 return (ENOMEM);
1203 1120 }
1204 1121
1205 1122 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1206 1123 if (raddr >= seg->s_base + seg->s_size) {
1207 1124 seg = AS_SEGNEXT(as, seg);
1208 1125 if (seg == NULL || raddr != seg->s_base) {
1209 1126 error = ENOMEM;
1210 1127 break;
1211 1128 }
1212 1129 }
1213 1130 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1214 1131 ssize = seg->s_base + seg->s_size - raddr;
1215 1132 else
1216 1133 ssize = rsize;
1217 1134 retry:
1218 1135 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1219 1136
1220 1137 if (error == IE_NOMEM) {
1221 1138 error = EAGAIN;
1222 1139 break;
1223 1140 }
1224 1141
1225 1142 if (error == IE_RETRY) {
1226 1143 AS_LOCK_EXIT(as, &as->a_lock);
1227 1144 writer = 1;
1228 1145 goto setprot_top;
1229 1146 }
1230 1147
1231 1148 if (error == EAGAIN) {
1232 1149 /*
1233 1150 * Make sure we have a_lock as writer.
1234 1151 */
1235 1152 if (writer == 0) {
1236 1153 AS_LOCK_EXIT(as, &as->a_lock);
1237 1154 writer = 1;
1238 1155 goto setprot_top;
1239 1156 }
1240 1157
1241 1158 /*
1242 1159 * Memory is currently locked. It must be unlocked
1243 1160 * before this operation can succeed through a retry.
1244 1161 * The possible reasons for locked memory and
1245 1162 * corresponding strategies for unlocking are:
1246 1163 * (1) Normal I/O
1247 1164 * wait for a signal that the I/O operation
1248 1165 * has completed and the memory is unlocked.
1249 1166 * (2) Asynchronous I/O
1250 1167 * The aio subsystem does not unlock pages when
1251 1168 * the I/O is completed. Those pages are unlocked
1252 1169 * when the application calls aiowait/aioerror.
1253 1170 * So, to prevent blocking forever, cv_broadcast()
1254 1171 * is done to wake up aio_cleanup_thread.
1255 1172 * Subsequently, segvn_reclaim will be called, and
1256 1173 * that will do AS_CLRUNMAPWAIT() and wake us up.
1257 1174 * (3) Long term page locking:
1258 1175 * Drivers intending to have pages locked for a
1259 1176 * period considerably longer than for normal I/O
1260 1177 * (essentially forever) may have registered for a
1261 1178 * callback so they may unlock these pages on
1262 1179 * request. This is needed to allow this operation
1263 1180 * to succeed. Each entry on the callback list is
1264 1181 * examined. If the event or address range pertains
1265 1182 * the callback is invoked (unless it already is in
1266 1183 * progress). The a_contents lock must be dropped
1267 1184 * before the callback, so only one callback can
1268 1185 * be done at a time. Go to the top and do more
1269 1186 * until zero is returned. If zero is returned,
1270 1187 * either there were no callbacks for this event
1271 1188 * or they were already in progress.
1272 1189 */
1273 1190 mutex_enter(&as->a_contents);
1274 1191 if (as->a_callbacks &&
1275 1192 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1276 1193 seg->s_base, seg->s_size))) {
1277 1194 AS_LOCK_EXIT(as, &as->a_lock);
1278 1195 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1279 1196 } else if (!AS_ISNOUNMAPWAIT(as)) {
1280 1197 if (AS_ISUNMAPWAIT(as) == 0)
1281 1198 cv_broadcast(&as->a_cv);
1282 1199 AS_SETUNMAPWAIT(as);
1283 1200 AS_LOCK_EXIT(as, &as->a_lock);
1284 1201 while (AS_ISUNMAPWAIT(as))
1285 1202 cv_wait(&as->a_cv, &as->a_contents);
1286 1203 } else {
1287 1204 /*
1288 1205 * We may have raced with
1289 1206 * segvn_reclaim()/segspt_reclaim(). In this
1290 1207 * case clean nounmapwait flag and retry since
1291 1208 * softlockcnt in this segment may be already
1292 1209 * 0. We don't drop as writer lock so our
1293 1210 * number of retries without sleeping should
1294 1211 * be very small. See segvn_reclaim() for
1295 1212 * more comments.
1296 1213 */
1297 1214 AS_CLRNOUNMAPWAIT(as);
1298 1215 mutex_exit(&as->a_contents);
1299 1216 goto retry;
1300 1217 }
1301 1218 mutex_exit(&as->a_contents);
1302 1219 goto setprot_top;
1303 1220 } else if (error != 0)
1304 1221 break;
1305 1222 }
1306 1223 if (error != 0) {
1307 1224 as_setwatch(as);
1308 1225 } else {
1309 1226 as_setwatchprot(as, saveraddr, saversize, prot);
1310 1227 }
1311 1228 AS_LOCK_EXIT(as, &as->a_lock);
1312 1229 return (error);
1313 1230 }
1314 1231
1315 1232 /*
1316 1233 * Check to make sure that the interval [addr, addr + size)
1317 1234 * in address space `as' has at least the specified protection.
1318 1235 * It is ok for the range to cross over several segments, as long
1319 1236 * as they are contiguous.
1320 1237 */
1321 1238 int
1322 1239 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1323 1240 {
1324 1241 struct seg *seg;
1325 1242 size_t ssize;
1326 1243 caddr_t raddr; /* rounded down addr */
1327 1244 size_t rsize; /* rounded up size */
1328 1245 int error = 0;
1329 1246
1330 1247 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1331 1248 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1332 1249 (size_t)raddr;
1333 1250
1334 1251 if (raddr + rsize < raddr) /* check for wraparound */
1335 1252 return (ENOMEM);
1336 1253
1337 1254 /*
1338 1255 * This is ugly as sin...
1339 1256 * Normally, we only acquire the address space readers lock.
1340 1257 * However, if the address space has watchpoints present,
1341 1258 * we must acquire the writer lock on the address space for
1342 1259 * the benefit of as_clearwatchprot() and as_setwatchprot().
1343 1260 */
1344 1261 if (avl_numnodes(&as->a_wpage) != 0)
1345 1262 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1346 1263 else
1347 1264 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1348 1265 as_clearwatchprot(as, raddr, rsize);
1349 1266 seg = as_segat(as, raddr);
1350 1267 if (seg == NULL) {
1351 1268 as_setwatch(as);
1352 1269 AS_LOCK_EXIT(as, &as->a_lock);
1353 1270 return (ENOMEM);
1354 1271 }
1355 1272
1356 1273 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1357 1274 if (raddr >= seg->s_base + seg->s_size) {
1358 1275 seg = AS_SEGNEXT(as, seg);
1359 1276 if (seg == NULL || raddr != seg->s_base) {
1360 1277 error = ENOMEM;
1361 1278 break;
1362 1279 }
1363 1280 }
1364 1281 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1365 1282 ssize = seg->s_base + seg->s_size - raddr;
1366 1283 else
1367 1284 ssize = rsize;
1368 1285
1369 1286 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1370 1287 if (error != 0)
1371 1288 break;
1372 1289 }
1373 1290 as_setwatch(as);
1374 1291 AS_LOCK_EXIT(as, &as->a_lock);
1375 1292 return (error);
1376 1293 }
1377 1294
1378 1295 int
1379 1296 as_unmap(struct as *as, caddr_t addr, size_t size)
1380 1297 {
1381 1298 struct seg *seg, *seg_next;
1382 1299 struct as_callback *cb;
1383 1300 caddr_t raddr, eaddr;
1384 1301 size_t ssize, rsize = 0;
1385 1302 int err;
1386 1303
1387 1304 top:
1388 1305 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1389 1306 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1390 1307 (uintptr_t)PAGEMASK);
1391 1308
1392 1309 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1393 1310
1394 1311 as->a_updatedir = 1; /* inform /proc */
1395 1312 gethrestime(&as->a_updatetime);
1396 1313
1397 1314 /*
1398 1315 * Use as_findseg to find the first segment in the range, then
1399 1316 * step through the segments in order, following s_next.
1400 1317 */
1401 1318 as_clearwatchprot(as, raddr, eaddr - raddr);
1402 1319
1403 1320 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1404 1321 if (eaddr <= seg->s_base)
1405 1322 break; /* eaddr was in a gap; all done */
1406 1323
1407 1324 /* this is implied by the test above */
1408 1325 ASSERT(raddr < eaddr);
1409 1326
1410 1327 if (raddr < seg->s_base)
1411 1328 raddr = seg->s_base; /* raddr was in a gap */
1412 1329
1413 1330 if (eaddr > (seg->s_base + seg->s_size))
1414 1331 ssize = seg->s_base + seg->s_size - raddr;
1415 1332 else
1416 1333 ssize = eaddr - raddr;
1417 1334
1418 1335 /*
1419 1336 * Save next segment pointer since seg can be
1420 1337 * destroyed during the segment unmap operation.
1421 1338 */
1422 1339 seg_next = AS_SEGNEXT(as, seg);
1423 1340
1424 1341 /*
1425 1342 * We didn't count /dev/null mappings, so ignore them here.
1426 1343 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1427 1344 * we have to do this check here while we have seg.)
1428 1345 */
1429 1346 rsize = 0;
1430 1347 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1431 1348 !SEG_IS_PARTIAL_RESV(seg))
1432 1349 rsize = ssize;
1433 1350
1434 1351 retry:
1435 1352 err = SEGOP_UNMAP(seg, raddr, ssize);
1436 1353 if (err == EAGAIN) {
1437 1354 /*
1438 1355 * Memory is currently locked. It must be unlocked
1439 1356 * before this operation can succeed through a retry.
1440 1357 * The possible reasons for locked memory and
1441 1358 * corresponding strategies for unlocking are:
1442 1359 * (1) Normal I/O
1443 1360 * wait for a signal that the I/O operation
1444 1361 * has completed and the memory is unlocked.
1445 1362 * (2) Asynchronous I/O
1446 1363 * The aio subsystem does not unlock pages when
1447 1364 * the I/O is completed. Those pages are unlocked
1448 1365 * when the application calls aiowait/aioerror.
1449 1366 * So, to prevent blocking forever, cv_broadcast()
1450 1367 * is done to wake up aio_cleanup_thread.
1451 1368 * Subsequently, segvn_reclaim will be called, and
1452 1369 * that will do AS_CLRUNMAPWAIT() and wake us up.
1453 1370 * (3) Long term page locking:
1454 1371 * Drivers intending to have pages locked for a
1455 1372 * period considerably longer than for normal I/O
1456 1373 * (essentially forever) may have registered for a
1457 1374 * callback so they may unlock these pages on
1458 1375 * request. This is needed to allow this operation
1459 1376 * to succeed. Each entry on the callback list is
1460 1377 * examined. If the event or address range pertains
1461 1378 * the callback is invoked (unless it already is in
1462 1379 * progress). The a_contents lock must be dropped
1463 1380 * before the callback, so only one callback can
1464 1381 * be done at a time. Go to the top and do more
1465 1382 * until zero is returned. If zero is returned,
1466 1383 * either there were no callbacks for this event
1467 1384 * or they were already in progress.
1468 1385 */
1469 1386 mutex_enter(&as->a_contents);
1470 1387 if (as->a_callbacks &&
1471 1388 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1472 1389 seg->s_base, seg->s_size))) {
1473 1390 AS_LOCK_EXIT(as, &as->a_lock);
1474 1391 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1475 1392 } else if (!AS_ISNOUNMAPWAIT(as)) {
1476 1393 if (AS_ISUNMAPWAIT(as) == 0)
1477 1394 cv_broadcast(&as->a_cv);
1478 1395 AS_SETUNMAPWAIT(as);
1479 1396 AS_LOCK_EXIT(as, &as->a_lock);
1480 1397 while (AS_ISUNMAPWAIT(as))
1481 1398 cv_wait(&as->a_cv, &as->a_contents);
1482 1399 } else {
1483 1400 /*
1484 1401 * We may have raced with
1485 1402 * segvn_reclaim()/segspt_reclaim(). In this
1486 1403 * case clean nounmapwait flag and retry since
1487 1404 * softlockcnt in this segment may be already
1488 1405 * 0. We don't drop as writer lock so our
1489 1406 * number of retries without sleeping should
1490 1407 * be very small. See segvn_reclaim() for
1491 1408 * more comments.
1492 1409 */
1493 1410 AS_CLRNOUNMAPWAIT(as);
1494 1411 mutex_exit(&as->a_contents);
1495 1412 goto retry;
1496 1413 }
1497 1414 mutex_exit(&as->a_contents);
1498 1415 goto top;
1499 1416 } else if (err == IE_RETRY) {
1500 1417 AS_LOCK_EXIT(as, &as->a_lock);
1501 1418 goto top;
1502 1419 } else if (err) {
1503 1420 as_setwatch(as);
1504 1421 AS_LOCK_EXIT(as, &as->a_lock);
1505 1422 return (-1);
1506 1423 }
1507 1424
1508 1425 as->a_size -= ssize;
1509 1426 if (rsize)
1510 1427 as->a_resvsize -= rsize;
1511 1428 raddr += ssize;
1512 1429 }
1513 1430 AS_LOCK_EXIT(as, &as->a_lock);
1514 1431 return (0);
1515 1432 }
1516 1433
1517 1434 static int
1518 1435 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1519 1436 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1520 1437 {
1521 1438 uint_t szc;
1522 1439 uint_t nszc;
1523 1440 int error;
1524 1441 caddr_t a;
1525 1442 caddr_t eaddr;
1526 1443 size_t segsize;
1527 1444 struct seg *seg;
1528 1445 size_t pgsz;
1529 1446 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1530 1447 uint_t save_szcvec;
1531 1448
1532 1449 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1533 1450 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1534 1451 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1535 1452 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1536 1453 if (!do_off) {
1537 1454 vn_a->offset = 0;
1538 1455 }
1539 1456
1540 1457 if (szcvec <= 1) {
1541 1458 seg = seg_alloc(as, addr, size);
1542 1459 if (seg == NULL) {
1543 1460 return (ENOMEM);
1544 1461 }
1545 1462 vn_a->szc = 0;
1546 1463 error = (*crfp)(seg, vn_a);
1547 1464 if (error != 0) {
1548 1465 seg_free(seg);
1549 1466 } else {
1550 1467 as->a_size += size;
1551 1468 as->a_resvsize += size;
1552 1469 }
1553 1470 return (error);
1554 1471 }
1555 1472
1556 1473 eaddr = addr + size;
1557 1474 save_szcvec = szcvec;
1558 1475 szcvec >>= 1;
1559 1476 szc = 0;
1560 1477 nszc = 0;
1561 1478 while (szcvec) {
1562 1479 if ((szcvec & 0x1) == 0) {
1563 1480 nszc++;
1564 1481 szcvec >>= 1;
1565 1482 continue;
1566 1483 }
1567 1484 nszc++;
1568 1485 pgsz = page_get_pagesize(nszc);
1569 1486 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1570 1487 if (a != addr) {
1571 1488 ASSERT(a < eaddr);
1572 1489 segsize = a - addr;
1573 1490 seg = seg_alloc(as, addr, segsize);
1574 1491 if (seg == NULL) {
1575 1492 return (ENOMEM);
1576 1493 }
1577 1494 vn_a->szc = szc;
1578 1495 error = (*crfp)(seg, vn_a);
1579 1496 if (error != 0) {
1580 1497 seg_free(seg);
1581 1498 return (error);
1582 1499 }
1583 1500 as->a_size += segsize;
1584 1501 as->a_resvsize += segsize;
1585 1502 *segcreated = 1;
1586 1503 if (do_off) {
1587 1504 vn_a->offset += segsize;
1588 1505 }
1589 1506 addr = a;
1590 1507 }
1591 1508 szc = nszc;
1592 1509 szcvec >>= 1;
1593 1510 }
1594 1511
1595 1512 ASSERT(addr < eaddr);
1596 1513 szcvec = save_szcvec | 1; /* add 8K pages */
1597 1514 while (szcvec) {
1598 1515 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1599 1516 ASSERT(a >= addr);
1600 1517 if (a != addr) {
1601 1518 segsize = a - addr;
1602 1519 seg = seg_alloc(as, addr, segsize);
1603 1520 if (seg == NULL) {
1604 1521 return (ENOMEM);
1605 1522 }
1606 1523 vn_a->szc = szc;
1607 1524 error = (*crfp)(seg, vn_a);
1608 1525 if (error != 0) {
1609 1526 seg_free(seg);
1610 1527 return (error);
1611 1528 }
1612 1529 as->a_size += segsize;
1613 1530 as->a_resvsize += segsize;
1614 1531 *segcreated = 1;
1615 1532 if (do_off) {
1616 1533 vn_a->offset += segsize;
1617 1534 }
1618 1535 addr = a;
1619 1536 }
1620 1537 szcvec &= ~(1 << szc);
1621 1538 if (szcvec) {
1622 1539 szc = highbit(szcvec) - 1;
1623 1540 pgsz = page_get_pagesize(szc);
1624 1541 }
1625 1542 }
1626 1543 ASSERT(addr == eaddr);
1627 1544
1628 1545 return (0);
1629 1546 }
1630 1547
1631 1548 static int
1632 1549 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1633 1550 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1634 1551 {
1635 1552 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1636 1553 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1637 1554 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1638 1555 type, 0);
1639 1556 int error;
1640 1557 struct seg *seg;
1641 1558 struct vattr va;
1642 1559 u_offset_t eoff;
1643 1560 size_t save_size = 0;
1644 1561 extern size_t textrepl_size_thresh;
1645 1562
1646 1563 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1647 1564 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648 1565 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649 1566 ASSERT(vn_a->vp != NULL);
1650 1567 ASSERT(vn_a->amp == NULL);
1651 1568
1652 1569 again:
1653 1570 if (szcvec <= 1) {
1654 1571 seg = seg_alloc(as, addr, size);
1655 1572 if (seg == NULL) {
1656 1573 return (ENOMEM);
1657 1574 }
1658 1575 vn_a->szc = 0;
1659 1576 error = (*crfp)(seg, vn_a);
1660 1577 if (error != 0) {
1661 1578 seg_free(seg);
1662 1579 } else {
1663 1580 as->a_size += size;
1664 1581 as->a_resvsize += size;
1665 1582 }
1666 1583 return (error);
1667 1584 }
1668 1585
1669 1586 va.va_mask = AT_SIZE;
1670 1587 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1671 1588 szcvec = 0;
1672 1589 goto again;
1673 1590 }
1674 1591 eoff = vn_a->offset & PAGEMASK;
1675 1592 if (eoff >= va.va_size) {
1676 1593 szcvec = 0;
1677 1594 goto again;
1678 1595 }
1679 1596 eoff += size;
1680 1597 if (btopr(va.va_size) < btopr(eoff)) {
1681 1598 save_size = size;
1682 1599 size = va.va_size - (vn_a->offset & PAGEMASK);
1683 1600 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1684 1601 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1685 1602 type, 0);
1686 1603 if (szcvec <= 1) {
1687 1604 size = save_size;
1688 1605 goto again;
1689 1606 }
1690 1607 }
1691 1608
1692 1609 if (size > textrepl_size_thresh) {
1693 1610 vn_a->flags |= _MAP_TEXTREPL;
1694 1611 }
1695 1612 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1696 1613 segcreated);
1697 1614 if (error != 0) {
1698 1615 return (error);
1699 1616 }
1700 1617 if (save_size) {
1701 1618 addr += size;
1702 1619 size = save_size - size;
1703 1620 szcvec = 0;
1704 1621 goto again;
1705 1622 }
1706 1623 return (0);
1707 1624 }
1708 1625
1709 1626 /*
1710 1627 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1711 1628  * passed to map_pgszcvec() cannot be MAP_INITDATA, for anon.
1712 1629 */
1713 1630 static int
1714 1631 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1715 1632 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1716 1633 {
1717 1634 uint_t szcvec;
1718 1635 uchar_t type;
1719 1636
1720 1637 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1721 1638 if (vn_a->type == MAP_SHARED) {
1722 1639 type = MAPPGSZC_SHM;
1723 1640 } else if (vn_a->type == MAP_PRIVATE) {
1724 1641 if (vn_a->szc == AS_MAP_HEAP) {
1725 1642 type = MAPPGSZC_HEAP;
1726 1643 } else if (vn_a->szc == AS_MAP_STACK) {
1727 1644 type = MAPPGSZC_STACK;
1728 1645 } else {
1729 1646 type = MAPPGSZC_PRIVM;
1730 1647 }
1731 1648 }
1732 1649 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1733 1650 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1734 1651 (vn_a->flags & MAP_TEXT), type, 0);
1735 1652 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1736 1653 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1737 1654 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1738 1655 ASSERT(vn_a->vp == NULL);
1739 1656
1740 1657 return (as_map_segvn_segs(as, addr, size, szcvec,
1741 1658 crfp, vn_a, segcreated));
1742 1659 }
1743 1660
1744 1661 int
1745 1662 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1746 1663 {
1747 1664 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1748 1665 return (as_map_locked(as, addr, size, crfp, argsp));
1749 1666 }
1750 1667
1751 1668 int
1752 1669 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1753 1670 void *argsp)
1754 1671 {
1755 1672 struct seg *seg = NULL;
1756 1673 caddr_t raddr; /* rounded down addr */
1757 1674 size_t rsize; /* rounded up size */
1758 1675 int error;
1759 1676 int unmap = 0;
1760 1677 struct proc *p = curproc;
1761 1678 struct segvn_crargs crargs;
1762 1679
1763 1680 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1764 1681 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1765 1682 (size_t)raddr;
1766 1683
1767 1684 /*
1768 1685  * check for wraparound
1769 1686 */
1770 1687 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1771 1688 AS_LOCK_EXIT(as, &as->a_lock);
1772 1689 return (ENOMEM);
1773 1690 }
1774 1691
1775 1692 as->a_updatedir = 1; /* inform /proc */
1776 1693 gethrestime(&as->a_updatetime);
1777 1694
1778 1695 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1779 1696 AS_LOCK_EXIT(as, &as->a_lock);
1780 1697
1781 1698 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1782 1699 RCA_UNSAFE_ALL);
1783 1700
1784 1701 return (ENOMEM);
1785 1702 }
1786 1703
1787 1704 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1788 1705 crargs = *(struct segvn_crargs *)argsp;
1789 1706 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1790 1707 if (error != 0) {
1791 1708 AS_LOCK_EXIT(as, &as->a_lock);
1792 1709 if (unmap) {
1793 1710 (void) as_unmap(as, addr, size);
1794 1711 }
1795 1712 return (error);
1796 1713 }
1797 1714 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1798 1715 crargs = *(struct segvn_crargs *)argsp;
1799 1716 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1800 1717 if (error != 0) {
1801 1718 AS_LOCK_EXIT(as, &as->a_lock);
1802 1719 if (unmap) {
1803 1720 (void) as_unmap(as, addr, size);
1804 1721 }
1805 1722 return (error);
1806 1723 }
1807 1724 } else {
1808 1725 seg = seg_alloc(as, addr, size);
1809 1726 if (seg == NULL) {
1810 1727 AS_LOCK_EXIT(as, &as->a_lock);
1811 1728 return (ENOMEM);
1812 1729 }
1813 1730
1814 1731 error = (*crfp)(seg, argsp);
1815 1732 if (error != 0) {
1816 1733 seg_free(seg);
1817 1734 AS_LOCK_EXIT(as, &as->a_lock);
1818 1735 return (error);
1819 1736 }
1820 1737 /*
1821 1738 * Add size now so as_unmap will work if as_ctl fails.
1822 1739 */
1823 1740 as->a_size += rsize;
1824 1741 as->a_resvsize += rsize;
1825 1742 }
1826 1743
1827 1744 as_setwatch(as);
1828 1745
1829 1746 /*
1830 1747 * If the address space is locked,
1831 1748 * establish memory locks for the new segment.
1832 1749 */
1833 1750 mutex_enter(&as->a_contents);
1834 1751 if (AS_ISPGLCK(as)) {
1835 1752 mutex_exit(&as->a_contents);
1836 1753 AS_LOCK_EXIT(as, &as->a_lock);
1837 1754 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1838 1755 if (error != 0)
1839 1756 (void) as_unmap(as, addr, size);
1840 1757 } else {
1841 1758 mutex_exit(&as->a_contents);
1842 1759 AS_LOCK_EXIT(as, &as->a_lock);
1843 1760 }
1844 1761 return (error);
1845 1762 }
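
Reviewer note: for orientation, a typical as_map() caller hands in a segment-create function and its argument block; the anonymous zero-fill mappings elsewhere in the tree reduce to roughly the line below. Sketch only: zfod_argsp is the stock segvn create-args template declared in <vm/seg_vn.h>, and p, addr and len stand for whatever the surrounding caller has.

	error = as_map(p->p_as, addr, len, segvn_create, zfod_argsp);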
1846 1763
1847 1764
1848 1765 /*
1849 1766 * Delete all segments in the address space marked with S_PURGE.
1850 1767 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1851 1768 * These segments are deleted as a first step before calls to as_gap(), so
1852 1769 * that they don't affect mmap() or shmat().
1853 1770 */
1854 1771 void
1855 1772 as_purge(struct as *as)
1856 1773 {
1857 1774 struct seg *seg;
1858 1775 struct seg *next_seg;
1859 1776
1860 1777 /*
1861 1778  * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1862 1779  * there is no need to grab the a_contents mutex for this check
1863 1780 */
1864 1781 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1865 1782 return;
1866 1783
1867 1784 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1868 1785 next_seg = NULL;
1869 1786 seg = AS_SEGFIRST(as);
1870 1787 while (seg != NULL) {
1871 1788 next_seg = AS_SEGNEXT(as, seg);
1872 1789 if (seg->s_flags & S_PURGE)
1873 1790 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1874 1791 seg = next_seg;
1875 1792 }
1876 1793 AS_LOCK_EXIT(as, &as->a_lock);
1877 1794
1878 1795 mutex_enter(&as->a_contents);
1879 1796 as->a_flags &= ~AS_NEEDSPURGE;
1880 1797 mutex_exit(&as->a_contents);
1881 1798 }
1882 1799
1883 1800 /*
1884 1801 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1885 1802 * range of addresses at least "minlen" long, where the base of the range is
1886 1803 * at "off" phase from an "align" boundary and there is space for a
1887 1804  * "redzone"-sized redzone on either side of the range. Thus,
1888 1805 * if align was 4M and off was 16k, the user wants a hole which will start
1889 1806 * 16k into a 4M page.
1890 1807 *
1891 1808 * If flags specifies AH_HI, the hole will have the highest possible address
1892 1809 * in the range. We use the as->a_lastgap field to figure out where to
1893 1810 * start looking for a gap.
1894 1811 *
1895 1812 * Otherwise, the gap will have the lowest possible address.
1896 1813 *
1897 1814 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1898 1815 *
1899 1816 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1900 1817 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1901 1818 *
1902 1819 * NOTE: This routine is not correct when base+len overflows caddr_t.
1903 1820 */
1904 1821 int
1905 1822 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1906 1823 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1907 1824 {
1908 1825 caddr_t lobound = *basep;
1909 1826 caddr_t hibound = lobound + *lenp;
1910 1827 struct seg *lseg, *hseg;
1911 1828 caddr_t lo, hi;
1912 1829 int forward;
1913 1830 caddr_t save_base;
1914 1831 size_t save_len;
1915 1832 size_t save_minlen;
1916 1833 size_t save_redzone;
1917 1834 int fast_path = 1;
1918 1835
1919 1836 save_base = *basep;
1920 1837 save_len = *lenp;
1921 1838 save_minlen = minlen;
1922 1839 save_redzone = redzone;
1923 1840
1924 1841 /*
1925 1842 * For the first pass/fast_path, just add align and redzone into
1926 1843 * minlen since if we get an allocation, we can guarantee that it
1927 1844 * will fit the alignment and redzone requested.
1928 1845 * This increases the chance that hibound will be adjusted to
1929 1846 * a_lastgap->s_base which will likely allow us to find an
1930 1847  * acceptable hole in the address space more quickly.
1931 1848 * If we can't find a hole with this fast_path, then we look for
1932 1849 * smaller holes in which the alignment and offset may allow
1933 1850 * the allocation to fit.
1934 1851 */
1935 1852 minlen += align;
1936 1853 minlen += 2 * redzone;
1937 1854 redzone = 0;
1938 1855
1939 1856 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1940 1857 if (AS_SEGFIRST(as) == NULL) {
1941 1858 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1942 1859 align, redzone, off)) {
1943 1860 AS_LOCK_EXIT(as, &as->a_lock);
1944 1861 return (0);
1945 1862 } else {
1946 1863 AS_LOCK_EXIT(as, &as->a_lock);
1947 1864 *basep = save_base;
1948 1865 *lenp = save_len;
1949 1866 return (-1);
1950 1867 }
1951 1868 }
1952 1869
1953 1870 retry:
1954 1871 /*
1955 1872 * Set up to iterate over all the inter-segment holes in the given
1956 1873 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1957 1874 * NULL for the highest-addressed hole. If moving backwards, we reset
1958 1875  * hseg to denote the highest-addressed segment.
1959 1876 */
1960 1877 forward = (flags & AH_DIR) == AH_LO;
1961 1878 if (forward) {
1962 1879 hseg = as_findseg(as, lobound, 1);
1963 1880 lseg = AS_SEGPREV(as, hseg);
1964 1881 } else {
1965 1882
1966 1883 /*
1967 1884 * If allocating at least as much as the last allocation,
1968 1885 * use a_lastgap's base as a better estimate of hibound.
1969 1886 */
1970 1887 if (as->a_lastgap &&
1971 1888 minlen >= as->a_lastgap->s_size &&
1972 1889 hibound >= as->a_lastgap->s_base)
1973 1890 hibound = as->a_lastgap->s_base;
1974 1891
1975 1892 hseg = as_findseg(as, hibound, 1);
1976 1893 if (hseg->s_base + hseg->s_size < hibound) {
1977 1894 lseg = hseg;
1978 1895 hseg = NULL;
1979 1896 } else {
1980 1897 lseg = AS_SEGPREV(as, hseg);
1981 1898 }
1982 1899 }
1983 1900
1984 1901 for (;;) {
1985 1902 /*
1986 1903 * Set lo and hi to the hole's boundaries. (We should really
1987 1904 * use MAXADDR in place of hibound in the expression below,
1988 1905 * but can't express it easily; using hibound in its place is
1989 1906 * harmless.)
1990 1907 */
1991 1908 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1992 1909 hi = (hseg == NULL) ? hibound : hseg->s_base;
1993 1910 /*
1994 1911 * If the iteration has moved past the interval from lobound
1995 1912 * to hibound it's pointless to continue.
1996 1913 */
1997 1914 if ((forward && lo > hibound) || (!forward && hi < lobound))
1998 1915 break;
1999 1916 else if (lo > hibound || hi < lobound)
2000 1917 goto cont;
2001 1918 /*
2002 1919 * Candidate hole lies at least partially within the allowable
2003 1920 * range. Restrict it to fall completely within that range,
2004 1921 * i.e., to [max(lo, lobound), min(hi, hibound)].
2005 1922 */
2006 1923 if (lo < lobound)
2007 1924 lo = lobound;
2008 1925 if (hi > hibound)
2009 1926 hi = hibound;
2010 1927 /*
2011 1928 * Verify that the candidate hole is big enough and meets
2012 1929 * hardware constraints. If the hole is too small, no need
2013 1930 * to do the further checks since they will fail.
2014 1931 */
2015 1932 *basep = lo;
2016 1933 *lenp = hi - lo;
2017 1934 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
2018 1935 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
2019 1936 ((flags & AH_CONTAIN) == 0 ||
2020 1937 (*basep <= addr && *basep + *lenp > addr))) {
2021 1938 if (!forward)
2022 1939 as->a_lastgap = hseg;
2023 1940 if (hseg != NULL)
2024 1941 as->a_lastgaphl = hseg;
2025 1942 else
2026 1943 as->a_lastgaphl = lseg;
2027 1944 AS_LOCK_EXIT(as, &as->a_lock);
2028 1945 return (0);
2029 1946 }
2030 1947 cont:
2031 1948 /*
2032 1949 * Move to the next hole.
2033 1950 */
2034 1951 if (forward) {
2035 1952 lseg = hseg;
2036 1953 if (lseg == NULL)
2037 1954 break;
2038 1955 hseg = AS_SEGNEXT(as, hseg);
2039 1956 } else {
2040 1957 hseg = lseg;
2041 1958 if (hseg == NULL)
2042 1959 break;
2043 1960 lseg = AS_SEGPREV(as, lseg);
2044 1961 }
2045 1962 }
2046 1963 if (fast_path && (align != 0 || save_redzone != 0)) {
2047 1964 fast_path = 0;
2048 1965 minlen = save_minlen;
2049 1966 redzone = save_redzone;
2050 1967 goto retry;
2051 1968 }
2052 1969 *basep = save_base;
2053 1970 *lenp = save_len;
2054 1971 AS_LOCK_EXIT(as, &as->a_lock);
2055 1972 return (-1);
2056 1973 }
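
A worked example of the fast path, with illustrative numbers: for minlen = 1M, align = 4M and redzone = 8K, the first pass searches for a plain hole of 1M + 4M + 2 * 8K = 5M + 16K; any hole that large necessarily contains a 1M range at the requested phase with a redzone on each side. Only if that fails (and align or the saved redzone was nonzero) does the retry pass restore the original minlen and redzone and test each smaller hole against the alignment constraints directly via valid_va_range_aligned().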
2057 1974
2058 1975 /*
2059 1976 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2060 1977 *
2061 1978 * If flags specifies AH_HI, the hole will have the highest possible address
2062 1979 * in the range. We use the as->a_lastgap field to figure out where to
2063 1980 * start looking for a gap.
2064 1981 *
2065 1982 * Otherwise, the gap will have the lowest possible address.
2066 1983 *
2067 1984 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2068 1985 *
2069 1986 * If an adequate hole is found, base and len are set to reflect the part of
2070 1987 * the hole that is within range, and 0 is returned, otherwise,
2071 1988 * -1 is returned.
2072 1989 *
2073 1990 * NOTE: This routine is not correct when base+len overflows caddr_t.
2074 1991 */
2075 1992 int
2076 1993 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2077 1994 caddr_t addr)
2078 1995 {
2079 1996
2080 1997 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2081 1998 }
2082 1999
2083 2000 /*
2084 2001 * Return the next range within [base, base + len) that is backed
2085 2002 * with "real memory". Skip holes and non-seg_vn segments.
2086 2003 * We're lazy and only return one segment at a time.
2087 2004 */
2088 2005 int
2089 2006 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2090 2007 {
2091 2008 extern struct seg_ops segspt_shmops; /* needs a header file */
2092 2009 struct seg *seg;
2093 2010 caddr_t addr, eaddr;
2094 2011 caddr_t segend;
2095 2012
2096 2013 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2097 2014
2098 2015 addr = *basep;
2099 2016 eaddr = addr + *lenp;
2100 2017
2101 2018 seg = as_findseg(as, addr, 0);
2102 2019 if (seg != NULL)
2103 2020 addr = MAX(seg->s_base, addr);
2104 2021
2105 2022 for (;;) {
2106 2023 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2107 2024 AS_LOCK_EXIT(as, &as->a_lock);
2108 2025 return (EINVAL);
2109 2026 }
2110 2027
2111 2028 if (seg->s_ops == &segvn_ops) {
2112 2029 segend = seg->s_base + seg->s_size;
2113 2030 break;
2114 2031 }
2115 2032
2116 2033 /*
2117 2034 * We do ISM by looking into the private data
2118 2035 * to determine the real size of the segment.
2119 2036 */
2120 2037 if (seg->s_ops == &segspt_shmops) {
2121 2038 segend = seg->s_base + spt_realsize(seg);
2122 2039 if (addr < segend)
2123 2040 break;
2124 2041 }
2125 2042
2126 2043 seg = AS_SEGNEXT(as, seg);
2127 2044
2128 2045 if (seg != NULL)
2129 2046 addr = seg->s_base;
2130 2047 }
2131 2048
2132 2049 *basep = addr;
2133 2050
2134 2051 if (segend > eaddr)
2135 2052 *lenp = eaddr - addr;
2136 2053 else
2137 2054 *lenp = segend - addr;
2138 2055
2139 2056 AS_LOCK_EXIT(as, &as->a_lock);
2140 2057 return (0);
2141 2058 }
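
Reviewer note: since as_memory() only returns one backed range per call, a caller enumerates a window by looping. A sketch, assuming as, start and winlen from the surrounding context (error handling elided):

	caddr_t base = start;
	size_t len = winlen;

	while (len != 0) {
		caddr_t chunk = base;
		size_t clen = len;

		if (as_memory(as, &chunk, &clen) != 0)
			break;			/* no further backed ranges */
		/* [chunk, chunk + clen) is backed by real memory */
		base = chunk + clen;
		len = (start + winlen) - base;
	}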
2142 2059
2143 2060 /*
2144 2061 * Swap the pages associated with the address space as out to
2145 2062 * secondary storage, returning the number of bytes actually
2146 2063 * swapped.
2147 2064 *
2148 2065 * The value returned is intended to correlate well with the process's
2149 2066 * memory requirements. Its usefulness for this purpose depends on
2150 2067 * how well the segment-level routines do at returning accurate
2151 2068 * information.
2152 2069 */
2153 2070 size_t
2154 2071 as_swapout(struct as *as)
2155 2072 {
2156 2073 struct seg *seg;
2157 2074 size_t swpcnt = 0;
2158 2075
2159 2076 /*
2160 2077 * Kernel-only processes have given up their address
2161 2078 * spaces. Of course, we shouldn't be attempting to
2162 2079 * swap out such processes in the first place...
2163 2080 */
2164 2081 if (as == NULL)
2165 2082 return (0);
2166 2083
2167 2084 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2168 2085
2169 - /* Prevent XHATs from attaching */
2170 - mutex_enter(&as->a_contents);
2171 - AS_SETBUSY(as);
2172 - mutex_exit(&as->a_contents);
2173 -
2174 -
2175 2086 /*
2176 2087 * Free all mapping resources associated with the address
2177 2088 * space. The segment-level swapout routines capitalize
2178 2089  * on this unmapping by scavenging pages that have become
2179 2090 * unmapped here.
2180 2091 */
2181 2092 hat_swapout(as->a_hat);
2182 - if (as->a_xhat != NULL)
2183 - xhat_swapout_all(as);
2184 -
2185 - mutex_enter(&as->a_contents);
2186 - AS_CLRBUSY(as);
2187 - mutex_exit(&as->a_contents);
2188 2093
2189 2094 /*
2190 2095 * Call the swapout routines of all segments in the address
2191 2096 * space to do the actual work, accumulating the amount of
2192 2097 * space reclaimed.
2193 2098 */
2194 2099 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2195 2100 struct seg_ops *ov = seg->s_ops;
2196 2101
2197 2102 /*
2198 2103 * We have to check to see if the seg has
2199 2104 * an ops vector because the seg may have
2200 2105 * been in the middle of being set up when
2201 2106 * the process was picked for swapout.
2202 2107 */
2203 2108 if ((ov != NULL) && (ov->swapout != NULL))
2204 2109 swpcnt += SEGOP_SWAPOUT(seg);
2205 2110 }
2206 2111 AS_LOCK_EXIT(as, &as->a_lock);
2207 2112 return (swpcnt);
2208 2113 }
2209 2114
2210 2115 /*
2211 2116 * Determine whether data from the mappings in interval [addr, addr + size)
2212 2117 * are in the primary memory (core) cache.
2213 2118 */
2214 2119 int
2215 2120 as_incore(struct as *as, caddr_t addr,
2216 2121 size_t size, char *vec, size_t *sizep)
2217 2122 {
2218 2123 struct seg *seg;
2219 2124 size_t ssize;
2220 2125 caddr_t raddr; /* rounded down addr */
2221 2126 size_t rsize; /* rounded up size */
2222 2127 size_t isize; /* iteration size */
2223 2128 int error = 0; /* result, assume success */
2224 2129
2225 2130 *sizep = 0;
2226 2131 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2227 2132 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2228 2133 (size_t)raddr;
2229 2134
2230 2135 if (raddr + rsize < raddr) /* check for wraparound */
2231 2136 return (ENOMEM);
2232 2137
2233 2138 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2234 2139 seg = as_segat(as, raddr);
2235 2140 if (seg == NULL) {
2236 2141 AS_LOCK_EXIT(as, &as->a_lock);
2237 2142 return (-1);
2238 2143 }
2239 2144
2240 2145 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2241 2146 if (raddr >= seg->s_base + seg->s_size) {
2242 2147 seg = AS_SEGNEXT(as, seg);
2243 2148 if (seg == NULL || raddr != seg->s_base) {
2244 2149 error = -1;
2245 2150 break;
2246 2151 }
2247 2152 }
2248 2153 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2249 2154 ssize = seg->s_base + seg->s_size - raddr;
2250 2155 else
2251 2156 ssize = rsize;
2252 2157 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2253 2158 if (isize != ssize) {
2254 2159 error = -1;
2255 2160 break;
2256 2161 }
2257 2162 vec += btopr(ssize);
2258 2163 }
2259 2164 AS_LOCK_EXIT(as, &as->a_lock);
2260 2165 return (error);
2261 2166 }
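
Reviewer note: a mincore(2)-style caller of as_incore() looks roughly like the sketch below (assumed shapes; the real syscall plumbing differs). vec gets one byte per page, and *sizep reports how many bytes of the range were actually examined.

	char *vec = kmem_zalloc(btopr(size), KM_SLEEP);
	size_t examined;

	if (as_incore(as, addr, size, vec, &examined) == 0) {
		/* vec[i] describes page i of the range; whole range examined */
	}
	kmem_free(vec, btopr(size));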
2262 2167
2263 2168 static void
2264 2169 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2265 2170 ulong_t *bitmap, size_t position, size_t npages)
2266 2171 {
2267 2172 caddr_t range_start;
2268 2173 size_t pos1 = position;
2269 2174 size_t pos2;
2270 2175 size_t size;
2271 2176 size_t end_pos = npages + position;
2272 2177
2273 2178 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2274 2179 size = ptob((pos2 - pos1));
2275 2180 range_start = (caddr_t)((uintptr_t)addr +
2276 2181 ptob(pos1 - position));
2277 2182
2278 2183 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2279 2184 (ulong_t *)NULL, (size_t)NULL);
2280 2185 pos1 = pos2;
2281 2186 }
2282 2187 }
2283 2188
2284 2189 static void
2285 2190 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2286 2191 caddr_t raddr, size_t rsize)
2287 2192 {
2288 2193 struct seg *seg = as_segat(as, raddr);
2289 2194 size_t ssize;
2290 2195
2291 2196 while (rsize != 0) {
2292 2197 if (raddr >= seg->s_base + seg->s_size)
2293 2198 seg = AS_SEGNEXT(as, seg);
2294 2199
2295 2200 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2296 2201 ssize = seg->s_base + seg->s_size - raddr;
2297 2202 else
2298 2203 ssize = rsize;
2299 2204
2300 2205 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2301 2206
2302 2207 rsize -= ssize;
2303 2208 raddr += ssize;
2304 2209 }
2305 2210 }
2306 2211
2307 2212 /*
2308 2213 * Cache control operations over the interval [addr, addr + size) in
2309 2214 * address space "as".
2310 2215 */
2311 2216 /*ARGSUSED*/
2312 2217 int
2313 2218 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2314 2219 uintptr_t arg, ulong_t *lock_map, size_t pos)
2315 2220 {
2316 2221 struct seg *seg; /* working segment */
2317 2222 caddr_t raddr; /* rounded down addr */
2318 2223 caddr_t initraddr; /* saved initial rounded down addr */
2319 2224 size_t rsize; /* rounded up size */
2320 2225 size_t initrsize; /* saved initial rounded up size */
2321 2226 size_t ssize; /* size of seg */
2322 2227 int error = 0; /* result */
2323 2228 size_t mlock_size; /* size of bitmap */
2324 2229 ulong_t *mlock_map; /* pointer to bitmap used */
2325 2230 /* to represent the locked */
2326 2231 /* pages. */
2327 2232 retry:
2328 2233 if (error == IE_RETRY)
2329 2234 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2330 2235 else
2331 2236 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2332 2237
2333 2238 /*
2334 2239 * If these are address space lock/unlock operations, loop over
2335 2240 * all segments in the address space, as appropriate.
2336 2241 */
2337 2242 if (func == MC_LOCKAS) {
2338 2243 size_t npages, idx;
2339 2244 size_t rlen = 0; /* rounded as length */
2340 2245
2341 2246 idx = pos;
2342 2247
2343 2248 if (arg & MCL_FUTURE) {
2344 2249 mutex_enter(&as->a_contents);
2345 2250 AS_SETPGLCK(as);
2346 2251 mutex_exit(&as->a_contents);
2347 2252 }
2348 2253 if ((arg & MCL_CURRENT) == 0) {
2349 2254 AS_LOCK_EXIT(as, &as->a_lock);
2350 2255 return (0);
2351 2256 }
2352 2257
2353 2258 seg = AS_SEGFIRST(as);
2354 2259 if (seg == NULL) {
2355 2260 AS_LOCK_EXIT(as, &as->a_lock);
2356 2261 return (0);
2357 2262 }
2358 2263
2359 2264 do {
2360 2265 raddr = (caddr_t)((uintptr_t)seg->s_base &
2361 2266 (uintptr_t)PAGEMASK);
2362 2267 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2363 2268 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2364 2269 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2365 2270
2366 2271 mlock_size = BT_BITOUL(btopr(rlen));
2367 2272 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2368 2273 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2369 2274 AS_LOCK_EXIT(as, &as->a_lock);
2370 2275 return (EAGAIN);
2371 2276 }
2372 2277
2373 2278 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2374 2279 error = SEGOP_LOCKOP(seg, seg->s_base,
2375 2280 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2376 2281 if (error != 0)
2377 2282 break;
2378 2283 pos += seg_pages(seg);
2379 2284 }
2380 2285
2381 2286 if (error) {
2382 2287 for (seg = AS_SEGFIRST(as); seg != NULL;
2383 2288 seg = AS_SEGNEXT(as, seg)) {
2384 2289
2385 2290 raddr = (caddr_t)((uintptr_t)seg->s_base &
2386 2291 (uintptr_t)PAGEMASK);
2387 2292 npages = seg_pages(seg);
2388 2293 as_segunlock(seg, raddr, attr, mlock_map,
2389 2294 idx, npages);
2390 2295 idx += npages;
2391 2296 }
2392 2297 }
2393 2298
2394 2299 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2395 2300 AS_LOCK_EXIT(as, &as->a_lock);
2396 2301 goto lockerr;
2397 2302 } else if (func == MC_UNLOCKAS) {
2398 2303 mutex_enter(&as->a_contents);
2399 2304 AS_CLRPGLCK(as);
2400 2305 mutex_exit(&as->a_contents);
2401 2306
2402 2307 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2403 2308 error = SEGOP_LOCKOP(seg, seg->s_base,
2404 2309 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2405 2310 if (error != 0)
2406 2311 break;
2407 2312 }
2408 2313
2409 2314 AS_LOCK_EXIT(as, &as->a_lock);
2410 2315 goto lockerr;
2411 2316 }
2412 2317
2413 2318 /*
2414 2319 * Normalize addresses and sizes.
2415 2320 */
2416 2321 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2417 2322 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2418 2323 (size_t)raddr;
2419 2324
2420 2325 if (raddr + rsize < raddr) { /* check for wraparound */
2421 2326 AS_LOCK_EXIT(as, &as->a_lock);
2422 2327 return (ENOMEM);
2423 2328 }
2424 2329
2425 2330 /*
2426 2331 * Get initial segment.
2427 2332 */
2428 2333 if ((seg = as_segat(as, raddr)) == NULL) {
2429 2334 AS_LOCK_EXIT(as, &as->a_lock);
2430 2335 return (ENOMEM);
2431 2336 }
2432 2337
2433 2338 if (func == MC_LOCK) {
2434 2339 mlock_size = BT_BITOUL(btopr(rsize));
2435 2340 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2436 2341 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2437 2342 AS_LOCK_EXIT(as, &as->a_lock);
2438 2343 return (EAGAIN);
2439 2344 }
2440 2345 }
2441 2346
2442 2347 /*
2443 2348 * Loop over all segments. If a hole in the address range is
2444 2349 * discovered, then fail. For each segment, perform the appropriate
2445 2350 * control operation.
2446 2351 */
2447 2352 while (rsize != 0) {
2448 2353
2449 2354 /*
2450 2355 * Make sure there's no hole, calculate the portion
2451 2356 * of the next segment to be operated over.
2452 2357 */
2453 2358 if (raddr >= seg->s_base + seg->s_size) {
2454 2359 seg = AS_SEGNEXT(as, seg);
2455 2360 if (seg == NULL || raddr != seg->s_base) {
2456 2361 if (func == MC_LOCK) {
2457 2362 as_unlockerr(as, attr, mlock_map,
2458 2363 initraddr, initrsize - rsize);
2459 2364 kmem_free(mlock_map,
2460 2365 mlock_size * sizeof (ulong_t));
2461 2366 }
2462 2367 AS_LOCK_EXIT(as, &as->a_lock);
2463 2368 return (ENOMEM);
2464 2369 }
2465 2370 }
2466 2371 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2467 2372 ssize = seg->s_base + seg->s_size - raddr;
2468 2373 else
2469 2374 ssize = rsize;
2470 2375
2471 2376 /*
2472 2377 * Dispatch on specific function.
2473 2378 */
2474 2379 switch (func) {
2475 2380
2476 2381 /*
2477 2382 * Synchronize cached data from mappings with backing
2478 2383 * objects.
2479 2384 */
2480 2385 case MC_SYNC:
2481 2386 if (error = SEGOP_SYNC(seg, raddr, ssize,
2482 2387 attr, (uint_t)arg)) {
2483 2388 AS_LOCK_EXIT(as, &as->a_lock);
2484 2389 return (error);
2485 2390 }
2486 2391 break;
2487 2392
2488 2393 /*
2489 2394 * Lock pages in memory.
2490 2395 */
2491 2396 case MC_LOCK:
2492 2397 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2493 2398 attr, func, mlock_map, pos)) {
2494 2399 as_unlockerr(as, attr, mlock_map, initraddr,
2495 2400 initrsize - rsize + ssize);
2496 2401 kmem_free(mlock_map, mlock_size *
2497 2402 sizeof (ulong_t));
2498 2403 AS_LOCK_EXIT(as, &as->a_lock);
2499 2404 goto lockerr;
2500 2405 }
2501 2406 break;
2502 2407
2503 2408 /*
2504 2409 * Unlock mapped pages.
2505 2410 */
2506 2411 case MC_UNLOCK:
2507 2412 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2508 2413 (ulong_t *)NULL, (size_t)NULL);
2509 2414 break;
2510 2415
2511 2416 /*
2512 2417 * Store VM advise for mapped pages in segment layer.
2513 2418 */
2514 2419 case MC_ADVISE:
2515 2420 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2516 2421
2517 2422 /*
2518 2423 * Check for regular errors and special retry error
2519 2424 */
2520 2425 if (error) {
2521 2426 if (error == IE_RETRY) {
2522 2427 /*
2523 2428 * Need to acquire writers lock, so
2524 2429 * have to drop readers lock and start
2525 2430 * all over again
2526 2431 */
2527 2432 AS_LOCK_EXIT(as, &as->a_lock);
2528 2433 goto retry;
2529 2434 } else if (error == IE_REATTACH) {
2530 2435 /*
2531 2436 * Find segment for current address
2532 2437 * because current segment just got
2533 2438 * split or concatenated
2534 2439 */
2535 2440 seg = as_segat(as, raddr);
2536 2441 if (seg == NULL) {
2537 2442 AS_LOCK_EXIT(as, &as->a_lock);
2538 2443 return (ENOMEM);
2539 2444 }
2540 2445 } else {
2541 2446 /*
2542 2447 * Regular error
2543 2448 */
2544 2449 AS_LOCK_EXIT(as, &as->a_lock);
2545 2450 return (error);
2546 2451 }
2547 2452 }
2548 2453 break;
2549 2454
2550 2455 case MC_INHERIT_ZERO:
2551 2456 if (seg->s_ops->inherit == NULL) {
2552 2457 error = ENOTSUP;
2553 2458 } else {
2554 2459 error = SEGOP_INHERIT(seg, raddr, ssize,
2555 2460 SEGP_INH_ZERO);
2556 2461 }
2557 2462 if (error != 0) {
2558 2463 AS_LOCK_EXIT(as, &as->a_lock);
2559 2464 return (error);
2560 2465 }
2561 2466 break;
2562 2467
2563 2468 /*
2564 2469 * Can't happen.
2565 2470 */
2566 2471 default:
2567 2472 panic("as_ctl: bad operation %d", func);
2568 2473 /*NOTREACHED*/
2569 2474 }
2570 2475
2571 2476 rsize -= ssize;
2572 2477 raddr += ssize;
2573 2478 }
2574 2479
2575 2480 if (func == MC_LOCK)
2576 2481 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2577 2482 AS_LOCK_EXIT(as, &as->a_lock);
2578 2483 return (0);
2579 2484 lockerr:
2580 2485
2581 2486 /*
2582 2487 * If the lower levels returned EDEADLK for a segment lockop,
2583 2488 * it means that we should retry the operation. Let's wait
2584 2489 * a bit also to let the deadlock causing condition clear.
2585 2490 * This is part of a gross hack to work around a design flaw
2586 2491 * in the ufs/sds logging code and should go away when the
2587 2492 * logging code is re-designed to fix the problem. See bug
2588 2493 * 4125102 for details of the problem.
2589 2494 */
2590 2495 if (error == EDEADLK) {
2591 2496 delay(deadlk_wait);
2592 2497 error = 0;
2593 2498 goto retry;
2594 2499 }
2595 2500 return (error);
2596 2501 }
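
Reviewer note: the MC_LOCK/MC_LOCKAS bookkeeping above sizes its undo bitmap at one bit per page, packed into ulong_t words. A condensed sketch of the sizing and cleanup contract, taken straight from the code above:

	size_t npages = btopr(rsize);		/* pages covered by the request */
	size_t words = BT_BITOUL(npages);	/* ulong_t words for npages bits */
	ulong_t *map = kmem_zalloc(words * sizeof (ulong_t), KM_NOSLEEP);

	/* each SEGOP_LOCKOP(MC_LOCK) sets the bits for pages it locked; */
	/* on failure, as_unlockerr() walks the bits and unlocks only those */
	kmem_free(map, words * sizeof (ulong_t));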
2597 2502
2598 2503 int
2599 2504 fc_decode(faultcode_t fault_err)
2600 2505 {
2601 2506 int error = 0;
2602 2507
2603 2508 switch (FC_CODE(fault_err)) {
2604 2509 case FC_OBJERR:
2605 2510 error = FC_ERRNO(fault_err);
2606 2511 break;
2607 2512 case FC_PROT:
2608 2513 error = EACCES;
2609 2514 break;
2610 2515 default:
2611 2516 error = EFAULT;
2612 2517 break;
2613 2518 }
2614 2519 return (error);
2615 2520 }
2616 2521
2617 2522 /*
2618 2523 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2619 2524 * lists from each segment and copy them to one contiguous shadow list (plist)
2620 2525 * as expected by the caller. Save pointers to per segment shadow lists at
2621 2526 * the tail of plist so that they can be used during as_pageunlock().
2622 2527 */
2623 2528 static int
2624 2529 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2625 2530 caddr_t addr, size_t size, enum seg_rw rw)
2626 2531 {
2627 2532 caddr_t sv_addr = addr;
2628 2533 size_t sv_size = size;
2629 2534 struct seg *sv_seg = seg;
2630 2535 ulong_t segcnt = 1;
2631 2536 ulong_t cnt;
2632 2537 size_t ssize;
2633 2538 pgcnt_t npages = btop(size);
2634 2539 page_t **plist;
2635 2540 page_t **pl;
2636 2541 int error;
2637 2542 caddr_t eaddr;
2638 2543 faultcode_t fault_err = 0;
2639 2544 pgcnt_t pl_off;
2640 2545 extern struct seg_ops segspt_shmops;
2641 2546
2642 2547 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2643 2548 ASSERT(seg != NULL);
2644 2549 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2645 2550 ASSERT(addr + size > seg->s_base + seg->s_size);
2646 2551 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2647 2552 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2648 2553
2649 2554 /*
2650 2555 * Count the number of segments covered by the range we are about to
2651 2556 * lock. The segment count is used to size the shadow list we return
2652 2557 * back to the caller.
2653 2558 */
2654 2559 for (; size != 0; size -= ssize, addr += ssize) {
2655 2560 if (addr >= seg->s_base + seg->s_size) {
2656 2561
2657 2562 seg = AS_SEGNEXT(as, seg);
2658 2563 if (seg == NULL || addr != seg->s_base) {
2659 2564 AS_LOCK_EXIT(as, &as->a_lock);
2660 2565 return (EFAULT);
2661 2566 }
2662 2567 /*
2663 2568 * Do a quick check if subsequent segments
2664 2569 * will most likely support pagelock.
2665 2570 */
2666 2571 if (seg->s_ops == &segvn_ops) {
2667 2572 vnode_t *vp;
2668 2573
2669 2574 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2670 2575 vp != NULL) {
2671 2576 AS_LOCK_EXIT(as, &as->a_lock);
2672 2577 goto slow;
2673 2578 }
2674 2579 } else if (seg->s_ops != &segspt_shmops) {
2675 2580 AS_LOCK_EXIT(as, &as->a_lock);
2676 2581 goto slow;
2677 2582 }
2678 2583 segcnt++;
2679 2584 }
2680 2585 if (addr + size > seg->s_base + seg->s_size) {
2681 2586 ssize = seg->s_base + seg->s_size - addr;
2682 2587 } else {
2683 2588 ssize = size;
2684 2589 }
2685 2590 }
2686 2591 ASSERT(segcnt > 1);
2687 2592
2688 2593 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2689 2594
2690 2595 addr = sv_addr;
2691 2596 size = sv_size;
2692 2597 seg = sv_seg;
2693 2598
2694 2599 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2695 2600 if (addr >= seg->s_base + seg->s_size) {
2696 2601 seg = AS_SEGNEXT(as, seg);
2697 2602 ASSERT(seg != NULL && addr == seg->s_base);
2698 2603 cnt++;
2699 2604 ASSERT(cnt < segcnt);
2700 2605 }
2701 2606 if (addr + size > seg->s_base + seg->s_size) {
2702 2607 ssize = seg->s_base + seg->s_size - addr;
2703 2608 } else {
2704 2609 ssize = size;
2705 2610 }
2706 2611 pl = &plist[npages + cnt];
2707 2612 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2708 2613 L_PAGELOCK, rw);
2709 2614 if (error) {
2710 2615 break;
2711 2616 }
2712 2617 ASSERT(plist[npages + cnt] != NULL);
2713 2618 ASSERT(pl_off + btop(ssize) <= npages);
2714 2619 bcopy(plist[npages + cnt], &plist[pl_off],
2715 2620 btop(ssize) * sizeof (page_t *));
2716 2621 pl_off += btop(ssize);
2717 2622 }
2718 2623
2719 2624 if (size == 0) {
2720 2625 AS_LOCK_EXIT(as, &as->a_lock);
2721 2626 ASSERT(cnt == segcnt - 1);
2722 2627 *ppp = plist;
2723 2628 return (0);
2724 2629 }
2725 2630
2726 2631 /*
2727 2632  * One of the pagelock calls failed; the error type is in the error
2728 2633  * variable. Unlock what we've locked so far and retry with F_SOFTLOCK
2729 2634  * if the error type is either EFAULT or ENOTSUP. Otherwise just return
2730 2635  * the error back to the caller.
2731 2636 */
2732 2637
2733 2638 eaddr = addr;
2734 2639 seg = sv_seg;
2735 2640
2736 2641 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2737 2642 if (addr >= seg->s_base + seg->s_size) {
2738 2643 seg = AS_SEGNEXT(as, seg);
2739 2644 ASSERT(seg != NULL && addr == seg->s_base);
2740 2645 cnt++;
2741 2646 ASSERT(cnt < segcnt);
2742 2647 }
2743 2648 if (eaddr > seg->s_base + seg->s_size) {
2744 2649 ssize = seg->s_base + seg->s_size - addr;
2745 2650 } else {
2746 2651 ssize = eaddr - addr;
2747 2652 }
2748 2653 pl = &plist[npages + cnt];
2749 2654 ASSERT(*pl != NULL);
2750 2655 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2751 2656 L_PAGEUNLOCK, rw);
2752 2657 }
2753 2658
2754 2659 AS_LOCK_EXIT(as, &as->a_lock);
2755 2660
2756 2661 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2757 2662
2758 2663 if (error != ENOTSUP && error != EFAULT) {
2759 2664 return (error);
2760 2665 }
2761 2666
2762 2667 slow:
2763 2668 /*
2764 2669  * If we are here because pagelock failed due to the need to cow-fault
2765 2670  * in the pages we want to lock, F_SOFTLOCK will do this job, and in the
2766 2671  * next as_pagelock() call for this address range pagelock will
2767 2672  * hopefully succeed.
2768 2673 */
2769 2674 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2770 2675 if (fault_err != 0) {
2771 2676 return (fc_decode(fault_err));
2772 2677 }
2773 2678 *ppp = NULL;
2774 2679
2775 2680 return (0);
2776 2681 }
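
Reviewer note: the shadow list built by as_pagelock_segs() has a fixed shape worth spelling out. A sketch of the layout (names illustrative):

	/* npages flat entries for the caller, then one slot per segment */
	page_t **plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *),
	    KM_SLEEP);
	page_t **flat = &plist[0];		/* copied-together pages, in order */
	page_t **perseg = &plist[npages];	/* perseg[i]: segment i's own */
						/* list, replayed at */
						/* L_PAGEUNLOCK time */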
2777 2682
2778 2683 /*
2779 2684 * lock pages in a given address space. Return shadow list. If
2780 2685 * the list is NULL, the MMU mapping is also locked.
2781 2686 */
2782 2687 int
2783 2688 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2784 2689 size_t size, enum seg_rw rw)
2785 2690 {
2786 2691 size_t rsize;
2787 2692 caddr_t raddr;
2788 2693 faultcode_t fault_err;
2789 2694 struct seg *seg;
2790 2695 int err;
2791 2696
2792 2697 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2793 2698 "as_pagelock_start: addr %p size %ld", addr, size);
2794 2699
2795 2700 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2796 2701 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2797 2702 (size_t)raddr;
2798 2703
2799 2704 /*
2800 2705  * if the request crosses two or more segments,
2801 2706  * let as_pagelock_segs() handle it.
2802 2707 */
2803 2708 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2804 2709
2805 2710 seg = as_segat(as, raddr);
2806 2711 if (seg == NULL) {
2807 2712 AS_LOCK_EXIT(as, &as->a_lock);
2808 2713 return (EFAULT);
2809 2714 }
2810 2715 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2811 2716 if (raddr + rsize > seg->s_base + seg->s_size) {
2812 2717 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2813 2718 }
2814 2719 if (raddr + rsize <= raddr) {
2815 2720 AS_LOCK_EXIT(as, &as->a_lock);
2816 2721 return (EFAULT);
2817 2722 }
2818 2723
2819 2724 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2820 2725 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2821 2726
2822 2727 /*
2823 2728 * try to lock pages and pass back shadow list
2824 2729 */
2825 2730 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2826 2731
2827 2732 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2828 2733
2829 2734 AS_LOCK_EXIT(as, &as->a_lock);
2830 2735
2831 2736 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2832 2737 return (err);
2833 2738 }
2834 2739
2835 2740 /*
2836 2741  * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2837 2742  * to no pagelock support for this segment or because pages need to be
2838 2743  * cow-faulted in. If a fault is needed, F_SOFTLOCK will do this job for
2839 2744  * this as_pagelock() call, and in the next as_pagelock() call for the
2840 2745  * same address range the pagelock call will hopefully succeed.
2841 2746 */
2842 2747 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2843 2748 if (fault_err != 0) {
2844 2749 return (fc_decode(fault_err));
2845 2750 }
2846 2751 *ppp = NULL;
2847 2752
2848 2753 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2849 2754 return (0);
2850 2755 }
2851 2756
2852 2757 /*
2853 2758 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2854 2759 * lists from the end of plist and call pageunlock interface for each segment.
2855 2760 * Drop as lock and free plist.
2856 2761 */
2857 2762 static void
2858 2763 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2859 2764 struct page **plist, enum seg_rw rw)
2860 2765 {
2861 2766 ulong_t cnt;
2862 2767 caddr_t eaddr = addr + size;
2863 2768 pgcnt_t npages = btop(size);
2864 2769 size_t ssize;
2865 2770 page_t **pl;
2866 2771
2867 2772 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2868 2773 ASSERT(seg != NULL);
2869 2774 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2870 2775 ASSERT(addr + size > seg->s_base + seg->s_size);
2871 2776 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2872 2777 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2873 2778 ASSERT(plist != NULL);
2874 2779
2875 2780 for (cnt = 0; addr < eaddr; addr += ssize) {
2876 2781 if (addr >= seg->s_base + seg->s_size) {
2877 2782 seg = AS_SEGNEXT(as, seg);
2878 2783 ASSERT(seg != NULL && addr == seg->s_base);
2879 2784 cnt++;
2880 2785 }
2881 2786 if (eaddr > seg->s_base + seg->s_size) {
2882 2787 ssize = seg->s_base + seg->s_size - addr;
2883 2788 } else {
2884 2789 ssize = eaddr - addr;
2885 2790 }
2886 2791 pl = &plist[npages + cnt];
2887 2792 ASSERT(*pl != NULL);
2888 2793 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2889 2794 L_PAGEUNLOCK, rw);
2890 2795 }
2891 2796 ASSERT(cnt > 0);
2892 2797 AS_LOCK_EXIT(as, &as->a_lock);
2893 2798
2894 2799 cnt++;
2895 2800 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2896 2801 }
2897 2802
2898 2803 /*
2899 2804 * unlock pages in a given address range
2900 2805 */
2901 2806 void
2902 2807 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2903 2808 enum seg_rw rw)
2904 2809 {
2905 2810 struct seg *seg;
2906 2811 size_t rsize;
2907 2812 caddr_t raddr;
2908 2813
2909 2814 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2910 2815 "as_pageunlock_start: addr %p size %ld", addr, size);
2911 2816
2912 2817 /*
2913 2818  * if the shadow list is NULL, as_pagelock()
2914 2819  * fell back to as_fault
2915 2820 */
2916 2821 if (pp == NULL) {
2917 2822 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2918 2823 return;
2919 2824 }
2920 2825
2921 2826 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2922 2827 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2923 2828 (size_t)raddr;
2924 2829
2925 2830 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2926 2831 seg = as_segat(as, raddr);
2927 2832 ASSERT(seg != NULL);
2928 2833
2929 2834 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2930 2835 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2931 2836
2932 2837 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2933 2838 if (raddr + rsize <= seg->s_base + seg->s_size) {
2934 2839 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2935 2840 } else {
2936 2841 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2937 2842 return;
2938 2843 }
2939 2844 AS_LOCK_EXIT(as, &as->a_lock);
2940 2845 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2941 2846 }
2942 2847
2943 2848 int
2944 2849 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2945 2850 boolean_t wait)
2946 2851 {
2947 2852 struct seg *seg;
2948 2853 size_t ssize;
2949 2854 caddr_t raddr; /* rounded down addr */
2950 2855 size_t rsize; /* rounded up size */
2951 2856 int error = 0;
2952 2857 size_t pgsz = page_get_pagesize(szc);
2953 2858
2954 2859 setpgsz_top:
2955 2860 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2956 2861 return (EINVAL);
2957 2862 }
2958 2863
2959 2864 raddr = addr;
2960 2865 rsize = size;
2961 2866
2962 2867 if (raddr + rsize < raddr) /* check for wraparound */
2963 2868 return (ENOMEM);
2964 2869
2965 2870 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2966 2871 as_clearwatchprot(as, raddr, rsize);
2967 2872 seg = as_segat(as, raddr);
2968 2873 if (seg == NULL) {
2969 2874 as_setwatch(as);
2970 2875 AS_LOCK_EXIT(as, &as->a_lock);
2971 2876 return (ENOMEM);
2972 2877 }
2973 2878
2974 2879 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2975 2880 if (raddr >= seg->s_base + seg->s_size) {
2976 2881 seg = AS_SEGNEXT(as, seg);
2977 2882 if (seg == NULL || raddr != seg->s_base) {
2978 2883 error = ENOMEM;
2979 2884 break;
2980 2885 }
2981 2886 }
2982 2887 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2983 2888 ssize = seg->s_base + seg->s_size - raddr;
2984 2889 } else {
2985 2890 ssize = rsize;
2986 2891 }
2987 2892
2988 2893 retry:
2989 2894 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2990 2895
2991 2896 if (error == IE_NOMEM) {
2992 2897 error = EAGAIN;
2993 2898 break;
2994 2899 }
2995 2900
2996 2901 if (error == IE_RETRY) {
2997 2902 AS_LOCK_EXIT(as, &as->a_lock);
2998 2903 goto setpgsz_top;
2999 2904 }
3000 2905
3001 2906 if (error == ENOTSUP) {
3002 2907 error = EINVAL;
3003 2908 break;
3004 2909 }
3005 2910
3006 2911 if (wait && (error == EAGAIN)) {
3007 2912 /*
3008 2913 * Memory is currently locked. It must be unlocked
3009 2914 * before this operation can succeed through a retry.
3010 2915 * The possible reasons for locked memory and
3011 2916 * corresponding strategies for unlocking are:
3012 2917 * (1) Normal I/O
3013 2918 * wait for a signal that the I/O operation
3014 2919 * has completed and the memory is unlocked.
3015 2920 * (2) Asynchronous I/O
3016 2921 * The aio subsystem does not unlock pages when
3017 2922 * the I/O is completed. Those pages are unlocked
3018 2923 * when the application calls aiowait/aioerror.
3019 2924 * So, to prevent blocking forever, cv_broadcast()
3020 2925 * is done to wake up aio_cleanup_thread.
3021 2926 * Subsequently, segvn_reclaim will be called, and
3022 2927 * that will do AS_CLRUNMAPWAIT() and wake us up.
3023 2928 * (3) Long term page locking:
3024 2929 * This is not relevant for as_setpagesize()
3025 2930 * because we cannot change the page size for
3026 2931 * driver memory. The attempt to do so will
3027 2932 * fail with a different error than EAGAIN so
3028 2933 * there's no need to trigger as callbacks like
3029 2934 * as_unmap, as_setprot or as_free would do.
3030 2935 */
3031 2936 mutex_enter(&as->a_contents);
3032 2937 if (!AS_ISNOUNMAPWAIT(as)) {
3033 2938 if (AS_ISUNMAPWAIT(as) == 0) {
3034 2939 cv_broadcast(&as->a_cv);
3035 2940 }
3036 2941 AS_SETUNMAPWAIT(as);
3037 2942 AS_LOCK_EXIT(as, &as->a_lock);
3038 2943 while (AS_ISUNMAPWAIT(as)) {
3039 2944 cv_wait(&as->a_cv, &as->a_contents);
3040 2945 }
3041 2946 } else {
3042 2947 /*
3043 2948 * We may have raced with
3044 2949 * segvn_reclaim()/segspt_reclaim(). In this
3045 2950  * case clear the nounmapwait flag and retry since
3046 2951  * softlockcnt in this segment may already be
3047 2952  * 0. We don't drop the as writer lock so our
3048 2953 * number of retries without sleeping should
3049 2954 * be very small. See segvn_reclaim() for
3050 2955 * more comments.
3051 2956 */
3052 2957 AS_CLRNOUNMAPWAIT(as);
3053 2958 mutex_exit(&as->a_contents);
3054 2959 goto retry;
3055 2960 }
3056 2961 mutex_exit(&as->a_contents);
3057 2962 goto setpgsz_top;
3058 2963 } else if (error != 0) {
3059 2964 break;
3060 2965 }
3061 2966 }
3062 2967 as_setwatch(as);
3063 2968 AS_LOCK_EXIT(as, &as->a_lock);
3064 2969 return (error);
3065 2970 }
3066 2971
3067 2972 /*
3068 2973 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3069 2974 * in its chunk where s_szc is less than the szc we want to set.
3070 2975 */
3071 2976 static int
3072 2977 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3073 2978 int *retry)
3074 2979 {
3075 2980 struct seg *seg;
3076 2981 size_t ssize;
3077 2982 int error;
3078 2983
3079 2984 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3080 2985
3081 2986 seg = as_segat(as, raddr);
3082 2987 if (seg == NULL) {
3083 2988 panic("as_iset3_default_lpsize: no seg");
3084 2989 }
3085 2990
3086 2991 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3087 2992 if (raddr >= seg->s_base + seg->s_size) {
3088 2993 seg = AS_SEGNEXT(as, seg);
3089 2994 if (seg == NULL || raddr != seg->s_base) {
3090 2995 panic("as_iset3_default_lpsize: as changed");
3091 2996 }
3092 2997 }
3093 2998 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3094 2999 ssize = seg->s_base + seg->s_size - raddr;
3095 3000 } else {
3096 3001 ssize = rsize;
3097 3002 }
3098 3003
3099 3004 if (szc > seg->s_szc) {
3100 3005 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3101 3006 /* Only retry on EINVAL segments that have no vnode. */
3102 3007 if (error == EINVAL) {
3103 3008 vnode_t *vp = NULL;
3104 3009 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3105 3010 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3106 3011 vp == NULL)) {
3107 3012 *retry = 1;
3108 3013 } else {
3109 3014 *retry = 0;
3110 3015 }
3111 3016 }
3112 3017 if (error) {
3113 3018 return (error);
3114 3019 }
3115 3020 }
3116 3021 }
3117 3022 return (0);
3118 3023 }
3119 3024
3120 3025 /*
3121 3026 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3122 3027 * pagesize on each segment in its range, but if any fails with EINVAL,
3123 3028 * then it reduces the pagesizes to the next size in the bitmap and
3124 3029  * retries as_iset3_default_lpsize(). The code retries smaller
3125 3030  * allowed sizes on EINVAL because (a) the anon offset may not
3126 3031 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3127 3032 * with) to pass to map_pgszcvec().
3128 3033 */
3129 3034 static int
3130 3035 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3131 3036 uint_t szcvec)
3132 3037 {
3133 3038 int error;
3134 3039 int retry;
3135 3040
3136 3041 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3137 3042
3138 3043 for (;;) {
3139 3044 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3140 3045 if (error == EINVAL && retry) {
3141 3046 szcvec &= ~(1 << szc);
3142 3047 if (szcvec <= 1) {
3143 3048 return (EINVAL);
3144 3049 }
3145 3050 szc = highbit(szcvec) - 1;
3146 3051 } else {
3147 3052 return (error);
3148 3053 }
3149 3054 }
3150 3055 }
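
A worked example of the retry loop, with hypothetical bits: for szcvec = 0x1a (szc 1, 3 and 4 available) the loop starts at szc = 4; an EINVAL with retry clears bit 4 (szcvec = 0x0a) and retries at szc = 3, then at szc = 1. Once the last large-page bit is cleared, szcvec <= 1 and the EINVAL is final.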
3151 3056
3152 3057 /*
3153 3058 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3154 3059 * segments have a smaller szc than we want to set. For each such area,
3155 3060  * it calls as_iset2_default_lpsize().
3156 3061 */
3157 3062 static int
3158 3063 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3159 3064 uint_t szcvec)
3160 3065 {
3161 3066 struct seg *seg;
3162 3067 size_t ssize;
3163 3068 caddr_t setaddr = raddr;
3164 3069 size_t setsize = 0;
3165 3070 int set;
3166 3071 int error;
3167 3072
3168 3073 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3169 3074
3170 3075 seg = as_segat(as, raddr);
3171 3076 if (seg == NULL) {
3172 3077 panic("as_iset1_default_lpsize: no seg");
3173 3078 }
3174 3079 if (seg->s_szc < szc) {
3175 3080 set = 1;
3176 3081 } else {
3177 3082 set = 0;
3178 3083 }
3179 3084
3180 3085 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3181 3086 if (raddr >= seg->s_base + seg->s_size) {
3182 3087 seg = AS_SEGNEXT(as, seg);
3183 3088 if (seg == NULL || raddr != seg->s_base) {
3184 3089 panic("as_iset1_default_lpsize: as changed");
3185 3090 }
3186 3091 if (seg->s_szc >= szc && set) {
3187 3092 ASSERT(setsize != 0);
3188 3093 error = as_iset2_default_lpsize(as,
3189 3094 setaddr, setsize, szc, szcvec);
3190 3095 if (error) {
3191 3096 return (error);
3192 3097 }
3193 3098 set = 0;
3194 3099 } else if (seg->s_szc < szc && !set) {
3195 3100 setaddr = raddr;
3196 3101 setsize = 0;
3197 3102 set = 1;
3198 3103 }
3199 3104 }
3200 3105 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3201 3106 ssize = seg->s_base + seg->s_size - raddr;
3202 3107 } else {
3203 3108 ssize = rsize;
3204 3109 }
3205 3110 }
3206 3111 error = 0;
3207 3112 if (set) {
3208 3113 ASSERT(setsize != 0);
3209 3114 error = as_iset2_default_lpsize(as, setaddr, setsize,
3210 3115 szc, szcvec);
3211 3116 }
3212 3117 return (error);
3213 3118 }
3214 3119
3215 3120 /*
3216 3121 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3217 3122 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3218 3123 * chunk to as_iset1_default_lpsize().
3219 3124 */
3220 3125 static int
3221 3126 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3222 3127 int type)
3223 3128 {
3224 3129 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3225 3130 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3226 3131 flags, rtype, 1);
3227 3132 uint_t szc;
3228 3133 uint_t nszc;
3229 3134 int error;
3230 3135 caddr_t a;
3231 3136 caddr_t eaddr;
3232 3137 size_t segsize;
3233 3138 size_t pgsz;
3234 3139 uint_t save_szcvec;
3235 3140
3236 3141 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3237 3142 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3238 3143 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3239 3144
3240 3145 szcvec &= ~1;
3241 3146 if (szcvec <= 1) { /* skip if base page size */
3242 3147 return (0);
3243 3148 }
3244 3149
3245 3150 /* Get the pagesize of the first larger page size. */
3246 3151 szc = lowbit(szcvec) - 1;
3247 3152 pgsz = page_get_pagesize(szc);
3248 3153 eaddr = addr + size;
3249 3154 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3250 3155 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3251 3156
3252 3157 save_szcvec = szcvec;
3253 3158 szcvec >>= (szc + 1);
3254 3159 nszc = szc;
3255 3160 while (szcvec) {
3256 3161 if ((szcvec & 0x1) == 0) {
3257 3162 nszc++;
3258 3163 szcvec >>= 1;
3259 3164 continue;
3260 3165 }
3261 3166 nszc++;
3262 3167 pgsz = page_get_pagesize(nszc);
3263 3168 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3264 3169 if (a != addr) {
3265 3170 ASSERT(szc > 0);
3266 3171 ASSERT(a < eaddr);
3267 3172 segsize = a - addr;
3268 3173 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3269 3174 save_szcvec);
3270 3175 if (error) {
3271 3176 return (error);
3272 3177 }
3273 3178 addr = a;
3274 3179 }
3275 3180 szc = nszc;
3276 3181 szcvec >>= 1;
3277 3182 }
3278 3183
3279 3184 ASSERT(addr < eaddr);
3280 3185 szcvec = save_szcvec;
3281 3186 while (szcvec) {
3282 3187 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3283 3188 ASSERT(a >= addr);
3284 3189 if (a != addr) {
3285 3190 ASSERT(szc > 0);
3286 3191 segsize = a - addr;
3287 3192 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3288 3193 save_szcvec);
3289 3194 if (error) {
3290 3195 return (error);
3291 3196 }
3292 3197 addr = a;
3293 3198 }
3294 3199 szcvec &= ~(1 << szc);
3295 3200 if (szcvec) {
3296 3201 szc = highbit(szcvec) - 1;
3297 3202 pgsz = page_get_pagesize(szc);
3298 3203 }
3299 3204 }
3300 3205 ASSERT(addr == eaddr);
3301 3206
3302 3207 return (0);
3303 3208 }
3304 3209
3305 3210 /*
3306 3211 * Set the default large page size for the range. Called via memcntl with
3307 3212 * page size set to 0. as_set_default_lpsize breaks the range down into
3308 3213  * chunks with the same type/flags, ignores non-segvn segments, and passes
3309 3214 * each chunk to as_iset_default_lpsize().
3310 3215 */
3311 3216 int
3312 3217 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3313 3218 {
3314 3219 struct seg *seg;
3315 3220 caddr_t raddr;
3316 3221 size_t rsize;
3317 3222 size_t ssize;
3318 3223 int rtype, rflags;
3319 3224 int stype, sflags;
3320 3225 int error;
3321 3226 caddr_t setaddr;
3322 3227 size_t setsize;
3323 3228 int segvn;
3324 3229
3325 3230 if (size == 0)
3326 3231 return (0);
3327 3232
3328 3233 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3329 3234 again:
3330 3235 error = 0;
3331 3236
3332 3237 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3333 3238 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3334 3239 (size_t)raddr;
3335 3240
3336 3241 if (raddr + rsize < raddr) { /* check for wraparound */
3337 3242 AS_LOCK_EXIT(as, &as->a_lock);
3338 3243 return (ENOMEM);
3339 3244 }
3340 3245 as_clearwatchprot(as, raddr, rsize);
3341 3246 seg = as_segat(as, raddr);
3342 3247 if (seg == NULL) {
3343 3248 as_setwatch(as);
3344 3249 AS_LOCK_EXIT(as, &as->a_lock);
3345 3250 return (ENOMEM);
3346 3251 }
3347 3252 if (seg->s_ops == &segvn_ops) {
3348 3253 rtype = SEGOP_GETTYPE(seg, addr);
3349 3254 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3350 3255 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3351 3256 segvn = 1;
3352 3257 } else {
3353 3258 segvn = 0;
3354 3259 }
3355 3260 setaddr = raddr;
3356 3261 setsize = 0;
3357 3262
3358 3263 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3359 3264 if (raddr >= (seg->s_base + seg->s_size)) {
3360 3265 seg = AS_SEGNEXT(as, seg);
3361 3266 if (seg == NULL || raddr != seg->s_base) {
3362 3267 error = ENOMEM;
3363 3268 break;
3364 3269 }
3365 3270 if (seg->s_ops == &segvn_ops) {
3366 3271 stype = SEGOP_GETTYPE(seg, raddr);
3367 3272 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3368 3273 stype &= (MAP_SHARED | MAP_PRIVATE);
3369 3274 if (segvn && (rflags != sflags ||
3370 3275 rtype != stype)) {
3371 3276 /*
3372 3277 * The next segment is also segvn but
3373 3278 * has different flags and/or type.
3374 3279 */
3375 3280 ASSERT(setsize != 0);
3376 3281 error = as_iset_default_lpsize(as,
3377 3282 setaddr, setsize, rflags, rtype);
3378 3283 if (error) {
3379 3284 break;
3380 3285 }
3381 3286 rflags = sflags;
3382 3287 rtype = stype;
3383 3288 setaddr = raddr;
3384 3289 setsize = 0;
3385 3290 } else if (!segvn) {
3386 3291 rflags = sflags;
3387 3292 rtype = stype;
3388 3293 setaddr = raddr;
3389 3294 setsize = 0;
3390 3295 segvn = 1;
3391 3296 }
3392 3297 } else if (segvn) {
3393 3298 /* The next segment is not segvn. */
3394 3299 ASSERT(setsize != 0);
3395 3300 error = as_iset_default_lpsize(as,
3396 3301 setaddr, setsize, rflags, rtype);
3397 3302 if (error) {
3398 3303 break;
3399 3304 }
3400 3305 segvn = 0;
3401 3306 }
3402 3307 }
3403 3308 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3404 3309 ssize = seg->s_base + seg->s_size - raddr;
3405 3310 } else {
3406 3311 ssize = rsize;
3407 3312 }
3408 3313 }
3409 3314 if (error == 0 && segvn) {
3410 3315 /* The last chunk when rsize == 0. */
3411 3316 ASSERT(setsize != 0);
3412 3317 error = as_iset_default_lpsize(as, setaddr, setsize,
3413 3318 rflags, rtype);
3414 3319 }
3415 3320
3416 3321 if (error == IE_RETRY) {
3417 3322 goto again;
3418 3323 } else if (error == IE_NOMEM) {
3419 3324 error = EAGAIN;
3420 3325 } else if (error == ENOTSUP) {
3421 3326 error = EINVAL;
3422 3327 } else if (error == EAGAIN) {
3423 3328 mutex_enter(&as->a_contents);
3424 3329 if (!AS_ISNOUNMAPWAIT(as)) {
3425 3330 if (AS_ISUNMAPWAIT(as) == 0) {
3426 3331 cv_broadcast(&as->a_cv);
3427 3332 }
3428 3333 AS_SETUNMAPWAIT(as);
3429 3334 AS_LOCK_EXIT(as, &as->a_lock);
3430 3335 while (AS_ISUNMAPWAIT(as)) {
3431 3336 cv_wait(&as->a_cv, &as->a_contents);
3432 3337 }
3433 3338 mutex_exit(&as->a_contents);
3434 3339 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3435 3340 } else {
3436 3341 /*
3437 3342 * We may have raced with
3438 3343 * segvn_reclaim()/segspt_reclaim(). In this case
3439 3344 				 * clear the nounmapwait flag and retry, since
3440 3345 				 * softlockcnt in this segment may already be 0.
3441 3346 				 * We don't drop the as writer lock, so the
3442 3347 				 * number of retries without sleeping should be
3443 3348 				 * very small. See segvn_reclaim() for more comments.
3444 3349 */
3445 3350 AS_CLRNOUNMAPWAIT(as);
3446 3351 mutex_exit(&as->a_contents);
3447 3352 }
3448 3353 goto again;
3449 3354 }
3450 3355
3451 3356 as_setwatch(as);
3452 3357 AS_LOCK_EXIT(as, &as->a_lock);
3453 3358 return (error);
3454 3359 }
3455 3360
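As the block comment above notes, this path is entered from memcntl(2) with a page size of zero. A minimal user-level sketch of such a request follows; the MC_HAT_ADVISE/MHA_MAPSIZE_VA usage matches memcntl(2), but the surrounding function is hypothetical and error handling is trimmed.

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <stdio.h>

    /* ask the kernel to pick a default large page size for [addr, addr+len) */
    int
    request_default_lpsize(caddr_t addr, size_t len)
    {
            struct memcntl_mha mha;

            mha.mha_cmd = MHA_MAPSIZE_VA;
            mha.mha_flags = 0;
            mha.mha_pagesize = 0;   /* 0 == let the kernel choose the default */

            if (memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0) != 0) {
                    perror("memcntl");
                    return (-1);
            }
            return (0);
    }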
3456 3361 /*
3457 3362  * Set up all of the uninitialized watched pages that we can.
3458 3363 */
3459 3364 void
3460 3365 as_setwatch(struct as *as)
3461 3366 {
3462 3367 struct watched_page *pwp;
3463 3368 struct seg *seg;
3464 3369 caddr_t vaddr;
3465 3370 uint_t prot;
3466 3371 int err, retrycnt;
3467 3372
3468 3373 if (avl_numnodes(&as->a_wpage) == 0)
3469 3374 return;
3470 3375
3471 3376 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3472 3377
3473 3378 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3474 3379 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3475 3380 retrycnt = 0;
3476 3381 retry:
3477 3382 vaddr = pwp->wp_vaddr;
3478 3383 if (pwp->wp_oprot != 0 || /* already set up */
3479 3384 (seg = as_segat(as, vaddr)) == NULL ||
3480 3385 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3481 3386 continue;
3482 3387
3483 3388 pwp->wp_oprot = prot;
3484 3389 if (pwp->wp_read)
3485 3390 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3486 3391 if (pwp->wp_write)
3487 3392 prot &= ~PROT_WRITE;
3488 3393 if (pwp->wp_exec)
3489 3394 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3490 3395 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3491 3396 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3492 3397 if (err == IE_RETRY) {
3493 3398 pwp->wp_oprot = 0;
3494 3399 ASSERT(retrycnt == 0);
3495 3400 retrycnt++;
3496 3401 goto retry;
3497 3402 }
3498 3403 }
3499 3404 pwp->wp_prot = prot;
3500 3405 }
3501 3406 }
3502 3407
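The a_wpage entries that as_setwatch() walks are planted when a tracer sets a watchpoint through /proc. Here is a sketch of that side, assuming the proc(4) PCWATCH control message and prwatch_t layout; the helper name is made up and error handling is minimal.

    #include <sys/types.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <stdio.h>
    #include <string.h>
    #include <procfs.h>

    /* plant a write watchpoint on [vaddr, vaddr + size) in process pid */
    int
    set_write_watchpoint(pid_t pid, uintptr_t vaddr, size_t size)
    {
            struct {
                    long            cmd;
                    prwatch_t       prw;
            } ctl;
            char path[64];
            int fd;

            (void) snprintf(path, sizeof (path), "/proc/%d/ctl", (int)pid);
            if ((fd = open(path, O_WRONLY)) == -1)
                    return (-1);

            ctl.cmd = PCWATCH;
            (void) memset(&ctl.prw, 0, sizeof (ctl.prw));
            ctl.prw.pr_vaddr = vaddr;
            ctl.prw.pr_size = size;
            ctl.prw.pr_wflags = WA_WRITE;   /* trap on stores into the range */

            if (write(fd, &ctl, sizeof (ctl)) != sizeof (ctl)) {
                    (void) close(fd);
                    return (-1);
            }
            return (close(fd));
    }

Each watched page created for such a range starts out with wp_oprot == 0, which is exactly the "not yet set up" state the loop above looks for.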
3503 3408 /*
3504 3409 * Clear all of the watched pages in the address space.
3505 3410 */
3506 3411 void
3507 3412 as_clearwatch(struct as *as)
3508 3413 {
3509 3414 struct watched_page *pwp;
3510 3415 struct seg *seg;
3511 3416 caddr_t vaddr;
3512 3417 uint_t prot;
3513 3418 int err, retrycnt;
3514 3419
3515 3420 if (avl_numnodes(&as->a_wpage) == 0)
3516 3421 return;
3517 3422
3518 3423 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3519 3424
3520 3425 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3521 3426 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3522 3427 retrycnt = 0;
3523 3428 retry:
3524 3429 vaddr = pwp->wp_vaddr;
3525 3430 if (pwp->wp_oprot == 0 || /* not set up */
3526 3431 (seg = as_segat(as, vaddr)) == NULL)
3527 3432 continue;
3528 3433
3529 3434 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3530 3435 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3531 3436 if (err == IE_RETRY) {
3532 3437 ASSERT(retrycnt == 0);
3533 3438 retrycnt++;
3534 3439 goto retry;
3535 3440 }
3536 3441 }
3537 3442 pwp->wp_oprot = 0;
3538 3443 pwp->wp_prot = 0;
3539 3444 }
3540 3445 }
3541 3446
3542 3447 /*
3543 3448 * Force a new setup for all the watched pages in the range.
3544 3449 */
3545 3450 static void
3546 3451 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3547 3452 {
3548 3453 struct watched_page *pwp;
3549 3454 struct watched_page tpw;
3550 3455 caddr_t eaddr = addr + size;
3551 3456 caddr_t vaddr;
3552 3457 struct seg *seg;
3553 3458 int err, retrycnt;
3554 3459 uint_t wprot;
3555 3460 avl_index_t where;
3556 3461
3557 3462 if (avl_numnodes(&as->a_wpage) == 0)
3558 3463 return;
3559 3464
3560 3465 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3561 3466
3562 3467 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3563 3468 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3564 3469 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3565 3470
3566 3471 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3567 3472 retrycnt = 0;
3568 3473 vaddr = pwp->wp_vaddr;
3569 3474
3570 3475 wprot = prot;
3571 3476 if (pwp->wp_read)
3572 3477 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3573 3478 if (pwp->wp_write)
3574 3479 wprot &= ~PROT_WRITE;
3575 3480 if (pwp->wp_exec)
3576 3481 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3577 3482 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3578 3483 retry:
3579 3484 seg = as_segat(as, vaddr);
3580 3485 if (seg == NULL) {
3581 3486 panic("as_setwatchprot: no seg");
3582 3487 /*NOTREACHED*/
3583 3488 }
3584 3489 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3585 3490 if (err == IE_RETRY) {
3586 3491 ASSERT(retrycnt == 0);
3587 3492 retrycnt++;
3588 3493 goto retry;
3589 3494 }
3590 3495 }
3591 3496 pwp->wp_oprot = prot;
3592 3497 pwp->wp_prot = wprot;
3593 3498
3594 3499 pwp = AVL_NEXT(&as->a_wpage, pwp);
3595 3500 }
3596 3501 }
3597 3502
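as_setwatchprot() recomputes the same mask as as_setwatch(): read and exec watchpoints revoke all access so that any reference to the page faults, while a write watchpoint only needs to revoke write permission. Distilled into a hypothetical stand-alone helper:

    #include <sys/mman.h>   /* PROT_READ, PROT_WRITE, PROT_EXEC */

    /* effective protections for a page given its watchpoint flags */
    static unsigned
    watch_prot(unsigned prot, int wp_read, int wp_write, int wp_exec)
    {
            if (wp_read)
                    prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
            if (wp_write)
                    prot &= ~PROT_WRITE;
            if (wp_exec)
                    prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
            return (prot);
    }

The pwp->wp_oprot/wp_prot pair then records both the original and the watch-reduced protections, so as_clearwatch() can restore the page later.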
3598 3503 /*
3599 3504 * Clear all of the watched pages in the range.
3600 3505 */
3601 3506 static void
3602 3507 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3603 3508 {
3604 3509 caddr_t eaddr = addr + size;
3605 3510 struct watched_page *pwp;
3606 3511 struct watched_page tpw;
3607 3512 uint_t prot;
3608 3513 struct seg *seg;
3609 3514 int err, retrycnt;
3610 3515 avl_index_t where;
3611 3516
3612 3517 if (avl_numnodes(&as->a_wpage) == 0)
3613 3518 return;
3614 3519
3615 3520 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3616 3521 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3617 3522 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3618 3523
3619 3524 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3620 3525
3621 3526 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3622 3527
3623 3528 if ((prot = pwp->wp_oprot) != 0) {
3624 3529 retrycnt = 0;
3625 3530
3626 3531 if (prot != pwp->wp_prot) {
3627 3532 retry:
3628 3533 seg = as_segat(as, pwp->wp_vaddr);
3629 3534 if (seg == NULL)
3630 3535 continue;
3631 3536 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3632 3537 PAGESIZE, prot);
3633 3538 if (err == IE_RETRY) {
3634 3539 ASSERT(retrycnt == 0);
3635 3540 retrycnt++;
3636 3541 goto retry;
3637 3542
3638 3543 }
3639 3544 }
3640 3545 pwp->wp_oprot = 0;
3641 3546 pwp->wp_prot = 0;
3642 3547 }
3643 3548
3644 3549 pwp = AVL_NEXT(&as->a_wpage, pwp);
3645 3550 }
3646 3551 }
3647 3552
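/*
 * Post siginfo to every process whose address space is "as". The p_as
 * check is made twice: once under pidlock to find candidates cheaply,
 * and again under p_lock, which is what actually keeps p_as stable
 * while the signal is queued (with KM_NOSLEEP, since both locks are
 * held).
 */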
3648 3553 void
3649 3554 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3650 3555 {
3651 3556 struct proc *p;
3652 3557
3653 3558 mutex_enter(&pidlock);
3654 3559 for (p = practive; p; p = p->p_next) {
3655 3560 if (p->p_as == as) {
3656 3561 mutex_enter(&p->p_lock);
3657 3562 if (p->p_as == as)
3658 3563 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3659 3564 mutex_exit(&p->p_lock);
3660 3565 }
3661 3566 }
3662 3567 mutex_exit(&pidlock);
3663 3568 }
3664 3569
3665 3570 /*
3666 3571  * Return the memory object ID for the mapping at the given address.
3667 3572 */
3668 3573 int
3669 3574 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3670 3575 {
3671 3576 struct seg *seg;
3672 3577 int sts;
3673 3578
3674 3579 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3675 3580 seg = as_segat(as, addr);
3676 3581 if (seg == NULL) {
3677 3582 AS_LOCK_EXIT(as, &as->a_lock);
3678 3583 return (EFAULT);
3679 3584 }
3680 3585 /*
3681 3586 	 * catch old drivers that may not support getmemid
3682 3587 */
3683 3588 if (seg->s_ops->getmemid == NULL) {
3684 3589 AS_LOCK_EXIT(as, &as->a_lock);
3685 3590 return (ENODEV);
3686 3591 }
3687 3592
3688 3593 sts = SEGOP_GETMEMID(seg, addr, memidp);
3689 3594
3690 3595 AS_LOCK_EXIT(as, &as->a_lock);
3691 3596 return (sts);
3692 3597 }
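The getmemid NULL check above is the usual guard for an optional slot in an ops vector: a segment driver built before the op existed simply leaves it zeroed, and the caller fails soft with ENODEV. The same pattern in miniature, with hypothetical names:

    #include <errno.h>      /* ENODEV */

    /* hypothetical ops vector with an optional entry point */
    struct widget_ops {
            int     (*w_read)(void *);
            int     (*w_getid)(void *, int *);      /* may be NULL in old drivers */
    };

    static int
    widget_getid(const struct widget_ops *ops, void *w, int *idp)
    {
            if (ops->w_getid == NULL)
                    return (ENODEV);        /* op not supported by this driver */
            return (ops->w_getid(w, idp));
    }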
↓ open down ↓ |
1495 lines elided |
↑ open up ↑ |