6154 const-ify segment ops structures
--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 32 * The Regents of the University of California
33 33 * All Rights Reserved
34 34 *
35 35 * University Acknowledgment- Portions of this document are derived from
36 36 * software developed by the University of California, Berkeley, and its
37 37 * contributors.
38 38 */
39 39
40 40 /*
41 41 * VM - address spaces.
42 42 */
43 43
44 44 #include <sys/types.h>
45 45 #include <sys/t_lock.h>
46 46 #include <sys/param.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/mman.h>
50 50 #include <sys/sysmacros.h>
51 51 #include <sys/cpuvar.h>
52 52 #include <sys/sysinfo.h>
53 53 #include <sys/kmem.h>
54 54 #include <sys/vnode.h>
55 55 #include <sys/vmsystm.h>
56 56 #include <sys/cmn_err.h>
57 57 #include <sys/debug.h>
58 58 #include <sys/tnf_probe.h>
59 59 #include <sys/vtrace.h>
60 60
61 61 #include <vm/hat.h>
62 62 #include <vm/xhat.h>
63 63 #include <vm/as.h>
64 64 #include <vm/seg.h>
65 65 #include <vm/seg_vn.h>
66 66 #include <vm/seg_dev.h>
67 67 #include <vm/seg_kmem.h>
68 68 #include <vm/seg_map.h>
69 69 #include <vm/seg_spt.h>
70 70 #include <vm/page.h>
71 71
72 72 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
73 73
74 74 static struct kmem_cache *as_cache;
75 75
76 76 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
77 77 static void as_clearwatchprot(struct as *, caddr_t, size_t);
78 78 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
79 79
80 80
81 81 /*
82 82 * Verifying the segment lists is very time-consuming; it may not be
83 83 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
84 84 */
85 85 #ifdef DEBUG
86 86 #define VERIFY_SEGLIST
87 87 int do_as_verify = 0;
88 88 #endif
89 89
90 90 /*
91 91 * Allocate a new callback data structure entry and fill in the events of
92 92 * interest, the address range of interest, and the callback argument.
93 93 * Link the entry on the as->a_callbacks list. A callback entry for the
94 94 * entire address space may be specified with vaddr = 0 and size = -1.
95 95 *
96 96 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
97 97 * the specified as, the caller must guarantee persistence of the specified as
98 98 * for the duration of this function (e.g. pages being locked within the as
99 99 * will guarantee persistence).
100 100 */
101 101 int
102 102 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103 103 caddr_t vaddr, size_t size, int sleepflag)
104 104 {
105 105 struct as_callback *current_head, *cb;
106 106 caddr_t saddr;
107 107 size_t rsize;
108 108
109 109 /* callback function and an event are mandatory */
110 110 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 111 return (EINVAL);
112 112
113 113 /* Adding a callback after as_free has been called is not allowed */
114 114 if (as == &kas)
115 115 return (ENOMEM);
116 116
117 117 /*
118 118 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 119 * is the entire address space so no rounding is done in that case.
120 120 */
121 121 if (size != -1) {
122 122 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 123 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 124 (size_t)saddr;
125 125 /* check for wraparound */
126 126 if (saddr + rsize < saddr)
127 127 return (ENOMEM);
128 128 } else {
129 129 if (vaddr != 0)
130 130 return (EINVAL);
131 131 saddr = vaddr;
132 132 rsize = size;
133 133 }
134 134
135 135 /* Allocate and initialize a callback entry */
136 136 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 137 if (cb == NULL)
138 138 return (EAGAIN);
139 139
140 140 cb->ascb_func = cb_func;
141 141 cb->ascb_arg = arg;
142 142 cb->ascb_events = events;
143 143 cb->ascb_saddr = saddr;
144 144 cb->ascb_len = rsize;
145 145
146 146 /* Add the entry to the list */
147 147 mutex_enter(&as->a_contents);
148 148 current_head = as->a_callbacks;
149 149 as->a_callbacks = cb;
150 150 cb->ascb_next = current_head;
151 151
152 152 /*
153 153 * The call to this function may lose in a race with
154 154 * a pertinent event - eg. a thread does long term memory locking
155 155 * but before the callback is added another thread executes as_unmap.
156 156 * A broadcast here resolves that.
157 157 */
158 158 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 159 AS_CLRUNMAPWAIT(as);
160 160 cv_broadcast(&as->a_cv);
161 161 }
162 162
163 163 mutex_exit(&as->a_contents);
164 164 return (0);
165 165 }
166 166
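A minimal registration sketch (reviewer annotation, not part of this change): a driver that holds long-term page locks registers for unmap notification, using a driver-private pointer as the arg cookie. The callback name my_unmap_cb and the handle dp are hypothetical; the signature matches how ascb_func is invoked in as_execute_callback() below.

    /*
     * Hypothetical driver callback; 'arg' is the cookie given at
     * registration time.  Per the contract in as_execute_callback(),
     * the callback deletes itself once it is safe to proceed.
     */
    static void
    my_unmap_cb(struct as *as, void *arg, uint_t events)
    {
            /* ... drop the driver's long-term page locks ... */
            (void) as_delete_callback(as, arg);
    }

    /* Register for unmap events on [va, va + len): */
    error = as_add_callback(as, my_unmap_cb, dp, AS_UNMAP_EVENT,
        va, len, KM_SLEEP);
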
167 167 /*
168 168 * Search the callback list for an entry which pertains to arg.
169 169 *
170 170 * This is called from within the client upon completion of the callback.
171 171 * RETURN VALUES:
172 172 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 173 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 174 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 175 * entry will be made in as_do_callbacks)
176 176 *
177 177 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 178 * set, it indicates that as_do_callbacks is processing this entry. The
179 179 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 180 * to unblock as_do_callbacks, in case it is blocked.
181 181 *
182 182 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
183 183 * the specified as, the caller must guarantee persistence of the specified as
184 184 * for the duration of this function (e.g. pages being locked within the as
185 185 * will guarantee persistence).
186 186 */
187 187 uint_t
188 188 as_delete_callback(struct as *as, void *arg)
189 189 {
190 190 struct as_callback **prevcb = &as->a_callbacks;
191 191 struct as_callback *cb;
192 192 uint_t rc = AS_CALLBACK_NOTFOUND;
193 193
194 194 mutex_enter(&as->a_contents);
195 195 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 196 if (cb->ascb_arg != arg)
197 197 continue;
198 198
199 199 /*
200 200 * If the events indicate AS_CALLBACK_CALLED, just clear
201 201 * AS_ALL_EVENT in the events field and wakeup the thread
202 202 * that may be waiting in as_do_callbacks. as_do_callbacks
203 203 * will take care of removing this entry from the list. In
204 204 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 205 * (AS_CALLBACK_CALLED not set), just remove it from the
206 206 * list, return the memory and return AS_CALLBACK_DELETED.
207 207 */
208 208 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 209 /* leave AS_CALLBACK_CALLED */
210 210 cb->ascb_events &= ~AS_ALL_EVENT;
211 211 rc = AS_CALLBACK_DELETE_DEFERRED;
212 212 cv_broadcast(&as->a_cv);
213 213 } else {
214 214 *prevcb = cb->ascb_next;
215 215 kmem_free(cb, sizeof (struct as_callback));
216 216 rc = AS_CALLBACK_DELETED;
217 217 }
218 218 break;
219 219 }
220 220 mutex_exit(&as->a_contents);
221 221 return (rc);
222 222 }
223 223
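The three return values imply a teardown pattern like the following sketch (dp is the same hypothetical cookie as in the registration sketch above):

    switch (as_delete_callback(as, dp)) {
    case AS_CALLBACK_DELETED:       /* entry found and freed here */
    case AS_CALLBACK_NOTFOUND:      /* never registered, or already gone */
            break;
    case AS_CALLBACK_DELETE_DEFERRED:
            /*
             * The callback is mid-flight; as_do_callbacks() will free
             * the entry.  The cookie must stay valid until it does.
             */
            break;
    }
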
224 224 /*
225 225 * Searches the as callback list for a matching entry.
226 226 * Returns a pointer to the first matching callback, or NULL if
227 227 * nothing is found.
228 228 * This function never sleeps, so it is ok to call it with locks
229 229 * held other than the (required) a_contents mutex.
230 230 *
231 231 * See also comment on as_do_callbacks below.
232 232 */
233 233 static struct as_callback *
234 234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 235 size_t event_len)
236 236 {
237 237 struct as_callback *cb;
238 238
239 239 ASSERT(MUTEX_HELD(&as->a_contents));
240 240 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
241 241 /*
242 242 * If the callback has not already been called, then
243 243 * check if events or address range pertains. An event_len
244 244 * of zero means do an unconditional callback.
245 245 */
246 246 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 247 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 248 (event_addr + event_len < cb->ascb_saddr) ||
249 249 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 250 continue;
251 251 }
252 252 break;
253 253 }
254 254 return (cb);
255 255 }
256 256
257 257 /*
258 258 * Executes a given callback and removes it from the callback list for
259 259 * this address space.
260 260 * This function may sleep so the caller must drop all locks except
261 261 * a_contents before calling this func.
262 262 *
263 263 * See also comments on as_do_callbacks below.
264 264 */
265 265 static void
266 266 as_execute_callback(struct as *as, struct as_callback *cb,
267 267 uint_t events)
268 268 {
269 269 struct as_callback **prevcb;
270 270 void *cb_arg;
271 271
272 272 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 273 cb->ascb_events |= AS_CALLBACK_CALLED;
274 274 mutex_exit(&as->a_contents);
275 275 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 276 mutex_enter(&as->a_contents);
277 277 /*
278 278 * the callback function is required to delete the callback
279 279 * when the callback function determines it is OK for
280 280 * this thread to continue. as_delete_callback will clear
281 281 * the AS_ALL_EVENT in the events field when it is deleted.
282 282 * If the callback function called as_delete_callback,
283 283 * events will already be cleared and there will be no blocking.
284 284 */
285 285 while ((cb->ascb_events & events) != 0) {
286 286 cv_wait(&as->a_cv, &as->a_contents);
287 287 }
288 288 /*
289 289 * This entry needs to be taken off the list. Normally, the
290 290 * callback func itself does that, but unfortunately the list
291 291 * may have changed while the callback was running because the
292 292 * a_contents mutex was dropped and someone else other than the
293 293 * callback func itself could have called as_delete_callback,
294 294 * so we have to search to find this entry again. The entry
295 295 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
296 296 */
297 297 cb_arg = cb->ascb_arg;
298 298 prevcb = &as->a_callbacks;
299 299 for (cb = as->a_callbacks; cb != NULL;
300 300 prevcb = &cb->ascb_next, cb = *prevcb) {
301 301 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 302 (cb_arg != cb->ascb_arg)) {
303 303 continue;
304 304 }
305 305 *prevcb = cb->ascb_next;
306 306 kmem_free(cb, sizeof (struct as_callback));
307 307 break;
308 308 }
309 309 }
310 310
311 311 /*
312 312 * Check the callback list for a matching event and intersection of
313 313 * address range. If there is a match, invoke the callback. Skip an entry if:
314 314 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315 315 * - the event is not of interest
316 316 * - the address range is not of interest
317 317 *
318 318 * An event_len of zero indicates a request for an unconditional callback
319 319 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
320 320 * a_contents lock must be dropped before a callback, so only one callback
321 321 * can be done before returning. Return -1 (true) if a callback was
322 322 * executed and removed from the list, else return 0 (false).
323 323 *
324 324 * The logically separate parts, i.e. finding a matching callback and
325 325 * executing a given callback have been separated into two functions
326 326 * so that they can be called with different sets of locks held beyond
327 327 * the always-required a_contents. as_find_callback does not sleep so
328 328 * it is ok to call it if more locks than a_contents (i.e. the a_lock
329 329 * rwlock) are held. as_execute_callback on the other hand may sleep
330 330 * so all locks beyond a_contents must be dropped by the caller if one
331 331 * does not want to end up comatose.
332 332 */
333 333 static int
334 334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 335 size_t event_len)
336 336 {
337 337 struct as_callback *cb;
338 338
339 339 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 340 as_execute_callback(as, cb, events);
341 341 return (-1);
342 342 }
343 343 return (0);
344 344 }
345 345
346 346 /*
347 347 * Search for the segment containing addr. If a segment containing addr
348 348 * exists, that segment is returned. If no such segment exists, and
349 349 * the list spans addresses greater than addr, then the first segment
350 350 * whose base is greater than addr is returned; otherwise, NULL is
351 351 * returned unless tail is true, in which case the last element of the
352 352 * list is returned.
353 353 *
354 354 * a_seglast is used to cache the last found segment for repeated
355 355 * searches to the same addr (which happens frequently).
356 356 */
357 357 struct seg *
358 358 as_findseg(struct as *as, caddr_t addr, int tail)
359 359 {
360 360 struct seg *seg = as->a_seglast;
361 361 avl_index_t where;
362 362
363 363 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
364 364
365 365 if (seg != NULL &&
366 366 seg->s_base <= addr &&
367 367 addr < seg->s_base + seg->s_size)
368 368 return (seg);
369 369
370 370 seg = avl_find(&as->a_segtree, &addr, &where);
371 371 if (seg != NULL)
372 372 return (as->a_seglast = seg);
373 373
374 374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 375 if (seg == NULL && tail)
376 376 seg = avl_last(&as->a_segtree);
377 377 return (as->a_seglast = seg);
378 378 }
379 379
380 380 #ifdef VERIFY_SEGLIST
381 381 /*
382 382 * verify that the linked list is coherent
383 383 */
384 384 static void
385 385 as_verify(struct as *as)
386 386 {
387 387 struct seg *seg, *seglast, *p, *n;
388 388 uint_t nsegs = 0;
389 389
390 390 if (do_as_verify == 0)
391 391 return;
392 392
393 393 seglast = as->a_seglast;
394 394
395 395 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 396 ASSERT(seg->s_as == as);
397 397 p = AS_SEGPREV(as, seg);
398 398 n = AS_SEGNEXT(as, seg);
399 399 ASSERT(p == NULL || p->s_as == as);
400 400 ASSERT(p == NULL || p->s_base < seg->s_base);
401 401 ASSERT(n == NULL || n->s_base > seg->s_base);
402 402 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 403 if (seg == seglast)
404 404 seglast = NULL;
405 405 nsegs++;
406 406 }
407 407 ASSERT(seglast == NULL);
408 408 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409 409 }
410 410 #endif /* VERIFY_SEGLIST */
411 411
412 412 /*
413 413 * Add a new segment to the address space. The avl_find()
414 414 * may be expensive so we attempt to use last segment accessed
415 415 * in as_gap() as an insertion point.
416 416 */
417 417 int
418 418 as_addseg(struct as *as, struct seg *newseg)
419 419 {
420 420 struct seg *seg;
421 421 caddr_t addr;
422 422 caddr_t eaddr;
423 423 avl_index_t where;
424 424
425 425 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
426 426
427 427 as->a_updatedir = 1; /* inform /proc */
428 428 gethrestime(&as->a_updatetime);
429 429
430 430 if (as->a_lastgaphl != NULL) {
431 431 struct seg *hseg = NULL;
432 432 struct seg *lseg = NULL;
433 433
434 434 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 435 hseg = as->a_lastgaphl;
436 436 lseg = AVL_PREV(&as->a_segtree, hseg);
437 437 } else {
438 438 lseg = as->a_lastgaphl;
439 439 hseg = AVL_NEXT(&as->a_segtree, lseg);
440 440 }
441 441
442 442 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 443 hseg->s_base > newseg->s_base) {
444 444 avl_insert_here(&as->a_segtree, newseg, lseg,
445 445 AVL_AFTER);
446 446 as->a_lastgaphl = NULL;
447 447 as->a_seglast = newseg;
448 448 return (0);
449 449 }
450 450 as->a_lastgaphl = NULL;
451 451 }
452 452
453 453 addr = newseg->s_base;
454 454 eaddr = addr + newseg->s_size;
455 455 again:
456 456
457 457 seg = avl_find(&as->a_segtree, &addr, &where);
458 458
459 459 if (seg == NULL)
460 460 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
461 461
462 462 if (seg == NULL)
463 463 seg = avl_last(&as->a_segtree);
464 464
465 465 if (seg != NULL) {
466 466 caddr_t base = seg->s_base;
[ 466 lines elided ]
467 467
468 468 /*
469 469 * If top of seg is below the requested address, then
470 470 * the insertion point is at the end of the linked list,
471 471 * and seg points to the tail of the list. Otherwise,
472 472 * the insertion point is immediately before seg.
473 473 */
474 474 if (base + seg->s_size > addr) {
475 475 if (addr >= base || eaddr > base) {
476 476 #ifdef __sparc
477 - extern struct seg_ops segnf_ops;
477 + extern const struct seg_ops segnf_ops;
478 478
479 479 /*
480 480 * no-fault segs must disappear if overlaid.
481 481 * XXX need new segment type so
482 482 * we don't have to check s_ops
483 483 */
484 484 if (seg->s_ops == &segnf_ops) {
485 485 seg_unmap(seg);
486 486 goto again;
487 487 }
488 488 #endif
489 489 return (-1); /* overlapping segment */
490 490 }
491 491 }
492 492 }
493 493 as->a_seglast = newseg;
494 494 avl_insert(&as->a_segtree, newseg, where);
495 495
496 496 #ifdef VERIFY_SEGLIST
497 497 as_verify(as);
498 498 #endif
499 499 return (0);
500 500 }
501 501
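The segnf_ops hunk above shows the shape of the whole change: extern declarations of segment ops tables gain const, which in turn requires the definitions to be const. A hedged sketch of the definition side (the designated initializers and the ops named are illustrative only; the real table has many more entries):

    /* before: a writable ops table, one per segment driver */
    struct seg_ops segnf_ops = { /* ... */ };

    /*
     * after: immutable for the life of the kernel, placed in
     * read-only data by the compiler
     */
    const struct seg_ops segnf_ops = {
            .fault  = segnf_fault,
            /* ... remaining ops ... */
    };

Pointer comparisons such as the s_ops == &segnf_ops check above are unaffected; assuming s_ops itself becomes a pointer-to-const, writes through it now fail at compile time instead of silently patching a live ops table.
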
502 502 struct seg *
503 503 as_removeseg(struct as *as, struct seg *seg)
504 504 {
505 505 avl_tree_t *t;
506 506
507 507 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
508 508
509 509 as->a_updatedir = 1; /* inform /proc */
510 510 gethrestime(&as->a_updatetime);
511 511
512 512 if (seg == NULL)
513 513 return (NULL);
514 514
515 515 t = &as->a_segtree;
516 516 if (as->a_seglast == seg)
517 517 as->a_seglast = NULL;
518 518 as->a_lastgaphl = NULL;
519 519
520 520 /*
521 521 * if this segment is at an address higher than
522 522 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523 523 */
524 524 if (as->a_lastgap &&
525 525 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 526 as->a_lastgap = AVL_NEXT(t, seg);
527 527
528 528 /*
529 529 * remove the segment from the seg tree
530 530 */
531 531 avl_remove(t, seg);
532 532
533 533 #ifdef VERIFY_SEGLIST
534 534 as_verify(as);
535 535 #endif
536 536 return (seg);
537 537 }
538 538
539 539 /*
540 540 * Find a segment containing addr.
541 541 */
542 542 struct seg *
543 543 as_segat(struct as *as, caddr_t addr)
544 544 {
545 545 struct seg *seg = as->a_seglast;
546 546
547 547 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
548 548
549 549 if (seg != NULL && seg->s_base <= addr &&
550 550 addr < seg->s_base + seg->s_size)
551 551 return (seg);
552 552
553 553 seg = avl_find(&as->a_segtree, &addr, NULL);
554 554 return (seg);
555 555 }
556 556
557 557 /*
558 558 * Serialize all searches for holes in an address space to
559 559 * prevent two or more threads from allocating the same virtual
560 560 * address range. The address space must not be "read/write"
561 561 * locked by the caller since we may block.
562 562 */
563 563 void
564 564 as_rangelock(struct as *as)
565 565 {
566 566 mutex_enter(&as->a_contents);
567 567 while (AS_ISCLAIMGAP(as))
568 568 cv_wait(&as->a_cv, &as->a_contents);
569 569 AS_SETCLAIMGAP(as);
570 570 mutex_exit(&as->a_contents);
571 571 }
572 572
573 573 /*
574 574 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
575 575 */
576 576 void
577 577 as_rangeunlock(struct as *as)
578 578 {
579 579 mutex_enter(&as->a_contents);
580 580 AS_CLRCLAIMGAP(as);
581 581 cv_signal(&as->a_cv);
582 582 mutex_exit(&as->a_contents);
583 583 }
584 584
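As the comments say, the range lock only serializes hole-finding and is distinct from a_lock. The usual pairing in mapping paths looks roughly like this sketch (the map_addr() call and its argument list are quoted from memory of the mmap path and may differ):

    as_rangelock(as);               /* serialize gap claims */
    map_addr(&addr, len, off, vacalign, flags);     /* pick a hole */
    if (addr == NULL) {
            as_rangeunlock(as);
            return (ENOMEM);
    }
    error = as_map(as, addr, len, segvn_create, &crargs);
    as_rangeunlock(as);
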
585 585 /*
586 586 * compare segments (or just an address) by segment address range
587 587 */
588 588 static int
589 589 as_segcompar(const void *x, const void *y)
590 590 {
591 591 struct seg *a = (struct seg *)x;
592 592 struct seg *b = (struct seg *)y;
593 593
594 594 if (a->s_base < b->s_base)
595 595 return (-1);
596 596 if (a->s_base >= b->s_base + b->s_size)
597 597 return (1);
598 598 return (0);
599 599 }
600 600
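This comparator is also what lets as_segat() and as_findseg() probe the tree with a plain address: avl_find() is handed &addr, and the cast to struct seg * is safe only because the comparator reads nothing but s_base, which sits at offset 0 of struct seg (an invariant of seg.h that this sketch assumes):

    /* Look up by bare address, as as_segat() does above: */
    caddr_t addr = ...;             /* address of interest */
    struct seg *seg = avl_find(&as->a_segtree, &addr, NULL);
    /* ((struct seg *)&addr)->s_base aliases addr itself */
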
601 601
602 602 void
603 603 as_avlinit(struct as *as)
604 604 {
605 605 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
606 606 offsetof(struct seg, s_tree));
607 607 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
608 608 offsetof(struct watched_page, wp_link));
609 609 }
610 610
611 611 /*ARGSUSED*/
612 612 static int
613 613 as_constructor(void *buf, void *cdrarg, int kmflags)
614 614 {
615 615 struct as *as = buf;
616 616
617 617 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
618 618 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
619 619 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
620 620 as_avlinit(as);
621 621 return (0);
622 622 }
623 623
624 624 /*ARGSUSED1*/
625 625 static void
626 626 as_destructor(void *buf, void *cdrarg)
627 627 {
628 628 struct as *as = buf;
629 629
630 630 avl_destroy(&as->a_segtree);
631 631 mutex_destroy(&as->a_contents);
632 632 cv_destroy(&as->a_cv);
633 633 rw_destroy(&as->a_lock);
634 634 }
635 635
636 636 void
637 637 as_init(void)
638 638 {
639 639 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
640 640 as_constructor, as_destructor, NULL, NULL, NULL, 0);
641 641 }
642 642
643 643 /*
644 644 * Allocate and initialize an address space data structure.
645 645 * We call hat_alloc to allow any machine dependent
646 646 * information in the hat structure to be initialized.
647 647 */
648 648 struct as *
649 649 as_alloc(void)
650 650 {
651 651 struct as *as;
652 652
653 653 as = kmem_cache_alloc(as_cache, KM_SLEEP);
654 654
655 655 as->a_flags = 0;
656 656 as->a_vbits = 0;
657 657 as->a_hrm = NULL;
658 658 as->a_seglast = NULL;
659 659 as->a_size = 0;
660 660 as->a_resvsize = 0;
661 661 as->a_updatedir = 0;
662 662 gethrestime(&as->a_updatetime);
663 663 as->a_objectdir = NULL;
664 664 as->a_sizedir = 0;
665 665 as->a_userlimit = (caddr_t)USERLIMIT;
666 666 as->a_lastgap = NULL;
667 667 as->a_lastgaphl = NULL;
668 668 as->a_callbacks = NULL;
669 669
670 670 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
671 671 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
672 672 AS_LOCK_EXIT(as, &as->a_lock);
673 673
674 674 as->a_xhat = NULL;
675 675
676 676 return (as);
677 677 }
678 678
679 679 /*
680 680 * Free an address space data structure.
681 681 * Need to free the hat first and then
682 682 * all the segments on this as and finally
683 683 * the space for the as struct itself.
684 684 */
685 685 void
686 686 as_free(struct as *as)
687 687 {
688 688 struct hat *hat = as->a_hat;
689 689 struct seg *seg, *next;
690 690 int called = 0;
691 691
692 692 top:
693 693 /*
694 694 * Invoke ALL callbacks. as_do_callbacks will do one callback
695 695 * per call, and not return (-1) until the callback has completed.
696 696 * When as_do_callbacks returns zero, all callbacks have completed.
697 697 */
698 698 mutex_enter(&as->a_contents);
699 699 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
700 700 ;
701 701
702 702 /* This will prevent new XHATs from attaching to as */
703 703 if (!called)
704 704 AS_SETBUSY(as);
705 705 mutex_exit(&as->a_contents);
706 706 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
707 707
708 708 if (!called) {
709 709 called = 1;
710 710 hat_free_start(hat);
711 711 if (as->a_xhat != NULL)
712 712 xhat_free_start_all(as);
713 713 }
714 714 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
715 715 int err;
716 716
717 717 next = AS_SEGNEXT(as, seg);
718 718 retry:
719 719 err = segop_unmap(seg, seg->s_base, seg->s_size);
720 720 if (err == EAGAIN) {
721 721 mutex_enter(&as->a_contents);
722 722 if (as->a_callbacks) {
723 723 AS_LOCK_EXIT(as, &as->a_lock);
724 724 } else if (!AS_ISNOUNMAPWAIT(as)) {
725 725 /*
726 726 * Memory is currently locked. Wait for a
727 727 * cv_signal that it has been unlocked, then
728 728 * try the operation again.
729 729 */
730 730 if (AS_ISUNMAPWAIT(as) == 0)
731 731 cv_broadcast(&as->a_cv);
732 732 AS_SETUNMAPWAIT(as);
733 733 AS_LOCK_EXIT(as, &as->a_lock);
734 734 while (AS_ISUNMAPWAIT(as))
735 735 cv_wait(&as->a_cv, &as->a_contents);
736 736 } else {
737 737 /*
738 738 * We may have raced with
739 739 * segvn_reclaim()/segspt_reclaim(). In this
740 740 * case clean nounmapwait flag and retry since
741 741 * softlockcnt in this segment may be already
742 742 * 0. We don't drop as writer lock so our
743 743 * number of retries without sleeping should
744 744 * be very small. See segvn_reclaim() for
745 745 * more comments.
746 746 */
747 747 AS_CLRNOUNMAPWAIT(as);
748 748 mutex_exit(&as->a_contents);
749 749 goto retry;
750 750 }
751 751 mutex_exit(&as->a_contents);
752 752 goto top;
753 753 } else {
754 754 /*
755 755 * We do not expect any other error return at this
756 756 * time. This is similar to an ASSERT in seg_unmap()
757 757 */
758 758 ASSERT(err == 0);
759 759 }
760 760 }
761 761 hat_free_end(hat);
762 762 if (as->a_xhat != NULL)
763 763 xhat_free_end_all(as);
764 764 AS_LOCK_EXIT(as, &as->a_lock);
765 765
766 766 /* /proc stuff */
767 767 ASSERT(avl_numnodes(&as->a_wpage) == 0);
768 768 if (as->a_objectdir) {
769 769 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
770 770 as->a_objectdir = NULL;
771 771 as->a_sizedir = 0;
772 772 }
773 773
774 774 /*
775 775 * Free the struct as back to kmem. Assert it has no segments.
776 776 */
777 777 ASSERT(avl_numnodes(&as->a_segtree) == 0);
778 778 kmem_cache_free(as_cache, as);
779 779 }
780 780
781 781 int
782 782 as_dup(struct as *as, struct proc *forkedproc)
783 783 {
784 784 struct as *newas;
785 785 struct seg *seg, *newseg;
786 786 size_t purgesize = 0;
787 787 int error;
788 788
789 789 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
790 790 as_clearwatch(as);
791 791 newas = as_alloc();
792 792 newas->a_userlimit = as->a_userlimit;
793 793 newas->a_proc = forkedproc;
794 794
795 795 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
796 796
797 797 /* This will prevent new XHATs from attaching */
798 798 mutex_enter(&as->a_contents);
799 799 AS_SETBUSY(as);
800 800 mutex_exit(&as->a_contents);
801 801 mutex_enter(&newas->a_contents);
802 802 AS_SETBUSY(newas);
803 803 mutex_exit(&newas->a_contents);
804 804
805 805 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
806 806
807 807 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
808 808
809 809 if (seg->s_flags & S_PURGE) {
810 810 purgesize += seg->s_size;
811 811 continue;
812 812 }
813 813
814 814 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
815 815 if (newseg == NULL) {
816 816 AS_LOCK_EXIT(newas, &newas->a_lock);
817 817 as_setwatch(as);
818 818 mutex_enter(&as->a_contents);
819 819 AS_CLRBUSY(as);
820 820 mutex_exit(&as->a_contents);
821 821 AS_LOCK_EXIT(as, &as->a_lock);
822 822 as_free(newas);
823 823 return (-1);
824 824 }
825 825 if ((error = segop_dup(seg, newseg)) != 0) {
826 826 /*
827 827 * We call seg_free() on the new seg
828 828 * because the segment is not set up
829 829 * completely; i.e. it has no ops.
830 830 */
831 831 as_setwatch(as);
832 832 mutex_enter(&as->a_contents);
833 833 AS_CLRBUSY(as);
834 834 mutex_exit(&as->a_contents);
835 835 AS_LOCK_EXIT(as, &as->a_lock);
836 836 seg_free(newseg);
837 837 AS_LOCK_EXIT(newas, &newas->a_lock);
838 838 as_free(newas);
839 839 return (error);
840 840 }
841 841 newas->a_size += seg->s_size;
842 842 }
843 843 newas->a_resvsize = as->a_resvsize - purgesize;
844 844
845 845 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
846 846 if (as->a_xhat != NULL)
847 847 error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
848 848
849 849 mutex_enter(&newas->a_contents);
850 850 AS_CLRBUSY(newas);
851 851 mutex_exit(&newas->a_contents);
852 852 AS_LOCK_EXIT(newas, &newas->a_lock);
853 853
854 854 as_setwatch(as);
855 855 mutex_enter(&as->a_contents);
856 856 AS_CLRBUSY(as);
857 857 mutex_exit(&as->a_contents);
858 858 AS_LOCK_EXIT(as, &as->a_lock);
859 859 if (error != 0) {
860 860 as_free(newas);
861 861 return (error);
862 862 }
863 863 forkedproc->p_as = newas;
864 864 return (0);
865 865 }
866 866
867 867 /*
868 868 * Handle a ``fault'' at addr for size bytes.
869 869 */
870 870 faultcode_t
871 871 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
872 872 enum fault_type type, enum seg_rw rw)
873 873 {
874 874 struct seg *seg;
[ 387 lines elided ]
875 875 caddr_t raddr; /* rounded down addr */
876 876 size_t rsize; /* rounded up size */
877 877 size_t ssize;
878 878 faultcode_t res = 0;
879 879 caddr_t addrsav;
880 880 struct seg *segsav;
881 881 int as_lock_held;
882 882 klwp_t *lwp = ttolwp(curthread);
883 883 int is_xhat = 0;
884 884 int holding_wpage = 0;
885 - extern struct seg_ops segdev_ops;
886 -
887 -
888 885
889 886 if (as->a_hat != hat) {
890 887 /* This must be an XHAT then */
891 888 is_xhat = 1;
892 889
893 890 if ((type != F_INVAL) || (as == &kas))
894 891 return (FC_NOSUPPORT);
895 892 }
896 893
897 894 retry:
898 895 if (!is_xhat) {
899 896 /*
900 897 * Indicate that the lwp is not to be stopped while waiting
901 898 * for a pagefault. This is to avoid deadlock while debugging
902 899 * a process via /proc over NFS (in particular).
903 900 */
904 901 if (lwp != NULL)
905 902 lwp->lwp_nostop++;
906 903
907 904 /*
908 905 * same length must be used when we softlock and softunlock.
909 906 * We don't support softunlocking lengths less than
910 907 * the original length when there is largepage support.
911 908 * See seg_dev.c for more comments.
912 909 */
913 910 switch (type) {
914 911
915 912 case F_SOFTLOCK:
916 913 CPU_STATS_ADD_K(vm, softlock, 1);
917 914 break;
918 915
919 916 case F_SOFTUNLOCK:
920 917 break;
921 918
922 919 case F_PROT:
923 920 CPU_STATS_ADD_K(vm, prot_fault, 1);
924 921 break;
925 922
926 923 case F_INVAL:
927 924 CPU_STATS_ENTER_K();
928 925 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
929 926 if (as == &kas)
930 927 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
931 928 CPU_STATS_EXIT_K();
932 929 break;
933 930 }
934 931 }
935 932
936 933 /* Kernel probe */
937 934 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
938 935 tnf_opaque, address, addr,
939 936 tnf_fault_type, fault_type, type,
940 937 tnf_seg_access, access, rw);
941 938
942 939 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
943 940 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
944 941 (size_t)raddr;
945 942
946 943 /*
947 944 * XXX -- Don't grab the as lock for segkmap. We should grab it for
948 945 * correctness, but then we could be stuck holding this lock for
949 946 * a LONG time if the fault needs to be resolved on a slow
950 947 * filesystem, and then no-one will be able to exec new commands,
951 948 * as exec'ing requires the write lock on the as.
952 949 */
953 950 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
954 951 raddr + size < segkmap->s_base + segkmap->s_size) {
955 952 /*
956 953 * if (as==&kas), this can't be XHAT: we've already returned
957 954 * FC_NOSUPPORT.
958 955 */
959 956 seg = segkmap;
960 957 as_lock_held = 0;
961 958 } else {
962 959 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
963 960 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
964 961 /*
965 962 * Grab and hold the writers' lock on the as
966 963 * if the fault is to a watched page.
967 964 * This will keep CPUs from "peeking" at the
968 965 * address range while we're temporarily boosting
969 966 * the permissions for the XHAT device to
970 967 * resolve the fault in the segment layer.
971 968 *
972 969 * We could check whether faulted address
973 970 * is within a watched page and only then grab
974 971 * the writer lock, but this is simpler.
975 972 */
976 973 AS_LOCK_EXIT(as, &as->a_lock);
977 974 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
978 975 }
979 976
980 977 seg = as_segat(as, raddr);
981 978 if (seg == NULL) {
982 979 AS_LOCK_EXIT(as, &as->a_lock);
983 980 if ((lwp != NULL) && (!is_xhat))
984 981 lwp->lwp_nostop--;
985 982 return (FC_NOMAP);
986 983 }
987 984
988 985 as_lock_held = 1;
989 986 }
990 987
991 988 addrsav = raddr;
992 989 segsav = seg;
993 990
994 991 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
995 992 if (raddr >= seg->s_base + seg->s_size) {
996 993 seg = AS_SEGNEXT(as, seg);
997 994 if (seg == NULL || raddr != seg->s_base) {
998 995 res = FC_NOMAP;
999 996 break;
1000 997 }
1001 998 }
1002 999 if (raddr + rsize > seg->s_base + seg->s_size)
1003 1000 ssize = seg->s_base + seg->s_size - raddr;
1004 1001 else
1005 1002 ssize = rsize;
1006 1003
1007 1004 if (!is_xhat || (seg->s_ops != &segdev_ops)) {
1008 1005
1009 1006 if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
1010 1007 pr_is_watchpage_as(raddr, rw, as)) {
1011 1008 /*
1012 1009 * Handle watch pages. If we're faulting on a
1013 1010 * watched page from an X-hat, we have to
1014 1011 * restore the original permissions while we
1015 1012 * handle the fault.
1016 1013 */
1017 1014 as_clearwatch(as);
1018 1015 holding_wpage = 1;
1019 1016 }
1020 1017
1021 1018 res = segop_fault(hat, seg, raddr, ssize, type, rw);
1022 1019
1023 1020 /* Restore watchpoints */
1024 1021 if (holding_wpage) {
1025 1022 as_setwatch(as);
1026 1023 holding_wpage = 0;
1027 1024 }
1028 1025
1029 1026 if (res != 0)
1030 1027 break;
1031 1028 } else {
1032 1029 /* XHAT does not support seg_dev */
1033 1030 res = FC_NOSUPPORT;
1034 1031 break;
1035 1032 }
1036 1033 }
1037 1034
1038 1035 /*
1039 1036 * If we were SOFTLOCKing and encountered a failure,
1040 1037 * we must SOFTUNLOCK the range we already did. (Maybe we
1041 1038 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
1042 1039 * right here...)
1043 1040 */
1044 1041 if (res != 0 && type == F_SOFTLOCK) {
1045 1042 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
1046 1043 if (addrsav >= seg->s_base + seg->s_size)
1047 1044 seg = AS_SEGNEXT(as, seg);
1048 1045 ASSERT(seg != NULL);
1049 1046 /*
1050 1047 * Now call the fault routine again to perform the
1051 1048 * unlock using S_OTHER instead of the rw variable
1052 1049 * since we never got a chance to touch the pages.
1053 1050 */
1054 1051 if (raddr > seg->s_base + seg->s_size)
1055 1052 ssize = seg->s_base + seg->s_size - addrsav;
1056 1053 else
1057 1054 ssize = raddr - addrsav;
1058 1055 (void) segop_fault(hat, seg, addrsav, ssize,
1059 1056 F_SOFTUNLOCK, S_OTHER);
1060 1057 }
1061 1058 }
1062 1059 if (as_lock_held)
1063 1060 AS_LOCK_EXIT(as, &as->a_lock);
1064 1061 if ((lwp != NULL) && (!is_xhat))
1065 1062 lwp->lwp_nostop--;
1066 1063
1067 1064 /*
1068 1065 * If the lower levels returned EDEADLK for a fault,
1069 1066 * it means that we should retry the fault. Let's wait
1070 1067 * a bit also to let the deadlock causing condition clear.
1071 1068 * This is part of a gross hack to work around a design flaw
1072 1069 * in the ufs/sds logging code and should go away when the
1073 1070 * logging code is re-designed to fix the problem. See bug
1074 1071 * 4125102 for details of the problem.
1075 1072 */
1076 1073 if (FC_ERRNO(res) == EDEADLK) {
1077 1074 delay(deadlk_wait);
1078 1075 res = 0;
1079 1076 goto retry;
1080 1077 }
1081 1078 return (res);
1082 1079 }
1083 1080
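The failure path above preserves the invariant that every page successfully F_SOFTLOCKed is eventually F_SOFTUNLOCKed over the same range and length. From the caller's side, the pairing looks roughly like this sketch:

    /* pin the user pages for a transfer (sketch) */
    res = as_fault(as->a_hat, as, uaddr, len, F_SOFTLOCK, S_WRITE);
    if (res != 0)
            return (FC_ERRNO(res));
    /* ... access the pinned pages ... */
    (void) as_fault(as->a_hat, as, uaddr, len, F_SOFTUNLOCK, S_WRITE);
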
1084 1081
1085 1082
1086 1083 /*
1087 1084 * Asynchronous ``fault'' at addr for size bytes.
1088 1085 */
1089 1086 faultcode_t
1090 1087 as_faulta(struct as *as, caddr_t addr, size_t size)
1091 1088 {
1092 1089 struct seg *seg;
1093 1090 caddr_t raddr; /* rounded down addr */
1094 1091 size_t rsize; /* rounded up size */
1095 1092 faultcode_t res = 0;
1096 1093 klwp_t *lwp = ttolwp(curthread);
1097 1094
1098 1095 retry:
1099 1096 /*
1100 1097 * Indicate that the lwp is not to be stopped while waiting
1101 1098 * for a pagefault. This is to avoid deadlock while debugging
1102 1099 * a process via /proc over NFS (in particular).
1103 1100 */
1104 1101 if (lwp != NULL)
1105 1102 lwp->lwp_nostop++;
1106 1103
1107 1104 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1108 1105 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1109 1106 (size_t)raddr;
1110 1107
1111 1108 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1112 1109 seg = as_segat(as, raddr);
1113 1110 if (seg == NULL) {
1114 1111 AS_LOCK_EXIT(as, &as->a_lock);
1115 1112 if (lwp != NULL)
1116 1113 lwp->lwp_nostop--;
1117 1114 return (FC_NOMAP);
1118 1115 }
1119 1116
1120 1117 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1121 1118 if (raddr >= seg->s_base + seg->s_size) {
1122 1119 seg = AS_SEGNEXT(as, seg);
1123 1120 if (seg == NULL || raddr != seg->s_base) {
1124 1121 res = FC_NOMAP;
1125 1122 break;
1126 1123 }
1127 1124 }
1128 1125 res = segop_faulta(seg, raddr);
1129 1126 if (res != 0)
1130 1127 break;
1131 1128 }
1132 1129 AS_LOCK_EXIT(as, &as->a_lock);
1133 1130 if (lwp != NULL)
1134 1131 lwp->lwp_nostop--;
1135 1132 /*
1136 1133 * If the lower levels returned EDEADLK for a fault,
1137 1134 * it means that we should retry the fault. Let's wait
1138 1135 * a bit also to let the deadlock causing condition clear.
1139 1136 * This is part of a gross hack to work around a design flaw
1140 1137 * in the ufs/sds logging code and should go away when the
1141 1138 * logging code is re-designed to fix the problem. See bug
1142 1139 * 4125102 for details of the problem.
1143 1140 */
1144 1141 if (FC_ERRNO(res) == EDEADLK) {
1145 1142 delay(deadlk_wait);
1146 1143 res = 0;
1147 1144 goto retry;
1148 1145 }
1149 1146 return (res);
1150 1147 }
1151 1148
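Both fault routines (like most entry points in this file) use the same rounding idiom: round the start down to a page boundary, round the end up, and express the range as (raddr, rsize). A worked instance, assuming PAGESIZE is 0x1000:

    /*
     * addr = 0x12345, size = 0x234   (PAGEOFFSET 0xfff, PAGEMASK ~0xfff)
     * raddr = 0x12345 & ~0xfff                      = 0x12000
     * rsize = ((0x12345 + 0x234 + 0xfff) & ~0xfff) - 0x12000
     *       = (0x13578 & ~0xfff) - 0x12000
     *       = 0x13000 - 0x12000                     = 0x1000
     * i.e. the byte range fits in one page, so one page is processed.
     */
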
1152 1149 /*
1153 1150 * Set the virtual mapping for the interval from [addr : addr + size)
1154 1151 * in address space `as' to have the specified protection.
1155 1152 * It is ok for the range to cross over several segments,
1156 1153 * as long as they are contiguous.
1157 1154 */
1158 1155 int
1159 1156 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1160 1157 {
1161 1158 struct seg *seg;
1162 1159 struct as_callback *cb;
1163 1160 size_t ssize;
1164 1161 caddr_t raddr; /* rounded down addr */
1165 1162 size_t rsize; /* rounded up size */
1166 1163 int error = 0, writer = 0;
1167 1164 caddr_t saveraddr;
1168 1165 size_t saversize;
1169 1166
1170 1167 setprot_top:
1171 1168 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1172 1169 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1173 1170 (size_t)raddr;
1174 1171
1175 1172 if (raddr + rsize < raddr) /* check for wraparound */
1176 1173 return (ENOMEM);
1177 1174
1178 1175 saveraddr = raddr;
1179 1176 saversize = rsize;
1180 1177
1181 1178 /*
1182 1179 * Normally we only lock the as as a reader. But
1183 1180 * if due to setprot the segment driver needs to split
1184 1181 * a segment it will return IE_RETRY. Therefore we re-acquire
1185 1182 * the as lock as a writer so the segment driver can change
1186 1183 * the seg list. Also the segment driver will return IE_RETRY
1187 1184 * after it has changed the segment list so we therefore keep
1188 1185 * locking as a writer. Since these opeartions should be rare
1189 1186 * want to only lock as a writer when necessary.
1190 1187 */
1191 1188 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1192 1189 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1193 1190 } else {
1194 1191 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1195 1192 }
1196 1193
1197 1194 as_clearwatchprot(as, raddr, rsize);
1198 1195 seg = as_segat(as, raddr);
1199 1196 if (seg == NULL) {
1200 1197 as_setwatch(as);
1201 1198 AS_LOCK_EXIT(as, &as->a_lock);
1202 1199 return (ENOMEM);
1203 1200 }
1204 1201
1205 1202 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1206 1203 if (raddr >= seg->s_base + seg->s_size) {
1207 1204 seg = AS_SEGNEXT(as, seg);
1208 1205 if (seg == NULL || raddr != seg->s_base) {
1209 1206 error = ENOMEM;
1210 1207 break;
1211 1208 }
1212 1209 }
1213 1210 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1214 1211 ssize = seg->s_base + seg->s_size - raddr;
1215 1212 else
1216 1213 ssize = rsize;
1217 1214 retry:
1218 1215 error = segop_setprot(seg, raddr, ssize, prot);
1219 1216
1220 1217 if (error == IE_NOMEM) {
1221 1218 error = EAGAIN;
1222 1219 break;
1223 1220 }
1224 1221
1225 1222 if (error == IE_RETRY) {
1226 1223 AS_LOCK_EXIT(as, &as->a_lock);
1227 1224 writer = 1;
1228 1225 goto setprot_top;
1229 1226 }
1230 1227
1231 1228 if (error == EAGAIN) {
1232 1229 /*
1233 1230 * Make sure we have a_lock as writer.
1234 1231 */
1235 1232 if (writer == 0) {
1236 1233 AS_LOCK_EXIT(as, &as->a_lock);
1237 1234 writer = 1;
1238 1235 goto setprot_top;
1239 1236 }
1240 1237
1241 1238 /*
1242 1239 * Memory is currently locked. It must be unlocked
1243 1240 * before this operation can succeed through a retry.
1244 1241 * The possible reasons for locked memory and
1245 1242 * corresponding strategies for unlocking are:
1246 1243 * (1) Normal I/O
1247 1244 * wait for a signal that the I/O operation
1248 1245 * has completed and the memory is unlocked.
1249 1246 * (2) Asynchronous I/O
1250 1247 * The aio subsystem does not unlock pages when
1251 1248 * the I/O is completed. Those pages are unlocked
1252 1249 * when the application calls aiowait/aioerror.
1253 1250 * So, to prevent blocking forever, cv_broadcast()
1254 1251 * is done to wake up aio_cleanup_thread.
1255 1252 * Subsequently, segvn_reclaim will be called, and
1256 1253 * that will do AS_CLRUNMAPWAIT() and wake us up.
1257 1254 * (3) Long term page locking:
1258 1255 * Drivers intending to have pages locked for a
1259 1256 * period considerably longer than for normal I/O
1260 1257 * (essentially forever) may have registered for a
1261 1258 * callback so they may unlock these pages on
1262 1259 * request. This is needed to allow this operation
1263 1260 * to succeed. Each entry on the callback list is
1264 1261 * examined. If the event or address range pertains
1265 1262 * the callback is invoked (unless it already is in
1266 1263 * progress). The a_contents lock must be dropped
1267 1264 * before the callback, so only one callback can
1268 1265 * be done at a time. Go to the top and do more
1269 1266 * until zero is returned. If zero is returned,
1270 1267 * either there were no callbacks for this event
1271 1268 * or they were already in progress.
1272 1269 */
1273 1270 mutex_enter(&as->a_contents);
1274 1271 if (as->a_callbacks &&
1275 1272 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1276 1273 seg->s_base, seg->s_size))) {
1277 1274 AS_LOCK_EXIT(as, &as->a_lock);
1278 1275 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1279 1276 } else if (!AS_ISNOUNMAPWAIT(as)) {
1280 1277 if (AS_ISUNMAPWAIT(as) == 0)
1281 1278 cv_broadcast(&as->a_cv);
1282 1279 AS_SETUNMAPWAIT(as);
1283 1280 AS_LOCK_EXIT(as, &as->a_lock);
1284 1281 while (AS_ISUNMAPWAIT(as))
1285 1282 cv_wait(&as->a_cv, &as->a_contents);
1286 1283 } else {
1287 1284 /*
1288 1285 * We may have raced with
1289 1286 * segvn_reclaim()/segspt_reclaim(). In this
1290 1287 * case clean nounmapwait flag and retry since
1291 1288 * softlockcnt in this segment may be already
1292 1289 * 0. We don't drop as writer lock so our
1293 1290 * number of retries without sleeping should
1294 1291 * be very small. See segvn_reclaim() for
1295 1292 * more comments.
1296 1293 */
1297 1294 AS_CLRNOUNMAPWAIT(as);
1298 1295 mutex_exit(&as->a_contents);
1299 1296 goto retry;
1300 1297 }
1301 1298 mutex_exit(&as->a_contents);
1302 1299 goto setprot_top;
1303 1300 } else if (error != 0)
1304 1301 break;
1305 1302 }
1306 1303 if (error != 0) {
1307 1304 as_setwatch(as);
1308 1305 } else {
1309 1306 as_setwatchprot(as, saveraddr, saversize, prot);
1310 1307 }
1311 1308 AS_LOCK_EXIT(as, &as->a_lock);
1312 1309 return (error);
1313 1310 }
1314 1311
1315 1312 /*
1316 1313 * Check to make sure that the interval [addr, addr + size)
1317 1314 * in address space `as' has at least the specified protection.
1318 1315 * It is ok for the range to cross over several segments, as long
1319 1316 * as they are contiguous.
1320 1317 */
1321 1318 int
1322 1319 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1323 1320 {
1324 1321 struct seg *seg;
1325 1322 size_t ssize;
1326 1323 caddr_t raddr; /* rounded down addr */
1327 1324 size_t rsize; /* rounded up size */
1328 1325 int error = 0;
1329 1326
1330 1327 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1331 1328 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1332 1329 (size_t)raddr;
1333 1330
1334 1331 if (raddr + rsize < raddr) /* check for wraparound */
1335 1332 return (ENOMEM);
1336 1333
1337 1334 /*
1338 1335 * This is ugly as sin...
1339 1336 * Normally, we only acquire the address space readers lock.
1340 1337 * However, if the address space has watchpoints present,
1341 1338 * we must acquire the writer lock on the address space for
1342 1339 * the benefit of as_clearwatchprot() and as_setwatchprot().
1343 1340 */
1344 1341 if (avl_numnodes(&as->a_wpage) != 0)
1345 1342 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1346 1343 else
1347 1344 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1348 1345 as_clearwatchprot(as, raddr, rsize);
1349 1346 seg = as_segat(as, raddr);
1350 1347 if (seg == NULL) {
1351 1348 as_setwatch(as);
1352 1349 AS_LOCK_EXIT(as, &as->a_lock);
1353 1350 return (ENOMEM);
1354 1351 }
1355 1352
1356 1353 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1357 1354 if (raddr >= seg->s_base + seg->s_size) {
1358 1355 seg = AS_SEGNEXT(as, seg);
1359 1356 if (seg == NULL || raddr != seg->s_base) {
1360 1357 error = ENOMEM;
1361 1358 break;
1362 1359 }
1363 1360 }
1364 1361 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1365 1362 ssize = seg->s_base + seg->s_size - raddr;
1366 1363 else
1367 1364 ssize = rsize;
1368 1365
1369 1366 error = segop_checkprot(seg, raddr, ssize, prot);
1370 1367 if (error != 0)
1371 1368 break;
1372 1369 }
1373 1370 as_setwatch(as);
1374 1371 AS_LOCK_EXIT(as, &as->a_lock);
1375 1372 return (error);
1376 1373 }
1377 1374
1378 1375 int
1379 1376 as_unmap(struct as *as, caddr_t addr, size_t size)
1380 1377 {
1381 1378 struct seg *seg, *seg_next;
1382 1379 struct as_callback *cb;
1383 1380 caddr_t raddr, eaddr;
1384 1381 size_t ssize, rsize = 0;
1385 1382 int err;
1386 1383
1387 1384 top:
1388 1385 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1389 1386 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1390 1387 (uintptr_t)PAGEMASK);
1391 1388
1392 1389 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1393 1390
1394 1391 as->a_updatedir = 1; /* inform /proc */
1395 1392 gethrestime(&as->a_updatetime);
1396 1393
1397 1394 /*
1398 1395 * Use as_findseg to find the first segment in the range, then
1399 1396 * step through the segments in order, following s_next.
1400 1397 */
1401 1398 as_clearwatchprot(as, raddr, eaddr - raddr);
1402 1399
1403 1400 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1404 1401 if (eaddr <= seg->s_base)
1405 1402 break; /* eaddr was in a gap; all done */
1406 1403
1407 1404 /* this is implied by the test above */
1408 1405 ASSERT(raddr < eaddr);
1409 1406
1410 1407 if (raddr < seg->s_base)
1411 1408 raddr = seg->s_base; /* raddr was in a gap */
1412 1409
1413 1410 if (eaddr > (seg->s_base + seg->s_size))
1414 1411 ssize = seg->s_base + seg->s_size - raddr;
1415 1412 else
1416 1413 ssize = eaddr - raddr;
1417 1414
1418 1415 /*
1419 1416 * Save next segment pointer since seg can be
1420 1417 * destroyed during the segment unmap operation.
1421 1418 */
1422 1419 seg_next = AS_SEGNEXT(as, seg);
1423 1420
1424 1421 /*
1425 1422 * We didn't count /dev/null mappings, so ignore them here.
1426 1423 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1427 1424 * we have to do this check here while we have seg.)
1428 1425 */
1429 1426 rsize = 0;
1430 1427 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1431 1428 !SEG_IS_PARTIAL_RESV(seg))
1432 1429 rsize = ssize;
1433 1430
1434 1431 retry:
1435 1432 err = segop_unmap(seg, raddr, ssize);
1436 1433 if (err == EAGAIN) {
1437 1434 /*
1438 1435 * Memory is currently locked. It must be unlocked
1439 1436 * before this operation can succeed through a retry.
1440 1437 * The possible reasons for locked memory and
1441 1438 * corresponding strategies for unlocking are:
1442 1439 * (1) Normal I/O
1443 1440 * wait for a signal that the I/O operation
1444 1441 * has completed and the memory is unlocked.
1445 1442 * (2) Asynchronous I/O
1446 1443 * The aio subsystem does not unlock pages when
1447 1444 * the I/O is completed. Those pages are unlocked
1448 1445 * when the application calls aiowait/aioerror.
1449 1446 * So, to prevent blocking forever, cv_broadcast()
1450 1447 * is done to wake up aio_cleanup_thread.
1451 1448 * Subsequently, segvn_reclaim will be called, and
1452 1449 * that will do AS_CLRUNMAPWAIT() and wake us up.
1453 1450 * (3) Long term page locking:
1454 1451 * Drivers intending to have pages locked for a
1455 1452 * period considerably longer than for normal I/O
1456 1453 * (essentially forever) may have registered for a
1457 1454 * callback so they may unlock these pages on
1458 1455 * request. This is needed to allow this operation
1459 1456 * to succeed. Each entry on the callback list is
1460 1457 * examined. If the event or address range pertains
1461 1458 * the callback is invoked (unless it already is in
1462 1459 * progress). The a_contents lock must be dropped
1463 1460 * before the callback, so only one callback can
1464 1461 * be done at a time. Go to the top and do more
1465 1462 * until zero is returned. If zero is returned,
1466 1463 * either there were no callbacks for this event
1467 1464 * or they were already in progress.
1468 1465 */
1469 1466 mutex_enter(&as->a_contents);
1470 1467 if (as->a_callbacks &&
1471 1468 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1472 1469 seg->s_base, seg->s_size))) {
1473 1470 AS_LOCK_EXIT(as, &as->a_lock);
1474 1471 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1475 1472 } else if (!AS_ISNOUNMAPWAIT(as)) {
1476 1473 if (AS_ISUNMAPWAIT(as) == 0)
1477 1474 cv_broadcast(&as->a_cv);
1478 1475 AS_SETUNMAPWAIT(as);
1479 1476 AS_LOCK_EXIT(as, &as->a_lock);
1480 1477 while (AS_ISUNMAPWAIT(as))
1481 1478 cv_wait(&as->a_cv, &as->a_contents);
1482 1479 } else {
1483 1480 /*
1484 1481 * We may have raced with
1485 1482 * segvn_reclaim()/segspt_reclaim(). In this
1486 1483 * case clean nounmapwait flag and retry since
1487 1484 * softlockcnt in this segment may be already
1488 1485 * 0. We don't drop as writer lock so our
1489 1486 * number of retries without sleeping should
1490 1487 * be very small. See segvn_reclaim() for
1491 1488 * more comments.
1492 1489 */
1493 1490 AS_CLRNOUNMAPWAIT(as);
1494 1491 mutex_exit(&as->a_contents);
1495 1492 goto retry;
1496 1493 }
1497 1494 mutex_exit(&as->a_contents);
1498 1495 goto top;
1499 1496 } else if (err == IE_RETRY) {
1500 1497 AS_LOCK_EXIT(as, &as->a_lock);
1501 1498 goto top;
1502 1499 } else if (err) {
1503 1500 as_setwatch(as);
1504 1501 AS_LOCK_EXIT(as, &as->a_lock);
1505 1502 return (-1);
1506 1503 }
1507 1504
1508 1505 as->a_size -= ssize;
1509 1506 if (rsize)
1510 1507 as->a_resvsize -= rsize;
1511 1508 raddr += ssize;
1512 1509 }
1513 1510 AS_LOCK_EXIT(as, &as->a_lock);
1514 1511 return (0);
1515 1512 }
1516 1513
1517 1514 static int
1518 1515 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1519 1516 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1520 1517 {
1521 1518 uint_t szc;
1522 1519 uint_t nszc;
1523 1520 int error;
1524 1521 caddr_t a;
1525 1522 caddr_t eaddr;
1526 1523 size_t segsize;
1527 1524 struct seg *seg;
1528 1525 size_t pgsz;
1529 1526 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1530 1527 uint_t save_szcvec;
1531 1528
1532 1529 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1533 1530 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1534 1531 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1535 1532 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1536 1533 if (!do_off) {
1537 1534 vn_a->offset = 0;
1538 1535 }
1539 1536
1540 1537 if (szcvec <= 1) {
1541 1538 seg = seg_alloc(as, addr, size);
1542 1539 if (seg == NULL) {
1543 1540 return (ENOMEM);
1544 1541 }
1545 1542 vn_a->szc = 0;
1546 1543 error = (*crfp)(seg, vn_a);
1547 1544 if (error != 0) {
1548 1545 seg_free(seg);
1549 1546 } else {
1550 1547 as->a_size += size;
1551 1548 as->a_resvsize += size;
1552 1549 }
1553 1550 return (error);
1554 1551 }
1555 1552
1556 1553 eaddr = addr + size;
1557 1554 save_szcvec = szcvec;
1558 1555 szcvec >>= 1;
1559 1556 szc = 0;
1560 1557 nszc = 0;
1561 1558 while (szcvec) {
1562 1559 if ((szcvec & 0x1) == 0) {
1563 1560 nszc++;
1564 1561 szcvec >>= 1;
1565 1562 continue;
1566 1563 }
1567 1564 nszc++;
1568 1565 pgsz = page_get_pagesize(nszc);
1569 1566 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1570 1567 if (a != addr) {
1571 1568 ASSERT(a < eaddr);
1572 1569 segsize = a - addr;
1573 1570 seg = seg_alloc(as, addr, segsize);
1574 1571 if (seg == NULL) {
1575 1572 return (ENOMEM);
1576 1573 }
1577 1574 vn_a->szc = szc;
1578 1575 error = (*crfp)(seg, vn_a);
1579 1576 if (error != 0) {
1580 1577 seg_free(seg);
1581 1578 return (error);
1582 1579 }
1583 1580 as->a_size += segsize;
1584 1581 as->a_resvsize += segsize;
1585 1582 *segcreated = 1;
1586 1583 if (do_off) {
1587 1584 vn_a->offset += segsize;
1588 1585 }
1589 1586 addr = a;
1590 1587 }
1591 1588 szc = nszc;
1592 1589 szcvec >>= 1;
1593 1590 }
1594 1591
1595 1592 ASSERT(addr < eaddr);
1596 1593 szcvec = save_szcvec | 1; /* add 8K pages */
1597 1594 while (szcvec) {
1598 1595 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1599 1596 ASSERT(a >= addr);
1600 1597 if (a != addr) {
1601 1598 segsize = a - addr;
1602 1599 seg = seg_alloc(as, addr, segsize);
1603 1600 if (seg == NULL) {
1604 1601 return (ENOMEM);
1605 1602 }
1606 1603 vn_a->szc = szc;
1607 1604 error = (*crfp)(seg, vn_a);
1608 1605 if (error != 0) {
1609 1606 seg_free(seg);
1610 1607 return (error);
1611 1608 }
1612 1609 as->a_size += segsize;
1613 1610 as->a_resvsize += segsize;
1614 1611 *segcreated = 1;
1615 1612 if (do_off) {
1616 1613 vn_a->offset += segsize;
1617 1614 }
1618 1615 addr = a;
1619 1616 }
1620 1617 szcvec &= ~(1 << szc);
1621 1618 if (szcvec) {
1622 1619 szc = highbit(szcvec) - 1;
1623 1620 pgsz = page_get_pagesize(szc);
1624 1621 }
1625 1622 }
1626 1623 ASSERT(addr == eaddr);
1627 1624
1628 1625 return (0);
1629 1626 }
1630 1627
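szcvec is a bitmask of permissible page-size codes for the mapping: bit n set means pages of size page_get_pagesize(n) may be used. The two loops above first walk alignment upward through successively larger codes, then carve the remainder back down. A small decoder, as a sketch:

    /* Enumerate the page sizes encoded in a size-code vector. */
    uint_t vec = szcvec;
    while (vec != 0) {
            uint_t szc = highbit(vec) - 1;  /* largest remaining code */
            size_t pgsz = page_get_pagesize(szc);
            /* ... pgsz is a candidate page size for this range ... */
            vec &= ~(1U << szc);
    }
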
1631 1628 static int
1632 1629 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1633 1630 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1634 1631 {
1635 1632 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1636 1633 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1637 1634 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1638 1635 type, 0);
1639 1636 int error;
1640 1637 struct seg *seg;
1641 1638 struct vattr va;
1642 1639 u_offset_t eoff;
1643 1640 size_t save_size = 0;
1644 1641 extern size_t textrepl_size_thresh;
1645 1642
1646 1643 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1647 1644 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648 1645 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649 1646 ASSERT(vn_a->vp != NULL);
1650 1647 ASSERT(vn_a->amp == NULL);
1651 1648
1652 1649 again:
1653 1650 if (szcvec <= 1) {
1654 1651 seg = seg_alloc(as, addr, size);
1655 1652 if (seg == NULL) {
1656 1653 return (ENOMEM);
1657 1654 }
1658 1655 vn_a->szc = 0;
1659 1656 error = (*crfp)(seg, vn_a);
1660 1657 if (error != 0) {
1661 1658 seg_free(seg);
1662 1659 } else {
1663 1660 as->a_size += size;
1664 1661 as->a_resvsize += size;
1665 1662 }
1666 1663 return (error);
1667 1664 }
1668 1665
1669 1666 va.va_mask = AT_SIZE;
1670 1667 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1671 1668 szcvec = 0;
1672 1669 goto again;
1673 1670 }
1674 1671 eoff = vn_a->offset & PAGEMASK;
1675 1672 if (eoff >= va.va_size) {
1676 1673 szcvec = 0;
1677 1674 goto again;
1678 1675 }
1679 1676 eoff += size;
1680 1677 if (btopr(va.va_size) < btopr(eoff)) {
1681 1678 save_size = size;
1682 1679 size = va.va_size - (vn_a->offset & PAGEMASK);
1683 1680 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1684 1681 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1685 1682 type, 0);
1686 1683 if (szcvec <= 1) {
1687 1684 size = save_size;
1688 1685 goto again;
1689 1686 }
1690 1687 }
1691 1688
1692 1689 if (size > textrepl_size_thresh) {
1693 1690 vn_a->flags |= _MAP_TEXTREPL;
1694 1691 }
1695 1692 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1696 1693 segcreated);
1697 1694 if (error != 0) {
1698 1695 return (error);
1699 1696 }
1700 1697 if (save_size) {
1701 1698 addr += size;
1702 1699 size = save_size - size;
1703 1700 szcvec = 0;
1704 1701 goto again;
1705 1702 }
1706 1703 return (0);
1707 1704 }
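
A hedged worked example of the save_size split above: mapping 1M of a ~100K file at offset 0 makes btopr(eoff) exceed btopr(va_size), so size is clipped to P2ROUNDUP(va_size, PAGESIZE) for the large-page pass, and the save_size retry then maps the remainder past EOF with szcvec = 0, i.e. base pages only.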
1708 1705
1709 1706 /*
1710 1707 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1711 1708  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.

1712 1709 */
1713 1710 static int
1714 1711 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1715 1712 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1716 1713 {
1717 1714 uint_t szcvec;
1718 1715 uchar_t type;
1719 1716
1720 1717 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1721 1718 if (vn_a->type == MAP_SHARED) {
1722 1719 type = MAPPGSZC_SHM;
1723 1720 } else if (vn_a->type == MAP_PRIVATE) {
1724 1721 if (vn_a->szc == AS_MAP_HEAP) {
1725 1722 type = MAPPGSZC_HEAP;
1726 1723 } else if (vn_a->szc == AS_MAP_STACK) {
1727 1724 type = MAPPGSZC_STACK;
1728 1725 } else {
1729 1726 type = MAPPGSZC_PRIVM;
1730 1727 }
1731 1728 }
1732 1729 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1733 1730 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1734 1731 (vn_a->flags & MAP_TEXT), type, 0);
1735 1732 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1736 1733 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1737 1734 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1738 1735 ASSERT(vn_a->vp == NULL);
1739 1736
1740 1737 return (as_map_segvn_segs(as, addr, size, szcvec,
1741 1738 crfp, vn_a, segcreated));
1742 1739 }
1743 1740
1744 1741 int
1745 1742 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1746 1743 {
1747 1744 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1748 1745 return (as_map_locked(as, addr, size, crfp, argsp));
1749 1746 }
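
For orientation, a minimal sketch of a typical caller (names hedged: zfod_argsp is the usual zero-fill-on-demand segvn_crargs template exposed via <vm/seg_vn.h>):

    #include <vm/as.h>
    #include <vm/seg_vn.h>

    static int
    map_anon_zfod(struct as *as, caddr_t addr, size_t len)
    {
            /*
             * as_map() takes a_lock as writer and forwards to
             * as_map_locked(); anonymous segvn arguments route
             * through the AS_MAP_CHECK_ANON_LPOOB() path below.
             */
            return (as_map(as, addr, len, segvn_create, zfod_argsp));
    }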
1750 1747
1751 1748 int
1752 1749 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1753 1750 void *argsp)
1754 1751 {
1755 1752 struct seg *seg = NULL;
1756 1753 caddr_t raddr; /* rounded down addr */
1757 1754 size_t rsize; /* rounded up size */
1758 1755 int error;
1759 1756 int unmap = 0;
1760 1757 struct proc *p = curproc;
1761 1758 struct segvn_crargs crargs;
1762 1759
1763 1760 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1764 1761 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1765 1762 (size_t)raddr;
1766 1763
1767 1764 /*
1768 1765  * check for wraparound
1769 1766 */
1770 1767 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1771 1768 AS_LOCK_EXIT(as, &as->a_lock);
1772 1769 return (ENOMEM);
1773 1770 }
1774 1771
1775 1772 as->a_updatedir = 1; /* inform /proc */
1776 1773 gethrestime(&as->a_updatetime);
1777 1774
1778 1775 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1779 1776 AS_LOCK_EXIT(as, &as->a_lock);
1780 1777
1781 1778 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1782 1779 RCA_UNSAFE_ALL);
1783 1780
1784 1781 return (ENOMEM);
1785 1782 }
1786 1783
1787 1784 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1788 1785 crargs = *(struct segvn_crargs *)argsp;
1789 1786 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1790 1787 if (error != 0) {
1791 1788 AS_LOCK_EXIT(as, &as->a_lock);
1792 1789 if (unmap) {
1793 1790 (void) as_unmap(as, addr, size);
1794 1791 }
1795 1792 return (error);
1796 1793 }
1797 1794 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1798 1795 crargs = *(struct segvn_crargs *)argsp;
1799 1796 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1800 1797 if (error != 0) {
1801 1798 AS_LOCK_EXIT(as, &as->a_lock);
1802 1799 if (unmap) {
1803 1800 (void) as_unmap(as, addr, size);
1804 1801 }
1805 1802 return (error);
1806 1803 }
1807 1804 } else {
1808 1805 seg = seg_alloc(as, addr, size);
1809 1806 if (seg == NULL) {
1810 1807 AS_LOCK_EXIT(as, &as->a_lock);
1811 1808 return (ENOMEM);
1812 1809 }
1813 1810
1814 1811 error = (*crfp)(seg, argsp);
1815 1812 if (error != 0) {
1816 1813 seg_free(seg);
1817 1814 AS_LOCK_EXIT(as, &as->a_lock);
1818 1815 return (error);
1819 1816 }
1820 1817 /*
1821 1818 * Add size now so as_unmap will work if as_ctl fails.
1822 1819 */
1823 1820 as->a_size += rsize;
1824 1821 as->a_resvsize += rsize;
1825 1822 }
1826 1823
1827 1824 as_setwatch(as);
1828 1825
1829 1826 /*
1830 1827 * If the address space is locked,
1831 1828 * establish memory locks for the new segment.
1832 1829 */
1833 1830 mutex_enter(&as->a_contents);
1834 1831 if (AS_ISPGLCK(as)) {
1835 1832 mutex_exit(&as->a_contents);
1836 1833 AS_LOCK_EXIT(as, &as->a_lock);
1837 1834 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1838 1835 if (error != 0)
1839 1836 (void) as_unmap(as, addr, size);
1840 1837 } else {
1841 1838 mutex_exit(&as->a_contents);
1842 1839 AS_LOCK_EXIT(as, &as->a_lock);
1843 1840 }
1844 1841 return (error);
1845 1842 }
1846 1843
1847 1844
1848 1845 /*
1849 1846 * Delete all segments in the address space marked with S_PURGE.
1850 1847 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1851 1848 * These segments are deleted as a first step before calls to as_gap(), so
1852 1849 * that they don't affect mmap() or shmat().
1853 1850 */
1854 1851 void
1855 1852 as_purge(struct as *as)
1856 1853 {
1857 1854 struct seg *seg;
1858 1855 struct seg *next_seg;
1859 1856
1860 1857 /*
1861 1858  * the setting of NEEDSPURGE is protected by as_rangelock(), so
1862 1859 * no need to grab a_contents mutex for this check
1863 1860 */
1864 1861 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1865 1862 return;
1866 1863
1867 1864 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1868 1865 next_seg = NULL;
1869 1866 seg = AS_SEGFIRST(as);
1870 1867 while (seg != NULL) {
1871 1868 next_seg = AS_SEGNEXT(as, seg);
1872 1869 if (seg->s_flags & S_PURGE)
1873 1870 (void) segop_unmap(seg, seg->s_base, seg->s_size);
1874 1871 seg = next_seg;
1875 1872 }
1876 1873 AS_LOCK_EXIT(as, &as->a_lock);
1877 1874
1878 1875 mutex_enter(&as->a_contents);
1879 1876 as->a_flags &= ~AS_NEEDSPURGE;
1880 1877 mutex_exit(&as->a_contents);
1881 1878 }
1882 1879
1883 1880 /*
1884 1881 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1885 1882 * range of addresses at least "minlen" long, where the base of the range is
1886 1883 * at "off" phase from an "align" boundary and there is space for a
1887 1884 * "redzone"-sized redzone on eithe rside of the range. Thus,
1888 1885 * if align was 4M and off was 16k, the user wants a hole which will start
1889 1886 * 16k into a 4M page.
1890 1887 *
1891 1888 * If flags specifies AH_HI, the hole will have the highest possible address
1892 1889 * in the range. We use the as->a_lastgap field to figure out where to
1893 1890 * start looking for a gap.
1894 1891 *
1895 1892 * Otherwise, the gap will have the lowest possible address.
1896 1893 *
1897 1894 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1898 1895 *
1899 1896 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1900 1897 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1901 1898 *
1902 1899 * NOTE: This routine is not correct when base+len overflows caddr_t.
1903 1900 */
1904 1901 int
1905 1902 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1906 1903 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1907 1904 {
1908 1905 caddr_t lobound = *basep;
1909 1906 caddr_t hibound = lobound + *lenp;
1910 1907 struct seg *lseg, *hseg;
1911 1908 caddr_t lo, hi;
1912 1909 int forward;
1913 1910 caddr_t save_base;
1914 1911 size_t save_len;
1915 1912 size_t save_minlen;
1916 1913 size_t save_redzone;
1917 1914 int fast_path = 1;
1918 1915
1919 1916 save_base = *basep;
1920 1917 save_len = *lenp;
1921 1918 save_minlen = minlen;
1922 1919 save_redzone = redzone;
1923 1920
1924 1921 /*
1925 1922 * For the first pass/fast_path, just add align and redzone into
1926 1923 * minlen since if we get an allocation, we can guarantee that it
1927 1924 * will fit the alignment and redzone requested.
1928 1925 * This increases the chance that hibound will be adjusted to
1929 1926 * a_lastgap->s_base which will likely allow us to find an
1930 1927 * acceptable hole in the address space quicker.
1931 1928 * If we can't find a hole with this fast_path, then we look for
1932 1929 * smaller holes in which the alignment and offset may allow
1933 1930 * the allocation to fit.
1934 1931 */
1935 1932 minlen += align;
1936 1933 minlen += 2 * redzone;
1937 1934 redzone = 0;
1938 1935
1939 1936 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1940 1937 if (AS_SEGFIRST(as) == NULL) {
1941 1938 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1942 1939 align, redzone, off)) {
1943 1940 AS_LOCK_EXIT(as, &as->a_lock);
1944 1941 return (0);
1945 1942 } else {
1946 1943 AS_LOCK_EXIT(as, &as->a_lock);
1947 1944 *basep = save_base;
1948 1945 *lenp = save_len;
1949 1946 return (-1);
1950 1947 }
1951 1948 }
1952 1949
1953 1950 retry:
1954 1951 /*
1955 1952 * Set up to iterate over all the inter-segment holes in the given
1956 1953 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1957 1954 * NULL for the highest-addressed hole. If moving backwards, we reset
1958 1955  * hseg to denote the highest-addressed segment.
1959 1956 */
1960 1957 forward = (flags & AH_DIR) == AH_LO;
1961 1958 if (forward) {
1962 1959 hseg = as_findseg(as, lobound, 1);
1963 1960 lseg = AS_SEGPREV(as, hseg);
1964 1961 } else {
1965 1962
1966 1963 /*
1967 1964 * If allocating at least as much as the last allocation,
1968 1965 * use a_lastgap's base as a better estimate of hibound.
1969 1966 */
1970 1967 if (as->a_lastgap &&
1971 1968 minlen >= as->a_lastgap->s_size &&
1972 1969 hibound >= as->a_lastgap->s_base)
1973 1970 hibound = as->a_lastgap->s_base;
1974 1971
1975 1972 hseg = as_findseg(as, hibound, 1);
1976 1973 if (hseg->s_base + hseg->s_size < hibound) {
1977 1974 lseg = hseg;
1978 1975 hseg = NULL;
1979 1976 } else {
1980 1977 lseg = AS_SEGPREV(as, hseg);
1981 1978 }
1982 1979 }
1983 1980
1984 1981 for (;;) {
1985 1982 /*
1986 1983 * Set lo and hi to the hole's boundaries. (We should really
1987 1984 * use MAXADDR in place of hibound in the expression below,
1988 1985 * but can't express it easily; using hibound in its place is
1989 1986 * harmless.)
1990 1987 */
1991 1988 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1992 1989 hi = (hseg == NULL) ? hibound : hseg->s_base;
1993 1990 /*
1994 1991 * If the iteration has moved past the interval from lobound
1995 1992 * to hibound it's pointless to continue.
1996 1993 */
1997 1994 if ((forward && lo > hibound) || (!forward && hi < lobound))
1998 1995 break;
1999 1996 else if (lo > hibound || hi < lobound)
2000 1997 goto cont;
2001 1998 /*
2002 1999 * Candidate hole lies at least partially within the allowable
2003 2000 * range. Restrict it to fall completely within that range,
2004 2001 * i.e., to [max(lo, lobound), min(hi, hibound)].
2005 2002 */
2006 2003 if (lo < lobound)
2007 2004 lo = lobound;
2008 2005 if (hi > hibound)
2009 2006 hi = hibound;
2010 2007 /*
2011 2008 * Verify that the candidate hole is big enough and meets
2012 2009 * hardware constraints. If the hole is too small, no need
2013 2010 * to do the further checks since they will fail.
2014 2011 */
2015 2012 *basep = lo;
2016 2013 *lenp = hi - lo;
2017 2014 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
2018 2015 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
2019 2016 ((flags & AH_CONTAIN) == 0 ||
2020 2017 (*basep <= addr && *basep + *lenp > addr))) {
2021 2018 if (!forward)
2022 2019 as->a_lastgap = hseg;
2023 2020 if (hseg != NULL)
2024 2021 as->a_lastgaphl = hseg;
2025 2022 else
2026 2023 as->a_lastgaphl = lseg;
2027 2024 AS_LOCK_EXIT(as, &as->a_lock);
2028 2025 return (0);
2029 2026 }
2030 2027 cont:
2031 2028 /*
2032 2029 * Move to the next hole.
2033 2030 */
2034 2031 if (forward) {
2035 2032 lseg = hseg;
2036 2033 if (lseg == NULL)
2037 2034 break;
2038 2035 hseg = AS_SEGNEXT(as, hseg);
2039 2036 } else {
2040 2037 hseg = lseg;
2041 2038 if (hseg == NULL)
2042 2039 break;
2043 2040 lseg = AS_SEGPREV(as, lseg);
2044 2041 }
2045 2042 }
2046 2043 if (fast_path && (align != 0 || save_redzone != 0)) {
2047 2044 fast_path = 0;
2048 2045 minlen = save_minlen;
2049 2046 redzone = save_redzone;
2050 2047 goto retry;
2051 2048 }
2052 2049 *basep = save_base;
2053 2050 *lenp = save_len;
2054 2051 AS_LOCK_EXIT(as, &as->a_lock);
2055 2052 return (-1);
2056 2053 }
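
A hedged call sketch matching the 4M/16K example in the block comment above (flag and size values are illustrative):

    static int
    find_4m_phase_hole(struct as *as, caddr_t *basep, size_t *lenp)
    {
            /*
             * minlen 1M, prefer the highest address, 4M alignment with
             * a 16K phase, no redzone, no AH_CONTAIN address.
             */
            return (as_gap_aligned(as, 0x100000, basep, lenp, AH_HI,
                NULL, 0x400000, 0, 0x4000));
    }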
2057 2054
2058 2055 /*
2059 2056 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2060 2057 *
2061 2058 * If flags specifies AH_HI, the hole will have the highest possible address
2062 2059 * in the range. We use the as->a_lastgap field to figure out where to
2063 2060 * start looking for a gap.
2064 2061 *
2065 2062 * Otherwise, the gap will have the lowest possible address.
2066 2063 *
2067 2064 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2068 2065 *
2069 2066 * If an adequate hole is found, base and len are set to reflect the part of
2070 2067 * the hole that is within range, and 0 is returned, otherwise,
2071 2068 * -1 is returned.
2072 2069 *
2073 2070 * NOTE: This routine is not correct when base+len overflows caddr_t.
2074 2071 */
2075 2072 int
2076 2073 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2077 2074 caddr_t addr)
2078 2075 {
2079 2076
2080 2077 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2081 2078 }
2082 2079
2083 2080 /*
2084 2081 * Return the next range within [base, base + len) that is backed
2085 2082 * with "real memory". Skip holes and non-seg_vn segments.
2086 2083 * We're lazy and only return one segment at a time.
2087 2084 */
2088 2085 int
2089 2086 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2090 2087 {
2091 - extern struct seg_ops segspt_shmops; /* needs a header file */
2088 + extern const struct seg_ops segspt_shmops; /* needs a header file */
2092 2089 struct seg *seg;
2093 2090 caddr_t addr, eaddr;
2094 2091 caddr_t segend;
2095 2092
2096 2093 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2097 2094
2098 2095 addr = *basep;
2099 2096 eaddr = addr + *lenp;
2100 2097
2101 2098 seg = as_findseg(as, addr, 0);
2102 2099 if (seg != NULL)
2103 2100 addr = MAX(seg->s_base, addr);
2104 2101
2105 2102 for (;;) {
2106 2103 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2107 2104 AS_LOCK_EXIT(as, &as->a_lock);
2108 2105 return (EINVAL);
2109 2106 }
2110 2107
2111 2108 if (seg->s_ops == &segvn_ops) {
2112 2109 segend = seg->s_base + seg->s_size;
2113 2110 break;
2114 2111 }
2115 2112
2116 2113 /*
2117 2114 * We do ISM by looking into the private data
2118 2115 * to determine the real size of the segment.
2119 2116 */
2120 2117 if (seg->s_ops == &segspt_shmops) {
2121 2118 segend = seg->s_base + spt_realsize(seg);
2122 2119 if (addr < segend)
2123 2120 break;
2124 2121 }
2125 2122
2126 2123 seg = AS_SEGNEXT(as, seg);
2127 2124
2128 2125 if (seg != NULL)
2129 2126 addr = seg->s_base;
2130 2127 }
2131 2128
2132 2129 *basep = addr;
2133 2130
2134 2131 if (segend > eaddr)
2135 2132 *lenp = eaddr - addr;
2136 2133 else
2137 2134 *lenp = segend - addr;
2138 2135
2139 2136 AS_LOCK_EXIT(as, &as->a_lock);
2140 2137 return (0);
2141 2138 }
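
Since as_memory() returns at most one segment per call, a caller walks a range by re-invoking it, as in this hedged sketch:

    static size_t
    real_memory_bytes(struct as *as, caddr_t addr, size_t len)
    {
            caddr_t base = addr;
            caddr_t end = addr + len;
            size_t rlen = len;
            size_t total = 0;

            /* each successful call yields at most one segment's worth */
            while (as_memory(as, &base, &rlen) == 0) {
                    total += rlen;
                    base += rlen;
                    if (base >= end)
                            break;
                    rlen = end - base;
            }
            return (total);
    }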
2142 2139
2143 2140 /*
2144 2141 * Swap the pages associated with the address space as out to
2145 2142 * secondary storage, returning the number of bytes actually
2146 2143 * swapped.
2147 2144 *
2148 2145 * The value returned is intended to correlate well with the process's
2149 2146 * memory requirements. Its usefulness for this purpose depends on
2150 2147 * how well the segment-level routines do at returning accurate
2151 2148 * information.
2152 2149 */
2153 2150 size_t
2154 2151 as_swapout(struct as *as)
2155 2152 {
2156 2153 struct seg *seg;
2157 2154 size_t swpcnt = 0;
2158 2155
2159 2156 /*
2160 2157 * Kernel-only processes have given up their address
2161 2158 * spaces. Of course, we shouldn't be attempting to
2162 2159 * swap out such processes in the first place...
2163 2160 */
2164 2161 if (as == NULL)
2165 2162 return (0);
2166 2163
2167 2164 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2168 2165
2169 2166 /* Prevent XHATs from attaching */
2170 2167 mutex_enter(&as->a_contents);
2171 2168 AS_SETBUSY(as);
2172 2169 mutex_exit(&as->a_contents);
2173 2170
2174 2171
2175 2172 /*
2176 2173 * Free all mapping resources associated with the address
2177 2174 * space. The segment-level swapout routines capitalize
2178 2175  * on this unmapping by scavenging pages that have become
2179 2176 * unmapped here.
2180 2177 */
2181 2178 hat_swapout(as->a_hat);
2182 2179 if (as->a_xhat != NULL)
2183 2180 xhat_swapout_all(as);
2184 2181
2185 2182 mutex_enter(&as->a_contents);
2186 2183 AS_CLRBUSY(as);
2187 2184 mutex_exit(&as->a_contents);
2188 2185
2189 2186 /*
2190 2187 * Call the swapout routines of all segments in the address
2191 2188 * space to do the actual work, accumulating the amount of
2192 2189 * space reclaimed.
2193 2190 */
2194 2191 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2195 - struct seg_ops *ov = seg->s_ops;
2192 + const struct seg_ops *ov = seg->s_ops;
2196 2193
2197 2194 /*
2198 2195 * We have to check to see if the seg has
2199 2196 * an ops vector because the seg may have
2200 2197 * been in the middle of being set up when
2201 2198 * the process was picked for swapout.
2202 2199 */
2203 2200 if ((ov != NULL) && (ov->swapout != NULL))
2204 2201 swpcnt += segop_swapout(seg);
2205 2202 }
2206 2203 AS_LOCK_EXIT(as, &as->a_lock);
2207 2204 return (swpcnt);
2208 2205 }
2209 2206
2210 2207 /*
2211 2208 * Determine whether data from the mappings in interval [addr, addr + size)
2212 2209 * are in the primary memory (core) cache.
2213 2210 */
2214 2211 int
2215 2212 as_incore(struct as *as, caddr_t addr,
2216 2213 size_t size, char *vec, size_t *sizep)
2217 2214 {
2218 2215 struct seg *seg;
2219 2216 size_t ssize;
2220 2217 caddr_t raddr; /* rounded down addr */
2221 2218 size_t rsize; /* rounded up size */
2222 2219 size_t isize; /* iteration size */
2223 2220 int error = 0; /* result, assume success */
2224 2221
2225 2222 *sizep = 0;
2226 2223 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2227 2224 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2228 2225 (size_t)raddr;
2229 2226
2230 2227 if (raddr + rsize < raddr) /* check for wraparound */
2231 2228 return (ENOMEM);
2232 2229
2233 2230 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2234 2231 seg = as_segat(as, raddr);
2235 2232 if (seg == NULL) {
2236 2233 AS_LOCK_EXIT(as, &as->a_lock);
2237 2234 return (-1);
2238 2235 }
2239 2236
2240 2237 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2241 2238 if (raddr >= seg->s_base + seg->s_size) {
2242 2239 seg = AS_SEGNEXT(as, seg);
2243 2240 if (seg == NULL || raddr != seg->s_base) {
2244 2241 error = -1;
2245 2242 break;
2246 2243 }
2247 2244 }
2248 2245 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2249 2246 ssize = seg->s_base + seg->s_size - raddr;
2250 2247 else
2251 2248 ssize = rsize;
2252 2249 *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2253 2250 if (isize != ssize) {
2254 2251 error = -1;
2255 2252 break;
2256 2253 }
2257 2254 vec += btopr(ssize);
2258 2255 }
2259 2256 AS_LOCK_EXIT(as, &as->a_lock);
2260 2257 return (error);
2261 2258 }
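
A hedged usage sketch: the caller supplies one char per page, mincore(2)-style, and *sizep reports how much of the range was resolved.

    static int
    pages_in_core(struct as *as, caddr_t addr, size_t size)
    {
            size_t veclen = btopr(size);    /* one char per page */
            char *vec = kmem_zalloc(veclen, KM_SLEEP);
            size_t resolved;
            int err;

            err = as_incore(as, addr, size, vec, &resolved);
            /* err == -1 means a hole or partial answer was hit */
            kmem_free(vec, veclen);
            return (err);
    }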
2262 2259
2263 2260 static void
2264 2261 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2265 2262 ulong_t *bitmap, size_t position, size_t npages)
2266 2263 {
2267 2264 caddr_t range_start;
2268 2265 size_t pos1 = position;
2269 2266 size_t pos2;
2270 2267 size_t size;
2271 2268 size_t end_pos = npages + position;
2272 2269
2273 2270 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2274 2271 size = ptob((pos2 - pos1));
2275 2272 range_start = (caddr_t)((uintptr_t)addr +
2276 2273 ptob(pos1 - position));
2277 2274
2278 2275 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2279 2276 (ulong_t *)NULL, (size_t)NULL);
2280 2277 pos1 = pos2;
2281 2278 }
2282 2279 }
2283 2280
2284 2281 static void
2285 2282 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2286 2283 caddr_t raddr, size_t rsize)
2287 2284 {
2288 2285 struct seg *seg = as_segat(as, raddr);
2289 2286 size_t ssize;
2290 2287
2291 2288 while (rsize != 0) {
2292 2289 if (raddr >= seg->s_base + seg->s_size)
2293 2290 seg = AS_SEGNEXT(as, seg);
2294 2291
2295 2292 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2296 2293 ssize = seg->s_base + seg->s_size - raddr;
2297 2294 else
2298 2295 ssize = rsize;
2299 2296
2300 2297 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2301 2298
2302 2299 rsize -= ssize;
2303 2300 raddr += ssize;
2304 2301 }
2305 2302 }
2306 2303
2307 2304 /*
2308 2305 * Cache control operations over the interval [addr, addr + size) in
2309 2306 * address space "as".
2310 2307 */
2311 2308 /*ARGSUSED*/
2312 2309 int
2313 2310 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2314 2311 uintptr_t arg, ulong_t *lock_map, size_t pos)
2315 2312 {
2316 2313 struct seg *seg; /* working segment */
2317 2314 caddr_t raddr; /* rounded down addr */
2318 2315 caddr_t initraddr; /* saved initial rounded down addr */
2319 2316 size_t rsize; /* rounded up size */
2320 2317 size_t initrsize; /* saved initial rounded up size */
2321 2318 size_t ssize; /* size of seg */
2322 2319 int error = 0; /* result */
2323 2320 size_t mlock_size; /* size of bitmap */
2324 2321 ulong_t *mlock_map; /* pointer to bitmap used */
2325 2322 /* to represent the locked */
2326 2323 /* pages. */
2327 2324 retry:
2328 2325 if (error == IE_RETRY)
2329 2326 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2330 2327 else
2331 2328 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2332 2329
2333 2330 /*
2334 2331 * If these are address space lock/unlock operations, loop over
2335 2332 * all segments in the address space, as appropriate.
2336 2333 */
2337 2334 if (func == MC_LOCKAS) {
2338 2335 size_t npages, idx;
2339 2336 size_t rlen = 0; /* rounded as length */
2340 2337
2341 2338 idx = pos;
2342 2339
2343 2340 if (arg & MCL_FUTURE) {
2344 2341 mutex_enter(&as->a_contents);
2345 2342 AS_SETPGLCK(as);
2346 2343 mutex_exit(&as->a_contents);
2347 2344 }
2348 2345 if ((arg & MCL_CURRENT) == 0) {
2349 2346 AS_LOCK_EXIT(as, &as->a_lock);
2350 2347 return (0);
2351 2348 }
2352 2349
2353 2350 seg = AS_SEGFIRST(as);
2354 2351 if (seg == NULL) {
2355 2352 AS_LOCK_EXIT(as, &as->a_lock);
2356 2353 return (0);
2357 2354 }
2358 2355
2359 2356 do {
2360 2357 raddr = (caddr_t)((uintptr_t)seg->s_base &
2361 2358 (uintptr_t)PAGEMASK);
2362 2359 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2363 2360 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2364 2361 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2365 2362
2366 2363 mlock_size = BT_BITOUL(btopr(rlen));
2367 2364 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2368 2365 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2369 2366 AS_LOCK_EXIT(as, &as->a_lock);
2370 2367 return (EAGAIN);
2371 2368 }
2372 2369
2373 2370 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2374 2371 error = segop_lockop(seg, seg->s_base,
2375 2372 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2376 2373 if (error != 0)
2377 2374 break;
2378 2375 pos += seg_pages(seg);
2379 2376 }
2380 2377
2381 2378 if (error) {
2382 2379 for (seg = AS_SEGFIRST(as); seg != NULL;
2383 2380 seg = AS_SEGNEXT(as, seg)) {
2384 2381
2385 2382 raddr = (caddr_t)((uintptr_t)seg->s_base &
2386 2383 (uintptr_t)PAGEMASK);
2387 2384 npages = seg_pages(seg);
2388 2385 as_segunlock(seg, raddr, attr, mlock_map,
2389 2386 idx, npages);
2390 2387 idx += npages;
2391 2388 }
2392 2389 }
2393 2390
2394 2391 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2395 2392 AS_LOCK_EXIT(as, &as->a_lock);
2396 2393 goto lockerr;
2397 2394 } else if (func == MC_UNLOCKAS) {
2398 2395 mutex_enter(&as->a_contents);
2399 2396 AS_CLRPGLCK(as);
2400 2397 mutex_exit(&as->a_contents);
2401 2398
2402 2399 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2403 2400 error = segop_lockop(seg, seg->s_base,
2404 2401 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2405 2402 if (error != 0)
2406 2403 break;
2407 2404 }
2408 2405
2409 2406 AS_LOCK_EXIT(as, &as->a_lock);
2410 2407 goto lockerr;
2411 2408 }
2412 2409
2413 2410 /*
2414 2411 * Normalize addresses and sizes.
2415 2412 */
2416 2413 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2417 2414 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2418 2415 (size_t)raddr;
2419 2416
2420 2417 if (raddr + rsize < raddr) { /* check for wraparound */
2421 2418 AS_LOCK_EXIT(as, &as->a_lock);
2422 2419 return (ENOMEM);
2423 2420 }
2424 2421
2425 2422 /*
2426 2423 * Get initial segment.
2427 2424 */
2428 2425 if ((seg = as_segat(as, raddr)) == NULL) {
2429 2426 AS_LOCK_EXIT(as, &as->a_lock);
2430 2427 return (ENOMEM);
2431 2428 }
2432 2429
2433 2430 if (func == MC_LOCK) {
2434 2431 mlock_size = BT_BITOUL(btopr(rsize));
2435 2432 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2436 2433 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2437 2434 AS_LOCK_EXIT(as, &as->a_lock);
2438 2435 return (EAGAIN);
2439 2436 }
2440 2437 }
2441 2438
2442 2439 /*
2443 2440 * Loop over all segments. If a hole in the address range is
2444 2441 * discovered, then fail. For each segment, perform the appropriate
2445 2442 * control operation.
2446 2443 */
2447 2444 while (rsize != 0) {
2448 2445
2449 2446 /*
2450 2447 * Make sure there's no hole, calculate the portion
2451 2448 * of the next segment to be operated over.
2452 2449 */
2453 2450 if (raddr >= seg->s_base + seg->s_size) {
2454 2451 seg = AS_SEGNEXT(as, seg);
2455 2452 if (seg == NULL || raddr != seg->s_base) {
2456 2453 if (func == MC_LOCK) {
2457 2454 as_unlockerr(as, attr, mlock_map,
2458 2455 initraddr, initrsize - rsize);
2459 2456 kmem_free(mlock_map,
2460 2457 mlock_size * sizeof (ulong_t));
2461 2458 }
2462 2459 AS_LOCK_EXIT(as, &as->a_lock);
2463 2460 return (ENOMEM);
2464 2461 }
2465 2462 }
2466 2463 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2467 2464 ssize = seg->s_base + seg->s_size - raddr;
2468 2465 else
2469 2466 ssize = rsize;
2470 2467
2471 2468 /*
2472 2469 * Dispatch on specific function.
2473 2470 */
2474 2471 switch (func) {
2475 2472
2476 2473 /*
2477 2474 * Synchronize cached data from mappings with backing
2478 2475 * objects.
2479 2476 */
2480 2477 case MC_SYNC:
2481 2478 if (error = segop_sync(seg, raddr, ssize,
2482 2479 attr, (uint_t)arg)) {
2483 2480 AS_LOCK_EXIT(as, &as->a_lock);
2484 2481 return (error);
2485 2482 }
2486 2483 break;
2487 2484
2488 2485 /*
2489 2486 * Lock pages in memory.
2490 2487 */
2491 2488 case MC_LOCK:
2492 2489 if (error = segop_lockop(seg, raddr, ssize,
2493 2490 attr, func, mlock_map, pos)) {
2494 2491 as_unlockerr(as, attr, mlock_map, initraddr,
2495 2492 initrsize - rsize + ssize);
2496 2493 kmem_free(mlock_map, mlock_size *
2497 2494 sizeof (ulong_t));
2498 2495 AS_LOCK_EXIT(as, &as->a_lock);
2499 2496 goto lockerr;
2500 2497 }
2501 2498 break;
2502 2499
2503 2500 /*
2504 2501 * Unlock mapped pages.
2505 2502 */
2506 2503 case MC_UNLOCK:
2507 2504 (void) segop_lockop(seg, raddr, ssize, attr, func,
2508 2505 (ulong_t *)NULL, (size_t)NULL);
2509 2506 break;
2510 2507
2511 2508 /*
2512 2509 * Store VM advise for mapped pages in segment layer.
2513 2510 */
2514 2511 case MC_ADVISE:
2515 2512 error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2516 2513
2517 2514 /*
2518 2515 * Check for regular errors and special retry error
2519 2516 */
2520 2517 if (error) {
2521 2518 if (error == IE_RETRY) {
2522 2519 /*
2523 2520 * Need to acquire writers lock, so
2524 2521 * have to drop readers lock and start
2525 2522 * all over again
2526 2523 */
2527 2524 AS_LOCK_EXIT(as, &as->a_lock);
2528 2525 goto retry;
2529 2526 } else if (error == IE_REATTACH) {
2530 2527 /*
2531 2528 * Find segment for current address
2532 2529 * because current segment just got
2533 2530 * split or concatenated
2534 2531 */
2535 2532 seg = as_segat(as, raddr);
2536 2533 if (seg == NULL) {
2537 2534 AS_LOCK_EXIT(as, &as->a_lock);
2538 2535 return (ENOMEM);
2539 2536 }
2540 2537 } else {
2541 2538 /*
2542 2539 * Regular error
2543 2540 */
2544 2541 AS_LOCK_EXIT(as, &as->a_lock);
2545 2542 return (error);
2546 2543 }
2547 2544 }
2548 2545 break;
2549 2546
2550 2547 case MC_INHERIT_ZERO:
2551 2548 error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2552 2549 if (error != 0) {
2553 2550 AS_LOCK_EXIT(as, &as->a_lock);
2554 2551 return (error);
2555 2552 }
2556 2553 break;
2557 2554
2558 2555 /*
2559 2556 * Can't happen.
2560 2557 */
2561 2558 default:
2562 2559 panic("as_ctl: bad operation %d", func);
2563 2560 /*NOTREACHED*/
2564 2561 }
2565 2562
2566 2563 rsize -= ssize;
2567 2564 raddr += ssize;
2568 2565 }
2569 2566
2570 2567 if (func == MC_LOCK)
2571 2568 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2572 2569 AS_LOCK_EXIT(as, &as->a_lock);
2573 2570 return (0);
2574 2571 lockerr:
2575 2572
2576 2573 /*
2577 2574 * If the lower levels returned EDEADLK for a segment lockop,
2578 2575 * it means that we should retry the operation. Let's wait
2579 2576 * a bit also to let the deadlock causing condition clear.
2580 2577 * This is part of a gross hack to work around a design flaw
2581 2578 * in the ufs/sds logging code and should go away when the
2582 2579 * logging code is re-designed to fix the problem. See bug
2583 2580 * 4125102 for details of the problem.
2584 2581 */
2585 2582 if (error == EDEADLK) {
2586 2583 delay(deadlk_wait);
2587 2584 error = 0;
2588 2585 goto retry;
2589 2586 }
2590 2587 return (error);
2591 2588 }
2592 2589
2593 2590 int
2594 2591 fc_decode(faultcode_t fault_err)
2595 2592 {
2596 2593 int error = 0;
2597 2594
2598 2595 switch (FC_CODE(fault_err)) {
2599 2596 case FC_OBJERR:
2600 2597 error = FC_ERRNO(fault_err);
2601 2598 break;
2602 2599 case FC_PROT:
2603 2600 error = EACCES;
2604 2601 break;
2605 2602 default:
2606 2603 error = EFAULT;
2607 2604 break;
2608 2605 }
2609 2606 return (error);
2610 2607 }
2611 2608
2612 2609 /*
2613 2610 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2614 2611 * lists from each segment and copy them to one contiguous shadow list (plist)
2615 2612 * as expected by the caller. Save pointers to per segment shadow lists at
2616 2613 * the tail of plist so that they can be used during as_pageunlock().
2617 2614 */
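
The resulting shadow-list layout, sketched from the (npages + segcnt) sizing used below:

    plist[0 .. npages-1]                flat copy of every locked page_t *
    plist[npages .. npages+segcnt-1]    per-segment shadow-list handles,
                                        replayed by as_pageunlock_segs()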
2618 2615 static int
2619 2616 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2620 2617 caddr_t addr, size_t size, enum seg_rw rw)
2621 2618 {
2622 2619 caddr_t sv_addr = addr;
2623 2620 size_t sv_size = size;
2624 2621 struct seg *sv_seg = seg;
2625 2622 ulong_t segcnt = 1;
2626 2623 ulong_t cnt;
2627 2624 size_t ssize;
2628 2625 pgcnt_t npages = btop(size);
2629 2626 page_t **plist;
2630 2627 page_t **pl;
2631 2628 int error;
2632 2629 caddr_t eaddr;
2633 2630 faultcode_t fault_err = 0;
2634 2631 pgcnt_t pl_off;
2635 - extern struct seg_ops segspt_shmops;
2632 + extern const struct seg_ops segspt_shmops;
2636 2633
2637 2634 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2638 2635 ASSERT(seg != NULL);
2639 2636 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2640 2637 ASSERT(addr + size > seg->s_base + seg->s_size);
2641 2638 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2642 2639 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2643 2640
2644 2641 /*
2645 2642 * Count the number of segments covered by the range we are about to
2646 2643 * lock. The segment count is used to size the shadow list we return
2647 2644 * back to the caller.
2648 2645 */
2649 2646 for (; size != 0; size -= ssize, addr += ssize) {
2650 2647 if (addr >= seg->s_base + seg->s_size) {
2651 2648
2652 2649 seg = AS_SEGNEXT(as, seg);
2653 2650 if (seg == NULL || addr != seg->s_base) {
2654 2651 AS_LOCK_EXIT(as, &as->a_lock);
2655 2652 return (EFAULT);
2656 2653 }
2657 2654 /*
2658 2655  * Do a quick check to see whether subsequent
2659 2656  * segments are likely to support pagelock.
2660 2657 */
2661 2658 if (seg->s_ops == &segvn_ops) {
2662 2659 vnode_t *vp;
2663 2660
2664 2661 if (segop_getvp(seg, addr, &vp) != 0 ||
2665 2662 vp != NULL) {
2666 2663 AS_LOCK_EXIT(as, &as->a_lock);
2667 2664 goto slow;
2668 2665 }
2669 2666 } else if (seg->s_ops != &segspt_shmops) {
2670 2667 AS_LOCK_EXIT(as, &as->a_lock);
2671 2668 goto slow;
2672 2669 }
2673 2670 segcnt++;
2674 2671 }
2675 2672 if (addr + size > seg->s_base + seg->s_size) {
2676 2673 ssize = seg->s_base + seg->s_size - addr;
2677 2674 } else {
2678 2675 ssize = size;
2679 2676 }
2680 2677 }
2681 2678 ASSERT(segcnt > 1);
2682 2679
2683 2680 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2684 2681
2685 2682 addr = sv_addr;
2686 2683 size = sv_size;
2687 2684 seg = sv_seg;
2688 2685
2689 2686 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2690 2687 if (addr >= seg->s_base + seg->s_size) {
2691 2688 seg = AS_SEGNEXT(as, seg);
2692 2689 ASSERT(seg != NULL && addr == seg->s_base);
2693 2690 cnt++;
2694 2691 ASSERT(cnt < segcnt);
2695 2692 }
2696 2693 if (addr + size > seg->s_base + seg->s_size) {
2697 2694 ssize = seg->s_base + seg->s_size - addr;
2698 2695 } else {
2699 2696 ssize = size;
2700 2697 }
2701 2698 pl = &plist[npages + cnt];
2702 2699 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2703 2700 L_PAGELOCK, rw);
2704 2701 if (error) {
2705 2702 break;
2706 2703 }
2707 2704 ASSERT(plist[npages + cnt] != NULL);
2708 2705 ASSERT(pl_off + btop(ssize) <= npages);
2709 2706 bcopy(plist[npages + cnt], &plist[pl_off],
2710 2707 btop(ssize) * sizeof (page_t *));
2711 2708 pl_off += btop(ssize);
2712 2709 }
2713 2710
2714 2711 if (size == 0) {
2715 2712 AS_LOCK_EXIT(as, &as->a_lock);
2716 2713 ASSERT(cnt == segcnt - 1);
2717 2714 *ppp = plist;
2718 2715 return (0);
2719 2716 }
2720 2717
2721 2718 /*
2722 2719  * One of the pagelock calls failed; "error" holds the error code.
2723 2720 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2724 2721 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2725 2722 * back to the caller.
2726 2723 */
2727 2724
2728 2725 eaddr = addr;
2729 2726 seg = sv_seg;
2730 2727
2731 2728 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2732 2729 if (addr >= seg->s_base + seg->s_size) {
2733 2730 seg = AS_SEGNEXT(as, seg);
2734 2731 ASSERT(seg != NULL && addr == seg->s_base);
2735 2732 cnt++;
2736 2733 ASSERT(cnt < segcnt);
2737 2734 }
2738 2735 if (eaddr > seg->s_base + seg->s_size) {
2739 2736 ssize = seg->s_base + seg->s_size - addr;
2740 2737 } else {
2741 2738 ssize = eaddr - addr;
2742 2739 }
2743 2740 pl = &plist[npages + cnt];
2744 2741 ASSERT(*pl != NULL);
2745 2742 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2746 2743 L_PAGEUNLOCK, rw);
2747 2744 }
2748 2745
2749 2746 AS_LOCK_EXIT(as, &as->a_lock);
2750 2747
2751 2748 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2752 2749
2753 2750 if (error != ENOTSUP && error != EFAULT) {
2754 2751 return (error);
2755 2752 }
2756 2753
2757 2754 slow:
2758 2755 /*
2759 2756 * If we are here because pagelock failed due to the need to cow fault
2760 2757  * in the pages we want to lock, F_SOFTLOCK will do this job and in
2761 2758  * the next as_pagelock() call for this address range pagelock will
2762 2759 * hopefully succeed.
2763 2760 */
2764 2761 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2765 2762 if (fault_err != 0) {
2766 2763 return (fc_decode(fault_err));
2767 2764 }
2768 2765 *ppp = NULL;
2769 2766
2770 2767 return (0);
2771 2768 }
2772 2769
2773 2770 /*
2774 2771 * lock pages in a given address space. Return shadow list. If
2775 2772 * the list is NULL, the MMU mapping is also locked.
2776 2773 */
2777 2774 int
2778 2775 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2779 2776 size_t size, enum seg_rw rw)
2780 2777 {
2781 2778 size_t rsize;
2782 2779 caddr_t raddr;
2783 2780 faultcode_t fault_err;
2784 2781 struct seg *seg;
2785 2782 int err;
2786 2783
2787 2784 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2788 2785 "as_pagelock_start: addr %p size %ld", addr, size);
2789 2786
2790 2787 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2791 2788 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2792 2789 (size_t)raddr;
2793 2790
2794 2791 /*
2795 2792  * if the request crosses two segments, let
2796 2793 * as_fault handle it.
2797 2794 */
2798 2795 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2799 2796
2800 2797 seg = as_segat(as, raddr);
2801 2798 if (seg == NULL) {
2802 2799 AS_LOCK_EXIT(as, &as->a_lock);
2803 2800 return (EFAULT);
2804 2801 }
2805 2802 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2806 2803 if (raddr + rsize > seg->s_base + seg->s_size) {
2807 2804 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2808 2805 }
2809 2806 if (raddr + rsize <= raddr) {
2810 2807 AS_LOCK_EXIT(as, &as->a_lock);
2811 2808 return (EFAULT);
2812 2809 }
2813 2810
2814 2811 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2815 2812 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2816 2813
2817 2814 /*
2818 2815 * try to lock pages and pass back shadow list
2819 2816 */
2820 2817 err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2821 2818
2822 2819 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2823 2820
2824 2821 AS_LOCK_EXIT(as, &as->a_lock);
2825 2822
2826 2823 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2827 2824 return (err);
2828 2825 }
2829 2826
2830 2827 /*
2831 2828 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2832 2829 * to no pagelock support for this segment or pages need to be cow
2833 2830 * faulted in. If fault is needed F_SOFTLOCK will do this job for
2834 2831  * this as_pagelock() call, and in the next as_pagelock() call for the
2835 2832  * same address range pagelock will hopefully succeed.
2836 2833 */
2837 2834 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2838 2835 if (fault_err != 0) {
2839 2836 return (fc_decode(fault_err));
2840 2837 }
2841 2838 *ppp = NULL;
2842 2839
2843 2840 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2844 2841 return (0);
2845 2842 }
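
A hedged sketch of the pagelock/pageunlock pairing for a DMA-style consumer; a NULL shadow list means as_pagelock() fell back to F_SOFTLOCK, and as_pageunlock() undoes either case:

    static int
    with_locked_pages(struct as *as, caddr_t addr, size_t size)
    {
            struct page **pplist;
            int err;

            err = as_pagelock(as, &pplist, addr, size, S_WRITE);
            if (err != 0)
                    return (err);

            /* ... perform I/O against the locked range ... */

            /* handles both the shadow-list and NULL fallback cases */
            as_pageunlock(as, pplist, addr, size, S_WRITE);
            return (0);
    }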
2846 2843
2847 2844 /*
2848 2845 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2849 2846 * lists from the end of plist and call pageunlock interface for each segment.
2850 2847 * Drop as lock and free plist.
2851 2848 */
2852 2849 static void
2853 2850 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2854 2851 struct page **plist, enum seg_rw rw)
2855 2852 {
2856 2853 ulong_t cnt;
2857 2854 caddr_t eaddr = addr + size;
2858 2855 pgcnt_t npages = btop(size);
2859 2856 size_t ssize;
2860 2857 page_t **pl;
2861 2858
2862 2859 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2863 2860 ASSERT(seg != NULL);
2864 2861 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2865 2862 ASSERT(addr + size > seg->s_base + seg->s_size);
2866 2863 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2867 2864 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2868 2865 ASSERT(plist != NULL);
2869 2866
2870 2867 for (cnt = 0; addr < eaddr; addr += ssize) {
2871 2868 if (addr >= seg->s_base + seg->s_size) {
2872 2869 seg = AS_SEGNEXT(as, seg);
2873 2870 ASSERT(seg != NULL && addr == seg->s_base);
2874 2871 cnt++;
2875 2872 }
2876 2873 if (eaddr > seg->s_base + seg->s_size) {
2877 2874 ssize = seg->s_base + seg->s_size - addr;
2878 2875 } else {
2879 2876 ssize = eaddr - addr;
2880 2877 }
2881 2878 pl = &plist[npages + cnt];
2882 2879 ASSERT(*pl != NULL);
2883 2880 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2884 2881 L_PAGEUNLOCK, rw);
2885 2882 }
2886 2883 ASSERT(cnt > 0);
2887 2884 AS_LOCK_EXIT(as, &as->a_lock);
2888 2885
2889 2886 cnt++;
2890 2887 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2891 2888 }
2892 2889
2893 2890 /*
2894 2891 * unlock pages in a given address range
2895 2892 */
2896 2893 void
2897 2894 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2898 2895 enum seg_rw rw)
2899 2896 {
2900 2897 struct seg *seg;
2901 2898 size_t rsize;
2902 2899 caddr_t raddr;
2903 2900
2904 2901 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2905 2902 "as_pageunlock_start: addr %p size %ld", addr, size);
2906 2903
2907 2904 /*
2908 2905  * if the shadow list is NULL, as_pagelock fell
2909 2906  * back to as_fault
2910 2907 */
2911 2908 if (pp == NULL) {
2912 2909 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2913 2910 return;
2914 2911 }
2915 2912
2916 2913 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2917 2914 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2918 2915 (size_t)raddr;
2919 2916
2920 2917 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2921 2918 seg = as_segat(as, raddr);
2922 2919 ASSERT(seg != NULL);
2923 2920
2924 2921 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2925 2922 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2926 2923
2927 2924 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2928 2925 if (raddr + rsize <= seg->s_base + seg->s_size) {
2929 2926 (void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2930 2927 } else {
2931 2928 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2932 2929 return;
2933 2930 }
2934 2931 AS_LOCK_EXIT(as, &as->a_lock);
2935 2932 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2936 2933 }
2937 2934
2938 2935 int
2939 2936 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2940 2937 boolean_t wait)
2941 2938 {
2942 2939 struct seg *seg;
2943 2940 size_t ssize;
2944 2941 caddr_t raddr; /* rounded down addr */
2945 2942 size_t rsize; /* rounded up size */
2946 2943 int error = 0;
2947 2944 size_t pgsz = page_get_pagesize(szc);
2948 2945
2949 2946 setpgsz_top:
2950 2947 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2951 2948 return (EINVAL);
2952 2949 }
2953 2950
2954 2951 raddr = addr;
2955 2952 rsize = size;
2956 2953
2957 2954 if (raddr + rsize < raddr) /* check for wraparound */
2958 2955 return (ENOMEM);
2959 2956
2960 2957 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2961 2958 as_clearwatchprot(as, raddr, rsize);
2962 2959 seg = as_segat(as, raddr);
2963 2960 if (seg == NULL) {
2964 2961 as_setwatch(as);
2965 2962 AS_LOCK_EXIT(as, &as->a_lock);
2966 2963 return (ENOMEM);
2967 2964 }
2968 2965
2969 2966 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2970 2967 if (raddr >= seg->s_base + seg->s_size) {
2971 2968 seg = AS_SEGNEXT(as, seg);
2972 2969 if (seg == NULL || raddr != seg->s_base) {
2973 2970 error = ENOMEM;
2974 2971 break;
2975 2972 }
2976 2973 }
2977 2974 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2978 2975 ssize = seg->s_base + seg->s_size - raddr;
2979 2976 } else {
2980 2977 ssize = rsize;
2981 2978 }
2982 2979
2983 2980 retry:
2984 2981 error = segop_setpagesize(seg, raddr, ssize, szc);
2985 2982
2986 2983 if (error == IE_NOMEM) {
2987 2984 error = EAGAIN;
2988 2985 break;
2989 2986 }
2990 2987
2991 2988 if (error == IE_RETRY) {
2992 2989 AS_LOCK_EXIT(as, &as->a_lock);
2993 2990 goto setpgsz_top;
2994 2991 }
2995 2992
2996 2993 if (error == ENOTSUP) {
2997 2994 error = EINVAL;
2998 2995 break;
2999 2996 }
3000 2997
3001 2998 if (wait && (error == EAGAIN)) {
3002 2999 /*
3003 3000 * Memory is currently locked. It must be unlocked
3004 3001 * before this operation can succeed through a retry.
3005 3002 * The possible reasons for locked memory and
3006 3003 * corresponding strategies for unlocking are:
3007 3004 * (1) Normal I/O
3008 3005 * wait for a signal that the I/O operation
3009 3006 * has completed and the memory is unlocked.
3010 3007 * (2) Asynchronous I/O
3011 3008 * The aio subsystem does not unlock pages when
3012 3009 * the I/O is completed. Those pages are unlocked
3013 3010 * when the application calls aiowait/aioerror.
3014 3011 * So, to prevent blocking forever, cv_broadcast()
3015 3012 * is done to wake up aio_cleanup_thread.
3016 3013 * Subsequently, segvn_reclaim will be called, and
3017 3014 * that will do AS_CLRUNMAPWAIT() and wake us up.
3018 3015 * (3) Long term page locking:
3019 3016 * This is not relevant for as_setpagesize()
3020 3017 * because we cannot change the page size for
3021 3018 * driver memory. The attempt to do so will
3022 3019 * fail with a different error than EAGAIN so
3023 3020 * there's no need to trigger as callbacks like
3024 3021 * as_unmap, as_setprot or as_free would do.
3025 3022 */
3026 3023 mutex_enter(&as->a_contents);
3027 3024 if (!AS_ISNOUNMAPWAIT(as)) {
3028 3025 if (AS_ISUNMAPWAIT(as) == 0) {
3029 3026 cv_broadcast(&as->a_cv);
3030 3027 }
3031 3028 AS_SETUNMAPWAIT(as);
3032 3029 AS_LOCK_EXIT(as, &as->a_lock);
3033 3030 while (AS_ISUNMAPWAIT(as)) {
3034 3031 cv_wait(&as->a_cv, &as->a_contents);
3035 3032 }
3036 3033 } else {
3037 3034 /*
3038 3035 * We may have raced with
3039 3036 * segvn_reclaim()/segspt_reclaim(). In this
3040 3037 * case clean nounmapwait flag and retry since
3041 3038 * softlockcnt in this segment may be already
3042 3039 * 0. We don't drop as writer lock so our
3043 3040 * number of retries without sleeping should
3044 3041 * be very small. See segvn_reclaim() for
3045 3042 * more comments.
3046 3043 */
3047 3044 AS_CLRNOUNMAPWAIT(as);
3048 3045 mutex_exit(&as->a_contents);
3049 3046 goto retry;
3050 3047 }
3051 3048 mutex_exit(&as->a_contents);
3052 3049 goto setpgsz_top;
3053 3050 } else if (error != 0) {
3054 3051 break;
3055 3052 }
3056 3053 }
3057 3054 as_setwatch(as);
3058 3055 AS_LOCK_EXIT(as, &as->a_lock);
3059 3056 return (error);
3060 3057 }
3061 3058
3062 3059 /*
3063 3060 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
3064 3061 * in its chunk where s_szc is less than the szc we want to set.
3065 3062 */
3066 3063 static int
3067 3064 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3068 3065 int *retry)
3069 3066 {
3070 3067 struct seg *seg;
3071 3068 size_t ssize;
3072 3069 int error;
3073 3070
3074 3071 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3075 3072
3076 3073 seg = as_segat(as, raddr);
3077 3074 if (seg == NULL) {
3078 3075 panic("as_iset3_default_lpsize: no seg");
3079 3076 }
3080 3077
3081 3078 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3082 3079 if (raddr >= seg->s_base + seg->s_size) {
3083 3080 seg = AS_SEGNEXT(as, seg);
3084 3081 if (seg == NULL || raddr != seg->s_base) {
3085 3082 panic("as_iset3_default_lpsize: as changed");
3086 3083 }
3087 3084 }
3088 3085 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3089 3086 ssize = seg->s_base + seg->s_size - raddr;
3090 3087 } else {
3091 3088 ssize = rsize;
3092 3089 }
3093 3090
3094 3091 if (szc > seg->s_szc) {
3095 3092 error = segop_setpagesize(seg, raddr, ssize, szc);
3096 3093 /* Only retry on EINVAL segments that have no vnode. */
3097 3094 if (error == EINVAL) {
3098 3095 vnode_t *vp = NULL;
3099 3096 if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
3100 3097 (segop_getvp(seg, raddr, &vp) != 0 ||
3101 3098 vp == NULL)) {
3102 3099 *retry = 1;
3103 3100 } else {
3104 3101 *retry = 0;
3105 3102 }
3106 3103 }
3107 3104 if (error) {
3108 3105 return (error);
3109 3106 }
3110 3107 }
3111 3108 }
3112 3109 return (0);
3113 3110 }
3114 3111
3115 3112 /*
3116 3113 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3117 3114 * pagesize on each segment in its range, but if any fails with EINVAL,
3118 3115 * then it reduces the pagesizes to the next size in the bitmap and
3119 3116  * retries as_iset3_default_lpsize(). The reason the code retries
3120 3117  * smaller allowed sizes on EINVAL is that (a) the anon offset may not
3121 3118 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3122 3119 * with) to pass to map_pgszcvec().
3123 3120 */
3124 3121 static int
3125 3122 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3126 3123 uint_t szcvec)
3127 3124 {
3128 3125 int error;
3129 3126 int retry;
3130 3127
3131 3128 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3132 3129
3133 3130 for (;;) {
3134 3131 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3135 3132 if (error == EINVAL && retry) {
3136 3133 szcvec &= ~(1 << szc);
3137 3134 if (szcvec <= 1) {
3138 3135 return (EINVAL);
3139 3136 }
3140 3137 szc = highbit(szcvec) - 1;
3141 3138 } else {
3142 3139 return (error);
3143 3140 }
3144 3141 }
3145 3142 }
3146 3143
3147 3144 /*
3148 3145 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3149 3146 * segments have a smaller szc than we want to set. For each such area,
3150 3147  * it calls as_iset2_default_lpsize().
3151 3148 */
3152 3149 static int
3153 3150 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3154 3151 uint_t szcvec)
3155 3152 {
3156 3153 struct seg *seg;
3157 3154 size_t ssize;
3158 3155 caddr_t setaddr = raddr;
3159 3156 size_t setsize = 0;
3160 3157 int set;
3161 3158 int error;
3162 3159
3163 3160 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3164 3161
3165 3162 seg = as_segat(as, raddr);
3166 3163 if (seg == NULL) {
3167 3164 panic("as_iset1_default_lpsize: no seg");
3168 3165 }
3169 3166 if (seg->s_szc < szc) {
3170 3167 set = 1;
3171 3168 } else {
3172 3169 set = 0;
3173 3170 }
3174 3171
3175 3172 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3176 3173 if (raddr >= seg->s_base + seg->s_size) {
3177 3174 seg = AS_SEGNEXT(as, seg);
3178 3175 if (seg == NULL || raddr != seg->s_base) {
3179 3176 panic("as_iset1_default_lpsize: as changed");
3180 3177 }
3181 3178 if (seg->s_szc >= szc && set) {
3182 3179 ASSERT(setsize != 0);
3183 3180 error = as_iset2_default_lpsize(as,
3184 3181 setaddr, setsize, szc, szcvec);
3185 3182 if (error) {
3186 3183 return (error);
3187 3184 }
3188 3185 set = 0;
3189 3186 } else if (seg->s_szc < szc && !set) {
3190 3187 setaddr = raddr;
3191 3188 setsize = 0;
3192 3189 set = 1;
3193 3190 }
3194 3191 }
3195 3192 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3196 3193 ssize = seg->s_base + seg->s_size - raddr;
3197 3194 } else {
3198 3195 ssize = rsize;
3199 3196 }
3200 3197 }
3201 3198 error = 0;
3202 3199 if (set) {
3203 3200 ASSERT(setsize != 0);
3204 3201 error = as_iset2_default_lpsize(as, setaddr, setsize,
3205 3202 szc, szcvec);
3206 3203 }
3207 3204 return (error);
3208 3205 }
3209 3206
3210 3207 /*
3211 3208 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3212 3209 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3213 3210 * chunk to as_iset1_default_lpsize().
3214 3211 */
3215 3212 static int
3216 3213 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3217 3214 int type)
3218 3215 {
3219 3216 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3220 3217 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3221 3218 flags, rtype, 1);
3222 3219 uint_t szc;
3223 3220 uint_t nszc;
3224 3221 int error;
3225 3222 caddr_t a;
3226 3223 caddr_t eaddr;
3227 3224 size_t segsize;
3228 3225 size_t pgsz;
3229 3226 uint_t save_szcvec;
3230 3227
3231 3228 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3232 3229 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3233 3230 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3234 3231
3235 3232 szcvec &= ~1;
3236 3233 if (szcvec <= 1) { /* skip if base page size */
3237 3234 return (0);
3238 3235 }
3239 3236
3240 3237 /* Get the pagesize of the first larger page size. */
3241 3238 szc = lowbit(szcvec) - 1;
3242 3239 pgsz = page_get_pagesize(szc);
3243 3240 eaddr = addr + size;
3244 3241 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3245 3242 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3246 3243
3247 3244 save_szcvec = szcvec;
3248 3245 szcvec >>= (szc + 1);
3249 3246 nszc = szc;
3250 3247 while (szcvec) {
3251 3248 if ((szcvec & 0x1) == 0) {
3252 3249 nszc++;
3253 3250 szcvec >>= 1;
3254 3251 continue;
3255 3252 }
3256 3253 nszc++;
3257 3254 pgsz = page_get_pagesize(nszc);
3258 3255 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3259 3256 if (a != addr) {
3260 3257 ASSERT(szc > 0);
3261 3258 ASSERT(a < eaddr);
3262 3259 segsize = a - addr;
3263 3260 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3264 3261 save_szcvec);
3265 3262 if (error) {
3266 3263 return (error);
3267 3264 }
3268 3265 addr = a;
3269 3266 }
3270 3267 szc = nszc;
3271 3268 szcvec >>= 1;
3272 3269 }
3273 3270
3274 3271 ASSERT(addr < eaddr);
3275 3272 szcvec = save_szcvec;
3276 3273 while (szcvec) {
3277 3274 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3278 3275 ASSERT(a >= addr);
3279 3276 if (a != addr) {
3280 3277 ASSERT(szc > 0);
3281 3278 segsize = a - addr;
3282 3279 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3283 3280 save_szcvec);
3284 3281 if (error) {
3285 3282 return (error);
3286 3283 }
3287 3284 addr = a;
3288 3285 }
3289 3286 szcvec &= ~(1 << szc);
3290 3287 if (szcvec) {
3291 3288 szc = highbit(szcvec) - 1;
3292 3289 pgsz = page_get_pagesize(szc);
3293 3290 }
3294 3291 }
3295 3292 ASSERT(addr == eaddr);
3296 3293
3297 3294 return (0);
3298 3295 }
3299 3296
3300 3297 /*
3301 3298 * Set the default large page size for the range. Called via memcntl with
3302 3299 * page size set to 0. as_set_default_lpsize breaks the range down into
3303 3300  * chunks with the same type/flags, ignores non-segvn segments, and passes
3304 3301 * each chunk to as_iset_default_lpsize().
3305 3302 */
3306 3303 int
3307 3304 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3308 3305 {
3309 3306 struct seg *seg;
3310 3307 caddr_t raddr;
3311 3308 size_t rsize;
3312 3309 size_t ssize;
3313 3310 int rtype, rflags;
3314 3311 int stype, sflags;
3315 3312 int error;
3316 3313 caddr_t setaddr;
3317 3314 size_t setsize;
3318 3315 int segvn;
3319 3316
3320 3317 if (size == 0)
3321 3318 return (0);
3322 3319
3323 3320 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3324 3321 again:
3325 3322 error = 0;
3326 3323
3327 3324 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3328 3325 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3329 3326 (size_t)raddr;
3330 3327
3331 3328 if (raddr + rsize < raddr) { /* check for wraparound */
3332 3329 AS_LOCK_EXIT(as, &as->a_lock);
3333 3330 return (ENOMEM);
3334 3331 }
3335 3332 as_clearwatchprot(as, raddr, rsize);
3336 3333 seg = as_segat(as, raddr);
3337 3334 if (seg == NULL) {
3338 3335 as_setwatch(as);
3339 3336 AS_LOCK_EXIT(as, &as->a_lock);
3340 3337 return (ENOMEM);
3341 3338 }
3342 3339 if (seg->s_ops == &segvn_ops) {
3343 3340 rtype = segop_gettype(seg, addr);
3344 3341 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3345 3342 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3346 3343 segvn = 1;
3347 3344 } else {
3348 3345 segvn = 0;
3349 3346 }
3350 3347 setaddr = raddr;
3351 3348 setsize = 0;
3352 3349
3353 3350 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3354 3351 if (raddr >= (seg->s_base + seg->s_size)) {
3355 3352 seg = AS_SEGNEXT(as, seg);
3356 3353 if (seg == NULL || raddr != seg->s_base) {
3357 3354 error = ENOMEM;
3358 3355 break;
3359 3356 }
3360 3357 if (seg->s_ops == &segvn_ops) {
3361 3358 stype = segop_gettype(seg, raddr);
3362 3359 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3363 3360 stype &= (MAP_SHARED | MAP_PRIVATE);
3364 3361 if (segvn && (rflags != sflags ||
3365 3362 rtype != stype)) {
3366 3363 /*
3367 3364 * The next segment is also segvn but
3368 3365 * has different flags and/or type.
3369 3366 */
3370 3367 ASSERT(setsize != 0);
3371 3368 error = as_iset_default_lpsize(as,
3372 3369 setaddr, setsize, rflags, rtype);
3373 3370 if (error) {
3374 3371 break;
3375 3372 }
3376 3373 rflags = sflags;
3377 3374 rtype = stype;
3378 3375 setaddr = raddr;
3379 3376 setsize = 0;
3380 3377 } else if (!segvn) {
3381 3378 rflags = sflags;
3382 3379 rtype = stype;
3383 3380 setaddr = raddr;
3384 3381 setsize = 0;
3385 3382 segvn = 1;
3386 3383 }
3387 3384 } else if (segvn) {
3388 3385 /* The next segment is not segvn. */
3389 3386 ASSERT(setsize != 0);
3390 3387 error = as_iset_default_lpsize(as,
3391 3388 setaddr, setsize, rflags, rtype);
3392 3389 if (error) {
3393 3390 break;
3394 3391 }
3395 3392 segvn = 0;
3396 3393 }
3397 3394 }
3398 3395 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3399 3396 ssize = seg->s_base + seg->s_size - raddr;
3400 3397 } else {
3401 3398 ssize = rsize;
3402 3399 }
3403 3400 }
3404 3401 if (error == 0 && segvn) {
3405 3402 /* The last chunk when rsize == 0. */
3406 3403 ASSERT(setsize != 0);
3407 3404 error = as_iset_default_lpsize(as, setaddr, setsize,
3408 3405 rflags, rtype);
3409 3406 }
3410 3407
3411 3408 if (error == IE_RETRY) {
3412 3409 goto again;
3413 3410 } else if (error == IE_NOMEM) {
3414 3411 error = EAGAIN;
3415 3412 } else if (error == ENOTSUP) {
3416 3413 error = EINVAL;
3417 3414 } else if (error == EAGAIN) {
3418 3415 mutex_enter(&as->a_contents);
3419 3416 if (!AS_ISNOUNMAPWAIT(as)) {
3420 3417 if (AS_ISUNMAPWAIT(as) == 0) {
3421 3418 cv_broadcast(&as->a_cv);
3422 3419 }
3423 3420 AS_SETUNMAPWAIT(as);
3424 3421 AS_LOCK_EXIT(as, &as->a_lock);
3425 3422 while (AS_ISUNMAPWAIT(as)) {
3426 3423 cv_wait(&as->a_cv, &as->a_contents);
3427 3424 }
3428 3425 mutex_exit(&as->a_contents);
3429 3426 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3430 3427 } else {
3431 3428 /*
3432 3429 * We may have raced with
3433 3430 * segvn_reclaim()/segspt_reclaim(). In this case
3434 3431  * clear the nounmapwait flag and retry, since softlockcnt
3435 3432  * in this segment may already be 0. We don't drop the as
3436 3433  * writer lock, so our number of retries without
3437 3434 * sleeping should be very small. See segvn_reclaim()
3438 3435 * for more comments.
3439 3436 */
3440 3437 AS_CLRNOUNMAPWAIT(as);
3441 3438 mutex_exit(&as->a_contents);
3442 3439 }
3443 3440 goto again;
3444 3441 }
3445 3442
3446 3443 as_setwatch(as);
3447 3444 AS_LOCK_EXIT(as, &as->a_lock);
3448 3445 return (error);
3449 3446 }
3450 3447
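For reference, the path into as_set_default_lpsize() starts at memcntl(2)
with cmd MC_HAT_ADVISE and a page size of 0, as the block comment above
notes. A hedged userland sketch of that entry point, per the memcntl(2)
man page (advise_default_lpsize is a hypothetical wrapper; error handling
is left to the caller):

	#include <sys/types.h>
	#include <sys/mman.h>

	/*
	 * Ask the kernel to choose a default large page size for
	 * [addr, addr + len).  mha_pagesize == 0 is what routes the
	 * request to as_set_default_lpsize() above.
	 */
	static int
	advise_default_lpsize(caddr_t addr, size_t len)
	{
		struct memcntl_mha mha;

		mha.mha_cmd = MHA_MAPSIZE_VA;
		mha.mha_flags = 0;
		mha.mha_pagesize = 0;	/* 0 == use the default */

		return (memcntl(addr, len, MC_HAT_ADVISE,
		    (caddr_t)&mha, 0, 0));
	}
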
3451 3448 /*
3452 3449  * Set up all of the uninitialized watched pages that we can.
3453 3450 */
3454 3451 void
3455 3452 as_setwatch(struct as *as)
3456 3453 {
3457 3454 struct watched_page *pwp;
3458 3455 struct seg *seg;
3459 3456 caddr_t vaddr;
3460 3457 uint_t prot;
3461 3458 int err, retrycnt;
3462 3459
3463 3460 if (avl_numnodes(&as->a_wpage) == 0)
3464 3461 return;
3465 3462
3466 3463 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3467 3464
3468 3465 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3469 3466 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3470 3467 retrycnt = 0;
3471 3468 retry:
3472 3469 vaddr = pwp->wp_vaddr;
3473 3470 if (pwp->wp_oprot != 0 || /* already set up */
3474 3471 (seg = as_segat(as, vaddr)) == NULL ||
3475 3472 segop_getprot(seg, vaddr, 0, &prot) != 0)
3476 3473 continue;
3477 3474
3478 3475 pwp->wp_oprot = prot;
3479 3476 if (pwp->wp_read)
3480 3477 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3481 3478 if (pwp->wp_write)
3482 3479 prot &= ~PROT_WRITE;
3483 3480 if (pwp->wp_exec)
3484 3481 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3485 3482 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3486 3483 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3487 3484 if (err == IE_RETRY) {
3488 3485 pwp->wp_oprot = 0;
3489 3486 ASSERT(retrycnt == 0);
3490 3487 retrycnt++;
3491 3488 goto retry;
3492 3489 }
3493 3490 }
3494 3491 pwp->wp_prot = prot;
3495 3492 }
3496 3493 }
3497 3494
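Watched pages originate from /proc watchpoints. A hedged sketch of how a
debugger arms one, per proc(4): write a PCWATCH message to the target's
ctl file (arm_write_watch is a hypothetical helper, and ctlfd is assumed
to be an open descriptor for /proc/<pid>/ctl):

	#include <sys/types.h>
	#include <procfs.h>
	#include <string.h>
	#include <unistd.h>

	/*
	 * Arm a write watchpoint covering [vaddr, vaddr + size).  The
	 * kernel records it in as->a_wpage, and as_setwatch() above later
	 * strips PROT_WRITE from the affected pages so stores fault.
	 */
	static int
	arm_write_watch(int ctlfd, uintptr_t vaddr, size_t size)
	{
		struct {
			long cmd;
			prwatch_t wp;
		} msg;

		(void) memset(&msg, 0, sizeof (msg));
		msg.cmd = PCWATCH;
		msg.wp.pr_vaddr = vaddr;
		msg.wp.pr_size = size;
		msg.wp.pr_wflags = WA_WRITE | WA_TRAPAFTER;

		if (write(ctlfd, &msg, sizeof (msg)) != sizeof (msg))
			return (-1);
		return (0);
	}
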
3498 3495 /*
3499 3496 * Clear all of the watched pages in the address space.
3500 3497 */
3501 3498 void
3502 3499 as_clearwatch(struct as *as)
3503 3500 {
3504 3501 struct watched_page *pwp;
3505 3502 struct seg *seg;
3506 3503 caddr_t vaddr;
3507 3504 uint_t prot;
3508 3505 int err, retrycnt;
3509 3506
3510 3507 if (avl_numnodes(&as->a_wpage) == 0)
3511 3508 return;
3512 3509
3513 3510 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3514 3511
3515 3512 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3516 3513 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3517 3514 retrycnt = 0;
3518 3515 retry:
3519 3516 vaddr = pwp->wp_vaddr;
3520 3517 if (pwp->wp_oprot == 0 || /* not set up */
3521 3518 (seg = as_segat(as, vaddr)) == NULL)
3522 3519 continue;
3523 3520
3524 3521 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3525 3522 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3526 3523 if (err == IE_RETRY) {
3527 3524 ASSERT(retrycnt == 0);
3528 3525 retrycnt++;
3529 3526 goto retry;
3530 3527 }
3531 3528 }
3532 3529 pwp->wp_oprot = 0;
3533 3530 pwp->wp_prot = 0;
3534 3531 }
3535 3532 }
3536 3533
3537 3534 /*
3538 3535 * Force a new setup for all the watched pages in the range.
3539 3536 */
3540 3537 static void
3541 3538 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3542 3539 {
3543 3540 struct watched_page *pwp;
3544 3541 struct watched_page tpw;
3545 3542 caddr_t eaddr = addr + size;
3546 3543 caddr_t vaddr;
3547 3544 struct seg *seg;
3548 3545 int err, retrycnt;
3549 3546 uint_t wprot;
3550 3547 avl_index_t where;
3551 3548
3552 3549 if (avl_numnodes(&as->a_wpage) == 0)
3553 3550 return;
3554 3551
3555 3552 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3556 3553
3557 3554 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3558 3555 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3559 3556 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3560 3557
3561 3558 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3562 3559 retrycnt = 0;
3563 3560 vaddr = pwp->wp_vaddr;
3564 3561
3565 3562 wprot = prot;
3566 3563 if (pwp->wp_read)
3567 3564 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3568 3565 if (pwp->wp_write)
3569 3566 wprot &= ~PROT_WRITE;
3570 3567 if (pwp->wp_exec)
3571 3568 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3572 3569 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3573 3570 retry:
3574 3571 seg = as_segat(as, vaddr);
3575 3572 if (seg == NULL) {
3576 3573 panic("as_setwatchprot: no seg");
3577 3574 /*NOTREACHED*/
3578 3575 }
3579 3576 err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3580 3577 if (err == IE_RETRY) {
3581 3578 ASSERT(retrycnt == 0);
3582 3579 retrycnt++;
3583 3580 goto retry;
3584 3581 }
3585 3582 }
3586 3583 pwp->wp_oprot = prot;
3587 3584 pwp->wp_prot = wprot;
3588 3585
3589 3586 pwp = AVL_NEXT(&as->a_wpage, pwp);
3590 3587 }
3591 3588 }
3592 3589
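The protection math above is worth spelling out: a write watchpoint only
needs to strip PROT_WRITE, while read and exec watchpoints strip every
permission, since read and execute accesses generally cannot be trapped
independently of each other. The same mask logic, condensed into one
hypothetical helper (watched_prot is not a function in this file):

	#include <sys/mman.h>

	/*
	 * Effective protection for a watched page, mirroring the
	 * wp_read/wp_write/wp_exec cases in as_setwatchprot() above.
	 */
	static unsigned int
	watched_prot(unsigned int prot, int wp_read, int wp_write,
	    int wp_exec)
	{
		if (wp_read || wp_exec)
			prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
		if (wp_write)
			prot &= ~PROT_WRITE;
		return (prot);
	}
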
3593 3590 /*
3594 3591 * Clear all of the watched pages in the range.
3595 3592 */
3596 3593 static void
3597 3594 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3598 3595 {
3599 3596 caddr_t eaddr = addr + size;
3600 3597 struct watched_page *pwp;
3601 3598 struct watched_page tpw;
3602 3599 uint_t prot;
3603 3600 struct seg *seg;
3604 3601 int err, retrycnt;
3605 3602 avl_index_t where;
3606 3603
3607 3604 if (avl_numnodes(&as->a_wpage) == 0)
3608 3605 return;
3609 3606
3610 3607 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3611 3608 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3612 3609 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3613 3610
3614 3611 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3615 3612
3616 3613 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3617 3614
3618 3615 if ((prot = pwp->wp_oprot) != 0) {
3619 3616 retrycnt = 0;
3620 3617
3621 3618 if (prot != pwp->wp_prot) {
3622 3619 retry:
3623 3620 seg = as_segat(as, pwp->wp_vaddr);
3624 3621 if (seg == NULL)
3625 3622 continue;
3626 3623 err = segop_setprot(seg, pwp->wp_vaddr,
3627 3624 PAGESIZE, prot);
3628 3625 if (err == IE_RETRY) {
3629 3626 ASSERT(retrycnt == 0);
3630 3627 retrycnt++;
3631 3628 goto retry;
3632 3629
3633 3630 }
3634 3631 }
3635 3632 pwp->wp_oprot = 0;
3636 3633 pwp->wp_prot = 0;
3637 3634 }
3638 3635
3639 3636 pwp = AVL_NEXT(&as->a_wpage, pwp);
3640 3637 }
3641 3638 }
3642 3639
3643 3640 void
3644 3641 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3645 3642 {
3646 3643 struct proc *p;
3647 3644
3648 3645 mutex_enter(&pidlock);
3649 3646 for (p = practive; p; p = p->p_next) {
3650 3647 if (p->p_as == as) {
3651 3648 mutex_enter(&p->p_lock);
3652 3649 if (p->p_as == as)
3653 3650 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3654 3651 mutex_exit(&p->p_lock);
3655 3652 }
3656 3653 }
3657 3654 mutex_exit(&pidlock);
3658 3655 }
3659 3656
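Note the pattern in as_signal_proc(): p_as is read optimistically while
holding only pidlock, then re-checked under p_lock, the lock that actually
protects p_as against concurrent change. A generic kernel-style sketch of
the same check-lock-recheck idiom (the struct and names are illustrative,
not from this file):

	#include <sys/types.h>
	#include <sys/mutex.h>

	/* obj_ref is protected by obj_lock. */
	struct obj {
		kmutex_t	obj_lock;
		void		*obj_ref;
	};

	static void
	act_if_match(struct obj *o, void *target)
	{
		if (o->obj_ref != target)	/* cheap, possibly stale */
			return;
		mutex_enter(&o->obj_lock);
		if (o->obj_ref == target) {	/* authoritative re-check */
			/* ... act on the object ... */
		}
		mutex_exit(&o->obj_lock);
	}
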
3660 3657 /*
3661 3658  * Return the memory object ID for a given address.
3662 3659 */
3663 3660 int
3664 3661 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3665 3662 {
3666 3663 struct seg *seg;
3667 3664 int sts;
3668 3665
3669 3666 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3670 3667 seg = as_segat(as, addr);
3671 3668 if (seg == NULL) {
3672 3669 AS_LOCK_EXIT(as, &as->a_lock);
3673 3670 return (EFAULT);
3674 3671 }
3675 3672
3676 3673 sts = segop_getmemid(seg, addr, memidp);
3677 3674
3678 3675 AS_LOCK_EXIT(as, &as->a_lock);
3679 3676 return (sts);
3680 3677 }
(1035 lines elided)