patch lower-case-segops
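This patch replaces the SEGOP_KLUSTER() macro invocations in vm_pvn.c with calls to a lower-case segop_kluster() function; the three changed call sites are marked with -/+ below. The diff only shows the callers, so as reviewer context here is a minimal sketch of what such a wrapper could look like, assuming the usual dispatch through the segment driver's ops vector (the s_ops->kluster member name and the exact prototype are assumptions, not part of this change):

	/*
	 * Hypothetical sketch only, not part of this diff: a plain function
	 * standing in for the old SEGOP_KLUSTER() macro, dispatching through
	 * the segment driver's ops vector.  A non-zero return means the
	 * driver rejects klustering/read-ahead at the given offset delta.
	 * Requires <vm/seg.h> for struct seg and struct seg_ops.
	 */
	int
	segop_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
	{
		return (seg->s_ops->kluster(seg, addr, delta));
	}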
--- old/usr/src/uts/common/vm/vm_pvn.c
+++ new/usr/src/uts/common/vm/vm_pvn.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25
26 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 27 /* All Rights Reserved */
28 28
29 29 /*
30 30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 31 * The Regents of the University of California
32 32 * All Rights Reserved
33 33 *
34 34 * University Acknowledgment- Portions of this document are derived from
35 35 * software developed by the University of California, Berkeley, and its
36 36 * contributors.
37 37 */
38 38
39 39 /*
40 40 * VM - paged vnode.
41 41 *
42 42 * This file supplies vm support for the vnode operations that deal with pages.
43 43 */
44 44 #include <sys/types.h>
45 45 #include <sys/t_lock.h>
46 46 #include <sys/param.h>
47 47 #include <sys/sysmacros.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/time.h>
50 50 #include <sys/buf.h>
51 51 #include <sys/vnode.h>
52 52 #include <sys/uio.h>
53 53 #include <sys/vmsystm.h>
54 54 #include <sys/mman.h>
55 55 #include <sys/vfs.h>
56 56 #include <sys/cred.h>
57 57 #include <sys/user.h>
58 58 #include <sys/kmem.h>
59 59 #include <sys/cmn_err.h>
60 60 #include <sys/debug.h>
61 61 #include <sys/cpuvar.h>
62 62 #include <sys/vtrace.h>
63 63 #include <sys/tnf_probe.h>
64 64
65 65 #include <vm/hat.h>
66 66 #include <vm/as.h>
67 67 #include <vm/seg.h>
68 68 #include <vm/rm.h>
69 69 #include <vm/pvn.h>
70 70 #include <vm/page.h>
71 71 #include <vm/seg_map.h>
72 72 #include <vm/seg_kmem.h>
73 73 #include <sys/fs/swapnode.h>
74 74
75 75 int pvn_nofodklust = 0;
76 76 int pvn_write_noklust = 0;
77 77
78 78 uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */
79 79 uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */
80 80 /* support for vmodsort for testing */
81 81
82 82 static struct kmem_cache *marker_cache = NULL;
83 83
84 84 /*
85 85 * Find the largest contiguous block which contains `addr' for file offset
86 86 * `offset' in it while living within the file system block sizes (`vp_off'
87 87 * and `vp_len') and the address space limits for which no pages currently
88 88 * exist and which map to consecutive file offsets.
89 89 */
90 90 page_t *
91 91 pvn_read_kluster(
92 92 struct vnode *vp,
93 93 u_offset_t off,
94 94 struct seg *seg,
95 95 caddr_t addr,
96 96 u_offset_t *offp, /* return values */
97 97 size_t *lenp, /* return values */
98 98 u_offset_t vp_off,
99 99 size_t vp_len,
100 100 int isra)
101 101 {
102 102 ssize_t deltaf, deltab;
103 103 page_t *pp;
104 104 page_t *plist = NULL;
105 105 spgcnt_t pagesavail;
106 106 u_offset_t vp_end;
107 107
108 108 ASSERT(off >= vp_off && off < vp_off + vp_len);
109 109
110 110 /*
111 111 * We only want to do klustering/read ahead if there
112 112		 * are more than minfree pages currently available.
113 113 */
114 114 pagesavail = freemem - minfree;
115 115
116 116 if (pagesavail <= 0)
117 117 if (isra)
118 118 return ((page_t *)NULL); /* ra case - give up */
119 119 else
120 120 pagesavail = 1; /* must return a page */
121 121
122 122 /* We calculate in pages instead of bytes due to 32-bit overflows */
123 123 if (pagesavail < (spgcnt_t)btopr(vp_len)) {
124 124 /*
125 125 * Don't have enough free memory for the
126 126 * max request, try sizing down vp request.
127 127 */
128 128 deltab = (ssize_t)(off - vp_off);
129 129 vp_len -= deltab;
130 130 vp_off += deltab;
131 131 if (pagesavail < btopr(vp_len)) {
132 132 /*
133 133 * Still not enough memory, just settle for
134 134 * pagesavail which is at least 1.
135 135 */
136 136 vp_len = ptob(pagesavail);
137 137 }
138 138 }
139 139
140 140 vp_end = vp_off + vp_len;
141 141 ASSERT(off >= vp_off && off < vp_end);
142 142
143 - if (isra && SEGOP_KLUSTER(seg, addr, 0))
143 + if (isra && segop_kluster(seg, addr, 0))
144 144 return ((page_t *)NULL); /* segment driver says no */
145 145
146 146 if ((plist = page_create_va(vp, off,
147 147 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
148 148 return ((page_t *)NULL);
149 149
150 150 if (vp_len <= PAGESIZE || pvn_nofodklust) {
151 151 *offp = off;
152 152 *lenp = MIN(vp_len, PAGESIZE);
153 153 } else {
154 154 /*
155 155 * Scan back from front by incrementing "deltab" and
156 156 * comparing "off" with "vp_off + deltab" to avoid
157 157 * "signed" versus "unsigned" conversion problems.
158 158 */
159 159 for (deltab = PAGESIZE; off >= vp_off + deltab;
160 160 deltab += PAGESIZE) {
161 161 /*
162 162 * Call back to the segment driver to verify that
163 163 * the klustering/read ahead operation makes sense.
164 164 */
165 - if (SEGOP_KLUSTER(seg, addr, -deltab))
165 + if (segop_kluster(seg, addr, -deltab))
166 166 break; /* page not eligible */
167 167 if ((pp = page_create_va(vp, off - deltab,
168 168 PAGESIZE, PG_EXCL, seg, addr - deltab))
169 169 == NULL)
170 170 break; /* already have the page */
171 171 /*
172 172 * Add page to front of page list.
173 173 */
174 174 page_add(&plist, pp);
175 175 }
176 176 deltab -= PAGESIZE;
177 177
178 178 /* scan forward from front */
179 179 for (deltaf = PAGESIZE; off + deltaf < vp_end;
180 180 deltaf += PAGESIZE) {
181 181 /*
182 182 * Call back to the segment driver to verify that
183 183 * the klustering/read ahead operation makes sense.
184 184 */
185 - if (SEGOP_KLUSTER(seg, addr, deltaf))
185 + if (segop_kluster(seg, addr, deltaf))
186 186 break; /* page not file extension */
187 187 if ((pp = page_create_va(vp, off + deltaf,
188 188 PAGESIZE, PG_EXCL, seg, addr + deltaf))
189 189 == NULL)
190 190 break; /* already have page */
191 191
192 192 /*
193 193 * Add page to end of page list.
194 194 */
195 195 page_add(&plist, pp);
196 196 plist = plist->p_next;
197 197 }
198 198 *offp = off = off - deltab;
199 199 *lenp = deltab + deltaf;
200 200 ASSERT(off >= vp_off);
201 201
202 202 /*
203 203 * If we ended up getting more than was actually
204 204 * requested, retract the returned length to only
205 205 * reflect what was requested. This might happen
206 206 * if we were allowed to kluster pages across a
207 207 * span of (say) 5 frags, and frag size is less
208 208 * than PAGESIZE. We need a whole number of
209 209 * pages to contain those frags, but the returned
210 210 * size should only allow the returned range to
211 211 * extend as far as the end of the frags.
212 212 */
213 213 if ((vp_off + vp_len) < (off + *lenp)) {
214 214 ASSERT(vp_end > off);
215 215 *lenp = vp_end - off;
216 216 }
217 217 }
218 218 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
219 219 "pvn_read_kluster:seg %p addr %x isra %x",
220 220 seg, addr, isra);
221 221 return (plist);
222 222 }
223 223
224 224 /*
225 225 * Handle pages for this vnode on either side of the page "pp"
226 226 * which has been locked by the caller. This routine will also
227 227 * do klustering in the range [vp_off, vp_off + vp_len] up
228 228 * until a page which is not found. The offset and length
229 229 * of pages included is returned in "*offp" and "*lenp".
230 230 *
231 231 * Returns a list of dirty locked pages all ready to be
232 232 * written back.
233 233 */
234 234 page_t *
235 235 pvn_write_kluster(
236 236 struct vnode *vp,
237 237 page_t *pp,
238 238 u_offset_t *offp, /* return values */
239 239 size_t *lenp, /* return values */
240 240 u_offset_t vp_off,
241 241 size_t vp_len,
242 242 int flags)
243 243 {
244 244 u_offset_t off;
245 245 page_t *dirty;
246 246 size_t deltab, deltaf;
247 247 se_t se;
248 248 u_offset_t vp_end;
249 249
250 250 off = pp->p_offset;
251 251
252 252 /*
253 253		 * Klustering should not be done if we are invalidating
254 254 * pages since we could destroy pages that belong to
255 255 * some other process if this is a swap vnode.
256 256 */
257 257 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
258 258 *offp = off;
259 259 *lenp = PAGESIZE;
260 260 return (pp);
261 261 }
262 262
263 263 if (flags & (B_FREE | B_INVAL))
264 264 se = SE_EXCL;
265 265 else
266 266 se = SE_SHARED;
267 267
268 268 dirty = pp;
269 269 /*
270 270 * Scan backwards looking for pages to kluster by incrementing
271 271 * "deltab" and comparing "off" with "vp_off + deltab" to
272 272 * avoid "signed" versus "unsigned" conversion problems.
273 273 */
274 274 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
275 275 pp = page_lookup_nowait(vp, off - deltab, se);
276 276 if (pp == NULL)
277 277 break; /* page not found */
278 278 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
279 279 break;
280 280 page_add(&dirty, pp);
281 281 }
282 282 deltab -= PAGESIZE;
283 283
284 284 vp_end = vp_off + vp_len;
285 285 /* now scan forwards looking for pages to kluster */
286 286 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
287 287 pp = page_lookup_nowait(vp, off + deltaf, se);
288 288 if (pp == NULL)
289 289 break; /* page not found */
290 290 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
291 291 break;
292 292 page_add(&dirty, pp);
293 293 dirty = dirty->p_next;
294 294 }
295 295
296 296 *offp = off - deltab;
297 297 *lenp = deltab + deltaf;
298 298 return (dirty);
299 299 }
300 300
301 301 /*
302 302 * Generic entry point used to release the "shared/exclusive" lock
303 303 * and the "p_iolock" on pages after i/o is complete.
304 304 */
305 305 void
306 306 pvn_io_done(page_t *plist)
307 307 {
308 308 page_t *pp;
309 309
310 310 while (plist != NULL) {
311 311 pp = plist;
312 312 page_sub(&plist, pp);
313 313 page_io_unlock(pp);
314 314 page_unlock(pp);
315 315 }
316 316 }
317 317
318 318 /*
319 319 * Entry point to be used by file system getpage subr's and
320 320 * other such routines which either want to unlock pages (B_ASYNC
321 321 * request) or destroy a list of pages if an error occurred.
322 322 */
323 323 void
324 324 pvn_read_done(page_t *plist, int flags)
325 325 {
326 326 page_t *pp;
327 327
328 328 while (plist != NULL) {
329 329 pp = plist;
330 330 page_sub(&plist, pp);
331 331 page_io_unlock(pp);
332 332 if (flags & B_ERROR) {
333 333 /*LINTED: constant in conditional context*/
334 334 VN_DISPOSE(pp, B_INVAL, 0, kcred);
335 335 } else {
336 336 (void) page_release(pp, 0);
337 337 }
338 338 }
339 339 }
340 340
341 341 /*
342 342 * Automagic pageout.
343 343 * When memory gets tight, start freeing pages popping out of the
344 344 * write queue.
345 345 */
346 346 int write_free = 1;
347 347 pgcnt_t pages_before_pager = 200; /* LMXXX */
348 348
349 349 /*
350 350 * Routine to be called when page-out's complete.
351 351	 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
352 352 * after waiting for i/o to complete (biowait) to free the list of
353 353 * pages associated with the buffer. These pages must be locked
354 354 * before i/o is initiated.
355 355 *
356 356 * If a write error occurs, the pages are marked as modified
357 357 * so the write will be re-tried later.
358 358 */
359 359
360 360 void
361 361 pvn_write_done(page_t *plist, int flags)
362 362 {
363 363 int dfree = 0;
364 364 int pgrec = 0;
365 365 int pgout = 0;
366 366 int pgpgout = 0;
367 367 int anonpgout = 0;
368 368 int anonfree = 0;
369 369 int fspgout = 0;
370 370 int fsfree = 0;
371 371 int execpgout = 0;
372 372 int execfree = 0;
373 373 page_t *pp;
374 374 struct cpu *cpup;
375 375 struct vnode *vp = NULL; /* for probe */
376 376 uint_t ppattr;
377 377 kmutex_t *vphm = NULL;
378 378
379 379 ASSERT((flags & B_READ) == 0);
380 380
381 381 /*
382 382 * If we are about to start paging anyway, start freeing pages.
383 383 */
384 384 if (write_free && freemem < lotsfree + pages_before_pager &&
385 385 (flags & B_ERROR) == 0) {
386 386 flags |= B_FREE;
387 387 }
388 388
389 389 /*
390 390 * Handle each page involved in the i/o operation.
391 391 */
392 392 while (plist != NULL) {
393 393 pp = plist;
394 394 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
395 395 page_sub(&plist, pp);
396 396
397 397 /* Kernel probe support */
398 398 if (vp == NULL)
399 399 vp = pp->p_vnode;
400 400
401 401 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
402 402 /*
403 403 * Move page to the top of the v_page list.
404 404 * Skip pages modified during IO.
405 405 */
406 406 vphm = page_vnode_mutex(vp);
407 407 mutex_enter(vphm);
408 408 if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
409 409 page_vpsub(&vp->v_pages, pp);
410 410 page_vpadd(&vp->v_pages, pp);
411 411 }
412 412 mutex_exit(vphm);
413 413 }
414 414
415 415 if (flags & B_ERROR) {
416 416 /*
417 417 * Write operation failed. We don't want
418 418 * to destroy (or free) the page unless B_FORCE
419 419 * is set. We set the mod bit again and release
420 420 * all locks on the page so that it will get written
421 421 * back again later when things are hopefully
422 422 * better again.
423 423 * If B_INVAL and B_FORCE is set we really have
424 424 * to destroy the page.
425 425 */
426 426 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
427 427 page_io_unlock(pp);
428 428 /*LINTED: constant in conditional context*/
429 429 VN_DISPOSE(pp, B_INVAL, 0, kcred);
430 430 } else {
431 431 hat_setmod_only(pp);
432 432 page_io_unlock(pp);
433 433 page_unlock(pp);
434 434 }
435 435 } else if (flags & B_INVAL) {
436 436 /*
437 437 * XXX - Failed writes with B_INVAL set are
438 438 * not handled appropriately.
439 439 */
440 440 page_io_unlock(pp);
441 441 /*LINTED: constant in conditional context*/
442 442 VN_DISPOSE(pp, B_INVAL, 0, kcred);
443 443			} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
444 444 /*
445 445 * Update statistics for pages being paged out
446 446 */
447 447 if (pp->p_vnode) {
448 448 if (IS_SWAPFSVP(pp->p_vnode)) {
449 449 anonpgout++;
450 450 } else {
451 451 if (pp->p_vnode->v_flag & VVMEXEC) {
452 452 execpgout++;
453 453 } else {
454 454 fspgout++;
455 455 }
456 456 }
457 457 }
458 458 page_io_unlock(pp);
459 459 pgout = 1;
460 460 pgpgout++;
461 461 TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
462 462 "page_ws_out:pp %p", pp);
463 463
464 464 /*
465 465 * The page_struct_lock need not be acquired to
466 466 * examine "p_lckcnt" and "p_cowcnt" since we'll
467 467 * have an "exclusive" lock if the upgrade succeeds.
468 468 */
469 469 if (page_tryupgrade(pp) &&
470 470 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
471 471 /*
472 472 * Check if someone has reclaimed the
473 473 * page. If ref and mod are not set, no
474 474 * one is using it so we can free it.
475 475 * The rest of the system is careful
476 476 * to use the NOSYNC flag to unload
477 477 * translations set up for i/o w/o
478 478 * affecting ref and mod bits.
479 479 *
480 480 * Obtain a copy of the real hardware
481 481 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
482 482 * to avoid having to flush the cache.
483 483 */
484 484 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
485 485 HAT_SYNC_STOPON_MOD);
486 486 ck_refmod:
487 487 if (!(ppattr & (P_REF | P_MOD))) {
488 488 if (hat_page_is_mapped(pp)) {
489 489 /*
490 490 * Doesn't look like the page
491 491 * was modified so now we
492 492 * really have to unload the
493 493 * translations. Meanwhile
494 494 * another CPU could've
495 495 * modified it so we have to
496 496 * check again. We don't loop
497 497 * forever here because now
498 498 * the translations are gone
499 499 * and no one can get a new one
500 500 * since we have the "exclusive"
501 501 * lock on the page.
502 502 */
503 503 (void) hat_pageunload(pp,
504 504 HAT_FORCE_PGUNLOAD);
505 505 ppattr = hat_page_getattr(pp,
506 506 P_REF | P_MOD);
507 507 goto ck_refmod;
508 508 }
509 509 /*
510 510 * Update statistics for pages being
511 511 * freed
512 512 */
513 513 if (pp->p_vnode) {
514 514 if (IS_SWAPFSVP(pp->p_vnode)) {
515 515 anonfree++;
516 516 } else {
517 517 if (pp->p_vnode->v_flag
518 518 & VVMEXEC) {
519 519 execfree++;
520 520 } else {
521 521 fsfree++;
522 522 }
523 523 }
524 524 }
525 525 /*LINTED: constant in conditional ctx*/
526 526 VN_DISPOSE(pp, B_FREE,
527 527 (flags & B_DONTNEED), kcred);
528 528 dfree++;
529 529 } else {
530 530 page_unlock(pp);
531 531 pgrec++;
532 532 TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
533 533 "page_ws_free:pp %p", pp);
534 534 }
535 535 } else {
536 536 /*
537 537 * Page is either `locked' in memory
538 538 * or was reclaimed and now has a
539 539 * "shared" lock, so release it.
540 540 */
541 541 page_unlock(pp);
542 542 }
543 543 } else {
544 544 /*
545 545 * Neither B_FREE nor B_INVAL nor B_ERROR.
546 546 * Just release locks.
547 547 */
548 548 page_io_unlock(pp);
549 549 page_unlock(pp);
550 550 }
551 551 }
552 552
553 553 CPU_STATS_ENTER_K();
554 554 cpup = CPU; /* get cpup now that CPU cannot change */
555 555 CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
556 556 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
557 557 CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
558 558 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
559 559 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
560 560 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
561 561 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
562 562 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
563 563 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
564 564 CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
565 565 CPU_STATS_EXIT_K();
566 566
567 567 /* Kernel probe */
568 568 TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
569 569 tnf_opaque, vnode, vp,
570 570 tnf_ulong, pages_pageout, pgpgout,
571 571 tnf_ulong, pages_freed, dfree,
572 572 tnf_ulong, pages_reclaimed, pgrec);
573 573 }
574 574
575 575 /*
576 576 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
577 577 * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
578 578 * operation and is only to be considered if it doesn't involve any
579 579 * waiting here. B_TRUNC indicates that the file is being truncated
580 580 * and so no i/o needs to be done. B_FORCE indicates that the page
581 581	 * must be destroyed so don't try writing it out.
582 582 *
583 583 * The caller must ensure that the page is locked. Returns 1, if
584 584 * the page should be written back (the "iolock" is held in this
585 585 * case), or 0 if the page has been dealt with or has been
586 586 * unlocked.
587 587 */
588 588 int
589 589 pvn_getdirty(page_t *pp, int flags)
590 590 {
591 591 ASSERT((flags & (B_INVAL | B_FREE)) ?
592 592 PAGE_EXCL(pp) : PAGE_SHARED(pp));
593 593 ASSERT(PP_ISFREE(pp) == 0);
594 594
595 595 /*
596 596 * If trying to invalidate or free a logically `locked' page,
597 597 * forget it. Don't need page_struct_lock to check p_lckcnt and
598 598 * p_cowcnt as the page is exclusively locked.
599 599 */
600 600 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
601 601 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
602 602 page_unlock(pp);
603 603 return (0);
604 604 }
605 605
606 606 /*
607 607 * Now acquire the i/o lock so we can add it to the dirty
608 608 * list (if necessary). We avoid blocking on the i/o lock
609 609 * in the following cases:
610 610 *
611 611 * If B_DELWRI is set, which implies that this request is
612 612		 * due to a klustering operation.
613 613 *
614 614 * If this is an async (B_ASYNC) operation and we are not doing
615 615 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
616 616		 * that the page is written out].
617 617 */
618 618 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
619 619 if (!page_io_trylock(pp)) {
620 620 page_unlock(pp);
621 621 return (0);
622 622 }
623 623 } else {
624 624 page_io_lock(pp);
625 625 }
626 626
627 627 /*
628 628 * If we want to free or invalidate the page then
629 629 * we need to unload it so that anyone who wants
630 630 * it will have to take a minor fault to get it.
631 631 * Otherwise, we're just writing the page back so we
632 632		 * need to sync up the hardware and software mod bit to
633 633 * detect any future modifications. We clear the
634 634 * software mod bit when we put the page on the dirty
635 635 * list.
636 636 */
637 637 if (flags & (B_INVAL | B_FREE)) {
638 638 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
639 639 } else {
640 640 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
641 641 }
642 642
643 643 if (!hat_ismod(pp) || (flags & B_TRUNC)) {
644 644 /*
645 645 * Don't need to add it to the
646 646 * list after all.
647 647 */
648 648 page_io_unlock(pp);
649 649 if (flags & B_INVAL) {
650 650 /*LINTED: constant in conditional context*/
651 651 VN_DISPOSE(pp, B_INVAL, 0, kcred);
652 652 } else if (flags & B_FREE) {
653 653 /*LINTED: constant in conditional context*/
654 654 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
655 655 } else {
656 656 /*
657 657				 * This is an advisory path for the callers
658 658 * of VOP_PUTPAGE() who prefer freeing the
659 659 * page _only_ if no one else is accessing it.
660 660 * E.g. segmap_release()
661 661 *
662 662 * The above hat_ismod() check is useless because:
663 663 * (1) we may not be holding SE_EXCL lock;
664 664 * (2) we've not unloaded _all_ translations
665 665 *
666 666 * Let page_release() do the heavy-lifting.
667 667 */
668 668 (void) page_release(pp, 1);
669 669 }
670 670 return (0);
671 671 }
672 672
673 673 /*
674 674 * Page is dirty, get it ready for the write back
675 675 * and add page to the dirty list.
676 676 */
677 677 hat_clrrefmod(pp);
678 678
679 679 /*
680 680 * If we're going to free the page when we're done
681 681 * then we can let others try to use it starting now.
682 682 * We'll detect the fact that they used it when the
683 683 * i/o is done and avoid freeing the page.
684 684 */
685 685 if (flags & B_FREE)
686 686 page_downgrade(pp);
687 687
688 688
689 689 TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
690 690
691 691 return (1);
692 692 }
693 693
694 694
695 695 /*ARGSUSED*/
696 696 static int
697 697 marker_constructor(void *buf, void *cdrarg, int kmflags)
698 698 {
699 699 page_t *mark = buf;
700 700 bzero(mark, sizeof (page_t));
701 701 mark->p_hash = PVN_VPLIST_HASH_TAG;
702 702 return (0);
703 703 }
704 704
705 705 void
706 706 pvn_init()
707 707 {
708 708 if (pvn_vmodsort_disable == 0)
709 709 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
710 710 marker_cache = kmem_cache_create("marker_cache",
711 711 sizeof (page_t), 0, marker_constructor,
712 712 NULL, NULL, NULL, NULL, 0);
713 713 }
714 714
715 715
716 716 /*
717 717 * Process a vnode's page list for all pages whose offset is >= off.
718 718 * Pages are to either be free'd, invalidated, or written back to disk.
719 719 *
720 720 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
721 721 * is specified, otherwise they are "shared" locked.
722 722 *
723 723 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
724 724 *
725 725 * Special marker page_t's are inserted in the list in order
726 726 * to keep track of where we are in the list when locks are dropped.
727 727 *
728 728 * Note the list is circular and insertions can happen only at the
729 729 * head and tail of the list. The algorithm ensures visiting all pages
730 730 * on the list in the following way:
731 731 *
732 732 * Drop two marker pages at the end of the list.
733 733 *
734 734 * Move one marker page backwards towards the start of the list until
735 735 * it is at the list head, processing the pages passed along the way.
736 736 *
737 737 * Due to race conditions when the vphm mutex is dropped, additional pages
738 738 * can be added to either end of the list, so we'll continue to move
739 739 * the marker and process pages until it is up against the end marker.
740 740 *
741 741 * There is one special exit condition. If we are processing a VMODSORT
742 742 * vnode and only writing back modified pages, we can stop as soon as
743 743 * we run into an unmodified page. This makes fsync(3) operations fast.
744 744 */
745 745 int
746 746 pvn_vplist_dirty(
747 747 vnode_t *vp,
748 748 u_offset_t off,
749 749 int (*putapage)(vnode_t *, page_t *, u_offset_t *,
750 750 size_t *, int, cred_t *),
751 751 int flags,
752 752 cred_t *cred)
753 753 {
754 754 page_t *pp;
755 755 page_t *mark; /* marker page that moves toward head */
756 756 page_t *end; /* marker page at end of list */
757 757 int err = 0;
758 758 int error;
759 759 kmutex_t *vphm;
760 760 se_t se;
761 761 page_t **where_to_move;
762 762
763 763 ASSERT(vp->v_type != VCHR);
764 764
765 765 if (vp->v_pages == NULL)
766 766 return (0);
767 767
768 768
769 769 /*
770 770 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
771 771 *
772 772 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
773 773 * from getting blocked while flushing pages to a dead NFS server.
774 774 */
775 775 mutex_enter(&vp->v_lock);
776 776 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
777 777 mutex_exit(&vp->v_lock);
778 778 return (EAGAIN);
779 779 }
780 780
781 781 while (vp->v_flag & VVMLOCK)
782 782 cv_wait(&vp->v_cv, &vp->v_lock);
783 783
784 784 if (vp->v_pages == NULL) {
785 785 mutex_exit(&vp->v_lock);
786 786 return (0);
787 787 }
788 788
789 789 vp->v_flag |= VVMLOCK;
790 790 mutex_exit(&vp->v_lock);
791 791
792 792
793 793 /*
794 794 * Set up the marker pages used to walk the list
795 795 */
796 796 end = kmem_cache_alloc(marker_cache, KM_SLEEP);
797 797 end->p_vnode = vp;
798 798 end->p_offset = (u_offset_t)-2;
799 799 mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
800 800 mark->p_vnode = vp;
801 801 mark->p_offset = (u_offset_t)-1;
802 802
803 803 /*
804 804		 * Grab the lock protecting the vnode's page list;
805 805 * note that this lock is dropped at times in the loop.
806 806 */
807 807 vphm = page_vnode_mutex(vp);
808 808 mutex_enter(vphm);
809 809 if (vp->v_pages == NULL)
810 810 goto leave;
811 811
812 812 /*
813 813 * insert the markers and loop through the list of pages
814 814 */
815 815 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
816 816 page_vpadd(&mark->p_vpnext, end);
817 817 for (;;) {
818 818
819 819 /*
820 820 * If only doing an async write back, then we can
821 821 * stop as soon as we get to start of the list.
822 822 */
823 823 if (flags == B_ASYNC && vp->v_pages == mark)
824 824 break;
825 825
826 826 /*
827 827 * otherwise stop when we've gone through all the pages
828 828 */
829 829 if (mark->p_vpprev == end)
830 830 break;
831 831
832 832 pp = mark->p_vpprev;
833 833 if (vp->v_pages == pp)
834 834 where_to_move = &vp->v_pages;
835 835 else
836 836 where_to_move = &pp->p_vpprev->p_vpnext;
837 837
838 838 ASSERT(pp->p_vnode == vp);
839 839
840 840 /*
841 841 * If just flushing dirty pages to disk and this vnode
842 842 * is using a sorted list of pages, we can stop processing
843 843			 * as soon as we find an unmodified page, since all the
844 844 * modified pages are visited first.
845 845 */
846 846 if (IS_VMODSORT(vp) &&
847 847 !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
848 848 if (!hat_ismod(pp) && !page_io_locked(pp)) {
849 849 #ifdef DEBUG
850 850 /*
851 851 * For debug kernels examine what should be
852 852 * all the remaining clean pages, asserting
853 853 * that they are not modified.
854 854 */
855 855 page_t *chk = pp;
856 856 int attr;
857 857
858 858 page_vpsub(&vp->v_pages, mark);
859 859 page_vpadd(where_to_move, mark);
860 860 do {
861 861 chk = chk->p_vpprev;
862 862 ASSERT(chk != end);
863 863 if (chk == mark)
864 864 continue;
865 865 attr = hat_page_getattr(chk, P_MOD |
866 866 P_REF);
867 867 if ((attr & P_MOD) == 0)
868 868 continue;
869 869 panic("v_pages list not all clean: "
870 870 "page_t*=%p vnode=%p off=%lx "
871 871 "attr=0x%x last clean page_t*=%p\n",
872 872 (void *)chk, (void *)chk->p_vnode,
873 873 (long)chk->p_offset, attr,
874 874 (void *)pp);
875 875 } while (chk != vp->v_pages);
876 876 #endif
877 877 break;
878 878 } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
879 879 /*
880 880 * Couldn't get io lock, wait until IO is done.
881 881 * Block only for sync IO since we don't want
882 882 * to block async IO.
883 883 */
884 884 mutex_exit(vphm);
885 885 page_io_wait(pp);
886 886 mutex_enter(vphm);
887 887 continue;
888 888 }
889 889 }
890 890
891 891 /*
892 892 * Skip this page if the offset is out of the desired range.
893 893 * Just move the marker and continue.
894 894 */
895 895 if (pp->p_offset < off) {
896 896 page_vpsub(&vp->v_pages, mark);
897 897 page_vpadd(where_to_move, mark);
898 898 continue;
899 899 }
900 900
901 901 /*
902 902 * If we are supposed to invalidate or free this
903 903 * page, then we need an exclusive lock.
904 904 */
905 905 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
906 906
907 907 /*
908 908 * We must acquire the page lock for all synchronous
909 909 * operations (invalidate, free and write).
910 910 */
911 911 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
912 912 /*
913 913 * If the page_lock() drops the mutex
914 914 * we must retry the loop.
915 915 */
916 916 if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
917 917 continue;
918 918
919 919 /*
920 920 * It's ok to move the marker page now.
921 921 */
922 922 page_vpsub(&vp->v_pages, mark);
923 923 page_vpadd(where_to_move, mark);
924 924 } else {
925 925
926 926 /*
927 927 * update the marker page for all remaining cases
928 928 */
929 929 page_vpsub(&vp->v_pages, mark);
930 930 page_vpadd(where_to_move, mark);
931 931
932 932 /*
933 933				 * For write backs, if we can't lock the page, it's
934 934 * invalid or in the process of being destroyed. Skip
935 935 * it, assuming someone else is writing it.
936 936 */
937 937 if (!page_trylock(pp, se))
938 938 continue;
939 939 }
940 940
941 941 ASSERT(pp->p_vnode == vp);
942 942
943 943 /*
944 944 * Successfully locked the page, now figure out what to
945 945 * do with it. Free pages are easily dealt with, invalidate
946 946 * if desired or just go on to the next page.
947 947 */
948 948 if (PP_ISFREE(pp)) {
949 949 if ((flags & B_INVAL) == 0) {
950 950 page_unlock(pp);
951 951 continue;
952 952 }
953 953
954 954 /*
955 955 * Invalidate (destroy) the page.
956 956 */
957 957 mutex_exit(vphm);
958 958 page_destroy_free(pp);
959 959 mutex_enter(vphm);
960 960 continue;
961 961 }
962 962
963 963		 * pvn_getdirty() figures out what to do with a dirty page.
964 964 * pvn_getdirty() figures out what do do with a dirty page.
965 965 * If the page is dirty, the putapage() routine will write it
966 966 * and will kluster any other adjacent dirty pages it can.
967 967 *
968 968 * pvn_getdirty() and `(*putapage)' unlock the page.
969 969 */
970 970 mutex_exit(vphm);
971 971 if (pvn_getdirty(pp, flags)) {
972 972 error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
973 973 if (!err)
974 974 err = error;
975 975 }
976 976 mutex_enter(vphm);
977 977 }
978 978 page_vpsub(&vp->v_pages, mark);
979 979 page_vpsub(&vp->v_pages, end);
980 980
981 981 leave:
982 982 /*
983 983 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
984 984 */
985 985 mutex_exit(vphm);
986 986 kmem_cache_free(marker_cache, mark);
987 987 kmem_cache_free(marker_cache, end);
988 988 mutex_enter(&vp->v_lock);
989 989 vp->v_flag &= ~VVMLOCK;
990 990 cv_broadcast(&vp->v_cv);
991 991 mutex_exit(&vp->v_lock);
992 992 return (err);
993 993 }
994 994
995 995 /*
996 996 * Walk the vp->v_pages list, for every page call the callback function
997 997 * pointed by *page_check. If page_check returns non-zero, then mark the
998 998 * page as modified and if VMODSORT is set, move it to the end of v_pages
999 999 * list. Moving makes sense only if we have at least two pages - this also
1000 1000 * avoids having v_pages temporarily being NULL after calling page_vpsub()
1001 1001 * if there was just one page.
1002 1002 */
1003 1003 void
1004 1004 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1005 1005 {
1006 1006 page_t *pp, *next, *end;
1007 1007 kmutex_t *vphm;
1008 1008 int shuffle;
1009 1009
1010 1010 vphm = page_vnode_mutex(vp);
1011 1011 mutex_enter(vphm);
1012 1012
1013 1013 if (vp->v_pages == NULL) {
1014 1014 mutex_exit(vphm);
1015 1015 return;
1016 1016 }
1017 1017
1018 1018 end = vp->v_pages->p_vpprev;
1019 1019 shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1020 1020 pp = vp->v_pages;
1021 1021
1022 1022 for (;;) {
1023 1023 next = pp->p_vpnext;
1024 1024 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1025 1025 /*
1026 1026 * hat_setmod_only() in contrast to hat_setmod() does
1027 1027 * not shuffle the pages and does not grab the mutex
1028 1028 * page_vnode_mutex. Exactly what we need.
1029 1029 */
1030 1030 hat_setmod_only(pp);
1031 1031 if (shuffle) {
1032 1032 page_vpsub(&vp->v_pages, pp);
1033 1033 ASSERT(vp->v_pages != NULL);
1034 1034 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1035 1035 pp);
1036 1036 }
1037 1037 }
1038 1038 /* Stop if we have just processed the last page. */
1039 1039 if (pp == end)
1040 1040 break;
1041 1041 pp = next;
1042 1042 }
1043 1043
1044 1044 mutex_exit(vphm);
1045 1045 }
1046 1046
1047 1047 /*
1048 1048 * Zero out zbytes worth of data. Caller should be aware that this
1049 1049 * routine may enter back into the fs layer (xxx_getpage). Locks
1050 1050 * that the xxx_getpage routine may need should not be held while
1051 1051 * calling this.
1052 1052 */
1053 1053 void
1054 1054 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1055 1055 {
1056 1056 caddr_t addr;
1057 1057
1058 1058 ASSERT(vp->v_type != VCHR);
1059 1059
1060 1060 if (vp->v_pages == NULL)
1061 1061 return;
1062 1062
1063 1063 /*
1064 1064 * zbytes may be zero but there still may be some portion of
1065 1065 * a page which needs clearing (since zbytes is a function
1066 1066 * of filesystem block size, not pagesize.)
1067 1067 */
1068 1068 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1069 1069 return;
1070 1070
1071 1071 /*
1072 1072 * We get the last page and handle the partial
1073 1073 * zeroing via kernel mappings. This will make the page
1074 1074 * dirty so that we know that when this page is written
1075 1075 * back, the zeroed information will go out with it. If
1076 1076 * the page is not currently in memory, then the kzero
1077 1077	 * operation will cause it to be brought in. We use kzero
1078 1078 * instead of bzero so that if the page cannot be read in
1079 1079 * for any reason, the system will not panic. We need
1080 1080 * to zero out a minimum of the fs given zbytes, but we
1081 1081 * might also have to do more to get the entire last page.
1082 1082 */
1083 1083
1084 1084 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1085 1085 panic("pvn_vptrunc zbytes");
1086 1086 addr = segmap_getmapflt(segkmap, vp, vplen,
1087 1087 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1088 1088 (void) kzero(addr + (vplen & MAXBOFFSET),
1089 1089 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1090 1090 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1091 1091 }
1092 1092
1093 1093 /*
1094 1094 * Handles common work of the VOP_GETPAGE routines by iterating page by page
1095 1095 * calling the getpage helper for each.
1096 1096 */
1097 1097 int
1098 1098 pvn_getpages(
1099 1099 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1100 1100 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1101 1101 struct vnode *vp,
1102 1102 u_offset_t off,
1103 1103 size_t len,
1104 1104 uint_t *protp,
1105 1105 page_t *pl[],
1106 1106 size_t plsz,
1107 1107 struct seg *seg,
1108 1108 caddr_t addr,
1109 1109 enum seg_rw rw,
1110 1110 struct cred *cred)
1111 1111 {
1112 1112 page_t **ppp;
1113 1113 u_offset_t o, eoff;
1114 1114 size_t sz, xlen;
1115 1115 int err;
1116 1116
1117 1117 /* ensure that we have enough space */
1118 1118 ASSERT(pl == NULL || plsz >= len);
1119 1119
1120 1120 /*
1121 1121 * Loop one page at a time and let getapage function fill
1122 1122 * in the next page in array. We only allow one page to be
1123 1123 * returned at a time (except for the last page) so that we
1124 1124 * don't have any problems with duplicates and other such
1125 1125 * painful problems. This is a very simple minded algorithm,
1126 1126	 * but it does the job correctly. We hope that a
1127 1127 * getapage call for a resident page that we might have been
1128 1128 * able to get from an earlier call doesn't cost too much.
1129 1129 */
1130 1130 ppp = pl;
1131 1131 sz = (pl != NULL) ? PAGESIZE : 0;
1132 1132 eoff = off + len;
1133 1133 xlen = len;
1134 1134 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1135 1135 xlen -= PAGESIZE) {
1136 1136 if (o + PAGESIZE >= eoff && pl != NULL) {
1137 1137 /*
1138 1138				 * Last time through - allow all of
1139 1139 * what's left of the pl[] array to be used.
1140 1140 */
1141 1141 sz = plsz - (o - off);
1142 1142 }
1143 1143 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1144 1144 rw, cred);
1145 1145 if (err) {
1146 1146 /*
1147 1147 * Release any pages we already got.
1148 1148 */
1149 1149 if (o > off && pl != NULL) {
1150 1150 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1151 1151 (void) page_release(*ppp, 1);
1152 1152 }
1153 1153 break;
1154 1154 }
1155 1155 if (pl != NULL)
1156 1156 ppp++;
1157 1157 }
1158 1158 return (err);
1159 1159 }
1160 1160
1161 1161 /*
1162 1162 * Initialize the page list array.
1163 1163 */
1164 1164 /*ARGSUSED*/
1165 1165 void
1166 1166 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1167 1167 u_offset_t off, size_t io_len, enum seg_rw rw)
1168 1168 {
1169 1169 ssize_t sz;
1170 1170 page_t *ppcur, **ppp;
1171 1171
1172 1172 /*
1173 1173 * Set up to load plsz worth
1174 1174 * starting at the needed page.
1175 1175 */
1176 1176 while (pp != NULL && pp->p_offset != off) {
1177 1177 /*
1178 1178 * Remove page from the i/o list,
1179 1179 * release the i/o and the page lock.
1180 1180 */
1181 1181 ppcur = pp;
1182 1182 page_sub(&pp, ppcur);
1183 1183 page_io_unlock(ppcur);
1184 1184 (void) page_release(ppcur, 1);
1185 1185 }
1186 1186
1187 1187 if (pp == NULL) {
1188 1188 pl[0] = NULL;
1189 1189 return;
1190 1190 }
1191 1191
1192 1192 sz = plsz;
1193 1193
1194 1194 /*
1195 1195 * Initialize the page list array.
1196 1196 */
1197 1197 ppp = pl;
1198 1198 do {
1199 1199 ppcur = pp;
1200 1200 *ppp++ = ppcur;
1201 1201 page_sub(&pp, ppcur);
1202 1202 page_io_unlock(ppcur);
1203 1203 if (rw != S_CREATE)
1204 1204 page_downgrade(ppcur);
1205 1205 sz -= PAGESIZE;
1206 1206 } while (sz > 0 && pp != NULL);
1207 1207 *ppp = NULL; /* terminate list */
1208 1208
1209 1209 /*
1210 1210 * Now free the remaining pages that weren't
1211 1211 * loaded in the page list.
1212 1212 */
1213 1213 while (pp != NULL) {
1214 1214 ppcur = pp;
1215 1215 page_sub(&pp, ppcur);
1216 1216 page_io_unlock(ppcur);
1217 1217 (void) page_release(ppcur, 1);
1218 1218 }
1219 1219 }
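As additional reviewer context (unchanged by this patch), the comments above pvn_write_kluster() and pvn_write_done() describe how a file system's putapage routine is expected to drive them. A rough, hypothetical sketch of that calling pattern follows; the function name, the MAXBSIZE alignment, and the elided I/O submission are assumptions for illustration only, not code from this file:

	/*ARGSUSED*/
	static int
	example_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
	    size_t *lenp, int flags, cred_t *cr)
	{
		u_offset_t io_off, blk_off;
		size_t io_len;
		int err = 0;		/* assumed result of the elided I/O */

		/* Kluster adjacent dirty pages around pp into one request. */
		blk_off = pp->p_offset & ~((u_offset_t)MAXBSIZE - 1);
		pp = pvn_write_kluster(vp, pp, &io_off, &io_len,
		    blk_off, MAXBSIZE, flags);

		/* ... issue the write for [io_off, io_off + io_len) and biowait() ... */

		/* Release the pages; on error they would be re-marked modified. */
		pvn_write_done(pp, (err ? B_ERROR : 0) | B_WRITE | flags);

		if (offp != NULL)
			*offp = io_off;
		if (lenp != NULL)
			*lenp = io_len;
		return (err);
	}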