1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38 
  39 /*
  40  * VM - paged vnode.
  41  *
  42  * This file supplies vm support for the vnode operations that deal with pages.
  43  */
  44 #include <sys/types.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/param.h>
  47 #include <sys/sysmacros.h>
  48 #include <sys/systm.h>
  49 #include <sys/time.h>
  50 #include <sys/buf.h>
  51 #include <sys/vnode.h>
  52 #include <sys/uio.h>
  53 #include <sys/vmsystm.h>
  54 #include <sys/mman.h>
  55 #include <sys/vfs.h>
  56 #include <sys/cred.h>
  57 #include <sys/user.h>
  58 #include <sys/kmem.h>
  59 #include <sys/cmn_err.h>
  60 #include <sys/debug.h>
  61 #include <sys/cpuvar.h>
  62 #include <sys/vtrace.h>
  63 #include <sys/tnf_probe.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/rm.h>
  69 #include <vm/pvn.h>
  70 #include <vm/page.h>
  71 #include <vm/seg_map.h>
  72 #include <vm/seg_kmem.h>
  73 #include <sys/fs/swapnode.h>
  74 
  75 int pvn_nofodklust = 0;
  76 int pvn_write_noklust = 0;
  77 
  78 uint_t pvn_vmodsort_supported = 0;      /* set if HAT supports VMODSORT */
  79 uint_t pvn_vmodsort_disable = 0;        /* set in /etc/system to disable HAT */
  80                                         /* support for vmodsort for testing */
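     /*
      * For example, vmodsort support can be disabled for testing by adding
      * the following line to /etc/system:
      *
      *      set pvn_vmodsort_disable = 1
      */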
  81 
  82 static struct kmem_cache *marker_cache = NULL;
  83 
  84 /*
  85  * Find the largest contiguous range of pages containing `addr' at file
  86  * offset `off' such that the range stays within the file system block
  87  * limits (`vp_off' and `vp_len') and the address space limits, no pages
  88  * currently exist for it, and it maps to consecutive file offsets.
  89  */
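     /*
      * A hypothetical sketch of how a file system getpage helper might use
      * this interface; fs_startio() is an illustrative placeholder for the
      * file system routine that maps the range to disk blocks, issues the
      * read and waits for it:
      *
      *      pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
      *          blkoff, blklen, isra);
      *      if (pp != NULL) {
      *              err = fs_startio(vp, pp, io_off, io_len, B_READ);
      *              if (err != 0)
      *                      pvn_read_done(pp, B_ERROR);
      *      }
      */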
  90 page_t *
  91 pvn_read_kluster(
  92         struct vnode *vp,
  93         u_offset_t off,
  94         struct seg *seg,
  95         caddr_t addr,
  96         u_offset_t *offp,                       /* return values */
  97         size_t *lenp,                           /* return values */
  98         u_offset_t vp_off,
  99         size_t vp_len,
 100         int isra)
 101 {
 102         ssize_t deltaf, deltab;
 103         page_t *pp;
 104         page_t *plist = NULL;
 105         spgcnt_t pagesavail;
 106         u_offset_t vp_end;
 107 
 108         ASSERT(off >= vp_off && off < vp_off + vp_len);
 109 
 110         /*
 111          * We only want to do klustering/read ahead if there
 112          * are more than minfree pages currently available.
 113          */
 114         pagesavail = freemem - minfree;
 115 
 116         if (pagesavail <= 0)
 117                 if (isra)
 118                         return ((page_t *)NULL);    /* ra case - give up */
 119                 else
 120                         pagesavail = 1;             /* must return a page */
 121 
 122         /* We calculate in pages instead of bytes due to 32-bit overflows */
 123         if (pagesavail < (spgcnt_t)btopr(vp_len)) {
 124                 /*
 125                  * Don't have enough free memory for the
 126                  * max request, try sizing down vp request.
 127                  */
 128                 deltab = (ssize_t)(off - vp_off);
 129                 vp_len -= deltab;
 130                 vp_off += deltab;
 131                 if (pagesavail < btopr(vp_len)) {
 132                         /*
 133                          * Still not enough memory, just settle for
 134                          * pagesavail which is at least 1.
 135                          */
 136                         vp_len = ptob(pagesavail);
 137                 }
 138         }
 139 
 140         vp_end = vp_off + vp_len;
 141         ASSERT(off >= vp_off && off < vp_end);
 142 
 143         if (isra && segop_kluster(seg, addr, 0))
 144                 return ((page_t *)NULL);        /* segment driver says no */
 145 
 146         if ((plist = page_create_va(vp, off,
 147             PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
 148                 return ((page_t *)NULL);
 149 
 150         if (vp_len <= PAGESIZE || pvn_nofodklust) {
 151                 *offp = off;
 152                 *lenp = MIN(vp_len, PAGESIZE);
 153         } else {
 154                 /*
 155                  * Scan backwards from the initial page, incrementing
 156                  * "deltab" and comparing "off" with "vp_off + deltab" to
 157                  * avoid "signed" versus "unsigned" conversion problems.
 158                  */
 159                 for (deltab = PAGESIZE; off >= vp_off + deltab;
 160                     deltab += PAGESIZE) {
 161                         /*
 162                          * Call back to the segment driver to verify that
 163                          * the klustering/read ahead operation makes sense.
 164                          */
 165                         if (segop_kluster(seg, addr, -deltab))
 166                                 break;          /* page not eligible */
 167                         if ((pp = page_create_va(vp, off - deltab,
 168                             PAGESIZE, PG_EXCL, seg, addr - deltab))
 169                             == NULL)
 170                                 break;          /* already have the page */
 171                         /*
 172                          * Add page to front of page list.
 173                          */
 174                         page_add(&plist, pp);
 175                 }
 176                 deltab -= PAGESIZE;
 177 
 178                 /* scan forward from front */
 179                 for (deltaf = PAGESIZE; off + deltaf < vp_end;
 180                     deltaf += PAGESIZE) {
 181                         /*
 182                          * Call back to the segment driver to verify that
 183                          * the klustering/read ahead operation makes sense.
 184                          */
 185                         if (segop_kluster(seg, addr, deltaf))
 186                                 break;          /* page not file extension */
 187                         if ((pp = page_create_va(vp, off + deltaf,
 188                             PAGESIZE, PG_EXCL, seg, addr + deltaf))
 189                             == NULL)
 190                                 break;          /* already have page */
 191 
 192                         /*
 193                          * Add page to end of page list.
 194                          */
 195                         page_add(&plist, pp);
 196                         plist = plist->p_next;
 197                 }
 198                 *offp = off = off - deltab;
 199                 *lenp = deltab + deltaf;
 200                 ASSERT(off >= vp_off);
 201 
 202                 /*
 203                  * If we ended up getting more than was actually
 204                  * requested, retract the returned length to only
 205                  * reflect what was requested.  This might happen
 206                  * if we were allowed to kluster pages across a
 207                  * span of (say) 5 frags, and frag size is less
 208                  * than PAGESIZE.  We need a whole number of
 209                  * pages to contain those frags, but the returned
 210                  * size should only allow the returned range to
 211                  * extend as far as the end of the frags.
 212                  */
 213                 if ((vp_off + vp_len) < (off + *lenp)) {
 214                         ASSERT(vp_end > off);
 215                         *lenp = vp_end - off;
 216                 }
 217         }
 218         TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
 219             "pvn_read_kluster:seg %p addr %x isra %x",
 220             seg, addr, isra);
 221         return (plist);
 222 }
 223 
 224 /*
 225  * Handle pages for this vnode on either side of the page "pp"
 226  * which has been locked by the caller.  This routine will also
 227  * do klustering in the range [vp_off, vp_off + vp_len) until it
 228  * encounters a page that is not found or not dirty.  The offset and
 229  * length of the pages included are returned in "*offp" and "*lenp".
 230  *
 231  * Returns a list of dirty locked pages all ready to be
 232  * written back.
 233  */
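     /*
      * A hypothetical sketch of a synchronous putapage path built on this
      * interface; fs_startio() is an illustrative placeholder that issues
      * the write and waits for it:
      *
      *      dirty = pvn_write_kluster(vp, pp, &io_off, &io_len,
      *          blkoff, blklen, flags);
      *      err = fs_startio(vp, dirty, io_off, io_len, B_WRITE | flags);
      *      pvn_write_done(dirty, B_WRITE | flags | (err ? B_ERROR : 0));
      */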
 234 page_t *
 235 pvn_write_kluster(
 236         struct vnode *vp,
 237         page_t *pp,
 238         u_offset_t *offp,               /* return values */
 239         size_t *lenp,                   /* return values */
 240         u_offset_t vp_off,
 241         size_t vp_len,
 242         int flags)
 243 {
 244         u_offset_t off;
 245         page_t *dirty;
 246         size_t deltab, deltaf;
 247         se_t se;
 248         u_offset_t vp_end;
 249 
 250         off = pp->p_offset;
 251 
 252         /*
 253          * Klustering should not be done if we are invalidating
 254          * pages since we could destroy pages that belong to
 255          * some other process if this is a swap vnode.
 256          */
 257         if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
 258                 *offp = off;
 259                 *lenp = PAGESIZE;
 260                 return (pp);
 261         }
 262 
 263         if (flags & (B_FREE | B_INVAL))
 264                 se = SE_EXCL;
 265         else
 266                 se = SE_SHARED;
 267 
 268         dirty = pp;
 269         /*
 270          * Scan backwards looking for pages to kluster by incrementing
 271          * "deltab" and comparing "off" with "vp_off + deltab" to
 272          * avoid "signed" versus "unsigned" conversion problems.
 273          */
 274         for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
 275                 pp = page_lookup_nowait(vp, off - deltab, se);
 276                 if (pp == NULL)
 277                         break;          /* page not found */
 278                 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
 279                         break;
 280                 page_add(&dirty, pp);
 281         }
 282         deltab -= PAGESIZE;
 283 
 284         vp_end = vp_off + vp_len;
 285         /* now scan forwards looking for pages to kluster */
 286         for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
 287                 pp = page_lookup_nowait(vp, off + deltaf, se);
 288                 if (pp == NULL)
 289                         break;          /* page not found */
 290                 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
 291                         break;
 292                 page_add(&dirty, pp);
 293                 dirty = dirty->p_next;
 294         }
 295 
 296         *offp = off - deltab;
 297         *lenp = deltab + deltaf;
 298         return (dirty);
 299 }
 300 
 301 /*
 302  * Generic entry point used to release the "shared/exclusive" lock
 303  * and the "p_iolock" on pages after i/o is complete.
 304  */
 305 void
 306 pvn_io_done(page_t *plist)
 307 {
 308         page_t *pp;
 309 
 310         while (plist != NULL) {
 311                 pp = plist;
 312                 page_sub(&plist, pp);
 313                 page_io_unlock(pp);
 314                 page_unlock(pp);
 315         }
 316 }
 317 
 318 /*
 319  * Entry point to be used by file system getpage subrs and
 320  * other such routines which either want to unlock pages (B_ASYNC
 321  * request) or destroy a list of pages if an error occurred.
 322  */
 323 void
 324 pvn_read_done(page_t *plist, int flags)
 325 {
 326         page_t *pp;
 327 
 328         while (plist != NULL) {
 329                 pp = plist;
 330                 page_sub(&plist, pp);
 331                 page_io_unlock(pp);
 332                 if (flags & B_ERROR) {
 333                         /*LINTED: constant in conditional context*/
 334                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 335                 } else {
 336                         (void) page_release(pp, 0);
 337                 }
 338         }
 339 }
 340 
 341 /*
 342  * Automagic pageout.
 343  * When memory gets tight, start freeing pages popping out of the
 344  * write queue.
 345  */
 346 int     write_free = 1;
 347 pgcnt_t pages_before_pager = 200;       /* LMXXX */
 348 
 349 /*
 350  * Routine to be called when page-out's complete.
 351  * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 352  * after waiting for i/o to complete (biowait) to free the list of
 353  * pages associated with the buffer.  These pages must be locked
 354  * before i/o is initiated.
 355  *
 356  * If a write error occurs, the pages are marked as modified
 357  * so the write will be re-tried later.
 358  */
 359 
 360 void
 361 pvn_write_done(page_t *plist, int flags)
 362 {
 363         int dfree = 0;
 364         int pgrec = 0;
 365         int pgout = 0;
 366         int pgpgout = 0;
 367         int anonpgout = 0;
 368         int anonfree = 0;
 369         int fspgout = 0;
 370         int fsfree = 0;
 371         int execpgout = 0;
 372         int execfree = 0;
 373         page_t *pp;
 374         struct cpu *cpup;
 375         struct vnode *vp = NULL;        /* for probe */
 376         uint_t ppattr;
 377         kmutex_t *vphm = NULL;
 378 
 379         ASSERT((flags & B_READ) == 0);
 380 
 381         /*
 382          * If we are about to start paging anyway, start freeing pages.
 383          */
 384         if (write_free && freemem < lotsfree + pages_before_pager &&
 385             (flags & B_ERROR) == 0) {
 386                 flags |= B_FREE;
 387         }
 388 
 389         /*
 390          * Handle each page involved in the i/o operation.
 391          */
 392         while (plist != NULL) {
 393                 pp = plist;
 394                 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
 395                 page_sub(&plist, pp);
 396 
 397                 /* Kernel probe support */
 398                 if (vp == NULL)
 399                         vp = pp->p_vnode;
 400 
 401                 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
 402                         /*
 403                          * Move page to the top of the v_pages list.
 404                          * Skip pages modified during IO.
 405                          */
 406                         vphm = page_vnode_mutex(vp);
 407                         mutex_enter(vphm);
 408                         if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
 409                                 page_vpsub(&vp->v_pages, pp);
 410                                 page_vpadd(&vp->v_pages, pp);
 411                         }
 412                         mutex_exit(vphm);
 413                 }
 414 
 415                 if (flags & B_ERROR) {
 416                         /*
 417                          * Write operation failed.  We don't want
 418                          * to destroy (or free) the page unless B_FORCE
 419                          * is set. We set the mod bit again and release
 420                          * all locks on the page so that it will get written
 421                          * back again later when things are hopefully
 422                          * better again.
 423                          * If B_INVAL and B_FORCE are set we really have
 424                          * to destroy the page.
 425                          */
 426                         if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
 427                                 page_io_unlock(pp);
 428                                 /*LINTED: constant in conditional context*/
 429                                 VN_DISPOSE(pp, B_INVAL, 0, kcred);
 430                         } else {
 431                                 hat_setmod_only(pp);
 432                                 page_io_unlock(pp);
 433                                 page_unlock(pp);
 434                         }
 435                 } else if (flags & B_INVAL) {
 436                         /*
 437                          * XXX - Failed writes with B_INVAL set are
 438                          * not handled appropriately.
 439                          */
 440                         page_io_unlock(pp);
 441                         /*LINTED: constant in conditional context*/
 442                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 443                 } else if ((flags & B_FREE) || !hat_page_is_mapped(pp)) {
 444                         /*
 445                          * Update statistics for pages being paged out
 446                          */
 447                         if (pp->p_vnode) {
 448                                 if (IS_SWAPFSVP(pp->p_vnode)) {
 449                                         anonpgout++;
 450                                 } else {
 451                                         if (pp->p_vnode->v_flag & VVMEXEC) {
 452                                                 execpgout++;
 453                                         } else {
 454                                                 fspgout++;
 455                                         }
 456                                 }
 457                         }
 458                         page_io_unlock(pp);
 459                         pgout = 1;
 460                         pgpgout++;
 461                         TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
 462                             "page_ws_out:pp %p", pp);
 463 
 464                         /*
 465                          * The page_struct_lock need not be acquired to
 466                          * examine "p_lckcnt" and "p_cowcnt" since we'll
 467                          * have an "exclusive" lock if the upgrade succeeds.
 468                          */
 469                         if (page_tryupgrade(pp) &&
 470                             pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
 471                                 /*
 472                                  * Check if someone has reclaimed the
 473                                  * page.  If ref and mod are not set, no
 474                                  * one is using it so we can free it.
 475                                  * The rest of the system is careful
 476                                  * to use the NOSYNC flag to unload
 477                                  * translations set up for i/o w/o
 478                                  * affecting ref and mod bits.
 479                                  *
 480                                  * Obtain a copy of the real hardware mod
 481                                  * bit using hat_pagesync(pp, HAT_SYNC_DONTZERO)
 482                                  * to avoid having to flush the cache.
 483                                  */
 484                                 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
 485                                     HAT_SYNC_STOPON_MOD);
 486                         ck_refmod:
 487                                 if (!(ppattr & (P_REF | P_MOD))) {
 488                                         if (hat_page_is_mapped(pp)) {
 489                                                 /*
 490                                                  * Doesn't look like the page
 491                                                  * was modified so now we
 492                                                  * really have to unload the
 493                                                  * translations.  Meanwhile
 494                                                  * another CPU could've
 495                                                  * modified it so we have to
 496                                                  * check again.  We don't loop
 497                                                  * forever here because now
 498                                                  * the translations are gone
 499                                                  * and no one can get a new one
 500                                                  * since we have the "exclusive"
 501                                                  * lock on the page.
 502                                                  */
 503                                                 (void) hat_pageunload(pp,
 504                                                     HAT_FORCE_PGUNLOAD);
 505                                                 ppattr = hat_page_getattr(pp,
 506                                                     P_REF | P_MOD);
 507                                                 goto ck_refmod;
 508                                         }
 509                                         /*
 510                                          * Update statistics for pages being
 511                                          * freed
 512                                          */
 513                                         if (pp->p_vnode) {
 514                                                 if (IS_SWAPFSVP(pp->p_vnode)) {
 515                                                         anonfree++;
 516                                                 } else {
 517                                                         if (pp->p_vnode->v_flag
 518                                                             & VVMEXEC) {
 519                                                                 execfree++;
 520                                                         } else {
 521                                                                 fsfree++;
 522                                                         }
 523                                                 }
 524                                         }
 525                                         /*LINTED: constant in conditional ctx*/
 526                                         VN_DISPOSE(pp, B_FREE,
 527                                             (flags & B_DONTNEED), kcred);
 528                                         dfree++;
 529                                 } else {
 530                                         page_unlock(pp);
 531                                         pgrec++;
 532                                         TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
 533                                             "page_ws_free:pp %p", pp);
 534                                 }
 535                         } else {
 536                                 /*
 537                                  * Page is either `locked' in memory
 538                                  * or was reclaimed and now has a
 539                                  * "shared" lock, so release it.
 540                                  */
 541                                 page_unlock(pp);
 542                         }
 543                 } else {
 544                         /*
 545                          * Neither B_FREE nor B_INVAL nor B_ERROR.
 546                          * Just release locks.
 547                          */
 548                         page_io_unlock(pp);
 549                         page_unlock(pp);
 550                 }
 551         }
 552 
 553         CPU_STATS_ENTER_K();
 554         cpup = CPU;             /* get cpup now that CPU cannot change */
 555         CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
 556         CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
 557         CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
 558         CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
 559         CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
 560         CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
 561         CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
 562         CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
 563         CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
 564         CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
 565         CPU_STATS_EXIT_K();
 566 
 567         /* Kernel probe */
 568         TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
 569             tnf_opaque, vnode,                  vp,
 570             tnf_ulong,  pages_pageout,          pgpgout,
 571             tnf_ulong,  pages_freed,            dfree,
 572             tnf_ulong,  pages_reclaimed,        pgrec);
 573 }
 574 
 575 /*
 576  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 577  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 578  * operation and is only to be considered if it doesn't involve any
 579  * waiting here.  B_TRUNC indicates that the file is being truncated
 580  * and so no i/o needs to be done. B_FORCE indicates that the page
 581  * must be destroyed so don't try writing it out.
 582  *
 583  * The caller must ensure that the page is locked.  Returns 1 if
 584  * the page should be written back (the "iolock" is held in this
 585  * case), or 0 if the page has been dealt with or has been
 586  * unlocked.
 587  */
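     /*
      * Typical use (see pvn_vplist_dirty() below): with the page locked,
      * the caller hands any page for which this returns 1 to its putapage
      * routine, which performs the i/o and drops both locks, e.g.
      *
      *      if (pvn_getdirty(pp, flags))
      *              err = (*putapage)(vp, pp, NULL, NULL, flags, cred);
      */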
 588 int
 589 pvn_getdirty(page_t *pp, int flags)
 590 {
 591         ASSERT((flags & (B_INVAL | B_FREE)) ?
 592             PAGE_EXCL(pp) : PAGE_SHARED(pp));
 593         ASSERT(PP_ISFREE(pp) == 0);
 594 
 595         /*
 596          * If trying to invalidate or free a logically `locked' page,
 597          * forget it.  Don't need page_struct_lock to check p_lckcnt and
 598          * p_cowcnt as the page is exclusively locked.
 599          */
 600         if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
 601             (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
 602                 page_unlock(pp);
 603                 return (0);
 604         }
 605 
 606         /*
 607          * Now acquire the i/o lock so we can add it to the dirty
 608          * list (if necessary).  We avoid blocking on the i/o lock
 609          * in the following cases:
 610          *
 611          *      If B_DELWRI is set, which implies that this request is
 612          *      due to a klustering operation.
 613          *
 614          *      If this is an async (B_ASYNC) operation and we are not doing
 615          *      invalidation (B_INVAL) [The current i/o or fsflush will ensure
 616          *      that the page is written out].
 617          */
 618         if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
 619                 if (!page_io_trylock(pp)) {
 620                         page_unlock(pp);
 621                         return (0);
 622                 }
 623         } else {
 624                 page_io_lock(pp);
 625         }
 626 
 627         /*
 628          * If we want to free or invalidate the page then
 629          * we need to unload it so that anyone who wants
 630          * it will have to take a minor fault to get it.
 631          * Otherwise, we're just writing the page back so we
 632          * need to sync up the hardware and software mod bits to
 633          * detect any future modifications.  We clear the
 634          * software mod bit when we put the page on the dirty
 635          * list.
 636          */
 637         if (flags & (B_INVAL | B_FREE)) {
 638                 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 639         } else {
 640                 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
 641         }
 642 
 643         if (!hat_ismod(pp) || (flags & B_TRUNC)) {
 644                 /*
 645                  * Don't need to add it to the
 646                  * list after all.
 647                  */
 648                 page_io_unlock(pp);
 649                 if (flags & B_INVAL) {
 650                         /*LINTED: constant in conditional context*/
 651                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 652                 } else if (flags & B_FREE) {
 653                         /*LINTED: constant in conditional context*/
 654                         VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
 655                 } else {
 656                         /*
 657                          * This is an advisory path for callers
 658                          * of VOP_PUTPAGE() who prefer freeing the
 659                          * page _only_ if no one else is accessing it.
 660                          * E.g. segmap_release()
 661                          *
 662                          * The above hat_ismod() check is useless because:
 663                          * (1) we may not be holding SE_EXCL lock;
 664                          * (2) we've not unloaded _all_ translations
 665                          *
 666                          * Let page_release() do the heavy-lifting.
 667                          */
 668                         (void) page_release(pp, 1);
 669                 }
 670                 return (0);
 671         }
 672 
 673         /*
 674          * Page is dirty, get it ready for the write back
 675          * and add page to the dirty list.
 676          */
 677         hat_clrrefmod(pp);
 678 
 679         /*
 680          * If we're going to free the page when we're done
 681          * then we can let others try to use it starting now.
 682          * We'll detect the fact that they used it when the
 683          * i/o is done and avoid freeing the page.
 684          */
 685         if (flags & B_FREE)
 686                 page_downgrade(pp);
 687 
 688 
 689         TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
 690 
 691         return (1);
 692 }
 693 
 694 
 695 /*ARGSUSED*/
 696 static int
 697 marker_constructor(void *buf, void *cdrarg, int kmflags)
 698 {
 699         page_t *mark = buf;
 700         bzero(mark, sizeof (page_t));
 701         mark->p_hash = PVN_VPLIST_HASH_TAG;
 702         return (0);
 703 }
 704 
 705 void
 706 pvn_init()
 707 {
 708         if (pvn_vmodsort_disable == 0)
 709                 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
 710         marker_cache = kmem_cache_create("marker_cache",
 711             sizeof (page_t), 0, marker_constructor,
 712             NULL, NULL, NULL, NULL, 0);
 713 }
 714 
 715 
 716 /*
 717  * Process a vnode's page list for all pages whose offset is >= off.
 718  * Pages are to either be free'd, invalidated, or written back to disk.
 719  *
 720  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 721  * is specified, otherwise they are "shared" locked.
 722  *
 723  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 724  *
 725  * Special marker page_t's are inserted in the list in order
 726  * to keep track of where we are in the list when locks are dropped.
 727  *
 728  * Note the list is circular and insertions can happen only at the
 729  * head and tail of the list. The algorithm ensures visiting all pages
 730  * on the list in the following way:
 731  *
 732  *    Drop two marker pages at the end of the list.
 733  *
 734  *    Move one marker page backwards towards the start of the list until
 735  *    it is at the list head, processing the pages passed along the way.
 736  *
 737  *    Due to race conditions when the vphm mutex is dropped, additional pages
 738  *    can be added to either end of the list, so we'll continue to move
 739  *    the marker and process pages until it is up against the end marker.
 740  *
 741  * There is one special exit condition. If we are processing a VMODSORT
 742  * vnode and only writing back modified pages, we can stop as soon as
 743  * we run into an unmodified page.  This makes fsync(3) operations fast.
 744  */
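     /*
      * A hypothetical sketch of a file system putpage entry point using
      * this routine for the "whole file" case (len == 0); fs_putapage()
      * stands in for the file system's own putapage callback:
      *
      *      if (len == 0)
      *              err = pvn_vplist_dirty(vp, off, fs_putapage, flags, cr);
      */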
 745 int
 746 pvn_vplist_dirty(
 747         vnode_t         *vp,
 748         u_offset_t      off,
 749         int             (*putapage)(vnode_t *, page_t *, u_offset_t *,
 750                         size_t *, int, cred_t *),
 751         int             flags,
 752         cred_t          *cred)
 753 {
 754         page_t          *pp;
 755         page_t          *mark;          /* marker page that moves toward head */
 756         page_t          *end;           /* marker page at end of list */
 757         int             err = 0;
 758         int             error;
 759         kmutex_t        *vphm;
 760         se_t            se;
 761         page_t          **where_to_move;
 762 
 763         ASSERT(vp->v_type != VCHR);
 764 
 765         if (vp->v_pages == NULL)
 766                 return (0);
 767 
 768 
 769         /*
 770          * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
 771          *
 772          * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
 773          * from getting blocked while flushing pages to a dead NFS server.
 774          */
 775         mutex_enter(&vp->v_lock);
 776         if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
 777                 mutex_exit(&vp->v_lock);
 778                 return (EAGAIN);
 779         }
 780 
 781         while (vp->v_flag & VVMLOCK)
 782                 cv_wait(&vp->v_cv, &vp->v_lock);
 783 
 784         if (vp->v_pages == NULL) {
 785                 mutex_exit(&vp->v_lock);
 786                 return (0);
 787         }
 788 
 789         vp->v_flag |= VVMLOCK;
 790         mutex_exit(&vp->v_lock);
 791 
 792 
 793         /*
 794          * Set up the marker pages used to walk the list
 795          */
 796         end = kmem_cache_alloc(marker_cache, KM_SLEEP);
 797         end->p_vnode = vp;
 798         end->p_offset = (u_offset_t)-2;
 799         mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
 800         mark->p_vnode = vp;
 801         mark->p_offset = (u_offset_t)-1;
 802 
 803         /*
 804          * Grab the lock protecting the vnode's page list.  Note that
 805          * this lock is dropped at times in the loop.
 806          */
 807         vphm = page_vnode_mutex(vp);
 808         mutex_enter(vphm);
 809         if (vp->v_pages == NULL)
 810                 goto leave;
 811 
 812         /*
 813          * insert the markers and loop through the list of pages
 814          */
 815         page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
 816         page_vpadd(&mark->p_vpnext, end);
 817         for (;;) {
 818 
 819                 /*
 820                  * If only doing an async write back, then we can
 821                  * stop as soon as we get to the start of the list.
 822                  */
 823                 if (flags == B_ASYNC && vp->v_pages == mark)
 824                         break;
 825 
 826                 /*
 827                  * otherwise stop when we've gone through all the pages
 828                  */
 829                 if (mark->p_vpprev == end)
 830                         break;
 831 
 832                 pp = mark->p_vpprev;
 833                 if (vp->v_pages == pp)
 834                         where_to_move = &vp->v_pages;
 835                 else
 836                         where_to_move = &pp->p_vpprev->p_vpnext;
 837 
 838                 ASSERT(pp->p_vnode == vp);
 839 
 840                 /*
 841                  * If just flushing dirty pages to disk and this vnode
 842                  * is using a sorted list of pages, we can stop processing
 843                  * as soon as we find an unmodified page, since all the
 844                  * modified pages are visited first.
 845                  */
 846                 if (IS_VMODSORT(vp) &&
 847                     !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
 848                         if (!hat_ismod(pp) && !page_io_locked(pp)) {
 849 #ifdef  DEBUG
 850                                 /*
 851                                  * For debug kernels examine what should be
 852                                  * all the remaining clean pages, asserting
 853                                  * that they are not modified.
 854                                  */
 855                                 page_t  *chk = pp;
 856                                 int     attr;
 857 
 858                                 page_vpsub(&vp->v_pages, mark);
 859                                 page_vpadd(where_to_move, mark);
 860                                 do {
 861                                         chk = chk->p_vpprev;
 862                                         ASSERT(chk != end);
 863                                         if (chk == mark)
 864                                                 continue;
 865                                         attr = hat_page_getattr(chk, P_MOD |
 866                                             P_REF);
 867                                         if ((attr & P_MOD) == 0)
 868                                                 continue;
 869                                         panic("v_pages list not all clean: "
 870                                             "page_t*=%p vnode=%p off=%lx "
 871                                             "attr=0x%x last clean page_t*=%p\n",
 872                                             (void *)chk, (void *)chk->p_vnode,
 873                                             (long)chk->p_offset, attr,
 874                                             (void *)pp);
 875                                 } while (chk != vp->v_pages);
 876 #endif
 877                                 break;
 878                         } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
 879                                  * Page is clean but i/o is in progress;
 880                                  * wait for it to finish.  Block only for
 881                                  * sync requests, not async i/o.
 882                                  * to block async IO.
 883                                  */
 884                                 mutex_exit(vphm);
 885                                 page_io_wait(pp);
 886                                 mutex_enter(vphm);
 887                                 continue;
 888                         }
 889                 }
 890 
 891                 /*
 892                  * Skip this page if the offset is out of the desired range.
 893                  * Just move the marker and continue.
 894                  */
 895                 if (pp->p_offset < off) {
 896                         page_vpsub(&vp->v_pages, mark);
 897                         page_vpadd(where_to_move, mark);
 898                         continue;
 899                 }
 900 
 901                 /*
 902                  * If we are supposed to invalidate or free this
 903                  * page, then we need an exclusive lock.
 904                  */
 905                 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
 906 
 907                 /*
 908                  * We must acquire the page lock for all synchronous
 909                  * operations (invalidate, free and write).
 910                  */
 911                 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
 912                         /*
 913                          * If the page_lock() drops the mutex
 914                          * we must retry the loop.
 915                          */
 916                         if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
 917                                 continue;
 918 
 919                         /*
 920                          * It's ok to move the marker page now.
 921                          */
 922                         page_vpsub(&vp->v_pages, mark);
 923                         page_vpadd(where_to_move, mark);
 924                 } else {
 925 
 926                         /*
 927                          * update the marker page for all remaining cases
 928                          */
 929                         page_vpsub(&vp->v_pages, mark);
 930                         page_vpadd(where_to_move, mark);
 931 
 932                         /*
 933                          * For write backs, if we can't lock the page, it's
 934                          * invalid or in the process of being destroyed.  Skip
 935                          * it, assuming someone else is writing it.
 936                          */
 937                         if (!page_trylock(pp, se))
 938                                 continue;
 939                 }
 940 
 941                 ASSERT(pp->p_vnode == vp);
 942 
 943                 /*
 944                  * Successfully locked the page, now figure out what to
 945                  * do with it. Free pages are easily dealt with, invalidate
 946                  * if desired or just go on to the next page.
 947                  */
 948                 if (PP_ISFREE(pp)) {
 949                         if ((flags & B_INVAL) == 0) {
 950                                 page_unlock(pp);
 951                                 continue;
 952                         }
 953 
 954                         /*
 955                          * Invalidate (destroy) the page.
 956                          */
 957                         mutex_exit(vphm);
 958                         page_destroy_free(pp);
 959                         mutex_enter(vphm);
 960                         continue;
 961                 }
 962 
 963                 /*
 964                  * pvn_getdirty() figures out what to do with a dirty page.
 965                  * If the page is dirty, the putapage() routine will write it
 966                  * and will kluster any other adjacent dirty pages it can.
 967                  *
 968                  * pvn_getdirty() and `(*putapage)' unlock the page.
 969                  */
 970                 mutex_exit(vphm);
 971                 if (pvn_getdirty(pp, flags)) {
 972                         error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
 973                         if (!err)
 974                                 err = error;
 975                 }
 976                 mutex_enter(vphm);
 977         }
 978         page_vpsub(&vp->v_pages, mark);
 979         page_vpsub(&vp->v_pages, end);
 980 
 981 leave:
 982         /*
 983          * Release the v_pages mutex, clear VVMLOCK, and wake blocked threads.
 984          */
 985         mutex_exit(vphm);
 986         kmem_cache_free(marker_cache, mark);
 987         kmem_cache_free(marker_cache, end);
 988         mutex_enter(&vp->v_lock);
 989         vp->v_flag &= ~VVMLOCK;
 990         cv_broadcast(&vp->v_cv);
 991         mutex_exit(&vp->v_lock);
 992         return (err);
 993 }
 994 
 995 /*
 996  * Walk the vp->v_pages list and, for every page, call the callback function
 997  * pointed to by page_check. If page_check returns non-zero, then mark the
 998  * page as modified and, if VMODSORT is set, move it to the end of the v_pages
 999  * list. Moving makes sense only if we have at least two pages - this also
1000  * avoids having v_pages temporarily being NULL after calling page_vpsub()
1001  * if there was just one page.
1002  */
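     /*
      * A hypothetical callback that marks every page of a vnode dirty:
      *
      *      static int
      *      fs_mark_page(page_t *pp)
      *      {
      *              return (1);
      *      }
      *
      *      pvn_vplist_setdirty(vp, fs_mark_page);
      */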
1003 void
1004 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1005 {
1006         page_t  *pp, *next, *end;
1007         kmutex_t        *vphm;
1008         int     shuffle;
1009 
1010         vphm = page_vnode_mutex(vp);
1011         mutex_enter(vphm);
1012 
1013         if (vp->v_pages == NULL) {
1014                 mutex_exit(vphm);
1015                 return;
1016         }
1017 
1018         end = vp->v_pages->p_vpprev;
1019         shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1020         pp = vp->v_pages;
1021 
1022         for (;;) {
1023                 next = pp->p_vpnext;
1024                 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1025                         /*
1026                          * hat_setmod_only(), in contrast to hat_setmod(), does
1027                          * not shuffle the pages and does not grab the vnode's
1028                          * page mutex, which is exactly what we need here.
1029                          */
1030                         hat_setmod_only(pp);
1031                         if (shuffle) {
1032                                 page_vpsub(&vp->v_pages, pp);
1033                                 ASSERT(vp->v_pages != NULL);
1034                                 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1035                                     pp);
1036                         }
1037                 }
1038                 /* Stop if we have just processed the last page. */
1039                 if (pp == end)
1040                         break;
1041                 pp = next;
1042         }
1043 
1044         mutex_exit(vphm);
1045 }
1046 
1047 /*
1048  * Zero out zbytes worth of data. Caller should be aware that this
1049  * routine may enter back into the fs layer (xxx_getpage). Locks
1050  * that the xxx_getpage routine may need should not be held while
1051  * calling this.
1052  */
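     /*
      * For example, a file system whose file now ends part way into its
      * last block might zero the remainder of that block (bsize is the
      * file system block size; illustrative only):
      *
      *      boff = size % bsize;
      *      pvn_vpzero(vp, size, boff ? bsize - boff : 0);
      */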
1053 void
1054 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1055 {
1056         caddr_t addr;
1057 
1058         ASSERT(vp->v_type != VCHR);
1059 
1060         if (vp->v_pages == NULL)
1061                 return;
1062 
1063         /*
1064          * zbytes may be zero but there still may be some portion of
1065          * a page which needs clearing (since zbytes is a function
1066          * of filesystem block size, not pagesize.)
1067          */
1068         if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1069                 return;
1070 
1071         /*
1072          * We get the last page and handle the partial
1073          * zeroing via kernel mappings.  This will make the page
1074          * dirty so that we know that when this page is written
1075          * back, the zeroed information will go out with it.  If
1076          * the page is not currently in memory, then the kzero
1077  * operation will cause it to be brought in.  We use kzero
1078          * instead of bzero so that if the page cannot be read in
1079          * for any reason, the system will not panic.  We need
1080          * to zero out a minimum of the fs given zbytes, but we
1081  * to zero out at least the fs-supplied zbytes, but we
1082          */
1083 
1084         if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1085                 panic("pvn_vpzero zbytes");
1086         addr = segmap_getmapflt(segkmap, vp, vplen,
1087             MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1088         (void) kzero(addr + (vplen & MAXBOFFSET),
1089             MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1090         (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1091 }
1092 
1093 /*
1094  * Handles common work of the VOP_GETPAGE routines by iterating page by page
1095  * and calling the getpage helper for each.
1096  */
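     /*
      * A hypothetical VOP_GETPAGE implementation deferring to this routine;
      * fs_getapage() stands in for the file system's single-page helper:
      *
      *      err = pvn_getpages(fs_getapage, vp, off, len, protp, pl, plsz,
      *          seg, addr, rw, cr);
      */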
1097 int
1098 pvn_getpages(
1099         int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1100                 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1101         struct vnode *vp,
1102         u_offset_t off,
1103         size_t len,
1104         uint_t *protp,
1105         page_t *pl[],
1106         size_t plsz,
1107         struct seg *seg,
1108         caddr_t addr,
1109         enum seg_rw rw,
1110         struct cred *cred)
1111 {
1112         page_t **ppp;
1113         u_offset_t o, eoff;
1114         size_t sz, xlen;
1115         int err;
1116 
1117         /* ensure that we have enough space */
1118         ASSERT(pl == NULL || plsz >= len);
1119 
1120         /*
1121          * Loop one page at a time and let the getapage function fill
1122          * in the next page of the array.  We only allow one page to be
1123          * returned at a time (except for the last page) so that we
1124          * don't have any problems with duplicates and other such
1125          * painful problems.  This is a very simple-minded algorithm,
1126          * but it does the job correctly.  We hope that a getapage
1127          * call for a resident page that we might have been able to get
1128          * from an earlier call doesn't cost too much.
1129          */
1130         ppp = pl;
1131         sz = (pl != NULL) ? PAGESIZE : 0;
1132         eoff = off + len;
1133         xlen = len;
1134         for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1135             xlen -= PAGESIZE) {
1136                 if (o + PAGESIZE >= eoff && pl != NULL) {
1137                         /*
1138                          * Last time through - allow all of
1139                          * what's left of the pl[] array to be used.
1140                          */
1141                         sz = plsz - (o - off);
1142                 }
1143                 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1144                     rw, cred);
1145                 if (err) {
1146                         /*
1147                          * Release any pages we already got.
1148                          */
1149                         if (o > off && pl != NULL) {
1150                                 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1151                                         (void) page_release(*ppp, 1);
1152                         }
1153                         break;
1154                 }
1155                 if (pl != NULL)
1156                         ppp++;
1157         }
1158         return (err);
1159 }
1160 
1161 /*
1162  * Initialize the page list array pl[] from the locked i/o list "pp".
1163  */
1164 /*ARGSUSED*/
1165 void
1166 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1167     u_offset_t off, size_t io_len, enum seg_rw rw)
1168 {
1169         ssize_t sz;
1170         page_t *ppcur, **ppp;
1171 
1172         /*
1173          * Set up to load plsz worth
1174          * starting at the needed page.
1175          */
1176         while (pp != NULL && pp->p_offset != off) {
1177                 /*
1178                  * Remove page from the i/o list,
1179                  * release the i/o and the page lock.
1180                  */
1181                 ppcur = pp;
1182                 page_sub(&pp, ppcur);
1183                 page_io_unlock(ppcur);
1184                 (void) page_release(ppcur, 1);
1185         }
1186 
1187         if (pp == NULL) {
1188                 pl[0] = NULL;
1189                 return;
1190         }
1191 
1192         sz = plsz;
1193 
1194         /*
1195          * Initialize the page list array.
1196          */
1197         ppp = pl;
1198         do {
1199                 ppcur = pp;
1200                 *ppp++ = ppcur;
1201                 page_sub(&pp, ppcur);
1202                 page_io_unlock(ppcur);
1203                 if (rw != S_CREATE)
1204                         page_downgrade(ppcur);
1205                 sz -= PAGESIZE;
1206         } while (sz > 0 && pp != NULL);
1207         *ppp = NULL;            /* terminate list */
1208 
1209         /*
1210          * Now free the remaining pages that weren't
1211          * loaded in the page list.
1212          */
1213         while (pp != NULL) {
1214                 ppcur = pp;
1215                 page_sub(&pp, ppcur);
1216                 page_io_unlock(ppcur);
1217                 (void) page_release(ppcur, 1);
1218         }
1219 }