/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static  kmutex_t        blist_lock;     /* protects b_list */
static  kmutex_t        bhdr_lock;      /* protects the bhdrlist */
static  kmutex_t        bfree_lock;     /* protects the bfreelist structure */

struct hbuf     *hbuf;                  /* Hash buckets */
struct dwbuf    *dwbuf;                 /* Delayed write buckets */
static struct buf *bhdrlist;            /* buf header free list */
static int      nbuf;                   /* number of buffer headers allocated */

static int      lastindex;              /* Reference point on where to start */
                                        /* when looking for free buffers */

#define bio_bhash(dev, bn)      (hash2ints((dev), (int)(bn)) & v.v_hmask)
#define EMPTY_LIST      ((struct buf *)-1)

static kcondvar_t       bio_mem_cv;     /* Condition variables */
static kcondvar_t       bio_flushinval_cv;
static int      bio_doingflush;         /* flush in progress */
static int      bio_doinginval;         /* inval in progress */
static int      bio_flinv_cv_wanted;    /* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
        { "buffer_cache_lookups",               KSTAT_DATA_UINT32 },
        { "buffer_cache_hits",                  KSTAT_DATA_UINT32 },
        { "new_buffer_requests",                KSTAT_DATA_UINT32 },
        { "waits_for_buffer_allocs",            KSTAT_DATA_UINT32 },
        { "buffers_locked_by_someone",          KSTAT_DATA_UINT32 },
        { "duplicate_buffers_found",            KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t   *biostats_ptr = (kstat_named_t *)&biostats;
uint_t          biostats_ndata = (uint_t)(sizeof (biostats) /
                                        sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
        { "breads",                     KSTAT_DATA_UINT32 },
        { "bwrites",                    KSTAT_DATA_UINT32 },
        { "fbiwrites",                  KSTAT_DATA_UINT32 },
        { "getpages",                   KSTAT_DATA_UINT32 },
        { "getras",                     KSTAT_DATA_UINT32 },
        { "putsyncs",                   KSTAT_DATA_UINT32 },
        { "putasyncs",                  KSTAT_DATA_UINT32 },
        { "putpageios",                 KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * These are required because "#pragma weak ..." doesn't work in reverse
 * order: genunix (bio.c) is loaded before the ufs modules, so pointers
 * to ufs routines never get plugged into bio.c's calls. Instead we
 * initialize them when setting up the "lufsops" table in
 * "lufs.c:_init()".
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf       *bio_getfreeblk(long);
static void             bio_mem_get(long);
static void             bio_bhdr_free(struct buf *);
static struct buf       *bio_bhdr_alloc(void);
static void             bio_recycle(int, long);
static void             bio_pageio_done(struct buf *);
static int              bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define BIO_BUF_PERCENT (100/2)         /* default: 2% of memory */
#define BIO_MAX_PERCENT (100/20)        /* max is 20% of real memory */
#define BIO_BHDR_POOL   100             /* Default bhdr pool size */
#define BIO_MIN_HDR     10              /* Minimum number of buffer headers */
#define BIO_MIN_HWM     (BIO_MIN_HDR * MAXBSIZE / 1024)
#define BIO_HASHLEN     4               /* Target length of hash chains */


/* Flags for bio_recycle() */
#define BIO_HEADER      0x01
#define BIO_MEM         0x02

extern  int bufhwm;             /* User tunable - high water mark for mem  */
extern  int bufhwm_pct;         /* ditto - given in % of physmem  */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it. If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *      getblk
 *      bread/BREAD
 *      breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *      bwrite/BWRITE/brwrite
 *      bdwrite/bdrwrite
 *      bawrite
 *      brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io, is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs. It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */
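
/*
 * Example: the canonical read-modify-write cycle using the routines
 * below (a sketch only; dev, blkno, bsize and newdata are assumed to
 * be supplied by the caller):
 *
 *      struct buf *bp;
 *      int error;
 *
 *      bp = bread(dev, blkno, bsize);  (read or find the block)
 *      error = geterror(bp);           (b_sem is still held here)
 *      if (error) {
 *              brelse(bp);
 *              return (error);
 *      }
 *      bcopy(newdata, bp->b_un.b_addr, bsize);
 *      bdwrite(bp);                    (delayed write; releases bp)
 */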

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
        return (BREAD(dev, blkno, bsize));
}

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
        struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
        struct buf *bp;
        klwp_t *lwp = ttolwp(curthread);

        CPU_STATS_ADD_K(sys, lread, 1);
        bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
        if (bp->b_flags & B_DONE)
                return (bp);
        bp->b_flags |= B_READ;
        ASSERT(bp->b_bcount == bsize);
        if (ufsvfsp == NULL) {                                  /* !ufs */
                (void) bdev_strategy(bp);
        } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
                                                        /* ufs && logging */
                (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
        } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
                                                        /* ufs && snapshots */
                (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
        } else {
                ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
                ub.ub_breads.value.ul++;                /* ufs && !logging */
                (void) bdev_strategy(bp);
        }
        if (lwp != NULL)
                lwp->lwp_ru.inblock++;
        CPU_STATS_ADD_K(sys, bread, 1);
        (void) biowait(bp);
        return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
        struct buf *bp, *rabp;
        klwp_t *lwp = ttolwp(curthread);

        bp = NULL;
        if (!bio_incore(dev, blkno)) {
                CPU_STATS_ADD_K(sys, lread, 1);
                bp = GETBLK(dev, blkno, bsize);
                if ((bp->b_flags & B_DONE) == 0) {
                        bp->b_flags |= B_READ;
                        bp->b_bcount = bsize;
                        (void) bdev_strategy(bp);
                        if (lwp != NULL)
                                lwp->lwp_ru.inblock++;
                        CPU_STATS_ADD_K(sys, bread, 1);
                }
        }
        if (rablkno && bfreelist.b_bcount > 1 &&
            !bio_incore(dev, rablkno)) {
                rabp = GETBLK(dev, rablkno, bsize);
                if (rabp->b_flags & B_DONE)
                        brelse(rabp);
                else {
                        rabp->b_flags |= B_READ|B_ASYNC;
                        rabp->b_bcount = bsize;
                        (void) bdev_strategy(rabp);
                        if (lwp != NULL)
                                lwp->lwp_ru.inblock++;
                        CPU_STATS_ADD_K(sys, bread, 1);
                }
        }
        if (bp == NULL)
                return (BREAD(dev, blkno, bsize));
        (void) biowait(bp);
        return (bp);
}

/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
                                int do_relse, int clear_flags)
{
        register int do_wait;
        struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
        int flag;
        klwp_t *lwp = ttolwp(curthread);
        struct cpu *cpup;

        ASSERT(SEMA_HELD(&bp->b_sem));
        flag = bp->b_flags;
        bp->b_flags &= ~clear_flags;
        if (lwp != NULL)
                lwp->lwp_ru.oublock++;
        CPU_STATS_ENTER_K();
        cpup = CPU;             /* get pointer AFTER preemption is disabled */
        CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
        CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
        do_wait = ((flag & B_ASYNC) == 0 || force_wait);
        if (do_wait == 0)
                CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
        CPU_STATS_EXIT_K();
        if (ufsvfsp == NULL) {
                (void) bdev_strategy(bp);
        } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
                                                        /* ufs && logging */
                (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
        } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
                                                        /* ufs && snapshots */
                (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
        } else {
                ub.ub_bwrites.value.ul++;               /* ufs && !logging */
                (void) bdev_strategy(bp);
        }
        if (do_wait) {
                (void) biowait(bp);
                if (do_relse) {
                        brelse(bp);
                }
        }
}

/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
        BWRITE(bp);
}

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
        BWRITE2(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        CPU_STATS_ADD_K(sys, lwrite, 1);
        if ((bp->b_flags & B_DELWRI) == 0)
                bp->b_start = ddi_get_lbolt();
        /*
         * B_DONE allows others to use the buffer, B_DELWRI causes the
         * buffer to be written before being reused, and setting b_resid
         * to zero says the buffer is complete.
         */
        bp->b_flags |= B_DELWRI | B_DONE;
        bp->b_resid = 0;
        brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));

        /* Use bfreelist.b_bcount as a weird-ass heuristic */
        if (bfreelist.b_bcount > 4)
                bp->b_flags |= B_ASYNC;
        BWRITE(bp);
}
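
/*
 * Sketch of how the write flavors differ, for a buffer bp obtained
 * via getblk()/bread() with b_sem held ("critical" and "will_rewrite"
 * are hypothetical predicates of the caller):
 *
 *      if (critical)
 *              bwrite(bp);     (synchronous: wait, then release)
 *      else if (will_rewrite)
 *              bdwrite(bp);    (delay: another write is expected soon)
 *      else
 *              bawrite(bp);    (asynchronous: start I/O, don't wait)
 */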

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
        struct buf      **backp;
        uint_t          index;
        kmutex_t        *hmp;
        struct  buf     *dp;
        struct  hbuf    *hp;


        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * Clear the retry write flag if the buffer was written without
         * error.  The presence of B_DELWRI means the buffer has not yet
         * been written and the presence of B_ERROR means that an error
         * is still occurring.
         */
        if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
                bp->b_flags &= ~B_RETRYWRI;
        }

        /* Check for anomalous conditions */
        if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
                if (bp->b_flags & B_NOCACHE) {
                        /* Don't add to the freelist. Destroy it now */
                        kmem_free(bp->b_un.b_addr, bp->b_bufsize);
                        sema_destroy(&bp->b_sem);
                        sema_destroy(&bp->b_io);
                        kmem_free(bp, sizeof (struct buf));
                        return;
                }
                /*
                 * If a write failed and we are supposed to retry write,
                 * don't toss the buffer.  Keep it around and mark it
                 * delayed write in the hopes that it will eventually
                 * get flushed (and still keep the system running.)
                 */
                if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
                        bp->b_flags |= B_DELWRI;
                        /* keep fsflush from trying continuously to flush */
                        bp->b_start = ddi_get_lbolt();
                } else
                        bp->b_flags |= B_AGE|B_STALE;
                bp->b_flags &= ~B_ERROR;
                bp->b_error = 0;
        }

        /*
         * If delayed write is set then put it on the delayed
         * write list instead of the free buffer list.
         */
        index = bio_bhash(bp->b_edev, bp->b_blkno);
        hmp   = &hbuf[index].b_lock;

        mutex_enter(hmp);
        hp = &hbuf[index];
        dp = (struct buf *)hp;

        /*
         * Make sure that the number of entries on this list is
         * within the range 0 <= count < total # buffers
         */
        ASSERT(hp->b_length >= 0);
        ASSERT(hp->b_length < nbuf);

        hp->b_length++;              /* We are adding this buffer */

        if (bp->b_flags & B_DELWRI) {
                /*
                 * This buffer goes on the delayed write buffer list
                 */
                dp = (struct buf *)&dwbuf[index];
        }
        ASSERT(bp->b_bufsize > 0);
        ASSERT(bp->b_bcount > 0);
        ASSERT(bp->b_un.b_addr != NULL);

        if (bp->b_flags & B_AGE) {
                backp = &dp->av_forw;
                (*backp)->av_back = bp;
                bp->av_forw = *backp;
                *backp = bp;
                bp->av_back = dp;
        } else {
                backp = &dp->av_back;
                (*backp)->av_forw = bp;
                bp->av_back = *backp;
                *backp = bp;
                bp->av_forw = dp;
        }
        mutex_exit(hmp);

        if (bfreelist.b_flags & B_WANTED) {
                /*
                 * We should get here only very rarely.
                 */
                mutex_enter(&bfree_lock);
                if (bfreelist.b_flags & B_WANTED) {
                        bfreelist.b_flags &= ~B_WANTED;
                        cv_broadcast(&bio_mem_cv);
                }
                mutex_exit(&bfree_lock);
        }

        bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
        /*
         * Don't let anyone get the buffer off the freelist before we
         * release our hold on it.
         */
        sema_v(&bp->b_sem);
}
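
/*
 * The av_forw/av_back manipulation in brelse() above is insertion into
 * a circular doubly-linked free list headed by dp: B_AGE buffers go at
 * the front (to be reused first), all others at the back. An equivalent
 * spelled-out sketch:
 *
 *      if (bp->b_flags & B_AGE) {
 *              bp->av_forw = dp->av_forw;      (insert at head)
 *              bp->av_back = dp;
 *              dp->av_forw->av_back = bp;
 *              dp->av_forw = bp;
 *      } else {
 *              bp->av_back = dp->av_back;      (insert at tail)
 *              bp->av_forw = dp;
 *              dp->av_back->av_forw = bp;
 *              dp->av_back = bp;
 *      }
 */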

/*
 * Return a count of the number of B_BUSY buffers in the system.
 * This can only be used as a rough estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
        struct buf *bp, *dp;
        int busy = 0;
        int i;
        kmutex_t *hmp;

        for (i = 0; i < v.v_hbuf; i++) {
                vfs_syncprogress();
                dp = (struct buf *)&hbuf[i];
                hmp = &hbuf[i].b_lock;

                mutex_enter(hmp);
                for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                        if (bp->b_flags & B_BUSY)
                                busy++;
                }
                mutex_exit(hmp);
        }

        if (cleanit && busy != 0) {
                bflush(NODEV);
        }

        return (busy);
}

/*
 * This interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
        return (getblk_common(/* ufsvfsp */ NULL, dev,
            blkno, bsize, /* errflg */ 0));
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
        ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
        struct buf *bp;
        struct buf *dp;
        struct buf *nbp = NULL;
        struct buf *errbp;
        uint_t          index;
        kmutex_t        *hmp;
        struct  hbuf    *hp;

        if (getmajor(dev) >= devcnt)
                cmn_err(CE_PANIC, "blkdev");

        biostats.bio_lookup.value.ui32++;

        index = bio_bhash(dev, blkno);
        hp    = &hbuf[index];
        dp    = (struct buf *)hp;
        hmp   = &hp->b_lock;

        mutex_enter(hmp);
loop:
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_blkno != blkno || bp->b_edev != dev ||
                    (bp->b_flags & B_STALE))
                        continue;
                /*
                 * Avoid holding the hash lock in the event that
                 * the buffer is locked by someone.  Since the hash chain
                 * may change while we drop the hash lock, we have to start
                 * over at the beginning of the chain if the buffer
                 * identity/contents aren't valid.
                 */
                if (!sema_tryp(&bp->b_sem)) {
                        biostats.bio_bufbusy.value.ui32++;
                        mutex_exit(hmp);
                        /*
                         * OK, we are dealing with a busy buffer.
                         * In the case that we are panicking and we
                         * got called from bread(), we have some chance
                         * for error recovery. So better bail out from
                         * here since sema_p() won't block. If we got
                         * called directly from ufs routines, there is
                         * no way to report an error yet.
                         */
                        if (panicstr && errflg)
                                goto errout;
                        /*
                         * For the following line of code to work
                         * correctly never kmem_free the buffer "header".
                         */
                        sema_p(&bp->b_sem);
                        if (bp->b_blkno != blkno || bp->b_edev != dev ||
                            (bp->b_flags & B_STALE)) {
                                sema_v(&bp->b_sem);
                                mutex_enter(hmp);
                                goto loop;      /* start over */
                        }
                        mutex_enter(hmp);
                }
                /* Found */
                biostats.bio_hit.value.ui32++;
                bp->b_flags &= ~B_AGE;

                /*
                 * Yank it off the free/delayed write lists
                 */
                hp->b_length--;
                notavail(bp);
                mutex_exit(hmp);

                ASSERT((bp->b_flags & B_NOCACHE) == 0);

                if (nbp == NULL) {
                        /*
                         * Make the common path short.
                         */
                        ASSERT(SEMA_HELD(&bp->b_sem));
                        return (bp);
                }

                biostats.bio_bufdup.value.ui32++;

                /*
                 * The buffer must have entered during the lock upgrade
                 * so free the new buffer we allocated and return the
                 * found buffer.
                 */
                kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
                nbp->b_un.b_addr = NULL;

                /*
                 * Account for the memory
                 */
                mutex_enter(&bfree_lock);
                bfreelist.b_bufsize += nbp->b_bufsize;
                mutex_exit(&bfree_lock);

                /*
                 * Destroy buf identity, and place on avail list
                 */
                nbp->b_dev = (o_dev_t)NODEV;
                nbp->b_edev = NODEV;
                nbp->b_flags = 0;
                nbp->b_file = NULL;
                nbp->b_offset = -1;

                sema_v(&nbp->b_sem);
                bio_bhdr_free(nbp);

                ASSERT(SEMA_HELD(&bp->b_sem));
                return (bp);
        }

        /*
         * bio_getfreeblk may block so check the hash chain again.
         */
        if (nbp == NULL) {
                mutex_exit(hmp);
                nbp = bio_getfreeblk(bsize);
                mutex_enter(hmp);
                goto loop;
        }

        /*
         * New buffer. Assign nbp and stick it on the hash.
         */
        nbp->b_flags = B_BUSY;
        nbp->b_edev = dev;
        nbp->b_dev = (o_dev_t)cmpdev(dev);
        nbp->b_blkno = blkno;
        nbp->b_iodone = NULL;
        nbp->b_bcount = bsize;
        /*
         * If we are given a ufsvfsp and the vfs_root field is NULL
         * then this must be I/O for a superblock.  A superblock's
         * buffer is set up in mountfs() and there is no root vnode
         * at that point.
         */
        if (ufsvfsp && ufsvfsp->vfs_root) {
                nbp->b_vp = ufsvfsp->vfs_root;
        } else {
                nbp->b_vp = NULL;
        }

        ASSERT((nbp->b_flags & B_NOCACHE) == 0);

        binshash(nbp, dp);
        mutex_exit(hmp);

        ASSERT(SEMA_HELD(&nbp->b_sem));

        return (nbp);


        /*
         * Come here in case of an internal error. At this point we couldn't
         * get a buffer, but we have to return one. Hence we allocate some
         * kind of error reply buffer on the fly. This buffer is marked as
         * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
         *      - B_ERROR will indicate error to the caller.
         *      - B_DONE will prevent us from reading the buffer from
         *        the device.
         *      - B_NOCACHE will cause this buffer to be freed in
         *        brelse().
         */

errout:
        errbp = geteblk();
        sema_p(&errbp->b_sem);
        errbp->b_flags &= ~B_BUSY;
        errbp->b_flags |= (B_ERROR | B_DONE);
        return (errbp);
}
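
/*
 * Sketch: getblk() (rather than bread()) is the right call when the
 * block will be completely overwritten, so its old contents need not
 * be read (dev, blkno and bsize are assumed):
 *
 *      bp = getblk(dev, blkno, bsize);
 *      clrbuf(bp);                     (see clrbuf() below)
 *      ... fill in the new contents ...
 *      bwrite(bp);
 */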

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
        struct buf *bp;

        bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
        bioinit(bp);
        bp->av_forw = bp->av_back = NULL;
        bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
        bp->b_bufsize = bsize;
        bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
        bp->b_dev = (o_dev_t)NODEV;
        bp->b_edev = NODEV;
        bp->b_lblkno = 0;
        bp->b_bcount = bsize;
        bp->b_iodone = NULL;
        return (bp);
}

/*
 * The interface of geteblk() is kept intact to maintain driver
 * compatibility.  Use ngeteblk() to allocate a block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
        return (ngeteblk((long)1024));
}

/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
        struct buf      *bp;
        struct buf      *dp;
        struct hbuf     *hp;
        kmutex_t        *hmp;
        uint_t          index;

        index = bio_bhash(dev, blkno);
        hp = &hbuf[index];
        hmp = &hp->b_lock;

        if (!mutex_tryenter(hmp))
                return (NULL);

        dp = (struct buf *)hp;
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_blkno != blkno || bp->b_edev != dev ||
                    (bp->b_flags & B_STALE))
                        continue;
                /*
                 * Get access to a valid buffer without sleeping
                 */
                if (sema_tryp(&bp->b_sem)) {
                        if (bp->b_flags & B_DONE) {
                                hp->b_length--;
                                notavail(bp);
                                mutex_exit(hmp);
                                return (bp);
                        } else {
                                sema_v(&bp->b_sem);
                                break;
                        }
                }
                break;
        }
        mutex_exit(hmp);
        return (NULL);
}
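
/*
 * Sketch: trygetblk() supports opportunistic, non-blocking lookups
 * (dev, blkno and bsize assumed). A NULL return only means the buffer
 * wasn't immediately available; callers fall back to a blocking path:
 *
 *      if ((bp = trygetblk(dev, blkno)) == NULL)
 *              bp = bread(dev, blkno, bsize);
 */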

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        (void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        bzero(bp->b_un.b_addr, bp->b_bcount);
        bp->b_resid = 0;
}


/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
        struct buf *bp, *dp;
        struct hbuf *hp;
        struct buf *delwri_list = EMPTY_LIST;
        int i, index;
        kmutex_t *hmp;

        mutex_enter(&blist_lock);
        /*
         * Wait for any invalidates or flushes ahead of us to finish.
         * We really could split blist_lock up per device for better
         * parallelism here.
         */
        while (bio_doinginval || bio_doingflush) {
                bio_flinv_cv_wanted = 1;
                cv_wait(&bio_flushinval_cv, &blist_lock);
        }
        bio_doingflush++;
        /*
         * Gather all B_DELWRI buffers for the device.
         * Lock ordering is b_sem > hash lock (brelse).
         * Since we are finding the buffers via the delayed write list,
         * they may be busy and we would block trying to get the
         * b_sem lock while holding the hash lock.  So transfer all the
         * candidates onto delwri_list and then drop the hash locks.
         */
        for (i = 0; i < v.v_hbuf; i++) {
                vfs_syncprogress();
                hmp = &hbuf[i].b_lock;
                dp = (struct buf *)&dwbuf[i];
                mutex_enter(hmp);
                for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
                        if (dev == NODEV || bp->b_edev == dev) {
                                if (bp->b_list == NULL) {
                                        bp->b_list = delwri_list;
                                        delwri_list = bp;
                                }
                        }
                }
                mutex_exit(hmp);
        }
        mutex_exit(&blist_lock);

        /*
         * Now that the hash locks have been dropped grab the semaphores
         * and write back all the buffers that have B_DELWRI set.
         */
        while (delwri_list != EMPTY_LIST) {
                vfs_syncprogress();
                bp = delwri_list;

                sema_p(&bp->b_sem);      /* may block */
                if ((dev != bp->b_edev && dev != NODEV) ||
                    (panicstr && bp->b_flags & B_BUSY)) {
                        sema_v(&bp->b_sem);
                        delwri_list = bp->b_list;
                        bp->b_list = NULL;
                        continue;       /* No longer a candidate */
                }
                if (bp->b_flags & B_DELWRI) {
                        index = bio_bhash(bp->b_edev, bp->b_blkno);
                        hp = &hbuf[index];
                        hmp = &hp->b_lock;
                        dp = (struct buf *)hp;

                        bp->b_flags |= B_ASYNC;
                        mutex_enter(hmp);
                        hp->b_length--;
                        notavail(bp);
                        mutex_exit(hmp);
                        if (bp->b_vp == NULL) {              /* !ufs */
                                BWRITE(bp);
                        } else {                        /* ufs */
                                UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
                        }
                } else {
                        sema_v(&bp->b_sem);
                }
                delwri_list = bp->b_list;
                bp->b_list = NULL;
        }
        mutex_enter(&blist_lock);
        bio_doingflush--;
        if (bio_flinv_cv_wanted) {
                bio_flinv_cv_wanted = 0;
                cv_broadcast(&bio_flushinval_cv);
        }
        mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
        struct buf *bp, *dp;
        struct hbuf *hp;
        struct buf *sbp = NULL;
        uint_t index;
        kmutex_t *hmp;

        index = bio_bhash(dev, blkno);
        hp    = &hbuf[index];
        dp    = (struct buf *)hp;
        hmp   = &hp->b_lock;

        /*
         * Identify the buffer in the cache belonging to
         * this device and blkno (if any).
         */
        mutex_enter(hmp);
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_blkno != blkno || bp->b_edev != dev ||
                    (bp->b_flags & B_STALE))
                        continue;
                sbp = bp;
                break;
        }
        mutex_exit(hmp);
        if (sbp == NULL)
                return;
        /*
         * Now check the buffer we have identified and
         * make sure it still belongs to the device and is B_DELWRI
         */
        sema_p(&sbp->b_sem);
        if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
            (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
                mutex_enter(hmp);
                hp->b_length--;
                notavail(sbp);
                mutex_exit(hmp);
                /*
                 * XXX - There is nothing to guarantee a synchronous
                 * write here if the B_ASYNC flag is set.  This needs
                 * some investigation.
                 */
                if (sbp->b_vp == NULL) {             /* !ufs */
                        BWRITE(sbp);    /* synchronous write */
                } else {                                /* ufs */
                        UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
                }
        } else {
                sema_v(&sbp->b_sem);
        }
}

/*
 * Same as binval, except it can force-invalidate delayed-write buffers
 * (which may not already have been flushed because of device errors).
 * Also makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
        struct buf *dp;
        struct buf *bp;
        struct buf *binval_list = EMPTY_LIST;
        int i, error = 0;
        kmutex_t *hmp;
        uint_t index;
        struct buf **backp;

        mutex_enter(&blist_lock);
        /*
         * Wait for any flushes ahead of us to finish; it's ok to
         * do invalidates in parallel.
         */
        while (bio_doingflush) {
                bio_flinv_cv_wanted = 1;
                cv_wait(&bio_flushinval_cv, &blist_lock);
        }
        bio_doinginval++;

        /* Gather bp's */
        for (i = 0; i < v.v_hbuf; i++) {
                dp = (struct buf *)&hbuf[i];
                hmp = &hbuf[i].b_lock;

                mutex_enter(hmp);
                for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                        if (bp->b_edev == dev) {
                                if (bp->b_list == NULL) {
                                        bp->b_list = binval_list;
                                        binval_list = bp;
                                }
                        }
                }
                mutex_exit(hmp);
        }
        mutex_exit(&blist_lock);

        /* Invalidate all bp's found */
        while (binval_list != EMPTY_LIST) {
                bp = binval_list;

                sema_p(&bp->b_sem);
                if (bp->b_edev == dev) {
                        if (force && (bp->b_flags & B_DELWRI)) {
                                /* clear B_DELWRI, move to non-dw freelist */
                                index = bio_bhash(bp->b_edev, bp->b_blkno);
                                hmp = &hbuf[index].b_lock;
                                dp = (struct buf *)&hbuf[index];
                                mutex_enter(hmp);

                                /* remove from delayed write freelist */
                                notavail(bp);

                                /* add to B_AGE side of non-dw freelist */
                                backp = &dp->av_forw;
                                (*backp)->av_back = bp;
                                bp->av_forw = *backp;
                                *backp = bp;
                                bp->av_back = dp;

                                /*
                                 * make sure write retries and busy are cleared
                                 */
                                bp->b_flags &=
                                    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
                                mutex_exit(hmp);
                        }
                        if ((bp->b_flags & B_DELWRI) == 0)
                                bp->b_flags |= B_STALE|B_AGE;
                        else
                                error = EIO;
                }
                sema_v(&bp->b_sem);
                binval_list = bp->b_list;
                bp->b_list = NULL;
        }
        mutex_enter(&blist_lock);
        bio_doinginval--;
        if (bio_flinv_cv_wanted) {
                cv_broadcast(&bio_flushinval_cv);
                bio_flinv_cv_wanted = 0;
        }
        mutex_exit(&blist_lock);
        return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
        (void) bfinval(dev, 0);
}

/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
        struct buf *bp;
        unsigned int i, pct;
        ulong_t bio_max_hwm, bio_default_hwm;

        /*
         * Maximum/Default values for bufhwm are set to the smallest of:
         *      - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
         *      - 1/4 of kernel virtual memory
         *      - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
         * Additionally, in order to allow simple tuning by percentage of
         * physical memory, bufhwm_pct is used to calculate the default if
         * the value of this tunable is between 0 and BIO_MAX_PERCENT.
         *
         * Since the unit for v.v_bufhwm is kilobytes, this allows for
         * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
         */
        bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
            btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
        bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

        pct = BIO_BUF_PERCENT;
        if (bufhwm_pct != 0 &&
            ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
                pct = BIO_BUF_PERCENT;
                /*
                 * Invalid user-specified value, emit a warning.
                 */
                cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of "
                    "range(1..%d). Using %d as default.",
                    bufhwm_pct,
                    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
        }

        bio_default_hwm = MIN(physmem / pct,
            btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
        bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

        if ((v.v_bufhwm = bufhwm) == 0)
                v.v_bufhwm = bio_default_hwm;

        if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
                v.v_bufhwm = (int)bio_max_hwm;
                /*
                 * Invalid user-specified value, emit a warning.
                 */
                cmn_err(CE_WARN,
                    "binit: bufhwm(%d) out of range(%d..%lu). "
                    "Using %lu as default",
                    bufhwm,
                    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
        }

        /*
         * Determine the number of hash buckets. Default is to
         * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
         * Round up number to the next power of 2.
         */
        v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
            BIO_HASHLEN);
        v.v_hmask = v.v_hbuf - 1;
        v.v_buf = BIO_BHDR_POOL;

        hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

        dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

        bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
        bp = &bfreelist;
        bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

        for (i = 0; i < v.v_hbuf; i++) {
                hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
                hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

                /*
                 * Initialize the delayed write buffer list.
                 */
                dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
                dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
        }
}
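
/*
 * Worked example of the sizing above, assuming v.v_bufhwm ends up at
 * 8192 (i.e. 8MB, in KB units) and MAXBSIZE is 8192:
 *
 *      (8192 * 1024) / MAXBSIZE        = 1024 buffers at the hwm
 *      1024 / BIO_HASHLEN              = 256 (target bucket count)
 *      v.v_hbuf = 1 << highbit(256)    = 512 hash buckets
 *      v.v_hmask                       = 511
 */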

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
        int error = 0;
        struct cpu *cpup;

        ASSERT(SEMA_HELD(&bp->b_sem));

        cpup = CPU;
        atomic_add_64(&cpup->cpu_stats.sys.iowait, 1);
        DTRACE_IO1(wait__start, struct buf *, bp);

        /*
         * In case of panic, busy wait for completion
         */
        if (panicstr) {
                while ((bp->b_flags & B_DONE) == 0)
                        drv_usecwait(10);
        } else
                sema_p(&bp->b_io);

        DTRACE_IO1(wait__done, struct buf *, bp);
        atomic_add_64(&cpup->cpu_stats.sys.iowait, -1);

        error = geterror(bp);
        if ((bp->b_flags & B_ASYNC) == 0) {
                if (bp->b_flags & B_REMAPPED)
                        bp_mapout(bp);
        }
        return (error);
}

static void
biodone_tnf_probe(struct buf *bp)
{
        /* Kernel probe */
        TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
            tnf_device,         device,         bp->b_edev,
            tnf_diskaddr,       block,          bp->b_lblkno,
            tnf_opaque,         buf,            bp);
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
        if (bp->b_flags & B_STARTED) {
                DTRACE_IO1(done, struct buf *, bp);
                bp->b_flags &= ~B_STARTED;
        }

        /*
         * Call the TNF probe here instead of the inline code
         * to force our compiler to use the tail call optimization.
         */
        biodone_tnf_probe(bp);

        if (bp->b_iodone != NULL) {
                (*(bp->b_iodone))(bp);
                return;
        }
        ASSERT((bp->b_flags & B_DONE) == 0);
        ASSERT(SEMA_HELD(&bp->b_sem));
        bp->b_flags |= B_DONE;
        if (bp->b_flags & B_ASYNC) {
                if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
                        bio_pageio_done(bp);
                else
                        brelse(bp);     /* release bp to freelist */
        } else {
                sema_v(&bp->b_io);
        }
}
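
/*
 * Sketch of an asynchronous consumer using b_iodone: biodone() above
 * calls bp->b_iodone and returns immediately, so the callback, not
 * biodone(), is responsible for releasing the buffer (my_done is a
 * hypothetical name):
 *
 *      static int
 *      my_done(struct buf *bp)
 *      {
 *              ... harvest geterror(bp) ...
 *              bp->b_iodone = NULL;
 *              brelse(bp);
 *              return (0);
 *      }
 *
 *      bp->b_iodone = my_done;
 *      (void) bdev_strategy(bp);
 */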

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
        int error = 0;

        ASSERT(SEMA_HELD(&bp->b_sem));
        if (bp->b_flags & B_ERROR) {
                error = bp->b_error;
                if (!error)
                        error = EIO;
        }
        return (error);
}

/*
 * Support for pageio buffers.
 *
 * This should be generalized to provide a general bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
        struct buf *bp;
        struct cpu *cpup;

        if (flags & B_READ) {
                CPU_STATS_ENTER_K();
                cpup = CPU;     /* get pointer AFTER preemption is disabled */
                CPU_STATS_ADDQ(cpup, vm, pgin, 1);
                CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
                if ((flags & B_ASYNC) == 0) {
                        klwp_t *lwp = ttolwp(curthread);
                        if (lwp != NULL)
                                lwp->lwp_ru.majflt++;
                        CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
                        /* Kernel probe */
                        TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
                            tnf_opaque,         vnode,          pp->p_vnode,
                            tnf_offset,         offset,         pp->p_offset);
                }
                /*
                 * Update statistics for pages being paged in
                 */
                if (pp != NULL && pp->p_vnode != NULL) {
                        if (IS_SWAPFSVP(pp->p_vnode)) {
                                CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
                        } else {
                                if (pp->p_vnode->v_flag & VVMEXEC) {
                                        CPU_STATS_ADDQ(cpup, vm, execpgin,
                                            btopr(len));
                                } else {
                                        CPU_STATS_ADDQ(cpup, vm, fspgin,
                                            btopr(len));
                                }
                        }
                }
                CPU_STATS_EXIT_K();
                TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
                    "page_ws_in:pp %p", pp);
                /* Kernel probe */
                TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
                    tnf_opaque, vnode,  pp->p_vnode,
                    tnf_offset, offset, pp->p_offset,
                    tnf_size,   size,   len);
        }

        bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
        bp->b_bcount = len;
        bp->b_bufsize = len;
        bp->b_pages = pp;
        bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
        bp->b_offset = -1;
        sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

        /* Initialize bp->b_sem in "locked" state */
        sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

        VN_HOLD(vp);
        bp->b_vp = vp;
        THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

        /*
         * Caller sets dev & blkno and can adjust
         * b_addr for page offset and can use bp_mapin
         * to make pages kernel addressable.
         */
        return (bp);
}
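
/*
 * Sketch of the caller protocol described above (pp, vp, dev, io_off
 * and io_len are assumed; the ufs getpage path is the canonical user):
 *
 *      bp = pageio_setup(pp, io_len, vp, B_READ | B_ASYNC);
 *      bp->b_edev = dev;
 *      bp->b_dev = cmpdev(dev);
 *      bp->b_blkno = btodb(io_off);
 *      bp_mapin(bp);                   (if a kernel mapping is needed)
 *      (void) bdev_strategy(bp);
 */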

void
pageio_done(struct buf *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        if (bp->b_flags & B_REMAPPED)
                bp_mapout(bp);
        VN_RELE(bp->b_vp);
        bp->b_vp = NULL;
        ASSERT((bp->b_flags & B_NOCACHE) != 0);

        /* A sema_v(bp->b_sem) is implied if we are destroying it */
        sema_destroy(&bp->b_sem);
        sema_destroy(&bp->b_io);
        kmem_free(bp, sizeof (struct buf));
}

/*
 * Check to see whether the buffers, except the one pointed to by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
        struct buf      *bp;
        struct buf      *dp;
        int i;
        kmutex_t *hmp;

        /*
         * check for busy bufs for this filesystem
         */
        for (i = 0; i < v.v_hbuf; i++) {
                dp = (struct buf *)&hbuf[i];
                hmp = &hbuf[i].b_lock;

                mutex_enter(hmp);
                for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                        /*
                         * if buf is busy or dirty, then filesystem is busy
                         */
                        if ((bp->b_edev == dev) &&
                            ((bp->b_flags & B_STALE) == 0) &&
                            (bp->b_flags & (B_DELWRI|B_BUSY)) &&
                            (bp != sbp)) {
                                mutex_exit(hmp);
                                return (1);
                        }
                }
                mutex_exit(hmp);
        }
        return (0);
}

/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
        int hash = 0;

        hash = x - 1;
        hash = ((hash * 7) + (x >> 8)) - 1;
        hash = ((hash * 7) + (x >> 16)) - 1;
        hash = ((hash * 7) + (x >> 24)) - 1;
        hash = ((hash * 7) + y) - 1;
        hash = ((hash * 7) + (y >> 8)) - 1;
        hash = ((hash * 7) + (y >> 16)) - 1;
        hash = ((hash * 7) + (y >> 24)) - 1;

        return (hash);
}
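
/*
 * hash2ints() feeds the bio_bhash() macro at the top of this file, so
 * the hash chain for a given (dev, blkno) pair is located with:
 *
 *      index = hash2ints(dev, (int)blkno) & v.v_hmask;
 *      hp = &hbuf[index];
 *
 * The mask works because v.v_hbuf is always a power of 2 (see binit()
 * above), so v.v_hmask == v.v_hbuf - 1 has all low bits set.
 */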


/*
 * Return a new buffer struct.
 *      Create a new buffer if we haven't gone over our high water
 *      mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
        struct buf *bp, *dp;
        struct hbuf *hp;
        kmutex_t        *hmp;
        uint_t          start, end;

        /*
         * bfreelist.b_bufsize (protected by bfree_lock) represents the
         * amount of memory we are allowed to allocate in the cache
         * before we hit our hwm.
         */
        bio_mem_get(bsize);     /* Account for our memory request */

again:
        bp = bio_bhdr_alloc();  /* Get a buf hdr */
        sema_p(&bp->b_sem);      /* Should never fail */

        ASSERT(bp->b_un.b_addr == NULL);
        bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
        if (bp->b_un.b_addr != NULL) {
                /*
                 * Make the common path short
                 */
                bp->b_bufsize = bsize;
                ASSERT(SEMA_HELD(&bp->b_sem));
                return (bp);
        } else {
                struct buf *save;

                save = bp;      /* Save bp we allocated */
                start = end = lastindex;

                biostats.bio_bufwant.value.ui32++;

                /*
                 * Memory isn't available from the system now. Scan
                 * the hash buckets till enough space is found.
                 */
                do {
                        hp = &hbuf[start];
                        hmp = &hp->b_lock;
                        dp = (struct buf *)hp;

                        mutex_enter(hmp);
                        bp = dp->av_forw;

                        while (bp != dp) {

                                ASSERT(bp != NULL);

                                if (!sema_tryp(&bp->b_sem)) {
                                        bp = bp->av_forw;
                                        continue;
                                }

                                /*
                                 * Since we are going down the freelist
                                 * associated with this hash bucket the
                                 * B_DELWRI flag should not be set.
                                 */
                                ASSERT(!(bp->b_flags & B_DELWRI));

                                if (bp->b_bufsize == bsize) {
                                        hp->b_length--;
                                        notavail(bp);
                                        bremhash(bp);
                                        mutex_exit(hmp);

                                        /*
                                         * Didn't kmem_alloc any more, so don't
                                         * count it twice.
                                         */
                                        mutex_enter(&bfree_lock);
                                        bfreelist.b_bufsize += bsize;
                                        mutex_exit(&bfree_lock);

                                        /*
                                         * Update the lastindex value.
                                         */
                                        lastindex = start;

                                        /*
                                         * Put our saved bp back on the list
                                         */
                                        sema_v(&save->b_sem);
                                        bio_bhdr_free(save);
                                        ASSERT(SEMA_HELD(&bp->b_sem));
                                        return (bp);
                                }
                                sema_v(&bp->b_sem);
                                bp = bp->av_forw;
                        }
                        mutex_exit(hmp);
                        start = ((start + 1) % v.v_hbuf);
                } while (start != end);

                biostats.bio_bufwait.value.ui32++;
                bp = save;              /* Use original bp */
                bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
        }

        bp->b_bufsize = bsize;
        ASSERT(SEMA_HELD(&bp->b_sem));
1571         return (bp);
1572 }
1573 
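/*
 * Caller sketch (hypothetical, for illustration only): since the buf
 * returned above is locked, has no identity, and is on no hash or free
 * list, a caller is responsible for assigning it an identity and
 * inserting it on the appropriate hash bucket before releasing it:
 *
 *      bp = bio_getfreeblk(bsize);
 *      bp->b_edev = dev;
 *      bp->b_blkno = blkno;
 *      ... insert bp on its hash bucket, then use or release it ...
 */
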
/*
 * Allocate a buffer header. If none is currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
        struct buf *dp, *sdp;
        struct buf *bp;
        int i;

        for (;;) {
                mutex_enter(&bhdr_lock);
                if (bhdrlist != NULL) {
                        bp = bhdrlist;
                        bhdrlist = bp->av_forw;
                        mutex_exit(&bhdr_lock);
                        bp->av_forw = NULL;
                        return (bp);
                }
                mutex_exit(&bhdr_lock);

                /*
                 * Need to allocate a new pool. If the system is currently
                 * out of memory, then try freeing things on the freelist.
                 */
                dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
                if (dp == NULL) {
                        /*
                         * The system can't give us a pool of headers, so
                         * try recycling from the free lists.
                         */
                        bio_recycle(BIO_HEADER, 0);
                } else {
                        sdp = dp;
                        for (i = 0; i < v.v_buf; i++, dp++) {
                                /*
                                 * The next two lines are needed since NODEV
                                 * is -1 and not NULL
                                 */
                                dp->b_dev = (o_dev_t)NODEV;
                                dp->b_edev = NODEV;
                                dp->av_forw = dp + 1;
                                sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
                                    NULL);
                                sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
                                    NULL);
                                dp->b_offset = -1;
                        }
                        mutex_enter(&bhdr_lock);
                        (--dp)->av_forw = bhdrlist;  /* Fix last pointer */
                        bhdrlist = sdp;
                        nbuf += v.v_buf;
                        bp = bhdrlist;
                        bhdrlist = bp->av_forw;
                        mutex_exit(&bhdr_lock);

                        bp->av_forw = NULL;
                        return (bp);
                }
        }
}

static  void
bio_bhdr_free(struct buf *bp)
{
        ASSERT(bp->b_back == NULL);
        ASSERT(bp->b_forw == NULL);
        ASSERT(bp->av_back == NULL);
        ASSERT(bp->av_forw == NULL);
        ASSERT(bp->b_un.b_addr == NULL);
        ASSERT(bp->b_dev == (o_dev_t)NODEV);
        ASSERT(bp->b_edev == NODEV);
        ASSERT(bp->b_flags == 0);

        mutex_enter(&bhdr_lock);
        bp->av_forw = bhdrlist;
        bhdrlist = bp;
        mutex_exit(&bhdr_lock);
}

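/*
 * Header free-list protocol, as the assertions above require: a header
 * obtained from bio_bhdr_alloc() must be scrubbed back to its empty
 * state (no identity, no memory, no flags) before it is returned. The
 * sequence bio_recycle() uses below is:
 *
 *      bp->b_dev = (o_dev_t)NODEV;
 *      bp->b_edev = NODEV;
 *      bp->b_flags = 0;
 *      sema_v(&bp->b_sem);
 *      bio_bhdr_free(bp);
 */
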
/*
 * If we haven't gone over the high water mark, it's OK to
 * allocate more buffer space; otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory here, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
        mutex_enter(&bfree_lock);
        if (bfreelist.b_bufsize > bsize) {
                bfreelist.b_bufsize -= bsize;
                mutex_exit(&bfree_lock);
                return;
        }
        mutex_exit(&bfree_lock);
        bio_recycle(BIO_MEM, bsize);
}

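/*
 * The debit taken here is matched by an explicit credit wherever buffer
 * memory is handed back; see bio_getfreeblk() above and bio_recycle()
 * below, which both do the equivalent of:
 *
 *      mutex_enter(&bfree_lock);
 *      bfreelist.b_bufsize += bsize;
 *      mutex_exit(&bfree_lock);
 */
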
/*
 * Flush a list of delayed-write buffers.
 * (Currently used only by bio_recycle() below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
        struct buf *bp;

        while (delwri_list != EMPTY_LIST) {
                bp = delwri_list;
                bp->b_flags |= B_AGE | B_ASYNC;
                if (bp->b_vp == NULL) {         /* !ufs */
                        BWRITE(bp);
                } else {                        /* ufs */
                        UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
                }
                delwri_list = bp->b_list;
                bp->b_list = NULL;
        }
}

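/*
 * The private list consumed above is singly linked through b_list and
 * terminated by EMPTY_LIST; bio_recycle() below builds it by pushing
 * each gathered buf onto the head:
 *
 *      delwri_list = EMPTY_LIST;
 *      ...
 *      bp->b_list = delwri_list;
 *      delwri_list = bp;
 *      ...
 *      bio_flushlist(delwri_list);
 */
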
/*
 * Start recycling buffers on the freelist for one of two reasons:
 *      - we need a buffer header
 *      - we need to free up memory
 * Once started, we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
        struct buf *bp, *dp, *dwp, *nbp;
        struct hbuf *hp;
        int     found = 0;
        kmutex_t        *hmp;
        int             start, end;
        struct buf *delwri_list = EMPTY_LIST;

        /*
         * Recycle buffers.
         */
top:
        start = end = lastindex;
        do {
                hp = &hbuf[start];
                hmp = &hp->b_lock;
                dp = (struct buf *)hp;

                mutex_enter(hmp);
                bp = dp->av_forw;

                while (bp != dp) {

                        ASSERT(bp != NULL);

                        if (!sema_tryp(&bp->b_sem)) {
                                bp = bp->av_forw;
                                continue;
                        }
                        /*
                         * Do we really want to nuke all of the B_AGE stuff??
                         */
                        if ((bp->b_flags & B_AGE) == 0 && found) {
                                sema_v(&bp->b_sem);
                                mutex_exit(hmp);
                                lastindex = start;
                                return; /* All done */
                        }

                        ASSERT(MUTEX_HELD(&hp->b_lock));
                        ASSERT(!(bp->b_flags & B_DELWRI));
                        hp->b_length--;
                        notavail(bp);

                        /*
                         * Remove the bhdr from the cache, free up its
                         * memory, and add the hdr to the freelist.
                         */
                        bremhash(bp);
                        mutex_exit(hmp);

                        if (bp->b_bufsize) {
                                kmem_free(bp->b_un.b_addr, bp->b_bufsize);
                                bp->b_un.b_addr = NULL;
                                mutex_enter(&bfree_lock);
                                bfreelist.b_bufsize += bp->b_bufsize;
                                mutex_exit(&bfree_lock);
                        }

                        bp->b_dev = (o_dev_t)NODEV;
                        bp->b_edev = NODEV;
                        bp->b_flags = 0;
                        sema_v(&bp->b_sem);
                        bio_bhdr_free(bp);
                        if (want == BIO_HEADER) {
                                found = 1;
                        } else {
                                ASSERT(want == BIO_MEM);
                                if (!found && bfreelist.b_bufsize >= bsize) {
                                        /* Account for the memory we want */
                                        mutex_enter(&bfree_lock);
                                        if (bfreelist.b_bufsize >= bsize) {
                                                bfreelist.b_bufsize -= bsize;
                                                found = 1;
                                        }
                                        mutex_exit(&bfree_lock);
                                }
                        }

                        /*
                         * Since we dropped hmp, start again from the
                         * beginning.
                         */
                        mutex_enter(hmp);
                        bp = dp->av_forw;
                }
                mutex_exit(hmp);

                /*
                 * Look at the delayed-write list.
                 * First gather into a private list, then write them out.
                 */
                dwp = (struct buf *)&dwbuf[start];
                mutex_enter(&blist_lock);
                bio_doingflush++;
                mutex_enter(hmp);
                for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

                        ASSERT(bp != NULL);
                        nbp = bp->av_forw;

                        if (!sema_tryp(&bp->b_sem))
                                continue;
                        ASSERT(bp->b_flags & B_DELWRI);
                        /*
                         * Do we really want to nuke all of the B_AGE stuff??
                         */
                        if ((bp->b_flags & B_AGE) == 0 && found) {
                                sema_v(&bp->b_sem);
                                mutex_exit(hmp);
                                lastindex = start;
                                mutex_exit(&blist_lock);
                                bio_flushlist(delwri_list);
                                mutex_enter(&blist_lock);
                                bio_doingflush--;
                                if (bio_flinv_cv_wanted) {
                                        bio_flinv_cv_wanted = 0;
                                        cv_broadcast(&bio_flushinval_cv);
                                }
                                mutex_exit(&blist_lock);
                                return; /* All done */
                        }

                        /*
                         * If the buffer is already on a flush or
                         * invalidate list, then just skip it.
                         */
                        if (bp->b_list != NULL) {
                                sema_v(&bp->b_sem);
                                continue;
                        }
                        /*
                         * We are still on the same bucket.
                         */
                        hp->b_length--;
                        notavail(bp);
                        bp->b_list = delwri_list;
                        delwri_list = bp;
                }
                mutex_exit(hmp);
                mutex_exit(&blist_lock);
                bio_flushlist(delwri_list);
                delwri_list = EMPTY_LIST;
                mutex_enter(&blist_lock);
                bio_doingflush--;
                if (bio_flinv_cv_wanted) {
                        bio_flinv_cv_wanted = 0;
                        cv_broadcast(&bio_flushinval_cv);
                }
                mutex_exit(&blist_lock);
                start = (start + 1) % v.v_hbuf;

        } while (start != end);

        if (found)
                return;

        /*
         * The free lists are exhausted and we haven't satisfied the
         * request. Wait here for more entries to be added to the
         * freelist. Because that may just have happened, use a timed
         * wait.
         */
        mutex_enter(&bfree_lock);
        bfreelist.b_flags |= B_WANTED;
        (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
        mutex_exit(&bfree_lock);
        goto top;
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
static int
bio_incore(dev_t dev, daddr_t blkno)
{
        struct buf *bp;
        struct buf *dp;
        uint_t index;
        kmutex_t *hmp;

        index = bio_bhash(dev, blkno);
        dp = (struct buf *)&hbuf[index];
        hmp = &hbuf[index].b_lock;

        mutex_enter(hmp);
        for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
                if (bp->b_blkno == blkno && bp->b_edev == dev &&
                    (bp->b_flags & B_STALE) == 0) {
                        mutex_exit(hmp);
                        return (1);
                }
        }
        mutex_exit(hmp);
        return (0);
}

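/*
 * Usage sketch (hypothetical caller): a read-ahead path can consult
 * bio_incore() before committing to a lookup that might block:
 *
 *      if (!bio_incore(dev, rablkno)) {
 *              ... start the asynchronous read-ahead ...
 *      }
 */
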
static void
bio_pageio_done(struct buf *bp)
{
        if (bp->b_flags & B_PAGEIO) {

                if (bp->b_flags & B_REMAPPED)
                        bp_mapout(bp);

                if (bp->b_flags & B_READ)
                        pvn_read_done(bp->b_pages, bp->b_flags);
                else
                        pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
                pageio_done(bp);
        } else {
                ASSERT(bp->b_flags & B_REMAPPED);
                bp_mapout(bp);
                brelse(bp);
        }
}

/*
 * bioerror(9F) - indicate error in buffer header
 * If 'error' is zero, remove the error indication.
 */
void
bioerror(struct buf *bp, int error)
{
        ASSERT(bp != NULL);
        ASSERT(error >= 0);
        ASSERT(SEMA_HELD(&bp->b_sem));

        if (error != 0) {
                bp->b_flags |= B_ERROR;
        } else {
                bp->b_flags &= ~B_ERROR;
        }
        bp->b_error = error;
}

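/*
 * Usage sketch: the conventional pattern (per bioerror(9F)) is for a
 * driver to record the error and then complete the transfer; the
 * strategy-routine context here is illustrative, not part of this file:
 *
 *      bioerror(bp, EIO);
 *      biodone(bp);
 */
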
/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
        ASSERT(bp != NULL);

        biofini(bp);
        bioinit(bp);
}

/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
        return (sizeof (struct buf));
}

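/*
 * Usage sketch: biosize() lets a client allocate space for a buf header
 * without depending on sizeof (struct buf) at compile time:
 *
 *      struct buf *bp = kmem_alloc(biosize(), KM_SLEEP);
 *      bioinit(bp);
 */
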
/*
 * biomodified(9F) - check if a buffer is modified
 */
int
biomodified(struct buf *bp)
{
        int npf;
        int ppattr;
        struct page *pp;

        ASSERT(bp != NULL);

        if ((bp->b_flags & B_PAGEIO) == 0) {
                return (-1);
        }
        pp = bp->b_pages;
        npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));

        while (npf > 0) {
                ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
                    HAT_SYNC_STOPON_MOD);
                if (ppattr & P_MOD)
                        return (1);
                pp = pp->p_next;
                npf--;
        }

        return (0);
}

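/*
 * Return values, as implemented above: -1 if the buffer is not a paged
 * buffer (B_PAGEIO clear), 1 if any underlying page has been modified,
 * 0 otherwise. A hypothetical caller sketch:
 *
 *      if (biomodified(bp) == 1) {
 *              ... pages are dirty, the write cannot be skipped ...
 *      }
 */
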
/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
        bzero(bp, sizeof (struct buf));
        sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
        sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
        bp->b_offset = -1;
}

/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
        sema_destroy(&bp->b_io);
        sema_destroy(&bp->b_sem);
}

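/*
 * Lifecycle sketch using the routines above (the allocation calls are
 * illustrative):
 *
 *      struct buf *bp = kmem_alloc(biosize(), KM_SLEEP);
 *      bioinit(bp);
 *      ... set up and perform I/O, bioreset(bp) between reuses ...
 *      biofini(bp);
 *      kmem_free(bp, biosize());
 */
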
/*
 * bioclone(9F) - clone a buffer
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
        struct buf *bufp;

        ASSERT(bp);
        if (bp_mem == NULL) {
                bufp = kmem_alloc(sizeof (struct buf), sleep);
                if (bufp == NULL) {
                        return (NULL);
                }
                bioinit(bufp);
        } else {
                bufp = bp_mem;
                bioreset(bufp);
        }

#define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
        B_ABRWRITE)

        /*
         * The cloned buffer does not inherit the B_REMAPPED flag.
         */
        bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
        bufp->b_bcount = len;
        bufp->b_blkno = blkno;
        bufp->b_iodone = iodone;
        bufp->b_proc = bp->b_proc;
        bufp->b_edev = dev;
        bufp->b_file = bp->b_file;
        bufp->b_offset = bp->b_offset;

        if (bp->b_flags & B_SHADOW) {
                ASSERT(bp->b_shadow);
                ASSERT(bp->b_flags & B_PHYS);

                bufp->b_shadow = bp->b_shadow +
                    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
                bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
                if (bp->b_flags & B_REMAPPED)
                        bufp->b_proc = NULL;
        } else {
                if (bp->b_flags & B_PAGEIO) {
                        struct page *pp;
                        off_t o;
                        int i;

                        pp = bp->b_pages;
                        o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
                        for (i = btop(o); i > 0; i--) {
                                pp = pp->p_next;
                        }
                        bufp->b_pages = pp;
                        bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
                } else {
                        bufp->b_un.b_addr =
                            (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
                        if (bp->b_flags & B_REMAPPED)
                                bufp->b_proc = NULL;
                }
        }
        return (bufp);
}
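
/*
 * Usage sketch: bioclone(9F) is typically used to split a parent buf
 * into smaller child transfers (e.g. by a striping or partitioning
 * driver). A hypothetical split of the first half of a request, reusing
 * the parent's data and a caller-supplied iodone routine (my_iodone,
 * cdev, and cblkno below are illustrative names):
 *
 *      struct buf *cb;
 *
 *      cb = bioclone(bp, 0, bp->b_bcount / 2, cdev, cblkno,
 *          my_iodone, NULL, KM_SLEEP);
 *      ... issue cb to the underlying device ...
 */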