/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/acct.h>
#include <sys/dnlc.h>
#include <sys/swap.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_mount.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_quota.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/sysinfo.h>

#include <vm/hat.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/anon.h>

extern struct vnode *common_specvp(struct vnode *vp);

/* error lock status */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0

/*
 * Index to be used in TSD for storing lockfs data
 */
uint_t ufs_lockfs_key;

typedef struct _ulockfs_info {
	struct _ulockfs_info *next;
	struct ulockfs *ulp;
	uint_t flags;
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */

/*
 * Check in TSD whether we are already doing any VOP on this filesystem
 */
#define	IS_REC_VOP(found, head, ulp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (found = 0, free = NULL, _curr = head;	\
	    _curr != NULL; _curr = _curr->next) {	\
		if ((free == NULL) &&			\
		    (_curr->ulp == NULL))		\
			free = _curr;			\
		if (_curr->ulp == ulp) {		\
			found = 1;			\
			break;				\
		}					\
	}						\
}

/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly
 */
#define	SEARCH_ULOCKFSP(head, ulp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = head; _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == ulp) {		\
			break;				\
		}					\
	}						\
							\
	info = _curr;					\
}
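
/*
 * Illustrative use of the two TSD helpers above (a sketch only; see
 * ufs_lockfs_begin() and ufs_lockfs_end() below for the real callers):
 *
 *	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
 *	IS_REC_VOP(rec_vop, head, ulp, free_slot);
 *	if (rec_vop)
 *		...this thread already has a VOP active on ulp's fs,
 *		...so skip the lockfs protocol entirely
 *	else if (free_slot != NULL)
 *		...reuse the spare TSD entry rather than allocating one
 *	SEARCH_ULOCKFSP(head, ulp, info);
 *	...info now points at this thread's record for ulp, or is NULL
 */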

/*
 * Validate lockfs request
 */
static int
ufs_getlfd(
	struct lockfs *lockfsp,		/* new lock request */
	struct lockfs *ul_lockfsp)	/* old lock state */
{
	int	error = 0;

	/*
	 * no input flags defined
	 */
	if (lockfsp->lf_flags != 0) {
		error = EINVAL;
		goto errout;
	}

	/*
	 * check key
	 */
	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
			error = EINVAL;
			goto errout;
		}

	lockfsp->lf_key = ul_lockfsp->lf_key + 1;

errout:
	return (error);
}

/*
 * ufs_checkaccton
 *	check if accounting is turned on on this fs
 */

int
ufs_checkaccton(struct vnode *vp)
{
	if (acct_fs_in_use(vp))
		return (EDEADLK);
	return (0);
}

/*
 * ufs_checkswapon
 *	check if local swapping is to file on this fs
 */
int
ufs_checkswapon(struct vnode *vp)
{
	struct swapinfo *sip;

	mutex_enter(&swapinfo_lock);
	for (sip = swapinfo; sip; sip = sip->si_next)
		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
			mutex_exit(&swapinfo_lock);
			return (EDEADLK);
		}
	mutex_exit(&swapinfo_lock);
	return (0);
}

/*
 * ufs_freeze
 *	pend future accesses for current lock and desired lock
 */
void
ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp)
{
	/*
	 * set to new lock type
	 */
	ulp->ul_lockfs.lf_lock = lockfsp->lf_lock;
	ulp->ul_lockfs.lf_key = lockfsp->lf_key;
	ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen;
	ulp->ul_lockfs.lf_comment = lockfsp->lf_comment;

	ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock);
}
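
/*
 * For example (a sketch): a write-lock request carries
 * lf_lock == LOCKFS_WLOCK, so ufs_freeze() leaves
 * ul_fs_lock == (1 << LOCKFS_WLOCK), which is the bit that
 * ULOCKFS_IS_WLOCK() tests and that ufs_check_lockfs() matches
 * against each vnode operation's mask.
 */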

/*
 * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before
 * starting the ufs_quiesce() protocol and decrement it only when the file
 * system no longer has to be in quiescent state. This allows ufs_pageio() to
 * detect that another thread wants to quiesce a file system. See more
 * comments in ufs_pageio().
 */
ulong_t ufs_quiesce_pend = 0;
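
/*
 * Illustrative caller-side pattern (a sketch only; ufs__fiolfs() below is
 * the canonical user of this protocol):
 *
 *	mutex_enter(&ulp->ul_lock);
 *	atomic_inc_ulong(&ufs_quiesce_pend);
 *	ufs_freeze(ulp, lockfsp);
 *	error = ufs_quiesce(ulp);	(drops/retakes ul_lock while waiting)
 *	... reconcile/flush/thaw work ...
 *	atomic_dec_ulong(&ufs_quiesce_pend);
 *	mutex_exit(&ulp->ul_lock);
 */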

/*
 * ufs_quiesce
 *	wait for outstanding accesses to finish
 */
int
ufs_quiesce(struct ulockfs *ulp)
{
	int error = 0;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	klwp_t *lwp = ttolwp(curthread);

	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * We have to keep /proc away from stopping us after we applied
	 * the softlock but before we got a chance to clear it again.
	 * prstop() may pagefault and become stuck on the softlock still
	 * pending.
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * Set a softlock to suspend future ufs_vnops so that
	 * this lockfs request will not be starved
	 */
	ULOCKFS_SET_SLOCK(ulp);
	ASSERT(ufs_quiesce_pend);

	/* check if there are any outstanding ufs vnodeops calls */
	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
		/*
		 * use timed version of cv_wait_sig() to make sure we don't
		 * miss a wake up call from ufs_pageio() when it doesn't use
		 * ul_lock.
		 *
		 * when a fallocate thread comes in, the only way it returns
		 * from this function is if there are no other vnode operations
		 * going on (remember fallocate threads are tracked using
		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
		 * hasn't already grabbed the fs write lock.
		 */
		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
				goto out;
		}
		if (!cv_reltimedwait_sig(&ulp->ul_cv, &ulp->ul_lock, hz,
		    TR_CLOCK_TICK)) {
			error = EINTR;
			goto out;
		}
	}

out:
	/*
	 * unlock the soft lock
	 */
	ULOCKFS_CLR_SLOCK(ulp);

	if (lwp != NULL)
		lwp->lwp_nostop--;

	return (error);
}

/*
 * ufs_flush_inode
 */
int
ufs_flush_inode(struct inode *ip, void *arg)
{
	int	error;
	int	saverror = 0;

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * asynchronously push all the dirty pages
	 */
	if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) &&
	    (error != EAGAIN))
		saverror = error;
	/*
	 * wait for io and discard all mappings
	 */
	if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI))
		saverror = error;

	if (ITOV(ip)->v_type == VDIR) {
		dnlc_dir_purge(&ip->i_danchor);
	}

	return (saverror);
}

/*
 * ufs_flush
 *	Flush everything that is currently dirty; this includes invalidating
 *	any mappings.
 */
int
ufs_flush(struct vfs *vfsp)
{
	int		error;
	int		saverror = 0;
	struct ufsvfs	*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs = ufsvfsp->vfs_fs;
	int		tdontblock = 0;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * purge dnlc
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * drain the delete and idle threads
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * flush and invalidate quota records
	 */
	(void) qsync(ufsvfsp);

	/*
	 * flush w/invalidate the inodes for vfsp
	 */
	if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp))
		saverror = error;

	/*
	 * synchronously flush superblock and summary info
	 */
	if (fs->fs_ronly == 0 && fs->fs_fmod) {
		fs->fs_fmod = 0;
		TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH);
	}
	/*
	 * flush w/invalidate block device pages and buf cache
	 */
	if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp),
	    (offset_t)0, 0, B_INVAL, CRED(), NULL)) > 0)
		saverror = error;

	(void) bflush((dev_t)vfsp->vfs_dev);
	(void) bfinval((dev_t)vfsp->vfs_dev, 0);

	/*
	 * drain the delete and idle threads again
	 */
	ufs_delete_drain(vfsp, 0, 0);
	ufs_idle_drain(vfsp);

	/*
	 * play with the clean flag
	 */
	if (saverror == 0)
		ufs_checkclean(vfsp);

	/*
	 * Flush any outstanding transactions and roll the log, but only
	 * if we are supposed to, i.e. LDL_NOROLL is not set.  We can not
	 * simply check for fs_ronly here since fsck may also use this code
	 * to roll the log on a read-only filesystem, e.g. root during early
	 * stages of boot; if anything other than a sanity check is done,
	 * fsck will have cleared LDL_NOROLL beforehand.
	 * In addition we assert that the deltamap does not contain any deltas
	 * in case LDL_NOROLL is set since this is not supposed to happen.
	 */
	if (TRANS_ISTRANS(ufsvfsp)) {
		ml_unit_t	*ul = ufsvfsp->vfs_log;
		mt_map_t	*mtm = ul->un_deltamap;

		if (ul->un_flags & LDL_NOROLL) {
			ASSERT(mtm->mtm_nme == 0);
		} else {
			/*
			 * Do not set T_DONTBLOCK if there is a
			 * transaction opened by caller.
			 */
			if (curthread->t_flag & T_DONTBLOCK)
				tdontblock = 1;
			else
				curthread->t_flag |= T_DONTBLOCK;

			TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH,
			    TOP_COMMIT_SIZE, error);

			if (!error) {
				TRANS_END_SYNC(ufsvfsp, saverror,
				    TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE);
			}

			if (tdontblock == 0)
				curthread->t_flag &= ~T_DONTBLOCK;

			logmap_roll_dev(ufsvfsp->vfs_log);
		}
	}

	return (saverror);
}

/*
 * ufs_thaw_wlock
 *	special processing when thawing down to wlock
 */
static int
ufs_thaw_wlock(struct inode *ip, void *arg)
{
	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * iupdat refuses to clear flags if the fs is read only.  The fs
	 * may become read/write during the lock and we wouldn't want
	 * these inodes being written to disk.  So clear the flags.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	/*
	 * pages are mlocked -- fail wlock
	 */
	if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip)))
		return (EBUSY);

	return (0);
}

/*
 * ufs_thaw_hlock
 *	special processing when thawing down to hlock or elock
 */
static int
ufs_thaw_hlock(struct inode *ip, void *arg)
{
	struct vnode	*vp = ITOV(ip);

	/*
	 * wrong file system; keep looking
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	/*
	 * blow away all pages - even if they are mlocked
	 */
	do {
		(void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK);
	} while ((vp->v_type != VCHR) && vn_has_cached_data(vp));
	rw_enter(&ip->i_contents, RW_WRITER);
	ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG);
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * ufs_thaw
 *	thaw file system lock down to current value
 */
int
ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp)
{
	int		error = 0;
	int		noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL);

	/*
	 * if wlock or hlock or elock
	 */
	if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) ||
	    ULOCKFS_IS_ELOCK(ulp)) {

		/*
		 * don't keep access times
		 * don't free deleted files
		 * if superblock writes are allowed, limit them to me for now
		 */
		ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		if (ulp->ul_sbowner != (kthread_id_t)-1)
			ulp->ul_sbowner = curthread;

		/*
		 * wait for writes for deleted files and superblock updates
		 */
		(void) ufs_flush(vfsp);

		/*
		 * now make sure the quota file is up-to-date
		 *	expensive; but effective
		 */
		error = ufs_flush(vfsp);
		/*
		 * no one can write the superblock
		 */
		ulp->ul_sbowner = (kthread_id_t)-1;

		/*
		 * special processing for wlock/hlock/elock
		 */
		if (ULOCKFS_IS_WLOCK(ulp)) {
			if (error)
				goto errout;
			error = bfinval(ufsvfsp->vfs_dev, 0);
			if (error)
				goto errout;
			error = ufs_scan_inodes(0, ufs_thaw_wlock,
			    (void *)ufsvfsp, ufsvfsp);
			if (error)
				goto errout;
		}
		if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) {
			error = 0;
			(void) ufs_scan_inodes(0, ufs_thaw_hlock,
			    (void *)ufsvfsp, ufsvfsp);
			(void) bfinval(ufsvfsp->vfs_dev, 1);
		}
	} else {

		/*
		 * okay to keep access times
		 * okay to free deleted files
		 * okay to write the superblock
		 */
		ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
		ulp->ul_sbowner = NULL;

		/*
		 * flush in case deleted files are in memory
		 */
		if (noidel) {
			if (error = ufs_flush(vfsp))
				goto errout;
		}
	}

errout:
	cv_broadcast(&ulp->ul_cv);
	return (error);
}

/*
 * ufs_reconcile_fs
 *	reconcile incore superblock with ondisk superblock
 */
int
ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	struct fs	*mfs;	/* in-memory superblock */
	struct fs	*dfs;	/* on-disk   superblock */
	struct buf	*bp;	/* on-disk   superblock buf */
	int		 needs_unlock;
	char		 finished_fsclean;

	mfs = ufsvfsp->vfs_fs;

	/*
	 * get the on-disk copy of the superblock
	 */
	bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE);
	bp->b_flags |= (B_STALE|B_AGE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dfs = bp->b_un.b_fs;

	/* error locks may only unlock after the fs has been made consistent */
	if (errlck == UN_ERRLCK) {
		if (dfs->fs_clean == FSFIX) {	/* being repaired */
			brelse(bp);
			return (EAGAIN);
		}
		/* repair not yet started? */
		finished_fsclean = TRANS_ISTRANS(ufsvfsp) ? FSLOG : FSCLEAN;
		if (dfs->fs_clean != finished_fsclean) {
			brelse(bp);
			return (EBUSY);
		}
	}

	/*
	 * if superblock has changed too much, abort
	 */
	if ((mfs->fs_sblkno != dfs->fs_sblkno) ||
	    (mfs->fs_cblkno != dfs->fs_cblkno) ||
	    (mfs->fs_iblkno != dfs->fs_iblkno) ||
	    (mfs->fs_dblkno != dfs->fs_dblkno) ||
	    (mfs->fs_cgoffset != dfs->fs_cgoffset) ||
	    (mfs->fs_cgmask != dfs->fs_cgmask) ||
	    (mfs->fs_bsize != dfs->fs_bsize) ||
	    (mfs->fs_fsize != dfs->fs_fsize) ||
	    (mfs->fs_frag != dfs->fs_frag) ||
	    (mfs->fs_bmask != dfs->fs_bmask) ||
	    (mfs->fs_fmask != dfs->fs_fmask) ||
	    (mfs->fs_bshift != dfs->fs_bshift) ||
	    (mfs->fs_fshift != dfs->fs_fshift) ||
	    (mfs->fs_fragshift != dfs->fs_fragshift) ||
	    (mfs->fs_fsbtodb != dfs->fs_fsbtodb) ||
	    (mfs->fs_sbsize != dfs->fs_sbsize) ||
	    (mfs->fs_nindir != dfs->fs_nindir) ||
	    (mfs->fs_nspf != dfs->fs_nspf) ||
	    (mfs->fs_trackskew != dfs->fs_trackskew) ||
	    (mfs->fs_cgsize != dfs->fs_cgsize) ||
	    (mfs->fs_ntrak != dfs->fs_ntrak) ||
	    (mfs->fs_nsect != dfs->fs_nsect) ||
	    (mfs->fs_spc != dfs->fs_spc) ||
	    (mfs->fs_cpg != dfs->fs_cpg) ||
	    (mfs->fs_ipg != dfs->fs_ipg) ||
	    (mfs->fs_fpg != dfs->fs_fpg) ||
	    (mfs->fs_postblformat != dfs->fs_postblformat) ||
	    (mfs->fs_magic != dfs->fs_magic)) {
		brelse(bp);
		return (EACCES);
	}
	if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time)
		if (mfs->fs_clean == FSLOG) {
			brelse(bp);
			return (EACCES);
		}

	/*
	 * get new summary info
	 */
	if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) {
		brelse(bp);
		return (EIO);
	}

	/*
	 * release old summary info and update in-memory superblock
	 */
	kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize);
	mfs->fs_u.fs_csp = dfs->fs_u.fs_csp;	/* Only entry 0 used */

	/*
	 * update fields allowed to change
	 */
	mfs->fs_size = dfs->fs_size;
	mfs->fs_dsize = dfs->fs_dsize;
	mfs->fs_ncg = dfs->fs_ncg;
	mfs->fs_minfree = dfs->fs_minfree;
	mfs->fs_rotdelay = dfs->fs_rotdelay;
	mfs->fs_rps = dfs->fs_rps;
	mfs->fs_maxcontig = dfs->fs_maxcontig;
	mfs->fs_maxbpg = dfs->fs_maxbpg;
	mfs->fs_csmask = dfs->fs_csmask;
	mfs->fs_csshift = dfs->fs_csshift;
	mfs->fs_optim = dfs->fs_optim;
	mfs->fs_csaddr = dfs->fs_csaddr;
	mfs->fs_cssize = dfs->fs_cssize;
	mfs->fs_ncyl = dfs->fs_ncyl;
	mfs->fs_cstotal = dfs->fs_cstotal;
	mfs->fs_reclaim = dfs->fs_reclaim;

	if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
		mfs->fs_reclaim &= ~FS_RECLAIM;
		mfs->fs_reclaim |= FS_RECLAIMING;
		ufs_thread_start(&ufsvfsp->vfs_reclaim,
		    ufs_thread_reclaim, vfsp);
	}

	/* XXX What to do about sparecon? */

	/* XXX need to copy volume label */

	/*
	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
	 * or if error-locked and ondisk is now clean
	 */
	needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
	if (needs_unlock)
		mutex_enter(&ufsvfsp->vfs_lock);

	if (errlck == UN_ERRLCK) {
		if (finished_fsclean == dfs->fs_clean)
			mfs->fs_clean = finished_fsclean;
		else
			mfs->fs_clean = FSBAD;
		mfs->fs_state = FSOKAY - dfs->fs_time;
	}

	if (FSOKAY != dfs->fs_state + dfs->fs_time ||
	    (dfs->fs_clean == FSBAD))
		mfs->fs_clean = FSBAD;

	if (needs_unlock)
		mutex_exit(&ufsvfsp->vfs_lock);

	brelse(bp);

	return (0);
}

/*
 * ufs_reconcile_inode
 *	reconcile ondisk inode with incore inode
 */
static int
ufs_reconcile_inode(struct inode *ip, void *arg)
{
	int		i;
	int		ndaddr;
	int		niaddr;
	struct dinode	*dp;		/* ondisk inode */
	struct buf	*bp = NULL;
	uid_t		d_uid;
	gid_t		d_gid;
	int		error = 0;
	struct fs	*fs;

	/*
	 * not an inode we care about
	 */
	if (ip->i_ufsvfs != (struct ufsvfs *)arg)
		return (0);

	fs = ip->i_fs;

	/*
	 * Inode reconciliation fails: we made the filesystem quiescent
	 * and we did a ufs_flush() before calling ufs_reconcile_inode()
	 * and thus the inode should not have been changed in between.
	 * Any discrepancies indicate a logic error and a pretty
	 * significant run-state inconsistency we should complain about.
	 */
	if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) {
		cmn_err(CE_WARN, "%s: Inode reconciliation failed for "
		    "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number);
		return (EINVAL);
	}

	/*
	 * get the dinode
	 */
	bp = UFS_BREAD(ip->i_ufsvfs,
	    ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
	    (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dp  = bp->b_un.b_dino;
	dp += itoo(fs, ip->i_number);

	/*
	 * handle Sun's implementation of EFT
	 */
	d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid;
	d_gid = (dp->di_sgid == GID_LONG) ? dp->di_gid : (gid_t)dp->di_sgid;

	rw_enter(&ip->i_contents, RW_WRITER);

	/*
	 * some fields are not allowed to change
	 */
	if ((ip->i_mode != dp->di_mode) ||
	    (ip->i_gen != dp->di_gen) ||
	    (ip->i_uid != d_uid) ||
	    (ip->i_gid != d_gid)) {
		error = EACCES;
		goto out;
	}

	/*
	 * and some are allowed to change
	 */
	ip->i_size = dp->di_size;
	ip->i_ic.ic_flags = dp->di_ic.ic_flags;
	ip->i_blocks = dp->di_blocks;
	ip->i_nlink = dp->di_nlink;
	if (ip->i_flag & IFASTSYMLNK) {
		ndaddr = 1;
		niaddr = 0;
	} else {
		ndaddr = NDADDR;
		niaddr = NIADDR;
	}
	for (i = 0; i < ndaddr; ++i)
		ip->i_db[i] = dp->di_db[i];
	for (i = 0; i < niaddr; ++i)
		ip->i_ib[i] = dp->di_ib[i];

out:
	rw_exit(&ip->i_contents);
	brelse(bp);
	return (error);
}

/*
 * ufs_reconcile
 *	reconcile ondisk superblock/inodes with any incore
 */
static int
ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck)
{
	int	error = 0;

	/*
	 * get rid of as much inmemory data as possible
	 */
	(void) ufs_flush(vfsp);

	/*
	 * reconcile the superblock and inodes
	 */
	if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck))
		return (error);
	if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp))
		return (error);
	/*
	 * allocation blocks may be incorrect; get rid of them
	 */
	(void) ufs_flush(vfsp);

	return (error);
}

/*
 * File system locking
 */
int
ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log)
{
	return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log));
}

/* kernel-internal interface, also used by fix-on-panic */
int
ufs__fiolfs(
	struct vnode *vp,
	struct lockfs *lockfsp,
	int from_user,
	int from_log)
{
	struct ulockfs	*ulp;
	struct lockfs	lfs;
	int		error;
	struct vfs	*vfsp;
	struct ufsvfs	*ufsvfsp;
	int		errlck = NO_ERRLCK;
	int		poll_events = POLLPRI;
	extern struct pollhead ufs_pollhd;
	ulockfs_info_t *head;
	ulockfs_info_t *info;
	int signal = 0;

	/* check valid lock type */
	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
		return (EINVAL);

	if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data)
		return (EIO);

	vfsp = vp->v_vfsp;

	if (vfsp->vfs_flag & VFS_UNMOUNTED) /* has been unmounted */
		return (EIO);

	/* take the lock and check again */
	vfs_lock_wait(vfsp);
	if (vfsp->vfs_flag & VFS_UNMOUNTED) {
		vfs_unlock(vfsp);
		return (EIO);
	}

	/*
	 * Can't wlock or ro/elock fs with accounting or local swap file.
	 * We need to check for this before we grab the ul_lock to avoid
	 * deadlocks with the accounting framework.
	 */
	if ((LOCKFS_IS_WLOCK(lockfsp) || LOCKFS_IS_ELOCK(lockfsp) ||
	    LOCKFS_IS_ROELOCK(lockfsp)) && !from_log) {
		if (ufs_checkaccton(vp) || ufs_checkswapon(vp)) {
			vfs_unlock(vfsp);
			return (EDEADLK);
		}
	}

	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	ulp = &ufsvfsp->vfs_ulockfs;
	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	SEARCH_ULOCKFSP(head, ulp, info);

	/*
	 * Suspend both the reclaim thread and the delete thread.
	 * This must be done outside the lockfs locking protocol.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_suspend(&ufsvfsp->vfs_delete);

	mutex_enter(&ulp->ul_lock);
	atomic_inc_ulong(&ufs_quiesce_pend);

	/*
	 * Quit if there is another lockfs request in progress
	 * that is waiting for existing ufs_vnops to complete.
	 */
	if (ULOCKFS_IS_BUSY(ulp)) {
		error = EBUSY;
		goto errexit;
	}

	/* cannot unlock or downgrade a hard-lock */
	if (ULOCKFS_IS_HLOCK(ulp)) {
		error = EIO;
		goto errexit;
	}

	/* an error lock may be unlocked or relocked, only */
	if (ULOCKFS_IS_ELOCK(ulp)) {
		if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * a read-only error lock may only be upgraded to an
	 * error lock or hard lock
	 */
	if (ULOCKFS_IS_ROELOCK(ulp)) {
		if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) {
			error = EBUSY;
			goto errexit;
		}
	}

	/*
	 * until read-only error locks are fully implemented
	 * just return EINVAL
	 */
	if (LOCKFS_IS_ROELOCK(lockfsp)) {
		error = EINVAL;
		goto errexit;
	}

	/*
	 * an error lock may only be applied if the file system is
	 * unlocked or already error locked.
	 * (this is to prevent the case where a fs gets changed out from
	 * underneath a fs that is locked for backup,
	 * that is, name/delete/write-locked.)
	 */
	if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) &&
	    !ULOCKFS_IS_ROELOCK(ulp)) &&
	    (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) {
		error = EBUSY;
		goto errexit;
	}

	/* get and validate the input lockfs request */
	if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs))
		goto errexit;

	/*
	 * save current ulockfs struct
	 */
	bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs));

	/*
	 * Freeze the file system (pend future accesses)
	 */
	ufs_freeze(ulp, lockfsp);

	/*
	 * Set locking in progress because ufs_quiesce may free the
	 * ul_lock mutex.
	 */
	ULOCKFS_SET_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_SET_BUSY(&ulp->ul_lockfs);

	/*
	 * We need to unset FWLOCK status before we call ufs_quiesce
	 * so that the thread doesn't get suspended. We do this only if
	 * this (fallocate) thread requested an unlock operation.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (!ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_CLR_FWLOCK(ulp);
	}

	/*
	 * Quiesce (wait for outstanding accesses to finish)
	 */
	if (error = ufs_quiesce(ulp)) {
		/*
		 * Interrupted due to signal. There could still be
		 * pending vnops.
		 */
		signal = 1;

		/*
		 * We do broadcast because lock-status
		 * could be reverted to old status.
		 */
		cv_broadcast(&ulp->ul_cv);
		goto errout;
	}

	/*
	 * If the fallocate thread requested a write fs lock operation
	 * then we set fwlock status in the ulp.
	 */
	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
		if (ULOCKFS_IS_WLOCK(ulp))
			ULOCKFS_SET_FWLOCK(ulp);
	}

	/*
	 * save error lock status to pass down to reconciliation
	 * routines and for later cleanup
	 */
	if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp))
		errlck = UN_ERRLCK;

	if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) {
		int needs_unlock;
		int needs_sbwrite;

		poll_events |= POLLERR;
		errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ?
		    RE_ERRLCK : SET_ERRLCK;

		needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock);
		if (needs_unlock)
			mutex_enter(&ufsvfsp->vfs_lock);

		/* disable delayed i/o */
		needs_sbwrite = 0;

		if (errlck == SET_ERRLCK) {
			ufsvfsp->vfs_fs->fs_clean = FSBAD;
			needs_sbwrite = 1;
		}

		needs_sbwrite |= ufsvfsp->vfs_dio;
		ufsvfsp->vfs_dio = 0;

		if (needs_unlock)
			mutex_exit(&ufsvfsp->vfs_lock);

		if (needs_sbwrite) {
			ulp->ul_sbowner = curthread;
			TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);

			if (needs_unlock)
				mutex_enter(&ufsvfsp->vfs_lock);

			ufsvfsp->vfs_fs->fs_fmod = 0;

			if (needs_unlock)
				mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	/*
	 * reconcile superblock and inodes if was wlocked
	 */
	if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) {
		if (error = ufs_reconcile(vfsp, ufsvfsp, errlck))
			goto errout;
		/*
		 * in case the fs grew; reset the metadata map for logging tests
		 */
		TRANS_MATA_UMOUNT(ufsvfsp);
		TRANS_MATA_MOUNT(ufsvfsp);
		TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs);
	}

	/*
	 * At least everything *currently* dirty goes out.
	 */

	if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) &&
	    !ULOCKFS_IS_ELOCK(ulp))
		goto errout;

	/*
	 * thaw file system and wakeup pended processes
	 */
	if (error = ufs_thaw(vfsp, ufsvfsp, ulp))
		if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp))
			goto errout;

	/*
	 * reset modified flag if not already write locked
	 */
	if (!LOCKFS_IS_WLOCK(&lfs))
		ULOCKFS_CLR_MOD(ulp);

	/*
	 * idle the lock struct
	 */
	ULOCKFS_CLR_BUSY(ulp);
	/* update the ioctl copy */
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

	/*
	 * free current comment
	 */
	if (lfs.lf_comment && lfs.lf_comlen != 0) {
		kmem_free(lfs.lf_comment, lfs.lf_comlen);
		lfs.lf_comment = NULL;
		lfs.lf_comlen = 0;
	}

	/* do error lock cleanup */
	if (errlck == UN_ERRLCK)
		ufsfx_unlockfs(ufsvfsp);

	else if (errlck == RE_ERRLCK)
		ufsfx_lockfs(ufsvfsp);

	/* don't allow error lock from user to invoke panic */
	else if (from_user && errlck == SET_ERRLCK &&
	    !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4)))
		(void) ufs_fault(ufsvfsp->vfs_root,
		    ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ?
		    ulp->ul_lockfs.lf_comment : "user-applied error lock");

	atomic_dec_ulong(&ufs_quiesce_pend);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs))
		poll_events |= POLLERR;

	pollwakeup(&ufs_pollhd, poll_events);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (0);

errout:
	/*
	 * Lock failed. Reset the old lock in ufsvfs if not hard locked.
	 */
	if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) {
		bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs));
		ulp->ul_fs_lock = (1 << lfs.lf_lock);
	}

	/*
	 * Don't call ufs_thaw() when there's a signal during
	 * ufs quiesce operation as it can lead to deadlock
	 * with getpage.
	 */
	if (signal == 0)
		(void) ufs_thaw(vfsp, ufsvfsp, ulp);

	ULOCKFS_CLR_BUSY(ulp);
	LOCKFS_CLR_BUSY(&ulp->ul_lockfs);

errexit:
	atomic_dec_ulong(&ufs_quiesce_pend);
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Allow both the delete thread and the reclaim thread to
	 * continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_delete);
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	return (error);
}

/*
 * fiolfss
 *	return the current file system locking state info
 */
int
ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp)
{
	struct ulockfs	*ulp;

	if (!vp || !vp->v_vfsp || !VTOI(vp))
		return (EINVAL);

	/* file system has been forcibly unmounted */
	if (VTOI(vp)->i_ufsvfs == NULL)
		return (EIO);

	ulp = VTOUL(vp);

	if (ULOCKFS_IS_HLOCK(ulp)) {
		*lockfsp = ulp->ul_lockfs;	/* structure assignment */
		return (0);
	}

	mutex_enter(&ulp->ul_lock);

	*lockfsp = ulp->ul_lockfs;	/* structure assignment */

	if (ULOCKFS_IS_MOD(ulp))
		lockfsp->lf_flags |= LOCKFS_MOD;

	mutex_exit(&ulp->ul_lock);

	return (0);
}
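
/*
 * For reference (a sketch of the userland view, not part of this file's
 * interface proper): lockfs(1M) reaches ufs__fiolfs() and ufs_fiolfss()
 * through the _FIOLFS and _FIOLFSS ioctls from <sys/filio.h>, roughly:
 *
 *	struct lockfs lf = { 0 };
 *	lf.lf_lock = LOCKFS_WLOCK;
 *	ioctl(fd, _FIOLFS, &lf);	(apply a write lock)
 *	ioctl(fd, _FIOLFSS, &lf);	(query the current lock state)
 */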

/*
 * ufs_check_lockfs
 *	check whether a ufs_vnops call conflicts with the file system lock
 */
int
ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask)
{
	k_sigset_t	smask;
	int		sig, slock;

	ASSERT(MUTEX_HELD(&ulp->ul_lock));

	while (ulp->ul_fs_lock & mask) {
		slock = (int)ULOCKFS_IS_SLOCK(ulp);
		if ((curthread->t_flag & T_DONTPEND) && !slock) {
			curthread->t_flag |= T_WOULDBLOCK;
			return (EAGAIN);
		}
		curthread->t_flag &= ~T_WOULDBLOCK;

		/*
		 * In the case of an onerr umount of the fs, threads could
		 * have blocked before coming into ufs_check_lockfs and
		 * need to check for the special case of ELOCK and
		 * vfs_dontblock being set which would indicate that the fs
		 * is on its way out and will not return, therefore making
		 * EIO the appropriate response.
		 */
		if (ULOCKFS_IS_HLOCK(ulp) ||
		    (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
			return (EIO);

		/*
		 * wait for lock status to change
		 */
		if (slock || ufsvfsp->vfs_nointr) {
			cv_wait(&ulp->ul_cv, &ulp->ul_lock);
		} else {
			sigintr(&smask, 1);
			sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
			sigunintr(&smask);
			if ((!sig && (ulp->ul_fs_lock & mask)) ||
			    ufsvfsp->vfs_dontblock)
				return (EINTR);
		}
	}

	if (mask & ULOCKFS_FWLOCK) {
		atomic_inc_ulong(&ulp->ul_falloc_cnt);
		ULOCKFS_SET_FALLOC(ulp);
	} else {
		atomic_inc_ulong(&ulp->ul_vnops_cnt);
	}

	return (0);
}

/*
 * Check whether we came across the handcrafted lockfs protocol path. We can't
 * simply check for T_DONTBLOCK here as one would assume, since that can also
 * falsely catch recursive VOP's going to a different filesystem. Instead we
 * check if we already hold the ulockfs->ul_lock mutex.
 */
static int
ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp)
{
	return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1);
}

/*
 * ufs_lockfs_begin - start the lockfs locking protocol
 */
int
ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
{
	int		error;
	int		rec_vop;
	ushort_t	op_cnt_incremented = 0;
	ulong_t		*ctr;
	struct ulockfs *ulp;
	ulockfs_info_t	*ulockfs_info;
	ulockfs_info_t	*ulockfs_info_free;
	ulockfs_info_t	*ulockfs_info_temp;

	/*
	 * file system has been forcibly unmounted
	 */
	if (ufsvfsp == NULL)
		return (EIO);

	*ulpp = ulp = &ufsvfsp->vfs_ulockfs;

	/*
	 * Do lockfs protocol
	 */
	ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
	IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);

	/*
	 * Detect recursive VOP call or handcrafted internal lockfs protocol
	 * path and bail out in that case.
	 */
	if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
		*ulpp = NULL;
		return (0);
	} else {
		if (ulockfs_info_free == NULL) {
			if ((ulockfs_info_temp = (ulockfs_info_t *)
			    kmem_zalloc(sizeof (ulockfs_info_t),
			    KM_NOSLEEP)) == NULL) {
				*ulpp = NULL;
				return (ENOMEM);
			}
		}
	}

	/*
	 * First time VOP call
	 *
	 * Increment the ctr irrespective of the lockfs state. If the lockfs
	 * state is not ULOCKFS_ULOCK, we can decrement it later. However,
	 * before incrementing we need to check if there is a pending quiesce
	 * request because if we have a continuous stream of ufs_lockfs_begin
	 * requests pounding on a few cpu's then the ufs_quiesce thread might
	 * never see the value of zero for ctr - a livelock kind of scenario.
	 */
	ctr = (mask & ULOCKFS_FWLOCK) ?
	    &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
	if (!ULOCKFS_IS_SLOCK(ulp)) {
		atomic_inc_ulong(ctr);
		op_cnt_incremented++;
	}

	/*
	 * If the lockfs state (indicated by ul_fs_lock) is not just
	 * ULOCKFS_ULOCK, then we will be routed through ufs_check_lockfs
	 * where there is a check with an appropriate mask to selectively allow
	 * operations permitted for that kind of lockfs state.
	 *
	 * Even these selective operations should not be allowed to go through
	 * if a lockfs request is in progress because that could result in
	 * inode modifications during a quiesce and could hence result in inode
	 * reconciliation failures. ULOCKFS_SLOCK alone would not be sufficient,
	 * so make use of ufs_quiesce_pend to disallow vnode operations when a
	 * quiesce is in progress.
	 */
	if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
		if (op_cnt_incremented)
			if (!atomic_dec_ulong_nv(ctr))
				cv_broadcast(&ulp->ul_cv);
		mutex_enter(&ulp->ul_lock);
		error = ufs_check_lockfs(ufsvfsp, ulp, mask);
		mutex_exit(&ulp->ul_lock);
		if (error) {
			if (ulockfs_info_free == NULL)
				kmem_free(ulockfs_info_temp,
				    sizeof (ulockfs_info_t));
			return (error);
		}
	} else {
		/*
		 * This is the common case of a file system in an unlocked
		 * state.
		 *
		 * If a file system is unlocked, we would expect the ctr to have
		 * been incremented by now. But this will not be true when a
		 * quiesce is winding up - SLOCK was set when we checked before
		 * incrementing the ctr, but by the time we checked for
		 * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. It is okay
		 * to take ul_lock and go through the slow path in this uncommon
		 * case.
		 */
		if (op_cnt_incremented == 0) {
			mutex_enter(&ulp->ul_lock);
			error = ufs_check_lockfs(ufsvfsp, ulp, mask);
			if (error) {
				mutex_exit(&ulp->ul_lock);
				if (ulockfs_info_free == NULL)
					kmem_free(ulockfs_info_temp,
					    sizeof (ulockfs_info_t));
				return (error);
			}
			if (mask & ULOCKFS_FWLOCK)
				ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		} else if (mask & ULOCKFS_FWLOCK) {
			mutex_enter(&ulp->ul_lock);
			ULOCKFS_SET_FALLOC(ulp);
			mutex_exit(&ulp->ul_lock);
		}
	}

	if (ulockfs_info_free != NULL) {
		ulockfs_info_free->ulp = ulp;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
	} else {
		ulockfs_info_temp->ulp = ulp;
		ulockfs_info_temp->next = ulockfs_info;
		if (mask & ULOCKFS_FWLOCK)
			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
		ASSERT(ufs_lockfs_key != 0);
		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
	}

	curthread->t_flag |= T_DONTBLOCK;
	return (0);
}
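
/*
 * Typical use by a vnode operation (a sketch modeled on the callers in
 * ufs_vnops.c; error handling elided). A NULL ulp out-parameter means the
 * call was recursive and no end-of-VOP processing is needed:
 *
 *	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
 *	if (error)
 *		return (error);
 *	... the operation itself ...
 *	if (ulp)
 *		ufs_lockfs_end(ulp);
 */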
1461 
1462 /*
1463  * Check whether we are returning from the top level VOP.
1464  */
1465 static int
1466 ufs_lockfs_top_vop_return(ulockfs_info_t *head)
1467 {
1468         ulockfs_info_t *info;
1469         int result = 1;
1470 
1471         for (info = head; info != NULL; info = info->next) {
1472                 if (info->ulp != NULL) {
1473                         result = 0;
1474                         break;
1475                 }
1476         }
1477 
1478         return (result);
1479 }
1480 
1481 /*
1482  * ufs_lockfs_end - terminate the lockfs locking protocol
1483  */
1484 void
1485 ufs_lockfs_end(struct ulockfs *ulp)
1486 {
1487         ulockfs_info_t *info;
1488         ulockfs_info_t *head;
1489 
1490         /*
1491          * end-of-VOP protocol
1492          */
1493         if (ulp == NULL)
1494                 return;
1495 
1496         head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1497         SEARCH_ULOCKFSP(head, ulp, info);
1498 
1499         /*
1500          * If we're called from a first level VOP, we have to have a
1501          * valid ulockfs record in the TSD.
1502          */
1503         ASSERT(info != NULL);
1504 
1505         /*
1506          * Invalidate the ulockfs record.
1507          */
1508         info->ulp = NULL;
1509 
1510         if (ufs_lockfs_top_vop_return(head))
1511                 curthread->t_flag &= ~T_DONTBLOCK;
1512 
1513         /* fallocate thread */
1514         if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
1515                 /* Clear the thread's fallocate state */
1516                 info->flags &= ~ULOCK_INFO_FALLOCATE;
1517                 if (!atomic_dec_ulong_nv(&ulp->ul_falloc_cnt)) {
1518                         mutex_enter(&ulp->ul_lock);
1519                         ULOCKFS_CLR_FALLOC(ulp);
1520                         cv_broadcast(&ulp->ul_cv);
1521                         mutex_exit(&ulp->ul_lock);
1522                 }
1523         } else  { /* normal thread */
1524                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
1525                         cv_broadcast(&ulp->ul_cv);
1526         }
1527 }
1528 
1529 /*
1530  * ufs_lockfs_trybegin - try to start the lockfs locking protocol without
1531  * blocking.
1532  */
1533 int
1534 ufs_lockfs_trybegin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask)
1535 {
1536         int             error = 0;
1537         int             rec_vop;
1538         ushort_t        op_cnt_incremented = 0;
1539         ulong_t         *ctr;
1540         struct ulockfs *ulp;
1541         ulockfs_info_t  *ulockfs_info;
1542         ulockfs_info_t  *ulockfs_info_free;
1543         ulockfs_info_t  *ulockfs_info_temp;
1544 
1545         /*
1546          * file system has been forcibly unmounted
1547          */
1548         if (ufsvfsp == NULL)
1549                 return (EIO);
1550 
1551         *ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1552 
1553         /*
1554          * Do lockfs protocol
1555          */
1556         ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1557         IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1558 
1559         /*
1560          * Detect recursive VOP call or handcrafted internal lockfs protocol
1561          * path and bail out in that case.
1562          */
1563         if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1564                 *ulpp = NULL;
1565                 return (0);
1566         } else {
1567                 if (ulockfs_info_free == NULL) {
1568                         if ((ulockfs_info_temp = (ulockfs_info_t *)
1569                             kmem_zalloc(sizeof (ulockfs_info_t),
1570                             KM_NOSLEEP)) == NULL) {
1571                                 *ulpp = NULL;
1572                                 return (ENOMEM);
1573                         }
1574                 }
1575         }
1576 
1577         /*
1578          * First time VOP call
1579          *
1580          * Increment the ctr irrespective of the lockfs state. If the lockfs
1581          * state is not ULOCKFS_ULOCK, we can decrement it later. However,
1582          * before incrementing we need to check if there is a pending quiesce
1583          * request, because if we have a continuous stream of ufs_lockfs_begin
1584          * requests pounding on a few CPUs then the ufs_quiesce thread might
1585          * never see ctr reach zero - a livelock scenario.
1586          */
1587         ctr = (mask & ULOCKFS_FWLOCK) ?
1588             &ulp->ul_falloc_cnt : &ulp->ul_vnops_cnt;
1589         if (!ULOCKFS_IS_SLOCK(ulp)) {
1590                 atomic_inc_ulong(ctr);
1591                 op_cnt_incremented++;
1592         }
1593 
1594         if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1595                 /*
1596                  * Non-blocking version of ufs_check_lockfs() code.
1597                  *
1598                  * If the file system is not hard locked or error locked
1599                  * and if ulp->ul_fs_lock allows this operation, increment
1600                  * the appropriate counter and proceed (e.g., if the file
1601                  * system is delete-locked, an mmap can still go through).
1602                  */
1603                 if (op_cnt_incremented)
1604                         if (!atomic_dec_ulong_nv(ctr))
1605                                 cv_broadcast(&ulp->ul_cv);
1606                 mutex_enter(&ulp->ul_lock);
1607                 if (ULOCKFS_IS_HLOCK(ulp) ||
1608                     (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1609                         error = EIO;
1610                 else if (ulp->ul_fs_lock & mask)
1611                         error = EAGAIN;
1612 
1613                 if (error) {
1614                         mutex_exit(&ulp->ul_lock);
1615                         if (ulockfs_info_free == NULL)
1616                                 kmem_free(ulockfs_info_temp,
1617                                     sizeof (ulockfs_info_t));
1618                         return (error);
1619                 }
1620                 atomic_inc_ulong(ctr);
1621                 if (mask & ULOCKFS_FWLOCK)
1622                         ULOCKFS_SET_FALLOC(ulp);
1623                 mutex_exit(&ulp->ul_lock);
1624         } else {
1625                 /*
1626                  * This is the common case of an unlocked file system.
1627                  *
1628                  * If a file system is unlocked, we would expect the ctr to have
1629                  * been incremented by now. But this will not be true when a
1630                  * quiesce is winding up - SLOCK was set when we checked before
1631                  * incrementing the ctr, but by the time we checked for
1632                  * ULOCKFS_IS_JUSTULOCK, the quiesce thread was gone. Take
1633                  * ul_lock and go through the non-blocking version of
1634                  * ufs_check_lockfs() code.
1635                  */
1636                 if (op_cnt_incremented == 0) {
1637                         mutex_enter(&ulp->ul_lock);
1638                         if (ULOCKFS_IS_HLOCK(ulp) ||
1639                             (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock))
1640                                 error = EIO;
1641                         else if (ulp->ul_fs_lock & mask)
1642                                 error = EAGAIN;
1643 
1644                         if (error) {
1645                                 mutex_exit(&ulp->ul_lock);
1646                                 if (ulockfs_info_free == NULL)
1647                                         kmem_free(ulockfs_info_temp,
1648                                             sizeof (ulockfs_info_t));
1649                                 return (error);
1650                         }
1651                         atomic_inc_ulong(ctr);
1652                         if (mask & ULOCKFS_FWLOCK)
1653                                 ULOCKFS_SET_FALLOC(ulp);
1654                         mutex_exit(&ulp->ul_lock);
1655                 } else if (mask & ULOCKFS_FWLOCK) {
1656                         mutex_enter(&ulp->ul_lock);
1657                         ULOCKFS_SET_FALLOC(ulp);
1658                         mutex_exit(&ulp->ul_lock);
1659                 }
1660         }
1661 
1662         if (ulockfs_info_free != NULL) {
1663                 ulockfs_info_free->ulp = ulp;
1664                 if (mask & ULOCKFS_FWLOCK)
1665                         ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
1666         } else {
1667                 ulockfs_info_temp->ulp = ulp;
1668                 ulockfs_info_temp->next = ulockfs_info;
1669                 if (mask & ULOCKFS_FWLOCK)
1670                         ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
1671                 ASSERT(ufs_lockfs_key != 0);
1672                 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1673         }
1674 
1675         curthread->t_flag |= T_DONTBLOCK;
1676         return (0);
1677 }
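
/*
 * A minimal caller sketch (hypothetical; how a real caller redrives the
 * operation after EAGAIN is up to that caller):
 *
 *      error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 *      if (error == EAGAIN) {
 *              ... drop locks the quiesce thread may need, then
 *              ... retry via the blocking ufs_lockfs_begin()
 *      } else if (error)
 *              return (error);
 */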
1678 
1679 /*
1680  * ufs_lockfs_begin_getpage - specialized ufs_lockfs_begin() for ufs_getpage().
1681  */
1682 int
1683 ufs_lockfs_begin_getpage(
1684         struct ufsvfs   *ufsvfsp,
1685         struct ulockfs  **ulpp,
1686         struct seg      *seg,
1687         int             read_access,
1688         uint_t          *protp)
1689 {
1690         ulong_t                 mask;
1691         int                     error;
1692         int                     rec_vop;
1693         struct ulockfs          *ulp;
1694         ulockfs_info_t          *ulockfs_info;
1695         ulockfs_info_t          *ulockfs_info_free;
1696         ulockfs_info_t          *ulockfs_info_temp;
1697 
1698         /*
1699          * file system has been forcibly unmounted
1700          */
1701         if (ufsvfsp == NULL)
1702                 return (EIO);
1703 
1704         *ulpp = ulp = &ufsvfsp->vfs_ulockfs;
1705 
1706         /*
1707          * Do lockfs protocol
1708          */
1709         ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
1710         IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free);
1711 
1712         /*
1713          * Detect recursive VOP call or handcrafted internal lockfs protocol
1714          * path and bail out in that case.
1715          */
1716         if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) {
1717                 *ulpp = NULL;
1718                 return (0);
1719         } else {
1720                 if (ulockfs_info_free == NULL) {
1721                         if ((ulockfs_info_temp = (ulockfs_info_t *)
1722                             kmem_zalloc(sizeof (ulockfs_info_t),
1723                             KM_NOSLEEP)) == NULL) {
1724                                 *ulpp = NULL;
1725                                 return (ENOMEM);
1726                         }
1727                 }
1728         }
1729 
1730         /*
1731          * First time VOP call
1732          */
1733         atomic_inc_ulong(&ulp->ul_vnops_cnt);
1734         if (!ULOCKFS_IS_JUSTULOCK(ulp) || ufs_quiesce_pend) {
1735                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
1736                         cv_broadcast(&ulp->ul_cv);
1737                 mutex_enter(&ulp->ul_lock);
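                /*
                 * A private (non-MAP_SHARED) segvn mapping only faults
                 * pages in from the file; modified pages are pushed to
                 * anon space instead, so the read mask suffices for it.
                 */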
1738                 if (seg->s_ops == &segvn_ops &&
1739                     ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) {
1740                         mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1741                 } else if (protp && read_access) {
1742                          * Restrict the mapping to read-only.
1743                          * Writes to this mapping will cause
1744                          * another fault, which will then
1745                          * be suspended if the fs is write-locked.
1746                          * be suspended if fs is write locked
1747                          */
1748                         *protp &= ~PROT_WRITE;
1749                         mask = (ulong_t)ULOCKFS_GETREAD_MASK;
1750                 } else
1751                         mask = (ulong_t)ULOCKFS_GETWRITE_MASK;
1752 
1753                 /*
1754                  * will sleep if this fs is locked against this VOP
1755                  */
1756                 error = ufs_check_lockfs(ufsvfsp, ulp, mask);
1757                 mutex_exit(&ulp->ul_lock);
1758                 if (error) {
1759                         if (ulockfs_info_free == NULL)
1760                                 kmem_free(ulockfs_info_temp,
1761                                     sizeof (ulockfs_info_t));
1762                         return (error);
1763                 }
1764         }
1765 
1766         if (ulockfs_info_free != NULL) {
1767                 ulockfs_info_free->ulp = ulp;
1768         } else {
1769                 ulockfs_info_temp->ulp = ulp;
1770                 ulockfs_info_temp->next = ulockfs_info;
1771                 ASSERT(ufs_lockfs_key != 0);
1772                 (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
1773         }
1774 
1775         curthread->t_flag |= T_DONTBLOCK;
1776         return (0);
1777 }
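
/*
 * A hedged sketch of the expected use from the fault path (the argument
 * expressions are assumptions, not a literal ufs_getpage() excerpt):
 *
 *      uint_t prot = PROT_ALL;
 *
 *      error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
 *          rw == S_READ || rw == S_EXEC, &prot);
 *      if (error)
 *              return (error);
 *      ... locate or create the pages ...
 *      if (ulp)
 *              ufs_lockfs_end(ulp);
 *
 * Passing &prot lets this routine strip PROT_WRITE, so a later write
 * fault re-enters here and can then be held off by a write lock.
 */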
1778 
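/*
 * Destructor for the per-thread lockfs TSD list.  Registration is
 * presumed to happen once at initialization, along the lines of:
 *
 *      tsd_create(&ufs_lockfs_key, ufs_lockfs_tsd_destructor);
 *
 * after which thread_exit() invokes this destructor with the thread's
 * list head.
 */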
1779 void
1780 ufs_lockfs_tsd_destructor(void *head)
1781 {
1782         ulockfs_info_t *curr = (ulockfs_info_t *)head;
1783         ulockfs_info_t *temp;
1784 
1785         while (curr != NULL) {
1786                 /*
1787                  * The TSD destructor is called when the thread exits
1788                  * (via thread_exit()). By then the thread must have cleaned
1789                  * up all VOPs via ufs_lockfs_end(), so no record in the
1790                  * list may still hold a valid ulockfs pointer.
1791                  */
1792                 temp = curr;
1793                 curr = curr->next;
1794                 ASSERT(temp->ulp == NULL);
1795                 kmem_free(temp, sizeof (ulockfs_info_t));
1796         }
1797 }