/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/tsol/tnet.h>
#include <sys/priv.h>
#include <sys/sdt.h>
#include <sys/attr.h>

#include <inet/ip6.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>

#include <sys/tsol/label.h>

/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue must be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information about the file that
 * can be reused.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode, using it, and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
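/*
 * Illustrative sketch only (not compiled in): the reactivation step that
 * the discipline above implies when a lookup finds an rnode in a hash
 * bucket.  The helper and field names used here (rp_rmfree(), r_freef,
 * r_hashf, RTOV(), VN_HOLD()) are assumed from their use elsewhere in
 * this file; treat this as an outline of the locking rules, not as a
 * drop-in implementation.
 *
 *	rw_enter(&rhtp->r_lock, RW_READER);      (bucket lock, shared or excl)
 *	for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
 *		if (the filehandle and vfsp match) {
 *			mutex_enter(&rpfreelist_lock);
 *			if (rp->r_freef != NULL)
 *				rp_rmfree(rp);       (take over freelist hold)
 *			else
 *				VN_HOLD(RTOV(rp));   (otherwise add a new hold)
 *			mutex_exit(&rpfreelist_lock);
 *			break;
 *		}
 *	}
 *	rw_exit(&rhtp->r_lock);
 */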
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system
 * as a whole and do not correspond to any one particular zone.
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */

/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

static struct kmem_cache *chtab_cache;

/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system
 */
int nfs_disable_rddir_cache = 0;

int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
		    vnode_t **);
static void	nfs_free_r_path(rnode_t *);
static void	nfs_set_vroot(vnode_t *);
static char	*nfs_getsrvnames(mntinfo_t *, size_t *);

/*
 * from rpcsec module (common/rpcsec)
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * used in mount policy
 */
extern ts_label_t *getflabel_cipso(vfs_t *);

/*
 * EIO or EINTR are not recoverable errors.
 */
#define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))

#ifdef DEBUG
#define	SRV_QFULL_MSG	"send queue to NFS%d server %s is full; still trying\n"
#define	SRV_NOTRESP_MSG	"NFS%d server %s not responding still trying\n"
#else
#define	SRV_QFULL_MSG	"send queue to NFS server %s is full still trying\n"
#define	SRV_NOTRESP_MSG	"NFS server %s not responding still trying\n"
#endif
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 */
static int
clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	plistp = &nfscl->nfscl_chtable;
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			mutex_exit(&nfscl->nfscl_chtable_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable_lock);
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
		    &cp->ch_client->cl_auth);
		if (error || cp->ch_client->cl_auth == NULL) {
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
	mutex_exit(&nfscl->nfscl_chtable_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
	cp->ch_head = ch;

	sigintr(&smask, (int)ci->cl_flags & MI_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	auth_destroy(cp->ch_client->cl_auth);
	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
	    &cp->ch_client->cl_auth);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}

int
clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
}

static int
acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If this is a soft mount and the server is down, try only once,
	 * i.e. do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = NFS_ACL_PROGRAM;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}

static int
nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If this is a soft mount and the server is down, try only once,
	 * i.e. do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}

static void
clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
{
	if (cl->cl_auth != NULL) {
		sec_clnt_freeh(cl->cl_auth);
		cl->cl_auth = NULL;
	}

	/*
	 * Timestamp this cache entry so that we know when it was last
	 * used.
	 */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Add the free client handle to the front of the list.
	 * This way, the list will be sorted in youngest to oldest
	 * order.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable_lock);
}

void
clfree(CLIENT *cl, struct chtab *cp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	clfree_impl(cl, cp, nfscl);
}
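
/*
 * A minimal usage sketch (not compiled in) of how a caller is expected to
 * pair clget() and clfree() around an RPC: obtain a cached client handle,
 * issue the call, then return the handle to the per-zone cache.  The
 * clinfo_t setup mirrors nfs_clget() above; the variables mi, cr, which,
 * xdrargs/argsp, xdrres/resp and wait stand in for a caller's context and
 * are assumptions, and all error handling is elided.
 *
 *	clinfo_t ci;
 *	CLIENT *client;
 *	struct chtab *ch;
 *	enum clnt_stat status;
 *
 *	ci.cl_prog = mi->mi_prog;
 *	ci.cl_vers = mi->mi_vers;
 *	ci.cl_readsize = mi->mi_tsize;
 *	ci.cl_retrans = mi->mi_retrans;
 *	ci.cl_flags = mi->mi_flags;
 *	if (clget(&ci, mi->mi_curr_serv, cr, &client, &ch) == 0) {
 *		status = CLNT_CALL(client, which, xdrargs, argsp,
 *		    xdrres, resp, wait);
 *		clfree(client, ch);
 *	}
 */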

#define	CL_HOLDTIME	60	/* time to hold client handles */

static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older than
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry old enough is found, all of the
		 * rest of the entries on the list will be old
		 * enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			*cpp = NULL;
			if (cp != NULL) {
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/* ARGSUSED */
static void
clreclaim(void *all)
{
	struct nfs_clnt *nfscl;

#ifdef DEBUG
	clstat_debug.clreclaim.value.ui64++;
#endif
	/*
	 * The system is low on memory; go through and try to reclaim some from
	 * every zone on the system.
	 */
	mutex_enter(&nfs_clnt_list_lock);
	nfscl = list_head(&nfs_clnt_list);
	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
		clreclaim_zone(nfscl, CL_HOLDTIME);
	mutex_exit(&nfs_clnt_list_lock);
}

/*
 * Minimum time-out values indexed by call type.
 * These units are in "eighths" of a second to avoid multiplies.
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};
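
/*
 * Worked example of the "eighths of a second" scaling (illustration only):
 * the timer setup in rfscall() below computes
 *
 *	(minimum_timeo[call_type] * hz) >> 3
 *
 * so an entry of 6 corresponds to 6/8 of a second, e.g. 75 clock ticks
 * when hz is 100.  Pre-scaling the table by 8 lets the conversion to
 * ticks be done with a shift rather than fractional arithmetic.
 */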

/*
 * Back off for retransmission timeout; MAXTIMO is in units of hz (ticks).
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
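
/*
 * Illustration only: a retransmit timeout that starts at 2*hz doubles
 * through dobackoff() on each retry, 2*hz -> 4*hz -> 8*hz -> 16*hz, and
 * is then clamped at MAXTIMO (20*hz) because doubling 16*hz would exceed
 * the cap.  backoff() leaves a value that has already reached MAXTIMO
 * unchanged.
 */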

#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */

/*
 * Function called when rfscall notices that we have been
 * re-transmitting, or when we get a response without retransmissions.
 * Return 1 if the transfer size was adjusted down - 0 if no change.
 */
static int
nfs_feedback(int flag, int which, mntinfo_t *mi)
{
	int kind;
	int r = 0;

	mutex_enter(&mi->mi_lock);
	if (flag == FEEDBACK_REXMIT1) {
		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
			goto done;
		if (mi->mi_curread > MIN_NFS_TSIZE) {
			mi->mi_curread /= 2;
			if (mi->mi_curread < MIN_NFS_TSIZE)
				mi->mi_curread = MIN_NFS_TSIZE;
			r = 1;
		}

		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
			mi->mi_curwrite /= 2;
			if (mi->mi_curwrite < MIN_NFS_TSIZE)
				mi->mi_curwrite = MIN_NFS_TSIZE;
			r = 1;
		}
	} else if (flag == FEEDBACK_OK) {
		kind = mi->mi_timer_type[which];
		if (kind == 0 ||
		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
			goto done;
		if (kind == 1) {
			if (mi->mi_curread >= mi->mi_tsize)
				goto done;
			mi->mi_curread +=  MIN_NFS_TSIZE;
			if (mi->mi_curread > mi->mi_tsize/2)
				mi->mi_curread = mi->mi_tsize;
		} else if (kind == 2) {
			if (mi->mi_curwrite >= mi->mi_stsize)
				goto done;
			mi->mi_curwrite += MIN_NFS_TSIZE;
			if (mi->mi_curwrite > mi->mi_stsize/2)
				mi->mi_curwrite = mi->mi_stsize;
		}
	}
done:
	mutex_exit(&mi->mi_lock);
	return (r);
}

#ifdef DEBUG
static int rfs2call_hits = 0;
static int rfs2call_misses = 0;
#endif

int
rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    enum nfsstat *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	enum clnt_stat rpc_status;

	ASSERT(statusp != NULL);

	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
	    cr, douprintf, &rpc_status, flags, fi);
	if (!rpcerror) {
		/*
		 * See crnetadjust() for comments.
		 */
		if (*statusp == NFSERR_ACCES &&
		    (cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
			rfs2call_hits++;
#endif
			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
			    resp, cr, douprintf, NULL, flags, fi);
			crfree(cr);
#ifdef DEBUG
			if (*statusp == NFSERR_ACCES)
				rfs2call_misses++;
#endif
		}
	} else if (rpc_status == RPC_PROCUNAVAIL) {
		*statusp = NFSERR_OPNOTSUPP;
		rpcerror = 0;
	}

	return (rpcerror);
}

#define	NFS3_JUKEBOX_DELAY	10 * hz

static clock_t nfs3_jukebox_delay = 0;

#ifdef DEBUG
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif

int
rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    nfsstat3 *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	int user_informed;

	user_informed = 0;
	do {
		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
		    cr, douprintf, NULL, flags, fi);
		if (!rpcerror) {
			cred_t *crr;
			if (*statusp == NFS3ERR_JUKEBOX) {
				if (ttoproc(curthread) == &p0) {
					rpcerror = EAGAIN;
					break;
				}
				if (!user_informed) {
					user_informed = 1;
					uprintf(
		"file temporarily unavailable on the server, retrying...\n");
				}
				delay(nfs3_jukebox_delay);
			}
			/*
			 * See crnetadjust() for comments.
			 */
			else if (*statusp == NFS3ERR_ACCES &&
			    (crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
				rfs3call_hits++;
#endif
				rpcerror = rfscall(mi, which, xdrargs, argsp,
				    xdrres, resp, crr, douprintf,
				    NULL, flags, fi);

				crfree(crr);
#ifdef DEBUG
				if (*statusp == NFS3ERR_ACCES)
					rfs3call_misses++;
#endif
			}
		}
	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);

	return (rpcerror);
}

#define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	mi->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	mi->mi_readers--; \
	if (mi->mi_readers == 0) \
		cv_broadcast(&mi->mi_failover_cv); \
}

static int
rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
    enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	enum clnt_stat status;
	struct rpc_err rpcerr, rpcerr_tmp;
	struct timeval wait;
	int timeo;		/* in units of hz */
	int my_rsize, my_wsize;
	bool_t tryagain;
	bool_t cred_cloned = FALSE;
	k_sigset_t smask;
	servinfo_t *svp;
	struct nfs_clnt *nfscl;
	zoneid_t zoneid = getzoneid();
	char *msg;
#ifdef DEBUG
	char *bufp;
#endif

	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
	    "rfscall_start:which %d mi %p", which, mi);

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_reqs[which].value.ui64++;

	rpcerr.re_status = RPC_SUCCESS;

	/*
	 * In case of forced unmount or zone shutdown, return EIO.
	 */

	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
		rpcerr.re_status = RPC_FAILED;
		rpcerr.re_errno = EIO;
		return (rpcerr.re_errno);
	}

	/*
	 * Remember the transfer sizes in case
	 * nfs_feedback changes them underneath us.
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "rfscall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);
					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
					 */
					if ((mi->mi_flags & MI_HARD) &&
					    IS_RECOVERABLE_ERROR(remaperr)) {
						if (svp == mi->mi_curr_serv)
							failover_newserver(mi);
						rpcerr.re_status = RPC_SUCCESS;
						goto failoverretry;
					}
					rpcerr.re_errno = remaperr;
					return (remaperr);
				}
			}
			if (fi->fhp && fi->copyproc)
				(*fi->copyproc)(fi->fhp, fi->vp);
		}
	}

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);

	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		DEC_READERS(mi);
		mutex_exit(&mi->mi_lock);

		if ((rpcerr.re_errno == ETIMEDOUT ||
		    rpcerr.re_errno == ECONNRESET) &&
		    failover_safe(fi)) {
			if (svp == mi->mi_curr_serv)
				failover_newserver(mi);
			goto failoverretry;
		}
	}
	if (rpcerr.re_errno != 0)
		return (rpcerr.re_errno);

	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
		timeo = (mi->mi_timeo * hz) / 10;
	} else {
		mutex_enter(&mi->mi_lock);
		timeo = CLNT_SETTIMERS(client,
		    &(mi->mi_timers[mi->mi_timer_type[which]]),
		    &(mi->mi_timers[NFS_CALLTYPES]),
		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
		    (void (*)())NULL, (caddr_t)mi, 0);
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * If hard mounted fs, retry call forever unless hard error occurs.
	 */
	do {
		tryagain = FALSE;

		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			status = RPC_FAILED;
			rpcerr.re_status = RPC_FAILED;
			rpcerr.re_errno = EIO;
			break;
		}

		TICK_TO_TIMEVAL(timeo, &wait);

		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM (preserving the existing masks).  Mask out
		 * SIGINT if the mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = TRUE;

		/*
		 * If there is a current signal, then don't bother
		 * even trying to send out the request because we
		 * won't be able to block waiting for the response.
		 * Simply assume RPC_INTR and get on with it.
		 */
		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
			status = RPC_INTR;
		else {
			status = CLNT_CALL(client, which, xdrargs, argsp,
			    xdrres, resp, wait);
		}

		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = FALSE;
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);

		switch (status) {
		case RPC_SUCCESS:
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize))
				(void) nfs_feedback(FEEDBACK_OK, which, mi);
			break;

		case RPC_INTR:
			/*
			 * There is no way to recover from this error,
			 * even if mount option nointr is specified.
			 * SIGKILL, for example, cannot be blocked.
			 */
			rpcerr.re_status = RPC_INTR;
			rpcerr.re_errno = EINTR;
			break;

		case RPC_UDERROR:
			/*
			 * If the NFS server is local (vold) and
			 * it goes away, then we get RPC_UDERROR.
			 * This is a retryable error, so we would
			 * loop; check to see whether the specific
			 * error was ECONNRESET, indicating that
			 * the target did not exist at all.  If so,
			 * return with RPC_PROGUNAVAIL and
			 * ECONNRESET to indicate why.
			 */
1167                         CLNT_GETERR(client, &rpcerr);
1168                         if (rpcerr.re_errno == ECONNRESET) {
1169                                 rpcerr.re_status = RPC_PROGUNAVAIL;
1170                                 rpcerr.re_errno = ECONNRESET;
1171                                 break;
1172                         }
1173                         /*FALLTHROUGH*/
1174 
1175                 default:                /* probably RPC_TIMEDOUT */
1176                         if (IS_UNRECOVERABLE_RPC(status))
1177                                 break;
1178 
1179                         /*
1180                          * increment server not responding count
1181                          */
1182                         mutex_enter(&mi->mi_lock);
1183                         mi->mi_noresponse++;
1184                         mutex_exit(&mi->mi_lock);
1185 #ifdef DEBUG
1186                         nfscl->nfscl_stat.noresponse.value.ui64++;
1187 #endif
1188 
1189                         if (!(mi->mi_flags & MI_HARD)) {
1190                                 if (!(mi->mi_flags & MI_SEMISOFT) ||
1191                                     (mi->mi_ss_call_type[which] == 0))
1192                                         break;
1193                         }
1194 
1195                         /*
1196                          * The call is in progress (over COTS).
1197                          * Try the CLNT_CALL again, but don't
1198                          * print a noisy error message.
1199                          */
1200                         if (status == RPC_INPROGRESS) {
1201                                 tryagain = TRUE;
1202                                 break;
1203                         }
1204 
1205                         if (flags & RFSCALL_SOFT)
1206                                 break;
1207 
1208                         /*
1209                          * On zone shutdown, just move on.
1210                          */
1211                         if (zone_status_get(curproc->p_zone) >=
1212                             ZONE_IS_SHUTTING_DOWN) {
1213                                 rpcerr.re_status = RPC_FAILED;
1214                                 rpcerr.re_errno = EIO;
1215                                 break;
1216                         }
1217 
1218                         /*
1219                          * NFS client failover support
1220                          *
1221                          * If the current server just failed us, we'll
1222                          * start the process of finding a new server.
1223                          * After that, we can just retry.
1224                          */
1225                         if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1226                                 if (svp == mi->mi_curr_serv)
1227                                         failover_newserver(mi);
1228                                 clfree_impl(client, ch, nfscl);
1229                                 goto failoverretry;
1230                         }
1231 
1232                         tryagain = TRUE;
1233                         timeo = backoff(timeo);
1234 
1235                         CLNT_GETERR(client, &rpcerr_tmp);
1236                         if ((status == RPC_CANTSEND) &&
1237                             (rpcerr_tmp.re_errno == ENOBUFS))
1238                                 msg = SRV_QFULL_MSG;
1239                         else
1240                                 msg = SRV_NOTRESP_MSG;
1241 
1242                         mutex_enter(&mi->mi_lock);
1243                         if (!(mi->mi_flags & MI_PRINTED)) {
1244                                 mi->mi_flags |= MI_PRINTED;
1245                                 mutex_exit(&mi->mi_lock);
1246 #ifdef DEBUG
1247                                 zprintf(zoneid, msg, mi->mi_vers,
1248                                     svp->sv_hostname);
1249 #else
1250                                 zprintf(zoneid, msg, svp->sv_hostname);
1251 #endif
1252                         } else
1253                                 mutex_exit(&mi->mi_lock);
1254                         if (*douprintf && nfs_has_ctty()) {
1255                                 *douprintf = 0;
1256                                 if (!(mi->mi_flags & MI_NOPRINT))
1257 #ifdef DEBUG
1258                                         uprintf(msg, mi->mi_vers,
1259                                             svp->sv_hostname);
1260 #else
1261                                         uprintf(msg, svp->sv_hostname);
1262 #endif
1263                         }
1264 
1265                         /*
1266                          * If doing dynamic adjustment of transfer
1267                          * size and if it's a read or write call
1268                          * and if the transfer size changed while
1269                          * retransmitting or if the feedback routine
1270                          * changed the transfer size,
1271                          * then exit rfscall so that the transfer
1272                          * size can be adjusted at the vnops level.
1273                          */
1274                         if ((mi->mi_flags & MI_DYNAMIC) &&
1275                             mi->mi_timer_type[which] != 0 &&
1276                             (mi->mi_curread != my_rsize ||
1277                             mi->mi_curwrite != my_wsize ||
1278                             nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1279                                 /*
1280                                  * On read or write calls, return
1281                                  * back to the vnode ops level if
1282                                  * the transfer size changed.
1283                                  */
1284                                 clfree_impl(client, ch, nfscl);
1285                                 if (cred_cloned)
1286                                         crfree(cr);
1287                                 return (ENFS_TRYAGAIN);
1288                         }
1289                 }
1290         } while (tryagain);
1291 
1292         if (status != RPC_SUCCESS) {
1293                 /*
1294                  * Let soft mounts use the timed out message.
1295                  */
1296                 if (status == RPC_INPROGRESS)
1297                         status = RPC_TIMEDOUT;
1298                 nfscl->nfscl_stat.badcalls.value.ui64++;
1299                 if (status != RPC_INTR) {
1300                         mutex_enter(&mi->mi_lock);
1301                         mi->mi_flags |= MI_DOWN;
1302                         mutex_exit(&mi->mi_lock);
1303                         CLNT_GETERR(client, &rpcerr);
1304 #ifdef DEBUG
1305                         bufp = clnt_sperror(client, svp->sv_hostname);
1306                         zprintf(zoneid, "NFS%d %s failed for %s\n",
1307                             mi->mi_vers, mi->mi_rfsnames[which], bufp);
1308                         if (nfs_has_ctty()) {
1309                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1310                                         uprintf("NFS%d %s failed for %s\n",
1311                                             mi->mi_vers, mi->mi_rfsnames[which],
1312                                             bufp);
1313                                 }
1314                         }
1315                         kmem_free(bufp, MAXPATHLEN);
1316 #else
1317                         zprintf(zoneid,
1318                             "NFS %s failed for server %s: error %d (%s)\n",
1319                             mi->mi_rfsnames[which], svp->sv_hostname,
1320                             status, clnt_sperrno(status));
1321                         if (nfs_has_ctty()) {
1322                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1323                                         uprintf(
1324                                 "NFS %s failed for server %s: error %d (%s)\n",
1325                                             mi->mi_rfsnames[which],
1326                                             svp->sv_hostname, status,
1327                                             clnt_sperrno(status));
1328                                 }
1329                         }
1330 #endif
1331                         /*
1332                          * when CLNT_CALL() fails with RPC_AUTHERROR,
1333                          * re_errno is set appropriately depending on
1334                          * the authentication error
1335                          */
1336                         if (status == RPC_VERSMISMATCH ||
1337                             status == RPC_PROGVERSMISMATCH)
1338                                 rpcerr.re_errno = EIO;
1339                 }
1340         } else {
1341                 /*
1342                  * Test the value of mi_down and mi_printed without
1343                  * holding the mi_lock mutex.  If they are both zero,
1344                  * then it is okay to skip the down and printed
1345                  * processing.  This saves on a mutex_enter and
1346                  * mutex_exit pair for a normal, successful RPC.
1347                  * This was just complete overhead.
1348                  */
1349                 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1350                         mutex_enter(&mi->mi_lock);
1351                         mi->mi_flags &= ~MI_DOWN;
1352                         if (mi->mi_flags & MI_PRINTED) {
1353                                 mi->mi_flags &= ~MI_PRINTED;
1354                                 mutex_exit(&mi->mi_lock);
1355 #ifdef DEBUG
1356                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357                                         zprintf(zoneid, "NFS%d server %s ok\n",
1358                                             mi->mi_vers, svp->sv_hostname);
1359 #else
1360                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1361                                         zprintf(zoneid, "NFS server %s ok\n",
1362                                             svp->sv_hostname);
1363 #endif
1364                         } else
1365                                 mutex_exit(&mi->mi_lock);
1366                 }
1367 
1368                 if (*douprintf == 0) {
1369                         if (!(mi->mi_flags & MI_NOPRINT))
1370 #ifdef DEBUG
1371                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372                                         uprintf("NFS%d server %s ok\n",
1373                                             mi->mi_vers, svp->sv_hostname);
1374 #else
1375                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1376                                         uprintf("NFS server %s ok\n", svp->sv_hostname);
1377 #endif
1378                         *douprintf = 1;
1379                 }
1380         }
1381 
1382         clfree_impl(client, ch, nfscl);
1383         if (cred_cloned)
1384                 crfree(cr);
1385 
1386         ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1387 
1388         if (rpc_status != NULL)
1389                 *rpc_status = rpcerr.re_status;
1390 
1391         TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1392             rpcerr.re_errno);
1393 
1394         return (rpcerr.re_errno);
1395 }
1396 
1397 #ifdef DEBUG
1398 static int acl2call_hits = 0;
1399 static int acl2call_misses = 0;
1400 #endif
1401 
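     /*
      * NFS_ACL version 2 RPC wrapper.  Issue the call via aclcall() and, if
      * the server replies NFSERR_ACCES, retry once with the credential
      * produced by crnetadjust() (see the comments with crnetadjust()).
      */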
1402 int
1403 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1404     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1405     enum nfsstat *statusp, int flags, failinfo_t *fi)
1406 {
1407         int rpcerror;
1408 
1409         rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1410             cr, douprintf, flags, fi);
1411         if (!rpcerror) {
1412                 /*
1413                  * See comments with crnetadjust().
1414                  */
1415                 if (*statusp == NFSERR_ACCES &&
1416                     (cr = crnetadjust(cr)) != NULL) {
1417 #ifdef DEBUG
1418                         acl2call_hits++;
1419 #endif
1420                         rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1421                             resp, cr, douprintf, flags, fi);
1422                         crfree(cr);
1423 #ifdef DEBUG
1424                         if (*statusp == NFSERR_ACCES)
1425                                 acl2call_misses++;
1426 #endif
1427                 }
1428         }
1429 
1430         return (rpcerror);
1431 }
1432 
1433 #ifdef DEBUG
1434 static int acl3call_hits = 0;
1435 static int acl3call_misses = 0;
1436 #endif
1437 
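     /*
      * NFS_ACL version 3 RPC wrapper.  Issue the call via aclcall(),
      * retrying NFS3ERR_JUKEBOX replies after nfs3_jukebox_delay ticks
      * (informing the user once) and retrying an NFS3ERR_ACCES reply once
      * with the credential produced by crnetadjust().
      */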
1438 int
1439 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1440     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1441     nfsstat3 *statusp, int flags, failinfo_t *fi)
1442 {
1443         int rpcerror;
1444         int user_informed;
1445 
1446         user_informed = 0;
1447 
1448         do {
1449                 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1450                     cr, douprintf, flags, fi);
1451                 if (!rpcerror) {
1452                         cred_t *crr;
1453                         if (*statusp == NFS3ERR_JUKEBOX) {
1454                                 if (!user_informed) {
1455                                         user_informed = 1;
1456                                         uprintf(
1457                 "file temporarily unavailable on the server, retrying...\n");
1458                                 }
1459                                 delay(nfs3_jukebox_delay);
1460                         }
1461                         /*
1462                          * See crnetadjust() for comments.
1463                          */
1464                         else if (*statusp == NFS3ERR_ACCES &&
1465                             (crr = crnetadjust(cr)) != NULL) {
1466 #ifdef DEBUG
1467                                 acl3call_hits++;
1468 #endif
1469                                 rpcerror = aclcall(mi, which, xdrargs, argsp,
1470                                     xdrres, resp, crr, douprintf, flags, fi);
1471 
1472                                 crfree(crr);
1473 #ifdef DEBUG
1474                                 if (*statusp == NFS3ERR_ACCES)
1475                                         acl3call_misses++;
1476 #endif
1477                         }
1478                 }
1479         } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1480 
1481         return (rpcerror);
1482 }
1483 
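     /*
      * Common code to call the NFS_ACL server: gets a client handle, handles
      * client failover and retransmission with backoff, and prints the
      * "not responding"/"ok" console messages.  This parallels rfscall()
      * above, but also turns off MI_ACL/MI_EXTATTR when the server turns out
      * not to support the NFS_ACL program or the extended attribute ops.
      */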
1484 static int
1485 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1486     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1487     int flags, failinfo_t *fi)
1488 {
1489         CLIENT *client;
1490         struct chtab *ch;
1491         cred_t *cr = icr;
1492         bool_t cred_cloned = FALSE;
1493         enum clnt_stat status;
1494         struct rpc_err rpcerr;
1495         struct timeval wait;
1496         int timeo;              /* in units of hz */
1497 #if 0 /* notyet */
1498         int my_rsize, my_wsize;
1499 #endif
1500         bool_t tryagain;
1501         k_sigset_t smask;
1502         servinfo_t *svp;
1503         struct nfs_clnt *nfscl;
1504         zoneid_t zoneid = getzoneid();
1505 #ifdef DEBUG
1506         char *bufp;
1507 #endif
1508 
1509 #if 0 /* notyet */
1510         TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1511             "rfscall_start:which %d mi %p", which, mi);
1512 #endif
1513 
1514         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1515         ASSERT(nfscl != NULL);
1516 
1517         nfscl->nfscl_stat.calls.value.ui64++;
1518         mi->mi_aclreqs[which].value.ui64++;
1519 
1520         rpcerr.re_status = RPC_SUCCESS;
1521 
1522         if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1523                 rpcerr.re_status = RPC_FAILED;
1524                 rpcerr.re_errno = EIO;
1525                 return (rpcerr.re_errno);
1526         }
1527 
1528 #if 0 /* notyet */
1529         /*
1530          * Remember the transfer sizes in case
1531          * nfs_feedback changes them underneath us.
1532          */
1533         my_rsize = mi->mi_curread;
1534         my_wsize = mi->mi_curwrite;
1535 #endif
1536 
1537         /*
1538          * NFS client failover support
1539          *
1540          * If this rnode is not in sync with the current server (VALID_FH),
1541          * we'd like to do a remap to get in sync.  We can be interrupted
1542          * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1543          * use the best info we have to try the RPC.  Part of that is
1544          * unconditionally updating the filehandle copy kept for V3.
1545          *
1546          * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1547          * rw_enter(); we're trying to keep the current server from being
1548          * changed on us until we're done with the remapping and have a
1549          * matching client handle.  We don't want to send a filehandle
1550          * to the wrong host.
1551          */
1552 failoverretry:
1553         if (FAILOVER_MOUNT(mi)) {
1554                 mutex_enter(&mi->mi_lock);
1555                 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1556                         if (failover_wait(mi)) {
1557                                 mutex_exit(&mi->mi_lock);
1558                                 return (EINTR);
1559                         }
1560                 }
1561                 INC_READERS(mi);
1562                 mutex_exit(&mi->mi_lock);
1563                 if (fi) {
1564                         if (!VALID_FH(fi) &&
1565                             !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1566                                 int remaperr;
1567 
1568                                 svp = mi->mi_curr_serv;
1569                                 remaperr = failover_remap(fi);
1570                                 if (remaperr != 0) {
1571 #ifdef DEBUG
1572                                         if (remaperr != EINTR)
1573                                                 nfs_cmn_err(remaperr, CE_WARN,
1574                                             "aclcall couldn't failover: %m");
1575 #endif
1576                                         mutex_enter(&mi->mi_lock);
1577                                         DEC_READERS(mi);
1578                                         mutex_exit(&mi->mi_lock);
1579 
1580                                         /*
1581                                          * If failover_remap returns ETIMEDOUT
1582                                          * and the filesystem is hard mounted
1583                                          * we have to retry the call with a new
1584                                          * server.
1585                                          */
1586                                         if ((mi->mi_flags & MI_HARD) &&
1587                                             IS_RECOVERABLE_ERROR(remaperr)) {
1588                                                 if (svp == mi->mi_curr_serv)
1589                                                         failover_newserver(mi);
1590                                                 rpcerr.re_status = RPC_SUCCESS;
1591                                                 goto failoverretry;
1592                                         }
1593                                         return (remaperr);
1594                                 }
1595                         }
1596                         if (fi->fhp && fi->copyproc)
1597                                 (*fi->copyproc)(fi->fhp, fi->vp);
1598                 }
1599         }
1600 
1601         /* For TSOL, use a new cred which has net_mac_aware flag */
1602         if (!cred_cloned && is_system_labeled()) {
1603                 cred_cloned = TRUE;
1604                 cr = crdup(icr);
1605                 (void) setpflags(NET_MAC_AWARE, 1, cr);
1606         }
1607 
1608         /*
1609          * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1610          * are guaranteed to reprocess the retry as a new request.
1611          */
1612         svp = mi->mi_curr_serv;
1613         rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1614         if (FAILOVER_MOUNT(mi)) {
1615                 mutex_enter(&mi->mi_lock);
1616                 DEC_READERS(mi);
1617                 mutex_exit(&mi->mi_lock);
1618 
1619                 if ((rpcerr.re_errno == ETIMEDOUT ||
1620                     rpcerr.re_errno == ECONNRESET) &&
1621                     failover_safe(fi)) {
1622                         if (svp == mi->mi_curr_serv)
1623                                 failover_newserver(mi);
1624                         goto failoverretry;
1625                 }
1626         }
1627         if (rpcerr.re_errno != 0) {
1628                 if (cred_cloned)
1629                         crfree(cr);
1630                 return (rpcerr.re_errno);
1631         }
1632 
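             /*
              * Pick the initial RPC timeout.  For connection-oriented
              * transports use the fixed mount timeout (mi_timeo, nominally
              * in tenths of a second, converted to ticks); for datagram
              * transports let CLNT_SETTIMERS() derive an adaptive timeout
              * from the per-call-type retransmit timers.
              */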
1633         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1634             svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1635                 timeo = (mi->mi_timeo * hz) / 10;
1636         } else {
1637                 mutex_enter(&mi->mi_lock);
1638                 timeo = CLNT_SETTIMERS(client,
1639                     &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1640                     &(mi->mi_timers[NFS_CALLTYPES]),
1641                     (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1642                     (void (*)()) 0, (caddr_t)mi, 0);
1643                 mutex_exit(&mi->mi_lock);
1644         }
1645 
1646         /*
1647          * If hard mounted fs, retry call forever unless hard error occurs.
1648          */
1649         do {
1650                 tryagain = FALSE;
1651 
1652                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1653                         status = RPC_FAILED;
1654                         rpcerr.re_status = RPC_FAILED;
1655                         rpcerr.re_errno = EIO;
1656                         break;
1657                 }
1658 
1659                 TICK_TO_TIMEVAL(timeo, &wait);
1660 
1661                 /*
1662                  * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1663                  * and SIGTERM. (Preserving the existing masks).
1664                  * Mask out SIGINT if mount option nointr is specified.
1665                  */
1666                 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1667                 if (!(mi->mi_flags & MI_INT))
1668                         client->cl_nosignal = TRUE;
1669 
1670                 /*
1671                  * If there is a current signal, then don't bother
1672                  * even trying to send out the request because we
1673                  * won't be able to block waiting for the response.
1674                  * Simply assume RPC_INTR and get on with it.
1675                  */
1676                 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1677                         status = RPC_INTR;
1678                 else {
1679                         status = CLNT_CALL(client, which, xdrargs, argsp,
1680                             xdrres, resp, wait);
1681                 }
1682 
1683                 if (!(mi->mi_flags & MI_INT))
1684                         client->cl_nosignal = FALSE;
1685                 /*
1686                  * restore original signal mask
1687                  */
1688                 sigunintr(&smask);
1689 
1690                 switch (status) {
1691                 case RPC_SUCCESS:
1692 #if 0 /* notyet */
1693                         if ((mi->mi_flags & MI_DYNAMIC) &&
1694                             mi->mi_timer_type[which] != 0 &&
1695                             (mi->mi_curread != my_rsize ||
1696                             mi->mi_curwrite != my_wsize))
1697                                 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1698 #endif
1699                         break;
1700 
1701                 /*
1702                  * Unfortunately, there are servers in the world which
1703                  * are not coded correctly.  They are not prepared to
1704                  * handle RPC requests to the NFS port which are not
1705                  * NFS requests.  Thus, they may try to process the
1706                  * NFS_ACL request as if it were an NFS request.  This
1707                  * does not work.  Generally, an error will be generated
1708                  * on the client because it will not be able to decode
1709                  * the response from the server.  However, it seems
1710                  * possible that the server may not be able to decode
1711                  * the arguments.  Thus, the criteria for deciding
1712                  * whether the server supports NFS_ACL or not is whether
1713                  * the following RPC errors are returned from CLNT_CALL.
1714                  */
1715                 case RPC_CANTDECODERES:
1716                 case RPC_PROGUNAVAIL:
1717                 case RPC_CANTDECODEARGS:
1718                 case RPC_PROGVERSMISMATCH:
1719                         mutex_enter(&mi->mi_lock);
1720                         mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1721                         mutex_exit(&mi->mi_lock);
1722                         break;
1723 
1724                 /*
1725                  * If the server supports NFS_ACL but not the new ops
1726                  * for extended attributes, make sure we don't retry.
1727                  */
1728                 case RPC_PROCUNAVAIL:
1729                         mutex_enter(&mi->mi_lock);
1730                         mi->mi_flags &= ~MI_EXTATTR;
1731                         mutex_exit(&mi->mi_lock);
1732                         break;
1733 
1734                 case RPC_INTR:
1735                         /*
1736                          * There is no way to recover from this error,
1737                          * even if mount option nointr is specified.
1738                          * SIGKILL, for example, cannot be blocked.
1739                          */
1740                         rpcerr.re_status = RPC_INTR;
1741                         rpcerr.re_errno = EINTR;
1742                         break;
1743 
1744                 case RPC_UDERROR:
1745                         /*
1746                          * If the NFS server is local (vold) and
1747                          * it goes away then we get RPC_UDERROR.
1748                          * This is a retryable error, so we would
1749                          * loop; check whether the specific error
1750                          * was ECONNRESET, indicating that the
1751                          * target did not exist at all.  If so,
1752                          * return with RPC_PROGUNAVAIL and
1753                          * ECONNRESET to indicate why.
1754                          */
1755                         CLNT_GETERR(client, &rpcerr);
1756                         if (rpcerr.re_errno == ECONNRESET) {
1757                                 rpcerr.re_status = RPC_PROGUNAVAIL;
1758                                 rpcerr.re_errno = ECONNRESET;
1759                                 break;
1760                         }
1761                         /*FALLTHROUGH*/
1762 
1763                 default:                /* probably RPC_TIMEDOUT */
1764                         if (IS_UNRECOVERABLE_RPC(status))
1765                                 break;
1766 
1767                         /*
1768                          * increment server not responding count
1769                          */
1770                         mutex_enter(&mi->mi_lock);
1771                         mi->mi_noresponse++;
1772                         mutex_exit(&mi->mi_lock);
1773 #ifdef DEBUG
1774                         nfscl->nfscl_stat.noresponse.value.ui64++;
1775 #endif
1776 
1777                         if (!(mi->mi_flags & MI_HARD)) {
1778                                 if (!(mi->mi_flags & MI_SEMISOFT) ||
1779                                     (mi->mi_acl_ss_call_type[which] == 0))
1780                                         break;
1781                         }
1782 
1783                         /*
1784                          * The call is in progress (over COTS).
1785                          * Try the CLNT_CALL again, but don't
1786                          * print a noisy error message.
1787                          */
1788                         if (status == RPC_INPROGRESS) {
1789                                 tryagain = TRUE;
1790                                 break;
1791                         }
1792 
1793                         if (flags & RFSCALL_SOFT)
1794                                 break;
1795 
1796                         /*
1797                          * On zone shutdown, just move on.
1798                          */
1799                         if (zone_status_get(curproc->p_zone) >=
1800                             ZONE_IS_SHUTTING_DOWN) {
1801                                 rpcerr.re_status = RPC_FAILED;
1802                                 rpcerr.re_errno = EIO;
1803                                 break;
1804                         }
1805 
1806                         /*
1807                          * NFS client failover support
1808                          *
1809                          * If the current server just failed us, we'll
1810                          * start the process of finding a new server.
1811                          * After that, we can just retry.
1812                          */
1813                         if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1814                                 if (svp == mi->mi_curr_serv)
1815                                         failover_newserver(mi);
1816                                 clfree_impl(client, ch, nfscl);
1817                                 goto failoverretry;
1818                         }
1819 
1820                         tryagain = TRUE;
1821                         timeo = backoff(timeo);
1822                         mutex_enter(&mi->mi_lock);
1823                         if (!(mi->mi_flags & MI_PRINTED)) {
1824                                 mi->mi_flags |= MI_PRINTED;
1825                                 mutex_exit(&mi->mi_lock);
1826 #ifdef DEBUG
1827                                 zprintf(zoneid,
1828                         "NFS_ACL%d server %s not responding still trying\n",
1829                                     mi->mi_vers, svp->sv_hostname);
1830 #else
1831                                 zprintf(zoneid,
1832                             "NFS server %s not responding still trying\n",
1833                                     svp->sv_hostname);
1834 #endif
1835                         } else
1836                                 mutex_exit(&mi->mi_lock);
1837                         if (*douprintf && nfs_has_ctty()) {
1838                                 *douprintf = 0;
1839                                 if (!(mi->mi_flags & MI_NOPRINT))
1840 #ifdef DEBUG
1841                                         uprintf(
1842                         "NFS_ACL%d server %s not responding still trying\n",
1843                                             mi->mi_vers, svp->sv_hostname);
1844 #else
1845                                         uprintf(
1846                             "NFS server %s not responding still trying\n",
1847                                             svp->sv_hostname);
1848 #endif
1849                         }
1850 
1851 #if 0 /* notyet */
1852                         /*
1853                          * If doing dynamic adjustment of transfer
1854                          * size and if it's a read or write call
1855                          * and if the transfer size changed while
1856                          * retransmitting or if the feedback routine
1857                          * changed the transfer size,
1858                          * then exit rfscall so that the transfer
1859                          * size can be adjusted at the vnops level.
1860                          */
1861                         if ((mi->mi_flags & MI_DYNAMIC) &&
1862                             mi->mi_acl_timer_type[which] != 0 &&
1863                             (mi->mi_curread != my_rsize ||
1864                             mi->mi_curwrite != my_wsize ||
1865                             nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1866                                 /*
1867                                  * On read or write calls, return
1868                                  * back to the vnode ops level if
1869                                  * the transfer size changed.
1870                                  */
1871                                 clfree_impl(client, ch, nfscl);
1872                                 if (cred_cloned)
1873                                         crfree(cr);
1874                                 return (ENFS_TRYAGAIN);
1875                         }
1876 #endif
1877                 }
1878         } while (tryagain);
1879 
1880         if (status != RPC_SUCCESS) {
1881                 /*
1882                  * Let soft mounts use the timed out message.
1883                  */
1884                 if (status == RPC_INPROGRESS)
1885                         status = RPC_TIMEDOUT;
1886                 nfscl->nfscl_stat.badcalls.value.ui64++;
1887                 if (status == RPC_CANTDECODERES ||
1888                     status == RPC_PROGUNAVAIL ||
1889                     status == RPC_PROCUNAVAIL ||
1890                     status == RPC_CANTDECODEARGS ||
1891                     status == RPC_PROGVERSMISMATCH)
1892                         CLNT_GETERR(client, &rpcerr);
1893                 else if (status != RPC_INTR) {
1894                         mutex_enter(&mi->mi_lock);
1895                         mi->mi_flags |= MI_DOWN;
1896                         mutex_exit(&mi->mi_lock);
1897                         CLNT_GETERR(client, &rpcerr);
1898 #ifdef DEBUG
1899                         bufp = clnt_sperror(client, svp->sv_hostname);
1900                         zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1901                             mi->mi_vers, mi->mi_aclnames[which], bufp);
1902                         if (nfs_has_ctty()) {
1903                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1904                                         uprintf("NFS_ACL%d %s failed for %s\n",
1905                                             mi->mi_vers, mi->mi_aclnames[which],
1906                                             bufp);
1907                                 }
1908                         }
1909                         kmem_free(bufp, MAXPATHLEN);
1910 #else
1911                         zprintf(zoneid,
1912                             "NFS %s failed for server %s: error %d (%s)\n",
1913                             mi->mi_aclnames[which], svp->sv_hostname,
1914                             status, clnt_sperrno(status));
1915                         if (nfs_has_ctty()) {
1916                                 if (!(mi->mi_flags & MI_NOPRINT))
1917                                         uprintf(
1918                                 "NFS %s failed for server %s: error %d (%s)\n",
1919                                             mi->mi_aclnames[which],
1920                                             svp->sv_hostname, status,
1921                                             clnt_sperrno(status));
1922                         }
1923 #endif
1924                         /*
1925                          * when CLNT_CALL() fails with RPC_AUTHERROR,
1926                          * re_errno is set appropriately depending on
1927                          * the authentication error
1928                          */
1929                         if (status == RPC_VERSMISMATCH ||
1930                             status == RPC_PROGVERSMISMATCH)
1931                                 rpcerr.re_errno = EIO;
1932                 }
1933         } else {
1934                 /*
1935                  * Test the value of mi_down and mi_printed without
1936                  * holding the mi_lock mutex.  If they are both zero,
1937                  * then it is okay to skip the down and printed
1938                  * processing.  This saves on a mutex_enter and
1939                  * mutex_exit pair for a normal, successful RPC,
1940                  * which would otherwise be pure overhead.
1941                  */
1942                 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1943                         mutex_enter(&mi->mi_lock);
1944                         mi->mi_flags &= ~MI_DOWN;
1945                         if (mi->mi_flags & MI_PRINTED) {
1946                                 mi->mi_flags &= ~MI_PRINTED;
1947                                 mutex_exit(&mi->mi_lock);
1948 #ifdef DEBUG
1949                                 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1950                                     mi->mi_vers, svp->sv_hostname);
1951 #else
1952                                 zprintf(zoneid, "NFS server %s ok\n",
1953                                     svp->sv_hostname);
1954 #endif
1955                         } else
1956                                 mutex_exit(&mi->mi_lock);
1957                 }
1958 
1959                 if (*douprintf == 0) {
1960                         if (!(mi->mi_flags & MI_NOPRINT))
1961 #ifdef DEBUG
1962                                 uprintf("NFS_ACL%d server %s ok\n",
1963                                     mi->mi_vers, svp->sv_hostname);
1964 #else
1965                                 uprintf("NFS server %s ok\n", svp->sv_hostname);
1966 #endif
1967                         *douprintf = 1;
1968                 }
1969         }
1970 
1971         clfree_impl(client, ch, nfscl);
1972         if (cred_cloned)
1973                 crfree(cr);
1974 
1975         ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1976 
1977 #if 0 /* notyet */
1978         TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1979             rpcerr.re_errno);
1980 #endif
1981 
1982         return (rpcerr.re_errno);
1983 }
1984 
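     /*
      * Convert a vattr into an NFS version 2 sattr.  Attributes that are not
      * set in va_mask are encoded as (uint32_t)-1 (or -1 in both time
      * fields), which the server interprets as "do not change"; times whose
      * seconds value does not fit in 32 bits are rejected with EOVERFLOW,
      * and nanoseconds are converted to microseconds.
      */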
1985 int
1986 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1987 {
1988         uint_t mask = vap->va_mask;
1989 
1990         if (!(mask & AT_MODE))
1991                 sa->sa_mode = (uint32_t)-1;
1992         else
1993                 sa->sa_mode = vap->va_mode;
1994         if (!(mask & AT_UID))
1995                 sa->sa_uid = (uint32_t)-1;
1996         else
1997                 sa->sa_uid = (uint32_t)vap->va_uid;
1998         if (!(mask & AT_GID))
1999                 sa->sa_gid = (uint32_t)-1;
2000         else
2001                 sa->sa_gid = (uint32_t)vap->va_gid;
2002         if (!(mask & AT_SIZE))
2003                 sa->sa_size = (uint32_t)-1;
2004         else
2005                 sa->sa_size = (uint32_t)vap->va_size;
2006         if (!(mask & AT_ATIME))
2007                 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2008         else {
2009                 /* check time validity */
2010                 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2011                         return (EOVERFLOW);
2012                 }
2013                 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2014                 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2015         }
2016         if (!(mask & AT_MTIME))
2017                 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2018         else {
2019                 /* check time validity */
2020                 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2021                         return (EOVERFLOW);
2022                 }
2023                 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2024                 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2025         }
2026         return (0);
2027 }
2028 
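     /*
      * Convert a vattr into an NFS version 3 sattr3.  Each attribute carries
      * an explicit set_it flag; times requested by the caller are sent as
      * SET_TO_CLIENT_TIME, and a seconds value that does not fit in 32 bits
      * is rejected with EOVERFLOW.
      */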
2029 int
2030 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2031 {
2032         uint_t mask = vap->va_mask;
2033 
2034         if (!(mask & AT_MODE))
2035                 sa->mode.set_it = FALSE;
2036         else {
2037                 sa->mode.set_it = TRUE;
2038                 sa->mode.mode = (mode3)vap->va_mode;
2039         }
2040         if (!(mask & AT_UID))
2041                 sa->uid.set_it = FALSE;
2042         else {
2043                 sa->uid.set_it = TRUE;
2044                 sa->uid.uid = (uid3)vap->va_uid;
2045         }
2046         if (!(mask & AT_GID))
2047                 sa->gid.set_it = FALSE;
2048         else {
2049                 sa->gid.set_it = TRUE;
2050                 sa->gid.gid = (gid3)vap->va_gid;
2051         }
2052         if (!(mask & AT_SIZE))
2053                 sa->size.set_it = FALSE;
2054         else {
2055                 sa->size.set_it = TRUE;
2056                 sa->size.size = (size3)vap->va_size;
2057         }
2058         if (!(mask & AT_ATIME))
2059                 sa->atime.set_it = DONT_CHANGE;
2060         else {
2061                 /* check time validity */
2062                 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2063                         return (EOVERFLOW);
2064                 }
2065                 sa->atime.set_it = SET_TO_CLIENT_TIME;
2066                 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2067                 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2068         }
2069         if (!(mask & AT_MTIME))
2070                 sa->mtime.set_it = DONT_CHANGE;
2071         else {
2072                 /* check time validity */
2073                 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2074                         return (EOVERFLOW);
2075                 }
2076                 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2077                 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2078                 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2079         }
2080         return (0);
2081 }
2082 
2083 void
2084 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2085 {
2086 
2087         da->da_fhandle = VTOFH(dvp);
2088         da->da_name = nm;
2089         da->da_flags = 0;
2090 }
2091 
2092 void
2093 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2094 {
2095 
2096         da->dirp = VTOFH3(dvp);
2097         da->name = nm;
2098 }
2099 
2100 int
2101 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2102 {
2103         int error;
2104         rnode_t *rp;
2105         struct vattr va;
2106 
2107         va.va_mask = AT_MODE | AT_GID;
2108         error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2109         if (error)
2110                 return (error);
2111 
2112         /*
2113          * To determine the expected group-id of the created file:
2114          *  1)  If the filesystem was not mounted with the Old-BSD-compatible
2115          *      GRPID option, and the directory's set-gid bit is clear,
2116          *      then use the process's gid.
2117          *  2)  Otherwise, set the group-id to the gid of the parent directory.
2118          */
2119         rp = VTOR(dvp);
2120         mutex_enter(&rp->r_statelock);
2121         if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2122                 *gidp = crgetgid(cr);
2123         else
2124                 *gidp = va.va_gid;
2125         mutex_exit(&rp->r_statelock);
2126         return (0);
2127 }
2128 
2129 int
2130 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2131 {
2132         int error;
2133         struct vattr va;
2134 
2135         va.va_mask = AT_MODE;
2136         error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2137         if (error)
2138                 return (error);
2139 
2140         /*
2141          * Modify the expected mode (*omp) so that the set-gid bit matches
2142          * that of the parent directory (dvp).
2143          */
2144         if (va.va_mode & VSGID)
2145                 *omp |= VSGID;
2146         else
2147                 *omp &= ~VSGID;
2148         return (0);
2149 }
2150 
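     /*
      * Update the VSWAPLIKE flag on a vnode: a regular file whose mode has
      * the sticky bit set and the owner-execute bit clear is flagged as
      * swap-like; for any other type or mode combination the flag is
      * cleared.
      */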
2151 void
2152 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2153 {
2154 
2155         if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2156                 if (!(vp->v_flag & VSWAPLIKE)) {
2157                         mutex_enter(&vp->v_lock);
2158                         vp->v_flag |= VSWAPLIKE;
2159                         mutex_exit(&vp->v_lock);
2160                 }
2161         } else {
2162                 if (vp->v_flag & VSWAPLIKE) {
2163                         mutex_enter(&vp->v_lock);
2164                         vp->v_flag &= ~VSWAPLIKE;
2165                         mutex_exit(&vp->v_lock);
2166                 }
2167         }
2168 }
2169 
2170 /*
2171  * Free the resources associated with an rnode.
2172  */
2173 static void
2174 rinactive(rnode_t *rp, cred_t *cr)
2175 {
2176         vnode_t *vp;
2177         cred_t *cred;
2178         char *contents;
2179         int size;
2180         vsecattr_t *vsp;
2181         int error;
2182         nfs3_pathconf_info *info;
2183 
2184         /*
2185          * Before freeing anything, wait until all asynchronous
2186          * activity is done on this rnode.  This will allow all
2187          * asynchronous read ahead and write behind i/o's to
2188          * finish.
2189          */
2190         mutex_enter(&rp->r_statelock);
2191         while (rp->r_count > 0)
2192                 cv_wait(&rp->r_cv, &rp->r_statelock);
2193         mutex_exit(&rp->r_statelock);
2194 
2195         /*
2196          * Flush and invalidate all pages associated with the vnode.
2197          */
2198         vp = RTOV(rp);
2199         if (vn_has_cached_data(vp)) {
2200                 ASSERT(vp->v_type != VCHR);
2201                 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2202                         error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2203                         if (error && (error == ENOSPC || error == EDQUOT)) {
2204                                 mutex_enter(&rp->r_statelock);
2205                                 if (!rp->r_error)
2206                                         rp->r_error = error;
2207                                 mutex_exit(&rp->r_statelock);
2208                         }
2209                 }
2210                 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2211         }
2212 
2213         /*
2214          * Free any held credentials and caches which may be associated
2215          * with this rnode.
2216          */
2217         mutex_enter(&rp->r_statelock);
2218         cred = rp->r_cred;
2219         rp->r_cred = NULL;
2220         contents = rp->r_symlink.contents;
2221         size = rp->r_symlink.size;
2222         rp->r_symlink.contents = NULL;
2223         vsp = rp->r_secattr;
2224         rp->r_secattr = NULL;
2225         info = rp->r_pathconf;
2226         rp->r_pathconf = NULL;
2227         mutex_exit(&rp->r_statelock);
2228 
2229         /*
2230          * Free the held credential.
2231          */
2232         if (cred != NULL)
2233                 crfree(cred);
2234 
2235         /*
2236          * Free the access cache entries.
2237          */
2238         (void) nfs_access_purge_rp(rp);
2239 
2240         /*
2241          * Free the readdir cache entries.
2242          */
2243         if (HAVE_RDDIR_CACHE(rp))
2244                 nfs_purge_rddir_cache(vp);
2245 
2246         /*
2247          * Free the symbolic link cache.
2248          */
2249         if (contents != NULL) {
2251                 kmem_free((void *)contents, size);
2252         }
2253 
2254         /*
2255          * Free any cached ACL.
2256          */
2257         if (vsp != NULL)
2258                 nfs_acl_free(vsp);
2259 
2260         /*
2261          * Free any cached pathconf information.
2262          */
2263         if (info != NULL)
2264                 kmem_free(info, sizeof (*info));
2265 }
2266 
2267 /*
2268  * Return a vnode for the given NFS Version 2 file handle.
2269  * If no rnode exists for this fhandle, create one and put it
2270  * into the hash queues.  If the rnode for this fhandle
2271  * already exists, return it.
2272  *
2273  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2274  */
2275 vnode_t *
2276 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2277     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2278 {
2279         int newnode;
2280         int index;
2281         vnode_t *vp;
2282         nfs_fhandle nfh;
2283         vattr_t va;
2284 
2285         nfh.fh_len = NFS_FHSIZE;
2286         bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2287 
2288         index = rtablehash(&nfh);
2289         rw_enter(&rtable[index].r_lock, RW_READER);
2290 
2291         vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2292             nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2293 
2294         if (attr != NULL) {
2295                 if (!newnode) {
2296                         rw_exit(&rtable[index].r_lock);
2297                         (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2298                 } else {
2299                         if (attr->na_type < NFNON || attr->na_type > NFSOC)
2300                                 vp->v_type = VBAD;
2301                         else
2302                                 vp->v_type = n2v_type(attr);
2303                         /*
2304                          * A translation here seems to be necessary
2305                          * because this function can be called
2306                          * with `attr' that has come from the wire,
2307                          * and been operated on by vattr_to_nattr().
2308                          * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2309                          * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2310                          * ->makenfsnode().
2311                          */
2312                         if ((attr->na_rdev & 0xffff0000) == 0)
2313                                 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2314                         else
2315                                 vp->v_rdev = expldev(n2v_rdev(attr));
2316                         nfs_attrcache(vp, attr, t);
2317                         rw_exit(&rtable[index].r_lock);
2318                 }
2319         } else {
2320                 if (newnode) {
2321                         PURGE_ATTRCACHE(vp);
2322                 }
2323                 rw_exit(&rtable[index].r_lock);
2324         }
2325 
2326         return (vp);
2327 }
2328 
2329 /*
2330  * Return a vnode for the given NFS Version 3 file handle.
2331  * If no rnode exists for this fhandle, create one and put it
2332  * into the hash queues.  If the rnode for this fhandle
2333  * already exists, return it.
2334  *
2335  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2336  */
2337 vnode_t *
2338 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2339     cred_t *cr, char *dnm, char *nm)
2340 {
2341         int newnode;
2342         int index;
2343         vnode_t *vp;
2344 
2345         index = rtablehash((nfs_fhandle *)fh);
2346         rw_enter(&rtable[index].r_lock, RW_READER);
2347 
2348         vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2349             nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2350             dnm, nm);
2351 
2352         if (vap == NULL) {
2353                 if (newnode) {
2354                         PURGE_ATTRCACHE(vp);
2355                 }
2356                 rw_exit(&rtable[index].r_lock);
2357                 return (vp);
2358         }
2359 
2360         if (!newnode) {
2361                 rw_exit(&rtable[index].r_lock);
2362                 nfs_attr_cache(vp, vap, t, cr);
2363         } else {
2364                 rnode_t *rp = VTOR(vp);
2365 
2366                 vp->v_type = vap->va_type;
2367                 vp->v_rdev = vap->va_rdev;
2368 
2369                 mutex_enter(&rp->r_statelock);
2370                 if (rp->r_mtime <= t)
2371                         nfs_attrcache_va(vp, vap);
2372                 mutex_exit(&rp->r_statelock);
2373                 rw_exit(&rtable[index].r_lock);
2374         }
2375 
2376         return (vp);
2377 }
2378 
2379 vnode_t *
2380 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2381     cred_t *cr, char *dnm, char *nm)
2382 {
2383         int newnode;
2384         int index;
2385         vnode_t *vp;
2386         vattr_t va;
2387 
2388         index = rtablehash((nfs_fhandle *)fh);
2389         rw_enter(&rtable[index].r_lock, RW_READER);
2390 
2391         vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2392             nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2393             dnm, nm);
2394 
2395         if (attr == NULL) {
2396                 if (newnode) {
2397                         PURGE_ATTRCACHE(vp);
2398                 }
2399                 rw_exit(&rtable[index].r_lock);
2400                 return (vp);
2401         }
2402 
2403         if (!newnode) {
2404                 rw_exit(&rtable[index].r_lock);
2405                 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2406         } else {
2407                 if (attr->type < NF3REG || attr->type > NF3FIFO)
2408                         vp->v_type = VBAD;
2409                 else
2410                         vp->v_type = nf3_to_vt[attr->type];
2411                 vp->v_rdev = makedevice(attr->rdev.specdata1,
2412                     attr->rdev.specdata2);
2413                 nfs3_attrcache(vp, attr, t);
2414                 rw_exit(&rtable[index].r_lock);
2415         }
2416 
2417         return (vp);
2418 }
2419 
2420 /*
2421  * Read this comment before making changes to rtablehash()!
2422  * This is a hash function in which seemingly obvious and harmless
2423  * changes can cause escalations costing millions of dollars!
2424  * Know what you are doing.
2425  *
2426  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2427  * algorithm is currently detailed here:
2428  *
2429  *   http://burtleburtle.net/bob/hash/doobs.html
2430  *
2431  * Of course, the above link may not be valid by the time you are reading
2432  * this, but suffice it to say that the one-at-a-time algorithm works well in
2433  * almost all cases.  If you are changing the algorithm be sure to verify that
2434  * the hash algorithm still provides even distribution in all cases and with
2435  * any server returning filehandles in whatever order (sequential or random).
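      *
      * As a rough sketch of the mixing done below (see the link above for
      * the authoritative description): each filehandle byte b is folded in
      * with hash += b, hash += hash << 10, hash ^= hash >> 6; three final
      * steps avalanche the remaining bits, and the result is masked with
      * rtablemask to select a hash bucket.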
2436  */
2437 static int
2438 rtablehash(nfs_fhandle *fh)
2439 {
2440         ulong_t hash, len, i;
2441         char *key;
2442 
2443         key = fh->fh_buf;
2444         len = (ulong_t)fh->fh_len;
2445         for (hash = 0, i = 0; i < len; i++) {
2446                 hash += key[i];
2447                 hash += (hash << 10);
2448                 hash ^= (hash >> 6);
2449         }
2450         hash += (hash << 3);
2451         hash ^= (hash >> 11);
2452         hash += (hash << 15);
2453         return (hash & rtablemask);
2454 }
2455 
2456 static vnode_t *
2457 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2458     struct vnodeops *vops,
2459     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2460     int (*compar)(const void *, const void *),
2461     int *newnode, cred_t *cr, char *dnm, char *nm)
2462 {
2463         rnode_t *rp;
2464         rnode_t *trp;
2465         vnode_t *vp;
2466         mntinfo_t *mi;
2467 
2468         ASSERT(RW_READ_HELD(&rhtp->r_lock));
2469 
2470         mi = VFTOMI(vfsp);
2471 start:
2472         if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2473                 vp = RTOV(rp);
2474                 nfs_set_vroot(vp);
2475                 *newnode = 0;
2476                 return (vp);
2477         }
2478         rw_exit(&rhtp->r_lock);
2479 
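             /*
              * If the freelist is non-empty and we have already allocated at
              * least nrnode rnodes, recycle the rnode at the front of the
              * freelist instead of allocating a fresh one below.
              */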
2480         mutex_enter(&rpfreelist_lock);
2481         if (rpfreelist != NULL && rnew >= nrnode) {
2482                 rp = rpfreelist;
2483                 rp_rmfree(rp);
2484                 mutex_exit(&rpfreelist_lock);
2485 
2486                 vp = RTOV(rp);
2487 
2488                 if (rp->r_flags & RHASHED) {
2489                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2490                         mutex_enter(&vp->v_lock);
2491                         if (vp->v_count > 1) {
2492                                 vp->v_count--;
2493                                 mutex_exit(&vp->v_lock);
2494                                 rw_exit(&rp->r_hashq->r_lock);
2495                                 rw_enter(&rhtp->r_lock, RW_READER);
2496                                 goto start;
2497                         }
2498                         mutex_exit(&vp->v_lock);
2499                         rp_rmhash_locked(rp);
2500                         rw_exit(&rp->r_hashq->r_lock);
2501                 }
2502 
2503                 rinactive(rp, cr);
2504 
2505                 mutex_enter(&vp->v_lock);
2506                 if (vp->v_count > 1) {
2507                         vp->v_count--;
2508                         mutex_exit(&vp->v_lock);
2509                         rw_enter(&rhtp->r_lock, RW_READER);
2510                         goto start;
2511                 }
2512                 mutex_exit(&vp->v_lock);
2513                 vn_invalid(vp);
2514                 /*
2515                  * destroy old locks before bzero'ing and
2516                  * recreating the locks below.
2517                  */
2518                 nfs_rw_destroy(&rp->r_rwlock);
2519                 nfs_rw_destroy(&rp->r_lkserlock);
2520                 mutex_destroy(&rp->r_statelock);
2521                 cv_destroy(&rp->r_cv);
2522                 cv_destroy(&rp->r_commit.c_cv);
2523                 nfs_free_r_path(rp);
2524                 avl_destroy(&rp->r_dir);
2525                 /*
2526                  * Make sure that if rnode is recycled then
2527                  * VFS count is decremented properly before
2528                  * reuse.
2529                  */
2530                 VFS_RELE(vp->v_vfsp);
2531                 vn_reinit(vp);
2532         } else {
2533                 vnode_t *new_vp;
2534 
2535                 mutex_exit(&rpfreelist_lock);
2536 
2537                 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2538                 new_vp = vn_alloc(KM_SLEEP);
2539 
2540                 atomic_inc_ulong((ulong_t *)&rnew);
2541 #ifdef DEBUG
2542                 clstat_debug.nrnode.value.ui64++;
2543 #endif
2544                 vp = new_vp;
2545         }
2546 
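             /*
              * At this point rp is either freshly allocated or a fully torn
              * down recycled rnode; clear it, rebuild its locks, condition
              * variables and readdir cache, and record the filehandle and
              * current server.
              */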
2547         bzero(rp, sizeof (*rp));
2548         rp->r_vnode = vp;
2549         nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2550         nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2551         mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2552         cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2553         cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2554         rp->r_fh.fh_len = fh->fh_len;
2555         bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2556         rp->r_server = mi->mi_curr_serv;
2557         if (FAILOVER_MOUNT(mi)) {
2558                 /*
2559                  * If replicated servers, stash pathnames
2560                  */
2561                 if (dnm != NULL && nm != NULL) {
2562                         char *s, *p;
2563                         uint_t len;
2564 
2565                         len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2566                         rp->r_path = kmem_alloc(len, KM_SLEEP);
2567 #ifdef DEBUG
2568                         clstat_debug.rpath.value.ui64 += len;
2569 #endif
2570                         s = rp->r_path;
2571                         for (p = dnm; *p; p++)
2572                                 *s++ = *p;
2573                         *s++ = '/';
2574                         for (p = nm; *p; p++)
2575                                 *s++ = *p;
2576                         *s = '\0';
2577                 } else {
2578                         /* special case for root */
2579                         rp->r_path = kmem_alloc(2, KM_SLEEP);
2580 #ifdef DEBUG
2581                         clstat_debug.rpath.value.ui64 += 2;
2582 #endif
2583                         *rp->r_path = '.';
2584                         *(rp->r_path + 1) = '\0';
2585                 }
2586         }
2587         VFS_HOLD(vfsp);
2588         rp->r_putapage = putapage;
2589         rp->r_hashq = rhtp;
2590         rp->r_flags = RREADDIRPLUS;
2591         avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2592             offsetof(rddir_cache, tree));
2593         vn_setops(vp, vops);
2594         vp->v_data = (caddr_t)rp;
2595         vp->v_vfsp = vfsp;
2596         vp->v_type = VNON;
2597         vp->v_flag |= VMODSORT;
2598         nfs_set_vroot(vp);
2599 
2600         /*
2601          * There is a race condition if someone else
2602          * allocates the rnode while no locks are held, so we
2603          * check again and recover if found.
2604          */
2605         rw_enter(&rhtp->r_lock, RW_WRITER);
2606         if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2607                 vp = RTOV(trp);
2608                 nfs_set_vroot(vp);
2609                 *newnode = 0;
2610                 rw_exit(&rhtp->r_lock);
2611                 rp_addfree(rp, cr);
2612                 rw_enter(&rhtp->r_lock, RW_READER);
2613                 return (vp);
2614         }
2615         rp_addhash(rp);
2616         *newnode = 1;
2617         return (vp);
2618 }
2619 
2620 /*
2621  * Callback function to check if the page should be marked as
2622  * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2623  */
2624 int
2625 nfs_setmod_check(page_t *pp)
2626 {
2627         if (pp->p_fsdata != C_NOCOMMIT) {
2628                 pp->p_fsdata = C_NOCOMMIT;
2629                 return (1);
2630         }
2631         return (0);
2632 }
2633 
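     /*
      * Mark the vnode VROOT if its filehandle matches the filehandle of the
      * server's root recorded in the servinfo structure.
      */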
2634 static void
2635 nfs_set_vroot(vnode_t *vp)
2636 {
2637         rnode_t *rp;
2638         nfs_fhandle *rootfh;
2639 
2640         rp = VTOR(vp);
2641         rootfh = &rp->r_server->sv_fhandle;
2642         if (rootfh->fh_len == rp->r_fh.fh_len &&
2643             bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2644                 if (!(vp->v_flag & VROOT)) {
2645                         mutex_enter(&vp->v_lock);
2646                         vp->v_flag |= VROOT;
2647                         mutex_exit(&vp->v_lock);
2648                 }
2649         }
2650 }
2651 
2652 static void
2653 nfs_free_r_path(rnode_t *rp)
2654 {
2655         char *path;
2656         size_t len;
2657 
2658         path = rp->r_path;
2659         if (path) {
2660                 rp->r_path = NULL;
2661                 len = strlen(path) + 1;
2662                 kmem_free(path, len);
2663 #ifdef DEBUG
2664                 clstat_debug.rpath.value.ui64 -= len;
2665 #endif
2666         }
2667 }
2668 
2669 /*
2670  * Put an rnode on the free list.
2671  *
2672  * Rnodes which were allocated above and beyond the normal limit
2673  * are immediately freed.
2674  */
2675 void
2676 rp_addfree(rnode_t *rp, cred_t *cr)
2677 {
2678         vnode_t *vp;
2679         struct vfs *vfsp;
2680 
2681         vp = RTOV(rp);
2682         ASSERT(vp->v_count >= 1);
2683         ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2684 
2685         /*
2686          * If we have too many rnodes allocated and there are no
2687          * references to this rnode, or if the rnode is no longer
2688          * accessible because it does not reside in the hash queues,
2689          * or if an i/o error occurred while writing to the file,
2690          * then just free it instead of putting it on the rnode
2691          * freelist.
2692          */
2693         vfsp = vp->v_vfsp;
2694         if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2695             (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2696                 if (rp->r_flags & RHASHED) {
2697                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2698                         mutex_enter(&vp->v_lock);
2699                         if (vp->v_count > 1) {
2700                                 vp->v_count--;
2701                                 mutex_exit(&vp->v_lock);
2702                                 rw_exit(&rp->r_hashq->r_lock);
2703                                 return;
2704                         }
2705                         mutex_exit(&vp->v_lock);
2706                         rp_rmhash_locked(rp);
2707                         rw_exit(&rp->r_hashq->r_lock);
2708                 }
2709 
2710                 rinactive(rp, cr);
2711 
2712                 /*
2713                  * Recheck the vnode reference count.  We need to
2714                  * make sure that another reference has not been
2715                  * acquired while we were not holding v_lock.  The
2716                  * rnode is not in the rnode hash queues, so the
2717                  * only way for a reference to have been acquired
2718                  * is for a VOP_PUTPAGE because the rnode was marked
2719                  * with RDIRTY or for a modified page.  This
2720                  * reference may have been acquired before our call
2721                  * to rinactive.  The i/o may have been completed,
2722                  * thus allowing rinactive to complete, but the
2723                  * reference to the vnode may not have been released
2724                  * yet.  In any case, the rnode can not be destroyed
2725                  * until the other references to this vnode have been
2726                  * released.  The other references will take care of
2727                  * either destroying the rnode or placing it on the
2728                  * rnode freelist.  If there are no other references,
2729                  * then the rnode may be safely destroyed.
2730                  */
2731                 mutex_enter(&vp->v_lock);
2732                 if (vp->v_count > 1) {
2733                         vp->v_count--;
2734                         mutex_exit(&vp->v_lock);
2735                         return;
2736                 }
2737                 mutex_exit(&vp->v_lock);
2738 
2739                 destroy_rnode(rp);
2740                 return;
2741         }
2742 
2743         /*
2744          * Lock the hash queue and then recheck the reference count
2745          * to ensure that no other thread has acquired a reference
2746          * which would indicate that the rnode should not be placed
2747          * on the freelist.  If another reference has been acquired,
2748          * then just release this one and let the other thread
2749          * complete the processing of adding this rnode to the freelist.
2750          */
2751         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2752 
2753         mutex_enter(&vp->v_lock);
2754         if (vp->v_count > 1) {
2755                 vp->v_count--;
2756                 mutex_exit(&vp->v_lock);
2757                 rw_exit(&rp->r_hashq->r_lock);
2758                 return;
2759         }
2760         mutex_exit(&vp->v_lock);
2761 
2762         /*
2763          * If there is no cached data or metadata for this file, then
2764          * put the rnode on the front of the freelist so that it will
2765          * be reused before other rnodes which may have cached data or
2766          * metadata associated with them.
2767          */
2768         mutex_enter(&rpfreelist_lock);
2769         if (rpfreelist == NULL) {
2770                 rp->r_freef = rp;
2771                 rp->r_freeb = rp;
2772                 rpfreelist = rp;
2773         } else {
2774                 rp->r_freef = rpfreelist;
2775                 rp->r_freeb = rpfreelist->r_freeb;
2776                 rpfreelist->r_freeb->r_freef = rp;
2777                 rpfreelist->r_freeb = rp;
2778                 if (!vn_has_cached_data(vp) &&
2779                     !HAVE_RDDIR_CACHE(rp) &&
2780                     rp->r_symlink.contents == NULL &&
2781                     rp->r_secattr == NULL &&
2782                     rp->r_pathconf == NULL)
2783                         rpfreelist = rp;
2784         }
2785         mutex_exit(&rpfreelist_lock);
2786 
2787         rw_exit(&rp->r_hashq->r_lock);
2788 }
2789 
2790 /*
2791  * Remove an rnode from the free list.
2792  *
2793  * The caller must be holding rpfreelist_lock and the rnode
2794  * must be on the freelist.
2795  */
2796 static void
2797 rp_rmfree(rnode_t *rp)
2798 {
2799 
2800         ASSERT(MUTEX_HELD(&rpfreelist_lock));
2801         ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2802 
2803         if (rp == rpfreelist) {
2804                 rpfreelist = rp->r_freef;
2805                 if (rp == rpfreelist)
2806                         rpfreelist = NULL;
2807         }
2808 
2809         rp->r_freeb->r_freef = rp->r_freef;
2810         rp->r_freef->r_freeb = rp->r_freeb;
2811 
2812         rp->r_freef = rp->r_freeb = NULL;
2813 }
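
/*
 * Illustrative sketch (not part of this file): rp_addfree() and rp_rmfree()
 * above maintain a circular, doubly linked freelist with a single head
 * pointer.  The minimal user-space version below shows the same insert-at-
 * front, insert-at-back and unlink steps; the node_t type and function
 * names are hypothetical.
 */
#if 0	/* user-space sketch only; never compiled here */
#include <stdio.h>

typedef struct node {
	struct node *freef;		/* forward link */
	struct node *freeb;		/* back link */
	int id;
} node_t;

static node_t *freelist;	/* head of the circular list, NULL when empty */

/* Insert np just before the head; if at_front, make it the new head. */
static void
addfree(node_t *np, int at_front)
{
	if (freelist == NULL) {
		np->freef = np->freeb = np;
		freelist = np;
	} else {
		np->freef = freelist;
		np->freeb = freelist->freeb;
		freelist->freeb->freef = np;
		freelist->freeb = np;
		if (at_front)
			freelist = np;
	}
}

/* Unlink np; advance (or clear) the head if np was the head. */
static void
rmfree(node_t *np)
{
	if (np == freelist) {
		freelist = np->freef;
		if (np == freelist)
			freelist = NULL;
	}
	np->freeb->freef = np->freef;
	np->freef->freeb = np->freeb;
	np->freef = np->freeb = NULL;
}

int
main(void)
{
	node_t a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	node_t *np;

	addfree(&a, 1);			/* list: a */
	addfree(&b, 0);			/* list: a b (b goes to the back) */
	addfree(&c, 1);			/* list: c a b (c becomes the head) */
	rmfree(&a);			/* list: c b */
	for (np = freelist; ; np = np->freef) {
		printf("%d\n", np->id);	/* prints 3, then 2 */
		if (np->freef == freelist)
			break;
	}
	return (0);
}
#endif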
2814 
2815 /*
2816  * Put an rnode in the hash table.
2817  *
2818  * The caller must be holding the exclusive hash queue lock.
2819  */
2820 static void
2821 rp_addhash(rnode_t *rp)
2822 {
2823 
2824         ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2825         ASSERT(!(rp->r_flags & RHASHED));
2826 
2827         rp->r_hashf = rp->r_hashq->r_hashf;
2828         rp->r_hashq->r_hashf = rp;
2829         rp->r_hashb = (rnode_t *)rp->r_hashq;
2830         rp->r_hashf->r_hashb = rp;
2831 
2832         mutex_enter(&rp->r_statelock);
2833         rp->r_flags |= RHASHED;
2834         mutex_exit(&rp->r_statelock);
2835 }
2836 
2837 /*
2838  * Remove an rnode from the hash table.
2839  *
2840  * The caller must be holding the hash queue lock.
2841  */
2842 static void
2843 rp_rmhash_locked(rnode_t *rp)
2844 {
2845 
2846         ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2847         ASSERT(rp->r_flags & RHASHED);
2848 
2849         rp->r_hashb->r_hashf = rp->r_hashf;
2850         rp->r_hashf->r_hashb = rp->r_hashb;
2851 
2852         mutex_enter(&rp->r_statelock);
2853         rp->r_flags &= ~RHASHED;
2854         mutex_exit(&rp->r_statelock);
2855 }
2856 
2857 /*
2858  * Remove an rnode from the hash table.
2859  *
2860  * The caller must not be holding the hash queue lock.
2861  */
2862 void
2863 rp_rmhash(rnode_t *rp)
2864 {
2865 
2866         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2867         rp_rmhash_locked(rp);
2868         rw_exit(&rp->r_hashq->r_lock);
2869 }
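
/*
 * Illustrative sketch (not part of this file): in rp_addhash() and
 * rp_rmhash_locked() above the bucket header is cast to rnode_t * and acts
 * as a sentinel, which is why insertion and removal never special-case an
 * empty chain or the ends of the chain.  The trick requires the bucket to
 * begin with the same two link fields as the node.  A stand-alone rendering
 * with hypothetical types:
 */
#if 0	/* user-space sketch only; never compiled here */
#include <stdio.h>

typedef struct hnode {
	struct hnode *hashf;		/* forward link; must come first */
	struct hnode *hashb;		/* back link; must come second */
	int id;
} hnode_t;

typedef struct hbucket {
	hnode_t *hashf;			/* same layout as the hnode links, */
	hnode_t *hashb;			/* so the bucket can be a sentinel */
} hbucket_t;

static void
bucket_init(hbucket_t *hq)
{
	hq->hashf = (hnode_t *)hq;	/* an empty chain points at itself */
	hq->hashb = (hnode_t *)hq;
}

static void
addhash(hbucket_t *hq, hnode_t *np)
{
	np->hashf = hq->hashf;		/* always insert at the head */
	hq->hashf = np;
	np->hashb = (hnode_t *)hq;
	np->hashf->hashb = np;
}

static void
rmhash(hnode_t *np)
{
	np->hashb->hashf = np->hashf;	/* no end-of-chain special cases */
	np->hashf->hashb = np->hashb;
}

int
main(void)
{
	hbucket_t hq;
	hnode_t a = { .id = 1 }, b = { .id = 2 };
	hnode_t *np;

	bucket_init(&hq);
	addhash(&hq, &a);
	addhash(&hq, &b);
	rmhash(&a);
	for (np = hq.hashf; np != (hnode_t *)&hq; np = np->hashf)
		printf("%d\n", np->id);	/* prints 2 */
	return (0);
}
#endif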
2870 
2871 /*
2872  * Look up an rnode by fhandle.
2873  *
2874  * The caller must be holding the hash queue lock, either shared or exclusive.
2875  */
2876 static rnode_t *
2877 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2878 {
2879         rnode_t *rp;
2880         vnode_t *vp;
2881 
2882         ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2883 
2884         for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2885                 vp = RTOV(rp);
2886                 if (vp->v_vfsp == vfsp &&
2887                     rp->r_fh.fh_len == fh->fh_len &&
2888                     bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2889                         /*
2890                          * remove rnode from free list, if necessary.
2891                          */
2892                         if (rp->r_freef != NULL) {
2893                                 mutex_enter(&rpfreelist_lock);
2894                                 /*
2895                                  * If the rnode is on the freelist,
2896                                  * then remove it and use that reference
2897                                  * as the new reference.  Otherwise,
2898                                  * need to increment the reference count.
2899                                  */
2900                                 if (rp->r_freef != NULL) {
2901                                         rp_rmfree(rp);
2902                                         mutex_exit(&rpfreelist_lock);
2903                                 } else {
2904                                         mutex_exit(&rpfreelist_lock);
2905                                         VN_HOLD(vp);
2906                                 }
2907                         } else
2908                                 VN_HOLD(vp);
2909                         return (rp);
2910                 }
2911         }
2912         return (NULL);
2913 }
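
/*
 * Illustrative sketch (not part of this file): rfind() above peeks at
 * r_freef without holding rpfreelist_lock and only takes the lock when the
 * hint says the rnode may be on the freelist; it then rechecks under the
 * lock before deciding whether to reuse the freelist's reference or take a
 * new hold.  A condensed user-space form of that check/lock/recheck
 * pattern, with hypothetical names:
 */
#if 0	/* user-space sketch only; never compiled here */
#include <pthread.h>
#include <stdio.h>

typedef struct obj {
	struct obj *freef;	/* non-NULL while the object is on a freelist */
	int holds;		/* reference count (protection not shown) */
} obj_t;

static pthread_mutex_t freelist_lock = PTHREAD_MUTEX_INITIALIZER;

static void
remove_from_freelist(obj_t *op)
{
	op->freef = NULL;	/* stand-in for the real unlink */
}

/*
 * Reuse the freelist's reference if the object is still on the freelist,
 * otherwise take a fresh hold.  The unlocked peek is only an optimization;
 * the real decision is made again under freelist_lock.
 */
static void
acquire(obj_t *op)
{
	if (op->freef != NULL) {		/* unlocked hint */
		pthread_mutex_lock(&freelist_lock);
		if (op->freef != NULL) {	/* recheck under the lock */
			remove_from_freelist(op);
			pthread_mutex_unlock(&freelist_lock);
			/* the freelist's reference becomes ours */
		} else {
			pthread_mutex_unlock(&freelist_lock);
			op->holds++;		/* lost the race: new hold */
		}
	} else {
		op->holds++;
	}
}

int
main(void)
{
	obj_t o = { .freef = &o, .holds = 1 };

	acquire(&o);	/* pulls o off the freelist, holds stays at 1 */
	printf("holds=%d on-freelist=%d\n", o.holds, o.freef != NULL);
	return (0);
}
#endif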
2914 
2915 /*
2916  * Return 1 if there is an active vnode belonging to this vfs in the
2917  * rtable cache.
2918  *
2919  * Several of these checks are done without holding the usual
2920  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2921  * etc. will redo the necessary checks before actually destroying
2922  * any rnodes.
2923  */
2924 int
2925 check_rtable(struct vfs *vfsp)
2926 {
2927         int index;
2928         rnode_t *rp;
2929         vnode_t *vp;
2930 
2931         for (index = 0; index < rtablesize; index++) {
2932                 rw_enter(&rtable[index].r_lock, RW_READER);
2933                 for (rp = rtable[index].r_hashf;
2934                     rp != (rnode_t *)(&rtable[index]);
2935                     rp = rp->r_hashf) {
2936                         vp = RTOV(rp);
2937                         if (vp->v_vfsp == vfsp) {
2938                                 if (rp->r_freef == NULL ||
2939                                     (vn_has_cached_data(vp) &&
2940                                     (rp->r_flags & RDIRTY)) ||
2941                                     rp->r_count > 0) {
2942                                         rw_exit(&rtable[index].r_lock);
2943                                         return (1);
2944                                 }
2945                         }
2946                 }
2947                 rw_exit(&rtable[index].r_lock);
2948         }
2949         return (0);
2950 }
2951 
2952 /*
2953  * Destroy inactive vnodes from the hash queues which belong to this
2954  * vfs.  It is essential that we destroy all inactive vnodes during a
2955  * forced unmount as well as during a normal unmount.
2956  */
2957 void
2958 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2959 {
2960         int index;
2961         rnode_t *rp;
2962         rnode_t *rlist;
2963         rnode_t *r_hashf;
2964         vnode_t *vp;
2965 
2966         rlist = NULL;
2967 
2968         for (index = 0; index < rtablesize; index++) {
2969                 rw_enter(&rtable[index].r_lock, RW_WRITER);
2970                 for (rp = rtable[index].r_hashf;
2971                     rp != (rnode_t *)(&rtable[index]);
2972                     rp = r_hashf) {
2973                         /* save the hash pointer before destroying */
2974                         r_hashf = rp->r_hashf;
2975                         vp = RTOV(rp);
2976                         if (vp->v_vfsp == vfsp) {
2977                                 mutex_enter(&rpfreelist_lock);
2978                                 if (rp->r_freef != NULL) {
2979                                         rp_rmfree(rp);
2980                                         mutex_exit(&rpfreelist_lock);
2981                                         rp_rmhash_locked(rp);
2982                                         rp->r_hashf = rlist;
2983                                         rlist = rp;
2984                                 } else
2985                                         mutex_exit(&rpfreelist_lock);
2986                         }
2987                 }
2988                 rw_exit(&rtable[index].r_lock);
2989         }
2990 
2991         for (rp = rlist; rp != NULL; rp = rlist) {
2992                 rlist = rp->r_hashf;
2993                 /*
2994                  * This call to rp_addfree will end up destroying the
2995                  * rnode, but in a safe way with the appropriate set
2996                  * of checks done.
2997                  */
2998                 rp_addfree(rp, cr);
2999         }
3001 }
3002 
3003 /*
3004  * This routine destroys all the resources associated with the rnode
3005  * and then the rnode itself.
3006  */
3007 static void
3008 destroy_rnode(rnode_t *rp)
3009 {
3010         vnode_t *vp;
3011         vfs_t *vfsp;
3012 
3013         vp = RTOV(rp);
3014         vfsp = vp->v_vfsp;
3015 
3016         ASSERT(vp->v_count == 1);
3017         ASSERT(rp->r_count == 0);
3018         ASSERT(rp->r_lmpl == NULL);
3019         ASSERT(rp->r_mapcnt == 0);
3020         ASSERT(!(rp->r_flags & RHASHED));
3021         ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3022         atomic_dec_ulong((ulong_t *)&rnew);
3023 #ifdef DEBUG
3024         clstat_debug.nrnode.value.ui64--;
3025 #endif
3026         nfs_rw_destroy(&rp->r_rwlock);
3027         nfs_rw_destroy(&rp->r_lkserlock);
3028         mutex_destroy(&rp->r_statelock);
3029         cv_destroy(&rp->r_cv);
3030         cv_destroy(&rp->r_commit.c_cv);
3031         if (rp->r_flags & RDELMAPLIST)
3032                 list_destroy(&rp->r_indelmap);
3033         nfs_free_r_path(rp);
3034         avl_destroy(&rp->r_dir);
3035         vn_invalid(vp);
3036         vn_free(vp);
3037         kmem_cache_free(rnode_cache, rp);
3038         VFS_RELE(vfsp);
3039 }
3040 
3041 /*
3042  * Flush all vnodes in this (or every) vfs.
3043  * Used by nfs_sync and by nfs_unmount.
3044  */
3045 void
3046 rflush(struct vfs *vfsp, cred_t *cr)
3047 {
3048         int index;
3049         rnode_t *rp;
3050         vnode_t *vp, **vplist;
3051         long num, cnt;
3052 
3053         /*
3054          * Check to see whether there is anything to do.
3055          */
3056         num = rnew;
3057         if (num == 0)
3058                 return;
3059 
3060         /*
3061          * Allocate a slot for all currently active rnodes on the
3062          * supposition that they all may need flushing.
3063          */
3064         vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3065         cnt = 0;
3066 
3067         /*
3068          * Walk the hash queues looking for rnodes with page
3069          * lists associated with them.  Make a list of these
3070          * files.
3071          */
3072         for (index = 0; index < rtablesize; index++) {
3073                 rw_enter(&rtable[index].r_lock, RW_READER);
3074                 for (rp = rtable[index].r_hashf;
3075                     rp != (rnode_t *)(&rtable[index]);
3076                     rp = rp->r_hashf) {
3077                         vp = RTOV(rp);
3078                         /*
3079                          * Don't bother sync'ing a vp if it
3080                          * is part of a virtual swap device or
3081                          * if the VFS is read-only.
3082                          */
3083                         if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3084                                 continue;
3085                         /*
3086                          * If flushing all mounted file systems or
3087                          * the vnode belongs to this vfs, has pages
3088                          * and is marked as either dirty or mmap'd,
3089                          * hold and add this vnode to the list of
3090                          * vnodes to flush.
3091                          */
3092                         if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3093                             vn_has_cached_data(vp) &&
3094                             ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3095                                 VN_HOLD(vp);
3096                                 vplist[cnt++] = vp;
3097                                 if (cnt == num) {
3098                                         rw_exit(&rtable[index].r_lock);
3099                                         goto toomany;
3100                                 }
3101                         }
3102                 }
3103                 rw_exit(&rtable[index].r_lock);
3104         }
3105 toomany:
3106 
3107         /*
3108          * Flush and release all of the files on the list.
3109          */
3110         while (cnt-- > 0) {
3111                 vp = vplist[cnt];
3112                 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3113                 VN_RELE(vp);
3114         }
3115 
3116         /*
3117          * Free the space allocated to hold the list.
3118          */
3119         kmem_free(vplist, num * sizeof (*vplist));
3120 }
3121 
3122 /*
3123  * This probably needs to be larger than or equal to
3124  * log2(sizeof (struct rnode)) due to the way that rnodes are
3125  * allocated.
3126  */
3127 #define ACACHE_SHIFT_BITS       9
3128 
3129 static int
3130 acachehash(rnode_t *rp, cred_t *cr)
3131 {
3132 
3133         return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3134             acachemask);
3135 }
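
/*
 * Illustrative sketch (not part of this file): acachehash() above drops the
 * low ACACHE_SHIFT_BITS bits of the rnode address (which carry little
 * information given how rnodes are allocated), adds the caller's uid so
 * that different credentials for the same rnode tend to land in different
 * buckets, and masks with acachemask, a power of two minus one.  A
 * stand-alone rendering with hypothetical values:
 */
#if 0	/* user-space sketch only; never compiled here */
#include <stdint.h>
#include <stdio.h>

#define	EXAMPLE_SHIFT_BITS	9
#define	EXAMPLE_NBUCKETS	256	/* must be a power of two */

static int
example_hash(uintptr_t addr, uint32_t uid)
{
	return ((int)(((addr >> EXAMPLE_SHIFT_BITS) + uid) &
	    (EXAMPLE_NBUCKETS - 1)));
}

int
main(void)
{
	uintptr_t r1 = 0x12340a00;	/* two hypothetical rnode addresses */
	uintptr_t r2 = r1 + 0x200;	/* 512 bytes apart */

	printf("r1, uid 100 -> bucket %d\n", example_hash(r1, 100));
	printf("r1, uid 101 -> bucket %d\n", example_hash(r1, 101));
	printf("r2, uid 100 -> bucket %d\n", example_hash(r2, 100));
	return (0);
}
#endif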
3136 
3137 #ifdef DEBUG
3138 static long nfs_access_cache_hits = 0;
3139 static long nfs_access_cache_misses = 0;
3140 #endif
3141 
3142 nfs_access_type_t
3143 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3144 {
3145         vnode_t *vp;
3146         acache_t *ap;
3147         acache_hash_t *hp;
3148         nfs_access_type_t all;
3149 
3150         vp = RTOV(rp);
3151         if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3152                 return (NFS_ACCESS_UNKNOWN);
3153 
3154         if (rp->r_acache != NULL) {
3155                 hp = &acache[acachehash(rp, cr)];
3156                 rw_enter(&hp->lock, RW_READER);
3157                 ap = hp->next;
3158                 while (ap != (acache_t *)hp) {
3159                         if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3160                                 if ((ap->known & acc) == acc) {
3161 #ifdef DEBUG
3162                                         nfs_access_cache_hits++;
3163 #endif
3164                                         if ((ap->allowed & acc) == acc)
3165                                                 all = NFS_ACCESS_ALLOWED;
3166                                         else
3167                                                 all = NFS_ACCESS_DENIED;
3168                                 } else {
3169 #ifdef DEBUG
3170                                         nfs_access_cache_misses++;
3171 #endif
3172                                         all = NFS_ACCESS_UNKNOWN;
3173                                 }
3174                                 rw_exit(&hp->lock);
3175                                 return (all);
3176                         }
3177                         ap = ap->next;
3178                 }
3179                 rw_exit(&hp->lock);
3180         }
3181 
3182 #ifdef DEBUG
3183         nfs_access_cache_misses++;
3184 #endif
3185         return (NFS_ACCESS_UNKNOWN);
3186 }
3187 
3188 void
3189 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3190 {
3191         acache_t *ap;
3192         acache_t *nap;
3193         acache_hash_t *hp;
3194 
3195         hp = &acache[acachehash(rp, cr)];
3196 
3197         /*
3198          * Allocate now, on the assumption that an allocation will
3199          * most likely be required.  This lets the allocation happen
3200          * without holding the hash bucket lock.
3201          */
3202         nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3203         if (nap != NULL) {
3204                 nap->known = acc;
3205                 nap->allowed = resacc;
3206                 nap->rnode = rp;
3207                 crhold(cr);
3208                 nap->cred = cr;
3209                 nap->hashq = hp;
3210         }
3211 
3212         rw_enter(&hp->lock, RW_WRITER);
3213 
3214         if (rp->r_acache != NULL) {
3215                 ap = hp->next;
3216                 while (ap != (acache_t *)hp) {
3217                         if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3218                                 ap->known |= acc;
3219                                 ap->allowed &= ~acc;
3220                                 ap->allowed |= resacc;
3221                                 rw_exit(&hp->lock);
3222                                 if (nap != NULL) {
3223                                         crfree(nap->cred);
3224                                         kmem_cache_free(acache_cache, nap);
3225                                 }
3226                                 return;
3227                         }
3228                         ap = ap->next;
3229                 }
3230         }
3231 
3232         if (nap != NULL) {
3233 #ifdef DEBUG
3234                 clstat_debug.access.value.ui64++;
3235 #endif
3236                 nap->next = hp->next;
3237                 hp->next = nap;
3238                 nap->next->prev = nap;
3239                 nap->prev = (acache_t *)hp;
3240 
3241                 mutex_enter(&rp->r_statelock);
3242                 nap->list = rp->r_acache;
3243                 rp->r_acache = nap;
3244                 mutex_exit(&rp->r_statelock);
3245         }
3246 
3247         rw_exit(&hp->lock);
3248 }
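
/*
 * Illustrative sketch (not part of this file): nfs_access_cache() above
 * allocates a new entry before taking the bucket's write lock on the
 * assumption that an insertion will usually be needed; if an existing
 * entry is found under the lock it is updated in place and the unused
 * preallocation is released.  A condensed user-space form of that pattern,
 * with hypothetical names and a single bucket:
 */
#if 0	/* user-space sketch only; never compiled here */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct entry {
	struct entry *next;
	int key;
	int value;
} entry_t;

static entry_t *bucket;			/* single hypothetical hash bucket */
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

static void
cache_insert(int key, int value)
{
	/* allocate before taking the lock, assuming insertion is likely */
	entry_t *nep = malloc(sizeof (*nep));
	entry_t *ep;

	pthread_mutex_lock(&bucket_lock);
	for (ep = bucket; ep != NULL; ep = ep->next) {
		if (ep->key == key) {
			ep->value = value;	/* update in place */
			pthread_mutex_unlock(&bucket_lock);
			free(nep);		/* preallocation not needed */
			return;
		}
	}
	if (nep != NULL) {			/* allocation may have failed */
		nep->key = key;
		nep->value = value;
		nep->next = bucket;
		bucket = nep;
	}
	pthread_mutex_unlock(&bucket_lock);
}

int
main(void)
{
	cache_insert(7, 1);
	cache_insert(7, 2);	/* updates in place, frees the preallocation */
	printf("key 7 -> %d\n", bucket->value);
	return (0);
}
#endif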
3249 
3250 int
3251 nfs_access_purge_rp(rnode_t *rp)
3252 {
3253         acache_t *ap;
3254         acache_t *tmpap;
3255         acache_t *rplist;
3256 
3257         /*
3258          * If there aren't any cached entries, then there is nothing
3259          * to free.
3260          */
3261         if (rp->r_acache == NULL)
3262                 return (0);
3263 
3264         mutex_enter(&rp->r_statelock);
3265         rplist = rp->r_acache;
3266         rp->r_acache = NULL;
3267         mutex_exit(&rp->r_statelock);
3268 
3269         /*
3270          * Loop through each entry in the list pointed to in the
3271          * rnode.  Remove each of these entries from the hash
3272          * queue that it is on and remove it from the list in
3273          * the rnode.
3274          */
3275         for (ap = rplist; ap != NULL; ap = tmpap) {
3276                 rw_enter(&ap->hashq->lock, RW_WRITER);
3277                 ap->prev->next = ap->next;
3278                 ap->next->prev = ap->prev;
3279                 rw_exit(&ap->hashq->lock);
3280 
3281                 tmpap = ap->list;
3282                 crfree(ap->cred);
3283                 kmem_cache_free(acache_cache, ap);
3284 #ifdef DEBUG
3285                 clstat_debug.access.value.ui64--;
3286 #endif
3287         }
3288 
3289         return (1);
3290 }
3291 
3292 static const char prefix[] = ".nfs";
3293 
3294 static kmutex_t newnum_lock;
3295 
3296 int
3297 newnum(void)
3298 {
3299         static uint_t newnum = 0;
3300         uint_t id;
3301 
3302         mutex_enter(&newnum_lock);
3303         if (newnum == 0)
3304                 newnum = gethrestime_sec() & 0xffff;
3305         id = newnum++;
3306         mutex_exit(&newnum_lock);
3307         return (id);
3308 }
3309 
3310 char *
3311 newname(void)
3312 {
3313         char *news;
3314         char *s;
3315         const char *p;
3316         uint_t id;
3317 
3318         id = newnum();
3319         news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3320         s = news;
3321         p = prefix;
3322         while (*p != '\0')
3323                 *s++ = *p++;
3324         while (id != 0) {
3325                 *s++ = "0123456789ABCDEF"[id & 0x0f];
3326                 id >>= 4;
3327         }
3328         *s = '\0';
3329         return (news);
3330 }
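
/*
 * Illustrative sketch (not part of this file): newname() above appends the
 * hexadecimal digits of the id to the ".nfs" prefix least-significant
 * nibble first, so an id of 0x1234 produces ".nfs4321".  A stand-alone
 * version of the digit loop with a hypothetical helper name:
 */
#if 0	/* user-space sketch only; never compiled here */
#include <stdio.h>
#include <string.h>

static void
example_name(unsigned int id, char *buf, size_t buflen)
{
	const char prefix[] = ".nfs";
	size_t n = strlen(prefix);

	(void) strcpy(buf, prefix);
	while (id != 0 && n + 1 < buflen) {
		buf[n++] = "0123456789ABCDEF"[id & 0x0f];	/* low nibble */
		id >>= 4;					/* next nibble */
	}
	buf[n] = '\0';
}

int
main(void)
{
	char name[32];

	example_name(0x1234, name, sizeof (name));
	printf("%s\n", name);	/* prints ".nfs4321" */
	return (0);
}
#endif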
3331 
3332 /*
3333  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3334  * framework.
3335  */
3336 static int
3337 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3338 {
3339         ksp->ks_snaptime = gethrtime();
3340         if (rw == KSTAT_WRITE) {
3341                 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3342 #ifdef DEBUG
3343                 /*
3344                  * Currently only the global zone can write to kstats, but we
3345                  * add the check just for paranoia.
3346                  */
3347                 if (INGLOBALZONE(curproc))
3348                         bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3349                             sizeof (clstat_debug));
3350 #endif
3351         } else {
3352                 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3353 #ifdef DEBUG
3354                 /*
3355                  * If we're displaying the "global" debug kstat values, we
3356                  * display them as-is to all zones since in fact they apply to
3357                  * the system as a whole.
3358                  */
3359                 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3360                     sizeof (clstat_debug));
3361 #endif
3362         }
3363         return (0);
3364 }
3365 
3366 static void *
3367 clinit_zone(zoneid_t zoneid)
3368 {
3369         kstat_t *nfs_client_kstat;
3370         struct nfs_clnt *nfscl;
3371         uint_t ndata;
3372 
3373         nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3374         mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3375         nfscl->nfscl_chtable = NULL;
3376         nfscl->nfscl_zoneid = zoneid;
3377 
3378         bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3379         ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3380 #ifdef DEBUG
3381         ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3382 #endif
3383         if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3384             "misc", KSTAT_TYPE_NAMED, ndata,
3385             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3386                 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3387                 nfs_client_kstat->ks_snapshot = cl_snapshot;
3388                 kstat_install(nfs_client_kstat);
3389         }
3390         mutex_enter(&nfs_clnt_list_lock);
3391         list_insert_head(&nfs_clnt_list, nfscl);
3392         mutex_exit(&nfs_clnt_list_lock);
3393         return (nfscl);
3394 }
3395 
3396 /*ARGSUSED*/
3397 static void
3398 clfini_zone(zoneid_t zoneid, void *arg)
3399 {
3400         struct nfs_clnt *nfscl = arg;
3401         chhead_t *chp, *next;
3402 
3403         if (nfscl == NULL)
3404                 return;
3405         mutex_enter(&nfs_clnt_list_lock);
3406         list_remove(&nfs_clnt_list, nfscl);
3407         mutex_exit(&nfs_clnt_list_lock);
3408         clreclaim_zone(nfscl, 0);
3409         for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3410                 ASSERT(chp->ch_list == NULL);
3411                 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3412                 next = chp->ch_next;
3413                 kmem_free(chp, sizeof (*chp));
3414         }
3415         kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3416         mutex_destroy(&nfscl->nfscl_chtable_lock);
3417         kmem_free(nfscl, sizeof (*nfscl));
3418 }
3419 
3420 /*
3421  * Called by endpnt_destructor to make sure the client handles are
3422  * cleaned up before the RPC endpoints.  This becomes a no-op if
3423  * clfini_zone (above) is called first.  This function is needed
3424  * (rather than relying on clfini_zone to clean up) because the ZSD
3425  * callbacks have no ordering mechanism, so we have no way to ensure
3426  * that clfini_zone is called before endpnt_destructor.
3427  */
3428 void
3429 clcleanup_zone(zoneid_t zoneid)
3430 {
3431         struct nfs_clnt *nfscl;
3432 
3433         mutex_enter(&nfs_clnt_list_lock);
3434         nfscl = list_head(&nfs_clnt_list);
3435         for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3436                 if (nfscl->nfscl_zoneid == zoneid) {
3437                         clreclaim_zone(nfscl, 0);
3438                         break;
3439                 }
3440         }
3441         mutex_exit(&nfs_clnt_list_lock);
3442 }
3443 
3444 int
3445 nfs_subrinit(void)
3446 {
3447         int i;
3448         ulong_t nrnode_max;
3449 
3450         /*
3451          * Allocate and initialize the rnode hash queues
3452          */
3453         if (nrnode <= 0)
3454                 nrnode = ncsize;
3455         nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3456         if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3457                 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3458                     "!setting nrnode to max value of %ld", nrnode_max);
3459                 nrnode = nrnode_max;
3460         }
3461 
3462         rtablesize = 1 << highbit(nrnode / hashlen);
3463         rtablemask = rtablesize - 1;
3464         rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3465         for (i = 0; i < rtablesize; i++) {
3466                 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3467                 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3468                 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3469         }
3470         rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3471             0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3472 
3473         /*
3474          * Allocate and initialize the access cache
3475          */
3476 
3477         /*
3478          * The initial guess is one access cache entry per rnode.  If
3479          * nacache is set to a non-zero value, it is used instead as
3480          * the guess at the number of access cache entries.
3481          */
3482         if (nacache > 0)
3483                 acachesize = 1 << highbit(nacache / hashlen);
3484         else
3485                 acachesize = rtablesize;
3486         acachemask = acachesize - 1;
3487         acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3488         for (i = 0; i < acachesize; i++) {
3489                 acache[i].next = (acache_t *)&acache[i];
3490                 acache[i].prev = (acache_t *)&acache[i];
3491                 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3492         }
3493         acache_cache = kmem_cache_create("nfs_access_cache",
3494             sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3495         /*
3496          * Allocate and initialize the client handle cache
3497          */
3498         chtab_cache = kmem_cache_create("client_handle_cache",
3499             sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3500         /*
3501          * Initialize the list of per-zone client handles (and associated data).
3502          * This needs to be done before we call zone_key_create().
3503          */
3504         list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3505             offsetof(struct nfs_clnt, nfscl_node));
3506         /*
3507          * Initialize the zone_key for per-zone client handle lists.
3508          */
3509         zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3510         /*
3511          * Initialize the various mutexes and reader/writer locks
3512          */
3513         mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3514         mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3515         mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3516 
3517         /*
3518          * Assign unique major number for all nfs mounts
3519          */
3520         if ((nfs_major = getudev()) == -1) {
3521                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3522                     "nfs: init: can't get unique device number");
3523                 nfs_major = 0;
3524         }
3525         nfs_minor = 0;
3526 
3527         if (nfs3_jukebox_delay == 0)
3528                 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3529 
3530         return (0);
3531 }
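
/*
 * Illustrative sketch (not part of this file): nfs_subrinit() above sizes
 * rtable and acache as powers of two (1 << highbit(n) is the smallest power
 * of two strictly greater than n) so that the companion masks, size - 1,
 * can replace a modulo when a hash value is turned into a bucket index.  A
 * user-space rendering with a portable highbit() and hypothetical numbers:
 */
#if 0	/* user-space sketch only; never compiled here */
#include <stdio.h>

/* Number of the highest set bit: highbit(1) == 1, highbit(16) == 5. */
static int
example_highbit(unsigned long v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	unsigned long nrnode = 12893;	/* hypothetical rnode limit */
	unsigned long hashlen = 4;	/* hypothetical target chain length */
	unsigned long size = 1UL << example_highbit(nrnode / hashlen);
	unsigned long mask = size - 1;
	unsigned long hashval = 0xdeadbeefUL;

	printf("size=%lu mask=0x%lx bucket=%lu\n", size, mask, hashval & mask);
	return (0);
}
#endif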
3532 
3533 void
3534 nfs_subrfini(void)
3535 {
3536         int i;
3537 
3538         /*
3539          * Deallocate the rnode hash queues
3540          */
3541         kmem_cache_destroy(rnode_cache);
3542 
3543         for (i = 0; i < rtablesize; i++)
3544                 rw_destroy(&rtable[i].r_lock);
3545         kmem_free(rtable, rtablesize * sizeof (*rtable));
3546 
3547         /*
3548          * Deallocate the access cache
3549          */
3550         kmem_cache_destroy(acache_cache);
3551 
3552         for (i = 0; i < acachesize; i++)
3553                 rw_destroy(&acache[i].lock);
3554         kmem_free(acache, acachesize * sizeof (*acache));
3555 
3556         /*
3557          * Deallocate the client handle cache
3558          */
3559         kmem_cache_destroy(chtab_cache);
3560 
3561         /*
3562          * Destroy the various mutexes and reader/writer locks
3563          */
3564         mutex_destroy(&rpfreelist_lock);
3565         mutex_destroy(&newnum_lock);
3566         mutex_destroy(&nfs_minor_lock);
3567         (void) zone_key_delete(nfsclnt_zone_key);
3568 }
3569 
3570 enum nfsstat
3571 puterrno(int error)
3572 {
3573 
3574         switch (error) {
3575         case EOPNOTSUPP:
3576                 return (NFSERR_OPNOTSUPP);
3577         case ENAMETOOLONG:
3578                 return (NFSERR_NAMETOOLONG);
3579         case ENOTEMPTY:
3580                 return (NFSERR_NOTEMPTY);
3581         case EDQUOT:
3582                 return (NFSERR_DQUOT);
3583         case ESTALE:
3584                 return (NFSERR_STALE);
3585         case EREMOTE:
3586                 return (NFSERR_REMOTE);
3587         case ENOSYS:
3588                 return (NFSERR_OPNOTSUPP);
3589         case EOVERFLOW:
3590                 return (NFSERR_INVAL);
3591         default:
3592                 return ((enum nfsstat)error);
3593         }
3594         /* NOTREACHED */
3595 }
3596 
3597 int
3598 geterrno(enum nfsstat status)
3599 {
3600 
3601         switch (status) {
3602         case NFSERR_OPNOTSUPP:
3603                 return (EOPNOTSUPP);
3604         case NFSERR_NAMETOOLONG:
3605                 return (ENAMETOOLONG);
3606         case NFSERR_NOTEMPTY:
3607                 return (ENOTEMPTY);
3608         case NFSERR_DQUOT:
3609                 return (EDQUOT);
3610         case NFSERR_STALE:
3611                 return (ESTALE);
3612         case NFSERR_REMOTE:
3613                 return (EREMOTE);
3614         case NFSERR_WFLUSH:
3615                 return (EIO);
3616         default:
3617                 return ((int)status);
3618         }
3619         /* NOTREACHED */
3620 }
3621 
3622 enum nfsstat3
3623 puterrno3(int error)
3624 {
3625 
3626 #ifdef DEBUG
3627         switch (error) {
3628         case 0:
3629                 return (NFS3_OK);
3630         case EPERM:
3631                 return (NFS3ERR_PERM);
3632         case ENOENT:
3633                 return (NFS3ERR_NOENT);
3634         case EIO:
3635                 return (NFS3ERR_IO);
3636         case ENXIO:
3637                 return (NFS3ERR_NXIO);
3638         case EACCES:
3639                 return (NFS3ERR_ACCES);
3640         case EEXIST:
3641                 return (NFS3ERR_EXIST);
3642         case EXDEV:
3643                 return (NFS3ERR_XDEV);
3644         case ENODEV:
3645                 return (NFS3ERR_NODEV);
3646         case ENOTDIR:
3647                 return (NFS3ERR_NOTDIR);
3648         case EISDIR:
3649                 return (NFS3ERR_ISDIR);
3650         case EINVAL:
3651                 return (NFS3ERR_INVAL);
3652         case EFBIG:
3653                 return (NFS3ERR_FBIG);
3654         case ENOSPC:
3655                 return (NFS3ERR_NOSPC);
3656         case EROFS:
3657                 return (NFS3ERR_ROFS);
3658         case EMLINK:
3659                 return (NFS3ERR_MLINK);
3660         case ENAMETOOLONG:
3661                 return (NFS3ERR_NAMETOOLONG);
3662         case ENOTEMPTY:
3663                 return (NFS3ERR_NOTEMPTY);
3664         case EDQUOT:
3665                 return (NFS3ERR_DQUOT);
3666         case ESTALE:
3667                 return (NFS3ERR_STALE);
3668         case EREMOTE:
3669                 return (NFS3ERR_REMOTE);
3670         case ENOSYS:
3671         case EOPNOTSUPP:
3672                 return (NFS3ERR_NOTSUPP);
3673         case EOVERFLOW:
3674                 return (NFS3ERR_INVAL);
3675         default:
3676                 zcmn_err(getzoneid(), CE_WARN,
3677                     "puterrno3: got error %d", error);
3678                 return ((enum nfsstat3)error);
3679         }
3680 #else
3681         switch (error) {
3682         case ENAMETOOLONG:
3683                 return (NFS3ERR_NAMETOOLONG);
3684         case ENOTEMPTY:
3685                 return (NFS3ERR_NOTEMPTY);
3686         case EDQUOT:
3687                 return (NFS3ERR_DQUOT);
3688         case ESTALE:
3689                 return (NFS3ERR_STALE);
3690         case ENOSYS:
3691         case EOPNOTSUPP:
3692                 return (NFS3ERR_NOTSUPP);
3693         case EREMOTE:
3694                 return (NFS3ERR_REMOTE);
3695         case EOVERFLOW:
3696                 return (NFS3ERR_INVAL);
3697         default:
3698                 return ((enum nfsstat3)error);
3699         }
3700 #endif
3701 }
3702 
3703 int
3704 geterrno3(enum nfsstat3 status)
3705 {
3706 
3707 #ifdef DEBUG
3708         switch (status) {
3709         case NFS3_OK:
3710                 return (0);
3711         case NFS3ERR_PERM:
3712                 return (EPERM);
3713         case NFS3ERR_NOENT:
3714                 return (ENOENT);
3715         case NFS3ERR_IO:
3716                 return (EIO);
3717         case NFS3ERR_NXIO:
3718                 return (ENXIO);
3719         case NFS3ERR_ACCES:
3720                 return (EACCES);
3721         case NFS3ERR_EXIST:
3722                 return (EEXIST);
3723         case NFS3ERR_XDEV:
3724                 return (EXDEV);
3725         case NFS3ERR_NODEV:
3726                 return (ENODEV);
3727         case NFS3ERR_NOTDIR:
3728                 return (ENOTDIR);
3729         case NFS3ERR_ISDIR:
3730                 return (EISDIR);
3731         case NFS3ERR_INVAL:
3732                 return (EINVAL);
3733         case NFS3ERR_FBIG:
3734                 return (EFBIG);
3735         case NFS3ERR_NOSPC:
3736                 return (ENOSPC);
3737         case NFS3ERR_ROFS:
3738                 return (EROFS);
3739         case NFS3ERR_MLINK:
3740                 return (EMLINK);
3741         case NFS3ERR_NAMETOOLONG:
3742                 return (ENAMETOOLONG);
3743         case NFS3ERR_NOTEMPTY:
3744                 return (ENOTEMPTY);
3745         case NFS3ERR_DQUOT:
3746                 return (EDQUOT);
3747         case NFS3ERR_STALE:
3748                 return (ESTALE);
3749         case NFS3ERR_REMOTE:
3750                 return (EREMOTE);
3751         case NFS3ERR_BADHANDLE:
3752                 return (ESTALE);
3753         case NFS3ERR_NOT_SYNC:
3754                 return (EINVAL);
3755         case NFS3ERR_BAD_COOKIE:
3756                 return (ENOENT);
3757         case NFS3ERR_NOTSUPP:
3758                 return (EOPNOTSUPP);
3759         case NFS3ERR_TOOSMALL:
3760                 return (EINVAL);
3761         case NFS3ERR_SERVERFAULT:
3762                 return (EIO);
3763         case NFS3ERR_BADTYPE:
3764                 return (EINVAL);
3765         case NFS3ERR_JUKEBOX:
3766                 return (ENXIO);
3767         default:
3768                 zcmn_err(getzoneid(), CE_WARN,
3769                     "geterrno3: got status %d", status);
3770                 return ((int)status);
3771         }
3772 #else
3773         switch (status) {
3774         case NFS3ERR_NAMETOOLONG:
3775                 return (ENAMETOOLONG);
3776         case NFS3ERR_NOTEMPTY:
3777                 return (ENOTEMPTY);
3778         case NFS3ERR_DQUOT:
3779                 return (EDQUOT);
3780         case NFS3ERR_STALE:
3781         case NFS3ERR_BADHANDLE:
3782                 return (ESTALE);
3783         case NFS3ERR_NOTSUPP:
3784                 return (EOPNOTSUPP);
3785         case NFS3ERR_REMOTE:
3786                 return (EREMOTE);
3787         case NFS3ERR_NOT_SYNC:
3788         case NFS3ERR_TOOSMALL:
3789         case NFS3ERR_BADTYPE:
3790                 return (EINVAL);
3791         case NFS3ERR_BAD_COOKIE:
3792                 return (ENOENT);
3793         case NFS3ERR_SERVERFAULT:
3794                 return (EIO);
3795         case NFS3ERR_JUKEBOX:
3796                 return (ENXIO);
3797         default:
3798                 return ((int)status);
3799         }
3800 #endif
3801 }
3802 
3803 rddir_cache *
3804 rddir_cache_alloc(int flags)
3805 {
3806         rddir_cache *rc;
3807 
3808         rc = kmem_alloc(sizeof (*rc), flags);
3809         if (rc != NULL) {
3810                 rc->entries = NULL;
3811                 rc->flags = RDDIR;
3812                 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3813                 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3814                 rc->count = 1;
3815 #ifdef DEBUG
3816                 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3817 #endif
3818         }
3819         return (rc);
3820 }
3821 
3822 static void
3823 rddir_cache_free(rddir_cache *rc)
3824 {
3825 
3826 #ifdef DEBUG
3827         atomic_dec_64(&clstat_debug.dirent.value.ui64);
3828 #endif
3829         if (rc->entries != NULL) {
3830 #ifdef DEBUG
3831                 rddir_cache_buf_free(rc->entries, rc->buflen);
3832 #else
3833                 kmem_free(rc->entries, rc->buflen);
3834 #endif
3835         }
3836         cv_destroy(&rc->cv);
3837         mutex_destroy(&rc->lock);
3838         kmem_free(rc, sizeof (*rc));
3839 }
3840 
3841 void
3842 rddir_cache_hold(rddir_cache *rc)
3843 {
3844 
3845         mutex_enter(&rc->lock);
3846         rc->count++;
3847         mutex_exit(&rc->lock);
3848 }
3849 
3850 void
3851 rddir_cache_rele(rddir_cache *rc)
3852 {
3853 
3854         mutex_enter(&rc->lock);
3855         ASSERT(rc->count > 0);
3856         if (--rc->count == 0) {
3857                 mutex_exit(&rc->lock);
3858                 rddir_cache_free(rc);
3859         } else
3860                 mutex_exit(&rc->lock);
3861 }
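
/*
 * Illustrative sketch (not part of this file): rddir_cache_hold() and
 * rddir_cache_rele() above implement a mutex-protected reference count
 * where the entry is created with one hold and whichever caller drops the
 * count to zero destroys it.  A minimal user-space equivalent with
 * hypothetical names:
 */
#if 0	/* user-space sketch only; never compiled here */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct refobj {
	pthread_mutex_t lock;
	int count;
} refobj_t;

static refobj_t *
refobj_alloc(void)
{
	refobj_t *rp = malloc(sizeof (*rp));

	(void) pthread_mutex_init(&rp->lock, NULL);
	rp->count = 1;			/* creator gets the first hold */
	return (rp);
}

static void
refobj_hold(refobj_t *rp)
{
	pthread_mutex_lock(&rp->lock);
	rp->count++;
	pthread_mutex_unlock(&rp->lock);
}

static void
refobj_rele(refobj_t *rp)
{
	pthread_mutex_lock(&rp->lock);
	if (--rp->count == 0) {
		pthread_mutex_unlock(&rp->lock);
		(void) pthread_mutex_destroy(&rp->lock);
		free(rp);		/* last reference frees the object */
		return;
	}
	pthread_mutex_unlock(&rp->lock);
}

int
main(void)
{
	refobj_t *rp = refobj_alloc();

	refobj_hold(rp);	/* second reference */
	refobj_rele(rp);	/* back to one, object survives */
	refobj_rele(rp);	/* drops to zero, object is freed */
	printf("done\n");
	return (0);
}
#endif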
3862 
3863 #ifdef DEBUG
3864 char *
3865 rddir_cache_buf_alloc(size_t size, int flags)
3866 {
3867         char *rc;
3868 
3869         rc = kmem_alloc(size, flags);
3870         if (rc != NULL)
3871                 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3872         return (rc);
3873 }
3874 
3875 void
3876 rddir_cache_buf_free(void *addr, size_t size)
3877 {
3878 
3879         atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3880         kmem_free(addr, size);
3881 }
3882 #endif
3883 
3884 static int
3885 nfs_free_data_reclaim(rnode_t *rp)
3886 {
3887         char *contents;
3888         int size;
3889         vsecattr_t *vsp;
3890         nfs3_pathconf_info *info;
3891         int freed;
3892         cred_t *cred;
3893 
3894         /*
3895          * Free any held credentials and caches which
3896          * may be associated with this rnode.
3897          */
3898         mutex_enter(&rp->r_statelock);
3899         cred = rp->r_cred;
3900         rp->r_cred = NULL;
3901         contents = rp->r_symlink.contents;
3902         size = rp->r_symlink.size;
3903         rp->r_symlink.contents = NULL;
3904         vsp = rp->r_secattr;
3905         rp->r_secattr = NULL;
3906         info = rp->r_pathconf;
3907         rp->r_pathconf = NULL;
3908         mutex_exit(&rp->r_statelock);
3909 
3910         if (cred != NULL)
3911                 crfree(cred);
3912 
3913         /*
3914          * Free the access cache entries.
3915          */
3916         freed = nfs_access_purge_rp(rp);
3917 
3918         if (!HAVE_RDDIR_CACHE(rp) &&
3919             contents == NULL &&
3920             vsp == NULL &&
3921             info == NULL)
3922                 return (freed);
3923 
3924         /*
3925          * Free the readdir cache entries
3926          */
3927         if (HAVE_RDDIR_CACHE(rp))
3928                 nfs_purge_rddir_cache(RTOV(rp));
3929 
3930         /*
3931          * Free the symbolic link cache.
3932          */
3933         if (contents != NULL) {
3935                 kmem_free((void *)contents, size);
3936         }
3937 
3938         /*
3939          * Free any cached ACL.
3940          */
3941         if (vsp != NULL)
3942                 nfs_acl_free(vsp);
3943 
3944         /*
3945          * Free any cached pathconf information.
3946          */
3947         if (info != NULL)
3948                 kmem_free(info, sizeof (*info));
3949 
3950         return (1);
3951 }
3952 
3953 static int
3954 nfs_active_data_reclaim(rnode_t *rp)
3955 {
3956         char *contents;
3957         int size;
3958         vsecattr_t *vsp;
3959         nfs3_pathconf_info *info;
3960         int freed;
3961 
3962         /*
3963          * Free any held credentials and caches which
3964          * may be associated with this rnode.
3965          */
3966         if (!mutex_tryenter(&rp->r_statelock))
3967                 return (0);
3968         contents = rp->r_symlink.contents;
3969         size = rp->r_symlink.size;
3970         rp->r_symlink.contents = NULL;
3971         vsp = rp->r_secattr;
3972         rp->r_secattr = NULL;
3973         info = rp->r_pathconf;
3974         rp->r_pathconf = NULL;
3975         mutex_exit(&rp->r_statelock);
3976 
3977         /*
3978          * Free the access cache entries.
3979          */
3980         freed = nfs_access_purge_rp(rp);
3981 
3982         if (!HAVE_RDDIR_CACHE(rp) &&
3983             contents == NULL &&
3984             vsp == NULL &&
3985             info == NULL)
3986                 return (freed);
3987 
3988         /*
3989          * Free the readdir cache entries
3990          */
3991         if (HAVE_RDDIR_CACHE(rp))
3992                 nfs_purge_rddir_cache(RTOV(rp));
3993 
3994         /*
3995          * Free the symbolic link cache.
3996          */
3997         if (contents != NULL) {
3999                 kmem_free((void *)contents, size);
4000         }
4001 
4002         /*
4003          * Free any cached ACL.
4004          */
4005         if (vsp != NULL)
4006                 nfs_acl_free(vsp);
4007 
4008         /*
4009          * Free any cached pathconf information.
4010          */
4011         if (info != NULL)
4012                 kmem_free(info, sizeof (*info));
4013 
4014         return (1);
4015 }
4016 
4017 static int
4018 nfs_free_reclaim(void)
4019 {
4020         int freed;
4021         rnode_t *rp;
4022 
4023 #ifdef DEBUG
4024         clstat_debug.f_reclaim.value.ui64++;
4025 #endif
4026         freed = 0;
4027         mutex_enter(&rpfreelist_lock);
4028         rp = rpfreelist;
4029         if (rp != NULL) {
4030                 do {
4031                         if (nfs_free_data_reclaim(rp))
4032                                 freed = 1;
4033                 } while ((rp = rp->r_freef) != rpfreelist);
4034         }
4035         mutex_exit(&rpfreelist_lock);
4036         return (freed);
4037 }
4038 
4039 static int
4040 nfs_active_reclaim(void)
4041 {
4042         int freed;
4043         int index;
4044         rnode_t *rp;
4045 
4046 #ifdef DEBUG
4047         clstat_debug.a_reclaim.value.ui64++;
4048 #endif
4049         freed = 0;
4050         for (index = 0; index < rtablesize; index++) {
4051                 rw_enter(&rtable[index].r_lock, RW_READER);
4052                 for (rp = rtable[index].r_hashf;
4053                     rp != (rnode_t *)(&rtable[index]);
4054                     rp = rp->r_hashf) {
4055                         if (nfs_active_data_reclaim(rp))
4056                                 freed = 1;
4057                 }
4058                 rw_exit(&rtable[index].r_lock);
4059         }
4060         return (freed);
4061 }
4062 
4063 static int
4064 nfs_rnode_reclaim(void)
4065 {
4066         int freed;
4067         rnode_t *rp;
4068         vnode_t *vp;
4069 
4070 #ifdef DEBUG
4071         clstat_debug.r_reclaim.value.ui64++;
4072 #endif
4073         freed = 0;
4074         mutex_enter(&rpfreelist_lock);
4075         while ((rp = rpfreelist) != NULL) {
4076                 rp_rmfree(rp);
4077                 mutex_exit(&rpfreelist_lock);
4078                 if (rp->r_flags & RHASHED) {
4079                         vp = RTOV(rp);
4080                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4081                         mutex_enter(&vp->v_lock);
4082                         if (vp->v_count > 1) {
4083                                 vp->v_count--;
4084                                 mutex_exit(&vp->v_lock);
4085                                 rw_exit(&rp->r_hashq->r_lock);
4086                                 mutex_enter(&rpfreelist_lock);
4087                                 continue;
4088                         }
4089                         mutex_exit(&vp->v_lock);
4090                         rp_rmhash_locked(rp);
4091                         rw_exit(&rp->r_hashq->r_lock);
4092                 }
4093                 /*
4094                  * This call to rp_addfree will end up destroying the
4095                  * rnode, but in a safe way with the appropriate set
4096                  * of checks done.
4097                  */
4098                 rp_addfree(rp, CRED());
4099                 mutex_enter(&rpfreelist_lock);
4100         }
4101         mutex_exit(&rpfreelist_lock);
4102         return (freed);
4103 }
4104 
4105 /*ARGSUSED*/
4106 static void
4107 nfs_reclaim(void *cdrarg)
4108 {
4109 
4110 #ifdef DEBUG
4111         clstat_debug.reclaim.value.ui64++;
4112 #endif
4113         if (nfs_free_reclaim())
4114                 return;
4115 
4116         if (nfs_active_reclaim())
4117                 return;
4118 
4119         (void) nfs_rnode_reclaim();
4120 }
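
/*
 * Illustrative sketch (not part of this file): nfs_reclaim() above responds
 * to memory pressure in stages and stops as soon as one stage frees
 * something: ancillary data on freelist rnodes first, then ancillary data
 * on active rnodes, and only then whole rnodes.  The control flow, reduced
 * to a user-space skeleton with hypothetical stage functions:
 */
#if 0	/* user-space sketch only; never compiled here */
#include <stdio.h>

/* Each stage returns nonzero if it managed to release something. */
static int
reclaim_free_data(void)
{
	printf("stage 1: data hanging off freelist nodes\n");
	return (0);
}

static int
reclaim_active_data(void)
{
	printf("stage 2: data hanging off active nodes\n");
	return (0);
}

static int
reclaim_whole_nodes(void)
{
	printf("stage 3: whole nodes\n");
	return (1);
}

/* Cheapest stage first; stop as soon as any stage makes progress. */
static void
reclaim(void)
{
	if (reclaim_free_data())
		return;
	if (reclaim_active_data())
		return;
	(void) reclaim_whole_nodes();
}

int
main(void)
{
	reclaim();	/* nothing freed until stage 3 in this example */
	return (0);
}
#endif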
4121 
4122 /*
4123  * NFS client failover support
4124  *
4125  * Routines to copy filehandles
4126  */
4127 void
4128 nfscopyfh(caddr_t fhp, vnode_t *vp)
4129 {
4130         fhandle_t *dest = (fhandle_t *)fhp;
4131 
4132         if (dest != NULL)
4133                 *dest = *VTOFH(vp);
4134 }
4135 
4136 void
4137 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4138 {
4139         nfs_fh3 *dest = (nfs_fh3 *)fhp;
4140 
4141         if (dest != NULL)
4142                 *dest = *VTOFH3(vp);
4143 }
4144 
4145 /*
4146  * NFS client failover support
4147  *
4148  * failover_safe() will test various conditions to ensure that
4149  * failover is permitted for this vnode.  It will be denied
4150  * if:
4151  *      1) the operation in progress does not support failover (NULL fi)
4152  *      2) there are no available replicas (NULL mi_servers->sv_next)
4153  *      3) any locks are outstanding on this file
4154  */
4155 static int
4156 failover_safe(failinfo_t *fi)
4157 {
4158 
4159         /*
4160          * Does this op permit failover?
4161          */
4162         if (fi == NULL || fi->vp == NULL)
4163                 return (0);
4164 
4165         /*
4166          * Are there any alternates to failover to?
4167          */
4168         if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4169                 return (0);
4170 
4171         /*
4172          * Disable check; we've forced local locking
4173          *
4174          * if (flk_has_remote_locks(fi->vp))
4175          *      return (0);
4176          */
4177 
4178         /*
4179          * If we have no partial path, we can't do anything
4180          */
4181         if (VTOR(fi->vp)->r_path == NULL)
4182                 return (0);
4183 
4184         return (1);
4185 }
4186 
4187 #include <sys/thread.h>
4188 
4189 /*
4190  * NFS client failover support
4191  *
4192  * failover_newserver() will start a search for a new server,
4193  * preferably by starting an async thread to do the work.  If
4194  * someone is already doing this (recognizable by MI_BINDINPROG
4195  * being set), it will simply return and the calling thread
4196  * will queue on the mi_failover_cv condition variable.
4197  */
4198 static void
4199 failover_newserver(mntinfo_t *mi)
4200 {
4201         /*
4202          * Check if someone else is doing this already
4203          */
4204         mutex_enter(&mi->mi_lock);
4205         if (mi->mi_flags & MI_BINDINPROG) {
4206                 mutex_exit(&mi->mi_lock);
4207                 return;
4208         }
4209         mi->mi_flags |= MI_BINDINPROG;
4210 
4211         /*
4212          * Need to hold the vfs struct so that it can't be released
4213          * while the failover thread is selecting a new server.
4214          */
4215         VFS_HOLD(mi->mi_vfsp);
4216 
4217         /*
4218          * Start a thread to do the real searching.
4219          */
4220         (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4221 
4222         mutex_exit(&mi->mi_lock);
4223 }
4224 
4225 /*
4226  * NFS client failover support
4227  *
4228  * failover_thread() will find a new server to replace the one
4229  * currently in use, wake up other threads waiting on this mount
4230  * point, and die.  It will start at the head of the server list
4231  * and poll servers until it finds one with an NFS server which is
4232  * registered and responds to a NULL procedure ping.
4233  *
4234  * XXX failover_thread is unsafe within the scope of the
4235  * present model defined for cpr to suspend the system.
4236  * Specifically, over-the-wire calls made by the thread
4237  * are unsafe. The thread needs to be reevaluated in case of
4238  * future updates to the cpr suspend model.
4239  */
4240 static void
4241 failover_thread(mntinfo_t *mi)
4242 {
4243         servinfo_t *svp = NULL;
4244         CLIENT *cl;
4245         enum clnt_stat status;
4246         struct timeval tv;
4247         int error;
4248         int oncethru = 0;
4249         callb_cpr_t cprinfo;
4250         rnode_t *rp;
4251         int index;
4252         char *srvnames;
4253         size_t srvnames_len;
4254         struct nfs_clnt *nfscl = NULL;
4255         zoneid_t zoneid = getzoneid();
4256 
4257 #ifdef DEBUG
4258         /*
4259          * This is currently only needed to access counters which exist on
4260          * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4261          * on non-DEBUG kernels.
4262          */
4263         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4264         ASSERT(nfscl != NULL);
4265 #endif
4266 
4267         /*
4268          * It's safe to piggyback on the mi_lock since the failover_newserver()
4269          * code guarantees that there will be only one failover thread
4270          * per mntinfo at any given time.
4271          */
4272         CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4273             "failover_thread");
4274 
4275         mutex_enter(&mi->mi_lock);
4276         while (mi->mi_readers) {
4277                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4278                 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4279                 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4280         }
4281         mutex_exit(&mi->mi_lock);
4282 
4283         tv.tv_sec = 2;
4284         tv.tv_usec = 0;
4285 
4286         /*
4287          * Ping the null NFS procedure of every server in
4288          * the list until one responds.  We always start
4289          * at the head of the list and always skip the one
4290          * that is current, since it's caused us a problem.
4291          */
4292         while (svp == NULL) {
4293                 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4294                         if (!oncethru && svp == mi->mi_curr_serv)
4295                                 continue;
4296 
4297                         /*
4298                          * If the file system was forcibly umounted
4299                          * while trying to do a failover, then just
4300                          * give up on the failover.  It won't matter
4301                          * what the server is.
4302                          */
4303                         if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4304                                 svp = NULL;
4305                                 goto done;
4306                         }
4307 
4308                         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4309                             NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4310                         if (error)
4311                                 continue;
4312 
4313                         if (!(mi->mi_flags & MI_INT))
4314                                 cl->cl_nosignal = TRUE;
4315                         status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4316                             xdr_void, NULL, tv);
4317                         if (!(mi->mi_flags & MI_INT))
4318                                 cl->cl_nosignal = FALSE;
4319                         AUTH_DESTROY(cl->cl_auth);
4320                         CLNT_DESTROY(cl);
4321                         if (status == RPC_SUCCESS) {
4322                                 if (svp == mi->mi_curr_serv) {
4323 #ifdef DEBUG
4324                                         zcmn_err(zoneid, CE_NOTE,
4325                         "NFS%d: failing over: selecting original server %s",
4326                                             mi->mi_vers, svp->sv_hostname);
4327 #else
4328                                         zcmn_err(zoneid, CE_NOTE,
4329                         "NFS: failing over: selecting original server %s",
4330                                             svp->sv_hostname);
4331 #endif
4332                                 } else {
4333 #ifdef DEBUG
4334                                         zcmn_err(zoneid, CE_NOTE,
4335                                     "NFS%d: failing over from %s to %s",
4336                                             mi->mi_vers,
4337                                             mi->mi_curr_serv->sv_hostname,
4338                                             svp->sv_hostname);
4339 #else
4340                                         zcmn_err(zoneid, CE_NOTE,
4341                                     "NFS: failing over from %s to %s",
4342                                             mi->mi_curr_serv->sv_hostname,
4343                                             svp->sv_hostname);
4344 #endif
4345                                 }
4346                                 break;
4347                         }
4348                 }
4349 
4350                 if (svp == NULL) {
4351                         if (!oncethru) {
4352                                 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4353 #ifdef DEBUG
4354                                 zprintf(zoneid,
4355                                     "NFS%d servers %s not responding "
4356                                     "still trying\n", mi->mi_vers, srvnames);
4357 #else
4358                                 zprintf(zoneid, "NFS servers %s not responding "
4359                                     "still trying\n", srvnames);
4360 #endif
4361                                 oncethru = 1;
4362                         }
4363                         mutex_enter(&mi->mi_lock);
4364                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
4365                         mutex_exit(&mi->mi_lock);
4366                         delay(hz);
4367                         mutex_enter(&mi->mi_lock);
4368                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4369                         mutex_exit(&mi->mi_lock);
4370                 }
4371         }
4372 
4373         if (oncethru) {
4374 #ifdef DEBUG
4375                 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4376 #else
4377                 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4378 #endif
4379         }
4380 
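        /*
         * A different server was selected.  Purge cached names for this
         * vfs and move the root rnode from the hash bucket keyed by the
         * old server's root filehandle to the bucket for the new one,
         * updating r_server and r_fh along the way.
         */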
4381         if (svp != mi->mi_curr_serv) {
4382                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4383                 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4384                 rw_enter(&rtable[index].r_lock, RW_WRITER);
4385                 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4386                     mi->mi_vfsp);
4387                 if (rp != NULL) {
4388                         if (rp->r_flags & RHASHED)
4389                                 rp_rmhash_locked(rp);
4390                         rw_exit(&rtable[index].r_lock);
4391                         rp->r_server = svp;
4392                         rp->r_fh = svp->sv_fhandle;
4393                         (void) nfs_free_data_reclaim(rp);
4394                         index = rtablehash(&rp->r_fh);
4395                         rp->r_hashq = &rtable[index];
4396                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4397                         vn_exists(RTOV(rp));
4398                         rp_addhash(rp);
4399                         rw_exit(&rp->r_hashq->r_lock);
4400                         VN_RELE(RTOV(rp));
4401                 } else
4402                         rw_exit(&rtable[index].r_lock);
4403         }
4404 
4405 done:
4406         if (oncethru)
4407                 kmem_free(srvnames, srvnames_len);
4408         mutex_enter(&mi->mi_lock);
4409         mi->mi_flags &= ~MI_BINDINPROG;
4410         if (svp != NULL) {
4411                 mi->mi_curr_serv = svp;
4412                 mi->mi_failover++;
4413 #ifdef DEBUG
4414                 nfscl->nfscl_stat.failover.value.ui64++;
4415 #endif
4416         }
4417         cv_broadcast(&mi->mi_failover_cv);
4418         CALLB_CPR_EXIT(&cprinfo);
4419         VFS_RELE(mi->mi_vfsp);
4420         zthread_exit();
4421         /* NOTREACHED */
4422 }
4423 
4424 /*
4425  * NFS client failover support
4426  *
4427  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4428  * is cleared, meaning that failover is complete.  Called with
4429  * mi_lock mutex held.
4430  */
4431 static int
4432 failover_wait(mntinfo_t *mi)
4433 {
4434         k_sigset_t smask;
4435 
4436         /*
4437          * If someone else is hunting for a living server,
4438          * sleep until it's done.  After our sleep, we may
4439          * be bound to the right server and get off cheaply.
4440          */
4441         while (mi->mi_flags & MI_BINDINPROG) {
4442                 /*
4443                  * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4444                  * and SIGTERM (preserving the existing masks).  Mask out
4445                  * SIGINT if the nointr mount option is specified.
4446                  */
4447                 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4448                 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4449                         /*
4450                          * restore original signal mask
4451                          */
4452                         sigunintr(&smask);
4453                         return (EINTR);
4454                 }
4455                 /*
4456                  * restore original signal mask
4457                  */
4458                 sigunintr(&smask);
4459         }
4460         return (0);
4461 }
4462 
4463 /*
4464  * NFS client failover support
4465  *
4466  * failover_remap() will do a partial pathname lookup and find the
4467  * desired vnode on the current server.  The interim vnode will be
4468  * discarded after we pilfer the new filehandle.
4469  *
4470  * Side effects:
4471  * - This routine will also update the filehandle in the args structure
4472  *    pointed to by the fi->fhp pointer if it is non-NULL.
4473  */
4474 
4475 static int
4476 failover_remap(failinfo_t *fi)
4477 {
4478         vnode_t *vp, *nvp, *rootvp;
4479         rnode_t *rp, *nrp;
4480         mntinfo_t *mi;
4481         int error;
4482 #ifdef DEBUG
4483         struct nfs_clnt *nfscl;
4484 
4485         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4486         ASSERT(nfscl != NULL);
4487 #endif
4488         /*
4489          * Sanity check
4490          */
4491         if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4492                 return (EINVAL);
4493         vp = fi->vp;
4494         rp = VTOR(vp);
4495         mi = VTOMI(vp);
4496 
4497         if (!(vp->v_flag & VROOT)) {
4498                 /*
4499                  * Starting from the root vnode, use the path stored
4500                  * in the rnode to find the fh on the new server.
4501                  */
4502                 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4503                 if (error)
4504                         return (error);
4505 
4506                 error = failover_lookup(rp->r_path, rootvp,
4507                     fi->lookupproc, fi->xattrdirproc, &nvp);
4508 
4509                 VN_RELE(rootvp);
4510 
4511                 if (error)
4512                         return (error);
4513 
4514                 /*
4515                  * If we found the same rnode, we're done now
4516                  */
4517                 if (nvp == vp) {
4518                         /*
4519                          * The failed server and the new server may be the
4520                          * same physical machine, or they may share the same
4521                          * disk subsystem.  In that case the filehandle for a
4522                          * given path does not change, so the lookup above
4523                          * found the same rnode we already have.  All we may
4524                          * need to do is update r_server with the current
4525                          * servinfo.
4526                          */
4527                         if (!VALID_FH(fi)) {
4528                                 rp->r_server = mi->mi_curr_serv;
4529                         }
4530                         VN_RELE(nvp);
4531                         return (0);
4532                 }
4533 
4534                 /*
4535                  * Try to make it so that no one else will find this
4536                  * vnode because it is just a temporary to hold the
4537                  * new file handle until that file handle can be
4538                  * copied to the original vnode/rnode.
4539                  */
4540                 nrp = VTOR(nvp);
4541                 mutex_enter(&mi->mi_remap_lock);
4542                 /*
4543                  * Another thread could have raced in here and already
4544                  * done the remap for this rnode.  Check whether
4545                  * rp->r_server and mi->mi_curr_serv are the same and,
4546                  * if so, return.
4547                  */
4548                 if (VALID_FH(fi)) {
4549                         mutex_exit(&mi->mi_remap_lock);
4550                         VN_RELE(nvp);
4551                         return (0);
4552                 }
4553 
4554                 if (nrp->r_flags & RHASHED)
4555                         rp_rmhash(nrp);
4556 
4557                 /*
4558                  * As a heuristic check on the validity of the new
4559                  * file, check that its size and type match what we
4560                  * remember from the old version.
4561                  */
4562                 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4563                         mutex_exit(&mi->mi_remap_lock);
4564                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4565                             "NFS replicas %s and %s: file %s not same.",
4566                             rp->r_server->sv_hostname,
4567                             nrp->r_server->sv_hostname, rp->r_path);
4568                         VN_RELE(nvp);
4569                         return (EINVAL);
4570                 }
4571 
4572                 /*
4573                  * Snarf the filehandle from the new rnode, then
4574                  * rehash the original rnode under it; the temporary
4575                  * vnode is released once the update is complete.
4576                  */
4577                 if (rp->r_flags & RHASHED)
4578                         rp_rmhash(rp);
4579                 rp->r_server = mi->mi_curr_serv;
4580                 rp->r_fh = nrp->r_fh;
4581                 rp->r_hashq = nrp->r_hashq;
4582                 /*
4583                  * Copy the attributes from the new rnode to the old
4584                  * rnode.  This will help to reduce unnecessary page
4585                  * cache flushes.
4586                  */
4587                 rp->r_attr = nrp->r_attr;
4588                 rp->r_attrtime = nrp->r_attrtime;
4589                 rp->r_mtime = nrp->r_mtime;
4590                 (void) nfs_free_data_reclaim(rp);
4591                 nfs_setswaplike(vp, &rp->r_attr);
4592                 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4593                 rp_addhash(rp);
4594                 rw_exit(&rp->r_hashq->r_lock);
4595                 mutex_exit(&mi->mi_remap_lock);
4596                 VN_RELE(nvp);
4597         }
4598 
4599         /*
4600          * Update successful failover remap count
4601          */
4602         mutex_enter(&mi->mi_lock);
4603         mi->mi_remap++;
4604         mutex_exit(&mi->mi_lock);
4605 #ifdef DEBUG
4606         nfscl->nfscl_stat.remap.value.ui64++;
4607 #endif
4608 
4609         /*
4610          * If we have a copied filehandle to update, do it now.
4611          */
4612         if (fi->fhp != NULL && fi->copyproc != NULL)
4613                 (*fi->copyproc)(fi->fhp, vp);
4614 
4615         return (0);
4616 }
4617 
4618 /*
4619  * NFS client failover support
4620  *
4621  * We want a simple pathname lookup routine to parse the pieces
4622  * of path in rp->r_path.  We know that the path was created
4623  * as rnodes were made, so we only have to deal with paths
4624  * that look like:
4625  *      dir1/dir2/dir3/file
4626  * Any evidence of anything like "..", symlinks, or ENOTDIR
4627  * is a hard error, because it means something in this filesystem
4628  * is different from the one we came from, or has changed under
4629  * us in some way.  If that is the case, we want the failure.
4630  *
4631  * Extended attributes: if the filesystem is mounted with extended
4632  * attributes enabled (-o xattr), the attribute directory will be
4633  * represented in the r_path as the magic name XATTR_RPATH. So if
4634  * we see that name in the pathname, it must be because this node
4635  * is an extended attribute.  Therefore, look it up that way.
4636  */
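/*
 * For example (illustrative only): with r_path "dir1/dir2/file", the loop
 * below looks up "dir1" relative to the root vnode, then "dir2" relative to
 * that result, then "file", holding each intermediate vnode only long enough
 * to perform the next lookup.
 */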
4637 static int
4638 failover_lookup(char *path, vnode_t *root,
4639     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4640         vnode_t *, cred_t *, int),
4641     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4642     vnode_t **new)
4643 {
4644         vnode_t *dvp, *nvp;
4645         int error = EINVAL;
4646         char *s, *p, *tmppath;
4647         size_t len;
4648         mntinfo_t *mi;
4649         bool_t xattr;
4650 
4651         /* Make local copy of path */
4652         len = strlen(path) + 1;
4653         tmppath = kmem_alloc(len, KM_SLEEP);
4654         (void) strcpy(tmppath, path);
4655         s = tmppath;
4656 
4657         dvp = root;
4658         VN_HOLD(dvp);
4659         mi = VTOMI(root);
4660         xattr = mi->mi_flags & MI_EXTATTR;
4661 
4662         do {
4663                 p = strchr(s, '/');
4664                 if (p != NULL)
4665                         *p = '\0';
4666                 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4667                         error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4668                             RFSCALL_SOFT);
4669                 } else {
4670                         error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4671                             CRED(), RFSCALL_SOFT);
4672                 }
4673                 if (p != NULL)
4674                         *p++ = '/';
4675                 if (error) {
4676                         VN_RELE(dvp);
4677                         kmem_free(tmppath, len);
4678                         return (error);
4679                 }
4680                 s = p;
4681                 VN_RELE(dvp);
4682                 dvp = nvp;
4683         } while (p != NULL);
4684 
4685         if (nvp != NULL && new != NULL)
4686                 *new = nvp;
4687         kmem_free(tmppath, len);
4688         return (0);
4689 }
4690 
4691 /*
4692  * NFS client failover support
4693  *
4694  * sv_free() frees the malloc'd portion of a "servinfo_t".
4695  */
4696 void
4697 sv_free(servinfo_t *svp)
4698 {
4699         servinfo_t *next;
4700         struct knetconfig *knconf;
4701 
4702         while (svp != NULL) {
4703                 next = svp->sv_next;
4704                 if (svp->sv_secdata)
4705                         sec_clnt_freeinfo(svp->sv_secdata);
4706                 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4707                         kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4708                 knconf = svp->sv_knconf;
4709                 if (knconf != NULL) {
4710                         if (knconf->knc_protofmly != NULL)
4711                                 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4712                         if (knconf->knc_proto != NULL)
4713                                 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4714                         kmem_free(knconf, sizeof (*knconf));
4715                 }
4716                 knconf = svp->sv_origknconf;
4717                 if (knconf != NULL) {
4718                         if (knconf->knc_protofmly != NULL)
4719                                 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4720                         if (knconf->knc_proto != NULL)
4721                                 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4722                         kmem_free(knconf, sizeof (*knconf));
4723                 }
4724                 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4725                         kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4726                 mutex_destroy(&svp->sv_lock);
4727                 kmem_free(svp, sizeof (*svp));
4728                 svp = next;
4729         }
4730 }
4731 
4732 /*
4733  * Can only return non-zero if intr != 0.
4734  */
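/*
 * nfs_rwlock_t state, as used by the routines below (a summary inferred
 * from the code, not a formal contract):
 *   count > 0   number of active readers
 *   count < 0   held by a writer; -count is the recursion depth
 *   owner       the thread holding the write lock, or NULL
 *   waiters     number of threads waiting to take the write lock
 */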
4735 int
4736 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4737 {
4738 
4739         mutex_enter(&l->lock);
4740 
4741         /*
4742          * If this is a nested enter, then allow it.  There
4743          * must be as many exits as there are enters.
4744          */
4745         if (l->owner == curthread) {
4746                 /* lock is held for writing by current thread */
4747                 ASSERT(rw == RW_READER || rw == RW_WRITER);
4748                 l->count--;
4749         } else if (rw == RW_READER) {
4750                 /*
4751                  * While there is a writer active or writers waiting,
4752                  * then wait for them to finish up and move on.  Then,
4753                  * increment the count to indicate that a reader is
4754                  * active.
4755                  */
4756                 while (l->count < 0 || l->waiters > 0) {
4757                         if (intr) {
4758                                 klwp_t *lwp = ttolwp(curthread);
4759 
4760                                 if (lwp != NULL)
4761                                         lwp->lwp_nostop++;
4762                                 if (!cv_wait_sig(&l->cv, &l->lock)) {
4763                                         if (lwp != NULL)
4764                                                 lwp->lwp_nostop--;
4765                                         mutex_exit(&l->lock);
4766                                         return (EINTR);
4767                                 }
4768                                 if (lwp != NULL)
4769                                         lwp->lwp_nostop--;
4770                         } else
4771                                 cv_wait(&l->cv, &l->lock);
4772                 }
4773                 ASSERT(l->count < INT_MAX);
4774 #ifdef  DEBUG
4775                 if ((l->count % 10000) == 9999)
4776                         cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4777                             "rwlock @ %p\n", l->count, (void *)l);
4778 #endif
4779                 l->count++;
4780         } else {
4781                 ASSERT(rw == RW_WRITER);
4782                 /*
4783                  * While there are readers active or a writer
4784                  * active, then wait for all of the readers
4785                  * to finish or for the writer to finish.
4786                  * Then, set the owner field to curthread and
4787                  * decrement count to indicate that a writer
4788                  * is active.
4789                  */
4790                 while (l->count > 0 || l->owner != NULL) {
4791                         l->waiters++;
4792                         if (intr) {
4793                                 klwp_t *lwp = ttolwp(curthread);
4794 
4795                                 if (lwp != NULL)
4796                                         lwp->lwp_nostop++;
4797                                 if (!cv_wait_sig(&l->cv, &l->lock)) {
4798                                         if (lwp != NULL)
4799                                                 lwp->lwp_nostop--;
4800                                         l->waiters--;
4801                                         cv_broadcast(&l->cv);
4802                                         mutex_exit(&l->lock);
4803                                         return (EINTR);
4804                                 }
4805                                 if (lwp != NULL)
4806                                         lwp->lwp_nostop--;
4807                         } else
4808                                 cv_wait(&l->cv, &l->lock);
4809                         l->waiters--;
4810                 }
4811                 l->owner = curthread;
4812                 l->count--;
4813         }
4814 
4815         mutex_exit(&l->lock);
4816 
4817         return (0);
4818 }
4819 
4820 /*
4821  * If the lock is available, obtain it and return non-zero.  If there is
4822  * already a conflicting lock, return 0 immediately.
4823  */
4824 
4825 int
4826 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4827 {
4828         mutex_enter(&l->lock);
4829 
4830         /*
4831          * If this is a nested enter, then allow it.  There
4832          * must be as many exits as there are enters.
4833          */
4834         if (l->owner == curthread) {
4835                 /* lock is held for writing by current thread */
4836                 ASSERT(rw == RW_READER || rw == RW_WRITER);
4837                 l->count--;
4838         } else if (rw == RW_READER) {
4839                 /*
4840                  * If there is a writer active or writers waiting, deny the
4841                  * lock.  Otherwise, bump the count of readers.
4842                  */
4843                 if (l->count < 0 || l->waiters > 0) {
4844                         mutex_exit(&l->lock);
4845                         return (0);
4846                 }
4847                 l->count++;
4848         } else {
4849                 ASSERT(rw == RW_WRITER);
4850                 /*
4851                  * If there are readers active or a writer active, deny the
4852                  * lock.  Otherwise, set the owner field to curthread and
4853                  * decrement count to indicate that a writer is active.
4854                  */
4855                 if (l->count > 0 || l->owner != NULL) {
4856                         mutex_exit(&l->lock);
4857                         return (0);
4858                 }
4859                 l->owner = curthread;
4860                 l->count--;
4861         }
4862 
4863         mutex_exit(&l->lock);
4864 
4865         return (1);
4866 }
4867 
4868 void
4869 nfs_rw_exit(nfs_rwlock_t *l)
4870 {
4871 
4872         mutex_enter(&l->lock);
4873         /*
4874          * If this is releasing a writer lock, then increment count to
4875          * indicate that there is one less writer active.  If this was
4876          * the last of possibly nested writer locks, then clear the owner
4877          * field as well to indicate that there is no writer active
4878          * and wake up any possible waiting writers or readers.
4879          *
4880          * If releasing a reader lock, then just decrement count to
4881          * indicate that there is one less reader active.  If this was
4882          * the last active reader and there are writer(s) waiting,
4883          * then wake them up.
4884          */
4885         if (l->owner != NULL) {
4886                 ASSERT(l->owner == curthread);
4887                 l->count++;
4888                 if (l->count == 0) {
4889                         l->owner = NULL;
4890                         cv_broadcast(&l->cv);
4891                 }
4892         } else {
4893                 ASSERT(l->count > 0);
4894                 l->count--;
4895                 if (l->count == 0 && l->waiters > 0)
4896                         cv_broadcast(&l->cv);
4897         }
4898         mutex_exit(&l->lock);
4899 }
4900 
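/*
 * Note: the check below reads l->count without taking l->lock, so it gives
 * only an advisory answer (suitable for ASSERTs), not synchronization.
 */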
4901 int
4902 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4903 {
4904 
4905         if (rw == RW_READER)
4906                 return (l->count > 0);
4907         ASSERT(rw == RW_WRITER);
4908         return (l->count < 0);
4909 }
4910 
4911 /* ARGSUSED */
4912 void
4913 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4914 {
4915 
4916         l->count = 0;
4917         l->waiters = 0;
4918         l->owner = NULL;
4919         mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4920         cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4921 }
4922 
4923 void
4924 nfs_rw_destroy(nfs_rwlock_t *l)
4925 {
4926 
4927         mutex_destroy(&l->lock);
4928         cv_destroy(&l->cv);
4929 }
4930 
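/*
 * Comparators for the readdir cache: entries are ordered by cookie first
 * and then by buffer length, so entries with the same cookie but different
 * buffer sizes can coexist.
 */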
4931 int
4932 nfs3_rddir_compar(const void *x, const void *y)
4933 {
4934         rddir_cache *a = (rddir_cache *)x;
4935         rddir_cache *b = (rddir_cache *)y;
4936 
4937         if (a->nfs3_cookie == b->nfs3_cookie) {
4938                 if (a->buflen == b->buflen)
4939                         return (0);
4940                 if (a->buflen < b->buflen)
4941                         return (-1);
4942                 return (1);
4943         }
4944 
4945         if (a->nfs3_cookie < b->nfs3_cookie)
4946                 return (-1);
4947 
4948         return (1);
4949 }
4950 
4951 int
4952 nfs_rddir_compar(const void *x, const void *y)
4953 {
4954         rddir_cache *a = (rddir_cache *)x;
4955         rddir_cache *b = (rddir_cache *)y;
4956 
4957         if (a->nfs_cookie == b->nfs_cookie) {
4958                 if (a->buflen == b->buflen)
4959                         return (0);
4960                 if (a->buflen < b->buflen)
4961                         return (-1);
4962                 return (1);
4963         }
4964 
4965         if (a->nfs_cookie < b->nfs_cookie)
4966                 return (-1);
4967 
4968         return (1);
4969 }
4970 
4971 static char *
4972 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4973 {
4974         servinfo_t *s;
4975         char *srvnames;
4976         char *namep;
4977         size_t length;
4978 
4979         /*
4980          * Calculate the length of the string required to hold all
4981          * of the server names plus either a comma or a null
4982          * character following each individual one.
4983          */
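        /*
         * Illustrative example: for hostnames "srv1" and "srv2" (each with
         * sv_hostnamelen 5, including the terminating NUL), length is 10 and
         * the buffer built below ends up holding "srv1,srv2\0".
         */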
4984         length = 0;
4985         for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4986                 length += s->sv_hostnamelen;
4987 
4988         srvnames = kmem_alloc(length, KM_SLEEP);
4989 
4990         namep = srvnames;
4991         for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4992                 (void) strcpy(namep, s->sv_hostname);
4993                 namep += s->sv_hostnamelen - 1;
4994                 *namep++ = ',';
4995         }
4996         *--namep = '\0';
4997 
4998         *len = length;
4999 
5000         return (srvnames);
5001 }
5002 
5003 /*
5004  * These two functions are temporary and designed for the upgrade-workaround
5005  * only.  They cannot be used for general zone-crossing NFS client support, and
5006  * will be removed shortly.
5007  *
5008  * When the workaround is enabled, all NFS traffic is forced into the global
5009  * zone.  These functions are called when the code needs to refer to the state
5010  * of the underlying network connection.  They're not called when the function
5011  * needs to refer to the state of the process that invoked the system call.
5012  * (E.g., when checking whether the zone is shutting down during the mount()
5013  * call.)
5014  */
5015 
5016 struct zone *
5017 nfs_zone(void)
5018 {
5019         return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5020 }
5021 
5022 zoneid_t
5023 nfs_zoneid(void)
5024 {
5025         return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5026 }
5027 
5028 /*
5029  * nfs_mount_label_policy:
5030  *      Determine whether the mount is allowed according to MAC check,
5031  *      by comparing (where appropriate) label of the remote server
5032  *      against the label of the zone being mounted into.
5033  *
5034  *      Returns:
5035  *               0 :    access allowed
5036  *              -1 :    read-only access allowed (i.e., read-down)
5037  *              >0 :    error code, such as EACCES
5038  */
5039 int
5040 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5041     struct knetconfig *knconf, cred_t *cr)
5042 {
5043         int             addr_type;
5044         void            *ipaddr;
5045         bslabel_t       *server_sl, *mntlabel;
5046         zone_t          *mntzone = NULL;
5047         ts_label_t      *zlabel;
5048         tsol_tpc_t      *tp;
5049         ts_label_t      *tsl = NULL;
5050         int             retv;
5051 
5052         /*
5053          * Get the zone's label.  Each zone on a labeled system has a label.
5054          */
5055         mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5056         zlabel = mntzone->zone_slabel;
5057         ASSERT(zlabel != NULL);
5058         label_hold(zlabel);
5059 
5060         if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5061                 addr_type = IPV4_VERSION;
5062                 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5063         } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5064                 addr_type = IPV6_VERSION;
5065                 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5066         } else {
5067                 retv = 0;
5068                 goto out;
5069         }
5070 
5071         retv = EACCES;                          /* assume the worst */
5072 
5073         /*
5074          * Next, get the assigned label of the remote server.
5075          */
5076         tp = find_tpc(ipaddr, addr_type, B_FALSE);
5077         if (tp == NULL)
5078                 goto out;                       /* error getting host entry */
5079 
5080         if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5081                 goto rel_tpc;                   /* invalid domain */
5082         if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5083             (tp->tpc_tp.host_type != UNLABELED))
5084                 goto rel_tpc;                   /* invalid hosttype */
5085 
5086         if (tp->tpc_tp.host_type == SUN_CIPSO) {
5087                 tsl = getflabel_cipso(vfsp);
5088                 if (tsl == NULL)
5089                         goto rel_tpc;           /* error getting server lbl */
5090 
5091                 server_sl = label2bslabel(tsl);
5092         } else {        /* UNLABELED */
5093                 server_sl = &tp->tpc_tp.tp_def_label;
5094         }
5095 
5096         mntlabel = label2bslabel(zlabel);
5097 
5098         /*
5099          * Now compare labels to complete the MAC check.  If the labels
5100          * are equal or if the requestor is in the global zone and has
5101          * NET_MAC_AWARE, then allow read-write access.   (Except for
5102          * mounts into the global zone itself; restrict these to
5103          * read-only.)
5104          *
5105          * If the requestor is in some other zone, but his label
5106          * dominates the server, then allow read-down.
5107          *
5108          * Otherwise, access is denied.
5109          */
5110         if (blequal(mntlabel, server_sl) ||
5111             (crgetzoneid(cr) == GLOBAL_ZONEID &&
5112             getpflags(NET_MAC_AWARE, cr) != 0)) {
5113                 if ((mntzone == global_zone) ||
5114                     !blequal(mntlabel, server_sl))
5115                         retv = -1;              /* read-only */
5116                 else
5117                         retv = 0;               /* access OK */
5118         } else if (bldominates(mntlabel, server_sl)) {
5119                 retv = -1;                      /* read-only */
5120         } else {
5121                 retv = EACCES;
5122         }
5123 
5124         if (tsl != NULL)
5125                 label_rele(tsl);
5126 
5127 rel_tpc:
5128         TPC_RELE(tp);
5129 out:
5130         if (mntzone)
5131                 zone_rele(mntzone);
5132         label_rele(zlabel);
5133         return (retv);
5134 }
5135 
5136 boolean_t
5137 nfs_has_ctty(void)
5138 {
5139         boolean_t rv;
5140         mutex_enter(&curproc->p_splock);
5141         rv = (curproc->p_sessp->s_vp != NULL);
5142         mutex_exit(&curproc->p_splock);
5143         return (rv);
5144 }
5145 
5146 /*
5147  * Look in the xattr directory to see if it has any generic user attributes
5148  */
5149 int
5150 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5151 {
5152         struct uio uio;
5153         struct iovec iov;
5154         char *dbuf;
5155         struct dirent64 *dp;
5156         size_t dlen = 8 * 1024;
5157         size_t dbuflen;
5158         int eof = 0;
5159         int error;
5160 
5161         *valp = 0;
5162         dbuf = kmem_alloc(dlen, KM_SLEEP);
5163         uio.uio_iov = &iov;
5164         uio.uio_iovcnt = 1;
5165         uio.uio_segflg = UIO_SYSSPACE;
5166         uio.uio_fmode = 0;
5167         uio.uio_extflg = UIO_COPY_CACHED;
5168         uio.uio_loffset = 0;
5169         uio.uio_resid = dlen;
5170         iov.iov_base = dbuf;
5171         iov.iov_len = dlen;
5172         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5173         error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5174         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5175 
5176         dbuflen = dlen - uio.uio_resid;
5177 
5178         if (error || dbuflen == 0) {
5179                 kmem_free(dbuf, dlen);
5180                 return (error);
5181         }
5182 
5183         dp = (dirent64_t *)dbuf;
5184 
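        /*
         * Walk the entries returned by the single VOP_READDIR call above
         * (at most dlen bytes of them).  Skip ".", ".." and the system
         * views; any other name is a generic user attribute, so set *valp
         * and stop looking.
         */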
5185         while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5186                 if (strcmp(dp->d_name, ".") == 0 ||
5187                     strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5188                     VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5189                     VIEW_READONLY) == 0) {
5190                         dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5191                         continue;
5192                 }
5193 
5194                 *valp = 1;
5195                 break;
5196         }
5197         kmem_free(dbuf, dlen);
5198         return (0);
5199 }