1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 #include <sys/param.h>
  32 #include <sys/types.h>
  33 #include <sys/systm.h>
  34 #include <sys/cred.h>
  35 #include <sys/buf.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vnode.h>
  38 #include <sys/uio.h>
  39 #include <sys/stat.h>
  40 #include <sys/errno.h>
  41 #include <sys/sysmacros.h>
  42 #include <sys/statvfs.h>
  43 #include <sys/kmem.h>
  44 #include <sys/kstat.h>
  45 #include <sys/dirent.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/debug.h>
  48 #include <sys/vtrace.h>
  49 #include <sys/mode.h>
  50 #include <sys/acl.h>
  51 #include <sys/nbmlock.h>
  52 #include <sys/policy.h>
  53 #include <sys/sdt.h>
  54 
  55 #include <rpc/types.h>
  56 #include <rpc/auth.h>
  57 #include <rpc/svc.h>
  58 
  59 #include <nfs/nfs.h>
  60 #include <nfs/export.h>
  61 #include <nfs/nfs_cmd.h>
  62 
  63 #include <vm/hat.h>
  64 #include <vm/as.h>
  65 #include <vm/seg.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/seg_kmem.h>
  68 
  69 #include <sys/strsubr.h>
  70 
  71 /*
  72  * These are the interface routines for the server side of the
  73  * Network File System.  See the NFS version 2 protocol specification
  74  * for a description of this interface.
  75  */
  76 
  77 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  78 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  79                         cred_t *);
  80 
  81 /*
  82  * Some "over the wire" UNIX file types.  These are encoded
  83  * into the mode.  This needs to be fixed in the next rev.
  84  */
  85 #define IFMT            0170000         /* type of file */
  86 #define IFCHR           0020000         /* character special */
  87 #define IFBLK           0060000         /* block special */
  88 #define IFSOCK          0140000         /* socket */
  89 
  90 u_longlong_t nfs2_srv_caller_id;
  91 
  92 /*
  93  * Get file attributes.
  94  * Returns the current attributes of the file with the given fhandle.
  95  */
  96 /* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the file handle into a held vnode. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		/* Adjust the reported mode bits for any ACL on the file. */
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	/* Drop the hold acquired by nfs_fhtovp(). */
	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
 132 void *
 133 rfs_getattr_getfh(fhandle_t *fhp)
 134 {
 135         return (fhp);
 136 }
 137 
 138 /*
 139  * Set file attributes.
 140  * Sets the attributes of the file with the given fhandle.  Returns
 141  * the new attributes.
 142  */
 143 /* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* flags passed through to VOP_SETATTR */
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes the client asked us to set */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;	/* region description handed to VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* Attribute changes are refused on read-only exports. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a vattr for the VOPs. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * Identify this caller to the lock/delegation machinery and ask
	 * that delegation conflicts not block (CC_DONTBLOCK): on conflict
	 * the VOP fails with EAGAIN and CC_WOULDBLOCK is set in cc_flags.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region affected by the size change is the span
			 * between the old size and the new one, whichever
			 * direction the file is growing or shrinking.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner gets the VOP_SPACE bypass described above;
		 * for everyone else AT_SIZE stays in va_mask and the normal
		 * access-checked VOP_SETATTR below performs the truncation.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
 325 void *
 326 rfs_setattr_getfh(struct nfssaargs *args)
 327 {
 328         return (&args->saa_fh);
 329 }
 330 
 331 /*
 332  * Directory lookup.
 333  * Returns an fhandle and file attributes for file name in a directory.
 334  */
 335 /* ARGSUSED */
 336 void
 337 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 338     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 339 {
 340         int error;
 341         vnode_t *dvp;
 342         vnode_t *vp;
 343         struct vattr va;
 344         fhandle_t *fhp = da->da_fhandle;
 345         struct sec_ol sec = {0, 0};
 346         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 347         char *name;
 348         struct sockaddr *ca;
 349 
 350         /*
 351          * Trusted Extension doesn't support NFSv2. MOUNT
 352          * will reject v2 clients. Need to prevent v2 client
 353          * access via WebNFS here.
 354          */
 355         if (is_system_labeled() && req->rq_vers == 2) {
 356                 dr->dr_status = NFSERR_ACCES;
 357                 return;
 358         }
 359 
 360         /*
 361          * Disallow NULL paths
 362          */
 363         if (da->da_name == NULL || *da->da_name == '\0') {
 364                 dr->dr_status = NFSERR_ACCES;
 365                 return;
 366         }
 367 
 368         /*
 369          * Allow lookups from the root - the default
 370          * location of the public filehandle.
 371          */
 372         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 373                 dvp = rootdir;
 374                 VN_HOLD(dvp);
 375         } else {
 376                 dvp = nfs_fhtovp(fhp, exi);
 377                 if (dvp == NULL) {
 378                         dr->dr_status = NFSERR_STALE;
 379                         return;
 380                 }
 381         }
 382 
 383         /*
 384          * Not allow lookup beyond root.
 385          * If the filehandle matches a filehandle of the exi,
 386          * then the ".." refers beyond the root of an exported filesystem.
 387          */
 388         if (strcmp(da->da_name, "..") == 0 &&
 389             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 390                 VN_RELE(dvp);
 391                 dr->dr_status = NFSERR_NOENT;
 392                 return;
 393         }
 394 
 395         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 396         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 397             MAXPATHLEN);
 398 
 399         if (name == NULL) {
 400                 dr->dr_status = NFSERR_ACCES;
 401                 return;
 402         }
 403 
 404         /*
 405          * If the public filehandle is used then allow
 406          * a multi-component lookup, i.e. evaluate
 407          * a pathname and follow symbolic links if
 408          * necessary.
 409          *
 410          * This may result in a vnode in another filesystem
 411          * which is OK as long as the filesystem is exported.
 412          */
 413         if (PUBLIC_FH2(fhp)) {
 414                 publicfh_flag = TRUE;
 415                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 416                     &sec);
 417         } else {
 418                 /*
 419                  * Do a normal single component lookup.
 420                  */
 421                 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 422                     NULL, NULL, NULL);
 423         }
 424 
 425         if (name != da->da_name)
 426                 kmem_free(name, MAXPATHLEN);
 427 
 428 
 429         if (!error) {
 430                 va.va_mask = AT_ALL;    /* we want everything */
 431 
 432                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 433 
 434                 /* check for overflows */
 435                 if (!error) {
 436                         acl_perm(vp, exi, &va, cr);
 437                         error = vattr_to_nattr(&va, &dr->dr_attr);
 438                         if (!error) {
 439                                 if (sec.sec_flags & SEC_QUERY)
 440                                         error = makefh_ol(&dr->dr_fhandle, exi,
 441                                             sec.sec_index);
 442                                 else {
 443                                         error = makefh(&dr->dr_fhandle, vp,
 444                                             exi);
 445                                         if (!error && publicfh_flag &&
 446                                             !chk_clnt_sec(exi, req))
 447                                                 auth_weak = TRUE;
 448                                 }
 449                         }
 450                 }
 451                 VN_RELE(vp);
 452         }
 453 
 454         VN_RELE(dvp);
 455 
 456         /*
 457          * If publicfh_flag is true then we have called rfs_publicfh_mclookup
 458          * and have obtained a new exportinfo in exi which needs to be
 459          * released. Note the the original exportinfo pointed to by exi
 460          * will be released by the caller, comon_dispatch.
 461          */
 462         if (publicfh_flag && exi != NULL)
 463                 exi_rele(exi);
 464 
 465         /*
 466          * If it's public fh, no 0x81, and client's flavor is
 467          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 468          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 469          */
 470         if (auth_weak)
 471                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 472         else
 473                 dr->dr_status = puterrno(error);
 474 }
 475 void *
 476 rfs_lookup_getfh(struct nfsdiropargs *da)
 477 {
 478         return (da->da_fhandle);
 479 }
 480 
 481 /*
 482  * Read symbolic link.
 483  * Returns the string in the symbolic link at the given fhandle.
 484  */
 485 /* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* serving a reparse point as a symlink? */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link text fills the entire buffer
		 * (rl_count == NFS_MAXPATHLEN) this NUL write lands one byte
		 * past the kmem_alloc'd region — confirm that VOP_READLINK
		 * targets are bounded below NFS_MAXPATHLEN.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Translate the link text to the client's character set, if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
 611 void *
 612 rfs_readlink_getfh(fhandle_t *fhp)
 613 {
 614         return (fhp);
 615 }
 616 /*
 617  * Free data allocated by rfs_readlink
 618  */
 619 void
 620 rfs_rlfree(struct nfsrdlnres *rl)
 621 {
 622         if (rl->rl_data != NULL)
 623                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 624 }
 625 
 626 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 627 
 628 /*
 629  * Read data.
 630  * Returns some data read from the file at the given fhandle.
 631  */
 632 /* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;		/* reply data buffer (non-RDMA path) */
	int alloc_err = 0;
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	/* NFSv2 READ is only defined on regular files. */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	/*
	 * Identify this caller to the lock/delegation machinery and ask
	 * that delegation conflicts fail with EAGAIN/CC_WOULDBLOCK
	 * instead of blocking.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	/* Mandatory locking on the file makes it unreadable over NFSv2. */
	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* A read starting at or past EOF succeeds with zero bytes. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA transfer: read directly into the client's chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for his cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* Bytes actually read = requested count minus what was left over. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	/* Adjust the reported mode bits for any ACL on the file. */
	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
 880 
 881 /*
 882  * Free data allocated by rfs_read
 883  */
 884 void
 885 rfs_rdfree(struct nfsrdresult *rr)
 886 {
 887         mblk_t *mp;
 888 
 889         if (rr->rr_status == NFS_OK) {
 890                 mp = rr->rr_mp;
 891                 if (mp != NULL)
 892                         freeb(mp);
 893         }
 894 }
 895 
 896 void *
 897 rfs_read_getfh(struct nfsreadargs *ra)
 898 {
 899         return (&ra->ra_fhandle);
 900 }
 901 
/*
 * Maximum number of iovecs kept on the stack by rfs_write_sync; longer
 * mblk chains fall back to a kmem_alloc'd iovec array.
 */
#define MAX_IOVECS      12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;	/* chains that fit in iov[] */
static int rfs_write_sync_misses = 0;	/* chains needing kmem_alloc */
#endif
 908 
 909 /*
 910  * Write data to file.
 911  * Returns attributes of a file after writing some data to it.
 912  *
 913  * Any changes made here, especially in error handling might have
 914  * to also be done in rfs_write (which clusters write requests).
 915  */
 916 /* ARGSUSED */
 917 void
 918 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
 919     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 920 {
 921         int error;
 922         vnode_t *vp;
 923         rlim64_t rlimit;
 924         struct vattr va;
 925         struct uio uio;
 926         struct iovec iov[MAX_IOVECS];
 927         mblk_t *m;
 928         struct iovec *iovp;
 929         int iovcnt;
 930         cred_t *savecred;
 931         int in_crit = 0;
 932         caller_context_t ct;
 933 
 934         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
 935         if (vp == NULL) {
 936                 ns->ns_status = NFSERR_STALE;
 937                 return;
 938         }
 939 
 940         if (rdonly(ro, vp)) {
 941                 VN_RELE(vp);
 942                 ns->ns_status = NFSERR_ROFS;
 943                 return;
 944         }
 945 
 946         if (vp->v_type != VREG) {
 947                 VN_RELE(vp);
 948                 ns->ns_status = NFSERR_ISDIR;
 949                 return;
 950         }
 951 
 952         ct.cc_sysid = 0;
 953         ct.cc_pid = 0;
 954         ct.cc_caller_id = nfs2_srv_caller_id;
 955         ct.cc_flags = CC_DONTBLOCK;
 956 
 957         va.va_mask = AT_UID|AT_MODE;
 958 
 959         error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 960 
 961         if (error) {
 962                 VN_RELE(vp);
 963                 ns->ns_status = puterrno(error);
 964 
 965                 return;
 966         }
 967 
 968         if (crgetuid(cr) != va.va_uid) {
 969                 /*
 970                  * This is a kludge to allow writes of files created
 971                  * with read only permission.  The owner of the file
 972                  * is always allowed to write it.
 973                  */
 974                 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
 975 
 976                 if (error) {
 977                         VN_RELE(vp);
 978                         ns->ns_status = puterrno(error);
 979                         return;
 980                 }
 981         }
 982 
 983         /*
 984          * Can't access a mandatory lock file.  This might cause
 985          * the NFS service thread to block forever waiting for a
 986          * lock to be released that will never be released.
 987          */
 988         if (MANDLOCK(vp, va.va_mode)) {
 989                 VN_RELE(vp);
 990                 ns->ns_status = NFSERR_ACCES;
 991                 return;
 992         }
 993 
 994         /*
 995          * We have to enter the critical region before calling VOP_RWLOCK
 996          * to avoid a deadlock with ufs.
 997          */
 998         if (nbl_need_check(vp)) {
 999                 nbl_start_crit(vp, RW_READER);
1000                 in_crit = 1;
1001                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1002                     wa->wa_count, 0, NULL)) {
1003                         error = EACCES;
1004                         goto out;
1005                 }
1006         }
1007 
1008         error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1009 
1010         /* check if a monitor detected a delegation conflict */
1011         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1012                 VN_RELE(vp);
1013                 /* mark as wouldblock so response is dropped */
1014                 curthread->t_flag |= T_WOULDBLOCK;
1015                 return;
1016         }
1017 
1018         if (wa->wa_data || wa->wa_rlist) {
1019                 /* Do the RDMA thing if necessary */
1020                 if (wa->wa_rlist) {
1021                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1022                         iov[0].iov_len = wa->wa_count;
1023                 } else  {
1024                         iov[0].iov_base = wa->wa_data;
1025                         iov[0].iov_len = wa->wa_count;
1026                 }
1027                 uio.uio_iov = iov;
1028                 uio.uio_iovcnt = 1;
1029                 uio.uio_segflg = UIO_SYSSPACE;
1030                 uio.uio_extflg = UIO_COPY_DEFAULT;
1031                 uio.uio_loffset = (offset_t)wa->wa_offset;
1032                 uio.uio_resid = wa->wa_count;
1033                 /*
1034                  * The limit is checked on the client. We
1035                  * should allow any size writes here.
1036                  */
1037                 uio.uio_llimit = curproc->p_fsz_ctl;
1038                 rlimit = uio.uio_llimit - wa->wa_offset;
1039                 if (rlimit < (rlim64_t)uio.uio_resid)
1040                         uio.uio_resid = (uint_t)rlimit;
1041 
1042                 /*
1043                  * for now we assume no append mode
1044                  */
1045                 /*
1046                  * We're changing creds because VM may fault and we need
1047                  * the cred of the current thread to be used if quota
1048                  * checking is enabled.
1049                  */
1050                 savecred = curthread->t_cred;
1051                 curthread->t_cred = cr;
1052                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1053                 curthread->t_cred = savecred;
1054         } else {
1055                 iovcnt = 0;
1056                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1057                         iovcnt++;
1058                 if (iovcnt <= MAX_IOVECS) {
1059 #ifdef DEBUG
1060                         rfs_write_sync_hits++;
1061 #endif
1062                         iovp = iov;
1063                 } else {
1064 #ifdef DEBUG
1065                         rfs_write_sync_misses++;
1066 #endif
1067                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1068                 }
1069                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1070                 uio.uio_iov = iovp;
1071                 uio.uio_iovcnt = iovcnt;
1072                 uio.uio_segflg = UIO_SYSSPACE;
1073                 uio.uio_extflg = UIO_COPY_DEFAULT;
1074                 uio.uio_loffset = (offset_t)wa->wa_offset;
1075                 uio.uio_resid = wa->wa_count;
1076                 /*
1077                  * The limit is checked on the client. We
1078                  * should allow any size writes here.
1079                  */
1080                 uio.uio_llimit = curproc->p_fsz_ctl;
1081                 rlimit = uio.uio_llimit - wa->wa_offset;
1082                 if (rlimit < (rlim64_t)uio.uio_resid)
1083                         uio.uio_resid = (uint_t)rlimit;
1084 
1085                 /*
1086                  * For now we assume no append mode.
1087                  */
1088                 /*
1089                  * We're changing creds because VM may fault and we need
1090                  * the cred of the current thread to be used if quota
1091                  * checking is enabled.
1092                  */
1093                 savecred = curthread->t_cred;
1094                 curthread->t_cred = cr;
1095                 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1096                 curthread->t_cred = savecred;
1097 
1098                 if (iovp != iov)
1099                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1100         }
1101 
1102         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1103 
1104         if (!error) {
1105                 /*
1106                  * Get attributes again so we send the latest mod
1107                  * time to the client side for his cache.
1108                  */
1109                 va.va_mask = AT_ALL;    /* now we want everything */
1110 
1111                 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1112 
1113                 /* check for overflows */
1114                 if (!error) {
1115                         acl_perm(vp, exi, &va, cr);
1116                         error = vattr_to_nattr(&va, &ns->ns_attr);
1117                 }
1118         }
1119 
1120 out:
1121         if (in_crit)
1122                 nbl_end_crit(vp);
1123         VN_RELE(vp);
1124 
1125         /* check if a monitor detected a delegation conflict */
1126         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1127                 /* mark as wouldblock so response is dropped */
1128                 curthread->t_flag |= T_WOULDBLOCK;
1129         else
1130                 ns->ns_status = puterrno(error);
1131 
1132 }
1133 
/*
 * One queued NFSv2 write request, linked into a per-file cluster
 * (see struct rfs_async_write_list).  Each entry records the request
 * arguments and reply area plus the server thread waiting on it.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* client write arguments */
	struct nfsattrstat *ns;		/* reply: status + post-op attrs */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* credentials of this request */
	bool_t ro;			/* export is read-only to client */
	kthread_t *thread;		/* thread blocked on this request */
	struct rfs_async_write *list;	/* next request in same cluster */
};
1143 
/*
 * A cluster of pending write requests for a single file, keyed by
 * file handle.  Threads that join an existing cluster sleep on cv
 * until the owning thread fills in their ns_status.  All cluster
 * state is protected by rfs_async_write_lock.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by cluster */
	kcondvar_t cv;			/* broadcast when cluster is done */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next active cluster */
};
1150 
/* Head of the list of active write clusters; rfs_async_write_lock guards it. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs placed on the stack when gathering a cluster. */
#define MAXCLIOVECS	42
/* Sentinel "status not yet filled in"; 0 cannot be used since 0 == NFS_OK. */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;		/* gathers that fit in iov[] */
static int rfs_write_misses = 0;	/* gathers needing kmem_alloc */
#endif
1162 
1163 /*
1164  * Write data to file.
1165  * Returns attributes of a file after writing some data to it.
1166  */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;

	/* If write clustering is disabled, just do the write synchronously. */
	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	/* Build our cluster entry on the stack; it lives until we return. */
	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/*
		 * Sleep until the thread owning the cluster fills in
		 * our status; the sentinel value marks "not yet done".
		 */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		/*
		 * Unlink the cluster and propagate T_WOULDBLOCK to every
		 * still-pending request so their responses are dropped too.
		 */
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		/* Grow the cluster's byte range to cover this request. */
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop extending the run at a failed request or at
			 * the first gap in offsets; lrp ends up pointing one
			 * past the last request included in this VOP_WRITE.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else  {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					/* clamp to the request's wa_count */
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for his cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/*
	 * Wake every thread still waiting on this cluster, propagating
	 * the final error and the wouldblock flag where appropriate.
	 */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

}
1659 
/*
 * Return a pointer to the file handle embedded in the WRITE arguments;
 * used by the dispatch code to locate the export for this request.
 */
void *
rfs_write_getfh(struct nfswriteargs *wa)
{
        return (&wa->wa_fhandle);
}
1665 
1666 /*
1667  * Create a file.
1668  * Creates a file with given attributes and returns those attributes
1669  * and an fhandle for the new file.
1670  */
1671 void
1672 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1673     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1674 {
1675         int error;
1676         int lookuperr;
1677         int in_crit = 0;
1678         struct vattr va;
1679         vnode_t *vp;
1680         vnode_t *realvp;
1681         vnode_t *dvp;
1682         char *name = args->ca_da.da_name;
1683         vnode_t *tvp = NULL;
1684         int mode;
1685         int lookup_ok;
1686         bool_t trunc;
1687         struct sockaddr *ca;
1688 
1689         /*
1690          * Disallow NULL paths
1691          */
1692         if (name == NULL || *name == '\0') {
1693                 dr->dr_status = NFSERR_ACCES;
1694                 return;
1695         }
1696 
1697         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1698         if (dvp == NULL) {
1699                 dr->dr_status = NFSERR_STALE;
1700                 return;
1701         }
1702 
1703         error = sattr_to_vattr(args->ca_sa, &va);
1704         if (error) {
1705                 dr->dr_status = puterrno(error);
1706                 return;
1707         }
1708 
1709         /*
1710          * Must specify the mode.
1711          */
1712         if (!(va.va_mask & AT_MODE)) {
1713                 VN_RELE(dvp);
1714                 dr->dr_status = NFSERR_INVAL;
1715                 return;
1716         }
1717 
1718         /*
1719          * This is a completely gross hack to make mknod
1720          * work over the wire until we can wack the protocol
1721          */
1722         if ((va.va_mode & IFMT) == IFCHR) {
1723                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1724                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1725                 else {
1726                         va.va_type = VCHR;
1727                         /*
1728                          * uncompress the received dev_t
1729                          * if the top half is zero indicating a request
1730                          * from an `older style' OS.
1731                          */
1732                         if ((va.va_size & 0xffff0000) == 0)
1733                                 va.va_rdev = nfsv2_expdev(va.va_size);
1734                         else
1735                                 va.va_rdev = (dev_t)va.va_size;
1736                 }
1737                 va.va_mask &= ~AT_SIZE;
1738         } else if ((va.va_mode & IFMT) == IFBLK) {
1739                 va.va_type = VBLK;
1740                 /*
1741                  * uncompress the received dev_t
1742                  * if the top half is zero indicating a request
1743                  * from an `older style' OS.
1744                  */
1745                 if ((va.va_size & 0xffff0000) == 0)
1746                         va.va_rdev = nfsv2_expdev(va.va_size);
1747                 else
1748                         va.va_rdev = (dev_t)va.va_size;
1749                 va.va_mask &= ~AT_SIZE;
1750         } else if ((va.va_mode & IFMT) == IFSOCK) {
1751                 va.va_type = VSOCK;
1752         } else {
1753                 va.va_type = VREG;
1754         }
1755         va.va_mode &= ~IFMT;
1756         va.va_mask |= AT_TYPE;
1757 
1758         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1759         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1760             MAXPATHLEN);
1761         if (name == NULL) {
1762                 dr->dr_status = puterrno(EINVAL);
1763                 return;
1764         }
1765 
1766         /*
1767          * Why was the choice made to use VWRITE as the mode to the
1768          * call to VOP_CREATE ? This results in a bug.  When a client
1769          * opens a file that already exists and is RDONLY, the second
1770          * open fails with an EACESS because of the mode.
1771          * bug ID 1054648.
1772          */
1773         lookup_ok = 0;
1774         mode = VWRITE;
1775         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1776                 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1777                     NULL, NULL, NULL);
1778                 if (!error) {
1779                         struct vattr at;
1780 
1781                         lookup_ok = 1;
1782                         at.va_mask = AT_MODE;
1783                         error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1784                         if (!error)
1785                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1786                         VN_RELE(tvp);
1787                         tvp = NULL;
1788                 }
1789         }
1790 
1791         if (!lookup_ok) {
1792                 if (rdonly(ro, dvp)) {
1793                         error = EROFS;
1794                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1795                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1796                         error = EPERM;
1797                 } else {
1798                         error = 0;
1799                 }
1800         }
1801 
1802         /*
1803          * If file size is being modified on an already existing file
1804          * make sure that there are no conflicting non-blocking mandatory
1805          * locks in the region being manipulated. Return EACCES if there
1806          * are conflicting locks.
1807          */
1808         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1809                 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1810                     NULL, NULL, NULL);
1811 
1812                 if (!lookuperr &&
1813                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1814                         VN_RELE(tvp);
1815                         curthread->t_flag |= T_WOULDBLOCK;
1816                         goto out;
1817                 }
1818 
1819                 if (!lookuperr && nbl_need_check(tvp)) {
1820                         /*
1821                          * The file exists. Now check if it has any
1822                          * conflicting non-blocking mandatory locks
1823                          * in the region being changed.
1824                          */
1825                         struct vattr bva;
1826                         u_offset_t offset;
1827                         ssize_t length;
1828 
1829                         nbl_start_crit(tvp, RW_READER);
1830                         in_crit = 1;
1831 
1832                         bva.va_mask = AT_SIZE;
1833                         error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1834                         if (!error) {
1835                                 if (va.va_size < bva.va_size) {
1836                                         offset = va.va_size;
1837                                         length = bva.va_size - va.va_size;
1838                                 } else {
1839                                         offset = bva.va_size;
1840                                         length = va.va_size - bva.va_size;
1841                                 }
1842                                 if (length) {
1843                                         if (nbl_conflict(tvp, NBL_WRITE,
1844                                             offset, length, 0, NULL)) {
1845                                                 error = EACCES;
1846                                         }
1847                                 }
1848                         }
1849                         if (error) {
1850                                 nbl_end_crit(tvp);
1851                                 VN_RELE(tvp);
1852                                 in_crit = 0;
1853                         }
1854                 } else if (tvp != NULL) {
1855                         VN_RELE(tvp);
1856                 }
1857         }
1858 
1859         if (!error) {
1860                 /*
1861                  * If filesystem is shared with nosuid the remove any
1862                  * setuid/setgid bits on create.
1863                  */
1864                 if (va.va_type == VREG &&
1865                     exi->exi_export.ex_flags & EX_NOSUID)
1866                         va.va_mode &= ~(VSUID | VSGID);
1867 
1868                 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1869                     NULL, NULL);
1870 
1871                 if (!error) {
1872 
1873                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1874                                 trunc = TRUE;
1875                         else
1876                                 trunc = FALSE;
1877 
1878                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1879                                 VN_RELE(vp);
1880                                 curthread->t_flag |= T_WOULDBLOCK;
1881                                 goto out;
1882                         }
1883                         va.va_mask = AT_ALL;
1884 
1885                         error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1886 
1887                         /* check for overflows */
1888                         if (!error) {
1889                                 acl_perm(vp, exi, &va, cr);
1890                                 error = vattr_to_nattr(&va, &dr->dr_attr);
1891                                 if (!error) {
1892                                         error = makefh(&dr->dr_fhandle, vp,
1893                                             exi);
1894                                 }
1895                         }
1896                         /*
1897                          * Force modified metadata out to stable storage.
1898                          *
1899                          * if a underlying vp exists, pass it to VOP_FSYNC
1900                          */
1901                         if (VOP_REALVP(vp, &realvp, NULL) == 0)
1902                                 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1903                         else
1904                                 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1905                         VN_RELE(vp);
1906                 }
1907 
1908                 if (in_crit) {
1909                         nbl_end_crit(tvp);
1910                         VN_RELE(tvp);
1911                 }
1912         }
1913 
1914         /*
1915          * Force modified data and metadata out to stable storage.
1916          */
1917         (void) VOP_FSYNC(dvp, 0, cr, NULL);
1918 
1919 out:
1920 
1921         VN_RELE(dvp);
1922 
1923         dr->dr_status = puterrno(error);
1924 
1925         if (name != args->ca_da.da_name)
1926                 kmem_free(name, MAXPATHLEN);
1927 }
1928 void *
1929 rfs_create_getfh(struct nfscreatargs *args)
1930 {
1931         return (args->ca_da.da_fhandle);
1932 }
1933 
1934 /*
1935  * Remove a file.
1936  * Remove named file from parent directory.
1937  */
1938 /* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* directory containing the entry */
	vnode_t *targvp;	/* the entry being removed */
	int in_crit = 0;	/* nonzero while inside nbmand crit region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	/* Translate the directory file handle; takes a hold on vp. */
	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/*
	 * Enter a critical region on the target so a conflicting
	 * non-blocking mandatory lock fails the remove with EACCES.
	 */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2019 
2020 void *
2021 rfs_remove_getfh(struct nfsdiropargs *da)
2022 {
2023         return (da->da_fhandle);
2024 }
2025 
2026 /*
2027  * rename a file
2028  * Give a file (from) a new name (to).
2029  */
2030 /* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* the entry being renamed */
	vnode_t *targvp;	/* existing entry being renamed over, if any */
	int in_crit = 0;	/* nonzero while inside nbmand crit region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target must live in the same export as the source;
	 * renames across exported filesystems are refused with XDEV.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/* Only the pointer value of to_exi is compared below. */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		/* drop the request; client retransmits after recall */
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/*
	 * Enter a critical region on the source so a conflicting
	 * non-blocking mandatory lock fails the rename with EACCES.
	 */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* record the new name for /proc and friends */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2169 void *
2170 rfs_rename_getfh(struct nfsrnmargs *args)
2171 {
2172         return (args->rna_from.da_fhandle);
2173 }
2174 
2175 /*
2176  * Link to a file.
2177  * Create a file (to) which is a hard link to the given file (from).
2178  */
2179 /* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* existing file being linked to */
	vnode_t *tovp;		/* directory that gets the new entry */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The new entry must live in the same export as the source;
	 * links across exported filesystems are refused with XDEV.
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/* Only the pointer value of to_exi is compared below. */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 * Only metadata (FNODSYNC) changed on the source vnode.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2255 void *
2256 rfs_link_getfh(struct nfslinkargs *args)
2257 {
2258         return (args->la_from);
2259 }
2260 
2261 /*
2262  * Symbolicly link to a file.
2263  * Create a file (to) with the given attributes which is a symbolic link
2264  * to the given path name (to).
2265  */
2266 void
2267 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2268     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2269 {
2270         int error;
2271         struct vattr va;
2272         vnode_t *vp;
2273         vnode_t *svp;
2274         int lerror;
2275         struct sockaddr *ca;
2276         char *name = NULL;
2277 
2278         /*
2279          * Disallow NULL paths
2280          */
2281         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2282                 *status = NFSERR_ACCES;
2283                 return;
2284         }
2285 
2286         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2287         if (vp == NULL) {
2288                 *status = NFSERR_STALE;
2289                 return;
2290         }
2291 
2292         if (rdonly(ro, vp)) {
2293                 VN_RELE(vp);
2294                 *status = NFSERR_ROFS;
2295                 return;
2296         }
2297 
2298         error = sattr_to_vattr(args->sla_sa, &va);
2299         if (error) {
2300                 VN_RELE(vp);
2301                 *status = puterrno(error);
2302                 return;
2303         }
2304 
2305         if (!(va.va_mask & AT_MODE)) {
2306                 VN_RELE(vp);
2307                 *status = NFSERR_INVAL;
2308                 return;
2309         }
2310 
2311         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2312         name = nfscmd_convname(ca, exi, args->sla_tnm,
2313             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2314 
2315         if (name == NULL) {
2316                 *status = NFSERR_ACCES;
2317                 return;
2318         }
2319 
2320         va.va_type = VLNK;
2321         va.va_mask |= AT_TYPE;
2322 
2323         error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2324 
2325         /*
2326          * Force new data and metadata out to stable storage.
2327          */
2328         lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2329             NULL, cr, NULL, NULL, NULL);
2330 
2331         if (!lerror) {
2332                 (void) VOP_FSYNC(svp, 0, cr, NULL);
2333                 VN_RELE(svp);
2334         }
2335 
2336         /*
2337          * Force modified data and metadata out to stable storage.
2338          */
2339         (void) VOP_FSYNC(vp, 0, cr, NULL);
2340 
2341         VN_RELE(vp);
2342 
2343         *status = puterrno(error);
2344         if (name != args->sla_tnm)
2345                 kmem_free(name, MAXPATHLEN);
2346 
2347 }
2348 void *
2349 rfs_symlink_getfh(struct nfsslargs *args)
2350 {
2351         return (args->sla_from.da_fhandle);
2352 }
2353 
2354 /*
2355  * Make a directory.
2356  * Create a directory with the given name, parent directory, and attributes.
2357  * Returns a file handle and attributes for the new directory.
2358  */
2359 /* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent
			 * vp while va was fetched from the new directory
			 * dvp; rfs_create passes the created vnode here.
			 * Confirm this is intended.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2441 void *
2442 rfs_mkdir_getfh(struct nfscreatargs *args)
2443 {
2444         return (args->ca_da.da_fhandle);
2445 }
2446 
2447 /*
2448  * Remove a directory.
2449  * Remove the given directory name from the given parent directory.
2450  */
2451 /* ARGSUSED */
2452 void
2453 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2454     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2455 {
2456         int error;
2457         vnode_t *vp;
2458 
2459         /*
2460          * Disallow NULL paths
2461          */
2462         if (da->da_name == NULL || *da->da_name == '\0') {
2463                 *status = NFSERR_ACCES;
2464                 return;
2465         }
2466 
2467         vp = nfs_fhtovp(da->da_fhandle, exi);
2468         if (vp == NULL) {
2469                 *status = NFSERR_STALE;
2470                 return;
2471         }
2472 
2473         if (rdonly(ro, vp)) {
2474                 VN_RELE(vp);
2475                 *status = NFSERR_ROFS;
2476                 return;
2477         }
2478 
2479         /*
2480          * VOP_RMDIR takes a third argument (the current
2481          * directory of the process).  That's because someone
2482          * wants to return EINVAL if one tries to remove ".".
2483          * Of course, NFS servers have no idea what their
2484          * clients' current directories are.  We fake it by
2485          * supplying a vnode known to exist and illegal to
2486          * remove.
2487          */
2488         error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2489 
2490         /*
2491          * Force modified data and metadata out to stable storage.
2492          */
2493         (void) VOP_FSYNC(vp, 0, cr, NULL);
2494 
2495         VN_RELE(vp);
2496 
2497         /*
2498          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2499          * if the directory is not empty.  A System V NFS server
2500          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2501          * over the wire.
2502          */
2503         if (error == EEXIST)
2504                 *status = NFSERR_NOTEMPTY;
2505         else
2506                 *status = puterrno(error);
2507 
2508 }
2509 void *
2510 rfs_rmdir_getfh(struct nfsdiropargs *da)
2511 {
2512         return (da->da_fhandle);
2513 }
2514 
2515 /* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* character-set converted entries, if any */
	struct sockaddr *ca;
	size_t nents;		/* number of entries read */
	int ret;		/* entries dropped during conversion */

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* hold the directory stable while we read it */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* a zero-length request returns no entries and no EOF */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* clamp the request to the protocol's transfer-size limit */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof; an unchanged uio_resid means no
		 * entries were returned, which is reported as EOF.
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Convert entry names to the client's character set; the
	 * converted copy (if any) is returned in ndata.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* no conversion was needed; keep the original buffer */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* swap in the converted buffer and free the original */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2648 void *
2649 rfs_readdir_getfh(struct nfsrddirargs *rda)
2650 {
2651         return (&rda->rda_fh);
2652 }
2653 void
2654 rfs_rddirfree(struct nfsrddirres *rd)
2655 {
2656         if (rd->rd_entries != NULL)
2657                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2658 }
2659 
2660 /* ARGSUSED */
2661 void
2662 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2663     struct svc_req *req, cred_t *cr, bool_t ro)
2664 {
2665         int error;
2666         struct statvfs64 sb;
2667         vnode_t *vp;
2668 
2669         vp = nfs_fhtovp(fh, exi);
2670         if (vp == NULL) {
2671                 fs->fs_status = NFSERR_STALE;
2672                 return;
2673         }
2674 
2675         error = VFS_STATVFS(vp->v_vfsp, &sb);
2676 
2677         if (!error) {
2678                 fs->fs_tsize = nfstsize();
2679                 fs->fs_bsize = sb.f_frsize;
2680                 fs->fs_blocks = sb.f_blocks;
2681                 fs->fs_bfree = sb.f_bfree;
2682                 fs->fs_bavail = sb.f_bavail;
2683         }
2684 
2685         VN_RELE(vp);
2686 
2687         fs->fs_status = puterrno(error);
2688 
2689 }
2690 void *
2691 rfs_statfs_getfh(fhandle_t *fh)
2692 {
2693         return (fh);
2694 }
2695 
/*
 * Convert an NFSv2 over-the-wire sattr into a vattr suitable for
 * VOP_SETATTR.  Fields carrying the all-ones "don't set" sentinel are
 * left out of va_mask.  Returns 0 on success; on a 32-bit kernel,
 * returns EOVERFLOW if a client-supplied time does not fit in time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	/* (uint32_t)-1 is the protocol's "don't change" sentinel. */
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both halves of the timestamp must be set for it to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2762 
/*
 * Map vnode types (indexed by vtype_t, VNON..VBAD) to NFSv2
 * over-the-wire file types.  Types with no NFSv2 equivalent map to 0;
 * note VFIFO also maps to 0 here and is special-cased via NA_SETFIFO
 * in vattr_to_nattr().
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2766 
2767 /*
2768  * check the following fields for overflow: nodeid, size, and time.
2769  * There could be a problem when converting 64-bit LP64 fields
2770  * into 32-bit ones.  Return an error if there is an overflow.
2771  */
2772 int
2773 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2774 {
2775         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2776         na->na_type = vt_to_nf[vap->va_type];
2777 
2778         if (vap->va_mode == (unsigned short) -1)
2779                 na->na_mode = (uint32_t)-1;
2780         else
2781                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2782 
2783         if (vap->va_uid == (unsigned short)(-1))
2784                 na->na_uid = (uint32_t)(-1);
2785         else if (vap->va_uid == UID_NOBODY)
2786                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2787         else
2788                 na->na_uid = vap->va_uid;
2789 
2790         if (vap->va_gid == (unsigned short)(-1))
2791                 na->na_gid = (uint32_t)-1;
2792         else if (vap->va_gid == GID_NOBODY)
2793                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2794         else
2795                 na->na_gid = vap->va_gid;
2796 
2797         /*
2798          * Do we need to check fsid for overflow?  It is 64-bit in the
2799          * vattr, but are bigger than 32 bit values supported?
2800          */
2801         na->na_fsid = vap->va_fsid;
2802 
2803         na->na_nodeid = vap->va_nodeid;
2804 
2805         /*
2806          * Check to make sure that the nodeid is representable over the
2807          * wire without losing bits.
2808          */
2809         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2810                 return (EFBIG);
2811         na->na_nlink = vap->va_nlink;
2812 
2813         /*
2814          * Check for big files here, instead of at the caller.  See
2815          * comments in cstat for large special file explanation.
2816          */
2817         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2818                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2819                         return (EFBIG);
2820                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2821                         /* UNKNOWN_SIZE | OVERFLOW */
2822                         na->na_size = MAXOFF32_T;
2823                 } else
2824                         na->na_size = vap->va_size;
2825         } else
2826                 na->na_size = vap->va_size;
2827 
2828         /*
2829          * If the vnode times overflow the 32-bit times that NFS2
2830          * uses on the wire then return an error.
2831          */
2832         if (!NFS_VAP_TIME_OK(vap)) {
2833                 return (EOVERFLOW);
2834         }
2835         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2836         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2837 
2838         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2839         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2840 
2841         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2842         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2843 
2844         /*
2845          * If the dev_t will fit into 16 bits then compress
2846          * it, otherwise leave it alone. See comments in
2847          * nfs_client.c.
2848          */
2849         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2850             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2851                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2852         else
2853                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2854 
2855         na->na_blocks = vap->va_nblocks;
2856         na->na_blocksize = vap->va_blksize;
2857 
2858         /*
2859          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2860          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2861          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2862          *
2863          * BUYER BEWARE:
2864          *  If you are porting the NFS to a non-Sun server, you probably
2865          *  don't want to include the following block of code.  The
2866          *  over-the-wire special file types will be changing with the
2867          *  NFS Protocol Revision.
2868          */
2869         if (vap->va_type == VFIFO)
2870                 NA_SETFIFO(na);
2871         return (0);
2872 }
2873 
2874 /*
2875  * acl v2 support: returns approximate permission.
2876  *      default: returns minimal permission (more restrictive)
2877  *      aclok: returns maximal permission (less restrictive)
2878  *      This routine changes the permissions that are alaredy in *va.
2879  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2880  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
2881  */
2882 static void
2883 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2884 {
2885         vsecattr_t      vsa;
2886         int             aclcnt;
2887         aclent_t        *aclentp;
2888         mode_t          mask_perm;
2889         mode_t          grp_perm;
2890         mode_t          other_perm;
2891         mode_t          other_orig;
2892         int             error;
2893 
2894         /* dont care default acl */
2895         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2896         error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2897 
2898         if (!error) {
2899                 aclcnt = vsa.vsa_aclcnt;
2900                 if (aclcnt > MIN_ACL_ENTRIES) {
2901                         /* non-trivial ACL */
2902                         aclentp = vsa.vsa_aclentp;
2903                         if (exi->exi_export.ex_flags & EX_ACLOK) {
2904                                 /* maximal permissions */
2905                                 grp_perm = 0;
2906                                 other_perm = 0;
2907                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2908                                         switch (aclentp->a_type) {
2909                                         case USER_OBJ:
2910                                                 break;
2911                                         case USER:
2912                                                 grp_perm |=
2913                                                     aclentp->a_perm << 3;
2914                                                 other_perm |= aclentp->a_perm;
2915                                                 break;
2916                                         case GROUP_OBJ:
2917                                                 grp_perm |=
2918                                                     aclentp->a_perm << 3;
2919                                                 break;
2920                                         case GROUP:
2921                                                 other_perm |= aclentp->a_perm;
2922                                                 break;
2923                                         case OTHER_OBJ:
2924                                                 other_orig = aclentp->a_perm;
2925                                                 break;
2926                                         case CLASS_OBJ:
2927                                                 mask_perm = aclentp->a_perm;
2928                                                 break;
2929                                         default:
2930                                                 break;
2931                                         }
2932                                 }
2933                                 grp_perm &= mask_perm << 3;
2934                                 other_perm &= mask_perm;
2935                                 other_perm |= other_orig;
2936 
2937                         } else {
2938                                 /* minimal permissions */
2939                                 grp_perm = 070;
2940                                 other_perm = 07;
2941                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2942                                         switch (aclentp->a_type) {
2943                                         case USER_OBJ:
2944                                                 break;
2945                                         case USER:
2946                                         case CLASS_OBJ:
2947                                                 grp_perm &=
2948                                                     aclentp->a_perm << 3;
2949                                                 other_perm &=
2950                                                     aclentp->a_perm;
2951                                                 break;
2952                                         case GROUP_OBJ:
2953                                                 grp_perm &=
2954                                                     aclentp->a_perm << 3;
2955                                                 break;
2956                                         case GROUP:
2957                                                 other_perm &=
2958                                                     aclentp->a_perm;
2959                                                 break;
2960                                         case OTHER_OBJ:
2961                                                 other_perm &=
2962                                                     aclentp->a_perm;
2963                                                 break;
2964                                         default:
2965                                                 break;
2966                                         }
2967                                 }
2968                         }
2969                         /* copy to va */
2970                         va->va_mode &= ~077;
2971                         va->va_mode |= grp_perm | other_perm;
2972                 }
2973                 if (vsa.vsa_aclcnt)
2974                         kmem_free(vsa.vsa_aclentp,
2975                             vsa.vsa_aclcnt * sizeof (aclent_t));
2976         }
2977 }
2978 
2979 void
2980 rfs_srvrinit(void)
2981 {
2982         mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2983         nfs2_srv_caller_id = fs_new_caller_id();
2984 }
2985 
/*
 * Tear down the NFSv2 server state set up by rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
2991 
2992 static int
2993 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2994 {
2995         struct clist    *wcl;
2996         int             wlist_len;
2997         uint32_t        count = rr->rr_count;
2998 
2999         wcl = ra->ra_wlist;
3000 
3001         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3002                 return (FALSE);
3003         }
3004 
3005         wcl = ra->ra_wlist;
3006         rr->rr_ok.rrok_wlist_len = wlist_len;
3007         rr->rr_ok.rrok_wlist = wcl;
3008 
3009         return (TRUE);
3010 }