1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 #include <sys/systm.h>
  27 #include <sys/kmem.h>
  28 #include <sys/cmn_err.h>
  29 #include <sys/atomic.h>
  30 #include <sys/clconf.h>
  31 #include <sys/cladm.h>
  32 #include <sys/flock.h>
  33 #include <nfs/export.h>
  34 #include <nfs/nfs.h>
  35 #include <nfs/nfs4.h>
  36 #include <nfs/nfssys.h>
  37 #include <nfs/lm.h>
  38 #include <sys/pathname.h>
  39 #include <sys/sdt.h>
  40 #include <sys/nvpair.h>
  41 
/*
 * Variables defined outside this file; only their declarations appear here.
 */
  42 extern u_longlong_t nfs4_srv_caller_id;
  43 
  44 extern time_t rfs4_start_time;
  45 extern uint_t nfs4_srv_vkey;
  46 
/*
 * Two constant stateid4 values used by ISSPECIAL() below.
 * special0: first member 0, followed by twelve zero bytes.
 */
  47 stateid4 special0 = {
  48         0,
  49         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
  50 };
  51 
/* special1: first member 0xffffffff, followed by twelve 0xff bytes. */
  52 stateid4 special1 = {
  53         0xffffffff,
  54         {
  55                 (char)0xff, (char)0xff, (char)0xff, (char)0xff,
  56                 (char)0xff, (char)0xff, (char)0xff, (char)0xff,
  57                 (char)0xff, (char)0xff, (char)0xff, (char)0xff
  58         }
  59 };
  60 
  61 
/*
 * Non-zero when stateid4_cmp() returns non-zero for id against either
 * special0 or special1.  id is evaluated up to twice.
 * NOTE(review): stateid4_cmp() is not defined in this chunk; presumably it
 * returns non-zero on a match -- confirm.
 */
  62 #define ISSPECIAL(id)  (stateid4_cmp(id, &special0) || \
  63                         stateid4_cmp(id, &special1))
  64 
  65 /* For embedding the cluster nodeid into our clientid */
/* NOTE(review): users of these constants are not visible in this chunk. */
  66 #define CLUSTER_NODEID_SHIFT    24
  67 #define CLUSTER_MAX_NODEID      255
  68 
/* rfs4_debug only exists in DEBUG builds. */
  69 #ifdef DEBUG
  70 int rfs4_debug;
  71 #endif
  72 
/* File-local debug value, initially 0; its consumers are outside this chunk. */
  73 static uint32_t rfs4_database_debug = 0x00;
  74 
/* Forward declarations of file-local stable-storage (ss/dss) helpers. */
  75 static void rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf);
  76 static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf);
  77 static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip);
  78 static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip);
  79 
  80 /*
  81  * Couple of simple init/destroy functions for a general waiter
  82  */
  83 void
  84 rfs4_sw_init(rfs4_state_wait_t *swp)
  85 {
  86         mutex_init(swp->sw_cv_lock, NULL, MUTEX_DEFAULT, NULL);
  87         cv_init(swp->sw_cv, NULL, CV_DEFAULT, NULL);
  88         swp->sw_active = FALSE;
  89         swp->sw_wait_count = 0;
  90 }
  91 
  92 void
  93 rfs4_sw_destroy(rfs4_state_wait_t *swp)
  94 {
  95         mutex_destroy(swp->sw_cv_lock);
  96         cv_destroy(swp->sw_cv);
  97 }
  98 
  99 void
 100 rfs4_sw_enter(rfs4_state_wait_t *swp)
 101 {
 102         mutex_enter(swp->sw_cv_lock);
 103         while (swp->sw_active) {
 104                 swp->sw_wait_count++;
 105                 cv_wait(swp->sw_cv, swp->sw_cv_lock);
 106                 swp->sw_wait_count--;
 107         }
 108         ASSERT(swp->sw_active == FALSE);
 109         swp->sw_active = TRUE;
 110         mutex_exit(swp->sw_cv_lock);
 111 }
 112 
 113 void
 114 rfs4_sw_exit(rfs4_state_wait_t *swp)
 115 {
 116         mutex_enter(swp->sw_cv_lock);
 117         ASSERT(swp->sw_active == TRUE);
 118         swp->sw_active = FALSE;
 119         if (swp->sw_wait_count != 0)
 120                 cv_broadcast(swp->sw_cv);
 121         mutex_exit(swp->sw_cv_lock);
 122 }
 123 
 124 /*
 125  * CPR callback id -- not related to v4 callbacks
 126  */
/* Starts out as 0; where it is assigned is not visible in this chunk. */
 127 static callb_id_t cpr_id = 0;
 128 
 129 static void
 130 deep_lock_copy(LOCK4res *dres, LOCK4res *sres)
 131 {
 132         lock_owner4 *slo = &sres->LOCK4res_u.denied.owner;
 133         lock_owner4 *dlo = &dres->LOCK4res_u.denied.owner;
 134 
 135         if (sres->status == NFS4ERR_DENIED) {
 136                 dlo->owner_val = kmem_alloc(slo->owner_len, KM_SLEEP);
 137                 bcopy(slo->owner_val, dlo->owner_val, slo->owner_len);
 138         }
 139 }
 140 
 141 static void
 142 deep_lock_free(LOCK4res *res)
 143 {
 144         lock_owner4 *lo = &res->LOCK4res_u.denied.owner;
 145 
 146         if (res->status == NFS4ERR_DENIED)
 147                 kmem_free(lo->owner_val, lo->owner_len);
 148 }
 149 
 150 static void
 151 deep_open_copy(OPEN4res *dres, OPEN4res *sres)
 152 {
 153         nfsace4 *sacep, *dacep;
 154 
 155         if (sres->status != NFS4_OK) {
 156                 return;
 157         }
 158 
 159         dres->attrset = sres->attrset;
 160 
 161         switch (sres->delegation.delegation_type) {
 162         case OPEN_DELEGATE_NONE:
 163                 return;
 164         case OPEN_DELEGATE_READ:
 165                 sacep = &sres->delegation.open_delegation4_u.read.permissions;
 166                 dacep = &dres->delegation.open_delegation4_u.read.permissions;
 167                 break;
 168         case OPEN_DELEGATE_WRITE:
 169                 sacep = &sres->delegation.open_delegation4_u.write.permissions;
 170                 dacep = &dres->delegation.open_delegation4_u.write.permissions;
 171                 break;
 172         }
 173         dacep->who.utf8string_val =
 174             kmem_alloc(sacep->who.utf8string_len, KM_SLEEP);
 175         bcopy(sacep->who.utf8string_val, dacep->who.utf8string_val,
 176             sacep->who.utf8string_len);
 177 }
 178 
 179 static void
 180 deep_open_free(OPEN4res *res)
 181 {
 182         nfsace4 *acep;
 183         if (res->status != NFS4_OK)
 184                 return;
 185 
 186         switch (res->delegation.delegation_type) {
 187         case OPEN_DELEGATE_NONE:
 188                 return;
 189         case OPEN_DELEGATE_READ:
 190                 acep = &res->delegation.open_delegation4_u.read.permissions;
 191                 break;
 192         case OPEN_DELEGATE_WRITE:
 193                 acep = &res->delegation.open_delegation4_u.write.permissions;
 194                 break;
 195         }
 196 
 197         if (acep->who.utf8string_val) {
 198                 kmem_free(acep->who.utf8string_val, acep->who.utf8string_len);
 199                 acep->who.utf8string_val = NULL;
 200         }
 201 }
 202 
 203 void
 204 rfs4_free_reply(nfs_resop4 *rp)
 205 {
 206         switch (rp->resop) {
 207         case OP_LOCK:
 208                 deep_lock_free(&rp->nfs_resop4_u.oplock);
 209                 break;
 210         case OP_OPEN:
 211                 deep_open_free(&rp->nfs_resop4_u.opopen);
 212         default:
 213                 break;
 214         }
 215 }
 216 
 217 void
 218 rfs4_copy_reply(nfs_resop4 *dst, nfs_resop4 *src)
 219 {
 220         *dst = *src;
 221 
 222         /* Handle responses that need deep copy */
 223         switch (src->resop) {
 224         case OP_LOCK:
 225                 deep_lock_copy(&dst->nfs_resop4_u.oplock,
 226                     &src->nfs_resop4_u.oplock);
 227                 break;
 228         case OP_OPEN:
 229                 deep_open_copy(&dst->nfs_resop4_u.opopen,
 230                     &src->nfs_resop4_u.opopen);
 231                 break;
 232         default:
 233                 break;
 234         };
 235 }
 236 
 237 /*
 238  * This is the implementation of the underlying state engine. The
 239  * public interface to this engine is described by
 240  * nfs4_state.h. Callers to the engine should hold no state engine
 241  * locks when they call in to it. If the protocol needs to lock data
 242  * structures it should do so after acquiring all references to them
 243  * first and then follow the following lock order:
 244  *
 245  *      client > openowner > state > lo_state > lockowner > file.
 246  *
 247  * Internally we only allow a thread to hold one hash bucket lock at a
 248  * time and the lock is higher in the lock order (must be acquired
 249  * first) than the data structure that is on that hash list.
 250  *
 251  * If a new reference was acquired by the caller, that reference needs
 252  * to be released after releasing all acquired locks with the
 253  * corresponding rfs4_*_rele routine.
 254  */
 255 
 256 /*
 257  * This code is somewhat prototypical for now. Its purpose currently is to
 258  * implement the interfaces sufficiently to finish the higher protocol
 259  * elements. This will be replaced by dynamically resizable tables
 260  * backed by kmem_cache allocator. However synchronization is handled
 261  * correctly (I hope) and will not change by much.  The mutexes for
 262  * the hash buckets that can be used to create new instances of data
 263  * structures  might be good candidates to evolve into reader writer
 264  * locks. If it has to do a creation, it would be holding the
 265  * mutex across a kmem_alloc with KM_SLEEP specified.
 266  */
 267 
/* TABSIZE is 17 in DEBUG builds and 2047 otherwise. */
 268 #ifdef DEBUG
 269 #define TABSIZE 17
 270 #else
 271 #define TABSIZE 2047
 272 #endif
 273 
/* Cast key to unsigned long and shift it right by three bits. */
 274 #define ADDRHASH(key) ((unsigned long)(key) >> 3)
 275 
 276 /* Used to serialize create/destroy of rfs4_server_state database */
 277 kmutex_t        rfs4_state_lock;
 278 static rfs4_database_t *rfs4_server_state = NULL;
 279 
 280 /* Used to serialize lookups of clientids */
 281 static  krwlock_t       rfs4_findclient_lock;
 282 
 283 /*
 284  * For now this "table" is exposed so that the CPR callback
 285  * function can tromp through it..
 286  */
 287 rfs4_table_t *rfs4_client_tab;
 288 
/*
 * File-local table (rfs4_table_t) and index (rfs4_index_t) handles.
 * NOTE(review): they are assigned outside this chunk; presumably each
 * *_idx belongs to the *_tab declared just above it -- confirm.
 */
 289 static rfs4_index_t *rfs4_clientid_idx;
 290 static rfs4_index_t *rfs4_nfsclnt_idx;
 291 static rfs4_table_t *rfs4_clntip_tab;
 292 static rfs4_index_t *rfs4_clntip_idx;
 293 static rfs4_table_t *rfs4_openowner_tab;
 294 static rfs4_index_t *rfs4_openowner_idx;
 295 static rfs4_table_t *rfs4_state_tab;
 296 static rfs4_index_t *rfs4_state_idx;
 297 static rfs4_index_t *rfs4_state_owner_file_idx;
 298 static rfs4_index_t *rfs4_state_file_idx;
 299 static rfs4_table_t *rfs4_lo_state_tab;
 300 static rfs4_index_t *rfs4_lo_state_idx;
 301 static rfs4_index_t *rfs4_lo_state_owner_idx;
 302 static rfs4_table_t *rfs4_lockowner_tab;
 303 static rfs4_index_t *rfs4_lockowner_idx;
 304 static rfs4_index_t *rfs4_lockowner_pid_idx;
 305 static rfs4_table_t *rfs4_file_tab;
 306 static rfs4_index_t *rfs4_file_idx;
 307 static rfs4_table_t *rfs4_deleg_state_tab;
 308 static rfs4_index_t *rfs4_deleg_idx;
 309 static rfs4_index_t *rfs4_deleg_state_idx;
 310 
 311 #define MAXTABSZ 1024*1024
 312 
 313 /* The values below are rfs4_lease_time units */
 314 
/* DEBUG builds use smaller values than non-DEBUG builds for several entries. */
 315 #ifdef DEBUG
 316 #define CLIENT_CACHE_TIME 1
 317 #define OPENOWNER_CACHE_TIME 1
 318 #define STATE_CACHE_TIME 1
 319 #define LO_STATE_CACHE_TIME 1
 320 #define LOCKOWNER_CACHE_TIME 1
 321 #define FILE_CACHE_TIME 3
 322 #define DELEG_STATE_CACHE_TIME 1
 323 #else
 324 #define CLIENT_CACHE_TIME 10
 325 #define OPENOWNER_CACHE_TIME 5
 326 #define STATE_CACHE_TIME 1
 327 #define LO_STATE_CACHE_TIME 1
 328 #define LOCKOWNER_CACHE_TIME 3
 329 #define FILE_CACHE_TIME 40
 330 #define DELEG_STATE_CACHE_TIME 1
 331 #endif
 332 
 333 
/*
 * Per-table cache times, all initially 0.
 * NOTE(review): presumably filled in from the *_CACHE_TIME constants when
 * the tables are created; that code is not visible in this chunk.
 */
 334 static time_t rfs4_client_cache_time = 0;
 335 static time_t rfs4_clntip_cache_time = 0;
 336 static time_t rfs4_openowner_cache_time = 0;
 337 static time_t rfs4_state_cache_time = 0;
 338 static time_t rfs4_lo_state_cache_time = 0;
 339 static time_t rfs4_lockowner_cache_time = 0;
 340 static time_t rfs4_file_cache_time = 0;
 341 static time_t rfs4_deleg_state_cache_time = 0;
 342 
/*
 * Forward declarations of file-local routines.  They follow a repeating
 * pattern per name prefix: create/destroy/expiry routines taking an
 * rfs4_entry_t, and hash/compare/mkkey routines.
 */
/* client, keyed by clientid and by nfsclnt */
 343 static bool_t rfs4_client_create(rfs4_entry_t, void *);
 344 static void rfs4_dss_remove_cpleaf(rfs4_client_t *);
 345 static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *);
 346 static void rfs4_client_destroy(rfs4_entry_t);
 347 static bool_t rfs4_client_expiry(rfs4_entry_t);
 348 static uint32_t clientid_hash(void *);
 349 static bool_t clientid_compare(rfs4_entry_t, void *);
 350 static void *clientid_mkkey(rfs4_entry_t);
 351 static uint32_t nfsclnt_hash(void *);
 352 static bool_t nfsclnt_compare(rfs4_entry_t, void *);
 353 static void *nfsclnt_mkkey(rfs4_entry_t);
/* clntip */
 354 static bool_t rfs4_clntip_expiry(rfs4_entry_t);
 355 static void rfs4_clntip_destroy(rfs4_entry_t);
 356 static bool_t rfs4_clntip_create(rfs4_entry_t, void *);
 357 static uint32_t clntip_hash(void *);
 358 static bool_t clntip_compare(rfs4_entry_t, void *);
 359 static void *clntip_mkkey(rfs4_entry_t);
/* openowner */
 360 static bool_t rfs4_openowner_create(rfs4_entry_t, void *);
 361 static void rfs4_openowner_destroy(rfs4_entry_t);
 362 static bool_t rfs4_openowner_expiry(rfs4_entry_t);
 363 static uint32_t openowner_hash(void *);
 364 static bool_t openowner_compare(rfs4_entry_t, void *);
 365 static void *openowner_mkkey(rfs4_entry_t);
/* state, with state, state_owner_file and state_file keys */
 366 static bool_t rfs4_state_create(rfs4_entry_t, void *);
 367 static void rfs4_state_destroy(rfs4_entry_t);
 368 static bool_t rfs4_state_expiry(rfs4_entry_t);
 369 static uint32_t state_hash(void *);
 370 static bool_t state_compare(rfs4_entry_t, void *);
 371 static void *state_mkkey(rfs4_entry_t);
 372 static uint32_t state_owner_file_hash(void *);
 373 static bool_t state_owner_file_compare(rfs4_entry_t, void *);
 374 static void *state_owner_file_mkkey(rfs4_entry_t);
 375 static uint32_t state_file_hash(void *);
 376 static bool_t state_file_compare(rfs4_entry_t, void *);
 377 static void *state_file_mkkey(rfs4_entry_t);
/* lo_state, with lo_state and lo_state_lo keys */
 378 static bool_t rfs4_lo_state_create(rfs4_entry_t, void *);
 379 static void rfs4_lo_state_destroy(rfs4_entry_t);
 380 static bool_t rfs4_lo_state_expiry(rfs4_entry_t);
 381 static uint32_t lo_state_hash(void *);
 382 static bool_t lo_state_compare(rfs4_entry_t, void *);
 383 static void *lo_state_mkkey(rfs4_entry_t);
 384 static uint32_t lo_state_lo_hash(void *);
 385 static bool_t lo_state_lo_compare(rfs4_entry_t, void *);
 386 static void *lo_state_lo_mkkey(rfs4_entry_t);
/* lockowner, with lockowner and pid keys */
 387 static bool_t rfs4_lockowner_create(rfs4_entry_t, void *);
 388 static void rfs4_lockowner_destroy(rfs4_entry_t);
 389 static bool_t rfs4_lockowner_expiry(rfs4_entry_t);
 390 static uint32_t lockowner_hash(void *);
 391 static bool_t lockowner_compare(rfs4_entry_t, void *);
 392 static void *lockowner_mkkey(rfs4_entry_t);
 393 static uint32_t pid_hash(void *);
 394 static bool_t pid_compare(rfs4_entry_t, void *);
 395 static void *pid_mkkey(rfs4_entry_t);
/* file (no expiry routine is declared for it here) */
 396 static bool_t rfs4_file_create(rfs4_entry_t, void *);
 397 static void rfs4_file_destroy(rfs4_entry_t);
 398 static uint32_t file_hash(void *);
 399 static bool_t file_compare(rfs4_entry_t, void *);
 400 static void *file_mkkey(rfs4_entry_t);
/* deleg_state, with deleg and deleg_state keys */
 401 static bool_t rfs4_deleg_state_create(rfs4_entry_t, void *);
 402 static void rfs4_deleg_state_destroy(rfs4_entry_t);
 403 static bool_t rfs4_deleg_state_expiry(rfs4_entry_t);
 404 static uint32_t deleg_hash(void *);
 405 static bool_t deleg_compare(rfs4_entry_t, void *);
 406 static void *deleg_mkkey(rfs4_entry_t);
 407 static uint32_t deleg_state_hash(void *);
 408 static bool_t deleg_state_compare(rfs4_entry_t, void *);
 409 static void *deleg_state_mkkey(rfs4_entry_t);
 410 
 411 static void rfs4_state_rele_nounlock(rfs4_state_t *);
 412 
/*
 * Starts at 0 and is set to 1 by rfs4_ss_init(); rfs4_ss_clid() returns
 * immediately while it is 0.
 */
 413 static int rfs4_ss_enabled = 0;
 414 
/* Function pointer defined elsewhere, taking a struct nfs4clrst_args *. */
 415 extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
 416 
 417 void
 418 rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn)
 419 {
 420         kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
 421 }
 422 
 423 static rfs4_ss_pn_t *
 424 rfs4_ss_pnalloc(char *dir, char *leaf)
 425 {
 426         rfs4_ss_pn_t *ss_pn;
 427         int     dir_len, leaf_len;
 428 
 429         /*
 430          * validate we have a resonable path
 431          * (account for the '/' and trailing null)
 432          */
 433         if ((dir_len = strlen(dir)) > MAXPATHLEN ||
 434             (leaf_len = strlen(leaf)) > MAXNAMELEN ||
 435             (dir_len + leaf_len + 2) > MAXPATHLEN) {
 436                 return (NULL);
 437         }
 438 
 439         ss_pn = kmem_alloc(sizeof (rfs4_ss_pn_t), KM_SLEEP);
 440 
 441         (void) snprintf(ss_pn->pn, MAXPATHLEN, "%s/%s", dir, leaf);
 442         /* Handy pointer to just the leaf name */
 443         ss_pn->leaf = ss_pn->pn + dir_len + 1;
 444         return (ss_pn);
 445 }
 446 
 447 
 448 /*
 449  * Move the "leaf" filename from "sdir" directory
 450  * to the "ddir" directory. Return the pathname of
 451  * the destination unless the rename fails in which
 452  * case we need to return the source pathname.
 453  */
 454 static rfs4_ss_pn_t *
 455 rfs4_ss_movestate(char *sdir, char *ddir, char *leaf)
 456 {
 457         rfs4_ss_pn_t *src, *dst;
 458 
 459         if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL)
 460                 return (NULL);
 461 
 462         if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
 463                 rfs4_ss_pnfree(src);
 464                 return (NULL);
 465         }
 466 
 467         /*
 468          * If the rename fails we shall return the src
 469          * pathname and free the dst. Otherwise we need
 470          * to free the src and return the dst pathanme.
 471          */
 472         if (vn_rename(src->pn, dst->pn, UIO_SYSSPACE)) {
 473                 rfs4_ss_pnfree(dst);
 474                 return (src);
 475         }
 476         rfs4_ss_pnfree(src);
 477         return (dst);
 478 }
 479 
 480 
 481 static rfs4_oldstate_t *
 482 rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn)
 483 {
 484         struct uio uio;
 485         struct iovec iov[3];
 486 
 487         rfs4_oldstate_t *cl_ss = NULL;
 488         vnode_t *vp;
 489         vattr_t va;
 490         uint_t id_len;
 491         int err, kill_file, file_vers;
 492 
 493         if (ss_pn == NULL)
 494                 return (NULL);
 495 
 496         /*
 497          * open the state file.
 498          */
 499         if (vn_open(ss_pn->pn, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0) != 0) {
 500                 return (NULL);
 501         }
 502 
 503         if (vp->v_type != VREG) {
 504                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 505                 VN_RELE(vp);
 506                 return (NULL);
 507         }
 508 
 509         err = VOP_ACCESS(vp, VREAD, 0, CRED(), NULL);
 510         if (err) {
 511                 /*
 512                  * We don't have read access? better get the heck out.
 513                  */
 514                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 515                 VN_RELE(vp);
 516                 return (NULL);
 517         }
 518 
 519         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
 520         /*
 521          * get the file size to do some basic validation
 522          */
 523         va.va_mask = AT_SIZE;
 524         err = VOP_GETATTR(vp, &va, 0, CRED(), NULL);
 525 
 526         kill_file = (va.va_size == 0 || va.va_size <
 527             (NFS4_VERIFIER_SIZE + sizeof (uint_t)+1));
 528 
 529         if (err || kill_file) {
 530                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 531                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 532                 VN_RELE(vp);
 533                 if (kill_file) {
 534                         (void) VOP_REMOVE(dvp, ss_pn->leaf, CRED(), NULL, 0);
 535                 }
 536                 return (NULL);
 537         }
 538 
 539         cl_ss = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
 540 
 541         /*
 542          * build iovecs to read in the file_version, verifier and id_len
 543          */
 544         iov[0].iov_base = (caddr_t)&file_vers;
 545         iov[0].iov_len = sizeof (int);
 546         iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier;
 547         iov[1].iov_len = NFS4_VERIFIER_SIZE;
 548         iov[2].iov_base = (caddr_t)&id_len;
 549         iov[2].iov_len = sizeof (uint_t);
 550 
 551         uio.uio_iov = iov;
 552         uio.uio_iovcnt = 3;
 553         uio.uio_segflg = UIO_SYSSPACE;
 554         uio.uio_loffset = 0;
 555         uio.uio_resid = sizeof (int) + NFS4_VERIFIER_SIZE + sizeof (uint_t);
 556 
 557         if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
 558                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 559                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 560                 VN_RELE(vp);
 561                 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
 562                 return (NULL);
 563         }
 564 
 565         /*
 566          * if the file_version doesn't match or if the
 567          * id_len is zero or the combination of the verifier,
 568          * id_len and id_val is bigger than the file we have
 569          * a problem. If so ditch the file.
 570          */
 571         kill_file = (file_vers != NFS4_SS_VERSION || id_len == 0 ||
 572             (id_len + NFS4_VERIFIER_SIZE + sizeof (uint_t)) > va.va_size);
 573 
 574         if (err || kill_file) {
 575                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 576                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 577                 VN_RELE(vp);
 578                 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
 579                 if (kill_file) {
 580                         (void) VOP_REMOVE(dvp, ss_pn->leaf, CRED(), NULL, 0);
 581                 }
 582                 return (NULL);
 583         }
 584 
 585         /*
 586          * now get the client id value
 587          */
 588         cl_ss->cl_id4.id_val = kmem_alloc(id_len, KM_SLEEP);
 589         iov[0].iov_base = cl_ss->cl_id4.id_val;
 590         iov[0].iov_len = id_len;
 591 
 592         uio.uio_iov = iov;
 593         uio.uio_iovcnt = 1;
 594         uio.uio_segflg = UIO_SYSSPACE;
 595         uio.uio_resid = cl_ss->cl_id4.id_len = id_len;
 596 
 597         if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
 598                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 599                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 600                 VN_RELE(vp);
 601                 kmem_free(cl_ss->cl_id4.id_val, id_len);
 602                 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
 603                 return (NULL);
 604         }
 605 
 606         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 607         (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 608         VN_RELE(vp);
 609         return (cl_ss);
 610 }
 611 
/*
 * Drop any earlier definition of nextdp, then define it to yield the
 * dirent64 that starts d_reclen bytes after dp.
 */
 612 #ifdef  nextdp
 613 #undef nextdp
 614 #endif
 615 #define nextdp(dp)      ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
 616 
 617 /*
 618  * Add entries from statedir to supplied oldstate list.
 619  * Optionally, move all entries from statedir -> destdir.
 620  */
 621 void
 622 rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir)
 623 {
 624         rfs4_ss_pn_t *ss_pn;
 625         rfs4_oldstate_t *cl_ss = NULL;
 626         char    *dirt = NULL;
 627         int     err, dir_eof = 0, size = 0;
 628         vnode_t *dvp;
 629         struct iovec iov;
 630         struct uio uio;
 631         struct dirent64 *dep;
 632         offset_t dirchunk_offset = 0;
 633 
 634         /*
 635          * open the state directory
 636          */
 637         if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0))
 638                 return;
 639 
 640         if (dvp->v_type != VDIR || VOP_ACCESS(dvp, VREAD, 0, CRED(), NULL))
 641                 goto out;
 642 
 643         dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
 644 
 645         /*
 646          * Get and process the directory entries
 647          */
 648         while (!dir_eof) {
 649                 (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
 650                 iov.iov_base = dirt;
 651                 iov.iov_len = RFS4_SS_DIRSIZE;
 652                 uio.uio_iov = &iov;
 653                 uio.uio_iovcnt = 1;
 654                 uio.uio_segflg = UIO_SYSSPACE;
 655                 uio.uio_loffset = dirchunk_offset;
 656                 uio.uio_resid = RFS4_SS_DIRSIZE;
 657 
 658                 err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof, NULL, 0);
 659                 VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
 660                 if (err)
 661                         goto out;
 662 
 663                 size = RFS4_SS_DIRSIZE - uio.uio_resid;
 664 
 665                 /*
 666                  * Process all the directory entries in this
 667                  * readdir chunk
 668                  */
 669                 for (dep = (struct dirent64 *)dirt; size > 0;
 670                     dep = nextdp(dep)) {
 671 
 672                         size -= dep->d_reclen;
 673                         dirchunk_offset = dep->d_off;
 674 
 675                         /*
 676                          * Skip '.' and '..'
 677                          */
 678                         if (NFS_IS_DOTNAME(dep->d_name))
 679                                 continue;
 680 
 681                         ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name);
 682                         if (ss_pn == NULL)
 683                                 continue;
 684 
 685                         if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
 686                                 if (destdir != NULL) {
 687                                         rfs4_ss_pnfree(ss_pn);
 688                                         cl_ss->ss_pn = rfs4_ss_movestate(
 689                                             statedir, destdir, dep->d_name);
 690                                 } else {
 691                                         cl_ss->ss_pn = ss_pn;
 692                                 }
 693                                 insque(cl_ss, oldstate);
 694                         } else {
 695                                 rfs4_ss_pnfree(ss_pn);
 696                         }
 697                 }
 698         }
 699 
 700 out:
 701         (void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED(), NULL);
 702         VN_RELE(dvp);
 703         if (dirt)
 704                 kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
 705 }
 706 
 707 static void
 708 rfs4_ss_init(void)
 709 {
 710         int npaths = 1;
 711         char *default_dss_path = NFS4_DSS_VAR_DIR;
 712 
 713         /* read the default stable storage state */
 714         rfs4_dss_readstate(npaths, &default_dss_path);
 715 
 716         rfs4_ss_enabled = 1;
 717 }
 718 
 719 static void
 720 rfs4_ss_fini(void)
 721 {
 722         rfs4_servinst_t *sip;
 723 
 724         mutex_enter(&rfs4_servinst_lock);
 725         sip = rfs4_cur_servinst;
 726         while (sip != NULL) {
 727                 rfs4_dss_clear_oldstate(sip);
 728                 sip = sip->next;
 729         }
 730         mutex_exit(&rfs4_servinst_lock);
 731 }
 732 
 733 /*
 734  * Remove all oldstate files referenced by this servinst.
 735  */
 736 static void
 737 rfs4_dss_clear_oldstate(rfs4_servinst_t *sip)
 738 {
 739         rfs4_oldstate_t *os_head, *osp;
 740 
 741         rw_enter(&sip->oldstate_lock, RW_WRITER);
 742         os_head = sip->oldstate;
 743 
 744         if (os_head == NULL) {
 745                 rw_exit(&sip->oldstate_lock);
 746                 return;
 747         }
 748 
 749         /* skip dummy entry */
 750         osp = os_head->next;
 751         while (osp != os_head) {
 752                 char *leaf = osp->ss_pn->leaf;
 753                 rfs4_oldstate_t *os_next;
 754 
 755                 rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF, leaf);
 756 
 757                 if (osp->cl_id4.id_val)
 758                         kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len);
 759                 rfs4_ss_pnfree(osp->ss_pn);
 760 
 761                 os_next = osp->next;
 762                 remque(osp);
 763                 kmem_free(osp, sizeof (rfs4_oldstate_t));
 764                 osp = os_next;
 765         }
 766 
 767         rw_exit(&sip->oldstate_lock);
 768 }
 769 
 770 /*
 771  * Form the state and oldstate paths, and read in the stable storage files.
 772  */
 773 void
 774 rfs4_dss_readstate(int npaths, char **paths)
 775 {
 776         int i;
 777         char *state, *oldstate;
 778 
 779         state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 780         oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 781 
 782         for (i = 0; i < npaths; i++) {
 783                 char *path = paths[i];
 784 
 785                 (void) sprintf(state, "%s/%s", path, NFS4_DSS_STATE_LEAF);
 786                 (void) sprintf(oldstate, "%s/%s", path, NFS4_DSS_OLDSTATE_LEAF);
 787 
 788                 /*
 789                  * Populate the current server instance's oldstate list.
 790                  *
 791                  * 1. Read stable storage data from old state directory,
 792                  *    leaving its contents alone.
 793                  *
 794                  * 2. Read stable storage data from state directory,
 795                  *    and move the latter's contents to old state
 796                  *    directory.
 797                  */
 798                 rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, oldstate, NULL);
 799                 rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, state, oldstate);
 800         }
 801 
 802         kmem_free(state, MAXPATHLEN);
 803         kmem_free(oldstate, MAXPATHLEN);
 804 }
 805 
 806 
 807 /*
 808  * Check if we are still in grace and if the client can be
 809  * granted permission to perform reclaims.
 810  */
 811 void
 812 rfs4_ss_chkclid(rfs4_client_t *cp)
 813 {
 814         rfs4_servinst_t *sip;
 815 
 816         /*
 817          * It should be sufficient to check the oldstate data for just
 818          * this client's instance. However, since our per-instance
 819          * client grouping is solely temporal, HA-NFSv4 RG failover
 820          * might result in clients of the same RG being partitioned into
 821          * separate instances.
 822          *
 823          * Until the client grouping is improved, we must check the
 824          * oldstate data for all instances with an active grace period.
 825          *
 826          * This also serves as the mechanism to remove stale oldstate data.
 827          * The first time we check an instance after its grace period has
 828          * expired, the oldstate data should be cleared.
 829          *
 830          * Start at the current instance, and walk the list backwards
 831          * to the first.
 832          */
 833         mutex_enter(&rfs4_servinst_lock);
 834         for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
 835                 rfs4_ss_chkclid_sip(cp, sip);
 836 
 837                 /* if the above check found this client, we're done */
 838                 if (cp->rc_can_reclaim)
 839                         break;
 840         }
 841         mutex_exit(&rfs4_servinst_lock);
 842 }
 843 
 844 static void
 845 rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip)
 846 {
 847         rfs4_oldstate_t *osp, *os_head;
 848 
 849         /* short circuit everything if this server instance has no oldstate */
 850         rw_enter(&sip->oldstate_lock, RW_READER);
 851         os_head = sip->oldstate;
 852         rw_exit(&sip->oldstate_lock);
 853         if (os_head == NULL)
 854                 return;
 855 
 856         /*
 857          * If this server instance is no longer in a grace period then
 858          * the client won't be able to reclaim. No further need for this
 859          * instance's oldstate data, so it can be cleared.
 860          */
 861         if (!rfs4_servinst_in_grace(sip))
 862                 return;
 863 
 864         /* this instance is still in grace; search for the clientid */
 865 
 866         rw_enter(&sip->oldstate_lock, RW_READER);
 867 
 868         os_head = sip->oldstate;
 869         /* skip dummy entry */
 870         osp = os_head->next;
 871         while (osp != os_head) {
 872                 if (osp->cl_id4.id_len == cp->rc_nfs_client.id_len) {
 873                         if (bcmp(osp->cl_id4.id_val, cp->rc_nfs_client.id_val,
 874                             osp->cl_id4.id_len) == 0) {
 875                                 cp->rc_can_reclaim = 1;
 876                                 break;
 877                         }
 878                 }
 879                 osp = osp->next;
 880         }
 881 
 882         rw_exit(&sip->oldstate_lock);
 883 }
 884 
 885 /*
 886  * Place client information into stable storage: 1/3.
 887  * First, generate the leaf filename, from the client's IP address and
 888  * the server-generated short-hand clientid.
 889  */
 890 void
 891 rfs4_ss_clid(rfs4_client_t *cp)
 892 {
 893         const char *kinet_ntop6(uchar_t *, char *, size_t);
 894         char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
 895         struct sockaddr *ca;
 896         uchar_t *b;
 897 
 898         if (rfs4_ss_enabled == 0) {
 899                 return;
 900         }
 901 
 902         buf[0] = 0;
 903 
 904         ca = (struct sockaddr *)&cp->rc_addr;
 905 
 906         /*
 907          * Convert the caller's IP address to a dotted string
 908          */
 909         if (ca->sa_family == AF_INET) {
 910                 b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
 911                 (void) sprintf(buf, "%03d.%03d.%03d.%03d", b[0] & 0xFF,
 912                     b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
 913         } else if (ca->sa_family == AF_INET6) {
 914                 struct sockaddr_in6 *sin6;
 915 
 916                 sin6 = (struct sockaddr_in6 *)ca;
 917                 (void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
 918                     buf, INET6_ADDRSTRLEN);
 919         }
 920 
 921         (void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
 922             (longlong_t)cp->rc_clientid);
 923         rfs4_ss_clid_write(cp, leaf);
 924 }
 925 
 926 /*
 927  * Place client information into stable storage: 2/3.
 928  * DSS: distributed stable storage: the file may need to be written to
 929  * multiple directories.
 930  */
 931 static void
 932 rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf)
 933 {
 934         rfs4_servinst_t *sip;
 935 
 936         /*
 937          * It should be sufficient to write the leaf file to (all) DSS paths
 938          * associated with just this client's instance. However, since our
 939          * per-instance client grouping is solely temporal, HA-NFSv4 RG
 940          * failover might result in us losing DSS data.
 941          *
 942          * Until the client grouping is improved, we must write the DSS data
 943          * to all instances' paths. Start at the current instance, and
 944          * walk the list backwards to the first.
 945          */
 946         mutex_enter(&rfs4_servinst_lock);
 947         for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
 948                 int i, npaths = sip->dss_npaths;
 949 
 950                 /* write the leaf file to all DSS paths */
 951                 for (i = 0; i < npaths; i++) {
 952                         rfs4_dss_path_t *dss_path = sip->dss_paths[i];
 953 
 954                         /* HA-NFSv4 path might have been failed-away from us */
 955                         if (dss_path == NULL)
 956                                 continue;
 957 
 958                         rfs4_ss_clid_write_one(cp, dss_path->path, leaf);
 959                 }
 960         }
 961         mutex_exit(&rfs4_servinst_lock);
 962 }
 963 
 964 /*
 965  * Place client information into stable storage: 3/3.
 966  * Write the stable storage data to the requested file.
 967  */
 968 static void
 969 rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf)
 970 {
 971         int ioflag;
 972         int file_vers = NFS4_SS_VERSION;
 973         size_t dirlen;
 974         struct uio uio;
 975         struct iovec iov[4];
 976         char *dir;
 977         rfs4_ss_pn_t *ss_pn;
 978         vnode_t *vp;
 979         nfs_client_id4 *cl_id4 = &(cp->rc_nfs_client);
 980 
 981         /* allow 2 extra bytes for '/' & NUL */
 982         dirlen = strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2;
 983         dir = kmem_alloc(dirlen, KM_SLEEP);
 984         (void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF);
 985 
 986         ss_pn = rfs4_ss_pnalloc(dir, leaf);
 987         /* rfs4_ss_pnalloc takes its own copy */
 988         kmem_free(dir, dirlen);
 989         if (ss_pn == NULL)
 990                 return;
 991 
 992         if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
 993             CRCREAT, 0)) {
 994                 rfs4_ss_pnfree(ss_pn);
 995                 return;
 996         }
 997 
 998         /*
 999          * We need to record leaf - i.e. the filename - so that we know
1000          * what to remove, in the future. However, the dir part of cp->ss_pn
1001          * should never be referenced directly, since it's potentially only
1002          * one of several paths with this leaf in it.
1003          */
1004         if (cp->rc_ss_pn != NULL) {
1005                 if (strcmp(cp->rc_ss_pn->leaf, leaf) == 0) {
1006                         /* we've already recorded *this* leaf */
1007                         rfs4_ss_pnfree(ss_pn);
1008                 } else {
1009                         /* replace with this leaf */
1010                         rfs4_ss_pnfree(cp->rc_ss_pn);
1011                         cp->rc_ss_pn = ss_pn;
1012                 }
1013         } else {
1014                 cp->rc_ss_pn = ss_pn;
1015         }
1016 
1017         /*
1018          * Build a scatter list that points to the nfs_client_id4
1019          */
1020         iov[0].iov_base = (caddr_t)&file_vers;
1021         iov[0].iov_len = sizeof (int);
1022         iov[1].iov_base = (caddr_t)&(cl_id4->verifier);
1023         iov[1].iov_len = NFS4_VERIFIER_SIZE;
1024         iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
1025         iov[2].iov_len = sizeof (uint_t);
1026         iov[3].iov_base = (caddr_t)cl_id4->id_val;
1027         iov[3].iov_len = cl_id4->id_len;
1028 
1029         uio.uio_iov = iov;
1030         uio.uio_iovcnt = 4;
1031         uio.uio_loffset = 0;
1032         uio.uio_segflg = UIO_SYSSPACE;
1033         uio.uio_llimit = (rlim64_t)MAXOFFSET_T;
1034         uio.uio_resid = cl_id4->id_len + sizeof (int) +
1035             NFS4_VERIFIER_SIZE + sizeof (uint_t);
1036 
1037         ioflag = uio.uio_fmode = (FWRITE|FSYNC);
1038         uio.uio_extflg = UIO_COPY_DEFAULT;
1039 
1040         (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1041         /* write the full client id to the file. */
1042         (void) VOP_WRITE(vp, &uio, ioflag, CRED(), NULL);
1043         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1044 
1045         (void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
1046         VN_RELE(vp);
1047 }
1048 
1049 /*
1050  * DSS: distributed stable storage.
1051  * Unpack the list of paths passed by nfsd.
1052  * Use nvlist_alloc(9F) to manage the data.
1053  * The caller is responsible for allocating and freeing the buffer.
1054  */
1055 int
1056 rfs4_dss_setpaths(char *buf, size_t buflen)
1057 {
1058         int error;
1059 
1060         /*
1061          * If this is a "warm start", i.e. we previously had DSS paths,
1062          * preserve the old paths.
1063          */
1064         if (rfs4_dss_paths != NULL) {
1065                 /*
1066                  * Before we lose the ptr, destroy the nvlist and pathnames
1067                  * array from the warm start before this one.
1068                  */
1069                 nvlist_free(rfs4_dss_oldpaths);
1070                 rfs4_dss_oldpaths = rfs4_dss_paths;
1071         }
1072 
1073         /* unpack the buffer into a searchable nvlist */
1074         error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP);
1075         if (error)
1076                 return (error);
1077 
1078         /*
1079          * Search the nvlist for the pathnames nvpair (which is the only nvpair
1080          * in the list, and record its location.
1081          */
1082         error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME,
1083             &rfs4_dss_newpaths, &rfs4_dss_numnewpaths);
1084         return (error);
1085 }
1086 
1087 /*
1088  * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
1089  * to find and mark the client for forced expire.
1090  */
1091 static void
1092 rfs4_client_scrub(rfs4_entry_t ent, void *arg)
1093 {
1094         rfs4_client_t *cp = (rfs4_client_t *)ent;
1095         struct nfs4clrst_args *clr = arg;
1096         struct sockaddr_in6 *ent_sin6;
1097         struct in6_addr  clr_in6;
1098         struct sockaddr_in  *ent_sin;
1099         struct in_addr   clr_in;
1100 
1101         if (clr->addr_type != cp->rc_addr.ss_family) {
1102                 return;
1103         }
1104 
1105         switch (clr->addr_type) {
1106 
1107         case AF_INET6:
1108                 /* copyin the address from user space */
1109                 if (copyin(clr->ap, &clr_in6, sizeof (clr_in6))) {
1110                         break;
1111                 }
1112 
1113                 ent_sin6 = (struct sockaddr_in6 *)&cp->rc_addr;
1114 
1115                 /*
1116                  * now compare, and if equivalent mark entry
1117                  * for forced expiration
1118                  */
1119                 if (IN6_ARE_ADDR_EQUAL(&ent_sin6->sin6_addr, &clr_in6)) {
1120                         cp->rc_forced_expire = 1;
1121                 }
1122                 break;
1123 
1124         case AF_INET:
1125                 /* copyin the address from user space */
1126                 if (copyin(clr->ap, &clr_in, sizeof (clr_in))) {
1127                         break;
1128                 }
1129 
1130                 ent_sin = (struct sockaddr_in *)&cp->rc_addr;
1131 
1132                 /*
1133                  * now compare, and if equivalent mark entry
1134                  * for forced expiration
1135                  */
1136                 if (ent_sin->sin_addr.s_addr == clr_in.s_addr) {
1137                         cp->rc_forced_expire = 1;
1138                 }
1139                 break;
1140 
1141         default:
1142                 /* force this assert to fail */
1143                 ASSERT(clr->addr_type != clr->addr_type);
1144         }
1145 }
1146 
1147 /*
1148  * This is called from nfssys() in order to clear server state
1149  * for the specified client IP Address.
1150  */
1151 void
1152 rfs4_clear_client_state(struct nfs4clrst_args *clr)
1153 {
1154         (void) rfs4_dbe_walk(rfs4_client_tab, rfs4_client_scrub, clr);
1155 }
1156 
1157 /*
1158  * Used to initialize the NFSv4 server's state or database.  All of
1159  * the tables are created and timers are set. Only called when NFSv4
1160  * service is provided.
1161  */
1162 void
1163 rfs4_state_init()
1164 {
1165         int start_grace;
1166         extern boolean_t rfs4_cpr_callb(void *, int);
1167         char *dss_path = NFS4_DSS_VAR_DIR;
1168         time_t start_time;
1169 
1170         mutex_enter(&rfs4_state_lock);
1171 
1172         /*
1173          * If the server state database has already been initialized,
1174          * skip it
1175          */
1176         if (rfs4_server_state != NULL) {
1177                 mutex_exit(&rfs4_state_lock);
1178                 return;
1179         }
1180 
1181         rw_init(&rfs4_findclient_lock, NULL, RW_DEFAULT, NULL);
1182 
1183         /*
1184          * Set the boot time.  If the server
1185          * has been restarted quickly and has had the opportunity to
1186          * service clients, then the start_time needs to be bumped
1187          * regardless.  A small window but it exists...
1188          */
1189         start_time = gethrestime_sec();
1190         if (rfs4_start_time < start_time)
1191                 rfs4_start_time = start_time;
1192         else
1193                 rfs4_start_time++;
1194 
1195         /* DSS: distributed stable storage: initialise served paths list */
1196         rfs4_dss_pathlist = NULL;
1197 
1198         /*
1199          * Create the first server instance, or a new one if the server has
1200          * been restarted; see above comments on rfs4_start_time. Don't
1201          * start its grace period; that will be done later, to maximise the
1202          * clients' recovery window.
1203          */
1204         start_grace = 0;
1205         rfs4_servinst_create(start_grace, 1, &dss_path);
1206 
1207         /* reset the "first NFSv4 request" status */
1208         rfs4_seen_first_compound = 0;
1209 
1210         /*
1211          * Add a CPR callback so that we can update client
1212          * access times to extend the lease after a suspend
1213          * and resume (using the same class as rpcmod/connmgr)
1214          */
1215         cpr_id = callb_add(rfs4_cpr_callb, 0, CB_CL_CPR_RPC, "rfs4");
1216 
1217         /* set the various cache timers for table creation */
1218         if (rfs4_client_cache_time == 0)
1219                 rfs4_client_cache_time = CLIENT_CACHE_TIME;
1220         if (rfs4_openowner_cache_time == 0)
1221                 rfs4_openowner_cache_time = OPENOWNER_CACHE_TIME;
1222         if (rfs4_state_cache_time == 0)
1223                 rfs4_state_cache_time = STATE_CACHE_TIME;
1224         if (rfs4_lo_state_cache_time == 0)
1225                 rfs4_lo_state_cache_time = LO_STATE_CACHE_TIME;
1226         if (rfs4_lockowner_cache_time == 0)
1227                 rfs4_lockowner_cache_time = LOCKOWNER_CACHE_TIME;
1228         if (rfs4_file_cache_time == 0)
1229                 rfs4_file_cache_time = FILE_CACHE_TIME;
1230         if (rfs4_deleg_state_cache_time == 0)
1231                 rfs4_deleg_state_cache_time = DELEG_STATE_CACHE_TIME;
1232 
1233         /* Create the overall database to hold all server state */
1234         rfs4_server_state = rfs4_database_create(rfs4_database_debug);
1235 
1236         /* Now create the individual tables */
1237         rfs4_client_cache_time *= rfs4_lease_time;
1238         rfs4_client_tab = rfs4_table_create(rfs4_server_state,
1239             "Client",
1240             rfs4_client_cache_time,
1241             2,
1242             rfs4_client_create,
1243             rfs4_client_destroy,
1244             rfs4_client_expiry,
1245             sizeof (rfs4_client_t),
1246             TABSIZE,
1247             MAXTABSZ/8, 100);
1248         rfs4_nfsclnt_idx = rfs4_index_create(rfs4_client_tab,
1249             "nfs_client_id4", nfsclnt_hash,
1250             nfsclnt_compare, nfsclnt_mkkey,
1251             TRUE);
1252         rfs4_clientid_idx = rfs4_index_create(rfs4_client_tab,
1253             "client_id", clientid_hash,
1254             clientid_compare, clientid_mkkey,
1255             FALSE);
1256 
1257         rfs4_clntip_cache_time = 86400 * 365;   /* about a year */
1258         rfs4_clntip_tab = rfs4_table_create(rfs4_server_state,
1259             "ClntIP",
1260             rfs4_clntip_cache_time,
1261             1,
1262             rfs4_clntip_create,
1263             rfs4_clntip_destroy,
1264             rfs4_clntip_expiry,
1265             sizeof (rfs4_clntip_t),
1266             TABSIZE,
1267             MAXTABSZ, 100);
1268         rfs4_clntip_idx = rfs4_index_create(rfs4_clntip_tab,
1269             "client_ip", clntip_hash,
1270             clntip_compare, clntip_mkkey,
1271             TRUE);
1272 
1273         rfs4_openowner_cache_time *= rfs4_lease_time;
1274         rfs4_openowner_tab = rfs4_table_create(rfs4_server_state,
1275             "OpenOwner",
1276             rfs4_openowner_cache_time,
1277             1,
1278             rfs4_openowner_create,
1279             rfs4_openowner_destroy,
1280             rfs4_openowner_expiry,
1281             sizeof (rfs4_openowner_t),
1282             TABSIZE,
1283             MAXTABSZ, 100);
1284         rfs4_openowner_idx = rfs4_index_create(rfs4_openowner_tab,
1285             "open_owner4", openowner_hash,
1286             openowner_compare,
1287             openowner_mkkey, TRUE);
1288 
1289         rfs4_state_cache_time *= rfs4_lease_time;
1290         rfs4_state_tab = rfs4_table_create(rfs4_server_state,
1291             "OpenStateID",
1292             rfs4_state_cache_time,
1293             3,
1294             rfs4_state_create,
1295             rfs4_state_destroy,
1296             rfs4_state_expiry,
1297             sizeof (rfs4_state_t),
1298             TABSIZE,
1299             MAXTABSZ, 100);
1300 
1301         rfs4_state_owner_file_idx = rfs4_index_create(rfs4_state_tab,
1302             "Openowner-File",
1303             state_owner_file_hash,
1304             state_owner_file_compare,
1305             state_owner_file_mkkey, TRUE);
1306 
1307         rfs4_state_idx = rfs4_index_create(rfs4_state_tab,
1308             "State-id", state_hash,
1309             state_compare, state_mkkey, FALSE);
1310 
1311         rfs4_state_file_idx = rfs4_index_create(rfs4_state_tab,
1312             "File", state_file_hash,
1313             state_file_compare, state_file_mkkey,
1314             FALSE);
1315 
1316         rfs4_lo_state_cache_time *= rfs4_lease_time;
1317         rfs4_lo_state_tab = rfs4_table_create(rfs4_server_state,
1318             "LockStateID",
1319             rfs4_lo_state_cache_time,
1320             2,
1321             rfs4_lo_state_create,
1322             rfs4_lo_state_destroy,
1323             rfs4_lo_state_expiry,
1324             sizeof (rfs4_lo_state_t),
1325             TABSIZE,
1326             MAXTABSZ, 100);
1327 
1328         rfs4_lo_state_owner_idx = rfs4_index_create(rfs4_lo_state_tab,
1329             "lockownerxstate",
1330             lo_state_lo_hash,
1331             lo_state_lo_compare,
1332             lo_state_lo_mkkey, TRUE);
1333 
1334         rfs4_lo_state_idx = rfs4_index_create(rfs4_lo_state_tab,
1335             "State-id",
1336             lo_state_hash, lo_state_compare,
1337             lo_state_mkkey, FALSE);
1338 
1339         rfs4_lockowner_cache_time *= rfs4_lease_time;
1340 
1341         rfs4_lockowner_tab = rfs4_table_create(rfs4_server_state,
1342             "Lockowner",
1343             rfs4_lockowner_cache_time,
1344             2,
1345             rfs4_lockowner_create,
1346             rfs4_lockowner_destroy,
1347             rfs4_lockowner_expiry,
1348             sizeof (rfs4_lockowner_t),
1349             TABSIZE,
1350             MAXTABSZ, 100);
1351 
1352         rfs4_lockowner_idx = rfs4_index_create(rfs4_lockowner_tab,
1353             "lock_owner4", lockowner_hash,
1354             lockowner_compare,
1355             lockowner_mkkey, TRUE);
1356 
1357         rfs4_lockowner_pid_idx = rfs4_index_create(rfs4_lockowner_tab,
1358             "pid", pid_hash,
1359             pid_compare, pid_mkkey,
1360             FALSE);
1361 
1362         rfs4_file_cache_time *= rfs4_lease_time;
1363         rfs4_file_tab = rfs4_table_create(rfs4_server_state,
1364             "File",
1365             rfs4_file_cache_time,
1366             1,
1367             rfs4_file_create,
1368             rfs4_file_destroy,
1369             NULL,
1370             sizeof (rfs4_file_t),
1371             TABSIZE,
1372             MAXTABSZ, -1);
1373 
1374         rfs4_file_idx = rfs4_index_create(rfs4_file_tab,
1375             "Filehandle", file_hash,
1376             file_compare, file_mkkey, TRUE);
1377 
1378         rfs4_deleg_state_cache_time *= rfs4_lease_time;
1379         rfs4_deleg_state_tab = rfs4_table_create(rfs4_server_state,
1380             "DelegStateID",
1381             rfs4_deleg_state_cache_time,
1382             2,
1383             rfs4_deleg_state_create,
1384             rfs4_deleg_state_destroy,
1385             rfs4_deleg_state_expiry,
1386             sizeof (rfs4_deleg_state_t),
1387             TABSIZE,
1388             MAXTABSZ, 100);
1389         rfs4_deleg_idx = rfs4_index_create(rfs4_deleg_state_tab,
1390             "DelegByFileClient",
1391             deleg_hash,
1392             deleg_compare,
1393             deleg_mkkey, TRUE);
1394 
1395         rfs4_deleg_state_idx = rfs4_index_create(rfs4_deleg_state_tab,
1396             "DelegState",
1397             deleg_state_hash,
1398             deleg_state_compare,
1399             deleg_state_mkkey, FALSE);
1400 
1401         /*
1402          * Init the stable storage.
1403          */
1404         rfs4_ss_init();
1405 
1406         rfs4_client_clrst = rfs4_clear_client_state;
1407 
1408         mutex_exit(&rfs4_state_lock);
1409 }
1410 
1411 
1412 /*
1413  * Used at server shutdown to cleanup all of the NFSv4 server's structures
1414  * and other state.
1415  */
1416 void
1417 rfs4_state_fini()
1418 {
1419         rfs4_database_t *dbp;
1420 
1421         mutex_enter(&rfs4_state_lock);
1422 
1423         if (rfs4_server_state == NULL) {
1424                 mutex_exit(&rfs4_state_lock);
1425                 return;
1426         }
1427 
1428         rfs4_client_clrst = NULL;
1429 
1430         rfs4_set_deleg_policy(SRV_NEVER_DELEGATE);
1431         dbp = rfs4_server_state;
1432         rfs4_server_state = NULL;
1433 
1434         /*
1435          * Cleanup the CPR callback.
1436          */
1437         if (cpr_id)
1438                 (void) callb_delete(cpr_id);
1439 
1440         rw_destroy(&rfs4_findclient_lock);
1441 
1442         /* First stop all of the reaper threads in the database */
1443         rfs4_database_shutdown(dbp);
1444         /* clean up any dangling stable storage structures */
1445         rfs4_ss_fini();
1446         /* Now actually destroy/release the database and its tables */
1447         rfs4_database_destroy(dbp);
1448 
1449         /* Reset the cache timers for next time */
1450         rfs4_client_cache_time = 0;
1451         rfs4_openowner_cache_time = 0;
1452         rfs4_state_cache_time = 0;
1453         rfs4_lo_state_cache_time = 0;
1454         rfs4_lockowner_cache_time = 0;
1455         rfs4_file_cache_time = 0;
1456         rfs4_deleg_state_cache_time = 0;
1457 
1458         mutex_exit(&rfs4_state_lock);
1459 
1460         /* destroy server instances and current instance ptr */
1461         rfs4_servinst_destroy_all();
1462 
1463         /* reset the "first NFSv4 request" status */
1464         rfs4_seen_first_compound = 0;
1465 
1466         /* DSS: distributed stable storage */
1467         nvlist_free(rfs4_dss_oldpaths);
1468         nvlist_free(rfs4_dss_paths);
1469         rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
1470 }
1471 
/*
 * Server's internal view of a clientid4: the same storage seen either as
 * the opaque protocol value (id4) or as the two 32-bit pieces this
 * implementation packs into it (impl_id).
 */
typedef union {
        struct {
                /* filled in from rfs4_start_time at client creation */
                uint32_t start_time;
                /* database entry id; clientid_hash() hashes on this */
                uint32_t c_id;
        } impl_id;
        clientid4 id4;
} cid;
1479 
1480 static int foreign_stateid(stateid_t *id);
1481 static int foreign_clientid(cid *cidp);
1482 static void embed_nodeid(cid *cidp);
1483 
/*
 * Server's internal view of the SETCLIENTID_CONFIRM verifier: the same
 * storage seen either as the opaque verifier4 (confirm_verf) or as the
 * two 32-bit pieces this implementation packs into it (cv_impl).
 */
typedef union {
        struct {
                /* copy of the client's cid impl_id.c_id */
                uint32_t c_id;
                /* starts at 0; bumped by rfs4_client_scv_next() */
                uint32_t gen_num;
        } cv_impl;
        verifier4       confirm_verf;
} scid_confirm_verf;
1491 
1492 static uint32_t
1493 clientid_hash(void *key)
1494 {
1495         cid *idp = key;
1496 
1497         return (idp->impl_id.c_id);
1498 }
1499 
1500 static bool_t
1501 clientid_compare(rfs4_entry_t entry, void *key)
1502 {
1503         rfs4_client_t *cp = (rfs4_client_t *)entry;
1504         clientid4 *idp = key;
1505 
1506         return (*idp == cp->rc_clientid);
1507 }
1508 
1509 static void *
1510 clientid_mkkey(rfs4_entry_t entry)
1511 {
1512         rfs4_client_t *cp = (rfs4_client_t *)entry;
1513 
1514         return (&cp->rc_clientid);
1515 }
1516 
1517 static uint32_t
1518 nfsclnt_hash(void *key)
1519 {
1520         nfs_client_id4 *client = key;
1521         int i;
1522         uint32_t hash = 0;
1523 
1524         for (i = 0; i < client->id_len; i++) {
1525                 hash <<= 1;
1526                 hash += (uint_t)client->id_val[i];
1527         }
1528         return (hash);
1529 }
1530 
1531 
1532 static bool_t
1533 nfsclnt_compare(rfs4_entry_t entry, void *key)
1534 {
1535         rfs4_client_t *cp = (rfs4_client_t *)entry;
1536         nfs_client_id4 *nfs_client = key;
1537 
1538         if (cp->rc_nfs_client.id_len != nfs_client->id_len)
1539                 return (FALSE);
1540 
1541         return (bcmp(cp->rc_nfs_client.id_val, nfs_client->id_val,
1542             nfs_client->id_len) == 0);
1543 }
1544 
1545 static void *
1546 nfsclnt_mkkey(rfs4_entry_t entry)
1547 {
1548         rfs4_client_t *cp = (rfs4_client_t *)entry;
1549 
1550         return (&cp->rc_nfs_client);
1551 }
1552 
1553 static bool_t
1554 rfs4_client_expiry(rfs4_entry_t u_entry)
1555 {
1556         rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1557         bool_t cp_expired;
1558 
1559         if (rfs4_dbe_is_invalid(cp->rc_dbe)) {
1560                 cp->rc_ss_remove = 1;
1561                 return (TRUE);
1562         }
1563         /*
1564          * If the sysadmin has used clear_locks for this
1565          * entry then forced_expire will be set and we
1566          * want this entry to be reaped. Or the entry
1567          * has exceeded its lease period.
1568          */
1569         cp_expired = (cp->rc_forced_expire ||
1570             (gethrestime_sec() - cp->rc_last_access
1571             > rfs4_lease_time));
1572 
1573         if (!cp->rc_ss_remove && cp_expired)
1574                 cp->rc_ss_remove = 1;
1575         return (cp_expired);
1576 }
1577 
1578 /*
1579  * Remove the leaf file from all distributed stable storage paths.
1580  */
1581 static void
1582 rfs4_dss_remove_cpleaf(rfs4_client_t *cp)
1583 {
1584         rfs4_servinst_t *sip;
1585         char *leaf = cp->rc_ss_pn->leaf;
1586 
1587         /*
1588          * since the state files are written to all DSS
1589          * paths we must remove this leaf file instance
1590          * from all server instances.
1591          */
1592 
1593         mutex_enter(&rfs4_servinst_lock);
1594         for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
1595                 /* remove the leaf file associated with this server instance */
1596                 rfs4_dss_remove_leaf(sip, NFS4_DSS_STATE_LEAF, leaf);
1597         }
1598         mutex_exit(&rfs4_servinst_lock);
1599 }
1600 
1601 static void
1602 rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf)
1603 {
1604         int i, npaths = sip->dss_npaths;
1605 
1606         for (i = 0; i < npaths; i++) {
1607                 rfs4_dss_path_t *dss_path = sip->dss_paths[i];
1608                 char *path, *dir;
1609                 size_t pathlen;
1610 
1611                 /* the HA-NFSv4 path might have been failed-over away from us */
1612                 if (dss_path == NULL)
1613                         continue;
1614 
1615                 dir = dss_path->path;
1616 
1617                 /* allow 3 extra bytes for two '/' & a NUL */
1618                 pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3;
1619                 path = kmem_alloc(pathlen, KM_SLEEP);
1620                 (void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf);
1621 
1622                 (void) vn_remove(path, UIO_SYSSPACE, RMFILE);
1623 
1624                 kmem_free(path, pathlen);
1625         }
1626 }
1627 
1628 static void
1629 rfs4_client_destroy(rfs4_entry_t u_entry)
1630 {
1631         rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1632 
1633         mutex_destroy(cp->rc_cbinfo.cb_lock);
1634         cv_destroy(cp->rc_cbinfo.cb_cv);
1635         cv_destroy(cp->rc_cbinfo.cb_cv_nullcaller);
1636         list_destroy(&cp->rc_openownerlist);
1637 
1638         /* free callback info */
1639         rfs4_cbinfo_free(&cp->rc_cbinfo);
1640 
1641         if (cp->rc_cp_confirmed)
1642                 rfs4_client_rele(cp->rc_cp_confirmed);
1643 
1644         if (cp->rc_ss_pn) {
1645                 /* check if the stable storage files need to be removed */
1646                 if (cp->rc_ss_remove)
1647                         rfs4_dss_remove_cpleaf(cp);
1648                 rfs4_ss_pnfree(cp->rc_ss_pn);
1649         }
1650 
1651         /* Free the client supplied client id */
1652         kmem_free(cp->rc_nfs_client.id_val, cp->rc_nfs_client.id_len);
1653 
1654         if (cp->rc_sysidt != LM_NOSYSID)
1655                 lm_free_sysidt(cp->rc_sysidt);
1656 }
1657 
1658 static bool_t
1659 rfs4_client_create(rfs4_entry_t u_entry, void *arg)
1660 {
1661         rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1662         nfs_client_id4 *client = (nfs_client_id4 *)arg;
1663         struct sockaddr *ca;
1664         cid *cidp;
1665         scid_confirm_verf *scvp;
1666 
1667         /* Get a clientid to give to the client */
1668         cidp = (cid *)&cp->rc_clientid;
1669         cidp->impl_id.start_time = rfs4_start_time;
1670         cidp->impl_id.c_id = (uint32_t)rfs4_dbe_getid(cp->rc_dbe);
1671 
1672         /* If we are booted as a cluster node, embed our nodeid */
1673         if (cluster_bootflags & CLUSTER_BOOTED)
1674                 embed_nodeid(cidp);
1675 
1676         /* Allocate and copy client's client id value */
1677         cp->rc_nfs_client.id_val = kmem_alloc(client->id_len, KM_SLEEP);
1678         cp->rc_nfs_client.id_len = client->id_len;
1679         bcopy(client->id_val, cp->rc_nfs_client.id_val, client->id_len);
1680         cp->rc_nfs_client.verifier = client->verifier;
1681 
1682         /* Copy client's IP address */
1683         ca = client->cl_addr;
1684         if (ca->sa_family == AF_INET)
1685                 bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in));
1686         else if (ca->sa_family == AF_INET6)
1687                 bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in6));
1688         cp->rc_nfs_client.cl_addr = (struct sockaddr *)&cp->rc_addr;
1689 
1690         /* Init the value for the SETCLIENTID_CONFIRM verifier */
1691         scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1692         scvp->cv_impl.c_id = cidp->impl_id.c_id;
1693         scvp->cv_impl.gen_num = 0;
1694 
1695         /* An F_UNLKSYS has been done for this client */
1696         cp->rc_unlksys_completed = FALSE;
1697 
1698         /* We need the client to ack us */
1699         cp->rc_need_confirm = TRUE;
1700         cp->rc_cp_confirmed = NULL;
1701 
1702         /* TRUE all the time until the callback path actually fails */
1703         cp->rc_cbinfo.cb_notified_of_cb_path_down = TRUE;
1704 
1705         /* Initialize the access time to now */
1706         cp->rc_last_access = gethrestime_sec();
1707 
1708         cp->rc_cr_set = NULL;
1709 
1710         cp->rc_sysidt = LM_NOSYSID;
1711 
1712         list_create(&cp->rc_openownerlist, sizeof (rfs4_openowner_t),
1713             offsetof(rfs4_openowner_t, ro_node));
1714 
1715         /* set up the callback control structure */
1716         cp->rc_cbinfo.cb_state = CB_UNINIT;
1717         mutex_init(cp->rc_cbinfo.cb_lock, NULL, MUTEX_DEFAULT, NULL);
1718         cv_init(cp->rc_cbinfo.cb_cv, NULL, CV_DEFAULT, NULL);
1719         cv_init(cp->rc_cbinfo.cb_cv_nullcaller, NULL, CV_DEFAULT, NULL);
1720 
1721         /*
1722          * Associate the client_t with the current server instance.
1723          * The hold is solely to satisfy the calling requirement of
1724          * rfs4_servinst_assign(). In this case it's not strictly necessary.
1725          */
1726         rfs4_dbe_hold(cp->rc_dbe);
1727         rfs4_servinst_assign(cp, rfs4_cur_servinst);
1728         rfs4_dbe_rele(cp->rc_dbe);
1729 
1730         return (TRUE);
1731 }
1732 
1733 /*
1734  * Caller wants to generate/update the setclientid_confirm verifier
1735  * associated with a client.  This is done during the SETCLIENTID
1736  * processing.
1737  */
1738 void
1739 rfs4_client_scv_next(rfs4_client_t *cp)
1740 {
1741         scid_confirm_verf *scvp;
1742 
1743         /* Init the value for the SETCLIENTID_CONFIRM verifier */
1744         scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1745         scvp->cv_impl.gen_num++;
1746 }
1747 
1748 void
1749 rfs4_client_rele(rfs4_client_t *cp)
1750 {
1751         rfs4_dbe_rele(cp->rc_dbe);
1752 }
1753 
1754 rfs4_client_t *
1755 rfs4_findclient(nfs_client_id4 *client, bool_t *create, rfs4_client_t *oldcp)
1756 {
1757         rfs4_client_t *cp;
1758 
1759 
1760         if (oldcp) {
1761                 rw_enter(&rfs4_findclient_lock, RW_WRITER);
1762                 rfs4_dbe_hide(oldcp->rc_dbe);
1763         } else {
1764                 rw_enter(&rfs4_findclient_lock, RW_READER);
1765         }
1766 
1767         cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_nfsclnt_idx, client,
1768             create, (void *)client, RFS4_DBS_VALID);
1769 
1770         if (oldcp)
1771                 rfs4_dbe_unhide(oldcp->rc_dbe);
1772 
1773         rw_exit(&rfs4_findclient_lock);
1774 
1775         return (cp);
1776 }
1777 
1778 rfs4_client_t *
1779 rfs4_findclient_by_id(clientid4 clientid, bool_t find_unconfirmed)
1780 {
1781         rfs4_client_t *cp;
1782         bool_t create = FALSE;
1783         cid *cidp = (cid *)&clientid;
1784 
1785         /* If we're a cluster and the nodeid isn't right, short-circuit */
1786         if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
1787                 return (NULL);
1788 
1789         rw_enter(&rfs4_findclient_lock, RW_READER);
1790 
1791         cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx, &clientid,
1792             &create, NULL, RFS4_DBS_VALID);
1793 
1794         rw_exit(&rfs4_findclient_lock);
1795 
1796         if (cp && cp->rc_need_confirm && find_unconfirmed == FALSE) {
1797                 rfs4_client_rele(cp);
1798                 return (NULL);
1799         } else {
1800                 return (cp);
1801         }
1802 }
1803 
/*
 * Hash a socket address by its IP address bytes only: each byte is added
 * after shifting the running value left by one.  Address families other
 * than AF_INET and AF_INET6 hash to 0.
 */
static uint32_t
clntip_hash(void *key)
{
        struct sockaddr *sa = key;
        uint32_t sum = 0;
        char *bytes;
        int nbytes;
        int idx;

        switch (sa->sa_family) {
        case AF_INET:
                bytes = (char *)&((struct sockaddr_in *)sa)->sin_addr;
                nbytes = sizeof (struct in_addr);
                break;
        case AF_INET6:
                bytes = (char *)&((struct sockaddr_in6 *)sa)->sin6_addr;
                nbytes = sizeof (struct in6_addr);
                break;
        default:
                return (0);
        }

        for (idx = 0; idx < nbytes; idx++)
                sum = (sum << 1) + (uint32_t)bytes[idx];

        return (sum);
}
1829 
1830 static bool_t
1831 clntip_compare(rfs4_entry_t entry, void *key)
1832 {
1833         rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1834         struct sockaddr *addr = key;
1835         int len = 0;
1836         char *p1, *p2;
1837 
1838         if (addr->sa_family == AF_INET) {
1839                 struct sockaddr_in *a1 = (struct sockaddr_in *)&cp->ri_addr;
1840                 struct sockaddr_in *a2 = (struct sockaddr_in *)addr;
1841                 len = sizeof (struct in_addr);
1842                 p1 = (char *)&a1->sin_addr;
1843                 p2 = (char *)&a2->sin_addr;
1844         } else if (addr->sa_family == AF_INET6) {
1845                 struct sockaddr_in6 *a1 = (struct sockaddr_in6 *)&cp->ri_addr;
1846                 struct sockaddr_in6 *a2 = (struct sockaddr_in6 *)addr;
1847                 len = sizeof (struct in6_addr);
1848                 p1 = (char *)&a1->sin6_addr;
1849                 p2 = (char *)&a2->sin6_addr;
1850         } else
1851                 return (0);
1852 
1853         return (bcmp(p1, p2, len) == 0);
1854 }
1855 
1856 static void *
1857 clntip_mkkey(rfs4_entry_t entry)
1858 {
1859         rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1860 
1861         return (&cp->ri_addr);
1862 }
1863 
1864 static bool_t
1865 rfs4_clntip_expiry(rfs4_entry_t u_entry)
1866 {
1867         rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1868 
1869         if (rfs4_dbe_is_invalid(cp->ri_dbe))
1870                 return (TRUE);
1871         return (FALSE);
1872 }
1873 
1874 /* ARGSUSED */
1875 static void
1876 rfs4_clntip_destroy(rfs4_entry_t u_entry)
1877 {
1878 }
1879 
1880 static bool_t
1881 rfs4_clntip_create(rfs4_entry_t u_entry, void *arg)
1882 {
1883         rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1884         struct sockaddr *ca = (struct sockaddr *)arg;
1885 
1886         /* Copy client's IP address */
1887         if (ca->sa_family == AF_INET)
1888                 bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in));
1889         else if (ca->sa_family == AF_INET6)
1890                 bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in6));
1891         else
1892                 return (FALSE);
1893         cp->ri_no_referrals = 1;
1894 
1895         return (TRUE);
1896 }
1897 
1898 rfs4_clntip_t *
1899 rfs4_find_clntip(struct sockaddr *addr, bool_t *create)
1900 {
1901         rfs4_clntip_t *cp;
1902 
1903         rw_enter(&rfs4_findclient_lock, RW_READER);
1904 
1905         cp = (rfs4_clntip_t *)rfs4_dbsearch(rfs4_clntip_idx, addr,
1906             create, addr, RFS4_DBS_VALID);
1907 
1908         rw_exit(&rfs4_findclient_lock);
1909 
1910         return (cp);
1911 }
1912 
1913 void
1914 rfs4_invalidate_clntip(struct sockaddr *addr)
1915 {
1916         rfs4_clntip_t *cp;
1917         bool_t create = FALSE;
1918 
1919         rw_enter(&rfs4_findclient_lock, RW_READER);
1920 
1921         cp = (rfs4_clntip_t *)rfs4_dbsearch(rfs4_clntip_idx, addr,
1922             &create, NULL, RFS4_DBS_VALID);
1923         if (cp == NULL) {
1924                 rw_exit(&rfs4_findclient_lock);
1925                 return;
1926         }
1927         rfs4_dbe_invalidate(cp->ri_dbe);
1928         rfs4_dbe_rele(cp->ri_dbe);
1929 
1930         rw_exit(&rfs4_findclient_lock);
1931 }
1932 
1933 bool_t
1934 rfs4_lease_expired(rfs4_client_t *cp)
1935 {
1936         bool_t rc;
1937 
1938         rfs4_dbe_lock(cp->rc_dbe);
1939 
1940         /*
1941          * If the admin has executed clear_locks for this
1942          * client id, force expire will be set, so no need
1943          * to calculate anything because it's "outa here".
1944          */
1945         if (cp->rc_forced_expire) {
1946                 rc = TRUE;
1947         } else {
1948                 rc = (gethrestime_sec() - cp->rc_last_access > rfs4_lease_time);
1949         }
1950 
1951         /*
1952          * If the lease has expired we will also want
1953          * to remove any stable storage state data. So
1954          * mark the client id accordingly.
1955          */
1956         if (!cp->rc_ss_remove)
1957                 cp->rc_ss_remove = (rc == TRUE);
1958 
1959         rfs4_dbe_unlock(cp->rc_dbe);
1960 
1961         return (rc);
1962 }
1963 
1964 void
1965 rfs4_update_lease(rfs4_client_t *cp)
1966 {
1967         rfs4_dbe_lock(cp->rc_dbe);
1968         if (!cp->rc_forced_expire)
1969                 cp->rc_last_access = gethrestime_sec();
1970         rfs4_dbe_unlock(cp->rc_dbe);
1971 }
1972 
1973 
1974 static bool_t
1975 EQOPENOWNER(open_owner4 *a, open_owner4 *b)
1976 {
1977         bool_t rc;
1978 
1979         if (a->clientid != b->clientid)
1980                 return (FALSE);
1981 
1982         if (a->owner_len != b->owner_len)
1983                 return (FALSE);
1984 
1985         rc = (bcmp(a->owner_val, b->owner_val, a->owner_len) == 0);
1986 
1987         return (rc);
1988 }
1989 
1990 static uint_t
1991 openowner_hash(void *key)
1992 {
1993         int i;
1994         open_owner4 *openowner = key;
1995         uint_t hash = 0;
1996 
1997         for (i = 0; i < openowner->owner_len; i++) {
1998                 hash <<= 4;
1999                 hash += (uint_t)openowner->owner_val[i];
2000         }
2001         hash += (uint_t)openowner->clientid;
2002         hash |= (openowner->clientid >> 32);
2003 
2004         return (hash);
2005 }
2006 
2007 static bool_t
2008 openowner_compare(rfs4_entry_t u_entry, void *key)
2009 {
2010         rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2011         open_owner4 *arg = key;
2012 
2013         return (EQOPENOWNER(&oo->ro_owner, arg));
2014 }
2015 
2016 void *
2017 openowner_mkkey(rfs4_entry_t u_entry)
2018 {
2019         rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2020 
2021         return (&oo->ro_owner);
2022 }
2023 
2024 /* ARGSUSED */
2025 static bool_t
2026 rfs4_openowner_expiry(rfs4_entry_t u_entry)
2027 {
2028         /* openstateid held us and did all needed delay */
2029         return (TRUE);
2030 }
2031 
2032 static void
2033 rfs4_openowner_destroy(rfs4_entry_t u_entry)
2034 {
2035         rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2036 
2037         /* Remove open owner from client's lists of open owners */
2038         rfs4_dbe_lock(oo->ro_client->rc_dbe);
2039         list_remove(&oo->ro_client->rc_openownerlist, oo);
2040         rfs4_dbe_unlock(oo->ro_client->rc_dbe);
2041 
2042         /* One less reference to the client */
2043         rfs4_client_rele(oo->ro_client);
2044         oo->ro_client = NULL;
2045 
2046         /* Free the last reply for this lock owner */
2047         rfs4_free_reply(&oo->ro_reply);
2048 
2049         if (oo->ro_reply_fh.nfs_fh4_val) {
2050                 kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2051                     oo->ro_reply_fh.nfs_fh4_len);
2052                 oo->ro_reply_fh.nfs_fh4_val = NULL;
2053                 oo->ro_reply_fh.nfs_fh4_len = 0;
2054         }
2055 
2056         rfs4_sw_destroy(&oo->ro_sw);
2057         list_destroy(&oo->ro_statelist);
2058 
2059         /* Free the lock owner id */
2060         kmem_free(oo->ro_owner.owner_val, oo->ro_owner.owner_len);
2061 }
2062 
2063 void
2064 rfs4_openowner_rele(rfs4_openowner_t *oo)
2065 {
2066         rfs4_dbe_rele(oo->ro_dbe);
2067 }
2068 
2069 static bool_t
2070 rfs4_openowner_create(rfs4_entry_t u_entry, void *arg)
2071 {
2072         rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2073         rfs4_openowner_t *argp = (rfs4_openowner_t *)arg;
2074         open_owner4 *openowner = &argp->ro_owner;
2075         seqid4 seqid = argp->ro_open_seqid;
2076         rfs4_client_t *cp;
2077         bool_t create = FALSE;
2078 
2079         rw_enter(&rfs4_findclient_lock, RW_READER);
2080 
2081         cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
2082             &openowner->clientid,
2083             &create, NULL, RFS4_DBS_VALID);
2084 
2085         rw_exit(&rfs4_findclient_lock);
2086 
2087         if (cp == NULL)
2088                 return (FALSE);
2089 
2090         oo->ro_reply_fh.nfs_fh4_len = 0;
2091         oo->ro_reply_fh.nfs_fh4_val = NULL;
2092 
2093         oo->ro_owner.clientid = openowner->clientid;
2094         oo->ro_owner.owner_val =
2095             kmem_alloc(openowner->owner_len, KM_SLEEP);
2096 
2097         bcopy(openowner->owner_val,
2098             oo->ro_owner.owner_val, openowner->owner_len);
2099 
2100         oo->ro_owner.owner_len = openowner->owner_len;
2101 
2102         oo->ro_need_confirm = TRUE;
2103 
2104         rfs4_sw_init(&oo->ro_sw);
2105 
2106         oo->ro_open_seqid = seqid;
2107         bzero(&oo->ro_reply, sizeof (nfs_resop4));
2108         oo->ro_client = cp;
2109         oo->ro_cr_set = NULL;
2110 
2111         list_create(&oo->ro_statelist, sizeof (rfs4_state_t),
2112             offsetof(rfs4_state_t, rs_node));
2113 
2114         /* Insert openowner into client's open owner list */
2115         rfs4_dbe_lock(cp->rc_dbe);
2116         list_insert_tail(&cp->rc_openownerlist, oo);
2117         rfs4_dbe_unlock(cp->rc_dbe);
2118 
2119         return (TRUE);
2120 }
2121 
2122 rfs4_openowner_t *
2123 rfs4_findopenowner(open_owner4 *openowner, bool_t *create, seqid4 seqid)
2124 {
2125         rfs4_openowner_t *oo;
2126         rfs4_openowner_t arg;
2127 
2128         arg.ro_owner = *openowner;
2129         arg.ro_open_seqid = seqid;
2130         oo = (rfs4_openowner_t *)rfs4_dbsearch(rfs4_openowner_idx, openowner,
2131             create, &arg, RFS4_DBS_VALID);
2132 
2133         return (oo);
2134 }
2135 
2136 void
2137 rfs4_update_open_sequence(rfs4_openowner_t *oo)
2138 {
2139 
2140         rfs4_dbe_lock(oo->ro_dbe);
2141 
2142         oo->ro_open_seqid++;
2143 
2144         rfs4_dbe_unlock(oo->ro_dbe);
2145 }
2146 
2147 void
2148 rfs4_update_open_resp(rfs4_openowner_t *oo, nfs_resop4 *resp, nfs_fh4 *fh)
2149 {
2150 
2151         rfs4_dbe_lock(oo->ro_dbe);
2152 
2153         rfs4_free_reply(&oo->ro_reply);
2154 
2155         rfs4_copy_reply(&oo->ro_reply, resp);
2156 
2157         /* Save the filehandle if provided and free if not used */
2158         if (resp->nfs_resop4_u.opopen.status == NFS4_OK &&
2159             fh && fh->nfs_fh4_len) {
2160                 if (oo->ro_reply_fh.nfs_fh4_val == NULL)
2161                         oo->ro_reply_fh.nfs_fh4_val =
2162                             kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2163                 nfs_fh4_copy(fh, &oo->ro_reply_fh);
2164         } else {
2165                 if (oo->ro_reply_fh.nfs_fh4_val) {
2166                         kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2167                             oo->ro_reply_fh.nfs_fh4_len);
2168                         oo->ro_reply_fh.nfs_fh4_val = NULL;
2169                         oo->ro_reply_fh.nfs_fh4_len = 0;
2170                 }
2171         }
2172 
2173         rfs4_dbe_unlock(oo->ro_dbe);
2174 }
2175 
2176 static bool_t
2177 lockowner_compare(rfs4_entry_t u_entry, void *key)
2178 {
2179         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2180         lock_owner4 *b = (lock_owner4 *)key;
2181 
2182         if (lo->rl_owner.clientid != b->clientid)
2183                 return (FALSE);
2184 
2185         if (lo->rl_owner.owner_len != b->owner_len)
2186                 return (FALSE);
2187 
2188         return (bcmp(lo->rl_owner.owner_val, b->owner_val,
2189             lo->rl_owner.owner_len) == 0);
2190 }
2191 
2192 void *
2193 lockowner_mkkey(rfs4_entry_t u_entry)
2194 {
2195         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2196 
2197         return (&lo->rl_owner);
2198 }
2199 
2200 static uint32_t
2201 lockowner_hash(void *key)
2202 {
2203         int i;
2204         lock_owner4 *lockowner = key;
2205         uint_t hash = 0;
2206 
2207         for (i = 0; i < lockowner->owner_len; i++) {
2208                 hash <<= 4;
2209                 hash += (uint_t)lockowner->owner_val[i];
2210         }
2211         hash += (uint_t)lockowner->clientid;
2212         hash |= (lockowner->clientid >> 32);
2213 
2214         return (hash);
2215 }
2216 
/*
 * Hash for the pid index.  The key is the pid itself carried in the
 * pointer value (see pid_mkkey()), so the hash is that value truncated
 * to 32 bits.
 */
static uint32_t
pid_hash(void *key)
{
        uintptr_t raw = (uintptr_t)key;

        return ((uint32_t)raw);
}
2222 
2223 static void *
2224 pid_mkkey(rfs4_entry_t u_entry)
2225 {
2226         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2227 
2228         return ((void *)(uintptr_t)lo->rl_pid);
2229 }
2230 
2231 static bool_t
2232 pid_compare(rfs4_entry_t u_entry, void *key)
2233 {
2234         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2235 
2236         return (lo->rl_pid == (pid_t)(uintptr_t)key);
2237 }
2238 
2239 static void
2240 rfs4_lockowner_destroy(rfs4_entry_t u_entry)
2241 {
2242         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2243 
2244         /* Free the lock owner id */
2245         kmem_free(lo->rl_owner.owner_val, lo->rl_owner.owner_len);
2246         rfs4_client_rele(lo->rl_client);
2247 }
2248 
2249 void
2250 rfs4_lockowner_rele(rfs4_lockowner_t *lo)
2251 {
2252         rfs4_dbe_rele(lo->rl_dbe);
2253 }
2254 
2255 /* ARGSUSED */
2256 static bool_t
2257 rfs4_lockowner_expiry(rfs4_entry_t u_entry)
2258 {
2259         /*
2260          * Since expiry is called with no other references on
2261          * this struct, go ahead and have it removed.
2262          */
2263         return (TRUE);
2264 }
2265 
2266 static bool_t
2267 rfs4_lockowner_create(rfs4_entry_t u_entry, void *arg)
2268 {
2269         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2270         lock_owner4 *lockowner = (lock_owner4 *)arg;
2271         rfs4_client_t *cp;
2272         bool_t create = FALSE;
2273 
2274         rw_enter(&rfs4_findclient_lock, RW_READER);
2275 
2276         cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
2277             &lockowner->clientid,
2278             &create, NULL, RFS4_DBS_VALID);
2279 
2280         rw_exit(&rfs4_findclient_lock);
2281 
2282         if (cp == NULL)
2283                 return (FALSE);
2284 
2285         /* Reference client */
2286         lo->rl_client = cp;
2287         lo->rl_owner.clientid = lockowner->clientid;
2288         lo->rl_owner.owner_val = kmem_alloc(lockowner->owner_len, KM_SLEEP);
2289         bcopy(lockowner->owner_val, lo->rl_owner.owner_val,
2290             lockowner->owner_len);
2291         lo->rl_owner.owner_len = lockowner->owner_len;
2292         lo->rl_pid = rfs4_dbe_getid(lo->rl_dbe);
2293 
2294         return (TRUE);
2295 }
2296 
2297 rfs4_lockowner_t *
2298 rfs4_findlockowner(lock_owner4 *lockowner, bool_t *create)
2299 {
2300         rfs4_lockowner_t *lo;
2301 
2302         lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_idx, lockowner,
2303             create, lockowner, RFS4_DBS_VALID);
2304 
2305         return (lo);
2306 }
2307 
2308 rfs4_lockowner_t *
2309 rfs4_findlockowner_by_pid(pid_t pid)
2310 {
2311         rfs4_lockowner_t *lo;
2312         bool_t create = FALSE;
2313 
2314         lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_pid_idx,
2315             (void *)(uintptr_t)pid, &create, NULL, RFS4_DBS_VALID);
2316 
2317         return (lo);
2318 }
2319 
2320 
2321 static uint32_t
2322 file_hash(void *key)
2323 {
2324         return (ADDRHASH(key));
2325 }
2326 
2327 static void *
2328 file_mkkey(rfs4_entry_t u_entry)
2329 {
2330         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2331 
2332         return (fp->rf_vp);
2333 }
2334 
2335 static bool_t
2336 file_compare(rfs4_entry_t u_entry, void *key)
2337 {
2338         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2339 
2340         return (fp->rf_vp == (vnode_t *)key);
2341 }
2342 
2343 static void
2344 rfs4_file_destroy(rfs4_entry_t u_entry)
2345 {
2346         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2347 
2348         list_destroy(&fp->rf_delegstatelist);
2349 
2350         if (fp->rf_filehandle.nfs_fh4_val)
2351                 kmem_free(fp->rf_filehandle.nfs_fh4_val,
2352                     fp->rf_filehandle.nfs_fh4_len);
2353         cv_destroy(fp->rf_dinfo.rd_recall_cv);
2354         if (fp->rf_vp) {
2355                 vnode_t *vp = fp->rf_vp;
2356 
2357                 mutex_enter(&vp->v_vsd_lock);
2358                 (void) vsd_set(vp, nfs4_srv_vkey, NULL);
2359                 mutex_exit(&vp->v_vsd_lock);
2360                 VN_RELE(vp);
2361                 fp->rf_vp = NULL;
2362         }
2363         rw_destroy(&fp->rf_file_rwlock);
2364 }
2365 
2366 /*
2367  * Used to unlock the underlying dbe struct only
2368  */
2369 void
2370 rfs4_file_rele(rfs4_file_t *fp)
2371 {
2372         rfs4_dbe_rele(fp->rf_dbe);
2373 }
2374 
2375 typedef struct {
2376     vnode_t *vp;
2377     nfs_fh4 *fh;
2378 } rfs4_fcreate_arg;
2379 
2380 static bool_t
2381 rfs4_file_create(rfs4_entry_t u_entry, void *arg)
2382 {
2383         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2384         rfs4_fcreate_arg *ap = (rfs4_fcreate_arg *)arg;
2385         vnode_t *vp = ap->vp;
2386         nfs_fh4 *fh = ap->fh;
2387 
2388         VN_HOLD(vp);
2389 
2390         fp->rf_filehandle.nfs_fh4_len = 0;
2391         fp->rf_filehandle.nfs_fh4_val = NULL;
2392         ASSERT(fh && fh->nfs_fh4_len);
2393         if (fh && fh->nfs_fh4_len) {
2394                 fp->rf_filehandle.nfs_fh4_val =
2395                     kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2396                 nfs_fh4_copy(fh, &fp->rf_filehandle);
2397         }
2398         fp->rf_vp = vp;
2399 
2400         list_create(&fp->rf_delegstatelist, sizeof (rfs4_deleg_state_t),
2401             offsetof(rfs4_deleg_state_t, rds_node));
2402 
2403         fp->rf_share_deny = fp->rf_share_access = fp->rf_access_read = 0;
2404         fp->rf_access_write = fp->rf_deny_read = fp->rf_deny_write = 0;
2405 
2406         mutex_init(fp->rf_dinfo.rd_recall_lock, NULL, MUTEX_DEFAULT, NULL);
2407         cv_init(fp->rf_dinfo.rd_recall_cv, NULL, CV_DEFAULT, NULL);
2408 
2409         fp->rf_dinfo.rd_dtype = OPEN_DELEGATE_NONE;
2410 
2411         rw_init(&fp->rf_file_rwlock, NULL, RW_DEFAULT, NULL);
2412 
2413         mutex_enter(&vp->v_vsd_lock);
2414         VERIFY(vsd_set(vp, nfs4_srv_vkey, (void *)fp) == 0);
2415         mutex_exit(&vp->v_vsd_lock);
2416 
2417         return (TRUE);
2418 }
2419 
2420 rfs4_file_t *
2421 rfs4_findfile(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2422 {
2423         rfs4_file_t *fp;
2424         rfs4_fcreate_arg arg;
2425 
2426         arg.vp = vp;
2427         arg.fh = fh;
2428 
2429         if (*create == TRUE)
2430                 fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2431                     &arg, RFS4_DBS_VALID);
2432         else {
2433                 mutex_enter(&vp->v_vsd_lock);
2434                 fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2435                 if (fp) {
2436                         rfs4_dbe_lock(fp->rf_dbe);
2437                         if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2438                             (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2439                                 rfs4_dbe_unlock(fp->rf_dbe);
2440                                 fp = NULL;
2441                         } else {
2442                                 rfs4_dbe_hold(fp->rf_dbe);
2443                                 rfs4_dbe_unlock(fp->rf_dbe);
2444                         }
2445                 }
2446                 mutex_exit(&vp->v_vsd_lock);
2447         }
2448         return (fp);
2449 }
2450 
2451 /*
2452  * Find a file in the db and once it is located, take the rw lock.
2453  * Need to check the vnode pointer and if it does not exist (it was
2454  * removed between the db location and check) redo the find.  This
2455  * assumes that a file struct that has a NULL vnode pointer is marked
2456  * at 'invalid' and will not be found in the db the second time
2457  * around.
2458  */
2459 rfs4_file_t *
2460 rfs4_findfile_withlock(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2461 {
2462         rfs4_file_t *fp;
2463         rfs4_fcreate_arg arg;
2464         bool_t screate = *create;
2465 
2466         if (screate == FALSE) {
2467                 mutex_enter(&vp->v_vsd_lock);
2468                 fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2469                 if (fp) {
2470                         rfs4_dbe_lock(fp->rf_dbe);
2471                         if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2472                             (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2473                                 rfs4_dbe_unlock(fp->rf_dbe);
2474                                 mutex_exit(&vp->v_vsd_lock);
2475                                 fp = NULL;
2476                         } else {
2477                                 rfs4_dbe_hold(fp->rf_dbe);
2478                                 rfs4_dbe_unlock(fp->rf_dbe);
2479                                 mutex_exit(&vp->v_vsd_lock);
2480                                 rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2481                                 if (fp->rf_vp == NULL) {
2482                                         rw_exit(&fp->rf_file_rwlock);
2483                                         rfs4_file_rele(fp);
2484                                         fp = NULL;
2485                                 }
2486                         }
2487                 } else {
2488                         mutex_exit(&vp->v_vsd_lock);
2489                 }
2490         } else {
2491 retry:
2492                 arg.vp = vp;
2493                 arg.fh = fh;
2494 
2495                 fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2496                     &arg, RFS4_DBS_VALID);
2497                 if (fp != NULL) {
2498                         rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2499                         if (fp->rf_vp == NULL) {
2500                                 rw_exit(&fp->rf_file_rwlock);
2501                                 rfs4_file_rele(fp);
2502                                 *create = screate;
2503                                 goto retry;
2504                         }
2505                 }
2506         }
2507 
2508         return (fp);
2509 }
2510 
2511 static uint32_t
2512 lo_state_hash(void *key)
2513 {
2514         stateid_t *id = key;
2515 
2516         return (id->bits.ident+id->bits.pid);
2517 }
2518 
2519 static bool_t
2520 lo_state_compare(rfs4_entry_t u_entry, void *key)
2521 {
2522         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2523         stateid_t *id = key;
2524         bool_t rc;
2525 
2526         rc = (lsp->rls_lockid.bits.boottime == id->bits.boottime &&
2527             lsp->rls_lockid.bits.type == id->bits.type &&
2528             lsp->rls_lockid.bits.ident == id->bits.ident &&
2529             lsp->rls_lockid.bits.pid == id->bits.pid);
2530 
2531         return (rc);
2532 }
2533 
2534 static void *
2535 lo_state_mkkey(rfs4_entry_t u_entry)
2536 {
2537         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2538 
2539         return (&lsp->rls_lockid);
2540 }
2541 
2542 static bool_t
2543 rfs4_lo_state_expiry(rfs4_entry_t u_entry)
2544 {
2545         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2546 
2547         if (rfs4_dbe_is_invalid(lsp->rls_dbe))
2548                 return (TRUE);
2549         if (lsp->rls_state->rs_closed)
2550                 return (TRUE);
2551         return ((gethrestime_sec() -
2552             lsp->rls_state->rs_owner->ro_client->rc_last_access
2553             > rfs4_lease_time));
2554 }
2555 
2556 static void
2557 rfs4_lo_state_destroy(rfs4_entry_t u_entry)
2558 {
2559         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2560 
2561         rfs4_dbe_lock(lsp->rls_state->rs_dbe);
2562         list_remove(&lsp->rls_state->rs_lostatelist, lsp);
2563         rfs4_dbe_unlock(lsp->rls_state->rs_dbe);
2564 
2565         rfs4_sw_destroy(&lsp->rls_sw);
2566 
2567         /* Make sure to release the file locks */
2568         if (lsp->rls_locks_cleaned == FALSE) {
2569                 lsp->rls_locks_cleaned = TRUE;
2570                 if (lsp->rls_locker->rl_client->rc_sysidt != LM_NOSYSID) {
2571                         /* Is the PxFS kernel module loaded? */
2572                         if (lm_remove_file_locks != NULL) {
2573                                 int new_sysid;
2574 
2575                                 /* Encode the cluster nodeid in new sysid */
2576                                 new_sysid =
2577                                     lsp->rls_locker->rl_client->rc_sysidt;
2578                                 lm_set_nlmid_flk(&new_sysid);
2579 
2580                                 /*
2581                                  * This PxFS routine removes file locks for a
2582                                  * client over all nodes of a cluster.
2583                                  */
2584                                 DTRACE_PROBE1(nfss_i_clust_rm_lck,
2585                                     int, new_sysid);
2586                                 (*lm_remove_file_locks)(new_sysid);
2587                         } else {
2588                                 (void) cleanlocks(
2589                                     lsp->rls_state->rs_finfo->rf_vp,
2590                                     lsp->rls_locker->rl_pid,
2591                                     lsp->rls_locker->rl_client->rc_sysidt);
2592                         }
2593                 }
2594         }
2595 
2596         /* Free the last reply for this state */
2597         rfs4_free_reply(&lsp->rls_reply);
2598 
2599         rfs4_lockowner_rele(lsp->rls_locker);
2600         lsp->rls_locker = NULL;
2601 
2602         rfs4_state_rele_nounlock(lsp->rls_state);
2603         lsp->rls_state = NULL;
2604 }
2605 
2606 static bool_t
2607 rfs4_lo_state_create(rfs4_entry_t u_entry, void *arg)
2608 {
2609         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2610         rfs4_lo_state_t *argp = (rfs4_lo_state_t *)arg;
2611         rfs4_lockowner_t *lo = argp->rls_locker;
2612         rfs4_state_t *sp = argp->rls_state;
2613 
2614         lsp->rls_state = sp;
2615 
2616         lsp->rls_lockid = sp->rs_stateid;
2617         lsp->rls_lockid.bits.type = LOCKID;
2618         lsp->rls_lockid.bits.chgseq = 0;
2619         lsp->rls_lockid.bits.pid = lo->rl_pid;
2620 
2621         lsp->rls_locks_cleaned = FALSE;
2622         lsp->rls_lock_completed = FALSE;
2623 
2624         rfs4_sw_init(&lsp->rls_sw);
2625 
2626         /* Attached the supplied lock owner */
2627         rfs4_dbe_hold(lo->rl_dbe);
2628         lsp->rls_locker = lo;
2629 
2630         rfs4_dbe_lock(sp->rs_dbe);
2631         list_insert_tail(&sp->rs_lostatelist, lsp);
2632         rfs4_dbe_hold(sp->rs_dbe);
2633         rfs4_dbe_unlock(sp->rs_dbe);
2634 
2635         return (TRUE);
2636 }
2637 
2638 void
2639 rfs4_lo_state_rele(rfs4_lo_state_t *lsp, bool_t unlock_fp)
2640 {
2641         if (unlock_fp == TRUE)
2642                 rw_exit(&lsp->rls_state->rs_finfo->rf_file_rwlock);
2643         rfs4_dbe_rele(lsp->rls_dbe);
2644 }
2645 
2646 static rfs4_lo_state_t *
2647 rfs4_findlo_state(stateid_t *id, bool_t lock_fp)
2648 {
2649         rfs4_lo_state_t *lsp;
2650         bool_t create = FALSE;
2651 
2652         lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_idx, id,
2653             &create, NULL, RFS4_DBS_VALID);
2654         if (lock_fp == TRUE && lsp != NULL)
2655                 rw_enter(&lsp->rls_state->rs_finfo->rf_file_rwlock, RW_READER);
2656 
2657         return (lsp);
2658 }
2659 
2660 
2661 static uint32_t
2662 lo_state_lo_hash(void *key)
2663 {
2664         rfs4_lo_state_t *lsp = key;
2665 
2666         return (ADDRHASH(lsp->rls_locker) ^ ADDRHASH(lsp->rls_state));
2667 }
2668 
2669 static bool_t
2670 lo_state_lo_compare(rfs4_entry_t u_entry, void *key)
2671 {
2672         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2673         rfs4_lo_state_t *keyp = key;
2674 
2675         return (keyp->rls_locker == lsp->rls_locker &&
2676             keyp->rls_state == lsp->rls_state);
2677 }
2678 
2679 static void *
2680 lo_state_lo_mkkey(rfs4_entry_t u_entry)
2681 {
2682         return (u_entry);
2683 }
2684 
2685 rfs4_lo_state_t *
2686 rfs4_findlo_state_by_owner(rfs4_lockowner_t *lo, rfs4_state_t *sp,
2687     bool_t *create)
2688 {
2689         rfs4_lo_state_t *lsp;
2690         rfs4_lo_state_t arg;
2691 
2692         arg.rls_locker = lo;
2693         arg.rls_state = sp;
2694 
2695         lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_owner_idx, &arg,
2696             create, &arg, RFS4_DBS_VALID);
2697 
2698         return (lsp);
2699 }
2700 
2701 static stateid_t
2702 get_stateid(id_t eid)
2703 {
2704         stateid_t id;
2705 
2706         id.bits.boottime = rfs4_start_time;
2707         id.bits.ident = eid;
2708         id.bits.chgseq = 0;
2709         id.bits.type = 0;
2710         id.bits.pid = 0;
2711 
2712         /*
2713          * If we are booted as a cluster node, embed our nodeid.
2714          * We've already done sanity checks in rfs4_client_create() so no
2715          * need to repeat them here.
2716          */
2717         id.bits.clnodeid = (cluster_bootflags & CLUSTER_BOOTED) ?
2718             clconf_get_nodeid() : 0;
2719 
2720         return (id);
2721 }
2722 
2723 /*
2724  * For use only when booted as a cluster node.
2725  * Returns TRUE if the embedded nodeid indicates that this stateid was
2726  * generated on another node.
2727  */
2728 static int
2729 foreign_stateid(stateid_t *id)
2730 {
2731         ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2732         return (id->bits.clnodeid != (uint32_t)clconf_get_nodeid());
2733 }
2734 
2735 /*
2736  * For use only when booted as a cluster node.
2737  * Returns TRUE if the embedded nodeid indicates that this clientid was
2738  * generated on another node.
2739  */
2740 static int
2741 foreign_clientid(cid *cidp)
2742 {
2743         ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2744         return (cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT !=
2745             (uint32_t)clconf_get_nodeid());
2746 }
2747 
2748 /*
2749  * For use only when booted as a cluster node.
2750  * Embed our cluster nodeid into the clientid.
2751  */
2752 static void
2753 embed_nodeid(cid *cidp)
2754 {
2755         int clnodeid;
2756         /*
2757          * Currently, our state tables are small enough that their
2758          * ids will leave enough bits free for the nodeid. If the
2759          * tables become larger, we mustn't overwrite the id.
2760          * Equally, we only have room for so many bits of nodeid, so
2761          * must check that too.
2762          */
2763         ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2764         ASSERT(cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT == 0);
2765         clnodeid = clconf_get_nodeid();
2766         ASSERT(clnodeid <= CLUSTER_MAX_NODEID);
2767         ASSERT(clnodeid != NODEID_UNKNOWN);
2768         cidp->impl_id.c_id |= (clnodeid << CLUSTER_NODEID_SHIFT);
2769 }
2770 
2771 static uint32_t
2772 state_hash(void *key)
2773 {
2774         stateid_t *ip = (stateid_t *)key;
2775 
2776         return (ip->bits.ident);
2777 }
2778 
2779 static bool_t
2780 state_compare(rfs4_entry_t u_entry, void *key)
2781 {
2782         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2783         stateid_t *id = (stateid_t *)key;
2784         bool_t rc;
2785 
2786         rc = (sp->rs_stateid.bits.boottime == id->bits.boottime &&
2787             sp->rs_stateid.bits.ident == id->bits.ident);
2788 
2789         return (rc);
2790 }
2791 
2792 static void *
2793 state_mkkey(rfs4_entry_t u_entry)
2794 {
2795         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2796 
2797         return (&sp->rs_stateid);
2798 }
2799 
2800 static void
2801 rfs4_state_destroy(rfs4_entry_t u_entry)
2802 {
2803         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2804 
2805         /* remove from openowner list */
2806         rfs4_dbe_lock(sp->rs_owner->ro_dbe);
2807         list_remove(&sp->rs_owner->ro_statelist, sp);
2808         rfs4_dbe_unlock(sp->rs_owner->ro_dbe);
2809 
2810         list_destroy(&sp->rs_lostatelist);
2811 
2812         /* release any share locks for this stateid if it's still open */
2813         if (!sp->rs_closed) {
2814                 rfs4_dbe_lock(sp->rs_dbe);
2815                 (void) rfs4_unshare(sp);
2816                 rfs4_dbe_unlock(sp->rs_dbe);
2817         }
2818 
2819         /* Were done with the file */
2820         rfs4_file_rele(sp->rs_finfo);
2821         sp->rs_finfo = NULL;
2822 
2823         /* And now with the openowner */
2824         rfs4_openowner_rele(sp->rs_owner);
2825         sp->rs_owner = NULL;
2826 }
2827 
/*
 * Release a reference on an open state without touching the file's
 * rf_file_rwlock.  Used for states that were looked up with
 * lock_fp == FALSE (see rfs4_get_state_lockit()).
 */
static void
rfs4_state_rele_nounlock(rfs4_state_t *sp)
{
        rfs4_dbe_rele(sp->rs_dbe);
}
2833 
/*
 * Release an open state that was looked up with the file lock held:
 * drop the file's rf_file_rwlock (taken as reader by rfs4_findstate()
 * when lock_fp is TRUE) and then release the reference on the state.
 */
void
rfs4_state_rele(rfs4_state_t *sp)
{
        rw_exit(&sp->rs_finfo->rf_file_rwlock);
        rfs4_dbe_rele(sp->rs_dbe);
}
2840 
2841 static uint32_t
2842 deleg_hash(void *key)
2843 {
2844         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)key;
2845 
2846         return (ADDRHASH(dsp->rds_client) ^ ADDRHASH(dsp->rds_finfo));
2847 }
2848 
2849 static bool_t
2850 deleg_compare(rfs4_entry_t u_entry, void *key)
2851 {
2852         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2853         rfs4_deleg_state_t *kdsp = (rfs4_deleg_state_t *)key;
2854 
2855         return (dsp->rds_client == kdsp->rds_client &&
2856             dsp->rds_finfo == kdsp->rds_finfo);
2857 }
2858 
/*
 * The key is the entry itself: deleg_hash() and deleg_compare() both
 * interpret their key argument as an rfs4_deleg_state_t.
 */
static void *
deleg_mkkey(rfs4_entry_t u_entry)
{
        return (u_entry);
}
2864 
2865 static uint32_t
2866 deleg_state_hash(void *key)
2867 {
2868         stateid_t *ip = (stateid_t *)key;
2869 
2870         return (ip->bits.ident);
2871 }
2872 
2873 static bool_t
2874 deleg_state_compare(rfs4_entry_t u_entry, void *key)
2875 {
2876         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2877         stateid_t *id = (stateid_t *)key;
2878         bool_t rc;
2879 
2880         if (id->bits.type != DELEGID)
2881                 return (FALSE);
2882 
2883         rc = (dsp->rds_delegid.bits.boottime == id->bits.boottime &&
2884             dsp->rds_delegid.bits.ident == id->bits.ident);
2885 
2886         return (rc);
2887 }
2888 
2889 static void *
2890 deleg_state_mkkey(rfs4_entry_t u_entry)
2891 {
2892         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2893 
2894         return (&dsp->rds_delegid);
2895 }
2896 
2897 static bool_t
2898 rfs4_deleg_state_expiry(rfs4_entry_t u_entry)
2899 {
2900         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2901 
2902         if (rfs4_dbe_is_invalid(dsp->rds_dbe))
2903                 return (TRUE);
2904 
2905         if (dsp->rds_dtype == OPEN_DELEGATE_NONE)
2906                 return (TRUE);
2907 
2908         if ((gethrestime_sec() - dsp->rds_client->rc_last_access
2909             > rfs4_lease_time)) {
2910                 rfs4_dbe_invalidate(dsp->rds_dbe);
2911                 return (TRUE);
2912         }
2913 
2914         return (FALSE);
2915 }
2916 
2917 static bool_t
2918 rfs4_deleg_state_create(rfs4_entry_t u_entry, void *argp)
2919 {
2920         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2921         rfs4_file_t *fp = ((rfs4_deleg_state_t *)argp)->rds_finfo;
2922         rfs4_client_t *cp = ((rfs4_deleg_state_t *)argp)->rds_client;
2923 
2924         rfs4_dbe_hold(fp->rf_dbe);
2925         rfs4_dbe_hold(cp->rc_dbe);
2926 
2927         dsp->rds_delegid = get_stateid(rfs4_dbe_getid(dsp->rds_dbe));
2928         dsp->rds_delegid.bits.type = DELEGID;
2929         dsp->rds_finfo = fp;
2930         dsp->rds_client = cp;
2931         dsp->rds_dtype = OPEN_DELEGATE_NONE;
2932 
2933         dsp->rds_time_granted = gethrestime_sec();   /* observability */
2934         dsp->rds_time_revoked = 0;
2935 
2936         list_link_init(&dsp->rds_node);
2937 
2938         return (TRUE);
2939 }
2940 
/*
 * Tear down a delegation state entry: return the delegation, then
 * release the file and the client that the entry refers to.
 */
static void
rfs4_deleg_state_destroy(rfs4_entry_t u_entry)
{
        rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;

        /* return delegation if necessary */
        rfs4_return_deleg(dsp, FALSE);

        /* We're done with the file */
        rfs4_file_rele(dsp->rds_finfo);
        dsp->rds_finfo = NULL;

        /* And now with the client */
        rfs4_client_rele(dsp->rds_client);
        dsp->rds_client = NULL;
}
2957 
2958 rfs4_deleg_state_t *
2959 rfs4_finddeleg(rfs4_state_t *sp, bool_t *create)
2960 {
2961         rfs4_deleg_state_t ds, *dsp;
2962 
2963         ds.rds_client = sp->rs_owner->ro_client;
2964         ds.rds_finfo = sp->rs_finfo;
2965 
2966         dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_idx, &ds,
2967             create, &ds, RFS4_DBS_VALID);
2968 
2969         return (dsp);
2970 }
2971 
2972 rfs4_deleg_state_t *
2973 rfs4_finddelegstate(stateid_t *id)
2974 {
2975         rfs4_deleg_state_t *dsp;
2976         bool_t create = FALSE;
2977 
2978         dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_state_idx, id,
2979             &create, NULL, RFS4_DBS_VALID);
2980 
2981         return (dsp);
2982 }
2983 
/*
 * Release a reference on a delegation state, e.g. one obtained from
 * rfs4_finddelegstate().
 */
void
rfs4_deleg_state_rele(rfs4_deleg_state_t *dsp)
{
        rfs4_dbe_rele(dsp->rds_dbe);
}
2989 
2990 void
2991 rfs4_update_lock_sequence(rfs4_lo_state_t *lsp)
2992 {
2993 
2994         rfs4_dbe_lock(lsp->rls_dbe);
2995 
2996         /*
2997          * If we are skipping sequence id checking, this means that
2998          * this is the first lock request and therefore the sequence
2999          * id does not need to be updated.  This only happens on the
3000          * first lock request for a lockowner
3001          */
3002         if (!lsp->rls_skip_seqid_check)
3003                 lsp->rls_seqid++;
3004 
3005         rfs4_dbe_unlock(lsp->rls_dbe);
3006 }
3007 
/*
 * Replace the reply saved in the lock state with a copy of "resp".
 * The previously saved reply is freed first; both steps happen while
 * holding the entry's dbe lock.
 */
void
rfs4_update_lock_resp(rfs4_lo_state_t *lsp, nfs_resop4 *resp)
{

        rfs4_dbe_lock(lsp->rls_dbe);

        rfs4_free_reply(&lsp->rls_reply);

        rfs4_copy_reply(&lsp->rls_reply, resp);

        rfs4_dbe_unlock(lsp->rls_dbe);
}
3020 
3021 void
3022 rfs4_free_opens(rfs4_openowner_t *oo, bool_t invalidate,
3023     bool_t close_of_client)
3024 {
3025         rfs4_state_t *sp;
3026 
3027         rfs4_dbe_lock(oo->ro_dbe);
3028 
3029         for (sp = list_head(&oo->ro_statelist); sp != NULL;
3030             sp = list_next(&oo->ro_statelist, sp)) {
3031                 rfs4_state_close(sp, FALSE, close_of_client, CRED());
3032                 if (invalidate == TRUE)
3033                         rfs4_dbe_invalidate(sp->rs_dbe);
3034         }
3035 
3036         rfs4_dbe_invalidate(oo->ro_dbe);
3037         rfs4_dbe_unlock(oo->ro_dbe);
3038 }
3039 
3040 static uint32_t
3041 state_owner_file_hash(void *key)
3042 {
3043         rfs4_state_t *sp = key;
3044 
3045         return (ADDRHASH(sp->rs_owner) ^ ADDRHASH(sp->rs_finfo));
3046 }
3047 
3048 static bool_t
3049 state_owner_file_compare(rfs4_entry_t u_entry, void *key)
3050 {
3051         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3052         rfs4_state_t *arg = key;
3053 
3054         if (sp->rs_closed == TRUE)
3055                 return (FALSE);
3056 
3057         return (arg->rs_owner == sp->rs_owner && arg->rs_finfo == sp->rs_finfo);
3058 }
3059 
/*
 * The key is the entry itself: state_owner_file_hash() and
 * state_owner_file_compare() both interpret their key as an rfs4_state_t.
 */
static void *
state_owner_file_mkkey(rfs4_entry_t u_entry)
{
        return (u_entry);
}
3065 
/*
 * Hash on the key pointer itself.  The key here is a file pointer:
 * state_file_mkkey() returns the entry's rs_finfo and
 * state_file_compare() treats its key as an rfs4_file_t.
 */
static uint32_t
state_file_hash(void *key)
{
        return (ADDRHASH(key));
}
3071 
3072 static bool_t
3073 state_file_compare(rfs4_entry_t u_entry, void *key)
3074 {
3075         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3076         rfs4_file_t *fp = key;
3077 
3078         if (sp->rs_closed == TRUE)
3079                 return (FALSE);
3080 
3081         return (fp == sp->rs_finfo);
3082 }
3083 
3084 static void *
3085 state_file_mkkey(rfs4_entry_t u_entry)
3086 {
3087         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3088 
3089         return (sp->rs_finfo);
3090 }
3091 
3092 rfs4_state_t *
3093 rfs4_findstate_by_owner_file(rfs4_openowner_t *oo, rfs4_file_t *fp,
3094         bool_t *create)
3095 {
3096         rfs4_state_t *sp;
3097         rfs4_state_t key;
3098 
3099         key.rs_owner = oo;
3100         key.rs_finfo = fp;
3101 
3102         sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_owner_file_idx, &key,
3103             create, &key, RFS4_DBS_VALID);
3104 
3105         return (sp);
3106 }
3107 
3108 /* This returns ANY state struct that refers to this file */
3109 static rfs4_state_t *
3110 rfs4_findstate_by_file(rfs4_file_t *fp)
3111 {
3112         bool_t create = FALSE;
3113 
3114         return ((rfs4_state_t *)rfs4_dbsearch(rfs4_state_file_idx, fp,
3115             &create, fp, RFS4_DBS_VALID));
3116 }
3117 
3118 static bool_t
3119 rfs4_state_expiry(rfs4_entry_t u_entry)
3120 {
3121         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3122 
3123         if (rfs4_dbe_is_invalid(sp->rs_dbe))
3124                 return (TRUE);
3125 
3126         if (sp->rs_closed == TRUE &&
3127             ((gethrestime_sec() - rfs4_dbe_get_timerele(sp->rs_dbe))
3128             > rfs4_lease_time))
3129                 return (TRUE);
3130 
3131         return ((gethrestime_sec() - sp->rs_owner->ro_client->rc_last_access
3132             > rfs4_lease_time));
3133 }
3134 
3135 static bool_t
3136 rfs4_state_create(rfs4_entry_t u_entry, void *argp)
3137 {
3138         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3139         rfs4_file_t *fp = ((rfs4_state_t *)argp)->rs_finfo;
3140         rfs4_openowner_t *oo = ((rfs4_state_t *)argp)->rs_owner;
3141 
3142         rfs4_dbe_hold(fp->rf_dbe);
3143         rfs4_dbe_hold(oo->ro_dbe);
3144         sp->rs_stateid = get_stateid(rfs4_dbe_getid(sp->rs_dbe));
3145         sp->rs_stateid.bits.type = OPENID;
3146         sp->rs_owner = oo;
3147         sp->rs_finfo = fp;
3148 
3149         list_create(&sp->rs_lostatelist, sizeof (rfs4_lo_state_t),
3150             offsetof(rfs4_lo_state_t, rls_node));
3151 
3152         /* Insert state on per open owner's list */
3153         rfs4_dbe_lock(oo->ro_dbe);
3154         list_insert_tail(&oo->ro_statelist, sp);
3155         rfs4_dbe_unlock(oo->ro_dbe);
3156 
3157         return (TRUE);
3158 }
3159 
3160 static rfs4_state_t *
3161 rfs4_findstate(stateid_t *id, rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3162 {
3163         rfs4_state_t *sp;
3164         bool_t create = FALSE;
3165 
3166         sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_idx, id,
3167             &create, NULL, find_invalid);
3168         if (lock_fp == TRUE && sp != NULL)
3169                 rw_enter(&sp->rs_finfo->rf_file_rwlock, RW_READER);
3170 
3171         return (sp);
3172 }
3173 
3174 void
3175 rfs4_state_close(rfs4_state_t *sp, bool_t lock_held, bool_t close_of_client,
3176     cred_t *cr)
3177 {
3178         /* Remove the associated lo_state owners */
3179         if (!lock_held)
3180                 rfs4_dbe_lock(sp->rs_dbe);
3181 
3182         /*
3183          * If refcnt == 0, the dbe is about to be destroyed.
3184          * lock state will be released by the reaper thread.
3185          */
3186 
3187         if (rfs4_dbe_refcnt(sp->rs_dbe) > 0) {
3188                 if (sp->rs_closed == FALSE) {
3189                         rfs4_release_share_lock_state(sp, cr, close_of_client);
3190                         sp->rs_closed = TRUE;
3191                 }
3192         }
3193 
3194         if (!lock_held)
3195                 rfs4_dbe_unlock(sp->rs_dbe);
3196 }
3197 
3198 /*
3199  * Remove all state associated with the given client.
3200  */
3201 void
3202 rfs4_client_state_remove(rfs4_client_t *cp)
3203 {
3204         rfs4_openowner_t *oo;
3205 
3206         rfs4_dbe_lock(cp->rc_dbe);
3207 
3208         for (oo = list_head(&cp->rc_openownerlist); oo != NULL;
3209             oo = list_next(&cp->rc_openownerlist, oo)) {
3210                 rfs4_free_opens(oo, TRUE, TRUE);
3211         }
3212 
3213         rfs4_dbe_unlock(cp->rc_dbe);
3214 }
3215 
/*
 * Shut down a client: invalidate its dbe (under the dbe lock), remove
 * all of its state via rfs4_client_state_remove(), and finally release
 * the reference on the client.
 */
void
rfs4_client_close(rfs4_client_t *cp)
{
        /* Mark client as going away. */
        rfs4_dbe_lock(cp->rc_dbe);
        rfs4_dbe_invalidate(cp->rc_dbe);
        rfs4_dbe_unlock(cp->rc_dbe);

        rfs4_client_state_remove(cp);

        /* Release the client */
        rfs4_client_rele(cp);
}
3229 
3230 nfsstat4
3231 rfs4_check_clientid(clientid4 *cp, int setclid_confirm)
3232 {
3233         cid *cidp = (cid *) cp;
3234 
3235         /*
3236          * If we are booted as a cluster node, check the embedded nodeid.
3237          * If it indicates that this clientid was generated on another node,
3238          * inform the client accordingly.
3239          */
3240         if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
3241                 return (NFS4ERR_STALE_CLIENTID);
3242 
3243         /*
3244          * If the server start time matches the time provided
3245          * by the client (via the clientid) and this is NOT a
3246          * setclientid_confirm then return EXPIRED.
3247          */
3248         if (!setclid_confirm && cidp->impl_id.start_time == rfs4_start_time)
3249                 return (NFS4ERR_EXPIRED);
3250 
3251         return (NFS4ERR_STALE_CLIENTID);
3252 }
3253 
3254 /*
3255  * This is used when a stateid has not been found amongst the
3256  * current server's state.  Check the stateid to see if it
3257  * was from this server instantiation or not.
3258  */
3259 static nfsstat4
3260 what_stateid_error(stateid_t *id, stateid_type_t type)
3261 {
3262         /* If we are booted as a cluster node, was stateid locally generated? */
3263         if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3264                 return (NFS4ERR_STALE_STATEID);
3265 
3266         /* If types don't match then no use checking further */
3267         if (type != id->bits.type)
3268                 return (NFS4ERR_BAD_STATEID);
3269 
3270         /* From a different server instantiation, return STALE */
3271         if (id->bits.boottime != rfs4_start_time)
3272                 return (NFS4ERR_STALE_STATEID);
3273 
3274         /*
3275          * From this server but the state is most likely beyond lease
3276          * timeout: return NFS4ERR_EXPIRED.  However, there is the
3277          * case of a delegation stateid.  For delegations, there is a
3278          * case where the state can be removed without the client's
3279          * knowledge/consent: revocation.  In the case of delegation
3280          * revocation, the delegation state will be removed and will
3281          * not be found.  If the client does something like a
3282          * DELEGRETURN or even a READ/WRITE with a delegatoin stateid
3283          * that has been revoked, the server should return BAD_STATEID
3284          * instead of the more common EXPIRED error.
3285          */
3286         if (id->bits.boottime == rfs4_start_time) {
3287                 if (type == DELEGID)
3288                         return (NFS4ERR_BAD_STATEID);
3289                 else
3290                         return (NFS4ERR_EXPIRED);
3291         }
3292 
3293         return (NFS4ERR_BAD_STATEID);
3294 }
3295 
3296 /*
3297  * Used later on to find the various state structs.  When called from
3298  * rfs4_check_stateid()->rfs4_get_all_state(), no file struct lock is
3299  * taken (it is not needed) and helps on the read/write path with
3300  * respect to performance.
3301  */
3302 static nfsstat4
3303 rfs4_get_state_lockit(stateid4 *stateid, rfs4_state_t **spp,
3304     rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3305 {
3306         stateid_t *id = (stateid_t *)stateid;
3307         rfs4_state_t *sp;
3308 
3309         *spp = NULL;
3310 
3311         /* If we are booted as a cluster node, was stateid locally generated? */
3312         if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3313                 return (NFS4ERR_STALE_STATEID);
3314 
3315         sp = rfs4_findstate(id, find_invalid, lock_fp);
3316         if (sp == NULL) {
3317                 return (what_stateid_error(id, OPENID));
3318         }
3319 
3320         if (rfs4_lease_expired(sp->rs_owner->ro_client)) {
3321                 if (lock_fp == TRUE)
3322                         rfs4_state_rele(sp);
3323                 else
3324                         rfs4_state_rele_nounlock(sp);
3325                 return (NFS4ERR_EXPIRED);
3326         }
3327 
3328         *spp = sp;
3329 
3330         return (NFS4_OK);
3331 }
3332 
/*
 * Look up an open state by stateid with the file lock taken
 * (rfs4_get_state_lockit() with lock_fp == TRUE).  A state returned
 * through *spp is therefore released with rfs4_state_rele().
 */
nfsstat4
rfs4_get_state(stateid4 *stateid, rfs4_state_t **spp,
    rfs4_dbsearch_type_t find_invalid)
{
        return (rfs4_get_state_lockit(stateid, spp, find_invalid, TRUE));
}
3339 
3340 int
3341 rfs4_check_stateid_seqid(rfs4_state_t *sp, stateid4 *stateid)
3342 {
3343         stateid_t *id = (stateid_t *)stateid;
3344 
3345         if (rfs4_lease_expired(sp->rs_owner->ro_client))
3346                 return (NFS4_CHECK_STATEID_EXPIRED);
3347 
3348         /* Stateid is some time in the future - that's bad */
3349         if (sp->rs_stateid.bits.chgseq < id->bits.chgseq)
3350                 return (NFS4_CHECK_STATEID_BAD);
3351 
3352         if (sp->rs_stateid.bits.chgseq == id->bits.chgseq + 1)
3353                 return (NFS4_CHECK_STATEID_REPLAY);
3354 
3355         /* Stateid is some time in the past - that's old */
3356         if (sp->rs_stateid.bits.chgseq > id->bits.chgseq)
3357                 return (NFS4_CHECK_STATEID_OLD);
3358 
3359         /* Caller needs to know about confirmation before closure */
3360         if (sp->rs_owner->ro_need_confirm)
3361                 return (NFS4_CHECK_STATEID_UNCONFIRMED);
3362 
3363         if (sp->rs_closed == TRUE)
3364                 return (NFS4_CHECK_STATEID_CLOSED);
3365 
3366         return (NFS4_CHECK_STATEID_OKAY);
3367 }
3368 
3369 int
3370 rfs4_check_lo_stateid_seqid(rfs4_lo_state_t *lsp, stateid4 *stateid)
3371 {
3372         stateid_t *id = (stateid_t *)stateid;
3373 
3374         if (rfs4_lease_expired(lsp->rls_state->rs_owner->ro_client))
3375                 return (NFS4_CHECK_STATEID_EXPIRED);
3376 
3377         /* Stateid is some time in the future - that's bad */
3378         if (lsp->rls_lockid.bits.chgseq < id->bits.chgseq)
3379                 return (NFS4_CHECK_STATEID_BAD);
3380 
3381         if (lsp->rls_lockid.bits.chgseq == id->bits.chgseq + 1)
3382                 return (NFS4_CHECK_STATEID_REPLAY);
3383 
3384         /* Stateid is some time in the past - that's old */
3385         if (lsp->rls_lockid.bits.chgseq > id->bits.chgseq)
3386                 return (NFS4_CHECK_STATEID_OLD);
3387 
3388         if (lsp->rls_state->rs_closed == TRUE)
3389                 return (NFS4_CHECK_STATEID_CLOSED);
3390 
3391         return (NFS4_CHECK_STATEID_OKAY);
3392 }
3393 
3394 nfsstat4
3395 rfs4_get_deleg_state(stateid4 *stateid, rfs4_deleg_state_t **dspp)
3396 {
3397         stateid_t *id = (stateid_t *)stateid;
3398         rfs4_deleg_state_t *dsp;
3399 
3400         *dspp = NULL;
3401 
3402         /* If we are booted as a cluster node, was stateid locally generated? */
3403         if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3404                 return (NFS4ERR_STALE_STATEID);
3405 
3406         dsp = rfs4_finddelegstate(id);
3407         if (dsp == NULL) {
3408                 return (what_stateid_error(id, DELEGID));
3409         }
3410 
3411         if (rfs4_lease_expired(dsp->rds_client)) {
3412                 rfs4_deleg_state_rele(dsp);
3413                 return (NFS4ERR_EXPIRED);
3414         }
3415 
3416         *dspp = dsp;
3417 
3418         return (NFS4_OK);
3419 }
3420 
3421 nfsstat4
3422 rfs4_get_lo_state(stateid4 *stateid, rfs4_lo_state_t **lspp, bool_t lock_fp)
3423 {
3424         stateid_t *id = (stateid_t *)stateid;
3425         rfs4_lo_state_t *lsp;
3426 
3427         *lspp = NULL;
3428 
3429         /* If we are booted as a cluster node, was stateid locally generated? */
3430         if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3431                 return (NFS4ERR_STALE_STATEID);
3432 
3433         lsp = rfs4_findlo_state(id, lock_fp);
3434         if (lsp == NULL) {
3435                 return (what_stateid_error(id, LOCKID));
3436         }
3437 
3438         if (rfs4_lease_expired(lsp->rls_state->rs_owner->ro_client)) {
3439                 rfs4_lo_state_rele(lsp, lock_fp);
3440                 return (NFS4ERR_EXPIRED);
3441         }
3442 
3443         *lspp = lsp;
3444 
3445         return (NFS4_OK);
3446 }
3447 
3448 static nfsstat4
3449 rfs4_get_all_state(stateid4 *sid, rfs4_state_t **spp,
3450     rfs4_deleg_state_t **dspp, rfs4_lo_state_t **lspp)
3451 {
3452         rfs4_state_t *sp = NULL;
3453         rfs4_deleg_state_t *dsp = NULL;
3454         rfs4_lo_state_t *lsp = NULL;
3455         stateid_t *id;
3456         nfsstat4 status;
3457 
3458         *spp = NULL; *dspp = NULL; *lspp = NULL;
3459 
3460         id = (stateid_t *)sid;
3461         switch (id->bits.type) {
3462         case OPENID:
3463                 status = rfs4_get_state_lockit(sid, &sp, FALSE, FALSE);
3464                 break;
3465         case DELEGID:
3466                 status = rfs4_get_deleg_state(sid, &dsp);
3467                 break;
3468         case LOCKID:
3469                 status = rfs4_get_lo_state(sid, &lsp, FALSE);
3470                 if (status == NFS4_OK) {
3471                         sp = lsp->rls_state;
3472                         rfs4_dbe_hold(sp->rs_dbe);
3473                 }
3474                 break;
3475         default:
3476                 status = NFS4ERR_BAD_STATEID;
3477         }
3478 
3479         if (status == NFS4_OK) {
3480                 *spp = sp;
3481                 *dspp = dsp;
3482                 *lspp = lsp;
3483         }
3484 
3485         return (status);
3486 }
3487 
3488 /*
3489  * Given the I/O mode (FREAD or FWRITE), this checks whether the
3490  * rfs4_state_t struct has access to do this operation and if so
3491  * return NFS4_OK; otherwise the proper NFSv4 error is returned.
3492  */
3493 nfsstat4
3494 rfs4_state_has_access(rfs4_state_t *sp, int mode, vnode_t *vp)
3495 {
3496         nfsstat4 stat = NFS4_OK;
3497         rfs4_file_t *fp;
3498         bool_t create = FALSE;
3499 
3500         rfs4_dbe_lock(sp->rs_dbe);
3501         if (mode == FWRITE) {
3502                 if (!(sp->rs_share_access & OPEN4_SHARE_ACCESS_WRITE)) {
3503                         stat = NFS4ERR_OPENMODE;
3504                 }
3505         } else if (mode == FREAD) {
3506                 if (!(sp->rs_share_access & OPEN4_SHARE_ACCESS_READ)) {
3507                         /*
3508                          * If we have OPENed the file with DENYing access
3509                          * to both READ and WRITE then no one else could
3510                          * have OPENed the file, hence no conflicting READ
3511                          * deny.  This check is merely an optimization.
3512                          */
3513                         if (sp->rs_share_deny == OPEN4_SHARE_DENY_BOTH)
3514                                 goto out;
3515 
3516                         /* Check against file struct's DENY mode */
3517                         fp = rfs4_findfile(vp, NULL, &create);
3518                         if (fp != NULL) {
3519                                 int deny_read = 0;
3520                                 rfs4_dbe_lock(fp->rf_dbe);
3521                                 /*
3522                                  * Check if any other open owner has the file
3523                                  * OPENed with deny READ.
3524                                  */
3525                                 if (sp->rs_share_deny & OPEN4_SHARE_DENY_READ)
3526                                         deny_read = 1;
3527                                 ASSERT(fp->rf_deny_read >= deny_read);
3528                                 if (fp->rf_deny_read > deny_read)
3529                                         stat = NFS4ERR_OPENMODE;
3530                                 rfs4_dbe_unlock(fp->rf_dbe);
3531                                 rfs4_file_rele(fp);
3532                         }
3533                 }
3534         } else {
3535                 /* Illegal I/O mode */
3536                 stat = NFS4ERR_INVAL;
3537         }
3538 out:
3539         rfs4_dbe_unlock(sp->rs_dbe);
3540         return (stat);
3541 }
3542 
3543 /*
3544  * Given the I/O mode (FREAD or FWRITE), the vnode, the stateid and whether
3545  * the file is being truncated, return NFS4_OK if allowed or appropriate
3546  * V4 error if not. Note NFS4ERR_DELAY will be returned and a recall on
3547  * the associated file will be done if the I/O is not consistent with any
3548  * delegation in effect on the file. Should be holding VOP_RWLOCK, either
3549  * as reader or writer as appropriate. rfs4_op_open will acquire the
3550  * VOP_RWLOCK as writer when setting up delegation. If the stateid is bad
3551  * this routine will return NFS4ERR_BAD_STATEID. In addition, through the
3552  * deleg parameter, we will return whether a write delegation is held by
3553  * the client associated with this stateid.
3554  * If the server instance associated with the relevant client is in its
3555  * grace period, return NFS4ERR_GRACE.
3556  */
3557 
3558 nfsstat4
3559 rfs4_check_stateid(int mode, vnode_t *vp,
3560     stateid4 *stateid, bool_t trunc, bool_t *deleg,
3561     bool_t do_access, caller_context_t *ct)
3562 {
3563         rfs4_file_t *fp;
3564         bool_t create = FALSE;
3565         rfs4_state_t *sp;
3566         rfs4_deleg_state_t *dsp;
3567         rfs4_lo_state_t *lsp;
3568         stateid_t *id = (stateid_t *)stateid;
3569         nfsstat4 stat = NFS4_OK;
3570 
3571         if (ct != NULL) {
3572                 ct->cc_sysid = 0;
3573                 ct->cc_pid = 0;
3574                 ct->cc_caller_id = nfs4_srv_caller_id;
3575                 ct->cc_flags = CC_DONTBLOCK;
3576         }
3577 
3578         if (ISSPECIAL(stateid)) {
3579                 fp = rfs4_findfile(vp, NULL, &create);
3580                 if (fp == NULL)
3581                         return (NFS4_OK);
3582                 if (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_NONE) {
3583                         rfs4_file_rele(fp);
3584                         return (NFS4_OK);
3585                 }
3586                 if (mode == FWRITE ||
3587                     fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_WRITE) {
3588                         rfs4_recall_deleg(fp, trunc, NULL);
3589                         rfs4_file_rele(fp);
3590                         return (NFS4ERR_DELAY);
3591                 }
3592                 rfs4_file_rele(fp);
3593                 return (NFS4_OK);
3594         } else {
3595                 stat = rfs4_get_all_state(stateid, &sp, &dsp, &lsp);
3596                 if (stat != NFS4_OK)
3597                         return (stat);
3598                 if (lsp != NULL) {
3599                         /* Is associated server instance in its grace period? */
3600                         if (rfs4_clnt_in_grace(lsp->rls_locker->rl_client)) {
3601                                 rfs4_lo_state_rele(lsp, FALSE);
3602                                 if (sp != NULL)
3603                                         rfs4_state_rele_nounlock(sp);
3604                                 return (NFS4ERR_GRACE);
3605                         }
3606                         if (id->bits.type == LOCKID) {
3607                                 /* Seqid in the future? - that's bad */
3608                                 if (lsp->rls_lockid.bits.chgseq <
3609                                     id->bits.chgseq) {
3610                                         rfs4_lo_state_rele(lsp, FALSE);
3611                                         if (sp != NULL)
3612                                                 rfs4_state_rele_nounlock(sp);
3613                                         return (NFS4ERR_BAD_STATEID);
3614                                 }
3615                                 /* Seqid in the past? - that's old */
3616                                 if (lsp->rls_lockid.bits.chgseq >
3617                                     id->bits.chgseq) {
3618                                         rfs4_lo_state_rele(lsp, FALSE);
3619                                         if (sp != NULL)
3620                                                 rfs4_state_rele_nounlock(sp);
3621                                         return (NFS4ERR_OLD_STATEID);
3622                                 }
3623                                 /* Ensure specified filehandle matches */
3624                                 if (lsp->rls_state->rs_finfo->rf_vp != vp) {
3625                                         rfs4_lo_state_rele(lsp, FALSE);
3626                                         if (sp != NULL)
3627                                                 rfs4_state_rele_nounlock(sp);
3628                                         return (NFS4ERR_BAD_STATEID);
3629                                 }
3630                         }
3631                         if (ct != NULL) {
3632                                 ct->cc_sysid =
3633                                     lsp->rls_locker->rl_client->rc_sysidt;
3634                                 ct->cc_pid = lsp->rls_locker->rl_pid;
3635                         }
3636                         rfs4_lo_state_rele(lsp, FALSE);
3637                 }
3638 
3639                 /* Stateid provided was an "open" stateid */
3640                 if (sp != NULL) {
3641                         /* Is associated server instance in its grace period? */
3642                         if (rfs4_clnt_in_grace(sp->rs_owner->ro_client)) {
3643                                 rfs4_state_rele_nounlock(sp);
3644                                 return (NFS4ERR_GRACE);
3645                         }
3646                         if (id->bits.type == OPENID) {
3647                                 /* Seqid in the future? - that's bad */
3648                                 if (sp->rs_stateid.bits.chgseq <
3649                                     id->bits.chgseq) {
3650                                         rfs4_state_rele_nounlock(sp);
3651                                         return (NFS4ERR_BAD_STATEID);
3652                                 }
3653                                 /* Seqid in the past - that's old */
3654                                 if (sp->rs_stateid.bits.chgseq >
3655                                     id->bits.chgseq) {
3656                                         rfs4_state_rele_nounlock(sp);
3657                                         return (NFS4ERR_OLD_STATEID);
3658                                 }
3659                         }
3660                         /* Ensure specified filehandle matches */
3661                         if (sp->rs_finfo->rf_vp != vp) {
3662                                 rfs4_state_rele_nounlock(sp);
3663                                 return (NFS4ERR_BAD_STATEID);
3664                         }
3665 
3666                         if (sp->rs_owner->ro_need_confirm) {
3667                                 rfs4_state_rele_nounlock(sp);
3668                                 return (NFS4ERR_BAD_STATEID);
3669                         }
3670 
3671                         if (sp->rs_closed == TRUE) {
3672                                 rfs4_state_rele_nounlock(sp);
3673                                 return (NFS4ERR_OLD_STATEID);
3674                         }
3675 
3676                         if (do_access)
3677                                 stat = rfs4_state_has_access(sp, mode, vp);
3678                         else
3679                                 stat = NFS4_OK;
3680 
3681                         /*
3682                          * Return whether this state has write
3683                          * delegation if desired
3684                          */
3685                         if (deleg && (sp->rs_finfo->rf_dinfo.rd_dtype ==
3686                             OPEN_DELEGATE_WRITE))
3687                                 *deleg = TRUE;
3688 
3689                         /*
3690                          * We got a valid stateid, so we update the
3691                          * lease on the client. Ideally we would like
3692                          * to do this after the calling op succeeds,
3693                          * but for now this will be good
3694                          * enough. Callers of this routine are
3695                          * currently insulated from the state stuff.
3696                          */
3697                         rfs4_update_lease(sp->rs_owner->ro_client);
3698 
3699                         /*
3700                          * If a delegation is present on this file and
3701                          * this is a WRITE, then update the lastwrite
3702                          * time to indicate that activity is present.
3703                          */
3704                         if (sp->rs_finfo->rf_dinfo.rd_dtype ==
3705                             OPEN_DELEGATE_WRITE &&
3706                             mode == FWRITE) {
3707                                 sp->rs_finfo->rf_dinfo.rd_time_lastwrite =
3708                                     gethrestime_sec();
3709                         }
3710 
3711                         rfs4_state_rele_nounlock(sp);
3712 
3713                         return (stat);
3714                 }
3715 
3716                 if (dsp != NULL) {
3717                         /* Is associated server instance in its grace period? */
3718                         if (rfs4_clnt_in_grace(dsp->rds_client)) {
3719                                 rfs4_deleg_state_rele(dsp);
3720                                 return (NFS4ERR_GRACE);
3721                         }
3722                         if (dsp->rds_delegid.bits.chgseq != id->bits.chgseq) {
3723                                 rfs4_deleg_state_rele(dsp);
3724                                 return (NFS4ERR_BAD_STATEID);
3725                         }
3726 
3727                         /* Ensure specified filehandle matches */
3728                         if (dsp->rds_finfo->rf_vp != vp) {
3729                                 rfs4_deleg_state_rele(dsp);
3730                                 return (NFS4ERR_BAD_STATEID);
3731                         }
3732                         /*
3733                          * Return whether this state has write
3734                          * delegation if desired
3735                          */
3736                         if (deleg && (dsp->rds_finfo->rf_dinfo.rd_dtype ==
3737                             OPEN_DELEGATE_WRITE))
3738                                 *deleg = TRUE;
3739 
3740                         rfs4_update_lease(dsp->rds_client);
3741 
3742                         /*
3743                          * If a delegation is present on this file and
3744                          * this is a WRITE, then update the lastwrite
3745                          * time to indicate that activity is present.
3746                          */
3747                         if (dsp->rds_finfo->rf_dinfo.rd_dtype ==
3748                             OPEN_DELEGATE_WRITE && mode == FWRITE) {
3749                                 dsp->rds_finfo->rf_dinfo.rd_time_lastwrite =
3750                                     gethrestime_sec();
3751                         }
3752 
3753                         /*
3754                          * XXX - what happens if this is a WRITE and the
3755                          * delegation type of for READ.
3756                          */
3757                         rfs4_deleg_state_rele(dsp);
3758 
3759                         return (stat);
3760                 }
3761                 /*
3762                  * If we got this far, something bad happened
3763                  */
3764                 return (NFS4ERR_BAD_STATEID);
3765         }
3766 }
3767 
3768 
3769 /*
3770  * This is a special function in that for the file struct provided the
3771  * server wants to remove/close all current state associated with the
3772  * file.  The prime use of this would be with OP_REMOVE to force the
3773  * release of state and particularly of file locks.
3774  *
3775  * There is an assumption that there is no delegations outstanding on
3776  * this file at this point.  The caller should have waited for those
3777  * to be returned or revoked.
3778  */
3779 void
3780 rfs4_close_all_state(rfs4_file_t *fp)
3781 {
3782         rfs4_state_t *sp;
3783 
3784         rfs4_dbe_lock(fp->rf_dbe);
3785 
3786 #ifdef DEBUG
3787         /* only applies when server is handing out delegations */
3788         if (rfs4_deleg_policy != SRV_NEVER_DELEGATE)
3789                 ASSERT(fp->rf_dinfo.rd_hold_grant > 0);
3790 #endif
3791 
3792         /* No delegations for this file */
3793         ASSERT(list_is_empty(&fp->rf_delegstatelist));
3794 
3795         /* Make sure that it can not be found */
3796         rfs4_dbe_invalidate(fp->rf_dbe);
3797 
3798         if (fp->rf_vp == NULL) {
3799                 rfs4_dbe_unlock(fp->rf_dbe);
3800                 return;
3801         }
3802         rfs4_dbe_unlock(fp->rf_dbe);
3803 
3804         /*
3805          * Hold as writer to prevent other server threads from
3806          * processing requests related to the file while all state is
3807          * being removed.
3808          */
3809         rw_enter(&fp->rf_file_rwlock, RW_WRITER);
3810 
3811         /* Remove ALL state from the file */
3812         while (sp = rfs4_findstate_by_file(fp)) {
3813                 rfs4_state_close(sp, FALSE, FALSE, CRED());
3814                 rfs4_state_rele_nounlock(sp);
3815         }
3816 
3817         /*
3818          * This is only safe since there are no further references to
3819          * the file.
3820          */
3821         rfs4_dbe_lock(fp->rf_dbe);
3822         if (fp->rf_vp) {
3823                 vnode_t *vp = fp->rf_vp;
3824 
3825                 mutex_enter(&vp->v_vsd_lock);
3826                 (void) vsd_set(vp, nfs4_srv_vkey, NULL);
3827                 mutex_exit(&vp->v_vsd_lock);
3828                 VN_RELE(vp);
3829                 fp->rf_vp = NULL;
3830         }
3831         rfs4_dbe_unlock(fp->rf_dbe);
3832 
3833         /* Finally let other references to proceed */
3834         rw_exit(&fp->rf_file_rwlock);
3835 }
3836 
3837 /*
3838  * This function is used as a target for the rfs4_dbe_walk() call
3839  * below.  The purpose of this function is to see if the
3840  * lockowner_state refers to a file that resides within the exportinfo
3841  * export.  If so, then remove the lock_owner state (file locks and
3842  * share "locks") for this object since the intent is the server is
3843  * unexporting the specified directory.  Be sure to invalidate the
3844  * object after the state has been released
3845  */
3846 static void
3847 rfs4_lo_state_walk_callout(rfs4_entry_t u_entry, void *e)
3848 {
3849         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
3850         struct exportinfo *exi = (struct exportinfo *)e;
3851         nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3852         fhandle_t *efhp;
3853 
3854         efhp = (fhandle_t *)&exi->exi_fh;
3855         exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3856 
3857         FH_TO_FMT4(efhp, exi_fhp);
3858 
3859         finfo_fhp = (nfs_fh4_fmt_t *)lsp->rls_state->rs_finfo->
3860             rf_filehandle.nfs_fh4_val;
3861 
3862         if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3863             bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3864             exi_fhp->fh4_xlen) == 0) {
3865                 rfs4_state_close(lsp->rls_state, FALSE, FALSE, CRED());
3866                 rfs4_dbe_invalidate(lsp->rls_dbe);
3867                 rfs4_dbe_invalidate(lsp->rls_state->rs_dbe);
3868         }
3869 }
3870 
3871 /*
3872  * This function is used as a target for the rfs4_dbe_walk() call
3873  * below.  The purpose of this function is to see if the state refers
3874  * to a file that resides within the exportinfo export.  If so, then
3875  * remove the open state for this object since the intent is the
3876  * server is unexporting the specified directory.  The main result for
3877  * this type of entry is to invalidate it such it will not be found in
3878  * the future.
3879  */
3880 static void
3881 rfs4_state_walk_callout(rfs4_entry_t u_entry, void *e)
3882 {
3883         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3884         struct exportinfo *exi = (struct exportinfo *)e;
3885         nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3886         fhandle_t *efhp;
3887 
3888         efhp = (fhandle_t *)&exi->exi_fh;
3889         exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3890 
3891         FH_TO_FMT4(efhp, exi_fhp);
3892 
3893         finfo_fhp =
3894             (nfs_fh4_fmt_t *)sp->rs_finfo->rf_filehandle.nfs_fh4_val;
3895 
3896         if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3897             bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3898             exi_fhp->fh4_xlen) == 0) {
3899                 rfs4_state_close(sp, TRUE, FALSE, CRED());
3900                 rfs4_dbe_invalidate(sp->rs_dbe);
3901         }
3902 }
3903 
3904 /*
3905  * This function is used as a target for the rfs4_dbe_walk() call
3906  * below.  The purpose of this function is to see if the state refers
3907  * to a file that resides within the exportinfo export.  If so, then
3908  * remove the deleg state for this object since the intent is the
3909  * server is unexporting the specified directory.  The main result for
3910  * this type of entry is to invalidate it such it will not be found in
3911  * the future.
3912  */
3913 static void
3914 rfs4_deleg_state_walk_callout(rfs4_entry_t u_entry, void *e)
3915 {
3916         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3917         struct exportinfo *exi = (struct exportinfo *)e;
3918         nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3919         fhandle_t *efhp;
3920 
3921         efhp = (fhandle_t *)&exi->exi_fh;
3922         exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3923 
3924         FH_TO_FMT4(efhp, exi_fhp);
3925 
3926         finfo_fhp =
3927             (nfs_fh4_fmt_t *)dsp->rds_finfo->rf_filehandle.nfs_fh4_val;
3928 
3929         if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3930             bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3931             exi_fhp->fh4_xlen) == 0) {
3932                 rfs4_dbe_invalidate(dsp->rds_dbe);
3933         }
3934 }
3935 
3936 /*
3937  * This function is used as a target for the rfs4_dbe_walk() call
3938  * below.  The purpose of this function is to see if the state refers
3939  * to a file that resides within the exportinfo export.  If so, then
3940  * release vnode hold for this object since the intent is the server
3941  * is unexporting the specified directory.  Invalidation will prevent
3942  * this struct from being found in the future.
3943  */
3944 static void
3945 rfs4_file_walk_callout(rfs4_entry_t u_entry, void *e)
3946 {
3947         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
3948         struct exportinfo *exi = (struct exportinfo *)e;
3949         nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3950         fhandle_t *efhp;
3951 
3952         efhp = (fhandle_t *)&exi->exi_fh;
3953         exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3954 
3955         FH_TO_FMT4(efhp, exi_fhp);
3956 
3957         finfo_fhp = (nfs_fh4_fmt_t *)fp->rf_filehandle.nfs_fh4_val;
3958 
3959         if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3960             bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3961             exi_fhp->fh4_xlen) == 0) {
3962                 if (fp->rf_vp) {
3963                         vnode_t *vp = fp->rf_vp;
3964 
3965                         /*
3966                          * don't leak monitors and remove the reference
3967                          * put on the vnode when the delegation was granted.
3968                          */
3969                         if (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_READ) {
3970                                 (void) fem_uninstall(vp, deleg_rdops,
3971                                     (void *)fp);
3972                                 vn_open_downgrade(vp, FREAD);
3973                         } else if (fp->rf_dinfo.rd_dtype ==
3974                             OPEN_DELEGATE_WRITE) {
3975                                 (void) fem_uninstall(vp, deleg_wrops,
3976                                     (void *)fp);
3977                                 vn_open_downgrade(vp, FREAD|FWRITE);
3978                         }
3979                         mutex_enter(&vp->v_vsd_lock);
3980                         (void) vsd_set(vp, nfs4_srv_vkey, NULL);
3981                         mutex_exit(&vp->v_vsd_lock);
3982                         VN_RELE(vp);
3983                         fp->rf_vp = NULL;
3984                 }
3985                 rfs4_dbe_invalidate(fp->rf_dbe);
3986         }
3987 }
3988 
3989 /*
3990  * Given a directory that is being unexported, cleanup/release all
3991  * state in the server that refers to objects residing underneath this
3992  * particular export.  The ordering of the release is important.
3993  * Lock_owner, then state and then file.
3994  */
3995 void
3996 rfs4_clean_state_exi(struct exportinfo *exi)
3997 {
3998         mutex_enter(&rfs4_state_lock);
3999 
4000         if (rfs4_server_state == NULL) {
4001                 mutex_exit(&rfs4_state_lock);
4002                 return;
4003         }
4004 
4005         rfs4_dbe_walk(rfs4_lo_state_tab, rfs4_lo_state_walk_callout, exi);
4006         rfs4_dbe_walk(rfs4_state_tab, rfs4_state_walk_callout, exi);
4007         rfs4_dbe_walk(rfs4_deleg_state_tab, rfs4_deleg_state_walk_callout, exi);
4008         rfs4_dbe_walk(rfs4_file_tab, rfs4_file_walk_callout, exi);
4009 
4010         mutex_exit(&rfs4_state_lock);
4011 }