5045 use atomic_{inc,dec}_* instead of atomic_add_*
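The change itself is mechanical: atomic_add_64() calls that add a constant 1 become atomic_inc_64(), and those that add -1 become atomic_dec_64(); calls that add a variable amount, such as the -n adjustment in clreclaim_zone() below, stay as atomic_add_64(). A minimal sketch of the pattern, assuming the illumos <sys/atomic.h> interfaces:

	#include <sys/atomic.h>

	static volatile uint64_t counter;

	/* before: generic add with a constant delta of +1 or -1 */
	atomic_add_64(&counter, 1);
	atomic_add_64(&counter, -1);

	/* after: dedicated single-step primitives */
	atomic_inc_64(&counter);
	atomic_dec_64(&counter);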
--- old/usr/src/uts/common/fs/nfs/nfs_subr.c
+++ new/usr/src/uts/common/fs/nfs/nfs_subr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 28 */
29 29
30 30 #include <sys/param.h>
31 31 #include <sys/types.h>
32 32 #include <sys/systm.h>
33 33 #include <sys/cred.h>
34 34 #include <sys/proc.h>
35 35 #include <sys/user.h>
36 36 #include <sys/time.h>
37 37 #include <sys/buf.h>
38 38 #include <sys/vfs.h>
39 39 #include <sys/vnode.h>
40 40 #include <sys/socket.h>
41 41 #include <sys/uio.h>
42 42 #include <sys/tiuser.h>
43 43 #include <sys/swap.h>
44 44 #include <sys/errno.h>
45 45 #include <sys/debug.h>
46 46 #include <sys/kmem.h>
47 47 #include <sys/kstat.h>
48 48 #include <sys/cmn_err.h>
49 49 #include <sys/vtrace.h>
50 50 #include <sys/session.h>
51 51 #include <sys/dnlc.h>
52 52 #include <sys/bitmap.h>
53 53 #include <sys/acl.h>
54 54 #include <sys/ddi.h>
55 55 #include <sys/pathname.h>
56 56 #include <sys/flock.h>
57 57 #include <sys/dirent.h>
58 58 #include <sys/flock.h>
59 59 #include <sys/callb.h>
60 60 #include <sys/atomic.h>
61 61 #include <sys/list.h>
62 62 #include <sys/tsol/tnet.h>
63 63 #include <sys/priv.h>
64 64 #include <sys/sdt.h>
65 65 #include <sys/attr.h>
66 66
67 67 #include <inet/ip6.h>
68 68
69 69 #include <rpc/types.h>
70 70 #include <rpc/xdr.h>
71 71 #include <rpc/auth.h>
72 72 #include <rpc/clnt.h>
73 73
74 74 #include <nfs/nfs.h>
75 75 #include <nfs/nfs4.h>
76 76 #include <nfs/nfs_clnt.h>
77 77 #include <nfs/rnode.h>
78 78 #include <nfs/nfs_acl.h>
79 79
80 80 #include <sys/tsol/label.h>
81 81
82 82 /*
83 83 * The hash queues for the access to active and cached rnodes
84 84 * are organized as doubly linked lists. A reader/writer lock
85 85 * for each hash bucket is used to control access and to synchronize
86 86 * lookups, additions, and deletions from the hash queue.
87 87 *
88 88 * The rnode freelist is organized as a doubly linked list with
89 89 * a head pointer. Additions and deletions are synchronized via
90 90 * a single mutex.
91 91 *
92 92 * In order to add an rnode to the free list, it must be hashed into
93 93	 * a hash queue and the exclusive lock to the hash queue must be held.
94 94 * If an rnode is not hashed into a hash queue, then it is destroyed
95 95	 * because it holds no valuable information about the file that
96 96	 * can be reused. The exclusive lock to the hash queue must be
97 97 * held in order to prevent a lookup in the hash queue from finding
98 98 * the rnode and using it and assuming that the rnode is not on the
99 99 * freelist. The lookup in the hash queue will have the hash queue
100 100 * locked, either exclusive or shared.
101 101 *
102 102 * The vnode reference count for each rnode is not allowed to drop
103 103 * below 1. This prevents external entities, such as the VM
104 104 * subsystem, from acquiring references to vnodes already on the
105 105 * freelist and then trying to place them back on the freelist
106 106	 * when their reference is released. This means that when an
107 107	 * rnode is looked up in the hash queues, either the rnode
108 108 * is removed from the freelist and that reference is transferred to
109 109 * the new reference or the vnode reference count must be incremented
110 110 * accordingly. The mutex for the freelist must be held in order to
111 111 * accurately test to see if the rnode is on the freelist or not.
112 112 * The hash queue lock might be held shared and it is possible that
113 113 * two different threads may race to remove the rnode from the
114 114 * freelist. This race can be resolved by holding the mutex for the
115 115 * freelist. Please note that the mutex for the freelist does not
116 116	 * need to be held if the rnode is not on the freelist. It cannot be
117 117 * placed on the freelist due to the requirement that the thread
118 118 * putting the rnode on the freelist must hold the exclusive lock
119 119 * to the hash queue and the thread doing the lookup in the hash
120 120 * queue is holding either a shared or exclusive lock to the hash
121 121 * queue.
122 122 *
123 123 * The lock ordering is:
124 124 *
125 125 * hash bucket lock -> vnode lock
126 126 * hash bucket lock -> freelist lock
127 127 */
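A hypothetical user-space sketch of the documented lock ordering (hash bucket lock before the freelist lock), with pthread locks standing in for the kernel rwlock and mutex:

	#include <pthread.h>

	static pthread_rwlock_t bucket_lock;	/* stands in for the hash bucket rwlock */
	static pthread_mutex_t freelist_lock;	/* stands in for rpfreelist_lock */

	/*
	 * Removing an entry from the freelist: take the bucket lock
	 * first, then the freelist mutex, never the reverse.
	 */
	static void
	remove_from_freelist(void)
	{
		(void) pthread_rwlock_wrlock(&bucket_lock);
		(void) pthread_mutex_lock(&freelist_lock);
		/* ... unlink the entry from the freelist ... */
		(void) pthread_mutex_unlock(&freelist_lock);
		(void) pthread_rwlock_unlock(&bucket_lock);
	}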
128 128 static rhashq_t *rtable;
129 129
130 130 static kmutex_t rpfreelist_lock;
131 131 static rnode_t *rpfreelist = NULL;
132 132 static long rnew = 0;
133 133 long nrnode = 0;
134 134
135 135 static int rtablesize;
136 136 static int rtablemask;
137 137
138 138 static int hashlen = 4;
139 139
140 140 static struct kmem_cache *rnode_cache;
141 141
142 142 /*
143 143 * Mutex to protect the following variables:
144 144 * nfs_major
145 145 * nfs_minor
146 146 */
147 147 kmutex_t nfs_minor_lock;
148 148 int nfs_major;
149 149 int nfs_minor;
150 150
151 151 /* Do we allow preepoch (negative) time values otw? */
152 152 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
153 153
154 154 /*
155 155 * Access cache
156 156 */
157 157 static acache_hash_t *acache;
158 158 static long nacache; /* used strictly to size the number of hash queues */
159 159
160 160 static int acachesize;
161 161 static int acachemask;
162 162 static struct kmem_cache *acache_cache;
163 163
164 164 /*
165 165 * Client side utilities
166 166 */
167 167
168 168 /*
169 169 * client side statistics
170 170 */
171 171 static const struct clstat clstat_tmpl = {
172 172 { "calls", KSTAT_DATA_UINT64 },
173 173 { "badcalls", KSTAT_DATA_UINT64 },
174 174 { "clgets", KSTAT_DATA_UINT64 },
175 175 { "cltoomany", KSTAT_DATA_UINT64 },
176 176 #ifdef DEBUG
177 177 { "clalloc", KSTAT_DATA_UINT64 },
178 178 { "noresponse", KSTAT_DATA_UINT64 },
179 179 { "failover", KSTAT_DATA_UINT64 },
180 180 { "remap", KSTAT_DATA_UINT64 },
181 181 #endif
182 182 };
183 183
184 184 /*
185 185	 * The following are statistics that describe the behavior of the system
186 186	 * as a whole and don't correspond to any one particular zone.
187 187 */
188 188 #ifdef DEBUG
189 189 static struct clstat_debug {
190 190 kstat_named_t nrnode; /* number of allocated rnodes */
191 191 kstat_named_t access; /* size of access cache */
192 192 kstat_named_t dirent; /* size of readdir cache */
193 193 kstat_named_t dirents; /* size of readdir buf cache */
194 194 kstat_named_t reclaim; /* number of reclaims */
195 195 kstat_named_t clreclaim; /* number of cl reclaims */
196 196 kstat_named_t f_reclaim; /* number of free reclaims */
197 197 kstat_named_t a_reclaim; /* number of active reclaims */
198 198 kstat_named_t r_reclaim; /* number of rnode reclaims */
199 199 kstat_named_t rpath; /* bytes used to store rpaths */
200 200 } clstat_debug = {
201 201 { "nrnode", KSTAT_DATA_UINT64 },
202 202 { "access", KSTAT_DATA_UINT64 },
203 203 { "dirent", KSTAT_DATA_UINT64 },
204 204 { "dirents", KSTAT_DATA_UINT64 },
205 205 { "reclaim", KSTAT_DATA_UINT64 },
206 206 { "clreclaim", KSTAT_DATA_UINT64 },
207 207 { "f_reclaim", KSTAT_DATA_UINT64 },
208 208 { "a_reclaim", KSTAT_DATA_UINT64 },
209 209 { "r_reclaim", KSTAT_DATA_UINT64 },
210 210 { "r_path", KSTAT_DATA_UINT64 },
211 211 };
212 212 #endif /* DEBUG */
213 213
214 214 /*
215 215 * We keep a global list of per-zone client data, so we can clean up all zones
216 216 * if we get low on memory.
217 217 */
218 218 static list_t nfs_clnt_list;
219 219 static kmutex_t nfs_clnt_list_lock;
220 220 static zone_key_t nfsclnt_zone_key;
221 221
222 222 static struct kmem_cache *chtab_cache;
223 223
224 224 /*
225 225 * Some servers do not properly update the attributes of the
226 226 * directory when changes are made. To allow interoperability
227 227 * with these broken servers, the nfs_disable_rddir_cache
228 228 * parameter must be set in /etc/system
229 229 */
230 230 int nfs_disable_rddir_cache = 0;
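As an illustration (not part of this change), disabling the readdir cache against such a broken server would mean adding a line like the following to /etc/system and rebooting, assuming the usual module:variable tuning syntax:

	set nfs:nfs_disable_rddir_cache = 1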
231 231
232 232 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
233 233 struct chtab **);
234 234 void clfree(CLIENT *, struct chtab *);
235 235 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
236 236 struct chtab **, struct nfs_clnt *);
237 237 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
238 238 struct chtab **, struct nfs_clnt *);
239 239 static void clreclaim(void *);
240 240 static int nfs_feedback(int, int, mntinfo_t *);
241 241 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
242 242 caddr_t, cred_t *, int *, enum clnt_stat *, int,
243 243 failinfo_t *);
244 244 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
245 245 caddr_t, cred_t *, int *, int, failinfo_t *);
246 246 static void rinactive(rnode_t *, cred_t *);
247 247 static int rtablehash(nfs_fhandle *);
248 248 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
249 249 struct vnodeops *,
250 250 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
251 251 cred_t *),
252 252 int (*)(const void *, const void *), int *, cred_t *,
253 253 char *, char *);
254 254 static void rp_rmfree(rnode_t *);
255 255 static void rp_addhash(rnode_t *);
256 256 static void rp_rmhash_locked(rnode_t *);
257 257 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
258 258 static void destroy_rnode(rnode_t *);
259 259 static void rddir_cache_free(rddir_cache *);
260 260 static int nfs_free_data_reclaim(rnode_t *);
261 261 static int nfs_active_data_reclaim(rnode_t *);
262 262 static int nfs_free_reclaim(void);
263 263 static int nfs_active_reclaim(void);
264 264 static int nfs_rnode_reclaim(void);
265 265 static void nfs_reclaim(void *);
266 266 static int failover_safe(failinfo_t *);
267 267 static void failover_newserver(mntinfo_t *mi);
268 268 static void failover_thread(mntinfo_t *mi);
269 269 static int failover_wait(mntinfo_t *);
270 270 static int failover_remap(failinfo_t *);
271 271 static int failover_lookup(char *, vnode_t *,
272 272 int (*)(vnode_t *, char *, vnode_t **,
273 273 struct pathname *, int, vnode_t *, cred_t *, int),
274 274 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
275 275 vnode_t **);
276 276 static void nfs_free_r_path(rnode_t *);
277 277 static void nfs_set_vroot(vnode_t *);
278 278 static char *nfs_getsrvnames(mntinfo_t *, size_t *);
279 279
280 280 /*
281 281 * from rpcsec module (common/rpcsec)
282 282 */
283 283 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
284 284 extern void sec_clnt_freeh(AUTH *);
285 285 extern void sec_clnt_freeinfo(struct sec_data *);
286 286
287 287 /*
288 288 * used in mount policy
289 289 */
290 290 extern ts_label_t *getflabel_cipso(vfs_t *);
291 291
292 292 /*
293 293 * EIO or EINTR are not recoverable errors.
294 294 */
295 295 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
296 296
297 297 #ifdef DEBUG
298 298 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
299 299 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
300 300 #else
301 301 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
302 302 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
303 303 #endif
304 304 /*
305 305 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
306 306 */
307 307 static int
308 308 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
309 309 struct chtab **chp, struct nfs_clnt *nfscl)
310 310 {
311 311 struct chhead *ch, *newch;
312 312 struct chhead **plistp;
313 313 struct chtab *cp;
314 314 int error;
315 315 k_sigset_t smask;
316 316
317 317 if (newcl == NULL || chp == NULL || ci == NULL)
318 318 return (EINVAL);
319 319
320 320 *newcl = NULL;
321 321 *chp = NULL;
322 322
323 323 /*
324 324 * Find an unused handle or create one
325 325 */
326 326 newch = NULL;
327 327 nfscl->nfscl_stat.clgets.value.ui64++;
328 328 top:
329 329 /*
330 330 * Find the correct entry in the cache to check for free
331 331 * client handles. The search is based on the RPC program
332 332 * number, program version number, dev_t for the transport
333 333 * device, and the protocol family.
334 334 */
335 335 mutex_enter(&nfscl->nfscl_chtable_lock);
336 336 plistp = &nfscl->nfscl_chtable;
337 337 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
338 338 if (ch->ch_prog == ci->cl_prog &&
339 339 ch->ch_vers == ci->cl_vers &&
340 340 ch->ch_dev == svp->sv_knconf->knc_rdev &&
341 341 (strcmp(ch->ch_protofmly,
342 342 svp->sv_knconf->knc_protofmly) == 0))
343 343 break;
344 344 plistp = &ch->ch_next;
345 345 }
346 346
347 347 /*
348 348 * If we didn't find a cache entry for this quadruple, then
349 349 * create one. If we don't have one already preallocated,
350 350 * then drop the cache lock, create one, and then start over.
351 351 * If we did have a preallocated entry, then just add it to
352 352 * the front of the list.
353 353 */
354 354 if (ch == NULL) {
355 355 if (newch == NULL) {
356 356 mutex_exit(&nfscl->nfscl_chtable_lock);
357 357 newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
358 358 newch->ch_timesused = 0;
359 359 newch->ch_prog = ci->cl_prog;
360 360 newch->ch_vers = ci->cl_vers;
361 361 newch->ch_dev = svp->sv_knconf->knc_rdev;
362 362 newch->ch_protofmly = kmem_alloc(
363 363 strlen(svp->sv_knconf->knc_protofmly) + 1,
364 364 KM_SLEEP);
365 365 (void) strcpy(newch->ch_protofmly,
366 366 svp->sv_knconf->knc_protofmly);
367 367 newch->ch_list = NULL;
368 368 goto top;
369 369 }
370 370 ch = newch;
371 371 newch = NULL;
372 372 ch->ch_next = nfscl->nfscl_chtable;
373 373 nfscl->nfscl_chtable = ch;
374 374 /*
375 375 * We found a cache entry, but if it isn't on the front of the
376 376 * list, then move it to the front of the list to try to take
377 377 * advantage of locality of operations.
378 378 */
379 379 } else if (ch != nfscl->nfscl_chtable) {
380 380 *plistp = ch->ch_next;
381 381 ch->ch_next = nfscl->nfscl_chtable;
382 382 nfscl->nfscl_chtable = ch;
383 383 }
384 384
385 385 /*
386 386 * If there was a free client handle cached, then remove it
387 387 * from the list, init it, and use it.
388 388 */
389 389 if (ch->ch_list != NULL) {
390 390 cp = ch->ch_list;
391 391 ch->ch_list = cp->ch_list;
392 392 mutex_exit(&nfscl->nfscl_chtable_lock);
393 393 if (newch != NULL) {
394 394 kmem_free(newch->ch_protofmly,
395 395 strlen(newch->ch_protofmly) + 1);
396 396 kmem_free(newch, sizeof (*newch));
397 397 }
398 398 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
399 399 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
400 400 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
401 401 &cp->ch_client->cl_auth);
402 402 if (error || cp->ch_client->cl_auth == NULL) {
403 403 CLNT_DESTROY(cp->ch_client);
404 404 kmem_cache_free(chtab_cache, cp);
405 405 return ((error != 0) ? error : EINTR);
406 406 }
407 407 ch->ch_timesused++;
408 408 *newcl = cp->ch_client;
409 409 *chp = cp;
410 410 return (0);
411 411 }
412 412
413 413 /*
414 414 * There weren't any free client handles which fit, so allocate
415 415 * a new one and use that.
416 416 */
417 417 #ifdef DEBUG
418 - atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
418 + atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
419 419 #endif
420 420 mutex_exit(&nfscl->nfscl_chtable_lock);
421 421
422 422 nfscl->nfscl_stat.cltoomany.value.ui64++;
423 423 if (newch != NULL) {
424 424 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
425 425 kmem_free(newch, sizeof (*newch));
426 426 }
427 427
428 428 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
429 429 cp->ch_head = ch;
430 430
431 431 sigintr(&smask, (int)ci->cl_flags & MI_INT);
432 432 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
433 433 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
434 434 sigunintr(&smask);
435 435
436 436 if (error != 0) {
437 437 kmem_cache_free(chtab_cache, cp);
438 438 #ifdef DEBUG
439 - atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
439 + atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
440 440 #endif
441 441 /*
442 442 * Warning is unnecessary if error is EINTR.
443 443 */
444 444 if (error != EINTR) {
445 445 nfs_cmn_err(error, CE_WARN,
446 446 "clget: couldn't create handle: %m\n");
447 447 }
448 448 return (error);
449 449 }
450 450 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
451 451 auth_destroy(cp->ch_client->cl_auth);
452 452 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
453 453 &cp->ch_client->cl_auth);
454 454 if (error || cp->ch_client->cl_auth == NULL) {
455 455 CLNT_DESTROY(cp->ch_client);
456 456 kmem_cache_free(chtab_cache, cp);
457 457 #ifdef DEBUG
458 - atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
458 + atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
459 459 #endif
460 460 return ((error != 0) ? error : EINTR);
461 461 }
462 462 ch->ch_timesused++;
463 463 *newcl = cp->ch_client;
464 464 ASSERT(cp->ch_client->cl_nosignal == FALSE);
465 465 *chp = cp;
466 466 return (0);
467 467 }
468 468
469 469 int
470 470 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
471 471 struct chtab **chp)
472 472 {
473 473 struct nfs_clnt *nfscl;
474 474
475 475 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
476 476 ASSERT(nfscl != NULL);
477 477
478 478 return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
479 479 }
480 480
481 481 static int
482 482 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
483 483 struct chtab **chp, struct nfs_clnt *nfscl)
484 484 {
485 485 clinfo_t ci;
486 486 int error;
487 487
488 488 /*
489 489 * Set read buffer size to rsize
490 490 * and add room for RPC headers.
491 491 */
492 492 ci.cl_readsize = mi->mi_tsize;
493 493 if (ci.cl_readsize != 0)
494 494 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
495 495
496 496 /*
497 497	 * If this is a soft mount and the server is down, just try
498 498	 * once; that is, do not retransmit.
499 499 */
500 500 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
501 501 ci.cl_retrans = 0;
502 502 else
503 503 ci.cl_retrans = mi->mi_retrans;
504 504
505 505 ci.cl_prog = NFS_ACL_PROGRAM;
506 506 ci.cl_vers = mi->mi_vers;
507 507 ci.cl_flags = mi->mi_flags;
508 508
509 509 /*
510 510 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
511 511 * security flavor, the client tries to establish a security context
512 512 * by contacting the server. If the connection is timed out or reset,
513 513 * e.g. server reboot, we will try again.
514 514 */
515 515 do {
516 516 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
517 517
518 518 if (error == 0)
519 519 break;
520 520
521 521 /*
522 522 * For forced unmount or zone shutdown, bail out, no retry.
523 523 */
524 524 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
525 525 error = EIO;
526 526 break;
527 527 }
528 528
529 529 /* do not retry for softmount */
530 530 if (!(mi->mi_flags & MI_HARD))
531 531 break;
532 532
533 533 /* let the caller deal with the failover case */
534 534 if (FAILOVER_MOUNT(mi))
535 535 break;
536 536
537 537 } while (error == ETIMEDOUT || error == ECONNRESET);
538 538
539 539 return (error);
540 540 }
541 541
542 542 static int
543 543 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
544 544 struct chtab **chp, struct nfs_clnt *nfscl)
545 545 {
546 546 clinfo_t ci;
547 547 int error;
548 548
549 549 /*
550 550 * Set read buffer size to rsize
551 551 * and add room for RPC headers.
552 552 */
553 553 ci.cl_readsize = mi->mi_tsize;
554 554 if (ci.cl_readsize != 0)
555 555 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
556 556
557 557 /*
558 558	 * If this is a soft mount and the server is down, just try
559 559	 * once; that is, do not retransmit.
560 560 */
561 561 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
562 562 ci.cl_retrans = 0;
563 563 else
564 564 ci.cl_retrans = mi->mi_retrans;
565 565
566 566 ci.cl_prog = mi->mi_prog;
567 567 ci.cl_vers = mi->mi_vers;
568 568 ci.cl_flags = mi->mi_flags;
569 569
570 570 /*
571 571 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
572 572 * security flavor, the client tries to establish a security context
573 573 * by contacting the server. If the connection is timed out or reset,
574 574 * e.g. server reboot, we will try again.
575 575 */
576 576 do {
577 577 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
578 578
579 579 if (error == 0)
580 580 break;
581 581
582 582 /*
583 583 * For forced unmount or zone shutdown, bail out, no retry.
584 584 */
585 585 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
586 586 error = EIO;
587 587 break;
588 588 }
589 589
590 590 /* do not retry for softmount */
591 591 if (!(mi->mi_flags & MI_HARD))
592 592 break;
593 593
594 594 /* let the caller deal with the failover case */
595 595 if (FAILOVER_MOUNT(mi))
596 596 break;
597 597
598 598 } while (error == ETIMEDOUT || error == ECONNRESET);
599 599
600 600 return (error);
601 601 }
602 602
603 603 static void
604 604 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
605 605 {
606 606 if (cl->cl_auth != NULL) {
607 607 sec_clnt_freeh(cl->cl_auth);
608 608 cl->cl_auth = NULL;
609 609 }
610 610
611 611 /*
612 612 * Timestamp this cache entry so that we know when it was last
613 613 * used.
614 614 */
615 615 cp->ch_freed = gethrestime_sec();
616 616
617 617 /*
618 618 * Add the free client handle to the front of the list.
619 619 * This way, the list will be sorted in youngest to oldest
620 620 * order.
621 621 */
622 622 mutex_enter(&nfscl->nfscl_chtable_lock);
623 623 cp->ch_list = cp->ch_head->ch_list;
624 624 cp->ch_head->ch_list = cp;
625 625 mutex_exit(&nfscl->nfscl_chtable_lock);
626 626 }
627 627
628 628 void
629 629 clfree(CLIENT *cl, struct chtab *cp)
630 630 {
631 631 struct nfs_clnt *nfscl;
632 632
633 633 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
634 634 ASSERT(nfscl != NULL);
635 635
636 636 clfree_impl(cl, cp, nfscl);
637 637 }
638 638
639 639 #define CL_HOLDTIME 60 /* time to hold client handles */
640 640
641 641 static void
642 642 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
643 643 {
644 644 struct chhead *ch;
645 645 struct chtab *cp; /* list of objects that can be reclaimed */
646 646 struct chtab *cpe;
647 647 struct chtab *cpl;
648 648 struct chtab **cpp;
649 649 #ifdef DEBUG
650 650 int n = 0;
651 651 #endif
652 652
653 653 /*
654 654 * Need to reclaim some memory, so step through the cache
655 655 * looking through the lists for entries which can be freed.
656 656 */
657 657 cp = NULL;
658 658
659 659 mutex_enter(&nfscl->nfscl_chtable_lock);
660 660
661 661 /*
662 662 * Here we step through each non-NULL quadruple and start to
663 663 * construct the reclaim list pointed to by cp. Note that
664 664 * cp will contain all eligible chtab entries. When this traversal
665 665 * completes, chtab entries from the last quadruple will be at the
666 666 * front of cp and entries from previously inspected quadruples have
667 667 * been appended to the rear of cp.
668 668 */
669 669 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
670 670 if (ch->ch_list == NULL)
671 671 continue;
672 672 /*
673 673	 * Search each list for entries older than
674 674 * cl_holdtime seconds. The lists are maintained
675 675 * in youngest to oldest order so that when the
676 676 * first entry is found which is old enough, then
677 677 * all of the rest of the entries on the list will
678 678 * be old enough as well.
679 679 */
680 680 cpl = ch->ch_list;
681 681 cpp = &ch->ch_list;
682 682 while (cpl != NULL &&
683 683 cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
684 684 cpp = &cpl->ch_list;
685 685 cpl = cpl->ch_list;
686 686 }
687 687 if (cpl != NULL) {
688 688 *cpp = NULL;
689 689 if (cp != NULL) {
690 690 cpe = cpl;
691 691 while (cpe->ch_list != NULL)
692 692 cpe = cpe->ch_list;
693 693 cpe->ch_list = cp;
694 694 }
695 695 cp = cpl;
696 696 }
697 697 }
698 698
699 699 mutex_exit(&nfscl->nfscl_chtable_lock);
700 700
701 701 /*
702 702 * If cp is empty, then there is nothing to reclaim here.
703 703 */
704 704 if (cp == NULL)
705 705 return;
706 706
707 707 /*
708 708 * Step through the list of entries to free, destroying each client
709 709 * handle and kmem_free'ing the memory for each entry.
710 710 */
711 711 while (cp != NULL) {
712 712 #ifdef DEBUG
713 713 n++;
714 714 #endif
715 715 CLNT_DESTROY(cp->ch_client);
716 716 cpl = cp->ch_list;
717 717 kmem_cache_free(chtab_cache, cp);
718 718 cp = cpl;
719 719 }
720 720
721 721 #ifdef DEBUG
722 722 /*
723 723 * Update clalloc so that nfsstat shows the current number
724 724 * of allocated client handles.
725 725 */
726 726 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
727 727 #endif
728 728 }
729 729
730 730 /* ARGSUSED */
731 731 static void
732 732 clreclaim(void *all)
733 733 {
734 734 struct nfs_clnt *nfscl;
735 735
736 736 #ifdef DEBUG
737 737 clstat_debug.clreclaim.value.ui64++;
738 738 #endif
739 739 /*
740 740 * The system is low on memory; go through and try to reclaim some from
741 741 * every zone on the system.
742 742 */
743 743 mutex_enter(&nfs_clnt_list_lock);
744 744 nfscl = list_head(&nfs_clnt_list);
745 745 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
746 746 clreclaim_zone(nfscl, CL_HOLDTIME);
747 747 mutex_exit(&nfs_clnt_list_lock);
748 748 }
749 749
750 750 /*
751 751 * Minimum time-out values indexed by call type
752 752	 * These units are in "eighths" of a second to avoid multiplies
753 753 */
754 754 static unsigned int minimum_timeo[] = {
755 755 6, 7, 10
756 756 };
757 757
758 758 /*
759 759 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
760 760 */
761 761 #define MAXTIMO (20*hz)
762 762 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
763 763 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
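As a worked example of the backoff macros (hypothetical standalone copy, assuming hz is 100 so MAXTIMO is 2000 ticks):

	#define	HZ		100
	#define	MAXTIMO		(20 * HZ)
	#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
	#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))

	int timeo = 150;		/* initial timeout in ticks */
	timeo = backoff(timeo);		/* 300 */
	timeo = backoff(timeo);		/* 600 */
	timeo = backoff(timeo);		/* 1200 */
	timeo = backoff(timeo);		/* 2400 would overshoot; clamped to 2000 */
	timeo = backoff(timeo);		/* already at MAXTIMO, stays 2000 */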
764 764
765 765 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
766 766 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
767 767 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
768 768
769 769 /*
770 770 * Function called when rfscall notices that we have been
771 771 * re-transmitting, or when we get a response without retransmissions.
772 772 * Return 1 if the transfer size was adjusted down - 0 if no change.
773 773 */
774 774 static int
775 775 nfs_feedback(int flag, int which, mntinfo_t *mi)
776 776 {
777 777 int kind;
778 778 int r = 0;
779 779
780 780 mutex_enter(&mi->mi_lock);
781 781 if (flag == FEEDBACK_REXMIT1) {
782 782 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
783 783 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
784 784 goto done;
785 785 if (mi->mi_curread > MIN_NFS_TSIZE) {
786 786 mi->mi_curread /= 2;
787 787 if (mi->mi_curread < MIN_NFS_TSIZE)
788 788 mi->mi_curread = MIN_NFS_TSIZE;
789 789 r = 1;
790 790 }
791 791
792 792 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
793 793 mi->mi_curwrite /= 2;
794 794 if (mi->mi_curwrite < MIN_NFS_TSIZE)
795 795 mi->mi_curwrite = MIN_NFS_TSIZE;
796 796 r = 1;
797 797 }
798 798 } else if (flag == FEEDBACK_OK) {
799 799 kind = mi->mi_timer_type[which];
800 800 if (kind == 0 ||
801 801 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
802 802 goto done;
803 803 if (kind == 1) {
804 804 if (mi->mi_curread >= mi->mi_tsize)
805 805 goto done;
806 806 mi->mi_curread += MIN_NFS_TSIZE;
807 807 if (mi->mi_curread > mi->mi_tsize/2)
808 808 mi->mi_curread = mi->mi_tsize;
809 809 } else if (kind == 2) {
810 810 if (mi->mi_curwrite >= mi->mi_stsize)
811 811 goto done;
812 812 mi->mi_curwrite += MIN_NFS_TSIZE;
813 813 if (mi->mi_curwrite > mi->mi_stsize/2)
814 814 mi->mi_curwrite = mi->mi_stsize;
815 815 }
816 816 }
817 817 done:
818 818 mutex_exit(&mi->mi_lock);
819 819 return (r);
820 820 }
821 821
822 822 #ifdef DEBUG
823 823 static int rfs2call_hits = 0;
824 824 static int rfs2call_misses = 0;
825 825 #endif
826 826
827 827 int
828 828 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
829 829 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
830 830 enum nfsstat *statusp, int flags, failinfo_t *fi)
831 831 {
832 832 int rpcerror;
833 833 enum clnt_stat rpc_status;
834 834
835 835 ASSERT(statusp != NULL);
836 836
837 837 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
838 838 cr, douprintf, &rpc_status, flags, fi);
839 839 if (!rpcerror) {
840 840 /*
841 841 * See crnetadjust() for comments.
842 842 */
843 843 if (*statusp == NFSERR_ACCES &&
844 844 (cr = crnetadjust(cr)) != NULL) {
845 845 #ifdef DEBUG
846 846 rfs2call_hits++;
847 847 #endif
848 848 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
849 849 resp, cr, douprintf, NULL, flags, fi);
850 850 crfree(cr);
851 851 #ifdef DEBUG
852 852 if (*statusp == NFSERR_ACCES)
853 853 rfs2call_misses++;
854 854 #endif
855 855 }
856 856 } else if (rpc_status == RPC_PROCUNAVAIL) {
857 857 *statusp = NFSERR_OPNOTSUPP;
858 858 rpcerror = 0;
859 859 }
860 860
861 861 return (rpcerror);
862 862 }
863 863
864 864	#define	NFS3_JUKEBOX_DELAY	(10 * hz)
865 865
866 866 static clock_t nfs3_jukebox_delay = 0;
867 867
868 868 #ifdef DEBUG
869 869 static int rfs3call_hits = 0;
870 870 static int rfs3call_misses = 0;
871 871 #endif
872 872
873 873 int
874 874 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
875 875 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
876 876 nfsstat3 *statusp, int flags, failinfo_t *fi)
877 877 {
878 878 int rpcerror;
879 879 int user_informed;
880 880
881 881 user_informed = 0;
882 882 do {
883 883 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
884 884 cr, douprintf, NULL, flags, fi);
885 885 if (!rpcerror) {
886 886 cred_t *crr;
887 887 if (*statusp == NFS3ERR_JUKEBOX) {
888 888 if (ttoproc(curthread) == &p0) {
889 889 rpcerror = EAGAIN;
890 890 break;
891 891 }
892 892 if (!user_informed) {
893 893 user_informed = 1;
894 894 uprintf(
895 895 "file temporarily unavailable on the server, retrying...\n");
896 896 }
897 897 delay(nfs3_jukebox_delay);
898 898 }
899 899 /*
900 900 * See crnetadjust() for comments.
901 901 */
902 902 else if (*statusp == NFS3ERR_ACCES &&
903 903 (crr = crnetadjust(cr)) != NULL) {
904 904 #ifdef DEBUG
905 905 rfs3call_hits++;
906 906 #endif
907 907 rpcerror = rfscall(mi, which, xdrargs, argsp,
908 908 xdrres, resp, crr, douprintf,
909 909 NULL, flags, fi);
910 910
911 911 crfree(crr);
912 912 #ifdef DEBUG
913 913 if (*statusp == NFS3ERR_ACCES)
914 914 rfs3call_misses++;
915 915 #endif
916 916 }
917 917 }
918 918 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
919 919
920 920 return (rpcerror);
921 921 }
922 922
923 923 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
924 924 #define INC_READERS(mi) { \
925 925 mi->mi_readers++; \
926 926 }
927 927 #define DEC_READERS(mi) { \
928 928 mi->mi_readers--; \
929 929 if (mi->mi_readers == 0) \
930 930 cv_broadcast(&mi->mi_failover_cv); \
931 931 }
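These macros are the reader side of the "poor man's interruptible rw_enter" described in the failover comments below: rfscall() and aclcall() bump mi_readers while they depend on the current server binding, and DEC_READERS broadcasts mi_failover_cv once the count drains to zero. A hypothetical sketch of how the failover side might drain readers, using the fields above:

	/* hypothetical writer-side drain, under mi_lock */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers > 0)
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
	/* ... now safe to change mi->mi_curr_serv ... */
	mutex_exit(&mi->mi_lock);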
932 932
933 933 static int
934 934 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
935 935 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
936 936 enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
937 937 {
938 938 CLIENT *client;
939 939 struct chtab *ch;
940 940 cred_t *cr = icr;
941 941 enum clnt_stat status;
942 942 struct rpc_err rpcerr, rpcerr_tmp;
943 943 struct timeval wait;
944 944 int timeo; /* in units of hz */
945 945 int my_rsize, my_wsize;
946 946 bool_t tryagain;
947 947 bool_t cred_cloned = FALSE;
948 948 k_sigset_t smask;
949 949 servinfo_t *svp;
950 950 struct nfs_clnt *nfscl;
951 951 zoneid_t zoneid = getzoneid();
952 952 char *msg;
953 953 #ifdef DEBUG
954 954 char *bufp;
955 955 #endif
956 956
957 957
958 958 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
959 959 "rfscall_start:which %d mi %p", which, mi);
960 960
961 961 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
962 962 ASSERT(nfscl != NULL);
963 963
964 964 nfscl->nfscl_stat.calls.value.ui64++;
965 965 mi->mi_reqs[which].value.ui64++;
966 966
967 967 rpcerr.re_status = RPC_SUCCESS;
968 968
969 969 /*
970 970 * In case of forced unmount or zone shutdown, return EIO.
971 971 */
972 972
973 973 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
974 974 rpcerr.re_status = RPC_FAILED;
975 975 rpcerr.re_errno = EIO;
976 976 return (rpcerr.re_errno);
977 977 }
978 978
979 979 /*
980 980 * Remember the transfer sizes in case
981 981 * nfs_feedback changes them underneath us.
982 982 */
983 983 my_rsize = mi->mi_curread;
984 984 my_wsize = mi->mi_curwrite;
985 985
986 986 /*
987 987 * NFS client failover support
988 988 *
989 989 * If this rnode is not in sync with the current server (VALID_FH),
990 990 * we'd like to do a remap to get in sync. We can be interrupted
991 991 * in failover_remap(), and if so we'll bail. Otherwise, we'll
992 992 * use the best info we have to try the RPC. Part of that is
993 993 * unconditionally updating the filehandle copy kept for V3.
994 994 *
995 995	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
996 996 * rw_enter(); we're trying to keep the current server from being
997 997 * changed on us until we're done with the remapping and have a
998 998	 * matching client handle. We don't want to send a filehandle
999 999 * to the wrong host.
1000 1000 */
1001 1001 failoverretry:
1002 1002 if (FAILOVER_MOUNT(mi)) {
1003 1003 mutex_enter(&mi->mi_lock);
1004 1004 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1005 1005 if (failover_wait(mi)) {
1006 1006 mutex_exit(&mi->mi_lock);
1007 1007 return (EINTR);
1008 1008 }
1009 1009 }
1010 1010 INC_READERS(mi);
1011 1011 mutex_exit(&mi->mi_lock);
1012 1012 if (fi) {
1013 1013 if (!VALID_FH(fi) &&
1014 1014 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1015 1015 int remaperr;
1016 1016
1017 1017 svp = mi->mi_curr_serv;
1018 1018 remaperr = failover_remap(fi);
1019 1019 if (remaperr != 0) {
1020 1020 #ifdef DEBUG
1021 1021 if (remaperr != EINTR)
1022 1022 nfs_cmn_err(remaperr, CE_WARN,
1023 1023 "rfscall couldn't failover: %m");
1024 1024 #endif
1025 1025 mutex_enter(&mi->mi_lock);
1026 1026 DEC_READERS(mi);
1027 1027 mutex_exit(&mi->mi_lock);
1028 1028 /*
1029 1029 * If failover_remap returns ETIMEDOUT
1030 1030 * and the filesystem is hard mounted
1031 1031 * we have to retry the call with a new
1032 1032 * server.
1033 1033 */
1034 1034 if ((mi->mi_flags & MI_HARD) &&
1035 1035 IS_RECOVERABLE_ERROR(remaperr)) {
1036 1036 if (svp == mi->mi_curr_serv)
1037 1037 failover_newserver(mi);
1038 1038 rpcerr.re_status = RPC_SUCCESS;
1039 1039 goto failoverretry;
1040 1040 }
1041 1041 rpcerr.re_errno = remaperr;
1042 1042 return (remaperr);
1043 1043 }
1044 1044 }
1045 1045 if (fi->fhp && fi->copyproc)
1046 1046 (*fi->copyproc)(fi->fhp, fi->vp);
1047 1047 }
1048 1048 }
1049 1049
1050 1050 /* For TSOL, use a new cred which has net_mac_aware flag */
1051 1051 if (!cred_cloned && is_system_labeled()) {
1052 1052 cred_cloned = TRUE;
1053 1053 cr = crdup(icr);
1054 1054 (void) setpflags(NET_MAC_AWARE, 1, cr);
1055 1055 }
1056 1056
1057 1057 /*
1058 1058 * clget() calls clnt_tli_kinit() which clears the xid, so we
1059 1059 * are guaranteed to reprocess the retry as a new request.
1060 1060 */
1061 1061 svp = mi->mi_curr_serv;
1062 1062 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1063 1063
1064 1064 if (FAILOVER_MOUNT(mi)) {
1065 1065 mutex_enter(&mi->mi_lock);
1066 1066 DEC_READERS(mi);
1067 1067 mutex_exit(&mi->mi_lock);
1068 1068
1069 1069 if ((rpcerr.re_errno == ETIMEDOUT ||
1070 1070 rpcerr.re_errno == ECONNRESET) &&
1071 1071 failover_safe(fi)) {
1072 1072 if (svp == mi->mi_curr_serv)
1073 1073 failover_newserver(mi);
1074 1074 goto failoverretry;
1075 1075 }
1076 1076 }
1077 1077 if (rpcerr.re_errno != 0)
1078 1078 return (rpcerr.re_errno);
1079 1079
1080 1080 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1081 1081 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1082 1082 timeo = (mi->mi_timeo * hz) / 10;
1083 1083 } else {
1084 1084 mutex_enter(&mi->mi_lock);
1085 1085 timeo = CLNT_SETTIMERS(client,
1086 1086 &(mi->mi_timers[mi->mi_timer_type[which]]),
1087 1087 &(mi->mi_timers[NFS_CALLTYPES]),
1088 1088 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1089 1089 (void (*)())NULL, (caddr_t)mi, 0);
1090 1090 mutex_exit(&mi->mi_lock);
1091 1091 }
1092 1092
1093 1093 /*
1094 1094 * If hard mounted fs, retry call forever unless hard error occurs.
1095 1095 */
1096 1096 do {
1097 1097 tryagain = FALSE;
1098 1098
1099 1099 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1100 1100 status = RPC_FAILED;
1101 1101 rpcerr.re_status = RPC_FAILED;
1102 1102 rpcerr.re_errno = EIO;
1103 1103 break;
1104 1104 }
1105 1105
1106 1106 TICK_TO_TIMEVAL(timeo, &wait);
1107 1107
1108 1108 /*
1109 1109 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1110 1110 * and SIGTERM. (Preserving the existing masks).
1111 1111 * Mask out SIGINT if mount option nointr is specified.
1112 1112 */
1113 1113 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1114 1114 if (!(mi->mi_flags & MI_INT))
1115 1115 client->cl_nosignal = TRUE;
1116 1116
1117 1117 /*
1118 1118 * If there is a current signal, then don't bother
1119 1119 * even trying to send out the request because we
1120 1120 * won't be able to block waiting for the response.
1121 1121 * Simply assume RPC_INTR and get on with it.
1122 1122 */
1123 1123 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1124 1124 status = RPC_INTR;
1125 1125 else {
1126 1126 status = CLNT_CALL(client, which, xdrargs, argsp,
1127 1127 xdrres, resp, wait);
1128 1128 }
1129 1129
1130 1130 if (!(mi->mi_flags & MI_INT))
1131 1131 client->cl_nosignal = FALSE;
1132 1132 /*
1133 1133 * restore original signal mask
1134 1134 */
1135 1135 sigunintr(&smask);
1136 1136
1137 1137 switch (status) {
1138 1138 case RPC_SUCCESS:
1139 1139 if ((mi->mi_flags & MI_DYNAMIC) &&
1140 1140 mi->mi_timer_type[which] != 0 &&
1141 1141 (mi->mi_curread != my_rsize ||
1142 1142 mi->mi_curwrite != my_wsize))
1143 1143 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1144 1144 break;
1145 1145
1146 1146 case RPC_INTR:
1147 1147 /*
1148 1148 * There is no way to recover from this error,
1149 1149 * even if mount option nointr is specified.
1150 1150 * SIGKILL, for example, cannot be blocked.
1151 1151 */
1152 1152 rpcerr.re_status = RPC_INTR;
1153 1153 rpcerr.re_errno = EINTR;
1154 1154 break;
1155 1155
1156 1156 case RPC_UDERROR:
1157 1157 /*
1158 1158 * If the NFS server is local (vold) and
1159 1159 * it goes away then we get RPC_UDERROR.
1160 1160 * This is a retryable error, so we would
1161 1161 * loop, so check to see if the specific
1162 1162 * error was ECONNRESET, indicating that
1163 1163 * target did not exist at all. If so,
1164 1164 * return with RPC_PROGUNAVAIL and
1165 1165 * ECONNRESET to indicate why.
1166 1166 */
1167 1167 CLNT_GETERR(client, &rpcerr);
1168 1168 if (rpcerr.re_errno == ECONNRESET) {
1169 1169 rpcerr.re_status = RPC_PROGUNAVAIL;
1170 1170 rpcerr.re_errno = ECONNRESET;
1171 1171 break;
1172 1172 }
1173 1173 /*FALLTHROUGH*/
1174 1174
1175 1175 default: /* probably RPC_TIMEDOUT */
1176 1176 if (IS_UNRECOVERABLE_RPC(status))
1177 1177 break;
1178 1178
1179 1179 /*
1180 1180 * increment server not responding count
1181 1181 */
1182 1182 mutex_enter(&mi->mi_lock);
1183 1183 mi->mi_noresponse++;
1184 1184 mutex_exit(&mi->mi_lock);
1185 1185 #ifdef DEBUG
1186 1186 nfscl->nfscl_stat.noresponse.value.ui64++;
1187 1187 #endif
1188 1188
1189 1189 if (!(mi->mi_flags & MI_HARD)) {
1190 1190 if (!(mi->mi_flags & MI_SEMISOFT) ||
1191 1191 (mi->mi_ss_call_type[which] == 0))
1192 1192 break;
1193 1193 }
1194 1194
1195 1195 /*
1196 1196 * The call is in progress (over COTS).
1197 1197 * Try the CLNT_CALL again, but don't
1198 1198 * print a noisy error message.
1199 1199 */
1200 1200 if (status == RPC_INPROGRESS) {
1201 1201 tryagain = TRUE;
1202 1202 break;
1203 1203 }
1204 1204
1205 1205 if (flags & RFSCALL_SOFT)
1206 1206 break;
1207 1207
1208 1208 /*
1209 1209 * On zone shutdown, just move on.
1210 1210 */
1211 1211 if (zone_status_get(curproc->p_zone) >=
1212 1212 ZONE_IS_SHUTTING_DOWN) {
1213 1213 rpcerr.re_status = RPC_FAILED;
1214 1214 rpcerr.re_errno = EIO;
1215 1215 break;
1216 1216 }
1217 1217
1218 1218 /*
1219 1219 * NFS client failover support
1220 1220 *
1221 1221 * If the current server just failed us, we'll
1222 1222 * start the process of finding a new server.
1223 1223 * After that, we can just retry.
1224 1224 */
1225 1225 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1226 1226 if (svp == mi->mi_curr_serv)
1227 1227 failover_newserver(mi);
1228 1228 clfree_impl(client, ch, nfscl);
1229 1229 goto failoverretry;
1230 1230 }
1231 1231
1232 1232 tryagain = TRUE;
1233 1233 timeo = backoff(timeo);
1234 1234
1235 1235 CLNT_GETERR(client, &rpcerr_tmp);
1236 1236 if ((status == RPC_CANTSEND) &&
1237 1237 (rpcerr_tmp.re_errno == ENOBUFS))
1238 1238 msg = SRV_QFULL_MSG;
1239 1239 else
1240 1240 msg = SRV_NOTRESP_MSG;
1241 1241
1242 1242 mutex_enter(&mi->mi_lock);
1243 1243 if (!(mi->mi_flags & MI_PRINTED)) {
1244 1244 mi->mi_flags |= MI_PRINTED;
1245 1245 mutex_exit(&mi->mi_lock);
1246 1246 #ifdef DEBUG
1247 1247 zprintf(zoneid, msg, mi->mi_vers,
1248 1248 svp->sv_hostname);
1249 1249 #else
1250 1250 zprintf(zoneid, msg, svp->sv_hostname);
1251 1251 #endif
1252 1252 } else
1253 1253 mutex_exit(&mi->mi_lock);
1254 1254 if (*douprintf && nfs_has_ctty()) {
1255 1255 *douprintf = 0;
1256 1256 if (!(mi->mi_flags & MI_NOPRINT))
1257 1257 #ifdef DEBUG
1258 1258 uprintf(msg, mi->mi_vers,
1259 1259 svp->sv_hostname);
1260 1260 #else
1261 1261 uprintf(msg, svp->sv_hostname);
1262 1262 #endif
1263 1263 }
1264 1264
1265 1265 /*
1266 1266 * If doing dynamic adjustment of transfer
1267 1267 * size and if it's a read or write call
1268 1268 * and if the transfer size changed while
1269 1269 * retransmitting or if the feedback routine
1270 1270 * changed the transfer size,
1271 1271 * then exit rfscall so that the transfer
1272 1272 * size can be adjusted at the vnops level.
1273 1273 */
1274 1274 if ((mi->mi_flags & MI_DYNAMIC) &&
1275 1275 mi->mi_timer_type[which] != 0 &&
1276 1276 (mi->mi_curread != my_rsize ||
1277 1277 mi->mi_curwrite != my_wsize ||
1278 1278 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1279 1279 /*
1280 1280 * On read or write calls, return
1281 1281 * back to the vnode ops level if
1282 1282 * the transfer size changed.
1283 1283 */
1284 1284 clfree_impl(client, ch, nfscl);
1285 1285 if (cred_cloned)
1286 1286 crfree(cr);
1287 1287 return (ENFS_TRYAGAIN);
1288 1288 }
1289 1289 }
1290 1290 } while (tryagain);
1291 1291
1292 1292 if (status != RPC_SUCCESS) {
1293 1293 /*
1294 1294 * Let soft mounts use the timed out message.
1295 1295 */
1296 1296 if (status == RPC_INPROGRESS)
1297 1297 status = RPC_TIMEDOUT;
1298 1298 nfscl->nfscl_stat.badcalls.value.ui64++;
1299 1299 if (status != RPC_INTR) {
1300 1300 mutex_enter(&mi->mi_lock);
1301 1301 mi->mi_flags |= MI_DOWN;
1302 1302 mutex_exit(&mi->mi_lock);
1303 1303 CLNT_GETERR(client, &rpcerr);
1304 1304 #ifdef DEBUG
1305 1305 bufp = clnt_sperror(client, svp->sv_hostname);
1306 1306 zprintf(zoneid, "NFS%d %s failed for %s\n",
1307 1307 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1308 1308 if (nfs_has_ctty()) {
1309 1309 if (!(mi->mi_flags & MI_NOPRINT)) {
1310 1310 uprintf("NFS%d %s failed for %s\n",
1311 1311 mi->mi_vers, mi->mi_rfsnames[which],
1312 1312 bufp);
1313 1313 }
1314 1314 }
1315 1315 kmem_free(bufp, MAXPATHLEN);
1316 1316 #else
1317 1317 zprintf(zoneid,
1318 1318 "NFS %s failed for server %s: error %d (%s)\n",
1319 1319 mi->mi_rfsnames[which], svp->sv_hostname,
1320 1320 status, clnt_sperrno(status));
1321 1321 if (nfs_has_ctty()) {
1322 1322 if (!(mi->mi_flags & MI_NOPRINT)) {
1323 1323 uprintf(
1324 1324 "NFS %s failed for server %s: error %d (%s)\n",
1325 1325 mi->mi_rfsnames[which],
1326 1326 svp->sv_hostname, status,
1327 1327 clnt_sperrno(status));
1328 1328 }
1329 1329 }
1330 1330 #endif
1331 1331 /*
1332 1332 * when CLNT_CALL() fails with RPC_AUTHERROR,
1333 1333 * re_errno is set appropriately depending on
1334 1334 * the authentication error
1335 1335 */
1336 1336 if (status == RPC_VERSMISMATCH ||
1337 1337 status == RPC_PROGVERSMISMATCH)
1338 1338 rpcerr.re_errno = EIO;
1339 1339 }
1340 1340 } else {
1341 1341 /*
1342 1342 * Test the value of mi_down and mi_printed without
1343 1343 * holding the mi_lock mutex. If they are both zero,
1344 1344 * then it is okay to skip the down and printed
1345 1345 * processing. This saves on a mutex_enter and
1346 1346 * mutex_exit pair for a normal, successful RPC.
1347 1347 * This was just complete overhead.
1348 1348 */
1349 1349 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1350 1350 mutex_enter(&mi->mi_lock);
1351 1351 mi->mi_flags &= ~MI_DOWN;
1352 1352 if (mi->mi_flags & MI_PRINTED) {
1353 1353 mi->mi_flags &= ~MI_PRINTED;
1354 1354 mutex_exit(&mi->mi_lock);
1355 1355 #ifdef DEBUG
1356 1356 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357 1357 zprintf(zoneid, "NFS%d server %s ok\n",
1358 1358 mi->mi_vers, svp->sv_hostname);
1359 1359 #else
1360 1360 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1361 1361 zprintf(zoneid, "NFS server %s ok\n",
1362 1362 svp->sv_hostname);
1363 1363 #endif
1364 1364 } else
1365 1365 mutex_exit(&mi->mi_lock);
1366 1366 }
1367 1367
1368 1368 if (*douprintf == 0) {
1369 1369 if (!(mi->mi_flags & MI_NOPRINT))
1370 1370 #ifdef DEBUG
1371 1371 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372 1372 uprintf("NFS%d server %s ok\n",
1373 1373 mi->mi_vers, svp->sv_hostname);
1374 1374 #else
1375 1375 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1376 1376 uprintf("NFS server %s ok\n", svp->sv_hostname);
1377 1377 #endif
1378 1378 *douprintf = 1;
1379 1379 }
1380 1380 }
1381 1381
1382 1382 clfree_impl(client, ch, nfscl);
1383 1383 if (cred_cloned)
1384 1384 crfree(cr);
1385 1385
1386 1386 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1387 1387
1388 1388 if (rpc_status != NULL)
1389 1389 *rpc_status = rpcerr.re_status;
1390 1390
1391 1391 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1392 1392 rpcerr.re_errno);
1393 1393
1394 1394 return (rpcerr.re_errno);
1395 1395 }
1396 1396
1397 1397 #ifdef DEBUG
1398 1398 static int acl2call_hits = 0;
1399 1399 static int acl2call_misses = 0;
1400 1400 #endif
1401 1401
1402 1402 int
1403 1403 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1404 1404 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1405 1405 enum nfsstat *statusp, int flags, failinfo_t *fi)
1406 1406 {
1407 1407 int rpcerror;
1408 1408
1409 1409 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1410 1410 cr, douprintf, flags, fi);
1411 1411 if (!rpcerror) {
1412 1412 /*
1413 1413 * See comments with crnetadjust().
1414 1414 */
1415 1415 if (*statusp == NFSERR_ACCES &&
1416 1416 (cr = crnetadjust(cr)) != NULL) {
1417 1417 #ifdef DEBUG
1418 1418 acl2call_hits++;
1419 1419 #endif
1420 1420 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1421 1421 resp, cr, douprintf, flags, fi);
1422 1422 crfree(cr);
1423 1423 #ifdef DEBUG
1424 1424 if (*statusp == NFSERR_ACCES)
1425 1425 acl2call_misses++;
1426 1426 #endif
1427 1427 }
1428 1428 }
1429 1429
1430 1430 return (rpcerror);
1431 1431 }
1432 1432
1433 1433 #ifdef DEBUG
1434 1434 static int acl3call_hits = 0;
1435 1435 static int acl3call_misses = 0;
1436 1436 #endif
1437 1437
1438 1438 int
1439 1439 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1440 1440 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1441 1441 nfsstat3 *statusp, int flags, failinfo_t *fi)
1442 1442 {
1443 1443 int rpcerror;
1444 1444 int user_informed;
1445 1445
1446 1446 user_informed = 0;
1447 1447
1448 1448 do {
1449 1449 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1450 1450 cr, douprintf, flags, fi);
1451 1451 if (!rpcerror) {
1452 1452 cred_t *crr;
1453 1453 if (*statusp == NFS3ERR_JUKEBOX) {
1454 1454 if (!user_informed) {
1455 1455 user_informed = 1;
1456 1456 uprintf(
1457 1457 "file temporarily unavailable on the server, retrying...\n");
1458 1458 }
1459 1459 delay(nfs3_jukebox_delay);
1460 1460 }
1461 1461 /*
1462 1462 * See crnetadjust() for comments.
1463 1463 */
1464 1464 else if (*statusp == NFS3ERR_ACCES &&
1465 1465 (crr = crnetadjust(cr)) != NULL) {
1466 1466 #ifdef DEBUG
1467 1467 acl3call_hits++;
1468 1468 #endif
1469 1469 rpcerror = aclcall(mi, which, xdrargs, argsp,
1470 1470 xdrres, resp, crr, douprintf, flags, fi);
1471 1471
1472 1472 crfree(crr);
1473 1473 #ifdef DEBUG
1474 1474 if (*statusp == NFS3ERR_ACCES)
1475 1475 acl3call_misses++;
1476 1476 #endif
1477 1477 }
1478 1478 }
1479 1479 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1480 1480
1481 1481 return (rpcerror);
1482 1482 }
1483 1483
1484 1484 static int
1485 1485 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1486 1486 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1487 1487 int flags, failinfo_t *fi)
1488 1488 {
1489 1489 CLIENT *client;
1490 1490 struct chtab *ch;
1491 1491 cred_t *cr = icr;
1492 1492 bool_t cred_cloned = FALSE;
1493 1493 enum clnt_stat status;
1494 1494 struct rpc_err rpcerr;
1495 1495 struct timeval wait;
1496 1496 int timeo; /* in units of hz */
1497 1497 #if 0 /* notyet */
1498 1498 int my_rsize, my_wsize;
1499 1499 #endif
1500 1500 bool_t tryagain;
1501 1501 k_sigset_t smask;
1502 1502 servinfo_t *svp;
1503 1503 struct nfs_clnt *nfscl;
1504 1504 zoneid_t zoneid = getzoneid();
1505 1505 #ifdef DEBUG
1506 1506 char *bufp;
1507 1507 #endif
1508 1508
1509 1509 #if 0 /* notyet */
1510 1510 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1511 1511 "rfscall_start:which %d mi %p", which, mi);
1512 1512 #endif
1513 1513
1514 1514 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1515 1515 ASSERT(nfscl != NULL);
1516 1516
1517 1517 nfscl->nfscl_stat.calls.value.ui64++;
1518 1518 mi->mi_aclreqs[which].value.ui64++;
1519 1519
1520 1520 rpcerr.re_status = RPC_SUCCESS;
1521 1521
1522 1522 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1523 1523 rpcerr.re_status = RPC_FAILED;
1524 1524 rpcerr.re_errno = EIO;
1525 1525 return (rpcerr.re_errno);
1526 1526 }
1527 1527
1528 1528 #if 0 /* notyet */
1529 1529 /*
1530 1530 * Remember the transfer sizes in case
1531 1531 * nfs_feedback changes them underneath us.
1532 1532 */
1533 1533 my_rsize = mi->mi_curread;
1534 1534 my_wsize = mi->mi_curwrite;
1535 1535 #endif
1536 1536
1537 1537 /*
1538 1538 * NFS client failover support
1539 1539 *
1540 1540 * If this rnode is not in sync with the current server (VALID_FH),
1541 1541 * we'd like to do a remap to get in sync. We can be interrupted
1542 1542 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1543 1543 * use the best info we have to try the RPC. Part of that is
1544 1544 * unconditionally updating the filehandle copy kept for V3.
1545 1545 *
1546 1546	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1547 1547 * rw_enter(); we're trying to keep the current server from being
1548 1548 * changed on us until we're done with the remapping and have a
1549 1549	 * matching client handle. We don't want to send a filehandle
1550 1550 * to the wrong host.
1551 1551 */
1552 1552 failoverretry:
1553 1553 if (FAILOVER_MOUNT(mi)) {
1554 1554 mutex_enter(&mi->mi_lock);
1555 1555 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1556 1556 if (failover_wait(mi)) {
1557 1557 mutex_exit(&mi->mi_lock);
1558 1558 return (EINTR);
1559 1559 }
1560 1560 }
1561 1561 INC_READERS(mi);
1562 1562 mutex_exit(&mi->mi_lock);
1563 1563 if (fi) {
1564 1564 if (!VALID_FH(fi) &&
1565 1565 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1566 1566 int remaperr;
1567 1567
1568 1568 svp = mi->mi_curr_serv;
1569 1569 remaperr = failover_remap(fi);
1570 1570 if (remaperr != 0) {
1571 1571 #ifdef DEBUG
1572 1572 if (remaperr != EINTR)
1573 1573 nfs_cmn_err(remaperr, CE_WARN,
1574 1574 "aclcall couldn't failover: %m");
1575 1575 #endif
1576 1576 mutex_enter(&mi->mi_lock);
1577 1577 DEC_READERS(mi);
1578 1578 mutex_exit(&mi->mi_lock);
1579 1579
1580 1580 /*
1581 1581 * If failover_remap returns ETIMEDOUT
1582 1582 * and the filesystem is hard mounted
1583 1583 * we have to retry the call with a new
1584 1584 * server.
1585 1585 */
1586 1586 if ((mi->mi_flags & MI_HARD) &&
1587 1587 IS_RECOVERABLE_ERROR(remaperr)) {
1588 1588 if (svp == mi->mi_curr_serv)
1589 1589 failover_newserver(mi);
1590 1590 rpcerr.re_status = RPC_SUCCESS;
1591 1591 goto failoverretry;
1592 1592 }
1593 1593 return (remaperr);
1594 1594 }
1595 1595 }
1596 1596 if (fi->fhp && fi->copyproc)
1597 1597 (*fi->copyproc)(fi->fhp, fi->vp);
1598 1598 }
1599 1599 }
1600 1600
1601 1601 /* For TSOL, use a new cred which has net_mac_aware flag */
1602 1602 if (!cred_cloned && is_system_labeled()) {
1603 1603 cred_cloned = TRUE;
1604 1604 cr = crdup(icr);
1605 1605 (void) setpflags(NET_MAC_AWARE, 1, cr);
1606 1606 }
1607 1607
1608 1608 /*
1609 1609 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1610 1610 * are guaranteed to reprocess the retry as a new request.
1611 1611 */
1612 1612 svp = mi->mi_curr_serv;
1613 1613 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1614 1614 if (FAILOVER_MOUNT(mi)) {
1615 1615 mutex_enter(&mi->mi_lock);
1616 1616 DEC_READERS(mi);
1617 1617 mutex_exit(&mi->mi_lock);
1618 1618
1619 1619 if ((rpcerr.re_errno == ETIMEDOUT ||
1620 1620 rpcerr.re_errno == ECONNRESET) &&
1621 1621 failover_safe(fi)) {
1622 1622 if (svp == mi->mi_curr_serv)
1623 1623 failover_newserver(mi);
1624 1624 goto failoverretry;
1625 1625 }
1626 1626 }
1627 1627 if (rpcerr.re_errno != 0) {
1628 1628 if (cred_cloned)
1629 1629 crfree(cr);
1630 1630 return (rpcerr.re_errno);
1631 1631 }
1632 1632
1633 1633 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1634 1634 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1635 1635 timeo = (mi->mi_timeo * hz) / 10;
1636 1636 } else {
1637 1637 mutex_enter(&mi->mi_lock);
1638 1638 timeo = CLNT_SETTIMERS(client,
1639 1639 &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1640 1640 &(mi->mi_timers[NFS_CALLTYPES]),
1641 1641 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1642 1642 (void (*)()) 0, (caddr_t)mi, 0);
1643 1643 mutex_exit(&mi->mi_lock);
1644 1644 }
1645 1645
1646 1646 /*
1647 1647 * If hard mounted fs, retry call forever unless hard error occurs.
1648 1648 */
1649 1649 do {
1650 1650 tryagain = FALSE;
1651 1651
1652 1652 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1653 1653 status = RPC_FAILED;
1654 1654 rpcerr.re_status = RPC_FAILED;
1655 1655 rpcerr.re_errno = EIO;
1656 1656 break;
1657 1657 }
1658 1658
1659 1659 TICK_TO_TIMEVAL(timeo, &wait);
1660 1660
1661 1661 /*
1662 1662 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1663 1663 * and SIGTERM. (Preserving the existing masks).
1664 1664 * Mask out SIGINT if mount option nointr is specified.
1665 1665 */
1666 1666 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1667 1667 if (!(mi->mi_flags & MI_INT))
1668 1668 client->cl_nosignal = TRUE;
1669 1669
1670 1670 /*
1671 1671 * If there is a current signal, then don't bother
1672 1672 * even trying to send out the request because we
1673 1673 * won't be able to block waiting for the response.
1674 1674 * Simply assume RPC_INTR and get on with it.
1675 1675 */
1676 1676 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1677 1677 status = RPC_INTR;
1678 1678 else {
1679 1679 status = CLNT_CALL(client, which, xdrargs, argsp,
1680 1680 xdrres, resp, wait);
1681 1681 }
1682 1682
1683 1683 if (!(mi->mi_flags & MI_INT))
1684 1684 client->cl_nosignal = FALSE;
1685 1685 /*
1686 1686 * restore original signal mask
1687 1687 */
1688 1688 sigunintr(&smask);
1689 1689
1690 1690 switch (status) {
1691 1691 case RPC_SUCCESS:
1692 1692 #if 0 /* notyet */
1693 1693 if ((mi->mi_flags & MI_DYNAMIC) &&
1694 1694 mi->mi_timer_type[which] != 0 &&
1695 1695 (mi->mi_curread != my_rsize ||
1696 1696 mi->mi_curwrite != my_wsize))
1697 1697 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1698 1698 #endif
1699 1699 break;
1700 1700
1701 1701 /*
1702 1702 * Unfortunately, there are servers in the world which
1703 1703 * are not coded correctly. They are not prepared to
1704 1704 * handle RPC requests to the NFS port which are not
1705 1705 * NFS requests. Thus, they may try to process the
1706 1706 * NFS_ACL request as if it were an NFS request. This
1707 1707 * does not work. Generally, an error will be generated
1708 1708 * on the client because it will not be able to decode
1709 1709 * the response from the server. However, it seems
1710 1710 * possible that the server may not be able to decode
1711 1711 * the arguments. Thus, the criteria for deciding
1712 1712 * whether the server supports NFS_ACL or not is whether
1713 1713 * the following RPC errors are returned from CLNT_CALL.
1714 1714 */
1715 1715 case RPC_CANTDECODERES:
1716 1716 case RPC_PROGUNAVAIL:
1717 1717 case RPC_CANTDECODEARGS:
1718 1718 case RPC_PROGVERSMISMATCH:
1719 1719 mutex_enter(&mi->mi_lock);
1720 1720 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1721 1721 mutex_exit(&mi->mi_lock);
1722 1722 break;
1723 1723
1724 1724 /*
1725 1725 * If the server supports NFS_ACL but not the new ops
1726 1726 * for extended attributes, make sure we don't retry.
1727 1727 */
1728 1728 case RPC_PROCUNAVAIL:
1729 1729 mutex_enter(&mi->mi_lock);
1730 1730 mi->mi_flags &= ~MI_EXTATTR;
1731 1731 mutex_exit(&mi->mi_lock);
1732 1732 break;
1733 1733
1734 1734 case RPC_INTR:
1735 1735 /*
1736 1736 * There is no way to recover from this error,
1737 1737 * even if mount option nointr is specified.
1738 1738 * SIGKILL, for example, cannot be blocked.
1739 1739 */
1740 1740 rpcerr.re_status = RPC_INTR;
1741 1741 rpcerr.re_errno = EINTR;
1742 1742 break;
1743 1743
1744 1744 case RPC_UDERROR:
1745 1745 /*
1746 1746 * If the NFS server is local (vold) and
1747 1747 * it goes away then we get RPC_UDERROR.
1748 1748 * This is a retryable error, so we would
1749 1749 * loop; check to see whether the specific
1750 1750 * error was ECONNRESET, indicating that
1751 1751 * the target did not exist at all. If so,
1752 1752 * return with RPC_PROGUNAVAIL and
1753 1753 * ECONNRESET to indicate why.
1754 1754 */
1755 1755 CLNT_GETERR(client, &rpcerr);
1756 1756 if (rpcerr.re_errno == ECONNRESET) {
1757 1757 rpcerr.re_status = RPC_PROGUNAVAIL;
1758 1758 rpcerr.re_errno = ECONNRESET;
1759 1759 break;
1760 1760 }
1761 1761 /*FALLTHROUGH*/
1762 1762
1763 1763 default: /* probably RPC_TIMEDOUT */
1764 1764 if (IS_UNRECOVERABLE_RPC(status))
1765 1765 break;
1766 1766
1767 1767 /*
1768 1768 * increment server not responding count
1769 1769 */
1770 1770 mutex_enter(&mi->mi_lock);
1771 1771 mi->mi_noresponse++;
1772 1772 mutex_exit(&mi->mi_lock);
1773 1773 #ifdef DEBUG
1774 1774 nfscl->nfscl_stat.noresponse.value.ui64++;
1775 1775 #endif
1776 1776
1777 1777 if (!(mi->mi_flags & MI_HARD)) {
1778 1778 if (!(mi->mi_flags & MI_SEMISOFT) ||
1779 1779 (mi->mi_acl_ss_call_type[which] == 0))
1780 1780 break;
1781 1781 }
1782 1782
1783 1783 /*
1784 1784 * The call is in progress (over COTS).
1785 1785 * Try the CLNT_CALL again, but don't
1786 1786 * print a noisy error message.
1787 1787 */
1788 1788 if (status == RPC_INPROGRESS) {
1789 1789 tryagain = TRUE;
1790 1790 break;
1791 1791 }
1792 1792
1793 1793 if (flags & RFSCALL_SOFT)
1794 1794 break;
1795 1795
1796 1796 /*
1797 1797 * On zone shutdown, just move on.
1798 1798 */
1799 1799 if (zone_status_get(curproc->p_zone) >=
1800 1800 ZONE_IS_SHUTTING_DOWN) {
1801 1801 rpcerr.re_status = RPC_FAILED;
1802 1802 rpcerr.re_errno = EIO;
1803 1803 break;
1804 1804 }
1805 1805
1806 1806 /*
1807 1807 * NFS client failover support
1808 1808 *
1809 1809 * If the current server just failed us, we'll
1810 1810 * start the process of finding a new server.
1811 1811 * After that, we can just retry.
1812 1812 */
1813 1813 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1814 1814 if (svp == mi->mi_curr_serv)
1815 1815 failover_newserver(mi);
1816 1816 clfree_impl(client, ch, nfscl);
1817 1817 goto failoverretry;
1818 1818 }
1819 1819
1820 1820 tryagain = TRUE;
1821 1821 timeo = backoff(timeo);
1822 1822 mutex_enter(&mi->mi_lock);
1823 1823 if (!(mi->mi_flags & MI_PRINTED)) {
1824 1824 mi->mi_flags |= MI_PRINTED;
1825 1825 mutex_exit(&mi->mi_lock);
1826 1826 #ifdef DEBUG
1827 1827 zprintf(zoneid,
1828 1828 "NFS_ACL%d server %s not responding still trying\n",
1829 1829 mi->mi_vers, svp->sv_hostname);
1830 1830 #else
1831 1831 zprintf(zoneid,
1832 1832 "NFS server %s not responding still trying\n",
1833 1833 svp->sv_hostname);
1834 1834 #endif
1835 1835 } else
1836 1836 mutex_exit(&mi->mi_lock);
1837 1837 if (*douprintf && nfs_has_ctty()) {
1838 1838 *douprintf = 0;
1839 1839 if (!(mi->mi_flags & MI_NOPRINT))
1840 1840 #ifdef DEBUG
1841 1841 uprintf(
1842 1842 "NFS_ACL%d server %s not responding still trying\n",
1843 1843 mi->mi_vers, svp->sv_hostname);
1844 1844 #else
1845 1845 uprintf(
1846 1846 "NFS server %s not responding still trying\n",
1847 1847 svp->sv_hostname);
1848 1848 #endif
1849 1849 }
1850 1850
1851 1851 #if 0 /* notyet */
1852 1852 /*
1853 1853 * If doing dynamic adjustment of transfer
1854 1854 * size and if it's a read or write call
1855 1855 * and if the transfer size changed while
1856 1856 * retransmitting or if the feedback routine
1857 1857 * changed the transfer size,
1858 1858 * then exit rfscall so that the transfer
1859 1859 * size can be adjusted at the vnops level.
1860 1860 */
1861 1861 if ((mi->mi_flags & MI_DYNAMIC) &&
1862 1862 mi->mi_acl_timer_type[which] != 0 &&
1863 1863 (mi->mi_curread != my_rsize ||
1864 1864 mi->mi_curwrite != my_wsize ||
1865 1865 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1866 1866 /*
1867 1867 * On read or write calls, return
1868 1868 * back to the vnode ops level if
1869 1869 * the transfer size changed.
1870 1870 */
1871 1871 clfree_impl(client, ch, nfscl);
1872 1872 if (cred_cloned)
1873 1873 crfree(cr);
1874 1874 return (ENFS_TRYAGAIN);
1875 1875 }
1876 1876 #endif
1877 1877 }
1878 1878 } while (tryagain);
1879 1879
1880 1880 if (status != RPC_SUCCESS) {
1881 1881 /*
1882 1882 * Let soft mounts use the timed out message.
1883 1883 */
1884 1884 if (status == RPC_INPROGRESS)
1885 1885 status = RPC_TIMEDOUT;
1886 1886 nfscl->nfscl_stat.badcalls.value.ui64++;
1887 1887 if (status == RPC_CANTDECODERES ||
1888 1888 status == RPC_PROGUNAVAIL ||
1889 1889 status == RPC_PROCUNAVAIL ||
1890 1890 status == RPC_CANTDECODEARGS ||
1891 1891 status == RPC_PROGVERSMISMATCH)
1892 1892 CLNT_GETERR(client, &rpcerr);
1893 1893 else if (status != RPC_INTR) {
1894 1894 mutex_enter(&mi->mi_lock);
1895 1895 mi->mi_flags |= MI_DOWN;
1896 1896 mutex_exit(&mi->mi_lock);
1897 1897 CLNT_GETERR(client, &rpcerr);
1898 1898 #ifdef DEBUG
1899 1899 bufp = clnt_sperror(client, svp->sv_hostname);
1900 1900 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1901 1901 mi->mi_vers, mi->mi_aclnames[which], bufp);
1902 1902 if (nfs_has_ctty()) {
1903 1903 if (!(mi->mi_flags & MI_NOPRINT)) {
1904 1904 uprintf("NFS_ACL%d %s failed for %s\n",
1905 1905 mi->mi_vers, mi->mi_aclnames[which],
1906 1906 bufp);
1907 1907 }
1908 1908 }
1909 1909 kmem_free(bufp, MAXPATHLEN);
1910 1910 #else
1911 1911 zprintf(zoneid,
1912 1912 "NFS %s failed for server %s: error %d (%s)\n",
1913 1913 mi->mi_aclnames[which], svp->sv_hostname,
1914 1914 status, clnt_sperrno(status));
1915 1915 if (nfs_has_ctty()) {
1916 1916 if (!(mi->mi_flags & MI_NOPRINT))
1917 1917 uprintf(
1918 1918 "NFS %s failed for server %s: error %d (%s)\n",
1919 1919 mi->mi_aclnames[which],
1920 1920 svp->sv_hostname, status,
1921 1921 clnt_sperrno(status));
1922 1922 }
1923 1923 #endif
1924 1924 /*
1925 1925 * when CLNT_CALL() fails with RPC_AUTHERROR,
1926 1926 * re_errno is set appropriately depending on
1927 1927 * the authentication error
1928 1928 */
1929 1929 if (status == RPC_VERSMISMATCH ||
1930 1930 status == RPC_PROGVERSMISMATCH)
1931 1931 rpcerr.re_errno = EIO;
1932 1932 }
1933 1933 } else {
1934 1934 /*
1935 1935 * Test the value of mi_down and mi_printed without
1936 1936 * holding the mi_lock mutex. If they are both zero,
1937 1937 * then it is okay to skip the down and printed
1938 1938 * processing. This saves on a mutex_enter and
1939 1939 * mutex_exit pair for a normal, successful RPC.
1940 1940 * That locking was pure overhead in the common case.
1941 1941 */
1942 1942 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1943 1943 mutex_enter(&mi->mi_lock);
1944 1944 mi->mi_flags &= ~MI_DOWN;
1945 1945 if (mi->mi_flags & MI_PRINTED) {
1946 1946 mi->mi_flags &= ~MI_PRINTED;
1947 1947 mutex_exit(&mi->mi_lock);
1948 1948 #ifdef DEBUG
1949 1949 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1950 1950 mi->mi_vers, svp->sv_hostname);
1951 1951 #else
1952 1952 zprintf(zoneid, "NFS server %s ok\n",
1953 1953 svp->sv_hostname);
1954 1954 #endif
1955 1955 } else
1956 1956 mutex_exit(&mi->mi_lock);
1957 1957 }
1958 1958
1959 1959 if (*douprintf == 0) {
1960 1960 if (!(mi->mi_flags & MI_NOPRINT))
1961 1961 #ifdef DEBUG
1962 1962 uprintf("NFS_ACL%d server %s ok\n",
1963 1963 mi->mi_vers, svp->sv_hostname);
1964 1964 #else
1965 1965 uprintf("NFS server %s ok\n", svp->sv_hostname);
1966 1966 #endif
1967 1967 *douprintf = 1;
1968 1968 }
1969 1969 }
1970 1970
1971 1971 clfree_impl(client, ch, nfscl);
1972 1972 if (cred_cloned)
1973 1973 crfree(cr);
1974 1974
1975 1975 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1976 1976
1977 1977 #if 0 /* notyet */
1978 1978 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1979 1979 rpcerr.re_errno);
1980 1980 #endif
1981 1981
1982 1982 return (rpcerr.re_errno);
1983 1983 }
1984 1984
1985 1985 int
1986 1986 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1987 1987 {
1988 1988 uint_t mask = vap->va_mask;
1989 1989
1990 1990 if (!(mask & AT_MODE))
1991 1991 sa->sa_mode = (uint32_t)-1;
1992 1992 else
1993 1993 sa->sa_mode = vap->va_mode;
1994 1994 if (!(mask & AT_UID))
1995 1995 sa->sa_uid = (uint32_t)-1;
1996 1996 else
1997 1997 sa->sa_uid = (uint32_t)vap->va_uid;
1998 1998 if (!(mask & AT_GID))
1999 1999 sa->sa_gid = (uint32_t)-1;
2000 2000 else
2001 2001 sa->sa_gid = (uint32_t)vap->va_gid;
2002 2002 if (!(mask & AT_SIZE))
2003 2003 sa->sa_size = (uint32_t)-1;
2004 2004 else
2005 2005 sa->sa_size = (uint32_t)vap->va_size;
2006 2006 if (!(mask & AT_ATIME))
2007 2007 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2008 2008 else {
2009 2009 /* check time validity */
2010 2010 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2011 2011 return (EOVERFLOW);
2012 2012 }
2013 2013 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2014 2014 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2015 2015 }
2016 2016 if (!(mask & AT_MTIME))
2017 2017 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2018 2018 else {
2019 2019 /* check time validity */
2020 2020 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2021 2021 return (EOVERFLOW);
2022 2022 }
2023 2023 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2024 2024 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2025 2025 }
2026 2026 return (0);
2027 2027 }
2028 2028
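A note on the time handling above: NFSv2 wire times carry only 32-bit seconds and microseconds, so vattr_to_sattr() truncates the vattr's nanoseconds with a divide by 1000 (the NFSv3 sattr3 path below keeps full nanoseconds). A minimal standalone sketch of that conversion — hypothetical helper name, not code from this file; a real caller would first range-check the seconds as NFS_TIME_T_OK() does:

#include <stdio.h>
#include <stdint.h>

struct nfs2_time {
	int32_t tv_sec;
	int32_t tv_usec;
};

/*
 * Hypothetical helper: convert seconds + nanoseconds to the NFSv2
 * wire representation, truncating sub-microsecond precision just
 * as vattr_to_sattr() does.
 */
static struct nfs2_time
to_nfs2_time(int64_t sec, long nsec)
{
	struct nfs2_time t;

	t.tv_sec = (int32_t)sec;
	t.tv_usec = (int32_t)(nsec / 1000);
	return (t);
}

int
main(void)
{
	struct nfs2_time t = to_nfs2_time(1300000000LL, 123456789L);

	printf("%d.%06d\n", t.tv_sec, t.tv_usec);	/* 1300000000.123456 */
	return (0);
}
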
2029 2029 int
2030 2030 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2031 2031 {
2032 2032 uint_t mask = vap->va_mask;
2033 2033
2034 2034 if (!(mask & AT_MODE))
2035 2035 sa->mode.set_it = FALSE;
2036 2036 else {
2037 2037 sa->mode.set_it = TRUE;
2038 2038 sa->mode.mode = (mode3)vap->va_mode;
2039 2039 }
2040 2040 if (!(mask & AT_UID))
2041 2041 sa->uid.set_it = FALSE;
2042 2042 else {
2043 2043 sa->uid.set_it = TRUE;
2044 2044 sa->uid.uid = (uid3)vap->va_uid;
2045 2045 }
2046 2046 if (!(mask & AT_GID))
2047 2047 sa->gid.set_it = FALSE;
2048 2048 else {
2049 2049 sa->gid.set_it = TRUE;
2050 2050 sa->gid.gid = (gid3)vap->va_gid;
2051 2051 }
2052 2052 if (!(mask & AT_SIZE))
2053 2053 sa->size.set_it = FALSE;
2054 2054 else {
2055 2055 sa->size.set_it = TRUE;
2056 2056 sa->size.size = (size3)vap->va_size;
2057 2057 }
2058 2058 if (!(mask & AT_ATIME))
2059 2059 sa->atime.set_it = DONT_CHANGE;
2060 2060 else {
2061 2061 /* check time validity */
2062 2062 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2063 2063 return (EOVERFLOW);
2064 2064 }
2065 2065 sa->atime.set_it = SET_TO_CLIENT_TIME;
2066 2066 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2067 2067 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2068 2068 }
2069 2069 if (!(mask & AT_MTIME))
2070 2070 sa->mtime.set_it = DONT_CHANGE;
2071 2071 else {
2072 2072 /* check time validity */
2073 2073 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2074 2074 return (EOVERFLOW);
2075 2075 }
2076 2076 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2077 2077 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2078 2078 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2079 2079 }
2080 2080 return (0);
2081 2081 }
2082 2082
2083 2083 void
2084 2084 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2085 2085 {
2086 2086
2087 2087 da->da_fhandle = VTOFH(dvp);
2088 2088 da->da_name = nm;
2089 2089 da->da_flags = 0;
2090 2090 }
2091 2091
2092 2092 void
2093 2093 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2094 2094 {
2095 2095
2096 2096 da->dirp = VTOFH3(dvp);
2097 2097 da->name = nm;
2098 2098 }
2099 2099
2100 2100 int
2101 2101 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2102 2102 {
2103 2103 int error;
2104 2104 rnode_t *rp;
2105 2105 struct vattr va;
2106 2106
2107 2107 va.va_mask = AT_MODE | AT_GID;
2108 2108 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2109 2109 if (error)
2110 2110 return (error);
2111 2111
2112 2112 /*
2113 2113 * To determine the expected group-id of the created file:
2114 2114 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2115 2115 * GRPID option, and the directory's set-gid bit is clear,
2116 2116 * then use the process's gid.
2117 2117 * 2) Otherwise, set the group-id to the gid of the parent directory.
2118 2118 */
2119 2119 rp = VTOR(dvp);
2120 2120 mutex_enter(&rp->r_statelock);
2121 2121 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2122 2122 *gidp = crgetgid(cr);
2123 2123 else
2124 2124 *gidp = va.va_gid;
2125 2125 mutex_exit(&rp->r_statelock);
2126 2126 return (0);
2127 2127 }
2128 2128
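The group-id rule spelled out in the comment above reduces to a two-way choice. An illustrative userland predicate — hypothetical names, not part of this file, with grpid_mount standing in for the MI_GRPID mount flag:

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

/*
 * Sketch of the rule in setdirgid(): without the BSD-compatible
 * grpid mount option and with the directory's set-gid bit clear,
 * a new file gets the creating process's gid; otherwise it
 * inherits the parent directory's gid.
 */
static gid_t
expected_gid(int grpid_mount, mode_t dir_mode, gid_t dir_gid, gid_t proc_gid)
{
	if (!grpid_mount && !(dir_mode & S_ISGID))
		return (proc_gid);
	return (dir_gid);
}

int
main(void)
{
	printf("%d\n", (int)expected_gid(0, 0755, 10, 100));	/* 100 */
	printf("%d\n", (int)expected_gid(0, 02755, 10, 100));	/* 10 */
	return (0);
}
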
2129 2129 int
2130 2130 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2131 2131 {
2132 2132 int error;
2133 2133 struct vattr va;
2134 2134
2135 2135 va.va_mask = AT_MODE;
2136 2136 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2137 2137 if (error)
2138 2138 return (error);
2139 2139
2140 2140 /*
2141 2141 * Modify the expected mode (om) so that the set-gid bit matches
2142 2142 * that of the parent directory (dvp).
2143 2143 */
2144 2144 if (va.va_mode & VSGID)
2145 2145 *omp |= VSGID;
2146 2146 else
2147 2147 *omp &= ~VSGID;
2148 2148 return (0);
2149 2149 }
2150 2150
2151 2151 void
2152 2152 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2153 2153 {
2154 2154
2155 2155 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2156 2156 if (!(vp->v_flag & VSWAPLIKE)) {
2157 2157 mutex_enter(&vp->v_lock);
2158 2158 vp->v_flag |= VSWAPLIKE;
2159 2159 mutex_exit(&vp->v_lock);
2160 2160 }
2161 2161 } else {
2162 2162 if (vp->v_flag & VSWAPLIKE) {
2163 2163 mutex_enter(&vp->v_lock);
2164 2164 vp->v_flag &= ~VSWAPLIKE;
2165 2165 mutex_exit(&vp->v_lock);
2166 2166 }
2167 2167 }
2168 2168 }
2169 2169
2170 2170 /*
2171 2171 * Free the resources associated with an rnode.
2172 2172 */
2173 2173 static void
2174 2174 rinactive(rnode_t *rp, cred_t *cr)
2175 2175 {
2176 2176 vnode_t *vp;
2177 2177 cred_t *cred;
2178 2178 char *contents;
2179 2179 int size;
2180 2180 vsecattr_t *vsp;
2181 2181 int error;
2182 2182 nfs3_pathconf_info *info;
2183 2183
2184 2184 /*
2185 2185 * Before freeing anything, wait until all asynchronous
2186 2186 * activity is done on this rnode. This will allow all
2187 2187 * asynchronous read ahead and write behind i/o's to
2188 2188 * finish.
2189 2189 */
2190 2190 mutex_enter(&rp->r_statelock);
2191 2191 while (rp->r_count > 0)
2192 2192 cv_wait(&rp->r_cv, &rp->r_statelock);
2193 2193 mutex_exit(&rp->r_statelock);
2194 2194
2195 2195 /*
2196 2196 * Flush and invalidate all pages associated with the vnode.
2197 2197 */
2198 2198 vp = RTOV(rp);
2199 2199 if (vn_has_cached_data(vp)) {
2200 2200 ASSERT(vp->v_type != VCHR);
2201 2201 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2202 2202 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2203 2203 if (error && (error == ENOSPC || error == EDQUOT)) {
2204 2204 mutex_enter(&rp->r_statelock);
2205 2205 if (!rp->r_error)
2206 2206 rp->r_error = error;
2207 2207 mutex_exit(&rp->r_statelock);
2208 2208 }
2209 2209 }
2210 2210 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2211 2211 }
2212 2212
2213 2213 /*
2214 2214 * Free any held credentials and caches which may be associated
2215 2215 * with this rnode.
2216 2216 */
2217 2217 mutex_enter(&rp->r_statelock);
2218 2218 cred = rp->r_cred;
2219 2219 rp->r_cred = NULL;
2220 2220 contents = rp->r_symlink.contents;
2221 2221 size = rp->r_symlink.size;
2222 2222 rp->r_symlink.contents = NULL;
2223 2223 vsp = rp->r_secattr;
2224 2224 rp->r_secattr = NULL;
2225 2225 info = rp->r_pathconf;
2226 2226 rp->r_pathconf = NULL;
2227 2227 mutex_exit(&rp->r_statelock);
2228 2228
2229 2229 /*
2230 2230 * Free the held credential.
2231 2231 */
2232 2232 if (cred != NULL)
2233 2233 crfree(cred);
2234 2234
2235 2235 /*
2236 2236 * Free the access cache entries.
2237 2237 */
2238 2238 (void) nfs_access_purge_rp(rp);
2239 2239
2240 2240 /*
2241 2241 * Free the readdir cache entries.
2242 2242 */
2243 2243 if (HAVE_RDDIR_CACHE(rp))
2244 2244 nfs_purge_rddir_cache(vp);
2245 2245
2246 2246 /*
2247 2247 * Free the symbolic link cache.
2248 2248 */
2249 2249 if (contents != NULL) {
2250 2250
2251 2251 kmem_free((void *)contents, size);
2252 2252 }
2253 2253
2254 2254 /*
2255 2255 * Free any cached ACL.
2256 2256 */
2257 2257 if (vsp != NULL)
2258 2258 nfs_acl_free(vsp);
2259 2259
2260 2260 /*
2261 2261 * Free any cached pathconf information.
2262 2262 */
2263 2263 if (info != NULL)
2264 2264 kmem_free(info, sizeof (*info));
2265 2265 }
2266 2266
2267 2267 /*
2268 2268 * Return a vnode for the given NFS Version 2 file handle.
2269 2269 * If no rnode exists for this fhandle, create one and put it
2270 2270 * into the hash queues. If the rnode for this fhandle
2271 2271 * already exists, return it.
2272 2272 *
2273 2273 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2274 2274 */
2275 2275 vnode_t *
2276 2276 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2277 2277 hrtime_t t, cred_t *cr, char *dnm, char *nm)
2278 2278 {
2279 2279 int newnode;
2280 2280 int index;
2281 2281 vnode_t *vp;
2282 2282 nfs_fhandle nfh;
2283 2283 vattr_t va;
2284 2284
2285 2285 nfh.fh_len = NFS_FHSIZE;
2286 2286 bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2287 2287
2288 2288 index = rtablehash(&nfh);
2289 2289 rw_enter(&rtable[index].r_lock, RW_READER);
2290 2290
2291 2291 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2292 2292 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2293 2293
2294 2294 if (attr != NULL) {
2295 2295 if (!newnode) {
2296 2296 rw_exit(&rtable[index].r_lock);
2297 2297 (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2298 2298 } else {
2299 2299 if (attr->na_type < NFNON || attr->na_type > NFSOC)
2300 2300 vp->v_type = VBAD;
2301 2301 else
2302 2302 vp->v_type = n2v_type(attr);
2303 2303 /*
2304 2304 * A translation here seems to be necessary
2305 2305 * because this function can be called
2306 2306 * with `attr' that has come from the wire,
2307 2307 * and been operated on by vattr_to_nattr().
2308 2308 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2309 2309 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2310 2310 * ->makenfsnode().
2311 2311 */
2312 2312 if ((attr->na_rdev & 0xffff0000) == 0)
2313 2313 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2314 2314 else
2315 2315 vp->v_rdev = expldev(n2v_rdev(attr));
2316 2316 nfs_attrcache(vp, attr, t);
2317 2317 rw_exit(&rtable[index].r_lock);
2318 2318 }
2319 2319 } else {
2320 2320 if (newnode) {
2321 2321 PURGE_ATTRCACHE(vp);
2322 2322 }
2323 2323 rw_exit(&rtable[index].r_lock);
2324 2324 }
2325 2325
2326 2326 return (vp);
2327 2327 }
2328 2328
2329 2329 /*
2330 2330 * Return a vnode for the given NFS Version 3 file handle.
2331 2331 * If no rnode exists for this fhandle, create one and put it
2332 2332 * into the hash queues. If the rnode for this fhandle
2333 2333 * already exists, return it.
2334 2334 *
2335 2335 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2336 2336 */
2337 2337 vnode_t *
2338 2338 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2339 2339 cred_t *cr, char *dnm, char *nm)
2340 2340 {
2341 2341 int newnode;
2342 2342 int index;
2343 2343 vnode_t *vp;
2344 2344
2345 2345 index = rtablehash((nfs_fhandle *)fh);
2346 2346 rw_enter(&rtable[index].r_lock, RW_READER);
2347 2347
2348 2348 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2349 2349 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2350 2350 dnm, nm);
2351 2351
2352 2352 if (vap == NULL) {
2353 2353 if (newnode) {
2354 2354 PURGE_ATTRCACHE(vp);
2355 2355 }
2356 2356 rw_exit(&rtable[index].r_lock);
2357 2357 return (vp);
2358 2358 }
2359 2359
2360 2360 if (!newnode) {
2361 2361 rw_exit(&rtable[index].r_lock);
2362 2362 nfs_attr_cache(vp, vap, t, cr);
2363 2363 } else {
2364 2364 rnode_t *rp = VTOR(vp);
2365 2365
2366 2366 vp->v_type = vap->va_type;
2367 2367 vp->v_rdev = vap->va_rdev;
2368 2368
2369 2369 mutex_enter(&rp->r_statelock);
2370 2370 if (rp->r_mtime <= t)
2371 2371 nfs_attrcache_va(vp, vap);
2372 2372 mutex_exit(&rp->r_statelock);
2373 2373 rw_exit(&rtable[index].r_lock);
2374 2374 }
2375 2375
2376 2376 return (vp);
2377 2377 }
2378 2378
2379 2379 vnode_t *
2380 2380 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2381 2381 cred_t *cr, char *dnm, char *nm)
2382 2382 {
2383 2383 int newnode;
2384 2384 int index;
2385 2385 vnode_t *vp;
2386 2386 vattr_t va;
2387 2387
2388 2388 index = rtablehash((nfs_fhandle *)fh);
2389 2389 rw_enter(&rtable[index].r_lock, RW_READER);
2390 2390
2391 2391 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2392 2392 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2393 2393 dnm, nm);
2394 2394
2395 2395 if (attr == NULL) {
2396 2396 if (newnode) {
2397 2397 PURGE_ATTRCACHE(vp);
2398 2398 }
2399 2399 rw_exit(&rtable[index].r_lock);
2400 2400 return (vp);
2401 2401 }
2402 2402
2403 2403 if (!newnode) {
2404 2404 rw_exit(&rtable[index].r_lock);
2405 2405 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2406 2406 } else {
2407 2407 if (attr->type < NF3REG || attr->type > NF3FIFO)
2408 2408 vp->v_type = VBAD;
2409 2409 else
2410 2410 vp->v_type = nf3_to_vt[attr->type];
2411 2411 vp->v_rdev = makedevice(attr->rdev.specdata1,
2412 2412 attr->rdev.specdata2);
2413 2413 nfs3_attrcache(vp, attr, t);
2414 2414 rw_exit(&rtable[index].r_lock);
2415 2415 }
2416 2416
2417 2417 return (vp);
2418 2418 }
2419 2419
2420 2420 /*
2421 2421 * Read this comment before making changes to rtablehash()!
2422 2422 * This is a hash function in which seemingly obvious and harmless
2423 2423 * changes can cause escalations costing millions of dollars!
2424 2424 * Know what you are doing.
2425 2425 *
2426 2426 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2427 2427 * algorithm is currently detailed here:
2428 2428 *
2429 2429 * http://burtleburtle.net/bob/hash/doobs.html
2430 2430 *
2431 2431 * Of course, the above link may not be valid by the time you are reading
2432 2432 * this, but suffice it to say that the one-at-a-time algorithm works well in
2433 2433 * almost all cases. If you are changing the algorithm be sure to verify that
2434 2434 * the hash algorithm still provides even distribution in all cases and with
2435 2435 * any server returning filehandles in whatever order (sequential or random).
2436 2436 */
2437 2437 static int
2438 2438 rtablehash(nfs_fhandle *fh)
2439 2439 {
2440 2440 ulong_t hash, len, i;
2441 2441 char *key;
2442 2442
2443 2443 key = fh->fh_buf;
2444 2444 len = (ulong_t)fh->fh_len;
2445 2445 for (hash = 0, i = 0; i < len; i++) {
2446 2446 hash += key[i];
2447 2447 hash += (hash << 10);
2448 2448 hash ^= (hash >> 6);
2449 2449 }
2450 2450 hash += (hash << 3);
2451 2451 hash ^= (hash >> 11);
2452 2452 hash += (hash << 15);
2453 2453 return (hash & rtablemask);
2454 2454 }
2455 2455
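For reference, the same one-at-a-time mixing function is easy to exercise outside the kernel. A standalone sketch, where the 255 mask stands in for rtablemask and assumes a 256-bucket table:

#include <stdio.h>

/*
 * Standalone sketch of the Jenkins one-at-a-time hash used by
 * rtablehash() above; `mask' must be (table size - 1) with the
 * table size a power of two.
 */
static unsigned long
oat_hash(const char *key, unsigned long len, unsigned long mask)
{
	unsigned long hash, i;

	for (hash = 0, i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & mask);
}

int
main(void)
{
	printf("%lu\n", oat_hash("example-filehandle", 18, 255));
	return (0);
}
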
2456 2456 static vnode_t *
2457 2457 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2458 2458 struct vnodeops *vops,
2459 2459 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2460 2460 int (*compar)(const void *, const void *),
2461 2461 int *newnode, cred_t *cr, char *dnm, char *nm)
2462 2462 {
2463 2463 rnode_t *rp;
2464 2464 rnode_t *trp;
2465 2465 vnode_t *vp;
2466 2466 mntinfo_t *mi;
2467 2467
2468 2468 ASSERT(RW_READ_HELD(&rhtp->r_lock));
2469 2469
2470 2470 mi = VFTOMI(vfsp);
2471 2471 start:
2472 2472 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2473 2473 vp = RTOV(rp);
2474 2474 nfs_set_vroot(vp);
2475 2475 *newnode = 0;
2476 2476 return (vp);
2477 2477 }
2478 2478 rw_exit(&rhtp->r_lock);
2479 2479
2480 2480 mutex_enter(&rpfreelist_lock);
2481 2481 if (rpfreelist != NULL && rnew >= nrnode) {
2482 2482 rp = rpfreelist;
2483 2483 rp_rmfree(rp);
2484 2484 mutex_exit(&rpfreelist_lock);
2485 2485
2486 2486 vp = RTOV(rp);
2487 2487
2488 2488 if (rp->r_flags & RHASHED) {
2489 2489 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2490 2490 mutex_enter(&vp->v_lock);
2491 2491 if (vp->v_count > 1) {
2492 2492 vp->v_count--;
2493 2493 mutex_exit(&vp->v_lock);
2494 2494 rw_exit(&rp->r_hashq->r_lock);
2495 2495 rw_enter(&rhtp->r_lock, RW_READER);
2496 2496 goto start;
2497 2497 }
2498 2498 mutex_exit(&vp->v_lock);
2499 2499 rp_rmhash_locked(rp);
2500 2500 rw_exit(&rp->r_hashq->r_lock);
2501 2501 }
2502 2502
2503 2503 rinactive(rp, cr);
2504 2504
2505 2505 mutex_enter(&vp->v_lock);
2506 2506 if (vp->v_count > 1) {
2507 2507 vp->v_count--;
2508 2508 mutex_exit(&vp->v_lock);
2509 2509 rw_enter(&rhtp->r_lock, RW_READER);
2510 2510 goto start;
2511 2511 }
2512 2512 mutex_exit(&vp->v_lock);
2513 2513 vn_invalid(vp);
2514 2514 /*
2515 2515 * destroy old locks before bzero'ing and
2516 2516 * recreating the locks below.
2517 2517 */
2518 2518 nfs_rw_destroy(&rp->r_rwlock);
2519 2519 nfs_rw_destroy(&rp->r_lkserlock);
2520 2520 mutex_destroy(&rp->r_statelock);
2521 2521 cv_destroy(&rp->r_cv);
2522 2522 cv_destroy(&rp->r_commit.c_cv);
2523 2523 nfs_free_r_path(rp);
2524 2524 avl_destroy(&rp->r_dir);
2525 2525 /*
2526 2526 * Make sure that if the rnode is recycled, then
2527 2527 * the VFS count is decremented properly before
2528 2528 * reuse.
2529 2529 */
2530 2530 VFS_RELE(vp->v_vfsp);
2531 2531 vn_reinit(vp);
2532 2532 } else {
2533 2533 vnode_t *new_vp;
2534 2534
2535 2535 mutex_exit(&rpfreelist_lock);
2536 2536
2537 2537 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2538 2538 new_vp = vn_alloc(KM_SLEEP);
2539 2539
2540 - atomic_add_long((ulong_t *)&rnew, 1);
2540 + atomic_inc_ulong((ulong_t *)&rnew);
2541 2541 #ifdef DEBUG
2542 2542 clstat_debug.nrnode.value.ui64++;
2543 2543 #endif
2544 2544 vp = new_vp;
2545 2545 }
2546 2546
2547 2547 bzero(rp, sizeof (*rp));
2548 2548 rp->r_vnode = vp;
2549 2549 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2550 2550 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2551 2551 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2552 2552 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2553 2553 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2554 2554 rp->r_fh.fh_len = fh->fh_len;
2555 2555 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2556 2556 rp->r_server = mi->mi_curr_serv;
2557 2557 if (FAILOVER_MOUNT(mi)) {
2558 2558 /*
2559 2559 * If replicated servers, stash pathnames
2560 2560 */
2561 2561 if (dnm != NULL && nm != NULL) {
2562 2562 char *s, *p;
2563 2563 uint_t len;
2564 2564
2565 2565 len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2566 2566 rp->r_path = kmem_alloc(len, KM_SLEEP);
2567 2567 #ifdef DEBUG
2568 2568 clstat_debug.rpath.value.ui64 += len;
2569 2569 #endif
2570 2570 s = rp->r_path;
2571 2571 for (p = dnm; *p; p++)
2572 2572 *s++ = *p;
2573 2573 *s++ = '/';
2574 2574 for (p = nm; *p; p++)
2575 2575 *s++ = *p;
2576 2576 *s = '\0';
2577 2577 } else {
2578 2578 /* special case for root */
2579 2579 rp->r_path = kmem_alloc(2, KM_SLEEP);
2580 2580 #ifdef DEBUG
2581 2581 clstat_debug.rpath.value.ui64 += 2;
2582 2582 #endif
2583 2583 *rp->r_path = '.';
2584 2584 *(rp->r_path + 1) = '\0';
2585 2585 }
2586 2586 }
2587 2587 VFS_HOLD(vfsp);
2588 2588 rp->r_putapage = putapage;
2589 2589 rp->r_hashq = rhtp;
2590 2590 rp->r_flags = RREADDIRPLUS;
2591 2591 avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2592 2592 offsetof(rddir_cache, tree));
2593 2593 vn_setops(vp, vops);
2594 2594 vp->v_data = (caddr_t)rp;
2595 2595 vp->v_vfsp = vfsp;
2596 2596 vp->v_type = VNON;
2597 2597 vp->v_flag |= VMODSORT;
2598 2598 nfs_set_vroot(vp);
2599 2599
2600 2600 /*
2601 2601 * There is a race condition if someone else
2602 2602 * allocates the rnode while no locks are held, so we
2603 2603 * check again and recover if found.
2604 2604 */
2605 2605 rw_enter(&rhtp->r_lock, RW_WRITER);
2606 2606 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2607 2607 vp = RTOV(trp);
2608 2608 nfs_set_vroot(vp);
2609 2609 *newnode = 0;
2610 2610 rw_exit(&rhtp->r_lock);
2611 2611 rp_addfree(rp, cr);
2612 2612 rw_enter(&rhtp->r_lock, RW_READER);
2613 2613 return (vp);
2614 2614 }
2615 2615 rp_addhash(rp);
2616 2616 *newnode = 1;
2617 2617 return (vp);
2618 2618 }
2619 2619
2620 2620 /*
2621 2621 * Callback function to check if the page should be marked as
2622 2622 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2623 2623 */
2624 2624 int
2625 2625 nfs_setmod_check(page_t *pp)
2626 2626 {
2627 2627 if (pp->p_fsdata != C_NOCOMMIT) {
2628 2628 pp->p_fsdata = C_NOCOMMIT;
2629 2629 return (1);
2630 2630 }
2631 2631 return (0);
2632 2632 }
2633 2633
2634 2634 static void
2635 2635 nfs_set_vroot(vnode_t *vp)
2636 2636 {
2637 2637 rnode_t *rp;
2638 2638 nfs_fhandle *rootfh;
2639 2639
2640 2640 rp = VTOR(vp);
2641 2641 rootfh = &rp->r_server->sv_fhandle;
2642 2642 if (rootfh->fh_len == rp->r_fh.fh_len &&
2643 2643 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2644 2644 if (!(vp->v_flag & VROOT)) {
2645 2645 mutex_enter(&vp->v_lock);
2646 2646 vp->v_flag |= VROOT;
2647 2647 mutex_exit(&vp->v_lock);
2648 2648 }
2649 2649 }
2650 2650 }
2651 2651
2652 2652 static void
2653 2653 nfs_free_r_path(rnode_t *rp)
2654 2654 {
2655 2655 char *path;
2656 2656 size_t len;
2657 2657
2658 2658 path = rp->r_path;
2659 2659 if (path) {
2660 2660 rp->r_path = NULL;
2661 2661 len = strlen(path) + 1;
2662 2662 kmem_free(path, len);
2663 2663 #ifdef DEBUG
2664 2664 clstat_debug.rpath.value.ui64 -= len;
2665 2665 #endif
2666 2666 }
2667 2667 }
2668 2668
2669 2669 /*
2670 2670 * Put an rnode on the free list.
2671 2671 *
2672 2672 * Rnodes which were allocated above and beyond the normal limit
2673 2673 * are immediately freed.
2674 2674 */
2675 2675 void
2676 2676 rp_addfree(rnode_t *rp, cred_t *cr)
2677 2677 {
2678 2678 vnode_t *vp;
2679 2679 struct vfs *vfsp;
2680 2680
2681 2681 vp = RTOV(rp);
2682 2682 ASSERT(vp->v_count >= 1);
2683 2683 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2684 2684
2685 2685 /*
2686 2686 * If we have too many rnodes allocated and there are no
2687 2687 * references to this rnode, or if the rnode is no longer
2688 2688 * accessible because it does not reside in the hash queues,
2689 2689 * or if an i/o error occurred while writing to the file,
2690 2690 * then just free it instead of putting it on the rnode
2691 2691 * freelist.
2692 2692 */
2693 2693 vfsp = vp->v_vfsp;
2694 2694 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2695 2695 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2696 2696 if (rp->r_flags & RHASHED) {
2697 2697 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2698 2698 mutex_enter(&vp->v_lock);
2699 2699 if (vp->v_count > 1) {
2700 2700 vp->v_count--;
2701 2701 mutex_exit(&vp->v_lock);
2702 2702 rw_exit(&rp->r_hashq->r_lock);
2703 2703 return;
2704 2704 }
2705 2705 mutex_exit(&vp->v_lock);
2706 2706 rp_rmhash_locked(rp);
2707 2707 rw_exit(&rp->r_hashq->r_lock);
2708 2708 }
2709 2709
2710 2710 rinactive(rp, cr);
2711 2711
2712 2712 /*
2713 2713 * Recheck the vnode reference count. We need to
2714 2714 * make sure that another reference has not been
2715 2715 * acquired while we were not holding v_lock. The
2716 2716 * rnode is not in the rnode hash queues, so the
2717 2717 * only way for a reference to have been acquired
2718 2718 * is for a VOP_PUTPAGE because the rnode was marked
2719 2719 * with RDIRTY or for a modified page. This
2720 2720 * reference may have been acquired before our call
2721 2721 * to rinactive. The i/o may have been completed,
2722 2722 * thus allowing rinactive to complete, but the
2723 2723 * reference to the vnode may not have been released
2724 2724 * yet. In any case, the rnode can not be destroyed
2725 2725 * until the other references to this vnode have been
2726 2726 * released. The other references will take care of
2727 2727 * either destroying the rnode or placing it on the
2728 2728 * rnode freelist. If there are no other references,
2729 2729 * then the rnode may be safely destroyed.
2730 2730 */
2731 2731 mutex_enter(&vp->v_lock);
2732 2732 if (vp->v_count > 1) {
2733 2733 vp->v_count--;
2734 2734 mutex_exit(&vp->v_lock);
2735 2735 return;
2736 2736 }
2737 2737 mutex_exit(&vp->v_lock);
2738 2738
2739 2739 destroy_rnode(rp);
2740 2740 return;
2741 2741 }
2742 2742
2743 2743 /*
2744 2744 * Lock the hash queue and then recheck the reference count
2745 2745 * to ensure that no other threads have acquired a reference
2746 2746 * to indicate that the rnode should not be placed on the
2747 2747 * freelist. If another reference has been acquired, then
2748 2748 * just release this one and let the other thread complete
2749 2749 * the processing of adding this rnode to the freelist.
2750 2750 */
2751 2751 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2752 2752
2753 2753 mutex_enter(&vp->v_lock);
2754 2754 if (vp->v_count > 1) {
2755 2755 vp->v_count--;
2756 2756 mutex_exit(&vp->v_lock);
2757 2757 rw_exit(&rp->r_hashq->r_lock);
2758 2758 return;
2759 2759 }
2760 2760 mutex_exit(&vp->v_lock);
2761 2761
2762 2762 /*
2763 2763 * If there is no cached data or metadata for this file, then
2764 2764 * put the rnode on the front of the freelist so that it will
2765 2765 * be reused before other rnodes which may have cached data or
2766 2766 * metadata associated with them.
2767 2767 */
2768 2768 mutex_enter(&rpfreelist_lock);
2769 2769 if (rpfreelist == NULL) {
2770 2770 rp->r_freef = rp;
2771 2771 rp->r_freeb = rp;
2772 2772 rpfreelist = rp;
2773 2773 } else {
2774 2774 rp->r_freef = rpfreelist;
2775 2775 rp->r_freeb = rpfreelist->r_freeb;
2776 2776 rpfreelist->r_freeb->r_freef = rp;
2777 2777 rpfreelist->r_freeb = rp;
2778 2778 if (!vn_has_cached_data(vp) &&
2779 2779 !HAVE_RDDIR_CACHE(rp) &&
2780 2780 rp->r_symlink.contents == NULL &&
2781 2781 rp->r_secattr == NULL &&
2782 2782 rp->r_pathconf == NULL)
2783 2783 rpfreelist = rp;
2784 2784 }
2785 2785 mutex_exit(&rpfreelist_lock);
2786 2786
2787 2787 rw_exit(&rp->r_hashq->r_lock);
2788 2788 }
2789 2789
2790 2790 /*
2791 2791 * Remove an rnode from the free list.
2792 2792 *
2793 2793 * The caller must be holding rpfreelist_lock and the rnode
2794 2794 * must be on the freelist.
2795 2795 */
2796 2796 static void
2797 2797 rp_rmfree(rnode_t *rp)
2798 2798 {
2799 2799
2800 2800 ASSERT(MUTEX_HELD(&rpfreelist_lock));
2801 2801 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2802 2802
2803 2803 if (rp == rpfreelist) {
2804 2804 rpfreelist = rp->r_freef;
2805 2805 if (rp == rpfreelist)
2806 2806 rpfreelist = NULL;
2807 2807 }
2808 2808
2809 2809 rp->r_freeb->r_freef = rp->r_freef;
2810 2810 rp->r_freef->r_freeb = rp->r_freeb;
2811 2811
2812 2812 rp->r_freef = rp->r_freeb = NULL;
2813 2813 }
2814 2814
2815 2815 /*
2816 2816 * Put an rnode in the hash table.
2817 2817 *
2818 2818 * The caller must be holding the exclusive hash queue lock.
2819 2819 */
2820 2820 static void
2821 2821 rp_addhash(rnode_t *rp)
2822 2822 {
2823 2823
2824 2824 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2825 2825 ASSERT(!(rp->r_flags & RHASHED));
2826 2826
2827 2827 rp->r_hashf = rp->r_hashq->r_hashf;
2828 2828 rp->r_hashq->r_hashf = rp;
2829 2829 rp->r_hashb = (rnode_t *)rp->r_hashq;
2830 2830 rp->r_hashf->r_hashb = rp;
2831 2831
2832 2832 mutex_enter(&rp->r_statelock);
2833 2833 rp->r_flags |= RHASHED;
2834 2834 mutex_exit(&rp->r_statelock);
2835 2835 }
2836 2836
2837 2837 /*
2838 2838 * Remove an rnode from the hash table.
2839 2839 *
2840 2840 * The caller must be holding the hash queue lock.
2841 2841 */
2842 2842 static void
2843 2843 rp_rmhash_locked(rnode_t *rp)
2844 2844 {
2845 2845
2846 2846 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2847 2847 ASSERT(rp->r_flags & RHASHED);
2848 2848
2849 2849 rp->r_hashb->r_hashf = rp->r_hashf;
2850 2850 rp->r_hashf->r_hashb = rp->r_hashb;
2851 2851
2852 2852 mutex_enter(&rp->r_statelock);
2853 2853 rp->r_flags &= ~RHASHED;
2854 2854 mutex_exit(&rp->r_statelock);
2855 2855 }
2856 2856
2857 2857 /*
2858 2858 * Remove an rnode from the hash table.
2859 2859 *
2860 2860 * The caller must not be holding the hash queue lock.
2861 2861 */
2862 2862 void
2863 2863 rp_rmhash(rnode_t *rp)
2864 2864 {
2865 2865
2866 2866 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2867 2867 rp_rmhash_locked(rp);
2868 2868 rw_exit(&rp->r_hashq->r_lock);
2869 2869 }
2870 2870
2871 2871 /*
2872 2872 * Look up an rnode by fhandle.
2873 2873 *
2874 2874 * The caller must be holding the hash queue lock, either shared or exclusive.
2875 2875 */
2876 2876 static rnode_t *
2877 2877 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2878 2878 {
2879 2879 rnode_t *rp;
2880 2880 vnode_t *vp;
2881 2881
2882 2882 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2883 2883
2884 2884 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2885 2885 vp = RTOV(rp);
2886 2886 if (vp->v_vfsp == vfsp &&
2887 2887 rp->r_fh.fh_len == fh->fh_len &&
2888 2888 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2889 2889 /*
2890 2890 * remove rnode from free list, if necessary.
2891 2891 */
2892 2892 if (rp->r_freef != NULL) {
2893 2893 mutex_enter(&rpfreelist_lock);
2894 2894 /*
2895 2895 * If the rnode is on the freelist,
2896 2896 * then remove it and use that reference
2897 2897 * as the new reference. Otherwise,
2898 2898 * need to increment the reference count.
2899 2899 */
2900 2900 if (rp->r_freef != NULL) {
2901 2901 rp_rmfree(rp);
2902 2902 mutex_exit(&rpfreelist_lock);
2903 2903 } else {
2904 2904 mutex_exit(&rpfreelist_lock);
2905 2905 VN_HOLD(vp);
2906 2906 }
2907 2907 } else
2908 2908 VN_HOLD(vp);
2909 2909 return (rp);
2910 2910 }
2911 2911 }
2912 2912 return (NULL);
2913 2913 }
2914 2914
2915 2915 /*
2916 2916 * Return 1 if there is an active vnode belonging to this vfs in the
2917 2917 * rtable cache.
2918 2918 *
2919 2919 * Several of these checks are done without holding the usual
2920 2920 * locks. This is safe because destroy_rtable(), rp_addfree(),
2921 2921 * etc. will redo the necessary checks before actually destroying
2922 2922 * any rnodes.
2923 2923 */
2924 2924 int
2925 2925 check_rtable(struct vfs *vfsp)
2926 2926 {
2927 2927 int index;
2928 2928 rnode_t *rp;
2929 2929 vnode_t *vp;
2930 2930
2931 2931 for (index = 0; index < rtablesize; index++) {
2932 2932 rw_enter(&rtable[index].r_lock, RW_READER);
2933 2933 for (rp = rtable[index].r_hashf;
2934 2934 rp != (rnode_t *)(&rtable[index]);
2935 2935 rp = rp->r_hashf) {
2936 2936 vp = RTOV(rp);
2937 2937 if (vp->v_vfsp == vfsp) {
2938 2938 if (rp->r_freef == NULL ||
2939 2939 (vn_has_cached_data(vp) &&
2940 2940 (rp->r_flags & RDIRTY)) ||
2941 2941 rp->r_count > 0) {
2942 2942 rw_exit(&rtable[index].r_lock);
2943 2943 return (1);
2944 2944 }
2945 2945 }
2946 2946 }
2947 2947 rw_exit(&rtable[index].r_lock);
2948 2948 }
2949 2949 return (0);
2950 2950 }
2951 2951
2952 2952 /*
2953 2953 * Destroy inactive vnodes from the hash queues which belong to this
2954 2954 * vfs. It is essential that we destroy all inactive vnodes during a
2955 2955 * forced unmount as well as during a normal unmount.
2956 2956 */
2957 2957 void
2958 2958 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2959 2959 {
2960 2960 int index;
2961 2961 rnode_t *rp;
2962 2962 rnode_t *rlist;
2963 2963 rnode_t *r_hashf;
2964 2964 vnode_t *vp;
2965 2965
2966 2966 rlist = NULL;
2967 2967
2968 2968 for (index = 0; index < rtablesize; index++) {
2969 2969 rw_enter(&rtable[index].r_lock, RW_WRITER);
2970 2970 for (rp = rtable[index].r_hashf;
2971 2971 rp != (rnode_t *)(&rtable[index]);
2972 2972 rp = r_hashf) {
2973 2973 /* save the hash pointer before destroying */
2974 2974 r_hashf = rp->r_hashf;
2975 2975 vp = RTOV(rp);
2976 2976 if (vp->v_vfsp == vfsp) {
2977 2977 mutex_enter(&rpfreelist_lock);
2978 2978 if (rp->r_freef != NULL) {
2979 2979 rp_rmfree(rp);
2980 2980 mutex_exit(&rpfreelist_lock);
2981 2981 rp_rmhash_locked(rp);
2982 2982 rp->r_hashf = rlist;
2983 2983 rlist = rp;
2984 2984 } else
2985 2985 mutex_exit(&rpfreelist_lock);
2986 2986 }
2987 2987 }
2988 2988 rw_exit(&rtable[index].r_lock);
2989 2989 }
2990 2990
2991 2991 for (rp = rlist; rp != NULL; rp = rlist) {
2992 2992 rlist = rp->r_hashf;
2993 2993 /*
2994 2994 * This call to rp_addfree will end up destroying the
2995 2995 * rnode, but in a safe way with the appropriate set
2996 2996 * of checks done.
2997 2997 */
2998 2998 rp_addfree(rp, cr);
2999 2999 }
3000 3000
3001 3001 }
3002 3002
3003 3003 /*
3004 3004 * This routine destroys all the resources associated with the rnode
3005 3005 * and then the rnode itself.
3006 3006 */
3007 3007 static void
3008 3008 destroy_rnode(rnode_t *rp)
3009 3009 {
3010 3010 vnode_t *vp;
3011 3011 vfs_t *vfsp;
3012 3012
3013 3013 vp = RTOV(rp);
3014 3014 vfsp = vp->v_vfsp;
3015 3015
3016 3016 ASSERT(vp->v_count == 1);
3017 3017 ASSERT(rp->r_count == 0);
3018 3018 ASSERT(rp->r_lmpl == NULL);
3019 3019 ASSERT(rp->r_mapcnt == 0);
3020 3020 ASSERT(!(rp->r_flags & RHASHED));
3021 3021 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3022 - atomic_add_long((ulong_t *)&rnew, -1);
3022 + atomic_dec_ulong((ulong_t *)&rnew);
3023 3023 #ifdef DEBUG
3024 3024 clstat_debug.nrnode.value.ui64--;
3025 3025 #endif
3026 3026 nfs_rw_destroy(&rp->r_rwlock);
3027 3027 nfs_rw_destroy(&rp->r_lkserlock);
3028 3028 mutex_destroy(&rp->r_statelock);
3029 3029 cv_destroy(&rp->r_cv);
3030 3030 cv_destroy(&rp->r_commit.c_cv);
3031 3031 if (rp->r_flags & RDELMAPLIST)
3032 3032 list_destroy(&rp->r_indelmap);
3033 3033 nfs_free_r_path(rp);
3034 3034 avl_destroy(&rp->r_dir);
3035 3035 vn_invalid(vp);
3036 3036 vn_free(vp);
3037 3037 kmem_cache_free(rnode_cache, rp);
3038 3038 VFS_RELE(vfsp);
3039 3039 }
3040 3040
3041 3041 /*
3042 3042 * Flush all vnodes in this (or every) vfs.
3043 3043 * Used by nfs_sync and by nfs_unmount.
3044 3044 */
3045 3045 void
3046 3046 rflush(struct vfs *vfsp, cred_t *cr)
3047 3047 {
3048 3048 int index;
3049 3049 rnode_t *rp;
3050 3050 vnode_t *vp, **vplist;
3051 3051 long num, cnt;
3052 3052
3053 3053 /*
3054 3054 * Check to see whether there is anything to do.
3055 3055 */
3056 3056 num = rnew;
3057 3057 if (num == 0)
3058 3058 return;
3059 3059
3060 3060 /*
3061 3061 * Allocate a slot for all currently active rnodes on the
3062 3062 * supposition that they all may need flushing.
3063 3063 */
3064 3064 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3065 3065 cnt = 0;
3066 3066
3067 3067 /*
3068 3068 * Walk the hash queues looking for rnodes with page
3069 3069 * lists associated with them. Make a list of these
3070 3070 * files.
3071 3071 */
3072 3072 for (index = 0; index < rtablesize; index++) {
3073 3073 rw_enter(&rtable[index].r_lock, RW_READER);
3074 3074 for (rp = rtable[index].r_hashf;
3075 3075 rp != (rnode_t *)(&rtable[index]);
3076 3076 rp = rp->r_hashf) {
3077 3077 vp = RTOV(rp);
3078 3078 /*
3079 3079 * Don't bother sync'ing a vp if it
3080 3080 * is part of a virtual swap device or
3081 3081 * if the VFS is read-only
3082 3082 */
3083 3083 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3084 3084 continue;
3085 3085 /*
3086 3086 * If flushing all mounted file systems or
3087 3087 * the vnode belongs to this vfs, has pages
3088 3088 * and is marked as either dirty or mmap'd,
3089 3089 * hold and add this vnode to the list of
3090 3090 * vnodes to flush.
3091 3091 */
3092 3092 if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3093 3093 vn_has_cached_data(vp) &&
3094 3094 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3095 3095 VN_HOLD(vp);
3096 3096 vplist[cnt++] = vp;
3097 3097 if (cnt == num) {
3098 3098 rw_exit(&rtable[index].r_lock);
3099 3099 goto toomany;
3100 3100 }
3101 3101 }
3102 3102 }
3103 3103 rw_exit(&rtable[index].r_lock);
3104 3104 }
3105 3105 toomany:
3106 3106
3107 3107 /*
3108 3108 * Flush and release all of the files on the list.
3109 3109 */
3110 3110 while (cnt-- > 0) {
3111 3111 vp = vplist[cnt];
3112 3112 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3113 3113 VN_RELE(vp);
3114 3114 }
3115 3115
3116 3116 /*
3117 3117 * Free the space allocated to hold the list.
3118 3118 */
3119 3119 kmem_free(vplist, num * sizeof (*vplist));
3120 3120 }
3121 3121
3122 3122 /*
3123 3123 * This probably needs to be larger than or equal to
3124 3124 * log2(sizeof (struct rnode)) due to the way that rnodes are
3125 3125 * allocated.
3126 3126 */
3127 3127 #define ACACHE_SHIFT_BITS 9
3128 3128
3129 3129 static int
3130 3130 acachehash(rnode_t *rp, cred_t *cr)
3131 3131 {
3132 3132
3133 3133 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3134 3134 acachemask);
3135 3135 }
3136 3136
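The shift in acachehash() above exists to discard the low-order address bits, which are essentially constant across rnodes because of how the allocator aligns them; only the varying high bits plus the uid should pick the bucket. A small illustrative sketch, where the shift of 9 and mask of 511 are stand-ins for ACACHE_SHIFT_BITS and acachemask:

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the bucket computation in acachehash(): drop the
 * allocator-alignment bits of the object address, add the uid,
 * and mask down to a power-of-two table size.
 */
static int
bucket(const void *obj, unsigned int uid, unsigned int mask)
{
	return ((int)((((intptr_t)obj >> 9) + uid) & mask));
}

int
main(void)
{
	int dummy;

	printf("%d\n", bucket(&dummy, 100, 511));
	return (0);
}
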
3137 3137 #ifdef DEBUG
3138 3138 static long nfs_access_cache_hits = 0;
3139 3139 static long nfs_access_cache_misses = 0;
3140 3140 #endif
3141 3141
3142 3142 nfs_access_type_t
3143 3143 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3144 3144 {
3145 3145 vnode_t *vp;
3146 3146 acache_t *ap;
3147 3147 acache_hash_t *hp;
3148 3148 nfs_access_type_t all;
3149 3149
3150 3150 vp = RTOV(rp);
3151 3151 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3152 3152 return (NFS_ACCESS_UNKNOWN);
3153 3153
3154 3154 if (rp->r_acache != NULL) {
3155 3155 hp = &acache[acachehash(rp, cr)];
3156 3156 rw_enter(&hp->lock, RW_READER);
3157 3157 ap = hp->next;
3158 3158 while (ap != (acache_t *)hp) {
3159 3159 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3160 3160 if ((ap->known & acc) == acc) {
3161 3161 #ifdef DEBUG
3162 3162 nfs_access_cache_hits++;
3163 3163 #endif
3164 3164 if ((ap->allowed & acc) == acc)
3165 3165 all = NFS_ACCESS_ALLOWED;
3166 3166 else
3167 3167 all = NFS_ACCESS_DENIED;
3168 3168 } else {
3169 3169 #ifdef DEBUG
3170 3170 nfs_access_cache_misses++;
3171 3171 #endif
3172 3172 all = NFS_ACCESS_UNKNOWN;
3173 3173 }
3174 3174 rw_exit(&hp->lock);
3175 3175 return (all);
3176 3176 }
3177 3177 ap = ap->next;
3178 3178 }
3179 3179 rw_exit(&hp->lock);
3180 3180 }
3181 3181
3182 3182 #ifdef DEBUG
3183 3183 nfs_access_cache_misses++;
3184 3184 #endif
3185 3185 return (NFS_ACCESS_UNKNOWN);
3186 3186 }
3187 3187
3188 3188 void
3189 3189 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3190 3190 {
3191 3191 acache_t *ap;
3192 3192 acache_t *nap;
3193 3193 acache_hash_t *hp;
3194 3194
3195 3195 hp = &acache[acachehash(rp, cr)];
3196 3196
3197 3197 /*
3198 3198 * Allocate now, on the assumption that an allocation will
3199 3199 * most likely be required. This allows the allocation to
3200 3200 * happen without holding the hash bucket lock.
3201 3201 */
3202 3202 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3203 3203 if (nap != NULL) {
3204 3204 nap->known = acc;
3205 3205 nap->allowed = resacc;
3206 3206 nap->rnode = rp;
3207 3207 crhold(cr);
3208 3208 nap->cred = cr;
3209 3209 nap->hashq = hp;
3210 3210 }
3211 3211
3212 3212 rw_enter(&hp->lock, RW_WRITER);
3213 3213
3214 3214 if (rp->r_acache != NULL) {
3215 3215 ap = hp->next;
3216 3216 while (ap != (acache_t *)hp) {
3217 3217 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3218 3218 ap->known |= acc;
3219 3219 ap->allowed &= ~acc;
3220 3220 ap->allowed |= resacc;
3221 3221 rw_exit(&hp->lock);
3222 3222 if (nap != NULL) {
3223 3223 crfree(nap->cred);
3224 3224 kmem_cache_free(acache_cache, nap);
3225 3225 }
3226 3226 return;
3227 3227 }
3228 3228 ap = ap->next;
3229 3229 }
3230 3230 }
3231 3231
3232 3232 if (nap != NULL) {
3233 3233 #ifdef DEBUG
3234 3234 clstat_debug.access.value.ui64++;
3235 3235 #endif
3236 3236 nap->next = hp->next;
3237 3237 hp->next = nap;
3238 3238 nap->next->prev = nap;
3239 3239 nap->prev = (acache_t *)hp;
3240 3240
3241 3241 mutex_enter(&rp->r_statelock);
3242 3242 nap->list = rp->r_acache;
3243 3243 rp->r_acache = nap;
3244 3244 mutex_exit(&rp->r_statelock);
3245 3245 }
3246 3246
3247 3247 rw_exit(&hp->lock);
3248 3248 }
3249 3249
3250 3250 int
3251 3251 nfs_access_purge_rp(rnode_t *rp)
3252 3252 {
3253 3253 acache_t *ap;
3254 3254 acache_t *tmpap;
3255 3255 acache_t *rplist;
3256 3256
3257 3257 /*
3258 3258 * If there aren't any cached entries, then there is nothing
3259 3259 * to free.
3260 3260 */
3261 3261 if (rp->r_acache == NULL)
3262 3262 return (0);
3263 3263
3264 3264 mutex_enter(&rp->r_statelock);
3265 3265 rplist = rp->r_acache;
3266 3266 rp->r_acache = NULL;
3267 3267 mutex_exit(&rp->r_statelock);
3268 3268
3269 3269 /*
3270 3270 * Loop through each entry in the list pointed to in the
3271 3271 * rnode. Remove each of these entries from the hash
3272 3272 * queue that it is on and remove it from the list in
3273 3273 * the rnode.
3274 3274 */
3275 3275 for (ap = rplist; ap != NULL; ap = tmpap) {
3276 3276 rw_enter(&ap->hashq->lock, RW_WRITER);
3277 3277 ap->prev->next = ap->next;
3278 3278 ap->next->prev = ap->prev;
3279 3279 rw_exit(&ap->hashq->lock);
3280 3280
3281 3281 tmpap = ap->list;
3282 3282 crfree(ap->cred);
3283 3283 kmem_cache_free(acache_cache, ap);
3284 3284 #ifdef DEBUG
3285 3285 clstat_debug.access.value.ui64--;
3286 3286 #endif
3287 3287 }
3288 3288
3289 3289 return (1);
3290 3290 }
3291 3291
3292 3292 static const char prefix[] = ".nfs";
3293 3293
3294 3294 static kmutex_t newnum_lock;
3295 3295
3296 3296 int
3297 3297 newnum(void)
3298 3298 {
3299 3299 static uint_t newnum = 0;
3300 3300 uint_t id;
3301 3301
3302 3302 mutex_enter(&newnum_lock);
3303 3303 if (newnum == 0)
3304 3304 newnum = gethrestime_sec() & 0xffff;
3305 3305 id = newnum++;
3306 3306 mutex_exit(&newnum_lock);
3307 3307 return (id);
3308 3308 }
3309 3309
3310 3310 char *
3311 3311 newname(void)
3312 3312 {
3313 3313 char *news;
3314 3314 char *s;
3315 3315 const char *p;
3316 3316 uint_t id;
3317 3317
3318 3318 id = newnum();
3319 3319 news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3320 3320 s = news;
3321 3321 p = prefix;
3322 3322 while (*p != '\0')
3323 3323 *s++ = *p++;
3324 3324 while (id != 0) {
3325 3325 *s++ = "0123456789ABCDEF"[id & 0x0f];
3326 3326 id >>= 4;
3327 3327 }
3328 3328 *s = '\0';
3329 3329 return (news);
3330 3330 }
3331 3331
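Note that newname() above emits the id's hex digits least-significant nibble first, so id 0x1A2B yields ".nfsB2A1". A userland sketch of the same construction — hypothetical function name, not code from this file:

#include <stdio.h>

/*
 * Sketch of newname(): build ".nfs" followed by the id's hex
 * digits, least-significant nibble first, as the kernel routine
 * does.
 */
static void
make_name(unsigned int id, char *buf)
{
	char *s = buf;
	const char *p = ".nfs";

	while (*p != '\0')
		*s++ = *p++;
	while (id != 0) {
		*s++ = "0123456789ABCDEF"[id & 0x0f];
		id >>= 4;
	}
	*s = '\0';
}

int
main(void)
{
	char buf[32];

	make_name(0x1A2B, buf);
	printf("%s\n", buf);	/* prints .nfsB2A1 */
	return (0);
}
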
3332 3332 /*
3333 3333 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3334 3334 * framework.
3335 3335 */
3336 3336 static int
3337 3337 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3338 3338 {
3339 3339 ksp->ks_snaptime = gethrtime();
3340 3340 if (rw == KSTAT_WRITE) {
3341 3341 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3342 3342 #ifdef DEBUG
3343 3343 /*
3344 3344 * Currently only the global zone can write to kstats, but we
3345 3345 * add the check just for paranoia.
3346 3346 */
3347 3347 if (INGLOBALZONE(curproc))
3348 3348 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3349 3349 sizeof (clstat_debug));
3350 3350 #endif
3351 3351 } else {
3352 3352 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3353 3353 #ifdef DEBUG
3354 3354 /*
3355 3355 * If we're displaying the "global" debug kstat values, we
3356 3356 * display them as-is to all zones since in fact they apply to
3357 3357 * the system as a whole.
3358 3358 */
3359 3359 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3360 3360 sizeof (clstat_debug));
3361 3361 #endif
3362 3362 }
3363 3363 return (0);
3364 3364 }
3365 3365
3366 3366 static void *
3367 3367 clinit_zone(zoneid_t zoneid)
3368 3368 {
3369 3369 kstat_t *nfs_client_kstat;
3370 3370 struct nfs_clnt *nfscl;
3371 3371 uint_t ndata;
3372 3372
3373 3373 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3374 3374 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3375 3375 nfscl->nfscl_chtable = NULL;
3376 3376 nfscl->nfscl_zoneid = zoneid;
3377 3377
3378 3378 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3379 3379 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3380 3380 #ifdef DEBUG
3381 3381 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3382 3382 #endif
3383 3383 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3384 3384 "misc", KSTAT_TYPE_NAMED, ndata,
3385 3385 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3386 3386 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3387 3387 nfs_client_kstat->ks_snapshot = cl_snapshot;
3388 3388 kstat_install(nfs_client_kstat);
3389 3389 }
3390 3390 mutex_enter(&nfs_clnt_list_lock);
3391 3391 list_insert_head(&nfs_clnt_list, nfscl);
3392 3392 mutex_exit(&nfs_clnt_list_lock);
3393 3393 return (nfscl);
3394 3394 }
3395 3395
3396 3396 /*ARGSUSED*/
3397 3397 static void
3398 3398 clfini_zone(zoneid_t zoneid, void *arg)
3399 3399 {
3400 3400 struct nfs_clnt *nfscl = arg;
3401 3401 chhead_t *chp, *next;
3402 3402
3403 3403 if (nfscl == NULL)
3404 3404 return;
3405 3405 mutex_enter(&nfs_clnt_list_lock);
3406 3406 list_remove(&nfs_clnt_list, nfscl);
3407 3407 mutex_exit(&nfs_clnt_list_lock);
3408 3408 clreclaim_zone(nfscl, 0);
3409 3409 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3410 3410 ASSERT(chp->ch_list == NULL);
3411 3411 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3412 3412 next = chp->ch_next;
3413 3413 kmem_free(chp, sizeof (*chp));
3414 3414 }
3415 3415 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3416 3416 mutex_destroy(&nfscl->nfscl_chtable_lock);
3417 3417 kmem_free(nfscl, sizeof (*nfscl));
3418 3418 }
3419 3419
3420 3420 /*
3421 3421 * Called by endpnt_destructor to make sure the client handles are
3422 3422 * cleaned up before the RPC endpoints. This becomes a no-op if
3423 3423 * clfini_zone (above) is called first. This function is needed
3424 3424 * (rather than relying on clfini_zone to clean up) because the ZSD
3425 3425 * callbacks have no ordering mechanism, so we have no way to ensure
3426 3426 * that clfini_zone is called before endpnt_destructor.
3427 3427 */
3428 3428 void
3429 3429 clcleanup_zone(zoneid_t zoneid)
3430 3430 {
3431 3431 struct nfs_clnt *nfscl;
3432 3432
3433 3433 mutex_enter(&nfs_clnt_list_lock);
3434 3434 nfscl = list_head(&nfs_clnt_list);
3435 3435 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3436 3436 if (nfscl->nfscl_zoneid == zoneid) {
3437 3437 clreclaim_zone(nfscl, 0);
3438 3438 break;
3439 3439 }
3440 3440 }
3441 3441 mutex_exit(&nfs_clnt_list_lock);
3442 3442 }
3443 3443
3444 3444 int
3445 3445 nfs_subrinit(void)
3446 3446 {
3447 3447 int i;
3448 3448 ulong_t nrnode_max;
3449 3449
3450 3450 /*
3451 3451 * Allocate and initialize the rnode hash queues
3452 3452 */
3453 3453 if (nrnode <= 0)
3454 3454 nrnode = ncsize;
3455 3455 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3456 3456 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3457 3457 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3458 3458 "!setting nrnode to max value of %ld", nrnode_max);
3459 3459 nrnode = nrnode_max;
3460 3460 }
3461 3461
3462 3462 rtablesize = 1 << highbit(nrnode / hashlen);
3463 3463 rtablemask = rtablesize - 1;
3464 3464 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3465 3465 for (i = 0; i < rtablesize; i++) {
3466 3466 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3467 3467 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3468 3468 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3469 3469 }
3470 3470 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3471 3471 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3472 3472
3473 3473 /*
3474 3474 * Allocate and initialize the access cache
3475 3475 */
3476 3476
3477 3477 /*
3478 3478 	 * The initial guess is one access cache entry per rnode, unless
3479 3479 	 * nacache is set to a non-zero value, in which case it is used
3480 3480 	 * as the guess at the number of access cache entries.
3481 3481 */
3482 3482 if (nacache > 0)
3483 3483 acachesize = 1 << highbit(nacache / hashlen);
3484 3484 else
3485 3485 acachesize = rtablesize;
3486 3486 acachemask = acachesize - 1;
3487 3487 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3488 3488 for (i = 0; i < acachesize; i++) {
3489 3489 acache[i].next = (acache_t *)&acache[i];
3490 3490 acache[i].prev = (acache_t *)&acache[i];
3491 3491 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3492 3492 }
3493 3493 acache_cache = kmem_cache_create("nfs_access_cache",
3494 3494 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3495 3495 /*
3496 3496 * Allocate and initialize the client handle cache
3497 3497 */
3498 3498 chtab_cache = kmem_cache_create("client_handle_cache",
3499 3499 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3500 3500 /*
3501 3501 * Initialize the list of per-zone client handles (and associated data).
3502 3502 * This needs to be done before we call zone_key_create().
3503 3503 */
3504 3504 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3505 3505 offsetof(struct nfs_clnt, nfscl_node));
3506 3506 /*
3507 3507 * Initialize the zone_key for per-zone client handle lists.
3508 3508 */
3509 3509 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3510 3510 /*
3511 3511 * Initialize the various mutexes and reader/writer locks
3512 3512 */
3513 3513 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3514 3514 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3515 3515 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3516 3516
3517 3517 /*
3518 3518 * Assign unique major number for all nfs mounts
3519 3519 */
3520 3520 if ((nfs_major = getudev()) == -1) {
3521 3521 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3522 3522 "nfs: init: can't get unique device number");
3523 3523 nfs_major = 0;
3524 3524 }
3525 3525 nfs_minor = 0;
3526 3526
3527 3527 if (nfs3_jukebox_delay == 0)
3528 3528 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3529 3529
3530 3530 return (0);
3531 3531 }
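
Both tables above are sized to a power of two via 1 << highbit(n), so a bucket can be selected with a bitwise AND against the mask instead of a modulo. A minimal userland sketch of the same sizing trick follows; the loop below is a stand-in for the kernel's highbit(), and the tunable values are made up for illustration:

	#include <stdio.h>

	/* Stand-in for the kernel's highbit(): 1-based position of the MSB. */
	static int
	highbit(unsigned long v)
	{
		int b = 0;

		while (v != 0) {
			b++;
			v >>= 1;
		}
		return (b);
	}

	int
	main(void)
	{
		unsigned long nrnode = 5000, hashlen = 4;	/* illustrative tunables */
		unsigned long tablesize = 1UL << highbit(nrnode / hashlen);
		unsigned long tablemask = tablesize - 1;
		unsigned long hash = 0xdeadbeefUL;		/* stand-in hash value */

		/* Power-of-two size makes bucket selection a mask, not a modulo. */
		printf("size %lu, bucket %lu\n", tablesize, hash & tablemask);
		return (0);
	}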
3532 3532
3533 3533 void
3534 3534 nfs_subrfini(void)
3535 3535 {
3536 3536 int i;
3537 3537
3538 3538 /*
3539 3539 * Deallocate the rnode hash queues
3540 3540 */
3541 3541 kmem_cache_destroy(rnode_cache);
3542 3542
3543 3543 for (i = 0; i < rtablesize; i++)
3544 3544 rw_destroy(&rtable[i].r_lock);
3545 3545 kmem_free(rtable, rtablesize * sizeof (*rtable));
3546 3546
3547 3547 /*
3548 3548 	 * Deallocate the access cache
3549 3549 */
3550 3550 kmem_cache_destroy(acache_cache);
3551 3551
3552 3552 for (i = 0; i < acachesize; i++)
3553 3553 rw_destroy(&acache[i].lock);
3554 3554 kmem_free(acache, acachesize * sizeof (*acache));
3555 3555
3556 3556 /*
3557 3557 * Deallocate the client handle cache
3558 3558 */
3559 3559 kmem_cache_destroy(chtab_cache);
3560 3560
3561 3561 /*
3562 3562 * Destroy the various mutexes and reader/writer locks
3563 3563 */
3564 3564 mutex_destroy(&rpfreelist_lock);
3565 3565 mutex_destroy(&newnum_lock);
3566 3566 mutex_destroy(&nfs_minor_lock);
3567 3567 (void) zone_key_delete(nfsclnt_zone_key);
3568 3568 }
3569 3569
3570 3570 enum nfsstat
3571 3571 puterrno(int error)
3572 3572 {
3573 3573
3574 3574 switch (error) {
3575 3575 case EOPNOTSUPP:
3576 3576 return (NFSERR_OPNOTSUPP);
3577 3577 case ENAMETOOLONG:
3578 3578 return (NFSERR_NAMETOOLONG);
3579 3579 case ENOTEMPTY:
3580 3580 return (NFSERR_NOTEMPTY);
3581 3581 case EDQUOT:
3582 3582 return (NFSERR_DQUOT);
3583 3583 case ESTALE:
3584 3584 return (NFSERR_STALE);
3585 3585 case EREMOTE:
3586 3586 return (NFSERR_REMOTE);
3587 3587 case ENOSYS:
3588 3588 return (NFSERR_OPNOTSUPP);
3589 3589 case EOVERFLOW:
3590 3590 return (NFSERR_INVAL);
3591 3591 default:
3592 3592 return ((enum nfsstat)error);
3593 3593 }
3594 3594 /* NOTREACHED */
3595 3595 }
3596 3596
3597 3597 int
3598 3598 geterrno(enum nfsstat status)
3599 3599 {
3600 3600
3601 3601 switch (status) {
3602 3602 case NFSERR_OPNOTSUPP:
3603 3603 return (EOPNOTSUPP);
3604 3604 case NFSERR_NAMETOOLONG:
3605 3605 return (ENAMETOOLONG);
3606 3606 case NFSERR_NOTEMPTY:
3607 3607 return (ENOTEMPTY);
3608 3608 case NFSERR_DQUOT:
3609 3609 return (EDQUOT);
3610 3610 case NFSERR_STALE:
3611 3611 return (ESTALE);
3612 3612 case NFSERR_REMOTE:
3613 3613 return (EREMOTE);
3614 3614 case NFSERR_WFLUSH:
3615 3615 return (EIO);
3616 3616 default:
3617 3617 return ((int)status);
3618 3618 }
3619 3619 /* NOTREACHED */
3620 3620 }
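
puterrno() and geterrno() work because the NFSv2 status codes were chosen to coincide numerically with UNIX errno values, so only the divergent codes need explicit cases and everything else passes through the default cast. A hypothetical sanity check of the round trip, assuming the two functions above and the usual errno/nfsstat headers are in scope; note that ENOSYS is deliberately not symmetric:

	/* Hypothetical check, not part of the driver. */
	void
	check_errno_mapping(void)
	{
		ASSERT(geterrno(puterrno(ESTALE)) == ESTALE);
		ASSERT(geterrno(puterrno(EDQUOT)) == EDQUOT);
		/* ENOSYS folds into NFSERR_OPNOTSUPP, so it comes back as EOPNOTSUPP. */
		ASSERT(geterrno(puterrno(ENOSYS)) == EOPNOTSUPP);
	}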
3621 3621
3622 3622 enum nfsstat3
3623 3623 puterrno3(int error)
3624 3624 {
3625 3625
3626 3626 #ifdef DEBUG
3627 3627 switch (error) {
3628 3628 case 0:
3629 3629 return (NFS3_OK);
3630 3630 case EPERM:
3631 3631 return (NFS3ERR_PERM);
3632 3632 case ENOENT:
3633 3633 return (NFS3ERR_NOENT);
3634 3634 case EIO:
3635 3635 return (NFS3ERR_IO);
3636 3636 case ENXIO:
3637 3637 return (NFS3ERR_NXIO);
3638 3638 case EACCES:
3639 3639 return (NFS3ERR_ACCES);
3640 3640 case EEXIST:
3641 3641 return (NFS3ERR_EXIST);
3642 3642 case EXDEV:
3643 3643 return (NFS3ERR_XDEV);
3644 3644 case ENODEV:
3645 3645 return (NFS3ERR_NODEV);
3646 3646 case ENOTDIR:
3647 3647 return (NFS3ERR_NOTDIR);
3648 3648 case EISDIR:
3649 3649 return (NFS3ERR_ISDIR);
3650 3650 case EINVAL:
3651 3651 return (NFS3ERR_INVAL);
3652 3652 case EFBIG:
3653 3653 return (NFS3ERR_FBIG);
3654 3654 case ENOSPC:
3655 3655 return (NFS3ERR_NOSPC);
3656 3656 case EROFS:
3657 3657 return (NFS3ERR_ROFS);
3658 3658 case EMLINK:
3659 3659 return (NFS3ERR_MLINK);
3660 3660 case ENAMETOOLONG:
3661 3661 return (NFS3ERR_NAMETOOLONG);
3662 3662 case ENOTEMPTY:
3663 3663 return (NFS3ERR_NOTEMPTY);
3664 3664 case EDQUOT:
3665 3665 return (NFS3ERR_DQUOT);
3666 3666 case ESTALE:
3667 3667 return (NFS3ERR_STALE);
3668 3668 case EREMOTE:
3669 3669 return (NFS3ERR_REMOTE);
3670 3670 case ENOSYS:
3671 3671 case EOPNOTSUPP:
3672 3672 return (NFS3ERR_NOTSUPP);
3673 3673 case EOVERFLOW:
3674 3674 return (NFS3ERR_INVAL);
3675 3675 default:
3676 3676 zcmn_err(getzoneid(), CE_WARN,
3677 3677 "puterrno3: got error %d", error);
3678 3678 return ((enum nfsstat3)error);
3679 3679 }
3680 3680 #else
3681 3681 switch (error) {
3682 3682 case ENAMETOOLONG:
3683 3683 return (NFS3ERR_NAMETOOLONG);
3684 3684 case ENOTEMPTY:
3685 3685 return (NFS3ERR_NOTEMPTY);
3686 3686 case EDQUOT:
3687 3687 return (NFS3ERR_DQUOT);
3688 3688 case ESTALE:
3689 3689 return (NFS3ERR_STALE);
3690 3690 case ENOSYS:
3691 3691 case EOPNOTSUPP:
3692 3692 return (NFS3ERR_NOTSUPP);
3693 3693 case EREMOTE:
3694 3694 return (NFS3ERR_REMOTE);
3695 3695 case EOVERFLOW:
3696 3696 return (NFS3ERR_INVAL);
3697 3697 default:
3698 3698 return ((enum nfsstat3)error);
3699 3699 }
3700 3700 #endif
3701 3701 }
3702 3702
3703 3703 int
3704 3704 geterrno3(enum nfsstat3 status)
3705 3705 {
3706 3706
3707 3707 #ifdef DEBUG
3708 3708 switch (status) {
3709 3709 case NFS3_OK:
3710 3710 return (0);
3711 3711 case NFS3ERR_PERM:
3712 3712 return (EPERM);
3713 3713 case NFS3ERR_NOENT:
3714 3714 return (ENOENT);
3715 3715 case NFS3ERR_IO:
3716 3716 return (EIO);
3717 3717 case NFS3ERR_NXIO:
3718 3718 return (ENXIO);
3719 3719 case NFS3ERR_ACCES:
3720 3720 return (EACCES);
3721 3721 case NFS3ERR_EXIST:
3722 3722 return (EEXIST);
3723 3723 case NFS3ERR_XDEV:
3724 3724 return (EXDEV);
3725 3725 case NFS3ERR_NODEV:
3726 3726 return (ENODEV);
3727 3727 case NFS3ERR_NOTDIR:
3728 3728 return (ENOTDIR);
3729 3729 case NFS3ERR_ISDIR:
3730 3730 return (EISDIR);
3731 3731 case NFS3ERR_INVAL:
3732 3732 return (EINVAL);
3733 3733 case NFS3ERR_FBIG:
3734 3734 return (EFBIG);
3735 3735 case NFS3ERR_NOSPC:
3736 3736 return (ENOSPC);
3737 3737 case NFS3ERR_ROFS:
3738 3738 return (EROFS);
3739 3739 case NFS3ERR_MLINK:
3740 3740 return (EMLINK);
3741 3741 case NFS3ERR_NAMETOOLONG:
3742 3742 return (ENAMETOOLONG);
3743 3743 case NFS3ERR_NOTEMPTY:
3744 3744 return (ENOTEMPTY);
3745 3745 case NFS3ERR_DQUOT:
3746 3746 return (EDQUOT);
3747 3747 case NFS3ERR_STALE:
3748 3748 return (ESTALE);
3749 3749 case NFS3ERR_REMOTE:
3750 3750 return (EREMOTE);
3751 3751 case NFS3ERR_BADHANDLE:
3752 3752 return (ESTALE);
3753 3753 case NFS3ERR_NOT_SYNC:
3754 3754 return (EINVAL);
3755 3755 case NFS3ERR_BAD_COOKIE:
3756 3756 return (ENOENT);
3757 3757 case NFS3ERR_NOTSUPP:
3758 3758 return (EOPNOTSUPP);
3759 3759 case NFS3ERR_TOOSMALL:
3760 3760 return (EINVAL);
3761 3761 case NFS3ERR_SERVERFAULT:
3762 3762 return (EIO);
3763 3763 case NFS3ERR_BADTYPE:
3764 3764 return (EINVAL);
3765 3765 case NFS3ERR_JUKEBOX:
3766 3766 return (ENXIO);
3767 3767 default:
3768 3768 zcmn_err(getzoneid(), CE_WARN,
3769 3769 "geterrno3: got status %d", status);
3770 3770 return ((int)status);
3771 3771 }
3772 3772 #else
3773 3773 switch (status) {
3774 3774 case NFS3ERR_NAMETOOLONG:
3775 3775 return (ENAMETOOLONG);
3776 3776 case NFS3ERR_NOTEMPTY:
3777 3777 return (ENOTEMPTY);
3778 3778 case NFS3ERR_DQUOT:
3779 3779 return (EDQUOT);
3780 3780 case NFS3ERR_STALE:
3781 3781 case NFS3ERR_BADHANDLE:
3782 3782 return (ESTALE);
3783 3783 case NFS3ERR_NOTSUPP:
3784 3784 return (EOPNOTSUPP);
3785 3785 case NFS3ERR_REMOTE:
3786 3786 return (EREMOTE);
3787 3787 case NFS3ERR_NOT_SYNC:
3788 3788 case NFS3ERR_TOOSMALL:
3789 3789 case NFS3ERR_BADTYPE:
3790 3790 return (EINVAL);
3791 3791 case NFS3ERR_BAD_COOKIE:
3792 3792 return (ENOENT);
3793 3793 case NFS3ERR_SERVERFAULT:
3794 3794 return (EIO);
3795 3795 case NFS3ERR_JUKEBOX:
3796 3796 return (ENXIO);
3797 3797 default:
3798 3798 return ((int)status);
3799 3799 }
3800 3800 #endif
3801 3801 }
3802 3802
3803 3803 rddir_cache *
3804 3804 rddir_cache_alloc(int flags)
3805 3805 {
3806 3806 rddir_cache *rc;
3807 3807
3808 3808 rc = kmem_alloc(sizeof (*rc), flags);
3809 3809 if (rc != NULL) {
3810 3810 rc->entries = NULL;
3811 3811 rc->flags = RDDIR;
3812 3812 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3813 3813 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3814 3814 rc->count = 1;
3815 3815 #ifdef DEBUG
3816 - atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3816 + atomic_inc_64(&clstat_debug.dirent.value.ui64);
3817 3817 #endif
3818 3818 }
3819 3819 return (rc);
3820 3820 }
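
The replacement above is the whole point of this change: when the delta is exactly one, atomic_inc_64() states the intent directly instead of passing a literal. The two forms are equivalent; a minimal sketch of the before/after pattern on a private counter:

	#include <sys/atomic.h>

	static uint64_t counter;	/* illustrative counter */

	void
	counter_examples(void)
	{
		atomic_add_64(&counter, 1);	/* old style: explicit delta of 1 */
		atomic_inc_64(&counter);	/* new style: increment by one */
		atomic_add_64(&counter, -1);	/* old style: explicit delta of -1 */
		atomic_dec_64(&counter);	/* new style: decrement by one */
	}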
3821 3821
3822 3822 static void
3823 3823 rddir_cache_free(rddir_cache *rc)
3824 3824 {
3825 3825
3826 3826 #ifdef DEBUG
3827 - atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3827 + atomic_dec_64(&clstat_debug.dirent.value.ui64);
3828 3828 #endif
3829 3829 if (rc->entries != NULL) {
3830 3830 #ifdef DEBUG
3831 3831 rddir_cache_buf_free(rc->entries, rc->buflen);
3832 3832 #else
3833 3833 kmem_free(rc->entries, rc->buflen);
3834 3834 #endif
3835 3835 }
3836 3836 cv_destroy(&rc->cv);
3837 3837 mutex_destroy(&rc->lock);
3838 3838 kmem_free(rc, sizeof (*rc));
3839 3839 }
3840 3840
3841 3841 void
3842 3842 rddir_cache_hold(rddir_cache *rc)
3843 3843 {
3844 3844
3845 3845 mutex_enter(&rc->lock);
3846 3846 rc->count++;
3847 3847 mutex_exit(&rc->lock);
3848 3848 }
3849 3849
3850 3850 void
3851 3851 rddir_cache_rele(rddir_cache *rc)
3852 3852 {
3853 3853
3854 3854 mutex_enter(&rc->lock);
3855 3855 ASSERT(rc->count > 0);
3856 3856 if (--rc->count == 0) {
3857 3857 mutex_exit(&rc->lock);
3858 3858 rddir_cache_free(rc);
3859 3859 } else
3860 3860 mutex_exit(&rc->lock);
3861 3861 }
3862 3862
3863 3863 #ifdef DEBUG
3864 3864 char *
3865 3865 rddir_cache_buf_alloc(size_t size, int flags)
3866 3866 {
3867 3867 char *rc;
3868 3868
3869 3869 rc = kmem_alloc(size, flags);
3870 3870 if (rc != NULL)
3871 3871 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3872 3872 return (rc);
3873 3873 }
3874 3874
3875 3875 void
3876 3876 rddir_cache_buf_free(void *addr, size_t size)
3877 3877 {
3878 3878
3879 3879 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3880 3880 kmem_free(addr, size);
3881 3881 }
3882 3882 #endif
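
The two routines above keep atomic_add_64() because the delta is a byte count rather than plus or minus one, so the inc/dec forms do not apply. Since size_t is unsigned, the subtraction is written as adding a negated delta that is first widened to int64_t, as in this sketch:

	#include <sys/atomic.h>

	static uint64_t bytes_cached;	/* illustrative byte counter */

	void
	track_bytes(size_t size, int grow)
	{
		if (grow)
			atomic_add_64(&bytes_cached, size);
		else
			/* Cast before negating so the delta isn't a huge unsigned value. */
			atomic_add_64(&bytes_cached, -(int64_t)size);
	}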
3883 3883
3884 3884 static int
3885 3885 nfs_free_data_reclaim(rnode_t *rp)
3886 3886 {
3887 3887 char *contents;
3888 3888 int size;
3889 3889 vsecattr_t *vsp;
3890 3890 nfs3_pathconf_info *info;
3891 3891 int freed;
3892 3892 cred_t *cred;
3893 3893
3894 3894 /*
3895 3895 * Free any held credentials and caches which
3896 3896 * may be associated with this rnode.
3897 3897 */
3898 3898 mutex_enter(&rp->r_statelock);
3899 3899 cred = rp->r_cred;
3900 3900 rp->r_cred = NULL;
3901 3901 contents = rp->r_symlink.contents;
3902 3902 size = rp->r_symlink.size;
3903 3903 rp->r_symlink.contents = NULL;
3904 3904 vsp = rp->r_secattr;
3905 3905 rp->r_secattr = NULL;
3906 3906 info = rp->r_pathconf;
3907 3907 rp->r_pathconf = NULL;
3908 3908 mutex_exit(&rp->r_statelock);
3909 3909
3910 3910 if (cred != NULL)
3911 3911 crfree(cred);
3912 3912
3913 3913 /*
3914 3914 * Free the access cache entries.
3915 3915 */
3916 3916 freed = nfs_access_purge_rp(rp);
3917 3917
3918 3918 if (!HAVE_RDDIR_CACHE(rp) &&
3919 3919 contents == NULL &&
3920 3920 vsp == NULL &&
3921 3921 info == NULL)
3922 3922 return (freed);
3923 3923
3924 3924 /*
3925 3925 * Free the readdir cache entries
3926 3926 */
3927 3927 if (HAVE_RDDIR_CACHE(rp))
3928 3928 nfs_purge_rddir_cache(RTOV(rp));
3929 3929
3930 3930 /*
3931 3931 * Free the symbolic link cache.
3932 3932 */
3933 3933 if (contents != NULL) {
3934 3934
3935 3935 kmem_free((void *)contents, size);
3936 3936 }
3937 3937
3938 3938 /*
3939 3939 * Free any cached ACL.
3940 3940 */
3941 3941 if (vsp != NULL)
3942 3942 nfs_acl_free(vsp);
3943 3943
3944 3944 /*
3945 3945 * Free any cached pathconf information.
3946 3946 */
3947 3947 if (info != NULL)
3948 3948 kmem_free(info, sizeof (*info));
3949 3949
3950 3950 return (1);
3951 3951 }
3952 3952
3953 3953 static int
3954 3954 nfs_active_data_reclaim(rnode_t *rp)
3955 3955 {
3956 3956 char *contents;
3957 3957 int size;
3958 3958 vsecattr_t *vsp;
3959 3959 nfs3_pathconf_info *info;
3960 3960 int freed;
3961 3961
3962 3962 /*
3963 3963 * Free any held credentials and caches which
3964 3964 * may be associated with this rnode.
3965 3965 */
3966 3966 if (!mutex_tryenter(&rp->r_statelock))
3967 3967 return (0);
3968 3968 contents = rp->r_symlink.contents;
3969 3969 size = rp->r_symlink.size;
3970 3970 rp->r_symlink.contents = NULL;
3971 3971 vsp = rp->r_secattr;
3972 3972 rp->r_secattr = NULL;
3973 3973 info = rp->r_pathconf;
3974 3974 rp->r_pathconf = NULL;
3975 3975 mutex_exit(&rp->r_statelock);
3976 3976
3977 3977 /*
3978 3978 * Free the access cache entries.
3979 3979 */
3980 3980 freed = nfs_access_purge_rp(rp);
3981 3981
3982 3982 if (!HAVE_RDDIR_CACHE(rp) &&
3983 3983 contents == NULL &&
3984 3984 vsp == NULL &&
3985 3985 info == NULL)
3986 3986 return (freed);
3987 3987
3988 3988 /*
3989 3989 * Free the readdir cache entries
3990 3990 */
3991 3991 if (HAVE_RDDIR_CACHE(rp))
3992 3992 nfs_purge_rddir_cache(RTOV(rp));
3993 3993
3994 3994 /*
3995 3995 * Free the symbolic link cache.
3996 3996 */
3997 3997 if (contents != NULL) {
3998 3998
3999 3999 kmem_free((void *)contents, size);
4000 4000 }
4001 4001
4002 4002 /*
4003 4003 * Free any cached ACL.
4004 4004 */
4005 4005 if (vsp != NULL)
4006 4006 nfs_acl_free(vsp);
4007 4007
4008 4008 /*
4009 4009 * Free any cached pathconf information.
4010 4010 */
4011 4011 if (info != NULL)
4012 4012 kmem_free(info, sizeof (*info));
4013 4013
4014 4014 return (1);
4015 4015 }
4016 4016
4017 4017 static int
4018 4018 nfs_free_reclaim(void)
4019 4019 {
4020 4020 int freed;
4021 4021 rnode_t *rp;
4022 4022
4023 4023 #ifdef DEBUG
4024 4024 clstat_debug.f_reclaim.value.ui64++;
4025 4025 #endif
4026 4026 freed = 0;
4027 4027 mutex_enter(&rpfreelist_lock);
4028 4028 rp = rpfreelist;
4029 4029 if (rp != NULL) {
4030 4030 do {
4031 4031 if (nfs_free_data_reclaim(rp))
4032 4032 freed = 1;
4033 4033 } while ((rp = rp->r_freef) != rpfreelist);
4034 4034 }
4035 4035 mutex_exit(&rpfreelist_lock);
4036 4036 return (freed);
4037 4037 }
4038 4038
4039 4039 static int
4040 4040 nfs_active_reclaim(void)
4041 4041 {
4042 4042 int freed;
4043 4043 int index;
4044 4044 rnode_t *rp;
4045 4045
4046 4046 #ifdef DEBUG
4047 4047 clstat_debug.a_reclaim.value.ui64++;
4048 4048 #endif
4049 4049 freed = 0;
4050 4050 for (index = 0; index < rtablesize; index++) {
4051 4051 rw_enter(&rtable[index].r_lock, RW_READER);
4052 4052 for (rp = rtable[index].r_hashf;
4053 4053 rp != (rnode_t *)(&rtable[index]);
4054 4054 rp = rp->r_hashf) {
4055 4055 if (nfs_active_data_reclaim(rp))
4056 4056 freed = 1;
4057 4057 }
4058 4058 rw_exit(&rtable[index].r_lock);
4059 4059 }
4060 4060 return (freed);
4061 4061 }
4062 4062
4063 4063 static int
4064 4064 nfs_rnode_reclaim(void)
4065 4065 {
4066 4066 int freed;
4067 4067 rnode_t *rp;
4068 4068 vnode_t *vp;
4069 4069
4070 4070 #ifdef DEBUG
4071 4071 clstat_debug.r_reclaim.value.ui64++;
4072 4072 #endif
4073 4073 freed = 0;
4074 4074 mutex_enter(&rpfreelist_lock);
4075 4075 while ((rp = rpfreelist) != NULL) {
4076 4076 rp_rmfree(rp);
4077 4077 mutex_exit(&rpfreelist_lock);
4078 4078 if (rp->r_flags & RHASHED) {
4079 4079 vp = RTOV(rp);
4080 4080 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4081 4081 mutex_enter(&vp->v_lock);
4082 4082 if (vp->v_count > 1) {
4083 4083 vp->v_count--;
4084 4084 mutex_exit(&vp->v_lock);
4085 4085 rw_exit(&rp->r_hashq->r_lock);
4086 4086 mutex_enter(&rpfreelist_lock);
4087 4087 continue;
4088 4088 }
4089 4089 mutex_exit(&vp->v_lock);
4090 4090 rp_rmhash_locked(rp);
4091 4091 rw_exit(&rp->r_hashq->r_lock);
4092 4092 }
4093 4093 /*
4094 4094 * This call to rp_addfree will end up destroying the
4095 4095 * rnode, but in a safe way with the appropriate set
4096 4096 * of checks done.
4097 4097 */
4098 4098 rp_addfree(rp, CRED());
4099 4099 mutex_enter(&rpfreelist_lock);
4100 4100 }
4101 4101 mutex_exit(&rpfreelist_lock);
4102 4102 return (freed);
4103 4103 }
4104 4104
4105 4105 /*ARGSUSED*/
4106 4106 static void
4107 4107 nfs_reclaim(void *cdrarg)
4108 4108 {
4109 4109
4110 4110 #ifdef DEBUG
4111 4111 clstat_debug.reclaim.value.ui64++;
4112 4112 #endif
4113 4113 if (nfs_free_reclaim())
4114 4114 return;
4115 4115
4116 4116 if (nfs_active_reclaim())
4117 4117 return;
4118 4118
4119 4119 (void) nfs_rnode_reclaim();
4120 4120 }
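
nfs_reclaim() is the callback registered with rnode_cache in nfs_subrinit() above; the kmem subsystem invokes it under memory pressure, and it escalates through the three tiers, stopping as soon as one frees something. A sketch of how such a callback is wired up, mirroring the rnode_cache creation (the cache name is illustrative):

	static kmem_cache_t *example_cache;

	void
	example_cache_init(void)
	{
		example_cache = kmem_cache_create("example_cache",
		    sizeof (rnode_t), 0,	/* bufsize, alignment */
		    NULL, NULL,			/* constructor, destructor */
		    nfs_reclaim,		/* called when memory is tight */
		    NULL, NULL, 0);		/* private, vmem source, cflags */
	}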
4121 4121
4122 4122 /*
4123 4123 * NFS client failover support
4124 4124 *
4125 4125 * Routines to copy filehandles
4126 4126 */
4127 4127 void
4128 4128 nfscopyfh(caddr_t fhp, vnode_t *vp)
4129 4129 {
4130 4130 fhandle_t *dest = (fhandle_t *)fhp;
4131 4131
4132 4132 if (dest != NULL)
4133 4133 *dest = *VTOFH(vp);
4134 4134 }
4135 4135
4136 4136 void
4137 4137 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4138 4138 {
4139 4139 nfs_fh3 *dest = (nfs_fh3 *)fhp;
4140 4140
4141 4141 if (dest != NULL)
4142 4142 *dest = *VTOFH3(vp);
4143 4143 }
4144 4144
4145 4145 /*
4146 4146 * NFS client failover support
4147 4147 *
4148 4148 * failover_safe() will test various conditions to ensure that
4149 4149 * failover is permitted for this vnode. It will be denied
4150 4150 * if:
4151 4151 * 1) the operation in progress does not support failover (NULL fi)
4152 4152 * 2) there are no available replicas (NULL mi_servers->sv_next)
4153 4153 * 3) any locks are outstanding on this file
4154 4154 */
4155 4155 static int
4156 4156 failover_safe(failinfo_t *fi)
4157 4157 {
4158 4158
4159 4159 /*
4160 4160 * Does this op permit failover?
4161 4161 */
4162 4162 if (fi == NULL || fi->vp == NULL)
4163 4163 return (0);
4164 4164
4165 4165 /*
4166 4166 * Are there any alternates to failover to?
4167 4167 */
4168 4168 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4169 4169 return (0);
4170 4170
4171 4171 /*
4172 4172 * Disable check; we've forced local locking
4173 4173 *
4174 4174 * if (flk_has_remote_locks(fi->vp))
4175 4175 * return (0);
4176 4176 */
4177 4177
4178 4178 /*
4179 4179 * If we have no partial path, we can't do anything
4180 4180 */
4181 4181 if (VTOR(fi->vp)->r_path == NULL)
4182 4182 return (0);
4183 4183
4184 4184 return (1);
4185 4185 }
4186 4186
4187 4187 #include <sys/thread.h>
4188 4188
4189 4189 /*
4190 4190 * NFS client failover support
4191 4191 *
4192 4192 * failover_newserver() will start a search for a new server,
4193 4193 * preferably by starting an async thread to do the work. If
4194 4194 * someone is already doing this (recognizable by MI_BINDINPROG
4195 4195 * being set), it will simply return and the calling thread
4196 4196 * will queue on the mi_failover_cv condition variable.
4197 4197 */
4198 4198 static void
4199 4199 failover_newserver(mntinfo_t *mi)
4200 4200 {
4201 4201 /*
4202 4202 * Check if someone else is doing this already
4203 4203 */
4204 4204 mutex_enter(&mi->mi_lock);
4205 4205 if (mi->mi_flags & MI_BINDINPROG) {
4206 4206 mutex_exit(&mi->mi_lock);
4207 4207 return;
4208 4208 }
4209 4209 mi->mi_flags |= MI_BINDINPROG;
4210 4210
4211 4211 /*
4212 4212 * Need to hold the vfs struct so that it can't be released
4213 4213 * while the failover thread is selecting a new server.
4214 4214 */
4215 4215 VFS_HOLD(mi->mi_vfsp);
4216 4216
4217 4217 /*
4218 4218 * Start a thread to do the real searching.
4219 4219 */
4220 4220 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4221 4221
4222 4222 mutex_exit(&mi->mi_lock);
4223 4223 }
4224 4224
4225 4225 /*
4226 4226 * NFS client failover support
4227 4227 *
4228 4228 * failover_thread() will find a new server to replace the one
4229 4229 * currently in use, wake up other threads waiting on this mount
4230 4230 * point, and die. It will start at the head of the server list
4231 4231 * and poll servers until it finds one with an NFS server which is
4232 4232 * registered and responds to a NULL procedure ping.
4233 4233 *
4234 4234 * XXX failover_thread is unsafe within the scope of the
4235 4235 * present model defined for cpr to suspend the system.
4236 4236 * Specifically, over-the-wire calls made by the thread
4237 4237 * are unsafe. The thread needs to be reevaluated in case of
4238 4238 * future updates to the cpr suspend model.
4239 4239 */
4240 4240 static void
4241 4241 failover_thread(mntinfo_t *mi)
4242 4242 {
4243 4243 servinfo_t *svp = NULL;
4244 4244 CLIENT *cl;
4245 4245 enum clnt_stat status;
4246 4246 struct timeval tv;
4247 4247 int error;
4248 4248 int oncethru = 0;
4249 4249 callb_cpr_t cprinfo;
4250 4250 rnode_t *rp;
4251 4251 int index;
4252 4252 char *srvnames;
4253 4253 size_t srvnames_len;
4254 4254 struct nfs_clnt *nfscl = NULL;
4255 4255 zoneid_t zoneid = getzoneid();
4256 4256
4257 4257 #ifdef DEBUG
4258 4258 /*
4259 4259 * This is currently only needed to access counters which exist on
4260 4260 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4261 4261 * on non-DEBUG kernels.
4262 4262 */
4263 4263 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4264 4264 ASSERT(nfscl != NULL);
4265 4265 #endif
4266 4266
4267 4267 /*
4268 4268 	 * It's safe to piggyback on the mi_lock since the failover_newserver()
4269 4269 	 * code guarantees that there will be only one failover thread
4270 4270 	 * per mntinfo at any given time.
4271 4271 */
4272 4272 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4273 4273 "failover_thread");
4274 4274
4275 4275 mutex_enter(&mi->mi_lock);
4276 4276 while (mi->mi_readers) {
4277 4277 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4278 4278 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4279 4279 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4280 4280 }
4281 4281 mutex_exit(&mi->mi_lock);
4282 4282
4283 4283 tv.tv_sec = 2;
4284 4284 tv.tv_usec = 0;
4285 4285
4286 4286 /*
4287 4287 * Ping the null NFS procedure of every server in
4288 4288 * the list until one responds. We always start
4289 4289 * at the head of the list and always skip the one
4290 4290 * that is current, since it's caused us a problem.
4291 4291 */
4292 4292 while (svp == NULL) {
4293 4293 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4294 4294 if (!oncethru && svp == mi->mi_curr_serv)
4295 4295 continue;
4296 4296
4297 4297 /*
4298 4298 * If the file system was forcibly umounted
4299 4299 * while trying to do a failover, then just
4300 4300 * give up on the failover. It won't matter
4301 4301 * what the server is.
4302 4302 */
4303 4303 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4304 4304 svp = NULL;
4305 4305 goto done;
4306 4306 }
4307 4307
4308 4308 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4309 4309 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4310 4310 if (error)
4311 4311 continue;
4312 4312
4313 4313 if (!(mi->mi_flags & MI_INT))
4314 4314 cl->cl_nosignal = TRUE;
4315 4315 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4316 4316 xdr_void, NULL, tv);
4317 4317 if (!(mi->mi_flags & MI_INT))
4318 4318 cl->cl_nosignal = FALSE;
4319 4319 AUTH_DESTROY(cl->cl_auth);
4320 4320 CLNT_DESTROY(cl);
4321 4321 if (status == RPC_SUCCESS) {
4322 4322 if (svp == mi->mi_curr_serv) {
4323 4323 #ifdef DEBUG
4324 4324 zcmn_err(zoneid, CE_NOTE,
4325 4325 "NFS%d: failing over: selecting original server %s",
4326 4326 mi->mi_vers, svp->sv_hostname);
4327 4327 #else
4328 4328 zcmn_err(zoneid, CE_NOTE,
4329 4329 "NFS: failing over: selecting original server %s",
4330 4330 svp->sv_hostname);
4331 4331 #endif
4332 4332 } else {
4333 4333 #ifdef DEBUG
4334 4334 zcmn_err(zoneid, CE_NOTE,
4335 4335 "NFS%d: failing over from %s to %s",
4336 4336 mi->mi_vers,
4337 4337 mi->mi_curr_serv->sv_hostname,
4338 4338 svp->sv_hostname);
4339 4339 #else
4340 4340 zcmn_err(zoneid, CE_NOTE,
4341 4341 "NFS: failing over from %s to %s",
4342 4342 mi->mi_curr_serv->sv_hostname,
4343 4343 svp->sv_hostname);
4344 4344 #endif
4345 4345 }
4346 4346 break;
4347 4347 }
4348 4348 }
4349 4349
4350 4350 if (svp == NULL) {
4351 4351 if (!oncethru) {
4352 4352 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4353 4353 #ifdef DEBUG
4354 4354 zprintf(zoneid,
4355 4355 "NFS%d servers %s not responding "
4356 4356 "still trying\n", mi->mi_vers, srvnames);
4357 4357 #else
4358 4358 zprintf(zoneid, "NFS servers %s not responding "
4359 4359 "still trying\n", srvnames);
4360 4360 #endif
4361 4361 oncethru = 1;
4362 4362 }
4363 4363 mutex_enter(&mi->mi_lock);
4364 4364 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4365 4365 mutex_exit(&mi->mi_lock);
4366 4366 delay(hz);
4367 4367 mutex_enter(&mi->mi_lock);
4368 4368 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4369 4369 mutex_exit(&mi->mi_lock);
4370 4370 }
4371 4371 }
4372 4372
4373 4373 if (oncethru) {
4374 4374 #ifdef DEBUG
4375 4375 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4376 4376 #else
4377 4377 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4378 4378 #endif
4379 4379 }
4380 4380
4381 4381 if (svp != mi->mi_curr_serv) {
4382 4382 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4383 4383 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4384 4384 rw_enter(&rtable[index].r_lock, RW_WRITER);
4385 4385 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4386 4386 mi->mi_vfsp);
4387 4387 if (rp != NULL) {
4388 4388 if (rp->r_flags & RHASHED)
4389 4389 rp_rmhash_locked(rp);
4390 4390 rw_exit(&rtable[index].r_lock);
4391 4391 rp->r_server = svp;
4392 4392 rp->r_fh = svp->sv_fhandle;
4393 4393 (void) nfs_free_data_reclaim(rp);
4394 4394 index = rtablehash(&rp->r_fh);
4395 4395 rp->r_hashq = &rtable[index];
4396 4396 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4397 4397 vn_exists(RTOV(rp));
4398 4398 rp_addhash(rp);
4399 4399 rw_exit(&rp->r_hashq->r_lock);
4400 4400 VN_RELE(RTOV(rp));
4401 4401 } else
4402 4402 rw_exit(&rtable[index].r_lock);
4403 4403 }
4404 4404
4405 4405 done:
4406 4406 if (oncethru)
4407 4407 kmem_free(srvnames, srvnames_len);
4408 4408 mutex_enter(&mi->mi_lock);
4409 4409 mi->mi_flags &= ~MI_BINDINPROG;
4410 4410 if (svp != NULL) {
4411 4411 mi->mi_curr_serv = svp;
4412 4412 mi->mi_failover++;
4413 4413 #ifdef DEBUG
4414 4414 nfscl->nfscl_stat.failover.value.ui64++;
4415 4415 #endif
4416 4416 }
4417 4417 cv_broadcast(&mi->mi_failover_cv);
4418 4418 CALLB_CPR_EXIT(&cprinfo);
4419 4419 VFS_RELE(mi->mi_vfsp);
4420 4420 zthread_exit();
4421 4421 /* NOTREACHED */
4422 4422 }
4423 4423
4424 4424 /*
4425 4425 * NFS client failover support
4426 4426 *
4427 4427 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4428 4428 * is cleared, meaning that failover is complete. Called with
4429 4429 * mi_lock mutex held.
4430 4430 */
4431 4431 static int
4432 4432 failover_wait(mntinfo_t *mi)
4433 4433 {
4434 4434 k_sigset_t smask;
4435 4435
4436 4436 /*
4437 4437 * If someone else is hunting for a living server,
4438 4438 * sleep until it's done. After our sleep, we may
4439 4439 * be bound to the right server and get off cheaply.
4440 4440 */
4441 4441 while (mi->mi_flags & MI_BINDINPROG) {
4442 4442 /*
4443 4443 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4444 4444 * and SIGTERM. (Preserving the existing masks).
4445 4445 * Mask out SIGINT if mount option nointr is specified.
4446 4446 */
4447 4447 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4448 4448 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4449 4449 /*
4450 4450 * restore original signal mask
4451 4451 */
4452 4452 sigunintr(&smask);
4453 4453 return (EINTR);
4454 4454 }
4455 4455 /*
4456 4456 * restore original signal mask
4457 4457 */
4458 4458 sigunintr(&smask);
4459 4459 }
4460 4460 return (0);
4461 4461 }
4462 4462
4463 4463 /*
4464 4464 * NFS client failover support
4465 4465 *
4466 4466 * failover_remap() will do a partial pathname lookup and find the
4467 4467 * desired vnode on the current server. The interim vnode will be
4468 4468 * discarded after we pilfer the new filehandle.
4469 4469 *
4470 4470 * Side effects:
4471 4471 * - This routine will also update the filehandle in the args structure
4472 4472 * pointed to by the fi->fhp pointer if it is non-NULL.
4473 4473 */
4474 4474
4475 4475 static int
4476 4476 failover_remap(failinfo_t *fi)
4477 4477 {
4478 4478 vnode_t *vp, *nvp, *rootvp;
4479 4479 rnode_t *rp, *nrp;
4480 4480 mntinfo_t *mi;
4481 4481 int error;
4482 4482 #ifdef DEBUG
4483 4483 struct nfs_clnt *nfscl;
4484 4484
4485 4485 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4486 4486 ASSERT(nfscl != NULL);
4487 4487 #endif
4488 4488 /*
4489 4489 * Sanity check
4490 4490 */
4491 4491 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4492 4492 return (EINVAL);
4493 4493 vp = fi->vp;
4494 4494 rp = VTOR(vp);
4495 4495 mi = VTOMI(vp);
4496 4496
4497 4497 if (!(vp->v_flag & VROOT)) {
4498 4498 /*
4499 4499 * Given the root fh, use the path stored in
4500 4500 * the rnode to find the fh for the new server.
4501 4501 */
4502 4502 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4503 4503 if (error)
4504 4504 return (error);
4505 4505
4506 4506 error = failover_lookup(rp->r_path, rootvp,
4507 4507 fi->lookupproc, fi->xattrdirproc, &nvp);
4508 4508
4509 4509 VN_RELE(rootvp);
4510 4510
4511 4511 if (error)
4512 4512 return (error);
4513 4513
4514 4514 /*
4515 4515 * If we found the same rnode, we're done now
4516 4516 */
4517 4517 if (nvp == vp) {
4518 4518 /*
4519 4519 			 * Failover found the same rnode: the new server may
4520 4520 			 * be physically the same machine or may share the
4521 4521 			 * same disk subsystem. In that case the file handle
4522 4522 			 * for a given file path does not change, so a lookup
4523 4523 			 * of the same filehandle will always locate the
4524 4524 			 * existing rnode. All we may need to do is update
4525 4525 			 * r_server with the current servinfo.
4526 4526 */
4527 4527 if (!VALID_FH(fi)) {
4528 4528 rp->r_server = mi->mi_curr_serv;
4529 4529 }
4530 4530 VN_RELE(nvp);
4531 4531 return (0);
4532 4532 }
4533 4533
4534 4534 /*
4535 4535 * Try to make it so that no one else will find this
4536 4536 * vnode because it is just a temporary to hold the
4537 4537 * new file handle until that file handle can be
4538 4538 * copied to the original vnode/rnode.
4539 4539 */
4540 4540 nrp = VTOR(nvp);
4541 4541 mutex_enter(&mi->mi_remap_lock);
4542 4542 /*
4543 4543 		 * Some other thread could have raced in here and already
4544 4544 		 * done the remap for this particular rnode before this
4545 4545 		 * thread got to it. Check rp->r_server against
4546 4546 		 * mi->mi_curr_serv and return if they are the same.
4547 4547 */
4548 4548 if (VALID_FH(fi)) {
4549 4549 mutex_exit(&mi->mi_remap_lock);
4550 4550 VN_RELE(nvp);
4551 4551 return (0);
4552 4552 }
4553 4553
4554 4554 if (nrp->r_flags & RHASHED)
4555 4555 rp_rmhash(nrp);
4556 4556
4557 4557 /*
4558 4558 * As a heuristic check on the validity of the new
4559 4559 * file, check that the size and type match against
4560 4560 * that we remember from the old version.
4561 4561 */
4562 4562 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4563 4563 mutex_exit(&mi->mi_remap_lock);
4564 4564 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4565 4565 "NFS replicas %s and %s: file %s not same.",
4566 4566 rp->r_server->sv_hostname,
4567 4567 nrp->r_server->sv_hostname, rp->r_path);
4568 4568 VN_RELE(nvp);
4569 4569 return (EINVAL);
4570 4570 }
4571 4571
4572 4572 /*
4573 4573 * snarf the filehandle from the new rnode
4574 4574 * then release it, again while updating the
4575 4575 * hash queues for the rnode.
4576 4576 */
4577 4577 if (rp->r_flags & RHASHED)
4578 4578 rp_rmhash(rp);
4579 4579 rp->r_server = mi->mi_curr_serv;
4580 4580 rp->r_fh = nrp->r_fh;
4581 4581 rp->r_hashq = nrp->r_hashq;
4582 4582 /*
4583 4583 * Copy the attributes from the new rnode to the old
4584 4584 * rnode. This will help to reduce unnecessary page
4585 4585 * cache flushes.
4586 4586 */
4587 4587 rp->r_attr = nrp->r_attr;
4588 4588 rp->r_attrtime = nrp->r_attrtime;
4589 4589 rp->r_mtime = nrp->r_mtime;
4590 4590 (void) nfs_free_data_reclaim(rp);
4591 4591 nfs_setswaplike(vp, &rp->r_attr);
4592 4592 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4593 4593 rp_addhash(rp);
4594 4594 rw_exit(&rp->r_hashq->r_lock);
4595 4595 mutex_exit(&mi->mi_remap_lock);
4596 4596 VN_RELE(nvp);
4597 4597 }
4598 4598
4599 4599 /*
4600 4600 * Update successful failover remap count
4601 4601 */
4602 4602 mutex_enter(&mi->mi_lock);
4603 4603 mi->mi_remap++;
4604 4604 mutex_exit(&mi->mi_lock);
4605 4605 #ifdef DEBUG
4606 4606 nfscl->nfscl_stat.remap.value.ui64++;
4607 4607 #endif
4608 4608
4609 4609 /*
4610 4610 * If we have a copied filehandle to update, do it now.
4611 4611 */
4612 4612 if (fi->fhp != NULL && fi->copyproc != NULL)
4613 4613 (*fi->copyproc)(fi->fhp, vp);
4614 4614
4615 4615 return (0);
4616 4616 }
4617 4617
4618 4618 /*
4619 4619 * NFS client failover support
4620 4620 *
4621 4621 * We want a simple pathname lookup routine to parse the pieces
4622 4622  * of path in rp->r_path. We know that the path was created
4623 4623 * as rnodes were made, so we know we have only to deal with
4624 4624 * paths that look like:
4625 4625 * dir1/dir2/dir3/file
4626 4626  * Any evidence of anything like .., symlinks, or ENOTDIR
4627 4627  * is a hard error, because it means something in this filesystem
4628 4628 * is different from the one we came from, or has changed under
4629 4629 * us in some way. If this is true, we want the failure.
4630 4630 *
4631 4631 * Extended attributes: if the filesystem is mounted with extended
4632 4632 * attributes enabled (-o xattr), the attribute directory will be
4633 4633 * represented in the r_path as the magic name XATTR_RPATH. So if
4634 4634  * we see that name in the pathname, it must be because this node
4635 4635 * is an extended attribute. Therefore, look it up that way.
4636 4636 */
4637 4637 static int
4638 4638 failover_lookup(char *path, vnode_t *root,
4639 4639 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4640 4640 vnode_t *, cred_t *, int),
4641 4641 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4642 4642 vnode_t **new)
4643 4643 {
4644 4644 vnode_t *dvp, *nvp;
4645 4645 int error = EINVAL;
4646 4646 char *s, *p, *tmppath;
4647 4647 size_t len;
4648 4648 mntinfo_t *mi;
4649 4649 bool_t xattr;
4650 4650
4651 4651 /* Make local copy of path */
4652 4652 len = strlen(path) + 1;
4653 4653 tmppath = kmem_alloc(len, KM_SLEEP);
4654 4654 (void) strcpy(tmppath, path);
4655 4655 s = tmppath;
4656 4656
4657 4657 dvp = root;
4658 4658 VN_HOLD(dvp);
4659 4659 mi = VTOMI(root);
4660 4660 xattr = mi->mi_flags & MI_EXTATTR;
4661 4661
4662 4662 do {
4663 4663 p = strchr(s, '/');
4664 4664 if (p != NULL)
4665 4665 *p = '\0';
4666 4666 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4667 4667 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4668 4668 RFSCALL_SOFT);
4669 4669 } else {
4670 4670 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4671 4671 CRED(), RFSCALL_SOFT);
4672 4672 }
4673 4673 if (p != NULL)
4674 4674 *p++ = '/';
4675 4675 if (error) {
4676 4676 VN_RELE(dvp);
4677 4677 kmem_free(tmppath, len);
4678 4678 return (error);
4679 4679 }
4680 4680 s = p;
4681 4681 VN_RELE(dvp);
4682 4682 dvp = nvp;
4683 4683 } while (p != NULL);
4684 4684
4685 4685 if (nvp != NULL && new != NULL)
4686 4686 *new = nvp;
4687 4687 kmem_free(tmppath, len);
4688 4688 return (0);
4689 4689 }
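
The in-place split used above — temporarily NUL-terminating at each '/' and then restoring it — avoids allocating per-component strings. A standalone userland sketch of the same tokenizer, with the lookup call replaced by a printf:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int
	main(void)
	{
		char *tmppath = strdup("dir1/dir2/file");
		char *s = tmppath, *p;

		do {
			p = strchr(s, '/');
			if (p != NULL)
				*p = '\0';	/* cut the path at this component */
			printf("lookup: %s\n", s);
			if (p != NULL)
				*p++ = '/';	/* restore the separator, advance */
			s = p;
		} while (p != NULL);

		free(tmppath);
		return (0);
	}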
4690 4690
4691 4691 /*
4692 4692 * NFS client failover support
4693 4693 *
4694 4694 * sv_free() frees the malloc'd portion of a "servinfo_t".
4695 4695 */
4696 4696 void
4697 4697 sv_free(servinfo_t *svp)
4698 4698 {
4699 4699 servinfo_t *next;
4700 4700 struct knetconfig *knconf;
4701 4701
4702 4702 while (svp != NULL) {
4703 4703 next = svp->sv_next;
4704 4704 if (svp->sv_secdata)
4705 4705 sec_clnt_freeinfo(svp->sv_secdata);
4706 4706 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4707 4707 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4708 4708 knconf = svp->sv_knconf;
4709 4709 if (knconf != NULL) {
4710 4710 if (knconf->knc_protofmly != NULL)
4711 4711 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4712 4712 if (knconf->knc_proto != NULL)
4713 4713 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4714 4714 kmem_free(knconf, sizeof (*knconf));
4715 4715 }
4716 4716 knconf = svp->sv_origknconf;
4717 4717 if (knconf != NULL) {
4718 4718 if (knconf->knc_protofmly != NULL)
4719 4719 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4720 4720 if (knconf->knc_proto != NULL)
4721 4721 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4722 4722 kmem_free(knconf, sizeof (*knconf));
4723 4723 }
4724 4724 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4725 4725 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4726 4726 mutex_destroy(&svp->sv_lock);
4727 4727 kmem_free(svp, sizeof (*svp));
4728 4728 svp = next;
4729 4729 }
4730 4730 }
4731 4731
4732 4732 /*
4733 4733 * Only can return non-zero if intr != 0.
4734 4734 */
4735 4735 int
4736 4736 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4737 4737 {
4738 4738
4739 4739 mutex_enter(&l->lock);
4740 4740
4741 4741 /*
4742 4742 * If this is a nested enter, then allow it. There
4743 4743 	 * must be as many exits as enters, though.
4744 4744 */
4745 4745 if (l->owner == curthread) {
4746 4746 /* lock is held for writing by current thread */
4747 4747 ASSERT(rw == RW_READER || rw == RW_WRITER);
4748 4748 l->count--;
4749 4749 } else if (rw == RW_READER) {
4750 4750 /*
4751 4751 * While there is a writer active or writers waiting,
4752 4752 * then wait for them to finish up and move on. Then,
4753 4753 * increment the count to indicate that a reader is
4754 4754 * active.
4755 4755 */
4756 4756 while (l->count < 0 || l->waiters > 0) {
4757 4757 if (intr) {
4758 4758 klwp_t *lwp = ttolwp(curthread);
4759 4759
4760 4760 if (lwp != NULL)
4761 4761 lwp->lwp_nostop++;
4762 4762 if (!cv_wait_sig(&l->cv, &l->lock)) {
4763 4763 if (lwp != NULL)
4764 4764 lwp->lwp_nostop--;
4765 4765 mutex_exit(&l->lock);
4766 4766 return (EINTR);
4767 4767 }
4768 4768 if (lwp != NULL)
4769 4769 lwp->lwp_nostop--;
4770 4770 } else
4771 4771 cv_wait(&l->cv, &l->lock);
4772 4772 }
4773 4773 ASSERT(l->count < INT_MAX);
4774 4774 #ifdef DEBUG
4775 4775 if ((l->count % 10000) == 9999)
4776 4776 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4777 4777 			    "rwlock @ %p\n", l->count, (void *)l);
4778 4778 #endif
4779 4779 l->count++;
4780 4780 } else {
4781 4781 ASSERT(rw == RW_WRITER);
4782 4782 /*
4783 4783 * While there are readers active or a writer
4784 4784 * active, then wait for all of the readers
4785 4785 * to finish or for the writer to finish.
4786 4786 * Then, set the owner field to curthread and
4787 4787 * decrement count to indicate that a writer
4788 4788 * is active.
4789 4789 */
4790 4790 while (l->count > 0 || l->owner != NULL) {
4791 4791 l->waiters++;
4792 4792 if (intr) {
4793 4793 klwp_t *lwp = ttolwp(curthread);
4794 4794
4795 4795 if (lwp != NULL)
4796 4796 lwp->lwp_nostop++;
4797 4797 if (!cv_wait_sig(&l->cv, &l->lock)) {
4798 4798 if (lwp != NULL)
4799 4799 lwp->lwp_nostop--;
4800 4800 l->waiters--;
4801 4801 cv_broadcast(&l->cv);
4802 4802 mutex_exit(&l->lock);
4803 4803 return (EINTR);
4804 4804 }
4805 4805 if (lwp != NULL)
4806 4806 lwp->lwp_nostop--;
4807 4807 } else
4808 4808 cv_wait(&l->cv, &l->lock);
4809 4809 l->waiters--;
4810 4810 }
4811 4811 l->owner = curthread;
4812 4812 l->count--;
4813 4813 }
4814 4814
4815 4815 mutex_exit(&l->lock);
4816 4816
4817 4817 return (0);
4818 4818 }
4819 4819
4820 4820 /*
4821 4821 * If the lock is available, obtain it and return non-zero. If there is
4822 4822 * already a conflicting lock, return 0 immediately.
4823 4823 */
4824 4824
4825 4825 int
4826 4826 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4827 4827 {
4828 4828 mutex_enter(&l->lock);
4829 4829
4830 4830 /*
4831 4831 * If this is a nested enter, then allow it. There
4832 4832 	 * must be as many exits as enters, though.
4833 4833 */
4834 4834 if (l->owner == curthread) {
4835 4835 /* lock is held for writing by current thread */
4836 4836 ASSERT(rw == RW_READER || rw == RW_WRITER);
4837 4837 l->count--;
4838 4838 } else if (rw == RW_READER) {
4839 4839 /*
4840 4840 * If there is a writer active or writers waiting, deny the
4841 4841 * lock. Otherwise, bump the count of readers.
4842 4842 */
4843 4843 if (l->count < 0 || l->waiters > 0) {
4844 4844 mutex_exit(&l->lock);
4845 4845 return (0);
4846 4846 }
4847 4847 l->count++;
4848 4848 } else {
4849 4849 ASSERT(rw == RW_WRITER);
4850 4850 /*
4851 4851 * If there are readers active or a writer active, deny the
4852 4852 * lock. Otherwise, set the owner field to curthread and
4853 4853 * decrement count to indicate that a writer is active.
4854 4854 */
4855 4855 if (l->count > 0 || l->owner != NULL) {
4856 4856 mutex_exit(&l->lock);
4857 4857 return (0);
4858 4858 }
4859 4859 l->owner = curthread;
4860 4860 l->count--;
4861 4861 }
4862 4862
4863 4863 mutex_exit(&l->lock);
4864 4864
4865 4865 return (1);
4866 4866 }
4867 4867
4868 4868 void
4869 4869 nfs_rw_exit(nfs_rwlock_t *l)
4870 4870 {
4871 4871
4872 4872 mutex_enter(&l->lock);
4873 4873 /*
4874 4874 * If this is releasing a writer lock, then increment count to
4875 4875 * indicate that there is one less writer active. If this was
4876 4876 * the last of possibly nested writer locks, then clear the owner
4877 4877 * field as well to indicate that there is no writer active
4878 4878 * and wakeup any possible waiting writers or readers.
4879 4879 *
4880 4880 * If releasing a reader lock, then just decrement count to
4881 4881 * indicate that there is one less reader active. If this was
4882 4882 * the last active reader and there are writer(s) waiting,
4883 4883 	 * then wake up the waiting writers.
4884 4884 */
4885 4885 if (l->owner != NULL) {
4886 4886 ASSERT(l->owner == curthread);
4887 4887 l->count++;
4888 4888 if (l->count == 0) {
4889 4889 l->owner = NULL;
4890 4890 cv_broadcast(&l->cv);
4891 4891 }
4892 4892 } else {
4893 4893 ASSERT(l->count > 0);
4894 4894 l->count--;
4895 4895 if (l->count == 0 && l->waiters > 0)
4896 4896 cv_broadcast(&l->cv);
4897 4897 }
4898 4898 mutex_exit(&l->lock);
4899 4899 }
4900 4900
4901 4901 int
4902 4902 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4903 4903 {
4904 4904
4905 4905 if (rw == RW_READER)
4906 4906 return (l->count > 0);
4907 4907 ASSERT(rw == RW_WRITER);
4908 4908 return (l->count < 0);
4909 4909 }
4910 4910
4911 4911 /* ARGSUSED */
4912 4912 void
4913 4913 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4914 4914 {
4915 4915
4916 4916 l->count = 0;
4917 4917 l->waiters = 0;
4918 4918 l->owner = NULL;
4919 4919 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4920 4920 cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4921 4921 }
4922 4922
4923 4923 void
4924 4924 nfs_rw_destroy(nfs_rwlock_t *l)
4925 4925 {
4926 4926
4927 4927 mutex_destroy(&l->lock);
4928 4928 cv_destroy(&l->cv);
4929 4929 }
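
In this lock, count > 0 is the number of active readers, count < 0 is the negated recursion depth of the owning writer, and owner is non-NULL only while a writer holds it. A hedged usage sketch built only from the routines above; the intr flag would normally come from the mount's MI_INT setting:

	nfs_rwlock_t rwl;	/* typically embedded in an rnode */

	int
	example_read_op(int intr)
	{
		int error;

		nfs_rw_init(&rwl, NULL, RW_DEFAULT, NULL);

		/* Interruptible acquire; can return EINTR only when intr != 0. */
		error = nfs_rw_enter_sig(&rwl, RW_READER, intr);
		if (error != 0)
			return (error);

		/* ... read-side work; the lock's count is now > 0 ... */

		nfs_rw_exit(&rwl);
		nfs_rw_destroy(&rwl);
		return (0);
	}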
4930 4930
4931 4931 int
4932 4932 nfs3_rddir_compar(const void *x, const void *y)
4933 4933 {
4934 4934 rddir_cache *a = (rddir_cache *)x;
4935 4935 rddir_cache *b = (rddir_cache *)y;
4936 4936
4937 4937 if (a->nfs3_cookie == b->nfs3_cookie) {
4938 4938 if (a->buflen == b->buflen)
4939 4939 return (0);
4940 4940 if (a->buflen < b->buflen)
4941 4941 return (-1);
4942 4942 return (1);
4943 4943 }
4944 4944
4945 4945 if (a->nfs3_cookie < b->nfs3_cookie)
4946 4946 return (-1);
4947 4947
4948 4948 return (1);
4949 4949 }
4950 4950
4951 4951 int
4952 4952 nfs_rddir_compar(const void *x, const void *y)
4953 4953 {
4954 4954 rddir_cache *a = (rddir_cache *)x;
4955 4955 rddir_cache *b = (rddir_cache *)y;
4956 4956
4957 4957 if (a->nfs_cookie == b->nfs_cookie) {
4958 4958 if (a->buflen == b->buflen)
4959 4959 return (0);
4960 4960 if (a->buflen < b->buflen)
4961 4961 return (-1);
4962 4962 return (1);
4963 4963 }
4964 4964
4965 4965 if (a->nfs_cookie < b->nfs_cookie)
4966 4966 return (-1);
4967 4967
4968 4968 return (1);
4969 4969 }
4970 4970
4971 4971 static char *
4972 4972 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4973 4973 {
4974 4974 servinfo_t *s;
4975 4975 char *srvnames;
4976 4976 char *namep;
4977 4977 size_t length;
4978 4978
4979 4979 /*
4980 4980 * Calculate the length of the string required to hold all
4981 4981 * of the server names plus either a comma or a null
4982 4982 * character following each individual one.
4983 4983 */
4984 4984 length = 0;
4985 4985 for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4986 4986 length += s->sv_hostnamelen;
4987 4987
4988 4988 srvnames = kmem_alloc(length, KM_SLEEP);
4989 4989
4990 4990 namep = srvnames;
4991 4991 for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4992 4992 (void) strcpy(namep, s->sv_hostname);
4993 4993 namep += s->sv_hostnamelen - 1;
4994 4994 *namep++ = ',';
4995 4995 }
4996 4996 *--namep = '\0';
4997 4997
4998 4998 *len = length;
4999 4999
5000 5000 return (srvnames);
5001 5001 }
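
The length arithmetic above works because each sv_hostnamelen includes the name's terminating NUL, which leaves exactly one spare byte per name for its ',' separator; the final separator is then backed over and replaced with the closing '\0'. A worked userland example with made-up hostnames:

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		const char *hosts[] = { "alpha", "beta" };	/* hypothetical replicas */
		size_t length = 0, i;
		char buf[16], *namep = buf;

		/* strlen + 1 matches sv_hostnamelen (name plus its NUL). */
		for (i = 0; i < 2; i++)
			length += strlen(hosts[i]) + 1;		/* 6 + 5 = 11 */

		for (i = 0; i < 2; i++) {
			(void) strcpy(namep, hosts[i]);
			namep += strlen(hosts[i]);		/* sv_hostnamelen - 1 */
			*namep++ = ',';
		}
		*--namep = '\0';

		printf("length %zu -> \"%s\"\n", length, buf);	/* 11 -> "alpha,beta" */
		return (0);
	}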
5002 5002
5003 5003 /*
5004 5004 * These two functions are temporary and designed for the upgrade-workaround
5005 5005 * only. They cannot be used for general zone-crossing NFS client support, and
5006 5006 * will be removed shortly.
5007 5007 *
5008 5008 * When the workaround is enabled, all NFS traffic is forced into the global
5009 5009 * zone. These functions are called when the code needs to refer to the state
5010 5010 * of the underlying network connection. They're not called when the function
5011 5011 * needs to refer to the state of the process that invoked the system call.
5012 5012 * (E.g., when checking whether the zone is shutting down during the mount()
5013 5013 * call.)
5014 5014 */
5015 5015
5016 5016 struct zone *
5017 5017 nfs_zone(void)
5018 5018 {
5019 5019 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5020 5020 }
5021 5021
5022 5022 zoneid_t
5023 5023 nfs_zoneid(void)
5024 5024 {
5025 5025 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5026 5026 }
5027 5027
5028 5028 /*
5029 5029 * nfs_mount_label_policy:
5030 5030 * Determine whether the mount is allowed according to MAC check,
5031 5031 * by comparing (where appropriate) label of the remote server
5032 5032 * against the label of the zone being mounted into.
5033 5033 *
5034 5034 * Returns:
5035 5035 * 0 : access allowed
5036 5036 * -1 : read-only access allowed (i.e., read-down)
5037 5037 * >0 : error code, such as EACCES
5038 5038 */
5039 5039 int
5040 5040 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5041 5041 struct knetconfig *knconf, cred_t *cr)
5042 5042 {
5043 5043 int addr_type;
5044 5044 void *ipaddr;
5045 5045 bslabel_t *server_sl, *mntlabel;
5046 5046 zone_t *mntzone = NULL;
5047 5047 ts_label_t *zlabel;
5048 5048 tsol_tpc_t *tp;
5049 5049 ts_label_t *tsl = NULL;
5050 5050 int retv;
5051 5051
5052 5052 /*
5053 5053 * Get the zone's label. Each zone on a labeled system has a label.
5054 5054 */
5055 5055 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5056 5056 zlabel = mntzone->zone_slabel;
5057 5057 ASSERT(zlabel != NULL);
5058 5058 label_hold(zlabel);
5059 5059
5060 5060 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5061 5061 addr_type = IPV4_VERSION;
5062 5062 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5063 5063 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5064 5064 addr_type = IPV6_VERSION;
5065 5065 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5066 5066 } else {
5067 5067 retv = 0;
5068 5068 goto out;
5069 5069 }
5070 5070
5071 5071 retv = EACCES; /* assume the worst */
5072 5072
5073 5073 /*
5074 5074 * Next, get the assigned label of the remote server.
5075 5075 */
5076 5076 tp = find_tpc(ipaddr, addr_type, B_FALSE);
5077 5077 if (tp == NULL)
5078 5078 goto out; /* error getting host entry */
5079 5079
5080 5080 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5081 5081 goto rel_tpc; /* invalid domain */
5082 5082 if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5083 5083 (tp->tpc_tp.host_type != UNLABELED))
5084 5084 goto rel_tpc; /* invalid hosttype */
5085 5085
5086 5086 if (tp->tpc_tp.host_type == SUN_CIPSO) {
5087 5087 tsl = getflabel_cipso(vfsp);
5088 5088 if (tsl == NULL)
5089 5089 goto rel_tpc; /* error getting server lbl */
5090 5090
5091 5091 server_sl = label2bslabel(tsl);
5092 5092 } else { /* UNLABELED */
5093 5093 server_sl = &tp->tpc_tp.tp_def_label;
5094 5094 }
5095 5095
5096 5096 mntlabel = label2bslabel(zlabel);
5097 5097
5098 5098 /*
5099 5099 * Now compare labels to complete the MAC check. If the labels
5100 5100 * are equal or if the requestor is in the global zone and has
5101 5101 * NET_MAC_AWARE, then allow read-write access. (Except for
5102 5102 * mounts into the global zone itself; restrict these to
5103 5103 * read-only.)
5104 5104 *
5105 5105 	 * If the requestor is in some other zone, but its label
5106 5106 	 * dominates the server's, then allow read-down.
5107 5107 *
5108 5108 * Otherwise, access is denied.
5109 5109 */
5110 5110 if (blequal(mntlabel, server_sl) ||
5111 5111 (crgetzoneid(cr) == GLOBAL_ZONEID &&
5112 5112 getpflags(NET_MAC_AWARE, cr) != 0)) {
5113 5113 if ((mntzone == global_zone) ||
5114 5114 !blequal(mntlabel, server_sl))
5115 5115 retv = -1; /* read-only */
5116 5116 else
5117 5117 retv = 0; /* access OK */
5118 5118 } else if (bldominates(mntlabel, server_sl)) {
5119 5119 retv = -1; /* read-only */
5120 5120 } else {
5121 5121 retv = EACCES;
5122 5122 }
5123 5123
5124 5124 if (tsl != NULL)
5125 5125 label_rele(tsl);
5126 5126
5127 5127 rel_tpc:
5128 5128 TPC_RELE(tp);
5129 5129 out:
5130 5130 if (mntzone)
5131 5131 zone_rele(mntzone);
5132 5132 label_rele(zlabel);
5133 5133 return (retv);
5134 5134 }
5135 5135
5136 5136 boolean_t
5137 5137 nfs_has_ctty(void)
5138 5138 {
5139 5139 boolean_t rv;
5140 5140 mutex_enter(&curproc->p_splock);
5141 5141 rv = (curproc->p_sessp->s_vp != NULL);
5142 5142 mutex_exit(&curproc->p_splock);
5143 5143 return (rv);
5144 5144 }
5145 5145
5146 5146 /*
5147 5147  * Look in the xattr directory to see if it has any generic user attributes
5148 5148 */
5149 5149 int
5150 5150 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5151 5151 {
5152 5152 struct uio uio;
5153 5153 struct iovec iov;
5154 5154 char *dbuf;
5155 5155 struct dirent64 *dp;
5156 5156 size_t dlen = 8 * 1024;
5157 5157 size_t dbuflen;
5158 5158 int eof = 0;
5159 5159 int error;
5160 5160
5161 5161 *valp = 0;
5162 5162 dbuf = kmem_alloc(dlen, KM_SLEEP);
5163 5163 uio.uio_iov = &iov;
5164 5164 uio.uio_iovcnt = 1;
5165 5165 uio.uio_segflg = UIO_SYSSPACE;
5166 5166 uio.uio_fmode = 0;
5167 5167 uio.uio_extflg = UIO_COPY_CACHED;
5168 5168 uio.uio_loffset = 0;
5169 5169 uio.uio_resid = dlen;
5170 5170 iov.iov_base = dbuf;
5171 5171 iov.iov_len = dlen;
5172 5172 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5173 5173 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5174 5174 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5175 5175
5176 5176 dbuflen = dlen - uio.uio_resid;
5177 5177
5178 5178 if (error || dbuflen == 0) {
5179 5179 kmem_free(dbuf, dlen);
5180 5180 return (error);
5181 5181 }
5182 5182
5183 5183 dp = (dirent64_t *)dbuf;
5184 5184
5185 5185 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5186 5186 if (strcmp(dp->d_name, ".") == 0 ||
5187 5187 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5188 5188 VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5189 5189 VIEW_READONLY) == 0) {
5190 5190 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5191 5191 continue;
5192 5192 }
5193 5193
5194 5194 *valp = 1;
5195 5195 break;
5196 5196 }
5197 5197 kmem_free(dbuf, dlen);
5198 5198 return (0);
5199 5199 }