1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
  28  * All Rights Reserved
  29  */
  30 
  31 /*
  32  * Portions of this source code were derived from Berkeley 4.3 BSD
  33  * under license from the Regents of the University of California.
  34  */
  35 
  36 
  37 /*
  38  * Implements a kernel based, client side RPC.
  39  */
  40 
  41 #include <sys/param.h>
  42 #include <sys/types.h>
  43 #include <sys/systm.h>
  44 #include <sys/sysmacros.h>
  45 #include <sys/stream.h>
  46 #include <sys/strsubr.h>
  47 #include <sys/ddi.h>
  48 #include <sys/tiuser.h>
  49 #include <sys/tihdr.h>
  50 #include <sys/t_kuser.h>
  51 #include <sys/errno.h>
  52 #include <sys/kmem.h>
  53 #include <sys/debug.h>
  54 #include <sys/kstat.h>
  55 #include <sys/t_lock.h>
  56 #include <sys/cmn_err.h>
  57 #include <sys/conf.h>
  58 #include <sys/disp.h>
  59 #include <sys/taskq.h>
  60 #include <sys/list.h>
  61 #include <sys/atomic.h>
  62 #include <sys/zone.h>
  63 #include <netinet/in.h>
  64 #include <rpc/types.h>
  65 #include <rpc/xdr.h>
  66 #include <rpc/auth.h>
  67 #include <rpc/clnt.h>
  68 #include <rpc/rpc_msg.h>
  69 
  70 #include <sys/sdt.h>
  71 
  72 static enum clnt_stat clnt_clts_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
  73                     caddr_t, xdrproc_t, caddr_t, struct timeval);
  74 static void     clnt_clts_kabort(CLIENT *);
  75 static void     clnt_clts_kerror(CLIENT *, struct rpc_err *);
  76 static bool_t   clnt_clts_kfreeres(CLIENT *, xdrproc_t, caddr_t);
  77 static bool_t   clnt_clts_kcontrol(CLIENT *, int, char *);
  78 static void     clnt_clts_kdestroy(CLIENT *);
  79 static int      clnt_clts_ksettimers(CLIENT *, struct rpc_timers *,
  80                     struct rpc_timers *, int, void (*)(), caddr_t, uint32_t);
  81 
  82 /*
  83  * Operations vector for CLTS based RPC
  84  */
  85 static struct clnt_ops clts_ops = {
  86         clnt_clts_kcallit,      /* do rpc call */
  87         clnt_clts_kabort,       /* abort call */
  88         clnt_clts_kerror,       /* return error status */
  89         clnt_clts_kfreeres,     /* free results */
  90         clnt_clts_kdestroy,     /* destroy rpc handle */
  91         clnt_clts_kcontrol,     /* the ioctl() of rpc */
  92         clnt_clts_ksettimers    /* set retry timers */
  93 };
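
/*
 * Illustrative use (names are assumptions for the example only): callers
 * never invoke the routines above directly; they go through the CLNT_*
 * macros, which dispatch through cl_ops.  With a handle obtained from
 * clnt_clts_kcreate(), a call such as
 *
 *	enum clnt_stat stat;
 *	struct timeval wait = { 15, 0 };
 *
 *	stat = CLNT_CALL(client, NULLPROC, xdr_void, NULL,
 *	    xdr_void, NULL, wait);
 *
 * expands to (*client->cl_ops->cl_call)(...) and so ends up in
 * clnt_clts_kcallit() below.
 */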
  94 
  95 /*
  96  * Endpoint for CLTS (INET, INET6, loopback, etc.)
  97  */
  98 typedef struct endpnt_type {
  99         struct endpnt_type *e_next;     /* pointer to next endpoint type */
 100         list_t          e_pool;         /* list of available endpoints */
 101         list_t          e_ilist;        /* list of idle endpoints */
 102         struct endpnt   *e_pcurr;       /* pointer to current endpoint */
 103         char            e_protofmly[KNC_STRSIZE];       /* protocol family */
 104         dev_t           e_rdev;         /* device */
 105         kmutex_t        e_plock;        /* pool lock */
 106         kmutex_t        e_ilock;        /* idle list lock */
 107         timeout_id_t    e_itimer;       /* timer to dispatch the taskq */
 108         uint_t          e_cnt;          /* number of endpoints in the pool */
 109         zoneid_t        e_zoneid;       /* zoneid of endpoint type */
 110         kcondvar_t      e_async_cv;     /* cv for asynchronous reap threads */
 111         uint_t          e_async_count;  /* count of asynchronous reap threads */
 112 } endpnt_type_t;
 113 
 114 typedef struct endpnt {
 115         list_node_t     e_node;         /* link to the pool */
 116         list_node_t     e_idle;         /* link to the idle list */
 117         endpnt_type_t   *e_type;        /* back pointer to endpoint type */
 118         TIUSER          *e_tiptr;       /* pointer to transport endpoint */
 119         queue_t         *e_wq;          /* write queue */
 120         uint_t          e_flags;        /* endpoint flags */
 121         uint_t          e_ref;          /* ref count on endpoint */
 122         kcondvar_t      e_cv;           /* condition variable */
 123         kmutex_t        e_lock;         /* protects cv and flags */
 124         time_t          e_itime;        /* time when rele'd */
 125 } endpnt_t;
 126 
 127 #define ENDPNT_ESTABLISHED      0x1     /* endpoint is established */
 128 #define ENDPNT_WAITING          0x2     /* thread waiting for endpoint */
 129 #define ENDPNT_BOUND            0x4     /* endpoint is bound */
 130 #define ENDPNT_STALE            0x8     /* endpoint is dead */
 131 #define ENDPNT_ONIDLE           0x10    /* endpoint is on the idle list */
 132 
 133 static krwlock_t        endpnt_type_lock; /* protects endpnt_type_list */
 134 static endpnt_type_t    *endpnt_type_list = NULL; /* list of CLTS endpoints */
 135 static struct kmem_cache        *endpnt_cache; /* cache of endpnt_t's */
 136 static taskq_t                  *endpnt_taskq; /* endpnt_t reaper thread */
 137 static bool_t                   taskq_created; /* flag for endpnt_taskq */
 138 static kmutex_t                 endpnt_taskq_lock; /* taskq lock */
 139 static zone_key_t               endpnt_destructor_key;
 140 
 141 #define DEFAULT_ENDPOINT_REAP_INTERVAL 60 /* 1 minute */
 142 #define DEFAULT_INTERVAL_SHIFT 30 /* 30 seconds */
 143 
 144 /*
 145  * Endpoint tunables
 146  */
 147 static int      clnt_clts_max_endpoints = -1;
 148 static int      clnt_clts_hash_size = DEFAULT_HASH_SIZE;
 149 static time_t   clnt_clts_endpoint_reap_interval = -1;
 150 static clock_t  clnt_clts_taskq_dispatch_interval;
 151 
 152 /*
 153  * Response completion hash queue
 154  */
 155 static call_table_t *clts_call_ht;
 156 
 157 /*
 158  * Routines for the endpoint manager
 159  */
 160 static struct endpnt_type *endpnt_type_create(struct knetconfig *);
 161 static void endpnt_type_free(struct endpnt_type *);
 162 static int check_endpnt(struct endpnt *, struct endpnt **);
 163 static struct endpnt *endpnt_get(struct knetconfig *, int);
 164 static void endpnt_rele(struct endpnt *);
 165 static void endpnt_reap_settimer(endpnt_type_t *);
 166 static void endpnt_reap(endpnt_type_t *);
 167 static void endpnt_reap_dispatch(void *);
 168 static void endpnt_reclaim(zoneid_t);
 169 
 170 
 171 /*
  172  * Request dispatching function.
 173  */
 174 static int clnt_clts_dispatch_send(queue_t *q, mblk_t *, struct netbuf *addr,
 175                                         calllist_t *, uint_t, cred_t *);
 176 
 177 /*
 178  * The size of the preserialized RPC header information.
 179  */
 180 #define CKU_HDRSIZE     20
 181 /*
 182  * The initial allocation size.  It is small to reduce space requirements.
 183  */
 184 #define CKU_INITSIZE    2048
 185 /*
 186  * The size of additional allocations, if required.  It is larger to
 187  * reduce the number of actual allocations.
 188  */
 189 #define CKU_ALLOCSIZE   8192
 190 
 191 /*
 192  * Private data per rpc handle.  This structure is allocated by
 193  * clnt_clts_kcreate, and freed by clnt_clts_kdestroy.
 194  */
 195 struct cku_private {
 196         CLIENT                   cku_client;    /* client handle */
 197         int                      cku_retrys;    /* request retrys */
 198         calllist_t               cku_call;
 199         struct endpnt           *cku_endpnt;    /* open end point */
 200         struct knetconfig        cku_config;
 201         struct netbuf            cku_addr;      /* remote address */
 202         struct rpc_err           cku_err;       /* error status */
 203         XDR                      cku_outxdr;    /* xdr stream for output */
 204         XDR                      cku_inxdr;     /* xdr stream for input */
 205         char                     cku_rpchdr[CKU_HDRSIZE + 4]; /* rpc header */
 206         struct cred             *cku_cred;      /* credentials */
 207         struct rpc_timers       *cku_timers;    /* for estimating RTT */
 208         struct rpc_timers       *cku_timeall;   /* for estimating RTT */
 209         void                     (*cku_feedback)(int, int, caddr_t);
 210                                                 /* ptr to feedback rtn */
 211         caddr_t                  cku_feedarg;   /* argument for feedback func */
 212         uint32_t                 cku_xid;       /* current XID */
 213         bool_t                   cku_bcast;     /* RPC broadcast hint */
 214         int                     cku_useresvport; /* Use reserved port */
 215         struct rpc_clts_client  *cku_stats;     /* counters for the zone */
 216 };
 217 
 218 static const struct rpc_clts_client {
 219         kstat_named_t   rccalls;
 220         kstat_named_t   rcbadcalls;
 221         kstat_named_t   rcretrans;
 222         kstat_named_t   rcbadxids;
 223         kstat_named_t   rctimeouts;
 224         kstat_named_t   rcnewcreds;
 225         kstat_named_t   rcbadverfs;
 226         kstat_named_t   rctimers;
 227         kstat_named_t   rcnomem;
 228         kstat_named_t   rccantsend;
 229 } clts_rcstat_tmpl = {
 230         { "calls",      KSTAT_DATA_UINT64 },
 231         { "badcalls",   KSTAT_DATA_UINT64 },
 232         { "retrans",    KSTAT_DATA_UINT64 },
 233         { "badxids",    KSTAT_DATA_UINT64 },
 234         { "timeouts",   KSTAT_DATA_UINT64 },
 235         { "newcreds",   KSTAT_DATA_UINT64 },
 236         { "badverfs",   KSTAT_DATA_UINT64 },
 237         { "timers",     KSTAT_DATA_UINT64 },
 238         { "nomem",      KSTAT_DATA_UINT64 },
 239         { "cantsend",   KSTAT_DATA_UINT64 },
 240 };
 241 
 242 static uint_t clts_rcstat_ndata =
 243         sizeof (clts_rcstat_tmpl) / sizeof (kstat_named_t);
 244 
 245 #define RCSTAT_INCR(s, x)                       \
 246         atomic_add_64(&(s)->x.value.ui64, 1)
 247 
 248 #define ptoh(p)         (&((p)->cku_client))
 249 #define htop(h)         ((struct cku_private *)((h)->cl_private))
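
/*
 * ptoh() simply takes the address of the client handle embedded at the
 * start of the private structure, while htop() recovers the private
 * structure from cl_private, which clnt_clts_kcreate() points back at the
 * enclosing cku_private.  For a hypothetical handle h from that routine:
 *
 *	struct cku_private *p = htop(h);
 *
 *	ASSERT(ptoh(p) == h);
 */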
 250 
 251 /*
 252  * Times to retry
 253  */
 254 #define SNDTRIES        4
 255 #define REFRESHES       2       /* authentication refreshes */
 256 
 257 /*
 258  * The following is used to determine the global default behavior for
 259  * CLTS when binding to a local port.
 260  *
 261  * If the value is set to 1 the default will be to select a reserved
 262  * (aka privileged) port, if the value is zero the default will be to
 263  * use non-reserved ports.  Users of kRPC may override this by using
 264  * CLNT_CONTROL() and CLSET_BINDRESVPORT.
 265  */
 266 static int clnt_clts_do_bindresvport = 1;
 267 
 268 #define BINDRESVPORT_RETRIES 5
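
/*
 * A minimal sketch of the per-handle override mentioned above (the handle
 * name and surrounding context are assumed): a kRPC consumer that must
 * avoid a privileged source port would issue
 *
 *	int resv = 0;
 *
 *	(void) CLNT_CONTROL(client, CLSET_BINDRESVPORT, (char *)&resv);
 *
 * before making calls.  Until such a call is made, cku_useresvport remains
 * -1 ("not set") and the global default described above applies.
 */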
 269 
 270 void
 271 clnt_clts_stats_init(zoneid_t zoneid, struct rpc_clts_client **statsp)
 272 {
 273         kstat_t *ksp;
 274         kstat_named_t *knp;
 275 
 276         knp = rpcstat_zone_init_common(zoneid, "unix", "rpc_clts_client",
 277             (const kstat_named_t *)&clts_rcstat_tmpl,
 278             sizeof (clts_rcstat_tmpl));
 279         /*
 280          * Backwards compatibility for old kstat clients
 281          */
 282         ksp = kstat_create_zone("unix", 0, "rpc_client", "rpc",
 283             KSTAT_TYPE_NAMED, clts_rcstat_ndata,
 284             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid);
 285         if (ksp) {
 286                 ksp->ks_data = knp;
 287                 kstat_install(ksp);
 288         }
 289         *statsp = (struct rpc_clts_client *)knp;
 290 }
 291 
 292 void
 293 clnt_clts_stats_fini(zoneid_t zoneid, struct rpc_clts_client **statsp)
 294 {
 295         rpcstat_zone_fini_common(zoneid, "unix", "rpc_clts_client");
 296         kstat_delete_byname_zone("unix", 0, "rpc_client", zoneid);
 297         kmem_free(*statsp, sizeof (clts_rcstat_tmpl));
 298 }
 299 
 300 /*
 301  * Create an rpc handle for a clts rpc connection.
 302  * Allocates space for the handle structure and the private data.
 303  */
 304 /* ARGSUSED */
 305 int
 306 clnt_clts_kcreate(struct knetconfig *config, struct netbuf *addr,
 307         rpcprog_t pgm, rpcvers_t vers, int retrys, struct cred *cred,
 308         CLIENT **cl)
 309 {
 310         CLIENT *h;
 311         struct cku_private *p;
 312         struct rpc_msg call_msg;
 313         int error;
 314         int plen;
 315 
 316         if (cl == NULL)
 317                 return (EINVAL);
 318 
 319         *cl = NULL;
 320         error = 0;
 321 
 322         p = kmem_zalloc(sizeof (*p), KM_SLEEP);
 323 
 324         h = ptoh(p);
 325 
 326         /* handle */
 327         h->cl_ops = &clts_ops;
 328         h->cl_private = (caddr_t)p;
 329         h->cl_auth = authkern_create();
 330 
 331         /* call message, just used to pre-serialize below */
 332         call_msg.rm_xid = 0;
 333         call_msg.rm_direction = CALL;
 334         call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
 335         call_msg.rm_call.cb_prog = pgm;
 336         call_msg.rm_call.cb_vers = vers;
 337 
 338         /* private */
 339         clnt_clts_kinit(h, addr, retrys, cred);
 340 
 341         xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE);
 342 
 343         /* pre-serialize call message header */
 344         if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
 345                 error = EINVAL;         /* XXX */
 346                 goto bad;
 347         }
 348 
 349         p->cku_config.knc_rdev = config->knc_rdev;
 350         p->cku_config.knc_semantics = config->knc_semantics;
 351         plen = strlen(config->knc_protofmly) + 1;
 352         p->cku_config.knc_protofmly = kmem_alloc(plen, KM_SLEEP);
 353         bcopy(config->knc_protofmly, p->cku_config.knc_protofmly, plen);
  354         p->cku_useresvport = -1; /* value has not been set */
 355 
 356         cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL);
 357         mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL);
 358 
 359         *cl = h;
 360         return (0);
 361 
 362 bad:
 363         auth_destroy(h->cl_auth);
 364         kmem_free(p->cku_addr.buf, addr->maxlen);
 365         kmem_free(p, sizeof (struct cku_private));
 366 
 367         return (error);
 368 }
 369 
 370 void
 371 clnt_clts_kinit(CLIENT *h, struct netbuf *addr, int retrys, cred_t *cred)
 372 {
 373         /* LINTED pointer alignment */
 374         struct cku_private *p = htop(h);
 375         struct rpcstat *rsp;
 376 
 377         rsp = zone_getspecific(rpcstat_zone_key, rpc_zone());
 378         ASSERT(rsp != NULL);
 379 
 380         p->cku_retrys = retrys;
 381 
 382         if (p->cku_addr.maxlen < addr->len) {
 383                 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
 384                         kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
 385 
 386                 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
 387                 p->cku_addr.maxlen = addr->maxlen;
 388         }
 389 
 390         p->cku_addr.len = addr->len;
 391         bcopy(addr->buf, p->cku_addr.buf, addr->len);
 392 
 393         p->cku_cred = cred;
 394         p->cku_xid = 0;
 395         p->cku_timers = NULL;
 396         p->cku_timeall = NULL;
 397         p->cku_feedback = NULL;
 398         p->cku_bcast = FALSE;
 399         p->cku_call.call_xid = 0;
 400         p->cku_call.call_hash = 0;
 401         p->cku_call.call_notified = FALSE;
 402         p->cku_call.call_next = NULL;
 403         p->cku_call.call_prev = NULL;
 404         p->cku_call.call_reply = NULL;
 405         p->cku_call.call_wq = NULL;
 406         p->cku_stats = rsp->rpc_clts_client;
 407 }
 408 
 409 /*
 410  * set the timers.  Return current retransmission timeout.
 411  */
 412 static int
 413 clnt_clts_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all,
 414         int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg,
 415         uint32_t xid)
 416 {
 417         /* LINTED pointer alignment */
 418         struct cku_private *p = htop(h);
 419         int value;
 420 
 421         p->cku_feedback = feedback;
 422         p->cku_feedarg = arg;
 423         p->cku_timers = t;
 424         p->cku_timeall = all;
 425         if (xid)
 426                 p->cku_xid = xid;
 427         value = all->rt_rtxcur;
 428         value += t->rt_rtxcur;
 429         if (value < minimum)
 430                 return (minimum);
 431         RCSTAT_INCR(p->cku_stats, rctimers);
 432         return (value);
 433 }
 434 
 435 /*
 436  * Time out back off function. tim is in HZ
 437  */
 438 #define MAXTIMO (20 * hz)
 439 #define backoff(tim)    (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
 440 #define dobackoff(tim)  ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
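
/*
 * Example of the resulting growth, assuming hz = 100 (so MAXTIMO is 2000
 * ticks): an initial timeout of 150 ticks backs off to 300, 600, 1200,
 * then 2000, and stays clamped at MAXTIMO from then on.
 */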
 441 
 442 #define RETRY_POLL_TIMO 30
 443 
 444 /*
 445  * Call remote procedure.
 446  * Most of the work of rpc is done here.  We serialize what is left
 447  * of the header (some was pre-serialized in the handle), serialize
 448  * the arguments, and send it off.  We wait for a reply or a time out.
  449  * Timeout causes an immediate return; other packet problems may cause
 450  * a retry on the receive.  When a good packet is received we deserialize
 451  * it, and check verification.  A bad reply code will cause one retry
 452  * with full (longhand) credentials.
 453  */
 454 enum clnt_stat
 455 clnt_clts_kcallit_addr(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
 456         caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
 457         struct timeval wait, struct netbuf *sin)
 458 {
 459         /* LINTED pointer alignment */
 460         struct cku_private *p = htop(h);
 461         XDR *xdrs;
 462         int stries = p->cku_retrys;
 463         int refreshes = REFRESHES;      /* number of times to refresh cred */
 464         int round_trip;                 /* time the RPC */
 465         int error;
 466         mblk_t *mp;
 467         mblk_t *mpdup;
 468         mblk_t *resp = NULL;
 469         mblk_t *tmp;
 470         calllist_t *call = &p->cku_call;
 471         clock_t ori_timout, timout;
 472         bool_t interrupted;
 473         enum clnt_stat status;
 474         struct rpc_msg reply_msg;
 475         enum clnt_stat re_status;
 476         endpnt_t *endpt;
 477 
 478         RCSTAT_INCR(p->cku_stats, rccalls);
 479 
 480         RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_sec: %ld\n", wait.tv_sec);
 481         RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_usec: %ld\n", wait.tv_usec);
 482 
 483         timout = TIMEVAL_TO_TICK(&wait);
 484         ori_timout = timout;
 485 
 486         if (p->cku_xid == 0) {
 487                 p->cku_xid = alloc_xid();
 488                 if (p->cku_endpnt != NULL)
 489                         endpnt_rele(p->cku_endpnt);
 490                 p->cku_endpnt = NULL;
 491         }
 492         call->call_zoneid = rpc_zoneid();
 493 
 494         mpdup = NULL;
 495 call_again:
 496 
 497         if (mpdup == NULL) {
 498 
 499                 while ((mp = allocb(CKU_INITSIZE, BPRI_LO)) == NULL) {
 500                         if (strwaitbuf(CKU_INITSIZE, BPRI_LO)) {
 501                                 p->cku_err.re_status = RPC_SYSTEMERROR;
 502                                 p->cku_err.re_errno = ENOSR;
 503                                 goto done;
 504                         }
 505                 }
 506 
 507                 xdrs = &p->cku_outxdr;
 508                 xdrmblk_init(xdrs, mp, XDR_ENCODE, CKU_ALLOCSIZE);
 509 
 510                 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
 511                         /*
 512                          * Copy in the preserialized RPC header
 513                          * information.
 514                          */
 515                         bcopy(p->cku_rpchdr, mp->b_rptr, CKU_HDRSIZE);
 516 
 517                         /*
 518                          * transaction id is the 1st thing in the output
 519                          * buffer.
 520                          */
 521                         /* LINTED pointer alignment */
 522                         (*(uint32_t *)(mp->b_rptr)) = p->cku_xid;
 523 
 524                         /* Skip the preserialized stuff. */
 525                         XDR_SETPOS(xdrs, CKU_HDRSIZE);
 526 
 527                         /* Serialize dynamic stuff into the output buffer. */
 528                         if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
 529                             (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
 530                             (!(*xdr_args)(xdrs, argsp))) {
 531                                 freemsg(mp);
 532                                 p->cku_err.re_status = RPC_CANTENCODEARGS;
 533                                 p->cku_err.re_errno = EIO;
 534                                 goto done;
 535                         }
 536                 } else {
 537                         uint32_t *uproc = (uint32_t *)
 538                             &p->cku_rpchdr[CKU_HDRSIZE];
 539                         IXDR_PUT_U_INT32(uproc, procnum);
 540 
 541                         (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
 542                         XDR_SETPOS(xdrs, 0);
 543 
 544                         /* Serialize the procedure number and the arguments. */
 545                         if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
 546                             CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
 547                                 freemsg(mp);
 548                                 p->cku_err.re_status = RPC_CANTENCODEARGS;
 549                                 p->cku_err.re_errno = EIO;
 550                                 goto done;
 551                         }
 552                 }
 553         } else
 554                 mp = mpdup;
 555 
 556         mpdup = dupmsg(mp);
 557         if (mpdup == NULL) {
 558                 freemsg(mp);
 559                 p->cku_err.re_status = RPC_SYSTEMERROR;
 560                 p->cku_err.re_errno = ENOSR;
 561                 goto done;
 562         }
 563 
 564         /*
 565          * Grab an endpnt only if the endpoint is NULL.  We could be retrying
 566          * the request and in this case we want to go through the same
 567          * source port, so that the duplicate request cache may detect a
 568          * retry.
 569          */
 570 
 571         if (p->cku_endpnt == NULL)
 572                 p->cku_endpnt = endpnt_get(&p->cku_config, p->cku_useresvport);
 573 
 574         if (p->cku_endpnt == NULL) {
 575                 freemsg(mp);
 576                 p->cku_err.re_status = RPC_SYSTEMERROR;
 577                 p->cku_err.re_errno = ENOSR;
 578                 goto done;
 579         }
 580 
 581         round_trip = ddi_get_lbolt();
 582 
 583         error = clnt_clts_dispatch_send(p->cku_endpnt->e_wq, mp,
 584             &p->cku_addr, call, p->cku_xid, p->cku_cred);
 585 
 586         if (error != 0) {
 587                 freemsg(mp);
 588                 p->cku_err.re_status = RPC_CANTSEND;
 589                 p->cku_err.re_errno = error;
 590                 RCSTAT_INCR(p->cku_stats, rccantsend);
 591                 goto done1;
 592         }
 593 
 594         RPCLOG(64, "clnt_clts_kcallit_addr: sent call for xid 0x%x\n",
 595             p->cku_xid);
 596 
 597         /*
  598          * There are two reasons for which we go back to tryread.
 599          *
 600          * a) In case the status is RPC_PROCUNAVAIL and we sent out a
 601          *    broadcast we should not get any invalid messages with the
 602          *    RPC_PROCUNAVAIL error back. Some broken RPC implementations
 603          *    send them and for this we have to ignore them ( as we would
 604          *    have never received them ) and look for another message
 605          *    which might contain the valid response because we don't know
 606          *    how many broken implementations are in the network. So we are
 607          *    going to loop until
 608          *    - we received a valid response
 609          *    - we have processed all invalid responses and
 610          *      got a time out when we try to receive again a
 611          *      message.
 612          *
 613          * b) We will jump back to tryread also in case we failed
 614          *    within the AUTH_VALIDATE. In this case we should move
 615          *    on and loop until we received a valid response or we
 616          *    have processed all responses with broken authentication
 617          *    and we got a time out when we try to receive a message.
 618          */
 619 tryread:
 620         mutex_enter(&call->call_lock);
 621         interrupted = FALSE;
 622         if (call->call_notified == FALSE) {
 623                 klwp_t *lwp = ttolwp(curthread);
 624                 clock_t cv_wait_ret = 1; /* init to > 0 */
 625                 clock_t cv_timout = timout;
 626 
 627                 if (lwp != NULL)
 628                         lwp->lwp_nostop++;
 629 
 630                 cv_timout += ddi_get_lbolt();
 631 
 632                 if (h->cl_nosignal)
 633                         while ((cv_wait_ret =
 634                             cv_timedwait(&call->call_cv,
 635                             &call->call_lock, cv_timout)) > 0 &&
 636                             call->call_notified == FALSE)
 637                                 ;
 638                 else
 639                         while ((cv_wait_ret =
 640                             cv_timedwait_sig(&call->call_cv,
 641                             &call->call_lock, cv_timout)) > 0 &&
 642                             call->call_notified == FALSE)
 643                                 ;
 644 
 645                 if (cv_wait_ret == 0)
 646                         interrupted = TRUE;
 647 
 648                 if (lwp != NULL)
 649                         lwp->lwp_nostop--;
 650         }
 651         resp = call->call_reply;
 652         call->call_reply = NULL;
 653         status = call->call_status;
 654         /*
 655          * We have to reset the call_notified here. In case we have
 656          * to do a retry ( e.g. in case we got a RPC_PROCUNAVAIL
 657          * error ) we need to set this to false to ensure that
 658          * we will wait for the next message. When the next message
 659          * is going to arrive the function clnt_clts_dispatch_notify
 660          * will set this to true again.
 661          */
 662         call->call_notified = FALSE;
 663         call->call_status = RPC_TIMEDOUT;
 664         mutex_exit(&call->call_lock);
 665 
 666         if (status == RPC_TIMEDOUT) {
 667                 if (interrupted) {
 668                         /*
 669                          * We got interrupted, bail out
 670                          */
 671                         p->cku_err.re_status = RPC_INTR;
 672                         p->cku_err.re_errno = EINTR;
 673                         goto done1;
 674                 } else {
 675                         RPCLOG(8, "clnt_clts_kcallit_addr: "
 676                             "request w/xid 0x%x timedout "
 677                             "waiting for reply\n", p->cku_xid);
 678 #if 0 /* XXX not yet */
 679                         /*
 680                          * Timeout may be due to a dead gateway. Send
 681                          * an ioctl downstream advising deletion of
 682                          * route when we reach the half-way point to
 683                          * timing out.
 684                          */
 685                         if (stries == p->cku_retrys/2) {
 686                                 t_kadvise(p->cku_endpnt->e_tiptr,
 687                                     (uchar_t *)p->cku_addr.buf,
 688                                     p->cku_addr.len);
 689                         }
 690 #endif /* not yet */
 691                         p->cku_err.re_status = RPC_TIMEDOUT;
 692                         p->cku_err.re_errno = ETIMEDOUT;
 693                         RCSTAT_INCR(p->cku_stats, rctimeouts);
 694                         goto done1;
 695                 }
 696         }
 697 
 698         ASSERT(resp != NULL);
 699 
 700         /*
 701          * Prepare the message for further processing.  We need to remove
 702          * the datagram header and copy the source address if necessary.  No
 703          * need to verify the header since rpcmod took care of that.
 704          */
 705         /*
 706          * Copy the source address if the caller has supplied a netbuf.
 707          */
 708         if (sin != NULL) {
 709                 union T_primitives *pptr;
 710 
 711                 pptr = (union T_primitives *)resp->b_rptr;
 712                 bcopy(resp->b_rptr + pptr->unitdata_ind.SRC_offset, sin->buf,
 713                     pptr->unitdata_ind.SRC_length);
 714                 sin->len = pptr->unitdata_ind.SRC_length;
 715         }
 716 
 717         /*
 718          * Pop off the datagram header.
 719          * It was retained in rpcmodrput().
 720          */
 721         tmp = resp;
 722         resp = resp->b_cont;
 723         tmp->b_cont = NULL;
 724         freeb(tmp);
 725 
 726         round_trip = ddi_get_lbolt() - round_trip;
 727         /*
 728          * Van Jacobson timer algorithm here, only if NOT a retransmission.
 729          */
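        /*
         * Fixed-point note on the updates below: rt_srtt holds roughly
         * 8 times the smoothed round-trip time and rt_deviate roughly
         * 4 times the mean deviation, so the code amounts to
         *
         *      srtt    += measured - srtt/8
         *      deviate += |error|  - deviate/4
         *      rtxcur   = (srtt/4 + deviate) / 2   (about SRTT + 2 * dev)
         *
         * which is the usual Van Jacobson retransmit-timeout estimate.
         */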
 730         if (p->cku_timers != NULL && stries == p->cku_retrys) {
 731                 int rt;
 732 
 733                 rt = round_trip;
 734                 rt -= (p->cku_timers->rt_srtt >> 3);
 735                 p->cku_timers->rt_srtt += rt;
 736                 if (rt < 0)
 737                         rt = - rt;
 738                 rt -= (p->cku_timers->rt_deviate >> 2);
 739                 p->cku_timers->rt_deviate += rt;
 740                 p->cku_timers->rt_rtxcur =
 741                     (clock_t)((p->cku_timers->rt_srtt >> 2) +
 742                     p->cku_timers->rt_deviate) >> 1;
 743 
 744                 rt = round_trip;
 745                 rt -= (p->cku_timeall->rt_srtt >> 3);
 746                 p->cku_timeall->rt_srtt += rt;
 747                 if (rt < 0)
 748                         rt = - rt;
 749                 rt -= (p->cku_timeall->rt_deviate >> 2);
 750                 p->cku_timeall->rt_deviate += rt;
 751                 p->cku_timeall->rt_rtxcur =
 752                     (clock_t)((p->cku_timeall->rt_srtt >> 2) +
 753                     p->cku_timeall->rt_deviate) >> 1;
 754                 if (p->cku_feedback != NULL) {
 755                         (*p->cku_feedback)(FEEDBACK_OK, procnum,
 756                             p->cku_feedarg);
 757                 }
 758         }
 759 
 760         /*
 761          * Process reply
 762          */
 763         xdrs = &(p->cku_inxdr);
 764         xdrmblk_init(xdrs, resp, XDR_DECODE, 0);
 765 
 766         reply_msg.rm_direction = REPLY;
 767         reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
 768         reply_msg.acpted_rply.ar_stat = SUCCESS;
 769         reply_msg.acpted_rply.ar_verf = _null_auth;
 770         /*
 771          *  xdr_results will be done in AUTH_UNWRAP.
 772          */
 773         reply_msg.acpted_rply.ar_results.where = NULL;
 774         reply_msg.acpted_rply.ar_results.proc = xdr_void;
 775 
 776         /*
 777          * Decode and validate the response.
 778          */
 779         if (!xdr_replymsg(xdrs, &reply_msg)) {
 780                 p->cku_err.re_status = RPC_CANTDECODERES;
 781                 p->cku_err.re_errno = EIO;
 782                 (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
 783                 goto done1;
 784         }
 785 
 786         _seterr_reply(&reply_msg, &(p->cku_err));
 787 
 788         re_status = p->cku_err.re_status;
 789         if (re_status == RPC_SUCCESS) {
 790                 /*
 791                  * Reply is good, check auth.
 792                  */
 793                 if (!AUTH_VALIDATE(h->cl_auth,
 794                     &reply_msg.acpted_rply.ar_verf)) {
 795                         p->cku_err.re_status = RPC_AUTHERROR;
 796                         p->cku_err.re_why = AUTH_INVALIDRESP;
 797                         RCSTAT_INCR(p->cku_stats, rcbadverfs);
 798                         (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
 799                         goto tryread;
 800                 }
 801                 if (!AUTH_UNWRAP(h->cl_auth, xdrs, xdr_results, resultsp)) {
 802                         p->cku_err.re_status = RPC_CANTDECODERES;
 803                         p->cku_err.re_errno = EIO;
 804                 }
 805                 (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
 806                 goto done1;
 807         }
 808         /* set errno in case we can't recover */
 809         if (re_status != RPC_VERSMISMATCH &&
 810             re_status != RPC_AUTHERROR && re_status != RPC_PROGVERSMISMATCH)
 811                 p->cku_err.re_errno = EIO;
 812         /*
 813          * Determine whether or not we're doing an RPC
 814          * broadcast. Some server implementations don't
 815          * follow RFC 1050, section 7.4.2 in that they
 816          * don't remain silent when they see a proc
 817          * they don't support. Therefore we keep trying
 818          * to receive on RPC_PROCUNAVAIL, hoping to get
 819          * a valid response from a compliant server.
 820          */
 821         if (re_status == RPC_PROCUNAVAIL && p->cku_bcast) {
 822                 (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
 823                 goto tryread;
 824         }
 825         if (re_status == RPC_AUTHERROR) {
 826 
 827                 (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
 828                 call_table_remove(call);
 829                 if (call->call_reply != NULL) {
 830                         freemsg(call->call_reply);
 831                         call->call_reply = NULL;
 832                 }
 833 
 834                 /*
  835          * Maybe our credentials need to be refreshed
 836                  */
 837                 if (refreshes > 0 &&
 838                     AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) {
 839                         /*
 840                          * The credential is refreshed. Try the request again.
 841                          * Even if stries == 0, we still retry as long as
 842                          * refreshes > 0. This prevents a soft authentication
 843                          * error turning into a hard one at an upper level.
 844                          */
 845                         refreshes--;
 846                         RCSTAT_INCR(p->cku_stats, rcbadcalls);
 847                         RCSTAT_INCR(p->cku_stats, rcnewcreds);
 848 
 849                         freemsg(mpdup);
 850                         mpdup = NULL;
 851                         freemsg(resp);
 852                         resp = NULL;
 853                         goto call_again;
 854                 }
 855                 /*
 856                  * We have used the client handle to do an AUTH_REFRESH
 857                  * and the RPC status may be set to RPC_SUCCESS;
 858                  * Let's make sure to set it to RPC_AUTHERROR.
 859                  */
  860                 p->cku_err.re_status = RPC_AUTHERROR;
 861 
 862                 /*
 863                  * Map recoverable and unrecoverable
 864                  * authentication errors to appropriate errno
 865                  */
 866                 switch (p->cku_err.re_why) {
 867                 case AUTH_TOOWEAK:
 868                         /*
 869                          * Could be an nfsportmon failure, set
 870                          * useresvport and try again.
 871                          */
 872                         if (p->cku_useresvport != 1) {
 873                                 p->cku_useresvport = 1;
 874 
 875                                 freemsg(mpdup);
 876                                 mpdup = NULL;
 877                                 freemsg(resp);
 878                                 resp = NULL;
 879 
 880                                 endpt = p->cku_endpnt;
 881                                 if (endpt->e_tiptr != NULL) {
 882                                         mutex_enter(&endpt->e_lock);
 883                                         endpt->e_flags &= ~ENDPNT_BOUND;
 884                                         (void) t_kclose(endpt->e_tiptr, 1);
 885                                         endpt->e_tiptr = NULL;
 886                                         mutex_exit(&endpt->e_lock);
 887 
 888                                 }
 889 
 890                                 p->cku_xid = alloc_xid();
 891                                 endpnt_rele(p->cku_endpnt);
 892                                 p->cku_endpnt = NULL;
 893                                 goto call_again;
 894                         }
 895                         /* FALLTHRU */
 896                 case AUTH_BADCRED:
 897                 case AUTH_BADVERF:
 898                 case AUTH_INVALIDRESP:
 899                 case AUTH_FAILED:
 900                 case RPCSEC_GSS_NOCRED:
 901                 case RPCSEC_GSS_FAILED:
 902                         p->cku_err.re_errno = EACCES;
 903                         break;
 904                 case AUTH_REJECTEDCRED:
 905                 case AUTH_REJECTEDVERF:
 906                 default:
 907                         p->cku_err.re_errno = EIO;
 908                         break;
 909                 }
 910                 RPCLOG(1, "clnt_clts_kcallit : authentication failed "
 911                     "with RPC_AUTHERROR of type %d\n",
 912                     p->cku_err.re_why);
 913                 goto done;
 914         }
 915 
 916         (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
 917 
 918 done1:
 919         call_table_remove(call);
 920         if (call->call_reply != NULL) {
 921                 freemsg(call->call_reply);
 922                 call->call_reply = NULL;
 923         }
 924         RPCLOG(64, "clnt_clts_kcallit_addr: xid 0x%x taken off dispatch list",
 925             p->cku_xid);
 926 
 927 done:
 928         if (resp != NULL) {
 929                 freemsg(resp);
 930                 resp = NULL;
 931         }
 932 
 933         if ((p->cku_err.re_status != RPC_SUCCESS) &&
 934             (p->cku_err.re_status != RPC_INTR) &&
 935             (p->cku_err.re_status != RPC_UDERROR) &&
 936             !IS_UNRECOVERABLE_RPC(p->cku_err.re_status)) {
 937                 if (p->cku_feedback != NULL && stries == p->cku_retrys) {
 938                         (*p->cku_feedback)(FEEDBACK_REXMIT1, procnum,
 939                             p->cku_feedarg);
 940                 }
 941 
 942                 timout = backoff(timout);
 943                 if (p->cku_timeall != (struct rpc_timers *)0)
 944                         p->cku_timeall->rt_rtxcur = timout;
 945 
 946                 if (p->cku_err.re_status == RPC_SYSTEMERROR ||
 947                     p->cku_err.re_status == RPC_CANTSEND) {
 948                         /*
 949                          * Errors due to lack of resources, wait a bit
 950                          * and try again.
 951                          */
 952                         (void) delay(hz/10);
 953                 }
 954                 if (stries-- > 0) {
 955                         RCSTAT_INCR(p->cku_stats, rcretrans);
 956                         goto call_again;
 957                 }
 958         }
 959 
 960         if (mpdup != NULL)
 961                 freemsg(mpdup);
 962 
 963         if (p->cku_err.re_status != RPC_SUCCESS) {
 964                 RCSTAT_INCR(p->cku_stats, rcbadcalls);
 965         }
 966 
 967         /*
 968          * Allow the endpoint to be held by the client handle in case this
 969          * RPC was not successful.  A retry may occur at a higher level and
 970          * in this case we may want to send the request over the same
 971          * source port.
 972          * Endpoint is also released for one-way RPC: no reply, nor retransmit
 973          * is expected.
 974          */
 975         if ((p->cku_err.re_status == RPC_SUCCESS ||
 976             (p->cku_err.re_status == RPC_TIMEDOUT && ori_timout == 0)) &&
 977             p->cku_endpnt != NULL) {
 978                 endpnt_rele(p->cku_endpnt);
 979                 p->cku_endpnt = NULL;
 980         } else {
 981                 DTRACE_PROBE2(clnt_clts_kcallit_done, int, p->cku_err.re_status,
 982                     struct endpnt *, p->cku_endpnt);
 983         }
 984 
 985         return (p->cku_err.re_status);
 986 }
 987 
 988 static enum clnt_stat
 989 clnt_clts_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
 990         caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
 991         struct timeval wait)
 992 {
 993         return (clnt_clts_kcallit_addr(h, procnum, xdr_args, argsp,
 994             xdr_results, resultsp, wait, NULL));
 995 }
 996 
 997 /*
 998  * Return error info on this handle.
 999  */
1000 static void
1001 clnt_clts_kerror(CLIENT *h, struct rpc_err *err)
1002 {
1003         /* LINTED pointer alignment */
1004         struct cku_private *p = htop(h);
1005 
1006         *err = p->cku_err;
1007 }
1008 
1009 static bool_t
1010 clnt_clts_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
1011 {
1012         /* LINTED pointer alignment */
1013         struct cku_private *p = htop(h);
1014         XDR *xdrs;
1015 
1016         xdrs = &(p->cku_outxdr);
1017         xdrs->x_op = XDR_FREE;
1018         return ((*xdr_res)(xdrs, res_ptr));
1019 }
1020 
1021 /*ARGSUSED*/
1022 static void
1023 clnt_clts_kabort(CLIENT *h)
1024 {
1025 }
1026 
1027 static bool_t
1028 clnt_clts_kcontrol(CLIENT *h, int cmd, char *arg)
1029 {
1030         /* LINTED pointer alignment */
1031         struct cku_private *p = htop(h);
1032 
1033         switch (cmd) {
1034         case CLSET_XID:
1035                 p->cku_xid = *((uint32_t *)arg);
1036                 return (TRUE);
1037 
1038         case CLGET_XID:
1039                 *((uint32_t *)arg) = p->cku_xid;
1040                 return (TRUE);
1041 
1042         case CLSET_BCAST:
1043                 p->cku_bcast = *((uint32_t *)arg);
1044                 return (TRUE);
1045 
1046         case CLGET_BCAST:
1047                 *((uint32_t *)arg) = p->cku_bcast;
1048                 return (TRUE);
1049         case CLSET_BINDRESVPORT:
1050                 if (arg == NULL)
1051                         return (FALSE);
1052 
1053                 if (*(int *)arg != 1 && *(int *)arg != 0)
1054                         return (FALSE);
1055 
1056                 p->cku_useresvport = *(int *)arg;
1057 
1058                 return (TRUE);
1059 
1060         case CLGET_BINDRESVPORT:
1061                 if (arg == NULL)
1062                         return (FALSE);
1063 
1064                 *(int *)arg = p->cku_useresvport;
1065 
1066                 return (TRUE);
1067 
1068         default:
1069                 return (FALSE);
1070         }
1071 }
1072 
1073 /*
1074  * Destroy rpc handle.
1075  * Frees the space used for output buffer, private data, and handle
1076  * structure, and the file pointer/TLI data on last reference.
1077  */
1078 static void
1079 clnt_clts_kdestroy(CLIENT *h)
1080 {
1081         /* LINTED pointer alignment */
1082         struct cku_private *p = htop(h);
1083         calllist_t *call = &p->cku_call;
1084 
1085         int plen;
1086 
1087         RPCLOG(8, "clnt_clts_kdestroy h: %p\n", (void *)h);
1088         RPCLOG(8, "clnt_clts_kdestroy h: xid=0x%x\n", p->cku_xid);
1089 
1090         if (p->cku_endpnt != NULL)
1091                 endpnt_rele(p->cku_endpnt);
1092 
1093         cv_destroy(&call->call_cv);
1094         mutex_destroy(&call->call_lock);
1095 
1096         plen = strlen(p->cku_config.knc_protofmly) + 1;
1097         kmem_free(p->cku_config.knc_protofmly, plen);
1098         kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
1099         kmem_free(p, sizeof (*p));
1100 }
1101 
1102 /*
1103  * The connectionless (CLTS) kRPC endpoint management subsystem.
1104  *
1105  * Because endpoints are potentially shared among threads making RPC calls,
1106  * they are managed in a pool according to type (endpnt_type_t).  Each
1107  * endpnt_type_t points to a list of usable endpoints through the e_pool
1108  * field, which is of type list_t.  list_t is a doubly-linked list.
1109  * The number of endpoints in the pool is stored in the e_cnt field of
1110  * endpnt_type_t and the endpoints are reference counted using the e_ref field
1111  * in the endpnt_t structure.
1112  *
1113  * As an optimization, endpoints that have no references are also linked
1114  * to an idle list via e_ilist which is also of type list_t.  When a thread
1115  * calls endpnt_get() to obtain a transport endpoint, the idle list is first
1116  * consulted and if such an endpoint exists, it is removed from the idle list
1117  * and returned to the caller.
1118  *
1119  * If the idle list is empty, then a check is made to see if more endpoints
1120  * can be created.  If so, we proceed and create a new endpoint which is added
1121  * to the pool and returned to the caller.  If we have reached the limit and
1122  * cannot make a new endpoint then one is returned to the caller via round-
1123  * robin policy.
1124  *
1125  * When an endpoint is placed on the idle list by a thread calling
1126  * endpnt_rele(), it is timestamped and then a reaper taskq is scheduled to
1127  * be dispatched if one hasn't already been.  When the timer fires, the
1128  * taskq traverses the idle list and checks to see which endpoints are
1129  * eligible to be closed.  It determines this by checking if the timestamp
 1130  * when the endpoint was released has exceeded the threshold for how long
1131  * it should stay alive.
1132  *
1133  * endpnt_t structures remain persistent until the memory reclaim callback,
1134  * endpnt_reclaim(), is invoked.
1135  *
1136  * Here is an example of how the data structures would be laid out by the
1137  * subsystem:
1138  *
1139  *       endpnt_type_t
1140  *
1141  *       loopback                                 inet
1142  *       _______________                          ______________
1143  *      | e_next        |----------------------->| e_next       |---->>
1144  *      | e_pool        |<---+                   | e_pool       |<----+
1145  *      | e_ilist       |<---+--+                | e_ilist      |<----+--+
1146  *   +->| e_pcurr       |----+--+--+       +->| e_pcurr      |-----+--+--+
1147  *   |  | ...           |    |  |  |          |  | ...          |     |  |  |
1148  *   |  | e_itimer (90) |    |  |  |          |  | e_itimer (0) |     |  |  |
1149  *   |  | e_cnt (1)     |    |  |  |          |  | e_cnt (3)    |     |  |  |
1150  *   |  +---------------+    |  |  |          |  +--------------+     |  |  |
1151  *   |                       |  |  |          |                       |  |  |
1152  *   |   endpnt_t            |  |  |          |                       |  |  |
1153  *   |   ____________        |  |  |          |   ____________        |  |  |
1154  *   |  | e_node     |<------+  |  |       |  | e_node     |<------+  |  |
1155  *   |  | e_idle     |<---------+  |       |  | e_idle     |       |  |  |
1156  *   +--| e_type     |<------------+       +--| e_type     |       |  |  |
1157  *      | e_tiptr    |                        |  | e_tiptr    |       |  |  |
1158  *      | ...        |                        |  | ...        |       |  |  |
1159  *      | e_lock     |                        |  | e_lock     |       |  |  |
1160  *      | ...        |                        |  | ...        |       |  |  |
1161  *      | e_ref (0)  |                        |  | e_ref (2)  |       |  |  |
1162  *      | e_itime    |                        |  | e_itime    |       |  |  |
1163  *      +------------+                        |  +------------+       |  |  |
1164  *                                            |                       |  |  |
1165  *                                            |                       |  |  |
1166  *                                            |   ____________        |  |  |
1167  *                                            |  | e_node     |<------+  |  |
1168  *                                            |  | e_idle     |<------+--+  |
1169  *                                            +--| e_type     |       |     |
1170  *                                            |  | e_tiptr    |       |     |
1171  *                                            |  | ...        |       |     |
1172  *                                            |  | e_lock     |       |     |
1173  *                                            |  | ...        |       |     |
1174  *                                            |  | e_ref (0)  |       |     |
1175  *                                            |  | e_itime    |       |     |
1176  *                                            |  +------------+       |     |
1177  *                                            |                       |     |
1178  *                                            |                       |     |
1179  *                                            |   ____________        |     |
1180  *                                            |  | e_node     |<------+     |
1181  *                                            |  | e_idle     |             |
1182  *                                            +--| e_type     |<------------+
1183  *                                               | e_tiptr    |
1184  *                                               | ...        |
1185  *                                               | e_lock     |
1186  *                                               | ...        |
1187  *                                               | e_ref (1)  |
1188  *                                               | e_itime    |
1189  *                                               +------------+
1190  *
1191  * Endpoint locking strategy:
1192  *
1193  * The following functions manipulate lists which hold the endpoint and the
1194  * endpoints themselves:
1195  *
1196  * endpnt_get()/check_endpnt()/endpnt_rele()/endpnt_reap()/do_endpnt_reclaim()
1197  *
1198  * Lock description follows:
1199  *
1200  * endpnt_type_lock: Global reader/writer lock which protects accesses to the
1201  *                   endpnt_type_list.
1202  *
1203  * e_plock: Lock defined in the endpnt_type_t.  It is intended to
 1204  *          protect accesses to the pool of endpoints (e_pool) for a given
1205  *          endpnt_type_t.
1206  *
1207  * e_ilock: Lock defined in endpnt_type_t.  It is intended to protect accesses
1208  *          to the idle list (e_ilist) of available endpoints for a given
1209  *          endpnt_type_t.  It also protects access to the e_itimer, e_async_cv,
1210  *          and e_async_count fields in endpnt_type_t.
1211  *
1212  * e_lock: Lock defined in the endpnt structure.  It is intended to protect
1213  *         flags, cv, and ref count.
1214  *
1215  * The order goes as follows so as not to induce deadlock.
1216  *
1217  * endpnt_type_lock -> e_plock -> e_ilock -> e_lock
1218  *
1219  * Interaction with Zones and shutting down:
1220  *
1221  * endpnt_type_ts are uniquely identified by the (e_zoneid, e_rdev, e_protofmly)
1222  * tuple, which means that a zone may not reuse another zone's idle endpoints
1223  * without first doing a t_kclose().
1224  *
1225  * A zone's endpnt_type_ts are destroyed when a zone is shut down; e_async_cv
1226  * and e_async_count are used to keep track of the threads in endpnt_taskq
1227  * trying to reap endpnt_ts in the endpnt_type_t.
1228  */
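
/*
 * A condensed sketch of how clnt_clts_kcallit_addr() uses this subsystem
 * (variable names are illustrative only):
 *
 *	struct endpnt *e;
 *
 *	e = endpnt_get(&p->cku_config, p->cku_useresvport);
 *	if (e == NULL)
 *		return (error);		// no usable endpoint could be obtained
 *	... send the request on e->e_wq and wait for the reply ...
 *	endpnt_rele(e);			// drop the reference when done
 *
 * endpnt_rele() places the endpoint back on the idle list once its e_ref
 * drops to zero, and the reaper taskq later closes endpoints whose idle
 * time has exceeded the reap threshold.
 */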
1229 
1230 /*
1231  * Allocate and initialize an endpnt_type_t
1232  */
1233 static struct endpnt_type *
1234 endpnt_type_create(struct knetconfig *config)
1235 {
1236         struct endpnt_type      *etype;
1237 
1238         /*
1239          * Allocate a new endpoint type to hang a list of
1240          * endpoints off of it.
1241          */
1242         etype = kmem_alloc(sizeof (struct endpnt_type), KM_SLEEP);
1243         etype->e_next = NULL;
1244         etype->e_pcurr = NULL;
1245         etype->e_itimer = 0;
1246         etype->e_cnt = 0;
1247 
1248         (void) strncpy(etype->e_protofmly, config->knc_protofmly, KNC_STRSIZE);
1249         mutex_init(&etype->e_plock, NULL, MUTEX_DEFAULT, NULL);
1250         mutex_init(&etype->e_ilock, NULL, MUTEX_DEFAULT, NULL);
1251         etype->e_rdev = config->knc_rdev;
1252         etype->e_zoneid = rpc_zoneid();
1253         etype->e_async_count = 0;
1254         cv_init(&etype->e_async_cv, NULL, CV_DEFAULT, NULL);
1255 
1256         list_create(&etype->e_pool, sizeof (endpnt_t),
1257             offsetof(endpnt_t, e_node));
1258         list_create(&etype->e_ilist, sizeof (endpnt_t),
1259             offsetof(endpnt_t, e_idle));
1260 
1261         /*
1262          * Check to see if we need to create a taskq for endpoint
1263          * reaping
1264          */
1265         mutex_enter(&endpnt_taskq_lock);
1266         if (taskq_created == FALSE) {
1267                 taskq_created = TRUE;
1268                 mutex_exit(&endpnt_taskq_lock);
1269                 ASSERT(endpnt_taskq == NULL);
1270                 endpnt_taskq = taskq_create("clts_endpnt_taskq", 1,
1271                     minclsyspri, 200, INT_MAX, 0);
1272         } else
1273                 mutex_exit(&endpnt_taskq_lock);
1274 
1275         return (etype);
1276 }
1277 
1278 /*
1279  * Free an endpnt_type_t
1280  */
1281 static void
1282 endpnt_type_free(struct endpnt_type *etype)
1283 {
1284         mutex_destroy(&etype->e_plock);
1285         mutex_destroy(&etype->e_ilock);
1286         list_destroy(&etype->e_pool);
1287         list_destroy(&etype->e_ilist);
1288         kmem_free(etype, sizeof (endpnt_type_t));
1289 }
1290 
1291 /*
1292  * Check the endpoint to ensure that it is suitable for use.
1293  *
1294  * Possible return values:
1295  *
1296  * return (1) - Endpoint is established, but needs to be re-opened.
1297  * return (0) && *newp == NULL - Endpoint is established, but unusable.
1298  * return (0) && *newp != NULL - Endpoint is established and usable.
1299  */
1300 static int
1301 check_endpnt(struct endpnt *endp, struct endpnt **newp)
1302 {
1303         *newp = endp;
1304 
1305         mutex_enter(&endp->e_lock);
1306         ASSERT(endp->e_ref >= 1);
1307 
1308         /*
1309          * The first condition we check for is if the endpoint has been
1310          * allocated, but is unusable either because it has been closed or
1311          * has been marked stale.  Only *one* thread will be allowed to
1312          * execute the then clause.  This is enforced because the first thread
1313          * to check this condition will clear the flags, so that subsequent
1314          * thread(s) checking this endpoint will move on.
1315          */
1316         if ((endp->e_flags & ENDPNT_ESTABLISHED) &&
1317             (!(endp->e_flags & ENDPNT_BOUND) ||
1318             (endp->e_flags & ENDPNT_STALE))) {
1319                 /*
1320                  * Clear the flags here since they will be
1321                  * set again by this thread.  They need to be
1322                  * individually cleared because we want to maintain
1323                  * the state for ENDPNT_ONIDLE.
1324                  */
1325                 endp->e_flags &= ~(ENDPNT_ESTABLISHED |
1326                     ENDPNT_WAITING | ENDPNT_BOUND | ENDPNT_STALE);
1327                 mutex_exit(&endp->e_lock);
1328                 return (1);
1329         }
1330 
1331         /*
1332          * The second condition is meant for any thread that is waiting for
1333          * an endpoint to become established.  It will cv_wait() until
1334          * the condition for the endpoint has been changed to ENDPNT_BOUND or
1335          * ENDPNT_STALE.
1336          */
1337         while (!(endp->e_flags & ENDPNT_BOUND) &&
1338             !(endp->e_flags & ENDPNT_STALE)) {
1339                 endp->e_flags |= ENDPNT_WAITING;
1340                 cv_wait(&endp->e_cv, &endp->e_lock);
1341         }
1342 
1343         ASSERT(endp->e_flags & ENDPNT_ESTABLISHED);
1344 
1345         /*
1346          * The last case we check for is if the endpoint has been marked stale.
1347          * If this is the case then set *newp to NULL and return, so that the
1348          * caller is notified of the error and can take appropriate action.
1349          */
1350         if (endp->e_flags & ENDPNT_STALE) {
1351                 endp->e_ref--;
1352                 *newp = NULL;
1353         }
1354         mutex_exit(&endp->e_lock);
1355         return (0);
1356 }
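
     /*
      * A minimal sketch of how a caller consumes the contract above (the
      * pattern endpnt_get() uses below; error handling elided):
      *
      *	if (check_endpnt(endp, &new) == 0)
      *		return (new);
      *
      * where new == NULL means the endpoint went stale.  A return of 1
      * means this thread won the right to rebuild the transport: re-open
      * the device with t_kopen(), push "rpcmod" and "timod", bind, and
      * finally set ENDPNT_ESTABLISHED|ENDPNT_BOUND and cv_broadcast() any
      * waiters.
      */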
1357 
1358 #ifdef DEBUG
1359 /*
1360  * Provide a fault injection setting to test error conditions.
1361  */
1362 static int endpnt_get_return_null = 0;
1363 #endif
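
     /*
      * For example, on a DEBUG kernel the counter can be armed from user
      * level with mdb(1); each count makes one endpnt_get() call behave
      * as if no usable endpoint could be obtained, e.g.:
      *
      *	# echo 'endpnt_get_return_null/W 3' | mdb -kw
      */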
1364 
1365 /*
1366  * Returns a handle (struct endpnt *) to an open and bound endpoint
1367  * specified by the knetconfig passed in.  Returns NULL if no valid endpoint
1368  * can be obtained.
1369  */
1370 static struct endpnt *
1371 endpnt_get(struct knetconfig *config, int useresvport)
1372 {
1373         struct endpnt_type      *n_etype = NULL;
1374         struct endpnt_type      *np = NULL;
1375         struct endpnt           *new = NULL;
1376         struct endpnt           *endp = NULL;
1377         struct endpnt           *next = NULL;
1378         TIUSER                  *tiptr = NULL;
1379         int                     rtries = BINDRESVPORT_RETRIES;
1380         int                     i = 0;
1381         int                     error;
1382         int                     retval;
1383         zoneid_t                zoneid = rpc_zoneid();
1384         cred_t                  *cr;
1385 
1386         RPCLOG(1, "endpnt_get: protofmly %s, ", config->knc_protofmly);
1387         RPCLOG(1, "rdev %ld\n", config->knc_rdev);
1388 
1389 #ifdef DEBUG
1390         /*
1391          * Inject fault if desired.  Pretend we have a stale endpoint
1392          * and return NULL.
1393          */
1394         if (endpnt_get_return_null > 0) {
1395                 endpnt_get_return_null--;
1396                 return (NULL);
1397         }
1398 #endif
1399         rw_enter(&endpnt_type_lock, RW_READER);
1400 
1401 top:
1402         for (np = endpnt_type_list; np != NULL; np = np->e_next)
1403                 if ((np->e_zoneid == zoneid) &&
1404                     (np->e_rdev == config->knc_rdev) &&
1405                     (strcmp(np->e_protofmly,
1406                     config->knc_protofmly) == 0))
1407                         break;
1408 
1409         if (np == NULL && n_etype != NULL) {
1410                 ASSERT(rw_write_held(&endpnt_type_lock));
1411 
1412                 /*
1413                  * Link the endpoint type onto the list
1414                  */
1415                 n_etype->e_next = endpnt_type_list;
1416                 endpnt_type_list = n_etype;
1417                 np = n_etype;
1418                 n_etype = NULL;
1419         }
1420 
1421         if (np == NULL) {
1422                 /*
1423                  * The logic here is that we were unable to find an
1424                  * endpnt_type_t that matched our criteria, so we allocate a
1425                  * new one.  Because kmem_alloc() needs to be called with
1426                  * KM_SLEEP, we drop our locks so that we don't induce
1427                  * deadlock.  After allocating and initializing the
1428                  * endpnt_type_t, we reacquire the lock and go back to check
1429                  * if this entry needs to be added to the list.  Since we do
1430                  * some operations without any locking, other threads may
1431                  * have been looking for the same endpnt_type_t and gone
1432                  * through this code path.  We check for this case and allow
1433                  * one thread to link its endpnt_type_t to the list and the
1434                  * other threads will simply free theirs.
1435                  */
1436                 rw_exit(&endpnt_type_lock);
1437                 n_etype = endpnt_type_create(config);
1438 
1439                 /*
1440                  * We need to reacquire the lock with RW_WRITER here so that
1441                  * we can safely link the new endpoint type onto the list.
1442                  */
1443                 rw_enter(&endpnt_type_lock, RW_WRITER);
1444                 goto top;
1445         }
1446 
1447         rw_exit(&endpnt_type_lock);
1448         /*
1449          * If n_etype is not NULL, then another thread was able to
1450          * insert an endpnt_type_t of this type onto the list before
1451          * we did.  Go ahead and free ours.
1452          */
1453         if (n_etype != NULL)
1454                 endpnt_type_free(n_etype);
1455 
1456         mutex_enter(&np->e_ilock);
1457         /*
1458          * The algorithm to hand out endpoints is to first
1459          * give out those that are idle if such endpoints
1460          * exist.  Otherwise, create a new one if we haven't
1461          * reached the max threshold.  Finally, we give out
1462          * endpoints in a pseudo LRU fashion (round-robin).
1463          *
1464          * Note:  The idle list is merely a hint of those endpoints
1465          * that should be idle.  There exists a window after the
1466          * endpoint is released and before it is linked back onto the
1467          * idle list where a thread could get a reference to it and
1468          * use it.  This is okay, since the reference counts will
1469          * still be consistent.
1470          */
1471         if ((endp = (endpnt_t *)list_head(&np->e_ilist)) != NULL) {
1472                 timeout_id_t t_id = 0;
1473 
1474                 mutex_enter(&endp->e_lock);
1475                 endp->e_ref++;
1476                 endp->e_itime = 0;
1477                 endp->e_flags &= ~ENDPNT_ONIDLE;
1478                 mutex_exit(&endp->e_lock);
1479 
1480                 /*
1481                  * Pop the endpoint off the idle list and hand it off
1482                  */
1483                 list_remove(&np->e_ilist, endp);
1484 
1485                 if (np->e_itimer != 0) {
1486                         t_id = np->e_itimer;
1487                         np->e_itimer = 0;
1488                 }
1489                 mutex_exit(&np->e_ilock);
1490                 /*
1491                  * Cancel the idle timer if it had been set
1492                  */
1493                 if (t_id != (timeout_id_t)0)
1494                         (void) untimeout(t_id);
1495 
1496                 if (check_endpnt(endp, &new) == 0)
1497                         return (new);
1498         } else if (np->e_cnt >= clnt_clts_max_endpoints) {
1499                 /*
1500                  * There are no idle endpoints and we have already reached
1501                  * the maximum number of endpoints, so hand out an existing
1502                  * one in round-robin fashion.
1503                  */
1504                 mutex_exit(&np->e_ilock);
1505                 mutex_enter(&np->e_plock);
1506                 endp = np->e_pcurr;
1507                 ASSERT(endp != NULL);
1508                 mutex_enter(&endp->e_lock);
1509                 endp->e_ref++;
1510                 mutex_exit(&endp->e_lock);
1511 
1512                 /*
1513                  * Advance the pointer to the next eligible endpoint, if
1514                  * necessary.
1515                  */
1516                 if (np->e_cnt > 1) {
1517                         next = (endpnt_t *)list_next(&np->e_pool, np->e_pcurr);
1518                         if (next == NULL)
1519                                 next = (endpnt_t *)list_head(&np->e_pool);
1520                         np->e_pcurr = next;
1521                 }
1522 
1523                 mutex_exit(&np->e_plock);
1524 
1525                 /*
1526                  * We need to check to see if this endpoint is bound or
1527                  * not.  If the bind is still in progress, just wait until
1528                  * the setup is complete.
1529                  */
1530                 if (check_endpnt(endp, &new) == 0)
1531                         return (new);
1532         } else {
1533                 mutex_exit(&np->e_ilock);
1534                 mutex_enter(&np->e_plock);
1535 
1536                 /*
1537                  * Allocate a new endpoint to use.  If we can't allocate any
1538                  * more memory then use one that is already established if any
1539                  * such endpoints exist.
1540                  */
1541                 new = kmem_cache_alloc(endpnt_cache, KM_NOSLEEP);
1542                 if (new == NULL) {
1543                         RPCLOG0(1, "endpnt_get: kmem_cache_alloc failed\n");
1544                         /*
1545                          * Try to recover by using an existing endpoint.
1546                          */
1547                         if (np->e_cnt <= 0) {
1548                                 mutex_exit(&np->e_plock);
1549                                 return (NULL);
1550                         }
1551                         endp = np->e_pcurr;
1552                         if ((next = list_next(&np->e_pool, np->e_pcurr)) !=
1553                             NULL)
1554                                 np->e_pcurr = next;
1555                         ASSERT(endp != NULL);
1556                         mutex_enter(&endp->e_lock);
1557                         endp->e_ref++;
1558                         mutex_exit(&endp->e_lock);
1559                         mutex_exit(&np->e_plock);
1560 
1561                         if (check_endpnt(endp, &new) == 0)
1562                                 return (new);
1563                 } else {
1564                         /*
1565                          * Partially init an endpoint structure and put
1566                          * it on the list, so that other interested threads
1567                          * know that one is being created
1568                          */
1569                         bzero(new, sizeof (struct endpnt));
1570 
1571                         cv_init(&new->e_cv, NULL, CV_DEFAULT, NULL);
1572                         mutex_init(&new->e_lock, NULL, MUTEX_DEFAULT, NULL);
1573                         new->e_ref = 1;
1574                         new->e_type = np;
1575 
1576                         /*
1577                          * Link the endpoint into the pool.
1578                          */
1579                         list_insert_head(&np->e_pool, new);
1580                         np->e_cnt++;
1581                         if (np->e_pcurr == NULL)
1582                                 np->e_pcurr = new;
1583                         mutex_exit(&np->e_plock);
1584                 }
1585         }
1586 
1587         /*
1588          * The transport should be opened with sufficient privs
1589          */
1590         cr = zone_kcred();
1591         error = t_kopen(NULL, config->knc_rdev, FREAD|FWRITE|FNDELAY, &tiptr,
1592             cr);
1593         if (error) {
1594                 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error);
1595                 goto bad;
1596         }
1597 
1598         new->e_tiptr = tiptr;
1599         rpc_poptimod(tiptr->fp->f_vnode);
1600 
1601         /*
1602          * Allow the kernel to push the module on behalf of the user.
1603          */
1604         error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0,
1605             K_TO_K, cr, &retval);
1606         if (error) {
1607                 RPCLOG(1, "endpnt_get: kstr_push on rpcmod failed %d\n", error);
1608                 goto bad;
1609         }
1610 
1611         error = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K,
1612             cr, &retval);
1613         if (error) {
1614                 RPCLOG(1, "endpnt_get: strioctl failed %d\n", error);
1615                 goto bad;
1616         }
1617 
1618         /*
1619          * Connectionless data flow should bypass the stream head.
1620          */
1621         new->e_wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next;
1622 
1623         error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0,
1624             K_TO_K, cr, &retval);
1625         if (error) {
1626                 RPCLOG(1, "endpnt_get: kstr_push on timod failed %d\n", error);
1627                 goto bad;
1628         }
1629 
1630         /*
1631          * Attempt to bind the endpoint.  If we fail then propagate
1632          * the error back to the calling subsystem, so it can be handled
1633          * appropriately.
1634          * If the caller has not specified reserved port usage then
1635          * take the system default.
1636          */
1637         if (useresvport == -1)
1638                 useresvport = clnt_clts_do_bindresvport;
1639 
1640         if (useresvport &&
1641             (strcmp(config->knc_protofmly, NC_INET) == 0 ||
1642             strcmp(config->knc_protofmly, NC_INET6) == 0)) {
1643 
1644                 while ((error =
1645                     bindresvport(new->e_tiptr, NULL, NULL, FALSE)) != 0) {
1646                         RPCLOG(1,
1647                             "endpnt_get: bindresvport error %d\n", error);
1648                         if (error != EPROTO) {
1649                                 if (rtries-- <= 0)
1650                                         goto bad;
1651 
1652                                 delay(hz << i++);
1653                                 continue;
1654                         }
1655 
1656                         (void) t_kclose(new->e_tiptr, 1);
1657                         /*
1658                          * reopen with all privileges
1659                          */
1660                         error = t_kopen(NULL, config->knc_rdev,
1661                             FREAD|FWRITE|FNDELAY,
1662                             &new->e_tiptr, cr);
1663                         if (error) {
1664                                 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error);
1665                                 new->e_tiptr = NULL;
1666                                 goto bad;
1667                         }
1668                 }
1669         } else if ((error = t_kbind(new->e_tiptr, NULL, NULL)) != 0) {
1670                 RPCLOG(1, "endpnt_get: t_kbind failed: %d\n", error);
1671                 goto bad;
1672         }
1673 
1674         /*
1675          * Set the flags and notify any waiters that we have an established
1676          * endpoint.
1677          */
1678         mutex_enter(&new->e_lock);
1679         new->e_flags |= ENDPNT_ESTABLISHED;
1680         new->e_flags |= ENDPNT_BOUND;
1681         if (new->e_flags & ENDPNT_WAITING) {
1682                 cv_broadcast(&new->e_cv);
1683                 new->e_flags &= ~ENDPNT_WAITING;
1684         }
1685         mutex_exit(&new->e_lock);
1686 
1687         return (new);
1688 
1689 bad:
1690         ASSERT(new != NULL);
1691         /*
1692          * mark this endpoint as stale and notify any threads waiting
1693          * on this endpoint that it will be going away.
1694          */
1695         mutex_enter(&new->e_lock);
1696         if (new->e_ref > 0) {
1697                 new->e_flags |= ENDPNT_ESTABLISHED;
1698                 new->e_flags |= ENDPNT_STALE;
1699                 if (new->e_flags & ENDPNT_WAITING) {
1700                         cv_broadcast(&new->e_cv);
1701                         new->e_flags &= ~ENDPNT_WAITING;
1702                 }
1703         }
1704         new->e_ref--;
1705         new->e_tiptr = NULL;
1706         mutex_exit(&new->e_lock);
1707 
1708         /*
1709          * If there was a transport endpoint opened, then close it.
1710          */
1711         if (tiptr != NULL)
1712                 (void) t_kclose(tiptr, 1);
1713 
1714         return (NULL);
1715 }
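
     /*
      * A sketch of how a consumer (such as the CLTS call path in this
      * file) pairs endpnt_get() with endpnt_rele(); error handling and
      * retry logic are elided:
      *
      *	struct endpnt *endpt;
      *
      *	endpt = endpnt_get(config, useresvport);
      *	if (endpt == NULL || endpt->e_wq == NULL)
      *		... fail the call; no usable endpoint ...
      *	... transmit the request on endpt->e_wq, wait for the reply ...
      *	endpnt_rele(endpt);
      */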
1716 
1717 /*
1718  * Release a reference to the endpoint
1719  */
1720 static void
1721 endpnt_rele(struct endpnt *sp)
1722 {
1723         mutex_enter(&sp->e_lock);
1724         ASSERT(sp->e_ref > 0);
1725         sp->e_ref--;
1726         /*
1727          * If the ref count is zero, then start the idle timer and link
1728          * the endpoint onto the idle list.
1729          */
1730         if (sp->e_ref == 0) {
1731                 sp->e_itime = gethrestime_sec();
1732 
1733                 /*
1734                  * Check to see if the endpoint is already linked to the idle
1735                  * list, so that we don't try to reinsert it.
1736                  */
1737                 if (sp->e_flags & ENDPNT_ONIDLE) {
1738                         mutex_exit(&sp->e_lock);
1739                         mutex_enter(&sp->e_type->e_ilock);
1740                         endpnt_reap_settimer(sp->e_type);
1741                         mutex_exit(&sp->e_type->e_ilock);
1742                         return;
1743                 }
1744 
1745                 sp->e_flags |= ENDPNT_ONIDLE;
1746                 mutex_exit(&sp->e_lock);
1747                 mutex_enter(&sp->e_type->e_ilock);
1748                 list_insert_tail(&sp->e_type->e_ilist, sp);
1749                 endpnt_reap_settimer(sp->e_type);
1750                 mutex_exit(&sp->e_type->e_ilock);
1751         } else
1752                 mutex_exit(&sp->e_lock);
1753 }
1754 
1755 static void
1756 endpnt_reap_settimer(endpnt_type_t *etp)
1757 {
1758         if (etp->e_itimer == (timeout_id_t)0)
1759                 etp->e_itimer = timeout(endpnt_reap_dispatch, (void *)etp,
1760                     clnt_clts_taskq_dispatch_interval);
1761 }
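
     /*
      * Note that clnt_clts_taskq_dispatch_interval is expressed in ticks:
      * clnt_clts_init() sets it to
      * (clnt_clts_endpoint_reap_interval + DEFAULT_INTERVAL_SHIFT) * hz,
      * so by the time the timeout above fires and endpnt_reap() runs, any
      * endpoint that went idle when the timer was armed has already aged
      * past clnt_clts_endpoint_reap_interval seconds and is eligible to
      * be closed.
      */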
1762 
1763 static void
1764 endpnt_reap_dispatch(void *a)
1765 {
1766         endpnt_type_t *etp = a;
1767 
1768         /*
1769          * The idle timer has fired, so dispatch a task onto the taskq
1770          * to reap any endpoints that have been idle too long.
1771          */
1772         if (taskq_dispatch(endpnt_taskq, (task_func_t *)endpnt_reap, etp,
1773             TQ_NOSLEEP) == NULL)
1774                 return;
1775         mutex_enter(&etp->e_ilock);
1776         etp->e_async_count++;
1777         mutex_exit(&etp->e_ilock);
1778 }
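
     /*
      * e_async_count tracks reap tasks that have been dispatched but have
      * not yet finished: it is incremented above once a task is
      * successfully queued, decremented by endpnt_reap() when it
      * completes, and drained (with e_async_cv) by endpnt_destructor()
      * before an endpnt_type_t is freed.
      */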
1779 
1780 /*
1781  * Traverse the idle list and close those endpoints that have reached their
1782  * timeout interval.
1783  */
1784 static void
1785 endpnt_reap(endpnt_type_t *etp)
1786 {
1787         struct endpnt *e;
1788         struct endpnt *next_node = NULL;
1789 
1790         mutex_enter(&etp->e_ilock);
1791         e = list_head(&etp->e_ilist);
1792         while (e != NULL) {
1793                 next_node = list_next(&etp->e_ilist, e);
1794 
1795                 mutex_enter(&e->e_lock);
1796                 if (e->e_ref > 0) {
1797                         mutex_exit(&e->e_lock);
1798                         e = next_node;
1799                         continue;
1800                 }
1801 
1802                 ASSERT(e->e_ref == 0);
1803                 if (e->e_itime > 0 &&
1804                     (e->e_itime + clnt_clts_endpoint_reap_interval) <
1805                     gethrestime_sec()) {
1806                         e->e_flags &= ~ENDPNT_BOUND;
1807                         (void) t_kclose(e->e_tiptr, 1);
1808                         e->e_tiptr = NULL;
1809                         e->e_itime = 0;
1810                 }
1811                 mutex_exit(&e->e_lock);
1812                 e = next_node;
1813         }
1814         etp->e_itimer = 0;
1815         if (--etp->e_async_count == 0)
1816                 cv_signal(&etp->e_async_cv);
1817         mutex_exit(&etp->e_ilock);
1818 }
1819 
1820 static void
1821 endpnt_reclaim(zoneid_t zoneid)
1822 {
1823         struct endpnt_type *np;
1824         struct endpnt *e;
1825         struct endpnt *next_node = NULL;
1826         list_t free_list;
1827         int rcnt = 0;
1828 
1829         list_create(&free_list, sizeof (endpnt_t), offsetof(endpnt_t, e_node));
1830 
1831         RPCLOG0(1, "endpnt_reclaim: reclaim callback started\n");
1832         rw_enter(&endpnt_type_lock, RW_READER);
1833         for (np = endpnt_type_list; np != NULL; np = np->e_next) {
1834                 if (zoneid != ALL_ZONES && zoneid != np->e_zoneid)
1835                         continue;
1836 
1837                 mutex_enter(&np->e_plock);
1838                 RPCLOG(1, "endpnt_reclaim: protofmly %s, ",
1839                     np->e_protofmly);
1840                 RPCLOG(1, "rdev %ld\n", np->e_rdev);
1841                 RPCLOG(1, "endpnt_reclaim: found %d endpoint(s)\n",
1842                     np->e_cnt);
1843 
1844                 if (np->e_cnt == 0) {
1845                         mutex_exit(&np->e_plock);
1846                         continue;
1847                 }
1848 
1849                 /*
1850                  * The nice thing about maintaining an idle list is that if
1851                  * there are any endpoints to reclaim, they are going to be
1852                  * on this list.  Just go through and reap the ones that
1853                  * have ref counts of zero.
1854                  */
1855                 mutex_enter(&np->e_ilock);
1856                 e = list_head(&np->e_ilist);
1857                 while (e != NULL) {
1858                         next_node = list_next(&np->e_ilist, e);
1859                         mutex_enter(&e->e_lock);
1860                         if (e->e_ref > 0) {
1861                                 mutex_exit(&e->e_lock);
1862                                 e = next_node;
1863                                 continue;
1864                         }
1865                         ASSERT(e->e_ref == 0);
1866                         mutex_exit(&e->e_lock);
1867 
1868                         list_remove(&np->e_ilist, e);
1869                         list_remove(&np->e_pool, e);
1870                         list_insert_head(&free_list, e);
1871 
1872                         rcnt++;
1873                         np->e_cnt--;
1874                         e = next_node;
1875                 }
1876                 mutex_exit(&np->e_ilock);
1877                 /*
1878                  * Reset the current pointer to be safe
1879                  */
1880                 if ((e = (struct endpnt *)list_head(&np->e_pool)) != NULL)
1881                         np->e_pcurr = e;
1882                 else {
1883                         ASSERT(np->e_cnt == 0);
1884                         np->e_pcurr = NULL;
1885                 }
1886 
1887                 mutex_exit(&np->e_plock);
1888         }
1889         rw_exit(&endpnt_type_lock);
1890 
1891         while ((e = list_head(&free_list)) != NULL) {
1892                 list_remove(&free_list, e);
1893                 if (e->e_tiptr != NULL)
1894                         (void) t_kclose(e->e_tiptr, 1);
1895 
1896                 cv_destroy(&e->e_cv);
1897                 mutex_destroy(&e->e_lock);
1898                 kmem_cache_free(endpnt_cache, e);
1899         }
1900         list_destroy(&free_list);
1901         RPCLOG(1, "endpnt_reclaim: reclaimed %d endpoint(s)\n", rcnt);
1902 }
1903 
1904 /*
1905  * Endpoint reclaim zone destructor callback routine.
1906  *
1907  * After reclaiming any cached entries, we basically go through the endpnt_type
1908  * list, canceling outstanding timeouts and freeing data structures.
1909  */
1910 /* ARGSUSED */
1911 static void
1912 endpnt_destructor(zoneid_t zoneid, void *a)
1913 {
1914         struct endpnt_type **npp;
1915         struct endpnt_type *np;
1916         struct endpnt_type *free_list = NULL;
1917         timeout_id_t t_id = 0;
1918         extern void clcleanup_zone(zoneid_t);
1919         extern void clcleanup4_zone(zoneid_t);
1920 
1921         /* Make sure NFS client handles are released. */
1922         clcleanup_zone(zoneid);
1923         clcleanup4_zone(zoneid);
1924 
1925         endpnt_reclaim(zoneid);
1926         /*
1927          * We don't need to be holding on to any locks across the call to
1928          * endpnt_reclaim() and the code below; we know that no-one can
1929          * be holding open connections for this zone (all processes and kernel
1930          * threads are gone), so nothing could be adding anything to the list.
1931          */
1932         rw_enter(&endpnt_type_lock, RW_WRITER);
1933         npp = &endpnt_type_list;
1934         while ((np = *npp) != NULL) {
1935                 if (np->e_zoneid != zoneid) {
1936                         npp = &np->e_next;
1937                         continue;
1938                 }
1939                 mutex_enter(&np->e_plock);
1940                 mutex_enter(&np->e_ilock);
1941                 if (np->e_itimer != 0) {
1942                         t_id = np->e_itimer;
1943                         np->e_itimer = 0;
1944                 }
1945                 ASSERT(np->e_cnt == 0);
1946                 ASSERT(list_head(&np->e_pool) == NULL);
1947                 ASSERT(list_head(&np->e_ilist) == NULL);
1948 
1949                 mutex_exit(&np->e_ilock);
1950                 mutex_exit(&np->e_plock);
1951 
1952                 /*
1953                  * untimeout() any outstanding timers that have not yet fired.
1954                  */
1955                 if (t_id != (timeout_id_t)0)
1956                         (void) untimeout(t_id);
1957                 *npp = np->e_next;
1958                 np->e_next = free_list;
1959                 free_list = np;
1960         }
1961         rw_exit(&endpnt_type_lock);
1962 
1963         while (free_list != NULL) {
1964                 np = free_list;
1965                 free_list = free_list->e_next;
1966                 /*
1967                  * Wait for threads in endpnt_taskq trying to reap endpnt_ts in
1968                  * the endpnt_type_t.
1969                  */
1970                 mutex_enter(&np->e_ilock);
1971                 while (np->e_async_count > 0)
1972                         cv_wait(&np->e_async_cv, &np->e_ilock);
1973                 cv_destroy(&np->e_async_cv);
1974                 mutex_destroy(&np->e_plock);
1975                 mutex_destroy(&np->e_ilock);
1976                 list_destroy(&np->e_pool);
1977                 list_destroy(&np->e_ilist);
1978                 kmem_free(np, sizeof (endpnt_type_t));
1979         }
1980 }
1981 
1982 /*
1983  * Endpoint reclaim kmem callback routine.
1984  */
1985 /* ARGSUSED */
1986 static void
1987 endpnt_repossess(void *a)
1988 {
1989         /*
1990          * Reclaim idle endpnt_ts from all zones.
1991          */
1992         if (endpnt_taskq != NULL)
1993                 (void) taskq_dispatch(endpnt_taskq,
1994                     (task_func_t *)endpnt_reclaim, (void *)ALL_ZONES,
1995                     TQ_NOSLEEP);
1996 }
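
     /*
      * endpnt_repossess() is registered as the kmem reclaim callback for
      * endpnt_cache in clnt_clts_init() below, so it is invoked when the
      * system is under memory pressure; the actual work of closing and
      * freeing idle endpoints is pushed onto endpnt_taskq rather than
      * being done in the reclaim context itself.
      */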
1997 
1998 /*
1999  * RPC request dispatch routine.  Constructs a datagram header, links the
2000  * RPC request behind it, and passes the message downstream.
2001  */
2002 static int
2003 clnt_clts_dispatch_send(queue_t *q, mblk_t *mp, struct netbuf *addr,
2004     calllist_t *cp, uint_t xid, cred_t *cr)
2005 {
2006         mblk_t *bp;
2007         int msgsz;
2008         struct T_unitdata_req *udreq;
2009 
2010         /*
2011          * Set up the call record.
2012          */
2013         cp->call_wq = q;
2014         cp->call_xid = xid;
2015         cp->call_status = RPC_TIMEDOUT;
2016         cp->call_notified = FALSE;
2017         RPCLOG(64,
2018             "clnt_clts_dispatch_send: putting xid 0x%x on "
2019             "dispatch list\n", xid);
2020         cp->call_hash = call_hash(xid, clnt_clts_hash_size);
2021         cp->call_bucket = &clts_call_ht[cp->call_hash];
2022         call_table_enter(cp);
2023 
2024         /*
2025          * Construct the datagram
2026          */
2027         msgsz = (int)TUNITDATAREQSZ;
2028         /*
2029          * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will
2030          * appear as -1.
2031          */
2032         while (!(bp = allocb_cred(msgsz + addr->len, cr, NOPID))) {
2033                 if (strwaitbuf(msgsz + addr->len, BPRI_LO))
2034                         return (ENOSR);
2035         }
2036 
2037         udreq = (struct T_unitdata_req *)bp->b_wptr;
2038         udreq->PRIM_type = T_UNITDATA_REQ;
2039         udreq->DEST_length = addr->len;
2040 
2041         if (addr->len) {
2042                 bcopy(addr->buf, bp->b_wptr + msgsz, addr->len);
2043                 udreq->DEST_offset = (t_scalar_t)msgsz;
2044                 msgsz += addr->len;
2045         } else
2046                 udreq->DEST_offset = 0;
2047         udreq->OPT_length = 0;
2048         udreq->OPT_offset = 0;
2049 
2050         bp->b_datap->db_type = M_PROTO;
2051         bp->b_wptr += msgsz;
2052 
2053         /*
2054          * Link the datagram header with the actual data
2055          */
2056         linkb(bp, mp);
2057 
2058         /*
2059          * Send downstream.
2060          */
2061         if (canput(cp->call_wq)) {
2062                 put(cp->call_wq, bp);
2063                 return (0);
2064         }
2065 
2066         return (EIO);
2067 }
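
     /*
      * The message put() on the write queue above has the following shape
      * ("addr" is the destination transport address and "mp" holds the
      * already-marshalled RPC call):
      *
      *	bp (M_PROTO):	[ struct T_unitdata_req | destination address ]
      *	bp->b_cont:	mp, the RPC call header and arguments
      */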
2068 
2069 /*
2070  * RPC response delivery routine.  Deliver the response to the waiting
2071  * thread by matching the xid.
2072  */
2073 void
2074 clnt_clts_dispatch_notify(mblk_t *mp, int resp_off, zoneid_t zoneid)
2075 {
2076         calllist_t *e = NULL;
2077         call_table_t *chtp;
2078         uint32_t xid;
2079         uint_t hash;
2080         unsigned char *hdr_offset;
2081         mblk_t *resp;
2082 
2083         /*
2084          * If the RPC response is not contained in the same mblk as the
2085          * datagram header, then move to the next mblk.
2086          */
2087         hdr_offset = mp->b_rptr;
2088         resp = mp;
2089         if ((mp->b_wptr - (mp->b_rptr + resp_off)) == 0)
2090                 resp = mp->b_cont;
2091         else
2092                 resp->b_rptr += resp_off;
2093 
2094         ASSERT(resp != NULL);
2095 
2096         if ((IS_P2ALIGNED(resp->b_rptr, sizeof (uint32_t))) &&
2097             (resp->b_wptr - resp->b_rptr) >= sizeof (xid))
2098                 xid = *((uint32_t *)resp->b_rptr);
2099         else {
2100                 int i = 0;
2101                 unsigned char *p = (unsigned char *)&xid;
2102                 unsigned char *rptr;
2103                 mblk_t *tmp = resp;
2104 
2105                 /*
2106                  * Copy the xid, byte by byte, into the local variable.
2107                  */
2108                 while (tmp) {
2109                         rptr = tmp->b_rptr;
2110                         while (rptr < tmp->b_wptr) {
2111                                 *p++ = *rptr++;
2112                                 if (++i >= sizeof (xid))
2113                                         goto done_xid_copy;
2114                         }
2115                         tmp = tmp->b_cont;
2116                 }
2117 
2118                 /*
2119                  * If we got here, we ran out of mblk space before the
2120                  * xid could be copied.
2121                  */
2122                 ASSERT(tmp == NULL && i < sizeof (xid));
2123 
2124                 RPCLOG0(1,
2125                     "clnt_dispatch_notify(clts): message less than "
2126                     "size of xid\n");
2127 
2128                 freemsg(mp);
2129                 return;
2130         }
2131 
2132 done_xid_copy:
2133 
2134         /*
2135          * Reset the read pointer back to the beginning of the protocol
2136          * header if we moved it.
2137          */
2138         if (mp->b_rptr != hdr_offset)
2139                 mp->b_rptr = hdr_offset;
2140 
2141         hash = call_hash(xid, clnt_clts_hash_size);
2142         chtp = &clts_call_ht[hash];
2143         /* call_table_find returns with the hash bucket locked */
2144         call_table_find(chtp, xid, e);
2145 
2146         if (e != NULL) {
2147                 mutex_enter(&e->call_lock);
2148 
2149                 /*
2150                  * verify that the reply is coming in on
2151                  * the same zone that it was sent from.
2152                  */
2153                 if (e->call_zoneid != zoneid) {
2154                         mutex_exit(&e->call_lock);
2155                         mutex_exit(&chtp->ct_lock);
2156                         RPCLOG0(8, "clnt_dispatch_notify (clts): incorrect "
2157                             "zoneid\n");
2158                         freemsg(mp);
2159                         return;
2160                 }
2161 
2162                 /*
2163                  * found thread waiting for this reply.
2164                  */
2165                 if (e->call_reply) {
2166                         RPCLOG(8,
2167                             "clnt_dispatch_notify (clts): discarding old "
2168                             "reply for xid 0x%x\n",
2169                             xid);
2170                         freemsg(e->call_reply);
2171                 }
2172                 e->call_notified = TRUE;
2173                 e->call_reply = mp;
2174                 e->call_status = RPC_SUCCESS;
2175                 cv_signal(&e->call_cv);
2176                 mutex_exit(&e->call_lock);
2177                 mutex_exit(&chtp->ct_lock);
2178         } else {
2179                 zone_t *zone;
2180                 struct rpcstat *rpcstat;
2181 
2182                 mutex_exit(&chtp->ct_lock);
2183                 RPCLOG(8, "clnt_dispatch_notify (clts): no caller for reply "
2184                     "0x%x\n", xid);
2185                 freemsg(mp);
2186                 /*
2187                  * This is unfortunate, but we need to lookup the zone so we
2188                  * can increment its "rcbadxids" counter.
2189                  */
2190                 zone = zone_find_by_id(zoneid);
2191                 if (zone == NULL) {
2192                         /*
2193                          * The zone went away...
2194                          */
2195                         return;
2196                 }
2197                 rpcstat = zone_getspecific(rpcstat_zone_key, zone);
2198                 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
2199                         /*
2200                          * Not interested
2201                          */
2202                         zone_rele(zone);
2203                         return;
2204                 }
2205                 RCSTAT_INCR(rpcstat->rpc_clts_client, rcbadxids);
2206                 zone_rele(zone);
2207         }
2208 }
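
     /*
      * Taken together, clnt_clts_dispatch_send() and
      * clnt_clts_dispatch_notify() implement the request/response
      * rendezvous: the sender hashes its xid into clts_call_ht and enters
      * a calllist_t before transmitting, and the receive path looks the
      * xid up in the same table, attaches the reply and signals the
      * waiter.  A rough sketch of the waiting side (timeout and signal
      * handling elided):
      *
      *	mutex_enter(&cp->call_lock);
      *	while (!cp->call_notified)
      *		cv_wait(&cp->call_cv, &cp->call_lock);
      *	reply = cp->call_reply;
      *	mutex_exit(&cp->call_lock);
      *
      * after which the caller removes the calllist_t from its hash bucket.
      */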
2209 
2210 /*
2211  * Init routine.  Called when rpcmod is loaded.
2212  */
2213 void
2214 clnt_clts_init(void)
2215 {
2216         endpnt_cache = kmem_cache_create("clnt_clts_endpnt_cache",
2217             sizeof (struct endpnt), 0, NULL, NULL, endpnt_repossess, NULL,
2218             NULL, 0);
2219 
2220         rw_init(&endpnt_type_lock, NULL, RW_DEFAULT, NULL);
2221 
2222         /*
2223          * Perform simple bounds checking to make sure that the settings
2224          * are reasonable.
2225          */
2226         if (clnt_clts_max_endpoints <= 0) {
2227                 if (clnt_clts_do_bindresvport)
2228                         clnt_clts_max_endpoints = RESERVED_PORTSPACE;
2229                 else
2230                         clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;
2231         }
2232 
2233         if (clnt_clts_do_bindresvport &&
2234             clnt_clts_max_endpoints > RESERVED_PORTSPACE)
2235                 clnt_clts_max_endpoints = RESERVED_PORTSPACE;
2236         else if (clnt_clts_max_endpoints > NONRESERVED_PORTSPACE)
2237                 clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;
2238 
2239         if (clnt_clts_hash_size < DEFAULT_MIN_HASH_SIZE)
2240                 clnt_clts_hash_size = DEFAULT_MIN_HASH_SIZE;
2241 
2242         /*
2243          * Defer creating the taskq until rpcmod gets pushed.  If we are
2244          * in diskless boot mode, rpcmod will get loaded early even before
2245          * thread_create() is available.
2246          */
2247         endpnt_taskq = NULL;
2248         taskq_created = FALSE;
2249         mutex_init(&endpnt_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
2250 
2251         if (clnt_clts_endpoint_reap_interval < DEFAULT_ENDPOINT_REAP_INTERVAL)
2252                 clnt_clts_endpoint_reap_interval =
2253                     DEFAULT_ENDPOINT_REAP_INTERVAL;
2254 
2255         /*
2256          * Dispatch the taskq at an interval which is offset from the
2257          * interval that the endpoints should be reaped.
2258          */
2259         clnt_clts_taskq_dispatch_interval =
2260             (clnt_clts_endpoint_reap_interval + DEFAULT_INTERVAL_SHIFT) * hz;
2261 
2262         /*
2263          * Initialize the completion queue
2264          */
2265         clts_call_ht = call_table_init(clnt_clts_hash_size);
2266         /*
2267          * Initialize the zone destructor callback.
2268          */
2269         zone_key_create(&endpnt_destructor_key, NULL, NULL, endpnt_destructor);
2270 }
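
     /*
      * The tunables consumed above (clnt_clts_max_endpoints,
      * clnt_clts_hash_size, clnt_clts_endpoint_reap_interval and
      * clnt_clts_do_bindresvport) can be overridden before this routine
      * runs, for example from /etc/system (assuming the usual rpcmod
      * module name):
      *
      *	set rpcmod:clnt_clts_do_bindresvport = 0
      *	set rpcmod:clnt_clts_max_endpoints = 64
      *
      * Values outside the ranges checked above are clamped back into range.
      */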
2271 
2272 void
2273 clnt_clts_fini(void)
2274 {
2275         (void) zone_key_delete(endpnt_destructor_key);
2276 }