Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/rpc/svc_rdma.c
+++ new/usr/src/uts/common/rpc/svc_rdma.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
25 25 /* All Rights Reserved */
26 26 /*
27 27 * Portions of this source code were derived from Berkeley
28 28 * 4.3 BSD under license from the Regents of the University of
29 29 * California.
30 30 */
31 31
32 32 /*
33 33 * Server side of RPC over RDMA in the kernel.
34 34 */
35 35
36 36 #include <sys/param.h>
37 37 #include <sys/types.h>
38 38 #include <sys/user.h>
39 39 #include <sys/sysmacros.h>
40 40 #include <sys/proc.h>
41 41 #include <sys/file.h>
42 42 #include <sys/errno.h>
43 43 #include <sys/kmem.h>
44 44 #include <sys/debug.h>
45 45 #include <sys/systm.h>
46 46 #include <sys/cmn_err.h>
47 47 #include <sys/kstat.h>
48 48 #include <sys/vtrace.h>
49 49 #include <sys/debug.h>
50 50
51 51 #include <rpc/types.h>
52 52 #include <rpc/xdr.h>
53 53 #include <rpc/auth.h>
54 54 #include <rpc/clnt.h>
55 55 #include <rpc/rpc_msg.h>
56 56 #include <rpc/svc.h>
57 57 #include <rpc/rpc_rdma.h>
58 58 #include <sys/ddi.h>
59 59 #include <sys/sunddi.h>
60 60
61 61 #include <inet/common.h>
62 62 #include <inet/ip.h>
63 63 #include <inet/ip6.h>
64 64
65 65 #include <nfs/nfs.h>
66 66 #include <sys/sdt.h>
67 67
68 68 #define SVC_RDMA_SUCCESS 0
69 69 #define SVC_RDMA_FAIL -1
70 70
71 71 #define SVC_CREDIT_FACTOR (0.5)
72 72
73 73 #define MSG_IS_RPCSEC_GSS(msg) \
74 74 ((msg)->rm_reply.rp_acpt.ar_verf.oa_flavor == RPCSEC_GSS)
75 75
76 76
77 77 uint32_t rdma_bufs_granted = RDMA_BUFS_GRANT;
78 78
79 79 /*
80 80 * RDMA transport specific data associated with SVCMASTERXPRT
81 81 */
struct rdma_data {
	SVCMASTERXPRT	*rd_xprt;	/* back ptr to owning SVCMASTERXPRT */
	struct rdma_svc_data rd_data;	/* per-transport rdma svc data (queue, svcid, state) */
	rdma_mod_t	*r_mod;		/* RDMA plugin module containing the ops vector */
};
87 87
88 88 /*
89 89 * Plugin connection specific data stashed away in clone SVCXPRT
90 90 */
struct clone_rdma_data {
	bool_t cloned;		/* TRUE once xprt cloned for thread processing */
	CONN *conn;		/* RDMA connection this request arrived on */
	rdma_buf_t rpcbuf;	/* RPC req/resp buffer (freed on clone destroy) */
	struct clist *cl_reply;	/* reply chunk buffer info from the client */
	struct clist *cl_wlist;	/* write list clist from the client */
};
98 98
99 99
100 100 #define MAXADDRLEN 128 /* max length for address mask */
101 101
102 102 /*
103 103 * Routines exported through ops vector.
104 104 */
105 105 static bool_t svc_rdma_krecv(SVCXPRT *, mblk_t *, struct rpc_msg *);
106 106 static bool_t svc_rdma_ksend(SVCXPRT *, struct rpc_msg *);
107 107 static bool_t svc_rdma_kgetargs(SVCXPRT *, xdrproc_t, caddr_t);
108 108 static bool_t svc_rdma_kfreeargs(SVCXPRT *, xdrproc_t, caddr_t);
109 109 void svc_rdma_kdestroy(SVCMASTERXPRT *);
110 110 static int svc_rdma_kdup(struct svc_req *, caddr_t, int,
111 111 struct dupreq **, bool_t *);
112 112 static void svc_rdma_kdupdone(struct dupreq *, caddr_t,
113 113 void (*)(), int, int);
114 114 static int32_t *svc_rdma_kgetres(SVCXPRT *, int);
115 115 static void svc_rdma_kfreeres(SVCXPRT *);
116 116 static void svc_rdma_kclone_destroy(SVCXPRT *);
117 117 static void svc_rdma_kstart(SVCMASTERXPRT *);
118 118 void svc_rdma_kstop(SVCMASTERXPRT *);
119 119 static void svc_rdma_kclone_xprt(SVCXPRT *, SVCXPRT *);
120 120 static void svc_rdma_ktattrs(SVCXPRT *, int, void **);
121 121
122 122 static int svc_process_long_reply(SVCXPRT *, xdrproc_t,
123 123 caddr_t, struct rpc_msg *, bool_t, int *,
124 124 int *, int *, unsigned int *);
125 125
126 126 static int svc_compose_rpcmsg(SVCXPRT *, CONN *, xdrproc_t,
127 127 caddr_t, rdma_buf_t *, XDR **, struct rpc_msg *,
128 128 bool_t, uint_t *);
129 129 static bool_t rpcmsg_length(xdrproc_t,
130 130 caddr_t,
131 131 struct rpc_msg *, bool_t, int);
132 132
133 133 /*
134 134 * Server transport operations vector.
135 135 */
/* Ops vector exported to the generic kRPC server framework. */
struct svc_ops rdma_svc_ops = {
	svc_rdma_krecv,		/* Get requests */
	svc_rdma_kgetargs,	/* Deserialize arguments */
	svc_rdma_ksend,		/* Send reply */
	svc_rdma_kfreeargs,	/* Free argument data space */
	svc_rdma_kdestroy,	/* Destroy transport handle */
	svc_rdma_kdup,		/* Check entry in dup req cache */
	svc_rdma_kdupdone,	/* Mark entry in dup req cache as done */
	svc_rdma_kgetres,	/* Get pointer to response buffer */
	svc_rdma_kfreeres,	/* Destroy pre-serialized response header */
	svc_rdma_kclone_destroy, /* Destroy a clone xprt */
	svc_rdma_kstart,	/* Tell `ready-to-receive' to rpcmod */
	svc_rdma_kclone_xprt,	/* Transport specific clone xprt */
	svc_rdma_ktattrs	/* Get Transport Attributes */
};
151 151
152 152 /*
153 153 * Server statistics
154 154 * NOTE: This structure type is duplicated in the NFS fast path.
155 155 */
/*
 * Server statistics, exported as kstats (see rdmarsstat_ptr below).
 * NOTE: This structure type is duplicated in the NFS fast path; keep
 * the two definitions in sync.
 */
struct {
	kstat_named_t	rscalls;		/* total calls received */
	kstat_named_t	rsbadcalls;		/* calls that failed in krecv */
	kstat_named_t	rsnullrecv;
	kstat_named_t	rsbadlen;
	kstat_named_t	rsxdrcall;		/* xdr_callmsg decode failures */
	kstat_named_t	rsdupchecks;
	kstat_named_t	rsdupreqs;
	kstat_named_t	rslongrpcs;
	kstat_named_t	rstotalreplies;		/* all replies attempted */
	kstat_named_t	rstotallongreplies;	/* replies via reply chunk list */
	kstat_named_t	rstotalinlinereplies;	/* replies sent inline */
} rdmarsstat = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "nullrecv",	KSTAT_DATA_UINT64 },
	{ "badlen",	KSTAT_DATA_UINT64 },
	{ "xdrcall",	KSTAT_DATA_UINT64 },
	{ "dupchecks",	KSTAT_DATA_UINT64 },
	{ "dupreqs",	KSTAT_DATA_UINT64 },
	{ "longrpcs",	KSTAT_DATA_UINT64 },
	{ "totalreplies",	KSTAT_DATA_UINT64 },
	{ "totallongreplies",	KSTAT_DATA_UINT64 },
	{ "totalinlinereplies",	KSTAT_DATA_UINT64 },
};
181 181
182 182 kstat_named_t *rdmarsstat_ptr = (kstat_named_t *)&rdmarsstat;
183 183 uint_t rdmarsstat_ndata = sizeof (rdmarsstat) / sizeof (kstat_named_t);
184 184
185 -#define RSSTAT_INCR(x) atomic_add_64(&rdmarsstat.x.value.ui64, 1)
185 +#define RSSTAT_INCR(x) atomic_inc_64(&rdmarsstat.x.value.ui64)
186 186 /*
187 187 * Create a transport record.
188 188 * The transport record, output buffer, and private data structure
189 189 * are allocated. The output buffer is serialized into using xdrmem.
190 190 * There is one transport record per user process which implements a
191 191 * set of services.
192 192 */
193 193 /* ARGSUSED */
194 194 int
195 195 svc_rdma_kcreate(char *netid, SVC_CALLOUT_TABLE *sct, int id,
196 196 rdma_xprt_group_t *started_xprts)
197 197 {
198 198 int error;
199 199 SVCMASTERXPRT *xprt;
200 200 struct rdma_data *rd;
201 201 rdma_registry_t *rmod;
202 202 rdma_xprt_record_t *xprt_rec;
203 203 queue_t *q;
204 204 /*
205 205 * modload the RDMA plugins is not already done.
206 206 */
207 207 if (!rdma_modloaded) {
208 208 /*CONSTANTCONDITION*/
209 209 ASSERT(sizeof (struct clone_rdma_data) <= SVC_P2LEN);
210 210
211 211 mutex_enter(&rdma_modload_lock);
212 212 if (!rdma_modloaded) {
213 213 error = rdma_modload();
214 214 }
215 215 mutex_exit(&rdma_modload_lock);
216 216
217 217 if (error)
218 218 return (error);
219 219 }
220 220
221 221 /*
222 222 * master_xprt_count is the count of master transport handles
223 223 * that were successfully created and are ready to recieve for
224 224 * RDMA based access.
225 225 */
226 226 error = 0;
227 227 xprt_rec = NULL;
228 228 rw_enter(&rdma_lock, RW_READER);
229 229 if (rdma_mod_head == NULL) {
230 230 started_xprts->rtg_count = 0;
231 231 rw_exit(&rdma_lock);
232 232 if (rdma_dev_available)
233 233 return (EPROTONOSUPPORT);
234 234 else
235 235 return (ENODEV);
236 236 }
237 237
238 238 /*
239 239 * If we have reached here, then atleast one RDMA plugin has loaded.
240 240 * Create a master_xprt, make it start listenining on the device,
241 241 * if an error is generated, record it, we might need to shut
242 242 * the master_xprt.
243 243 * SVC_START() calls svc_rdma_kstart which calls plugin binding
244 244 * routines.
245 245 */
246 246 for (rmod = rdma_mod_head; rmod != NULL; rmod = rmod->r_next) {
247 247
248 248 /*
249 249 * One SVCMASTERXPRT per RDMA plugin.
250 250 */
251 251 xprt = kmem_zalloc(sizeof (*xprt), KM_SLEEP);
252 252 xprt->xp_ops = &rdma_svc_ops;
253 253 xprt->xp_sct = sct;
254 254 xprt->xp_type = T_RDMA;
255 255 mutex_init(&xprt->xp_req_lock, NULL, MUTEX_DEFAULT, NULL);
256 256 mutex_init(&xprt->xp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
257 257 xprt->xp_req_head = (mblk_t *)0;
258 258 xprt->xp_req_tail = (mblk_t *)0;
259 259 xprt->xp_threads = 0;
260 260 xprt->xp_detached_threads = 0;
261 261
262 262 rd = kmem_zalloc(sizeof (*rd), KM_SLEEP);
263 263 xprt->xp_p2 = (caddr_t)rd;
264 264 rd->rd_xprt = xprt;
265 265 rd->r_mod = rmod->r_mod;
266 266
267 267 q = &rd->rd_data.q;
268 268 xprt->xp_wq = q;
269 269 q->q_ptr = &rd->rd_xprt;
270 270 xprt->xp_netid = NULL;
271 271
272 272 /*
273 273 * Each of the plugins will have their own Service ID
274 274 * to listener specific mapping, like port number for VI
275 275 * and service name for IB.
276 276 */
277 277 rd->rd_data.svcid = id;
278 278 error = svc_xprt_register(xprt, id);
279 279 if (error) {
280 280 DTRACE_PROBE(krpc__e__svcrdma__xprt__reg);
281 281 goto cleanup;
282 282 }
283 283
284 284 SVC_START(xprt);
285 285 if (!rd->rd_data.active) {
286 286 svc_xprt_unregister(xprt);
287 287 error = rd->rd_data.err_code;
288 288 goto cleanup;
289 289 }
290 290
291 291 /*
292 292 * This is set only when there is atleast one or more
293 293 * transports successfully created. We insert the pointer
294 294 * to the created RDMA master xprt into a separately maintained
295 295 * list. This way we can easily reference it later to cleanup,
296 296 * when NFS kRPC service pool is going away/unregistered.
297 297 */
298 298 started_xprts->rtg_count ++;
299 299 xprt_rec = kmem_alloc(sizeof (*xprt_rec), KM_SLEEP);
300 300 xprt_rec->rtr_xprt_ptr = xprt;
301 301 xprt_rec->rtr_next = started_xprts->rtg_listhead;
302 302 started_xprts->rtg_listhead = xprt_rec;
303 303 continue;
304 304 cleanup:
305 305 SVC_DESTROY(xprt);
306 306 if (error == RDMA_FAILED)
307 307 error = EPROTONOSUPPORT;
308 308 }
309 309
310 310 rw_exit(&rdma_lock);
311 311
312 312 /*
313 313 * Don't return any error even if a single plugin was started
314 314 * successfully.
315 315 */
316 316 if (started_xprts->rtg_count == 0)
317 317 return (error);
318 318 return (0);
319 319 }
320 320
321 321 /*
322 322 * Cleanup routine for freeing up memory allocated by
323 323 * svc_rdma_kcreate()
324 324 */
325 325 void
326 326 svc_rdma_kdestroy(SVCMASTERXPRT *xprt)
327 327 {
328 328 struct rdma_data *rd = (struct rdma_data *)xprt->xp_p2;
329 329
330 330
331 331 mutex_destroy(&xprt->xp_req_lock);
332 332 mutex_destroy(&xprt->xp_thread_lock);
333 333 kmem_free(rd, sizeof (*rd));
334 334 kmem_free(xprt, sizeof (*xprt));
335 335 }
336 336
337 337
338 338 static void
339 339 svc_rdma_kstart(SVCMASTERXPRT *xprt)
340 340 {
341 341 struct rdma_svc_data *svcdata;
342 342 rdma_mod_t *rmod;
343 343
344 344 svcdata = &((struct rdma_data *)xprt->xp_p2)->rd_data;
345 345 rmod = ((struct rdma_data *)xprt->xp_p2)->r_mod;
346 346
347 347 /*
348 348 * Create a listener for module at this port
349 349 */
350 350
351 351 if (rmod->rdma_count != 0)
352 352 (*rmod->rdma_ops->rdma_svc_listen)(svcdata);
353 353 else
354 354 svcdata->err_code = RDMA_FAILED;
355 355 }
356 356
357 357 void
358 358 svc_rdma_kstop(SVCMASTERXPRT *xprt)
359 359 {
360 360 struct rdma_svc_data *svcdata;
361 361 rdma_mod_t *rmod;
362 362
363 363 svcdata = &((struct rdma_data *)xprt->xp_p2)->rd_data;
364 364 rmod = ((struct rdma_data *)xprt->xp_p2)->r_mod;
365 365
366 366 /*
367 367 * Call the stop listener routine for each plugin. If rdma_count is
368 368 * already zero set active to zero.
369 369 */
370 370 if (rmod->rdma_count != 0)
371 371 (*rmod->rdma_ops->rdma_svc_stop)(svcdata);
372 372 else
373 373 svcdata->active = 0;
374 374 if (svcdata->active)
375 375 DTRACE_PROBE(krpc__e__svcrdma__kstop);
376 376 }
377 377
378 378 /* ARGSUSED */
379 379 static void
380 380 svc_rdma_kclone_destroy(SVCXPRT *clone_xprt)
381 381 {
382 382
383 383 struct clone_rdma_data *cdrp;
384 384 cdrp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
385 385
386 386 /*
387 387 * Only free buffers and release connection when cloned is set.
388 388 */
389 389 if (cdrp->cloned != TRUE)
390 390 return;
391 391
392 392 rdma_buf_free(cdrp->conn, &cdrp->rpcbuf);
393 393 if (cdrp->cl_reply) {
394 394 clist_free(cdrp->cl_reply);
395 395 cdrp->cl_reply = NULL;
396 396 }
397 397 RDMA_REL_CONN(cdrp->conn);
398 398
399 399 cdrp->cloned = 0;
400 400 }
401 401
402 402 /*
403 403 * Clone the xprt specific information. It will be freed by
404 404 * SVC_CLONE_DESTROY.
405 405 */
406 406 static void
407 407 svc_rdma_kclone_xprt(SVCXPRT *src_xprt, SVCXPRT *dst_xprt)
408 408 {
409 409 struct clone_rdma_data *srcp2;
410 410 struct clone_rdma_data *dstp2;
411 411
412 412 srcp2 = (struct clone_rdma_data *)src_xprt->xp_p2buf;
413 413 dstp2 = (struct clone_rdma_data *)dst_xprt->xp_p2buf;
414 414
415 415 if (srcp2->conn != NULL) {
416 416 srcp2->cloned = TRUE;
417 417 *dstp2 = *srcp2;
418 418 }
419 419 }
420 420
421 421 static void
422 422 svc_rdma_ktattrs(SVCXPRT *clone_xprt, int attrflag, void **tattr)
423 423 {
424 424 CONN *conn;
425 425 *tattr = NULL;
426 426
427 427 switch (attrflag) {
428 428 case SVC_TATTR_ADDRMASK:
429 429 conn = ((struct clone_rdma_data *)clone_xprt->xp_p2buf)->conn;
430 430 ASSERT(conn != NULL);
431 431 if (conn)
432 432 *tattr = (void *)&conn->c_addrmask;
433 433 }
434 434 }
435 435
/*
 * Receive an RPC/RDMA call.  Decodes the transport header (xid,
 * version, credits, op), the read chunk list, the write list and the
 * reply chunk from the inbound message; when the call is RDMA_NOMSG
 * with a chunk at offset 0, pulls the actual RPC call message over
 * with RDMA_READ.  Finally decodes the RPC call header into 'msg'.
 * Returns TRUE on success; on failure all buffers, the connection
 * reference and 'mp' are released and FALSE is returned.
 */
static bool_t
svc_rdma_krecv(SVCXPRT *clone_xprt, mblk_t *mp, struct rpc_msg *msg)
{
	XDR *xdrs;
	CONN *conn;
	rdma_recv_data_t *rdp = (rdma_recv_data_t *)mp->b_rptr;
	struct clone_rdma_data *crdp;
	struct clist *cl = NULL;
	struct clist *wcl = NULL;
	struct clist *cllong = NULL;

	rdma_stat status;
	uint32_t vers, op, pos, xid;
	uint32_t rdma_credit;
	uint32_t wcl_total_length = 0;
	bool_t wwl = FALSE;

	crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
	RSSTAT_INCR(rscalls);
	conn = rdp->conn;

	/* Replace the receive buffer this message consumed. */
	status = rdma_svc_postrecv(conn);
	if (status != RDMA_SUCCESS) {
		DTRACE_PROBE(krpc__e__svcrdma__krecv__postrecv);
		goto badrpc_call;
	}

	/* Decode the RPC/RDMA transport header with an xdrmem stream. */
	xdrs = &clone_xprt->xp_xdrin;
	xdrmem_create(xdrs, rdp->rpcmsg.addr, rdp->rpcmsg.len, XDR_DECODE);
	/* xid is read raw from the first word, then skipped in the stream. */
	xid = *(uint32_t *)rdp->rpcmsg.addr;
	XDR_SETPOS(xdrs, sizeof (uint32_t));

	if (! xdr_u_int(xdrs, &vers) ||
	    ! xdr_u_int(xdrs, &rdma_credit) ||
	    ! xdr_u_int(xdrs, &op)) {
		DTRACE_PROBE(krpc__e__svcrdma__krecv__uint);
		goto xdr_err;
	}

	/* Checking if the status of the recv operation was normal */
	if (rdp->status != 0) {
		DTRACE_PROBE1(krpc__e__svcrdma__krecv__invalid__status,
		    int, rdp->status);
		goto badrpc_call;
	}

	if (! xdr_do_clist(xdrs, &cl)) {
		DTRACE_PROBE(krpc__e__svcrdma__krecv__do__clist);
		goto xdr_err;
	}

	if (!xdr_decode_wlist_svc(xdrs, &wcl, &wwl, &wcl_total_length, conn)) {
		DTRACE_PROBE(krpc__e__svcrdma__krecv__decode__wlist);
		if (cl)
			clist_free(cl);
		goto xdr_err;
	}
	crdp->cl_wlist = wcl;

	crdp->cl_reply = NULL;
	/* The reply chunk is optional; decode failure is deliberately ignored. */
	(void) xdr_decode_reply_wchunk(xdrs, &crdp->cl_reply);

	/*
	 * A chunk at 0 offset indicates that the RPC call message
	 * is in a chunk. Get the RPC call message chunk.
	 */
	if (cl != NULL && op == RDMA_NOMSG) {

		/* Remove RPC call message chunk from chunklist */
		cllong = cl;
		cl = cl->c_next;
		cllong->c_next = NULL;


		/* Allocate and register memory for the RPC call msg chunk */
		cllong->rb_longbuf.type = RDMA_LONG_BUFFER;
		cllong->rb_longbuf.len = cllong->c_len > LONG_REPLY_LEN ?
		    cllong->c_len : LONG_REPLY_LEN;

		if (rdma_buf_alloc(conn, &cllong->rb_longbuf)) {
			clist_free(cllong);
			goto cll_malloc_err;
		}

		cllong->u.c_daddr3 = cllong->rb_longbuf.addr;

		if (cllong->u.c_daddr == NULL) {
			DTRACE_PROBE(krpc__e__svcrdma__krecv__nomem);
			rdma_buf_free(conn, &cllong->rb_longbuf);
			clist_free(cllong);
			goto cll_malloc_err;
		}

		status = clist_register(conn, cllong, CLIST_REG_DST);
		if (status) {
			DTRACE_PROBE(krpc__e__svcrdma__krecv__clist__reg);
			rdma_buf_free(conn, &cllong->rb_longbuf);
			clist_free(cllong);
			goto cll_malloc_err;
		}

		/*
		 * Now read the RPC call message in
		 */
		status = RDMA_READ(conn, cllong, WAIT);
		if (status) {
			DTRACE_PROBE(krpc__e__svcrdma__krecv__read);
			(void) clist_deregister(conn, cllong);
			rdma_buf_free(conn, &cllong->rb_longbuf);
			clist_free(cllong);
			goto cll_malloc_err;
		}

		/*
		 * NOTE(review): the return value of clist_syncmem() is
		 * assigned but never checked here — confirm whether a
		 * sync failure should abort the call.
		 */
		status = clist_syncmem(conn, cllong, CLIST_REG_DST);
		(void) clist_deregister(conn, cllong);

		/* Decode the long call message out of the RDMA-read buffer. */
		xdrrdma_create(xdrs, (caddr_t)(uintptr_t)cllong->u.c_daddr3,
		    cllong->c_len, 0, cl, XDR_DECODE, conn);

		crdp->rpcbuf = cllong->rb_longbuf;
		crdp->rpcbuf.len = cllong->c_len;
		clist_free(cllong);
		RDMA_BUF_FREE(conn, &rdp->rpcmsg);
	} else {
		/* Inline call: decode from the receive buffer past the header. */
		pos = XDR_GETPOS(xdrs);
		xdrrdma_create(xdrs, rdp->rpcmsg.addr + pos,
		    rdp->rpcmsg.len - pos, 0, cl, XDR_DECODE, conn);
		crdp->rpcbuf = rdp->rpcmsg;

		/* Use xdrrdmablk_ops to indicate there is a read chunk list */
		if (cl != NULL) {
			int32_t flg = XDR_RDMA_RLIST_REG;

			XDR_CONTROL(xdrs, XDR_RDMA_SET_FLAGS, &flg);
			xdrs->x_ops = &xdrrdmablk_ops;
		}
	}

	/* Hand the client's write list to the XDR layer for later encoding. */
	if (crdp->cl_wlist) {
		int32_t flg = XDR_RDMA_WLIST_REG;

		XDR_CONTROL(xdrs, XDR_RDMA_SET_WLIST, crdp->cl_wlist);
		XDR_CONTROL(xdrs, XDR_RDMA_SET_FLAGS, &flg);
	}

	if (! xdr_callmsg(xdrs, msg)) {
		DTRACE_PROBE(krpc__e__svcrdma__krecv__callmsg);
		RSSTAT_INCR(rsxdrcall);
		goto callmsg_err;
	}

	/*
	 * Point the remote transport address in the service_transport
	 * handle at the address in the request.
	 */
	clone_xprt->xp_rtaddr.buf = conn->c_raddr.buf;
	clone_xprt->xp_rtaddr.len = conn->c_raddr.len;
	clone_xprt->xp_rtaddr.maxlen = conn->c_raddr.len;

	clone_xprt->xp_lcladdr.buf = conn->c_laddr.buf;
	clone_xprt->xp_lcladdr.len = conn->c_laddr.len;
	clone_xprt->xp_lcladdr.maxlen = conn->c_laddr.len;

	/*
	 * In case of RDMA, connection management is
	 * entirely done in rpcib module and netid in the
	 * SVCMASTERXPRT is NULL. Initialize the clone netid
	 * from the connection.
	 */

	clone_xprt->xp_netid = conn->c_netid;

	clone_xprt->xp_xid = xid;
	crdp->conn = conn;

	freeb(mp);

	return (TRUE);

/* Error unwind: each label frees strictly what was acquired before it. */
callmsg_err:
	rdma_buf_free(conn, &crdp->rpcbuf);

cll_malloc_err:
	if (cl)
		clist_free(cl);
xdr_err:
	XDR_DESTROY(xdrs);

badrpc_call:
	RDMA_BUF_FREE(conn, &rdp->rpcmsg);
	RDMA_REL_CONN(conn);
	freeb(mp);
	RSSTAT_INCR(rsbadcalls);
	return (FALSE);
}
631 631
/*
 * Send a reply that does not fit inline: XDR-encode the whole reply
 * into a registered long buffer and RDMA_WRITE it into the client's
 * reply chunk list (crdp->cl_reply).  The caller (svc_rdma_ksend)
 * guarantees crdp->cl_reply != NULL before calling here.
 *
 * Outputs: *final_len is the encoded reply length, *numchunks the
 * number of write chunks actually used; *freelen is always set to 0.
 * Returns SVC_RDMA_SUCCESS or SVC_RDMA_FAIL.
 */
static int
svc_process_long_reply(SVCXPRT * clone_xprt,
    xdrproc_t xdr_results, caddr_t xdr_location,
    struct rpc_msg *msg, bool_t has_args, int *msglen,
    int *freelen, int *numchunks, unsigned int *final_len)
{
	int status;
	XDR xdrslong;
	struct clist *wcl = NULL;
	int count = 0;
	int alloc_len;
	char *memp;
	rdma_buf_t long_rpc = {0};
	struct clone_rdma_data *crdp;

	crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;

	bzero(&xdrslong, sizeof (xdrslong));

	/* Choose a size for the long rpc response */
	if (MSG_IS_RPCSEC_GSS(msg)) {
		/* Leave headroom for the RPCSEC_GSS verifier/wrapping. */
		alloc_len = RNDUP(MAX_AUTH_BYTES + *msglen);
	} else {
		alloc_len = RNDUP(*msglen);
	}

	/* Round requests <= 64K up to the next of 16K / 32K / 64K. */
	if (alloc_len <= 64 * 1024) {
		if (alloc_len > 32 * 1024) {
			alloc_len = 64 * 1024;
		} else {
			if (alloc_len > 16 * 1024) {
				alloc_len = 32 * 1024;
			} else {
				alloc_len = 16 * 1024;
			}
		}
	}

	long_rpc.type = RDMA_LONG_BUFFER;
	long_rpc.len = alloc_len;
	if (rdma_buf_alloc(crdp->conn, &long_rpc)) {
		return (SVC_RDMA_FAIL);
	}

	/* Encode the full reply (header + wrapped results) into the buffer. */
	memp = long_rpc.addr;
	xdrmem_create(&xdrslong, memp, alloc_len, XDR_ENCODE);

	msg->rm_xid = clone_xprt->xp_xid;

	if (!(xdr_replymsg(&xdrslong, msg) &&
	    (!has_args || SVCAUTH_WRAP(&clone_xprt->xp_auth, &xdrslong,
	    xdr_results, xdr_location)))) {
		rdma_buf_free(crdp->conn, &long_rpc);
		DTRACE_PROBE(krpc__e__svcrdma__longrep__authwrap);
		return (SVC_RDMA_FAIL);
	}

	*final_len = XDR_GETPOS(&xdrslong);

	DTRACE_PROBE1(krpc__i__replylen, uint_t, *final_len);
	*numchunks = 0;
	*freelen = 0;

	wcl = crdp->cl_reply;
	wcl->rb_longbuf = long_rpc;

	/*
	 * Walk the client's reply chunk list, carving the encoded
	 * reply across the chunks; trim the last used chunk to the
	 * remaining byte count.
	 */
	count = *final_len;
	while ((wcl != NULL) && (count > 0)) {

		if (wcl->c_dmemhandle.mrc_rmr == 0)
			break;

		DTRACE_PROBE2(krpc__i__write__chunks, uint32_t, count,
		    uint32_t, wcl->c_len);

		if (wcl->c_len > count) {
			wcl->c_len = count;
		}
		wcl->w.c_saddr3 = (caddr_t)memp;

		count -= wcl->c_len;
		*numchunks += 1;
		memp += wcl->c_len;
		wcl = wcl->c_next;
	}

	/*
	 * Make rest of the chunks 0-len
	 */
	while (wcl != NULL) {
		if (wcl->c_dmemhandle.mrc_rmr == 0)
			break;
		wcl->c_len = 0;
		wcl = wcl->c_next;
	}

	wcl = crdp->cl_reply;

	/*
	 * MUST fail if there are still more data
	 */
	if (count > 0) {
		rdma_buf_free(crdp->conn, &long_rpc);
		DTRACE_PROBE(krpc__e__svcrdma__longrep__dlen__clist);
		return (SVC_RDMA_FAIL);
	}

	/* Register, sync and RDMA_WRITE the reply into the client's chunks. */
	if (clist_register(crdp->conn, wcl, CLIST_REG_SOURCE) != RDMA_SUCCESS) {
		rdma_buf_free(crdp->conn, &long_rpc);
		DTRACE_PROBE(krpc__e__svcrdma__longrep__clistreg);
		return (SVC_RDMA_FAIL);
	}

	status = clist_syncmem(crdp->conn, wcl, CLIST_REG_SOURCE);

	if (status) {
		(void) clist_deregister(crdp->conn, wcl);
		rdma_buf_free(crdp->conn, &long_rpc);
		DTRACE_PROBE(krpc__e__svcrdma__longrep__syncmem);
		return (SVC_RDMA_FAIL);
	}

	status = RDMA_WRITE(crdp->conn, wcl, WAIT);

	/* wcl->rb_longbuf aliases long_rpc; this frees the long buffer. */
	(void) clist_deregister(crdp->conn, wcl);
	rdma_buf_free(crdp->conn, &wcl->rb_longbuf);

	if (status != RDMA_SUCCESS) {
		DTRACE_PROBE(krpc__e__svcrdma__longrep__write);
		return (SVC_RDMA_FAIL);
	}

	return (SVC_RDMA_SUCCESS);
}
766 766
767 767
768 768 static int
769 769 svc_compose_rpcmsg(SVCXPRT * clone_xprt, CONN * conn, xdrproc_t xdr_results,
770 770 caddr_t xdr_location, rdma_buf_t *rpcreply, XDR ** xdrs,
771 771 struct rpc_msg *msg, bool_t has_args, uint_t *len)
772 772 {
773 773 /*
774 774 * Get a pre-allocated buffer for rpc reply
775 775 */
776 776 rpcreply->type = SEND_BUFFER;
777 777 if (rdma_buf_alloc(conn, rpcreply)) {
778 778 DTRACE_PROBE(krpc__e__svcrdma__rpcmsg__reply__nofreebufs);
779 779 return (SVC_RDMA_FAIL);
780 780 }
781 781
782 782 xdrrdma_create(*xdrs, rpcreply->addr, rpcreply->len,
783 783 0, NULL, XDR_ENCODE, conn);
784 784
785 785 msg->rm_xid = clone_xprt->xp_xid;
786 786
787 787 if (has_args) {
788 788 if (!(xdr_replymsg(*xdrs, msg) &&
789 789 (!has_args ||
790 790 SVCAUTH_WRAP(&clone_xprt->xp_auth, *xdrs,
791 791 xdr_results, xdr_location)))) {
792 792 rdma_buf_free(conn, rpcreply);
793 793 DTRACE_PROBE(
794 794 krpc__e__svcrdma__rpcmsg__reply__authwrap1);
795 795 return (SVC_RDMA_FAIL);
796 796 }
797 797 } else {
798 798 if (!xdr_replymsg(*xdrs, msg)) {
799 799 rdma_buf_free(conn, rpcreply);
800 800 DTRACE_PROBE(
801 801 krpc__e__svcrdma__rpcmsg__reply__authwrap2);
802 802 return (SVC_RDMA_FAIL);
803 803 }
804 804 }
805 805
806 806 *len = XDR_GETPOS(*xdrs);
807 807
808 808 return (SVC_RDMA_SUCCESS);
809 809 }
810 810
811 811 /*
812 812 * Send rpc reply.
813 813 */
814 814 static bool_t
815 815 svc_rdma_ksend(SVCXPRT * clone_xprt, struct rpc_msg *msg)
816 816 {
817 817 XDR *xdrs_rpc = &(clone_xprt->xp_xdrout);
818 818 XDR xdrs_rhdr;
819 819 CONN *conn = NULL;
820 820 rdma_buf_t rbuf_resp = {0}, rbuf_rpc_resp = {0};
821 821
822 822 struct clone_rdma_data *crdp;
823 823 struct clist *cl_read = NULL;
824 824 struct clist *cl_send = NULL;
825 825 struct clist *cl_write = NULL;
826 826 xdrproc_t xdr_results; /* results XDR encoding function */
827 827 caddr_t xdr_location; /* response results pointer */
828 828
829 829 int retval = FALSE;
830 830 int status, msglen, num_wreply_segments = 0;
831 831 uint32_t rdma_credit = 0;
832 832 int freelen = 0;
833 833 bool_t has_args;
834 834 uint_t final_resp_len, rdma_response_op, vers;
835 835
836 836 bzero(&xdrs_rhdr, sizeof (XDR));
837 837 crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
838 838 conn = crdp->conn;
839 839
840 840 /*
841 841 * If there is a result procedure specified in the reply message,
842 842 * it will be processed in the xdr_replymsg and SVCAUTH_WRAP.
843 843 * We need to make sure it won't be processed twice, so we null
844 844 * it for xdr_replymsg here.
845 845 */
846 846 has_args = FALSE;
847 847 if (msg->rm_reply.rp_stat == MSG_ACCEPTED &&
848 848 msg->rm_reply.rp_acpt.ar_stat == SUCCESS) {
849 849 if ((xdr_results = msg->acpted_rply.ar_results.proc) != NULL) {
850 850 has_args = TRUE;
851 851 xdr_location = msg->acpted_rply.ar_results.where;
852 852 msg->acpted_rply.ar_results.proc = xdr_void;
853 853 msg->acpted_rply.ar_results.where = NULL;
854 854 }
855 855 }
856 856
857 857 /*
858 858 * Given the limit on the inline response size (RPC_MSG_SZ),
859 859 * there is a need to make a guess as to the overall size of
860 860 * the response. If the resultant size is beyond the inline
861 861 * size, then the server needs to use the "reply chunk list"
862 862 * provided by the client (if the client provided one). An
863 863 * example of this type of response would be a READDIR
864 864 * response (e.g. a small directory read would fit in RPC_MSG_SZ
865 865 * and that is the preference but it may not fit)
866 866 *
867 867 * Combine the encoded size and the size of the true results
868 868 * and then make the decision about where to encode and send results.
869 869 *
870 870 * One important note, this calculation is ignoring the size
871 871 * of the encoding of the authentication overhead. The reason
872 872 * for this is rooted in the complexities of access to the
873 873 * encoded size of RPCSEC_GSS related authentiation,
874 874 * integrity, and privacy.
875 875 *
876 876 * If it turns out that the encoded authentication bumps the
877 877 * response over the RPC_MSG_SZ limit, then it may need to
878 878 * attempt to encode for the reply chunk list.
879 879 */
880 880
881 881 /*
882 882 * Calculating the "sizeof" the RPC response header and the
883 883 * encoded results.
884 884 */
885 885 msglen = xdr_sizeof(xdr_replymsg, msg);
886 886
887 887 if (msglen > 0) {
888 888 RSSTAT_INCR(rstotalreplies);
889 889 }
890 890 if (has_args)
891 891 msglen += xdrrdma_sizeof(xdr_results, xdr_location,
892 892 rdma_minchunk, NULL, NULL);
893 893
894 894 DTRACE_PROBE1(krpc__i__svcrdma__ksend__msglen, int, msglen);
895 895
896 896 status = SVC_RDMA_SUCCESS;
897 897
898 898 if (msglen < RPC_MSG_SZ) {
899 899 /*
900 900 * Looks like the response will fit in the inline
901 901 * response; let's try
902 902 */
903 903 RSSTAT_INCR(rstotalinlinereplies);
904 904
905 905 rdma_response_op = RDMA_MSG;
906 906
907 907 status = svc_compose_rpcmsg(clone_xprt, conn, xdr_results,
908 908 xdr_location, &rbuf_rpc_resp, &xdrs_rpc, msg,
909 909 has_args, &final_resp_len);
910 910
911 911 DTRACE_PROBE1(krpc__i__srdma__ksend__compose_status,
912 912 int, status);
913 913 DTRACE_PROBE1(krpc__i__srdma__ksend__compose_len,
914 914 int, final_resp_len);
915 915
916 916 if (status == SVC_RDMA_SUCCESS && crdp->cl_reply) {
917 917 clist_free(crdp->cl_reply);
918 918 crdp->cl_reply = NULL;
919 919 }
920 920 }
921 921
922 922 /*
923 923 * If the encode failed (size?) or the message really is
924 924 * larger than what is allowed, try the response chunk list.
925 925 */
926 926 if (status != SVC_RDMA_SUCCESS || msglen >= RPC_MSG_SZ) {
927 927 /*
928 928 * attempting to use a reply chunk list when there
929 929 * isn't one won't get very far...
930 930 */
931 931 if (crdp->cl_reply == NULL) {
932 932 DTRACE_PROBE(krpc__e__svcrdma__ksend__noreplycl);
933 933 goto out;
934 934 }
935 935
936 936 RSSTAT_INCR(rstotallongreplies);
937 937
938 938 msglen = xdr_sizeof(xdr_replymsg, msg);
939 939 msglen += xdrrdma_sizeof(xdr_results, xdr_location, 0,
940 940 NULL, NULL);
941 941
942 942 status = svc_process_long_reply(clone_xprt, xdr_results,
943 943 xdr_location, msg, has_args, &msglen, &freelen,
944 944 &num_wreply_segments, &final_resp_len);
945 945
946 946 DTRACE_PROBE1(krpc__i__svcrdma__ksend__longreplen,
947 947 int, final_resp_len);
948 948
949 949 if (status != SVC_RDMA_SUCCESS) {
950 950 DTRACE_PROBE(krpc__e__svcrdma__ksend__compose__failed);
951 951 goto out;
952 952 }
953 953
954 954 rdma_response_op = RDMA_NOMSG;
955 955 }
956 956
957 957 DTRACE_PROBE1(krpc__i__svcrdma__ksend__rdmamsg__len,
958 958 int, final_resp_len);
959 959
960 960 rbuf_resp.type = SEND_BUFFER;
961 961 if (rdma_buf_alloc(conn, &rbuf_resp)) {
962 962 rdma_buf_free(conn, &rbuf_rpc_resp);
963 963 DTRACE_PROBE(krpc__e__svcrdma__ksend__nofreebufs);
964 964 goto out;
965 965 }
966 966
967 967 rdma_credit = rdma_bufs_granted;
968 968
969 969 vers = RPCRDMA_VERS;
970 970 xdrmem_create(&xdrs_rhdr, rbuf_resp.addr, rbuf_resp.len, XDR_ENCODE);
971 971 (*(uint32_t *)rbuf_resp.addr) = msg->rm_xid;
972 972 /* Skip xid and set the xdr position accordingly. */
973 973 XDR_SETPOS(&xdrs_rhdr, sizeof (uint32_t));
974 974 if (!xdr_u_int(&xdrs_rhdr, &vers) ||
975 975 !xdr_u_int(&xdrs_rhdr, &rdma_credit) ||
976 976 !xdr_u_int(&xdrs_rhdr, &rdma_response_op)) {
977 977 rdma_buf_free(conn, &rbuf_rpc_resp);
978 978 rdma_buf_free(conn, &rbuf_resp);
979 979 DTRACE_PROBE(krpc__e__svcrdma__ksend__uint);
980 980 goto out;
981 981 }
982 982
983 983 /*
984 984 * Now XDR the read chunk list, actually always NULL
985 985 */
986 986 (void) xdr_encode_rlist_svc(&xdrs_rhdr, cl_read);
987 987
988 988 /*
989 989 * encode write list -- we already drove RDMA_WRITEs
990 990 */
991 991 cl_write = crdp->cl_wlist;
992 992 if (!xdr_encode_wlist(&xdrs_rhdr, cl_write)) {
993 993 DTRACE_PROBE(krpc__e__svcrdma__ksend__enc__wlist);
994 994 rdma_buf_free(conn, &rbuf_rpc_resp);
995 995 rdma_buf_free(conn, &rbuf_resp);
996 996 goto out;
997 997 }
998 998
999 999 /*
1000 1000 * XDR encode the RDMA_REPLY write chunk
1001 1001 */
1002 1002 if (!xdr_encode_reply_wchunk(&xdrs_rhdr, crdp->cl_reply,
1003 1003 num_wreply_segments)) {
1004 1004 rdma_buf_free(conn, &rbuf_rpc_resp);
1005 1005 rdma_buf_free(conn, &rbuf_resp);
1006 1006 goto out;
1007 1007 }
1008 1008
1009 1009 clist_add(&cl_send, 0, XDR_GETPOS(&xdrs_rhdr), &rbuf_resp.handle,
1010 1010 rbuf_resp.addr, NULL, NULL);
1011 1011
1012 1012 if (rdma_response_op == RDMA_MSG) {
1013 1013 clist_add(&cl_send, 0, final_resp_len, &rbuf_rpc_resp.handle,
1014 1014 rbuf_rpc_resp.addr, NULL, NULL);
1015 1015 }
1016 1016
1017 1017 status = RDMA_SEND(conn, cl_send, msg->rm_xid);
1018 1018
1019 1019 if (status == RDMA_SUCCESS) {
1020 1020 retval = TRUE;
1021 1021 }
1022 1022
1023 1023 out:
1024 1024 /*
1025 1025 * Free up sendlist chunks
1026 1026 */
1027 1027 if (cl_send != NULL)
1028 1028 clist_free(cl_send);
1029 1029
1030 1030 /*
1031 1031 * Destroy private data for xdr rdma
1032 1032 */
1033 1033 if (clone_xprt->xp_xdrout.x_ops != NULL) {
1034 1034 XDR_DESTROY(&(clone_xprt->xp_xdrout));
1035 1035 }
1036 1036
1037 1037 if (crdp->cl_reply) {
1038 1038 clist_free(crdp->cl_reply);
1039 1039 crdp->cl_reply = NULL;
1040 1040 }
1041 1041
1042 1042 /*
1043 1043 * This is completely disgusting. If public is set it is
1044 1044 * a pointer to a structure whose first field is the address
1045 1045 * of the function to free that structure and any related
1046 1046 * stuff. (see rrokfree in nfs_xdr.c).
1047 1047 */
1048 1048 if (xdrs_rpc->x_public) {
1049 1049 /* LINTED pointer alignment */
1050 1050 (**((int (**)()) xdrs_rpc->x_public)) (xdrs_rpc->x_public);
1051 1051 }
1052 1052
1053 1053 if (xdrs_rhdr.x_ops != NULL) {
1054 1054 XDR_DESTROY(&xdrs_rhdr);
1055 1055 }
1056 1056
1057 1057 return (retval);
1058 1058 }
1059 1059
1060 1060 /*
1061 1061 * Deserialize arguments.
1062 1062 */
1063 1063 static bool_t
1064 1064 svc_rdma_kgetargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args, caddr_t args_ptr)
1065 1065 {
1066 1066 if ((SVCAUTH_UNWRAP(&clone_xprt->xp_auth, &clone_xprt->xp_xdrin,
1067 1067 xdr_args, args_ptr)) != TRUE)
1068 1068 return (FALSE);
1069 1069 return (TRUE);
1070 1070 }
1071 1071
1072 1072 static bool_t
1073 1073 svc_rdma_kfreeargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args,
1074 1074 caddr_t args_ptr)
1075 1075 {
1076 1076 struct clone_rdma_data *crdp;
1077 1077 bool_t retval;
1078 1078
1079 1079 /*
1080 1080 * If the cloned bit is true, then this transport specific
1081 1081 * rmda data has been duplicated into another cloned xprt. Do
1082 1082 * not free, or release the connection, it is still in use. The
1083 1083 * buffers will be freed and the connection released later by
1084 1084 * SVC_CLONE_DESTROY().
1085 1085 */
1086 1086 crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
1087 1087 if (crdp->cloned == TRUE) {
1088 1088 crdp->cloned = 0;
1089 1089 return (TRUE);
1090 1090 }
1091 1091
1092 1092 /*
1093 1093 * Free the args if needed then XDR_DESTROY
1094 1094 */
1095 1095 if (args_ptr) {
1096 1096 XDR *xdrs = &clone_xprt->xp_xdrin;
1097 1097
1098 1098 xdrs->x_op = XDR_FREE;
1099 1099 retval = (*xdr_args)(xdrs, args_ptr);
1100 1100 }
1101 1101
1102 1102 XDR_DESTROY(&(clone_xprt->xp_xdrin));
1103 1103 rdma_buf_free(crdp->conn, &crdp->rpcbuf);
1104 1104 if (crdp->cl_reply) {
1105 1105 clist_free(crdp->cl_reply);
1106 1106 crdp->cl_reply = NULL;
1107 1107 }
1108 1108 RDMA_REL_CONN(crdp->conn);
1109 1109
1110 1110 return (retval);
1111 1111 }
1112 1112
/* ARGSUSED */
static int32_t *
svc_rdma_kgetres(SVCXPRT *clone_xprt, int size)
{
	/*
	 * The RDMA transport does not hand out a pre-serialized results
	 * buffer; returning NULL directs the caller to encode results
	 * through the normal XDR send path instead.
	 */
	return (NULL);
}
1119 1119
/* ARGSUSED */
static void
svc_rdma_kfreeres(SVCXPRT *clone_xprt)
{
	/* Nothing to do: svc_rdma_kgetres() never allocates a buffer. */
}
1125 1125
/*
 * the dup cacheing routines below provide a cache of non-failure
 * transaction id's. rpc service routines can use this to detect
 * retransmissions and re-send a non-failure response.
 */

/*
 * MAXDUPREQS is the number of cached items. It should be adjusted
 * to the service load so that there is likely to be a response entry
 * when the first retransmission comes in.
 */
#define	MAXDUPREQS	1024

/*
 * This should be appropriately scaled to MAXDUPREQS.
 */
#define	DRHASHSZ	257

/* Use a cheap mask when DRHASHSZ is a power of two, else modulo. */
#if ((DRHASHSZ & (DRHASHSZ - 1)) == 0)
#define	XIDHASH(xid)	((xid) & (DRHASHSZ - 1))
#else
#define	XIDHASH(xid)	((xid) % DRHASHSZ)
#endif
#define	DRHASH(dr)	XIDHASH((dr)->dr_xid)
#define	REQTOXID(req)	((req)->rq_xprt->xp_xid)

/* Current number of allocated cache entries; grows up to rdmamaxdupreqs. */
static int	rdmandupreqs = 0;
int	rdmamaxdupreqs = MAXDUPREQS;
/* rdmadupreq_lock protects all of the dup cache state below. */
static kmutex_t rdmadupreq_lock;
static struct dupreq *rdmadrhashtbl[DRHASHSZ];
/* Per-bucket entry counts (observability aid). */
static int	rdmadrhashstat[DRHASHSZ];

static void	unhash(struct dupreq *);

/*
 * rdmadrmru points to the head of a circular linked list in lru order.
 * rdmadrmru->dr_next == drlru
 */
struct dupreq *rdmadrmru;
1165 1165
/*
 * svc_rdma_kdup searches the request cache and returns 0 if the
 * request is not found in the cache. If it is found, then it
 * returns the state of the request (in progress or done) and
 * the status or attributes that were part of the original reply.
 *
 * On a DUP_NEW or in-progress result, *drpp is set to the cache entry
 * so the caller can later complete it via svc_rdma_kdupdone().
 * All cache state is protected by rdmadupreq_lock.
 */
static int
svc_rdma_kdup(struct svc_req *req, caddr_t res, int size, struct dupreq **drpp,
	bool_t *dupcachedp)
{
	struct dupreq *dr;
	uint32_t xid;
	uint32_t drhash;
	int status;

	xid = REQTOXID(req);
	mutex_enter(&rdmadupreq_lock);
	RSSTAT_INCR(rsdupchecks);
	/*
	 * Check to see whether an entry already exists in the cache.
	 * A hit requires the xid, proc, prog, vers and the caller's
	 * transport address all to match.
	 */
	dr = rdmadrhashtbl[XIDHASH(xid)];
	while (dr != NULL) {
		if (dr->dr_xid == xid &&
		    dr->dr_proc == req->rq_proc &&
		    dr->dr_prog == req->rq_prog &&
		    dr->dr_vers == req->rq_vers &&
		    dr->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)dr->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    dr->dr_addr.len) == 0) {
			status = dr->dr_status;
			if (status == DUP_DONE) {
				/* Replay the cached reply bytes. */
				bcopy(dr->dr_resp.buf, res, size);
				if (dupcachedp != NULL)
					*dupcachedp = (dr->dr_resfree != NULL);
			} else {
				/* Request is still being serviced. */
				dr->dr_status = DUP_INPROGRESS;
				*drpp = dr;
			}
			RSSTAT_INCR(rsdupreqs);
			mutex_exit(&rdmadupreq_lock);
			return (status);
		}
		dr = dr->dr_chain;
	}

	/*
	 * There wasn't an entry, either allocate a new one or recycle
	 * an old one.
	 */
	if (rdmandupreqs < rdmamaxdupreqs) {
		/* KM_NOSLEEP: holding rdmadupreq_lock, must not block. */
		dr = kmem_alloc(sizeof (*dr), KM_NOSLEEP);
		if (dr == NULL) {
			mutex_exit(&rdmadupreq_lock);
			return (DUP_ERROR);
		}
		dr->dr_resp.buf = NULL;
		dr->dr_resp.maxlen = 0;
		dr->dr_addr.buf = NULL;
		dr->dr_addr.maxlen = 0;
		/* Insert into the circular LRU list (or start it). */
		if (rdmadrmru) {
			dr->dr_next = rdmadrmru->dr_next;
			rdmadrmru->dr_next = dr;
		} else {
			dr->dr_next = dr;
		}
		rdmandupreqs++;
	} else {
		/*
		 * Recycle the least-recently-used entry that is not
		 * currently in progress; give up if all are busy.
		 */
		dr = rdmadrmru->dr_next;
		while (dr->dr_status == DUP_INPROGRESS) {
			dr = dr->dr_next;
			if (dr == rdmadrmru->dr_next) {
				mutex_exit(&rdmadupreq_lock);
				return (DUP_ERROR);
			}
		}
		unhash(dr);
		if (dr->dr_resfree) {
			(*dr->dr_resfree)(dr->dr_resp.buf);
		}
	}
	dr->dr_resfree = NULL;
	/* dr becomes the most-recently-used entry. */
	rdmadrmru = dr;

	dr->dr_xid = REQTOXID(req);
	dr->dr_prog = req->rq_prog;
	dr->dr_vers = req->rq_vers;
	dr->dr_proc = req->rq_proc;
	/* Grow the cached address buffer if the caller's is larger. */
	if (dr->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (dr->dr_addr.buf != NULL)
			kmem_free(dr->dr_addr.buf, dr->dr_addr.maxlen);
		dr->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		dr->dr_addr.buf = kmem_alloc(dr->dr_addr.maxlen, KM_NOSLEEP);
		if (dr->dr_addr.buf == NULL) {
			/* Leave dr on the LRU (DUP_DROP) for later reuse. */
			dr->dr_addr.maxlen = 0;
			dr->dr_status = DUP_DROP;
			mutex_exit(&rdmadupreq_lock);
			return (DUP_ERROR);
		}
	}
	dr->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
	bcopy(req->rq_xprt->xp_rtaddr.buf, dr->dr_addr.buf, dr->dr_addr.len);
	/* Likewise grow the response buffer if needed. */
	if (dr->dr_resp.maxlen < size) {
		if (dr->dr_resp.buf != NULL)
			kmem_free(dr->dr_resp.buf, dr->dr_resp.maxlen);
		dr->dr_resp.maxlen = (unsigned int)size;
		dr->dr_resp.buf = kmem_alloc(size, KM_NOSLEEP);
		if (dr->dr_resp.buf == NULL) {
			dr->dr_resp.maxlen = 0;
			dr->dr_status = DUP_DROP;
			mutex_exit(&rdmadupreq_lock);
			return (DUP_ERROR);
		}
	}
	dr->dr_status = DUP_INPROGRESS;

	/* Link the fresh entry into its hash bucket. */
	drhash = (uint32_t)DRHASH(dr);
	dr->dr_chain = rdmadrhashtbl[drhash];
	rdmadrhashtbl[drhash] = dr;
	rdmadrhashstat[drhash]++;
	mutex_exit(&rdmadupreq_lock);
	*drpp = dr;
	return (DUP_NEW);
}
1291 1291
1292 1292 /*
1293 1293 * svc_rdma_kdupdone marks the request done (DUP_DONE or DUP_DROP)
1294 1294 * and stores the response.
1295 1295 */
1296 1296 static void
1297 1297 svc_rdma_kdupdone(struct dupreq *dr, caddr_t res, void (*dis_resfree)(),
1298 1298 int size, int status)
1299 1299 {
1300 1300 ASSERT(dr->dr_resfree == NULL);
1301 1301 if (status == DUP_DONE) {
1302 1302 bcopy(res, dr->dr_resp.buf, size);
1303 1303 dr->dr_resfree = dis_resfree;
1304 1304 }
1305 1305 dr->dr_status = status;
1306 1306 }
1307 1307
1308 1308 /*
1309 1309 * This routine expects that the mutex, rdmadupreq_lock, is already held.
1310 1310 */
1311 1311 static void
1312 1312 unhash(struct dupreq *dr)
1313 1313 {
1314 1314 struct dupreq *drt;
1315 1315 struct dupreq *drtprev = NULL;
1316 1316 uint32_t drhash;
1317 1317
1318 1318 ASSERT(MUTEX_HELD(&rdmadupreq_lock));
1319 1319
1320 1320 drhash = (uint32_t)DRHASH(dr);
1321 1321 drt = rdmadrhashtbl[drhash];
1322 1322 while (drt != NULL) {
1323 1323 if (drt == dr) {
1324 1324 rdmadrhashstat[drhash]--;
1325 1325 if (drtprev == NULL) {
1326 1326 rdmadrhashtbl[drhash] = drt->dr_chain;
1327 1327 } else {
1328 1328 drtprev->dr_chain = drt->dr_chain;
1329 1329 }
1330 1330 return;
1331 1331 }
1332 1332 drtprev = drt;
1333 1333 drt = drt->dr_chain;
1334 1334 }
1335 1335 }
1336 1336
1337 1337 bool_t
1338 1338 rdma_get_wchunk(struct svc_req *req, iovec_t *iov, struct clist *wlist)
1339 1339 {
1340 1340 struct clist *clist;
1341 1341 uint32_t tlen;
1342 1342
1343 1343 if (req->rq_xprt->xp_type != T_RDMA) {
1344 1344 return (FALSE);
1345 1345 }
1346 1346
1347 1347 tlen = 0;
1348 1348 clist = wlist;
1349 1349 while (clist) {
1350 1350 tlen += clist->c_len;
1351 1351 clist = clist->c_next;
1352 1352 }
1353 1353
1354 1354 /*
1355 1355 * set iov to addr+len of first segment of first wchunk of
1356 1356 * wlist sent by client. krecv() already malloc'd a buffer
1357 1357 * large enough, but registration is deferred until we write
1358 1358 * the buffer back to (NFS) client using RDMA_WRITE.
1359 1359 */
1360 1360 iov->iov_base = (caddr_t)(uintptr_t)wlist->w.c_saddr;
1361 1361 iov->iov_len = tlen;
1362 1362
1363 1363 return (TRUE);
1364 1364 }
1365 1365
/*
 * routine to setup the read chunk lists
 *
 * Trims the chunk list wcl so that exactly count bytes of data are
 * described, rounds the total up to a 4-byte XDR boundary, zeroes the
 * length of any remaining chunks, and stores the rounded total in
 * *wcl_len.  Returns TRUE on success, FALSE if the list cannot hold
 * count bytes (or the roundup bytes).
 */

int
rdma_setup_read_chunks(struct clist *wcl, uint32_t count, int *wcl_len)
{
	int data_len, avail_len;
	uint_t round_len;

	data_len = avail_len = 0;

	/*
	 * Consume chunks until count bytes are covered.  avail_len
	 * tracks the unused space left in the final (trimmed) chunk.
	 */
	while (wcl != NULL && count > 0) {
		/* An unregistered chunk (rmr == 0) ends the usable list. */
		if (wcl->c_dmemhandle.mrc_rmr == 0)
			break;

		if (wcl->c_len < count) {
			/* Whole chunk consumed; nothing left over in it. */
			data_len += wcl->c_len;
			avail_len = 0;
		} else {
			/* Last chunk: trim it down to the remaining count. */
			data_len += count;
			avail_len = wcl->c_len - count;
			wcl->c_len = count;
		}
		count -= wcl->c_len;

		if (count == 0)
			break;

		wcl = wcl->c_next;
	}

	/*
	 * MUST fail if there are still more data
	 */
	if (count > 0) {
		DTRACE_PROBE2(krpc__e__rdma_setup_read_chunks_clist_len,
		    int, data_len, int, count);
		return (FALSE);
	}

	/*
	 * Round up the last chunk to 4-byte boundary
	 */
	*wcl_len = roundup(data_len, BYTES_PER_XDR_UNIT);
	round_len = *wcl_len - data_len;

	if (round_len) {

		/*
		 * If there is space in the current chunk,
		 * add the roundup to the chunk.
		 */
		if (avail_len >= round_len) {
			wcl->c_len += round_len;
		} else {
			/*
			 * try the next one.
			 */
			wcl = wcl->c_next;
			if ((wcl == NULL) || (wcl->c_len < round_len)) {
				DTRACE_PROBE1(
				    krpc__e__rdma_setup_read_chunks_rndup,
				    int, round_len);
				return (FALSE);
			}
			/* Next chunk carries only the roundup bytes. */
			wcl->c_len = round_len;
		}
	}

	wcl = wcl->c_next;

	/*
	 * Make rest of the chunks 0-len
	 */

	clist_zero_len(wcl);

	return (TRUE);
}
↓ open down ↓ |
1250 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX