Print this page
XXXX introduce drv_sectohz
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/rpc/rpcib.c
+++ new/usr/src/uts/common/rpc/rpcib.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright (c) 2007, The Ohio State University. All rights reserved.
28 28 *
29 29 * Portions of this source code is developed by the team members of
30 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31 31 * headed by Professor Dhabaleswar K. (DK) Panda.
32 32 *
33 33 * Acknowledgements to contributions from developers:
34 34 * Ranjit Noronha: noronha@cse.ohio-state.edu
35 35 * Lei Chai : chail@cse.ohio-state.edu
36 36 * Weikuan Yu : yuw@cse.ohio-state.edu
37 37 *
38 38 */
39 39
40 40 /*
41 41 * The rpcib plugin. Implements the interface for RDMATF's
42 42 * interaction with IBTF.
43 43 */
44 44
45 45 #include <sys/param.h>
46 46 #include <sys/types.h>
47 47 #include <sys/user.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/sysmacros.h>
50 50 #include <sys/proc.h>
51 51 #include <sys/socket.h>
52 52 #include <sys/file.h>
53 53 #include <sys/stream.h>
54 54 #include <sys/strsubr.h>
55 55 #include <sys/stropts.h>
56 56 #include <sys/errno.h>
57 57 #include <sys/kmem.h>
58 58 #include <sys/debug.h>
59 59 #include <sys/pathname.h>
60 60 #include <sys/kstat.h>
61 61 #include <sys/t_lock.h>
62 62 #include <sys/ddi.h>
63 63 #include <sys/cmn_err.h>
64 64 #include <sys/time.h>
65 65 #include <sys/isa_defs.h>
66 66 #include <sys/callb.h>
67 67 #include <sys/sunddi.h>
68 68 #include <sys/sunndi.h>
69 69 #include <sys/sdt.h>
70 70 #include <sys/ib/ibtl/ibti.h>
71 71 #include <rpc/rpc.h>
72 72 #include <rpc/ib.h>
73 73 #include <sys/modctl.h>
74 74 #include <sys/kstr.h>
75 75 #include <sys/sockio.h>
76 76 #include <sys/vnode.h>
77 77 #include <sys/tiuser.h>
78 78 #include <net/if.h>
79 79 #include <net/if_types.h>
80 80 #include <sys/cred.h>
81 81 #include <rpc/rpc_rdma.h>
82 82 #include <nfs/nfs.h>
83 83 #include <sys/atomic.h>
84 84
85 85 #define NFS_RDMA_PORT 20049
86 86
87 87
88 88 /*
89 89 * Convenience structures for connection management
90 90 */
91 91 typedef struct rpcib_ipaddrs {
92 92 void *ri_list; /* pointer to list of addresses */
93 93 uint_t ri_count; /* number of addresses in list */
94 94 uint_t ri_size; /* size of ri_list in bytes */
95 95 } rpcib_ipaddrs_t;
96 96
97 97
98 98 typedef struct rpcib_ping {
99 99 rib_hca_t *hca;
100 100 ibt_path_info_t path;
101 101 ibt_ip_addr_t srcip;
102 102 ibt_ip_addr_t dstip;
103 103 } rpcib_ping_t;
104 104
105 105 /*
106 106 * Prototype declarations for driver ops
107 107 */
108 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 110 void *, void **);
111 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 113 static int rpcib_do_ip_ioctl(int, int, void *);
114 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 116 static void rib_force_cleanup(void *);
117 117 static void rib_stop_hca_services(rib_hca_t *);
118 118 static void rib_attach_hca(void);
119 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 120 struct netbuf *d_svcaddr, CONN **conn);
121 121
122 122 struct {
123 123 kstat_named_t cache_limit;
124 124 kstat_named_t cache_allocation;
125 125 kstat_named_t cache_hits;
126 126 kstat_named_t cache_misses;
127 127 kstat_named_t cache_misses_above_the_limit;
128 128 } rpcib_kstat = {
129 129 {"cache_limit", KSTAT_DATA_UINT64 },
130 130 {"cache_allocation", KSTAT_DATA_UINT64 },
131 131 {"cache_hits", KSTAT_DATA_UINT64 },
132 132 {"cache_misses", KSTAT_DATA_UINT64 },
133 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
134 134 };
135 135
136 136 /* rpcib cb_ops */
137 137 static struct cb_ops rpcib_cbops = {
138 138 nulldev, /* open */
139 139 nulldev, /* close */
140 140 nodev, /* strategy */
141 141 nodev, /* print */
142 142 nodev, /* dump */
143 143 nodev, /* read */
144 144 nodev, /* write */
145 145 nodev, /* ioctl */
146 146 nodev, /* devmap */
147 147 nodev, /* mmap */
148 148 nodev, /* segmap */
149 149 nochpoll, /* poll */
150 150 ddi_prop_op, /* prop_op */
151 151 NULL, /* stream */
152 152 D_MP, /* cb_flag */
153 153 CB_REV, /* rev */
154 154 nodev, /* int (*cb_aread)() */
155 155 nodev /* int (*cb_awrite)() */
156 156 };
157 157
158 158 /*
159 159 * Device options
160 160 */
161 161 static struct dev_ops rpcib_ops = {
162 162 DEVO_REV, /* devo_rev, */
163 163 0, /* refcnt */
164 164 rpcib_getinfo, /* info */
165 165 nulldev, /* identify */
166 166 nulldev, /* probe */
167 167 rpcib_attach, /* attach */
168 168 rpcib_detach, /* detach */
169 169 nodev, /* reset */
170 170 &rpcib_cbops, /* driver ops - devctl interfaces */
171 171 NULL, /* bus operations */
172 172 NULL, /* power */
173 173 ddi_quiesce_not_needed, /* quiesce */
174 174 };
175 175
176 176 /*
177 177 * Module linkage information.
178 178 */
179 179
180 180 static struct modldrv rib_modldrv = {
181 181 &mod_driverops, /* Driver module */
182 182 "RPCIB plugin driver", /* Driver name and version */
183 183 &rpcib_ops, /* Driver ops */
184 184 };
185 185
186 186 static struct modlinkage rib_modlinkage = {
187 187 MODREV_1,
188 188 (void *)&rib_modldrv,
189 189 NULL
190 190 };
191 191
192 192 typedef struct rib_lrc_entry {
193 193 struct rib_lrc_entry *forw;
194 194 struct rib_lrc_entry *back;
195 195 char *lrc_buf;
196 196
197 197 uint32_t lrc_len;
198 198 void *avl_node;
199 199 bool_t registered;
200 200
201 201 struct mrc lrc_mhandle;
202 202 bool_t lrc_on_freed_list;
203 203 } rib_lrc_entry_t;
204 204
205 205 typedef struct cache_struct {
206 206 rib_lrc_entry_t r;
207 207 uint32_t len;
208 208 uint32_t elements;
209 209 kmutex_t node_lock;
210 210 avl_node_t avl_link;
211 211 } cache_avl_struct_t;
212 212
213 213 uint64_t cache_limit = 100 * 1024 * 1024;
214 214 static uint64_t cache_watermark = 80 * 1024 * 1024;
215 215 static bool_t stats_enabled = FALSE;
216 216
217 217 static uint64_t max_unsignaled_rws = 5;
218 218 int nfs_rdma_port = NFS_RDMA_PORT;
219 219
220 220 #define RIBNETID_TCP "tcp"
221 221 #define RIBNETID_TCP6 "tcp6"
222 222
223 223 /*
224 224 * rib_stat: private data pointer used when registering
225 225 * with the IBTF. It is returned to the consumer
226 226 * in all callbacks.
227 227 */
228 228 static rpcib_state_t *rib_stat = NULL;
229 229
230 230 #define RNR_RETRIES IBT_RNR_RETRY_1
231 231 #define MAX_PORTS 2
232 232 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D
233 233 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */
234 234
235 235 int preposted_rbufs = RDMA_BUFS_GRANT;
236 236 int send_threshold = 1;
237 237
238 238 /*
239 239 * Old cards with Tavor driver have limited memory footprint
240 240 * when booted in 32bit. The rib_max_rbufs tunable can be
241 241 * tuned for more buffers if needed.
242 242 */
243 243
244 244 #if !defined(_ELF64) && !defined(__sparc)
245 245 int rib_max_rbufs = MAX_BUFS;
246 246 #else
247 247 int rib_max_rbufs = 10 * MAX_BUFS;
248 248 #endif /* !(_ELF64) && !(__sparc) */
249 249
250 250 int rib_conn_timeout = 60 * 12; /* 12 minutes */
251 251
252 252 /*
253 253 * State of the plugin.
254 254 * ACCEPT = accepting new connections and requests.
255 255 * NO_ACCEPT = not accepting new connection and requests.
256 256 * This should eventually move to rpcib_state_t structure, since this
257 257 * will tell in which state the plugin is for a particular type of service
258 258 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
259 259 * state for one and in no_accept state for the other.
260 260 */
261 261 int plugin_state;
262 262 kmutex_t plugin_state_lock;
263 263
264 264 ldi_ident_t rpcib_li;
265 265
266 266 /*
267 267 * RPCIB RDMATF operations
268 268 */
269 269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
270 270 static rdma_stat rib_disconnect(CONN *conn);
271 271 static void rib_listen(struct rdma_svc_data *rd);
272 272 static void rib_listen_stop(struct rdma_svc_data *rd);
273 273 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
274 274 uint_t buflen, struct mrc *buf_handle);
275 275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
276 276 struct mrc buf_handle);
277 277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
278 278 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
279 279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
280 280 struct mrc buf_handle);
281 281 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
282 282 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
283 283 void *lrc);
284 284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
285 285 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
286 286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
287 287 caddr_t buf, int len, int cpu);
288 288
289 289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
290 290
291 291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
292 292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
293 293
294 294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
295 295
296 296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
297 297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
299 299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
300 300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
301 301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
302 302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
303 303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
304 304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
305 305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
306 306 int addr_type, void *, CONN **);
307 307 static rdma_stat rib_conn_release(CONN *conn);
308 308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
309 309 rpcib_ping_t *, CONN **);
310 310 static rdma_stat rib_getinfo(rdma_info_t *info);
311 311
312 312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
313 313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
314 314 static void rib_destroy_cache(rib_hca_t *hca);
315 315 static void rib_server_side_cache_reclaim(void *argp);
316 316 static int avl_compare(const void *t1, const void *t2);
317 317
318 318 static void rib_stop_services(rib_hca_t *);
319 319 static void rib_close_channels(rib_conn_list_t *);
320 320 static void rib_conn_close(void *);
321 321 static void rib_recv_rele(rib_qp_t *);
322 322 static rdma_stat rib_conn_release_locked(CONN *conn);
323 323
324 324 /*
325 325 * RPCIB addressing operations
326 326 */
327 327
328 328 /*
329 329 * RDMA operations the RPCIB module exports
330 330 */
331 331 static rdmaops_t rib_ops = {
332 332 rib_reachable,
333 333 rib_conn_get,
334 334 rib_conn_release,
335 335 rib_listen,
336 336 rib_listen_stop,
337 337 rib_registermem,
338 338 rib_deregistermem,
339 339 rib_registermemsync,
340 340 rib_deregistermemsync,
341 341 rib_syncmem,
342 342 rib_reg_buf_alloc,
343 343 rib_reg_buf_free,
344 344 rib_send,
345 345 rib_send_resp,
346 346 rib_post_resp,
347 347 rib_post_resp_remove,
348 348 rib_post_recv,
349 349 rib_recv,
350 350 rib_read,
351 351 rib_write,
352 352 rib_getinfo,
353 353 };
354 354
355 355 /*
356 356 * RDMATF RPCIB plugin details
357 357 */
358 358 static rdma_mod_t rib_mod = {
359 359 "ibtf", /* api name */
360 360 RDMATF_VERS_1,
361 361 0,
362 362 &rib_ops, /* rdma op vector for ibtf */
363 363 };
364 364
365 365 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
366 366 static rdma_stat rib_qp_init(rib_qp_t *, int);
367 367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
368 368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
369 369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
370 370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
371 371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
372 372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
373 373 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
374 374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
375 375 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
376 376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
377 377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
378 378 rib_qp_t **);
379 379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
380 380 rib_qp_t **);
381 381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
382 382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
383 383 static int rib_free_sendwait(struct send_wid *);
384 384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
385 385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
386 386 static void rdma_done_rem_list(rib_qp_t *);
387 387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
388 388
389 389 static void rib_async_handler(void *,
390 390 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
391 391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
392 392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
393 393 static int rib_free_svc_recv(struct svc_recv *);
394 394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
395 395 static void rib_free_wid(struct recv_wid *);
396 396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
397 397 static void rib_detach_hca(ibt_hca_hdl_t);
398 398 static void rib_close_a_channel(CONN *);
399 399 static void rib_send_hold(rib_qp_t *);
400 400 static void rib_send_rele(rib_qp_t *);
401 401
402 402 /*
403 403 * Registration with IBTF as a consumer
404 404 */
405 405 static struct ibt_clnt_modinfo_s rib_modinfo = {
406 406 IBTI_V_CURR,
407 407 IBT_GENERIC,
408 408 rib_async_handler, /* async event handler */
409 409 NULL, /* Memory Region Handler */
410 410 "nfs/ib"
411 411 };
412 412
413 413 /*
414 414 * Global structure
415 415 */
416 416
417 417 typedef struct rpcib_s {
418 418 dev_info_t *rpcib_dip;
419 419 kmutex_t rpcib_mutex;
420 420 } rpcib_t;
421 421
422 422 rpcib_t rpcib;
423 423
424 424 /*
425 425 * /etc/system controlled variable to control
426 426 * debugging in rpcib kernel module.
427 427 * Set it to values greater than 1 to control
428 428 * the amount of debugging messages required.
429 429 */
430 430 int rib_debug = 0;
431 431
432 432 int
433 433 _init(void)
434 434 {
435 435 int error;
436 436
437 437 error = mod_install((struct modlinkage *)&rib_modlinkage);
438 438 if (error != 0) {
439 439 /*
440 440 * Could not load module
441 441 */
442 442 return (error);
443 443 }
444 444 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
445 445 return (0);
446 446 }
447 447
448 448 int
449 449 _fini()
450 450 {
451 451 int status;
452 452
453 453 /*
454 454 * Remove module
455 455 */
456 456 if ((status = mod_remove(&rib_modlinkage)) != 0) {
457 457 return (status);
458 458 }
459 459 mutex_destroy(&plugin_state_lock);
460 460 return (0);
461 461 }
462 462
/*
 * _info()
 *
 * Report module information through the standard modinfo mechanism.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&rib_modlinkage, modinfop));
}
468 468
469 469 /*
470 470 * rpcib_getinfo()
471 471 * Given the device number, return the devinfo pointer or the
472 472 * instance number.
473 473 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
474 474 */
475 475
476 476 /*ARGSUSED*/
477 477 static int
478 478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
479 479 {
480 480 int ret = DDI_SUCCESS;
481 481
482 482 switch (cmd) {
483 483 case DDI_INFO_DEVT2DEVINFO:
484 484 if (rpcib.rpcib_dip != NULL)
485 485 *result = rpcib.rpcib_dip;
486 486 else {
487 487 *result = NULL;
488 488 ret = DDI_FAILURE;
489 489 }
490 490 break;
491 491
492 492 case DDI_INFO_DEVT2INSTANCE:
493 493 *result = NULL;
494 494 break;
495 495
496 496 default:
497 497 ret = DDI_FAILURE;
498 498 }
499 499 return (ret);
500 500 }
501 501
/*
 * rpcib_free_hca_list()
 *
 * Detach every HCA on the global list and free its resources.  The
 * list head is unhooked under the writer lock first, so that the
 * per-HCA teardown (which may block in rib_stop_hca_services()) runs
 * without holding hcas_list_lock.
 */
static void
rpcib_free_hca_list()
{
	rib_hca_t *hca, *hcap;

	/* Atomically take ownership of the entire list. */
	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
	hca = rib_stat->hcas_list;
	rib_stat->hcas_list = NULL;
	rw_exit(&rib_stat->hcas_list_lock);
	while (hca != NULL) {
		rw_enter(&hca->state_lock, RW_WRITER);
		hcap = hca;
		hca = hca->next;
		/*
		 * NOTE(review): nhca_inited and rdma_count are updated
		 * after hcas_list_lock is dropped -- presumably callers
		 * (attach/detach) serialize this path; verify.
		 */
		rib_stat->nhca_inited--;
		rib_mod.rdma_count--;
		/* Mark detached before stopping services on this HCA. */
		hcap->state = HCA_DETACHED;
		rw_exit(&hcap->state_lock);
		rib_stop_hca_services(hcap);

		kmem_free(hcap, sizeof (*hcap));
	}
}
524 524
/*
 * rpcib_free_service_list()
 *
 * Unbind and deregister every IB service on the global service list,
 * freeing each entry as it is torn down.  On any IBTF failure the
 * function returns RDMA_FAILED with the list only partially freed
 * (the failed entry remains at the head).  Returns RDMA_SUCCESS when
 * the list has been fully drained.
 */
static rdma_stat
rpcib_free_service_list()
{
	rib_service_t *service;
	ibt_status_t ret;

	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
	while (rib_stat->service_list != NULL) {
		service = rib_stat->service_list;
		/* Drop all port/address bindings before deregistering. */
		ret = ibt_unbind_all_services(service->srv_hdl);
		if (ret != IBT_SUCCESS) {
			rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
			cmn_err(CE_NOTE, "rpcib_free_service_list: "
			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
#endif
			return (RDMA_FAILED);
		}
		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
		    service->srv_hdl);
		if (ret != IBT_SUCCESS) {
			rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
			cmn_err(CE_NOTE, "rpcib_free_service_list: "
			    "ibt_deregister_service failed (%d)\n", (int)ret);
#endif
			return (RDMA_FAILED);
		}
		/* Entry fully deregistered; unlink and free it. */
		rib_stat->service_list = service->next;
		kmem_free(service, sizeof (rib_service_t));
	}
	rw_exit(&rib_stat->service_list_lock);

	return (RDMA_SUCCESS);
}
560 560
/*
 * rpcib_attach()
 *
 * Driver attach entry point.  For DDI_ATTACH: create the "rpcib"
 * minor node, allocate and initialize the global rib_stat state,
 * register with IBTF, open all HCAs, and finally register this
 * plugin with the RDMATF framework.  DDI_RESUME is a no-op success;
 * all other commands fail.
 */
static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	ibt_status_t	ibt_status;
	rdma_stat	r_status;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	/*
	 * NOTE(review): rpcib_mutex is initialized on every attach
	 * attempt, including when the device turns out to be attached
	 * already -- presumably attach calls are serialized by the DDI
	 * framework; verify.
	 */
	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

	mutex_enter(&rpcib.rpcib_mutex);
	if (rpcib.rpcib_dip != NULL) {
		/* Only a single instance of this driver is supported. */
		mutex_exit(&rpcib.rpcib_mutex);
		return (DDI_FAILURE);
	}
	rpcib.rpcib_dip = dip;
	mutex_exit(&rpcib.rpcib_mutex);
	/*
	 * Create the "rpcib" minor-node.
	 */
	if (ddi_create_minor_node(dip,
	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
		/* Error message, no cmn_err as they print on console */
		return (DDI_FAILURE);
	}

	/* Allocate the global state on first attach only. */
	if (rib_stat == NULL) {
		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
	}

	/* Probe for the presence of at least one HCA (count only). */
	rib_stat->hca_count = ibt_get_hca_list(NULL);
	if (rib_stat->hca_count < 1) {
		mutex_destroy(&rib_stat->listen_lock);
		rw_destroy(&rib_stat->hcas_list_lock);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	/* Register as an IBTF consumer; rib_stat is passed to callbacks. */
	ibt_status = ibt_attach(&rib_modinfo, dip,
	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);

	if (ibt_status != IBT_SUCCESS) {
		mutex_destroy(&rib_stat->listen_lock);
		rw_destroy(&rib_stat->hcas_list_lock);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	rib_stat->service_list = NULL;
	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
	mutex_enter(&rib_stat->open_hca_lock);
	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
		mutex_exit(&rib_stat->open_hca_lock);
		goto open_fail;
	}
	mutex_exit(&rib_stat->open_hca_lock);

	/* Keep the driver loaded; HCA state is expensive to rebuild. */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
	    DDI_PROP_SUCCESS) {
		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
		    "failed.");
		goto register_fail;
	}

	/*
	 * Register with rdmatf
	 */
	r_status = rdma_register_mod(&rib_mod);
	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
		    "status = %d", r_status);
		goto register_fail;
	}

	return (DDI_SUCCESS);

register_fail:

open_fail:
	/* Unwind everything built above; both labels share the path. */
	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
	rpcib_free_hca_list();
	(void) rpcib_free_service_list();
	mutex_destroy(&rib_stat->listen_lock);
	rw_destroy(&rib_stat->hcas_list_lock);
	mutex_destroy(&rib_stat->open_hca_lock);
	rw_destroy(&rib_stat->service_list_lock);
	kmem_free(rib_stat, sizeof (*rib_stat));
	rib_stat = NULL;
	return (DDI_FAILURE);
}
665 665
/*
 * rpcib_detach()
 *
 * Driver detach entry point.  Moves the plugin to NO_ACCEPT, tears
 * down registered IB services and all HCA state, detaches from IBTF
 * and frees the global rib_stat structure.  DDI_SUSPEND is not
 * supported.  Fails (leaving state intact) if the service list
 * cannot be torn down.
 */
/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	switch (cmd) {

	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	/*
	 * Detach the hca and free resources
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	mutex_exit(&plugin_state_lock);

	/* Abort the detach if service teardown cannot complete. */
	if (rpcib_free_service_list() != RDMA_SUCCESS)
		return (DDI_FAILURE);
	rpcib_free_hca_list();

	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
	mutex_destroy(&rib_stat->listen_lock);
	rw_destroy(&rib_stat->hcas_list_lock);
	mutex_destroy(&rib_stat->open_hca_lock);
	rw_destroy(&rib_stat->service_list_lock);

	kmem_free(rib_stat, sizeof (*rib_stat));
	rib_stat = NULL;

	/* Clear the instance pointer last, then destroy its mutex. */
	mutex_enter(&rpcib.rpcib_mutex);
	rpcib.rpcib_dip = NULL;
	mutex_exit(&rpcib.rpcib_mutex);
	mutex_destroy(&rpcib.rpcib_mutex);
	return (DDI_SUCCESS);
}
706 706
707 707
708 708 static void rib_rbufpool_free(rib_hca_t *, int);
709 709 static void rib_rbufpool_deregister(rib_hca_t *, int);
710 710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
711 711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
712 712 static rdma_stat rib_rem_replylist(rib_qp_t *);
713 713 static int rib_remreply(rib_qp_t *, struct reply *);
714 714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
715 715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
716 716
717 717
718 718 /*
719 719 * One CQ pair per HCA
720 720 */
721 721 static rdma_stat
722 722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
723 723 rib_cq_t **cqp)
724 724 {
725 725 rib_cq_t *cq;
726 726 ibt_cq_attr_t cq_attr;
727 727 uint32_t real_size;
728 728 ibt_status_t status;
729 729 rdma_stat error = RDMA_SUCCESS;
730 730
731 731 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
732 732 cq->rib_hca = hca;
733 733 bzero(&cq_attr, sizeof (cq_attr));
734 734 cq_attr.cq_size = cq_size;
735 735 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
736 736 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
737 737 &real_size);
738 738 if (status != IBT_SUCCESS) {
739 739 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
740 740 " status=%d", status);
741 741 error = RDMA_FAILED;
742 742 goto fail;
743 743 }
744 744 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
745 745
746 746 /*
747 747 * Enable CQ callbacks. CQ Callbacks are single shot
748 748 * (e.g. you have to call ibt_enable_cq_notify()
749 749 * after each callback to get another one).
750 750 */
751 751 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
752 752 if (status != IBT_SUCCESS) {
753 753 cmn_err(CE_WARN, "rib_create_cq: "
754 754 "enable_cq_notify failed, status %d", status);
755 755 error = RDMA_FAILED;
756 756 goto fail;
757 757 }
758 758 *cqp = cq;
759 759
760 760 return (error);
761 761 fail:
762 762 if (cq->rib_cq_hdl)
763 763 (void) ibt_free_cq(cq->rib_cq_hdl);
764 764 if (cq)
765 765 kmem_free(cq, sizeof (rib_cq_t));
766 766 return (error);
767 767 }
768 768
769 769 /*
770 770 * rpcib_find_hca
771 771 *
772 772 * Caller should have already locked the hcas_lock before calling
773 773 * this function.
774 774 */
775 775 static rib_hca_t *
776 776 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
777 777 {
778 778 rib_hca_t *hca = ribstat->hcas_list;
779 779
780 780 while (hca && hca->hca_guid != guid)
781 781 hca = hca->next;
782 782
783 783 return (hca);
784 784 }
785 785
/*
 * rpcib_open_hcas()
 *
 * Enumerate all HCAs on the system and, for each GUID not already on
 * ribstat->hcas_list, open the HCA and set it up for RDMA: protection
 * domain, port info, two pairs of CQs (client and server, send and
 * receive), send/receive buffer pools, the server-side cache with its
 * kstats, the cleanup taskq, and the per-HCA locks.  Each fully
 * initialized HCA is pushed onto ribstat->hcas_list.
 *
 * Returns RDMA_SUCCESS if at least one new HCA was configured,
 * RDMA_FAILED otherwise.  Caller must hold ribstat->open_hca_lock
 * (asserted below), which also serializes the stats_enabled check.
 */
static rdma_stat
rpcib_open_hcas(rpcib_state_t *ribstat)
{
	rib_hca_t *hca;
	ibt_status_t ibt_status;
	rdma_stat status;
	ibt_hca_portinfo_t *pinfop;
	ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
	uint_t size, cq_size;
	int i;
	kstat_t *ksp;
	cache_avl_struct_t example_avl_node;
	char rssc_name[32];
	int old_nhca_inited = ribstat->nhca_inited;
	ib_guid_t *hca_guids;

	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));

	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
	if (ribstat->hca_count == 0)
		return (RDMA_FAILED);

	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
	/*
	 * Open a hca and setup for RDMA
	 */
	for (i = 0; i < ribstat->hca_count; i++) {
		/* Skip HCAs that were configured by a previous call. */
		if (rpcib_find_hca(ribstat, hca_guids[i]))
			continue;
		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);

		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
		    hca_guids[i], &hca->hca_hdl);
		if (ibt_status != IBT_SUCCESS) {
			kmem_free(hca, sizeof (rib_hca_t));
			continue;
		}
		hca->hca_guid = hca_guids[i];
		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
		hca->state = HCA_INITED;

		/*
		 * query HCA info
		 */
		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
		if (ibt_status != IBT_SUCCESS) {
			goto fail1;
		}

		/*
		 * One PD (Protection Domain) per HCA.
		 * A qp is allowed to access a memory region
		 * only when it's in the same PD as that of
		 * the memory region.
		 */
		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
		if (ibt_status != IBT_SUCCESS) {
			goto fail1;
		}

		/*
		 * query HCA ports
		 */
		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
		    0, &pinfop, &hca->hca_nports, &size);
		if (ibt_status != IBT_SUCCESS) {
			goto fail2;
		}
		hca->hca_ports = pinfop;
		hca->hca_pinfosz = size;
		pinfop = NULL;

		cq_size = DEF_CQ_SIZE; /* default cq size */
		/*
		 * Create 2 pairs of cq's (1 pair for client
		 * and the other pair for server) on this hca.
		 * If number of qp's gets too large, then several
		 * cq's will be needed.
		 */
		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
		    &hca->svc_rcq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
		    &hca->svc_scq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
		    &hca->clnt_rcq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
		    &hca->clnt_scq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		/*
		 * Create buffer pools.
		 * Note rib_rbuf_create also allocates memory windows.
		 */
		hca->recv_pool = rib_rbufpool_create(hca,
		    RECV_BUFFER, rib_max_rbufs);
		if (hca->recv_pool == NULL) {
			goto fail3;
		}

		hca->send_pool = rib_rbufpool_create(hca,
		    SEND_BUFFER, rib_max_rbufs);
		if (hca->send_pool == NULL) {
			rib_rbufpool_destroy(hca, RECV_BUFFER);
			goto fail3;
		}

		/* Cache is named per-GUID so multiple HCAs coexist. */
		if (hca->server_side_cache == NULL) {
			(void) sprintf(rssc_name,
			    "rib_srvr_cache_%llx",
			    (long long unsigned int) hca->hca_guid);
			hca->server_side_cache = kmem_cache_create(
			    rssc_name,
			    sizeof (cache_avl_struct_t), 0,
			    NULL,
			    NULL,
			    rib_server_side_cache_reclaim,
			    hca, NULL, 0);
		}

		/*
		 * The pointer difference below is the offset of
		 * avl_link within cache_avl_struct_t, computed from a
		 * stack exemplar (equivalent to offsetof()).
		 */
		avl_create(&hca->avl_tree,
		    avl_compare,
		    sizeof (cache_avl_struct_t),
		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
		    (uint_t)(uintptr_t)&example_avl_node);

		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
		rw_init(&hca->avl_rw_lock,
		    NULL, RW_DRIVER, hca->iblock);
		mutex_init(&hca->cache_allocation_lock,
		    NULL, MUTEX_DRIVER, NULL);
		hca->avl_init = TRUE;

		/* Create kstats for the cache */
		ASSERT(INGLOBALZONE(curproc));

		/* kstats are global; install them only once. */
		if (!stats_enabled) {
			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
			    KSTAT_TYPE_NAMED,
			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
			    GLOBAL_ZONEID);
			if (ksp) {
				ksp->ks_data = (void *) &rpcib_kstat;
				ksp->ks_update = rpcib_cache_kstat_update;
				kstat_install(ksp);
				stats_enabled = TRUE;
			}
		}
		if (hca->cleanup_helper == NULL) {
			char tq_name[sizeof (hca->hca_guid) * 2 + 1];

			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
			    (unsigned long long int) hca->hca_guid);
			hca->cleanup_helper = ddi_taskq_create(NULL,
			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
		}

		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
		hca->inuse = TRUE;

		/* Fully initialized; push onto the global list. */
		hca->next = ribstat->hcas_list;
		ribstat->hcas_list = hca;
		ribstat->nhca_inited++;
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
		continue;

fail3:
		/*
		 * NOTE(review): CQs already created by rib_create_cq()
		 * are not explicitly freed here -- presumably released
		 * when ibt_close_hca() tears down the HCA's resources;
		 * verify against the IBTF contract.
		 */
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
		(void) ibt_close_hca(hca->hca_hdl);
		kmem_free(hca, sizeof (rib_hca_t));
	}
	rw_exit(&ribstat->hcas_list_lock);
	ibt_free_hca_list(hca_guids, ribstat->hca_count);
	rib_mod.rdma_count = rib_stat->nhca_inited;

	/*
	 * return success if at least one new hca has been configured.
	 */
	if (ribstat->nhca_inited != old_nhca_inited)
		return (RDMA_SUCCESS);
	else
		return (RDMA_FAILED);
}
994 994
995 995 /*
996 996 * Callback routines
997 997 */
998 998
999 999 /*
1000 1000 * SCQ handlers
1001 1001 */
/*
 * Client-side send completion queue (SCQ) handler.
 *
 * Drains the send CQ until ibt_poll_cq() reports it empty.  For each
 * completed send work request (identified by its send_wid, stashed in
 * wc_id), the RDMA status is recorded and either the waiting poster is
 * woken, or — if nobody is waiting — the send buffers and the wait id
 * are freed here.  Completions posted with RDMA_DUMMY_WRID carry no
 * wait id and are skipped.
 */
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct send_wid	*wd;
	CONN		*conn;
	rib_qp_t	*qp;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != RDMA_DUMMY_WRID) {
			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
			qp = wd->qp;
			conn = qptoc(qp);

			mutex_enter(&wd->sendwait_lock);
			switch (wc.wc_status) {
			case IBT_WC_SUCCESS:
				wd->status = RDMA_SUCCESS;
				break;
			default:
/*
 *    RC Send Q Error Code		Local state     Remote State
 *    ====================		===========     ============
 *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
 *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
 *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
 *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
 *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
 *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
 *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
 *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
 *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
 *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
 *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
 */
				/*
				 * Channel in error state. Set connection to
				 * ERROR and cleanup will happen either from
				 * conn_release or from rib_conn_get
				 */
				wd->status = RDMA_FAILED;
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				break;
			}

			if (wd->cv_sig == 1) {
				/*
				 * Notify poster; the poster owns the
				 * buffers and the wait id from here on.
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}

				/* decrement the send ref count */
				rib_send_rele(qp);

				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}
1094 1094
/*
 * Server-side send completion queue (SCQ) handler.
 *
 * Mirror image of rib_clnt_scq_handler(): drains the send CQ, records
 * each send's completion status in its send_wid, and either wakes the
 * waiting poster or frees the buffers and wait id when nobody waits.
 * Completions posted with RDMA_DUMMY_WRID carry no wait id and are
 * skipped.
 */
/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct send_wid	*wd;
	rib_qp_t	*qp;
	CONN		*conn;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != RDMA_DUMMY_WRID) {
			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
			qp = wd->qp;
			conn = qptoc(qp);
			mutex_enter(&wd->sendwait_lock);

			switch (wc.wc_status) {
			case IBT_WC_SUCCESS:
				wd->status = RDMA_SUCCESS;
				break;
			default:
				/*
				 * Channel in error state. Set connection to
				 * ERROR and cleanup will happen either from
				 * conn_release or conn timeout.
				 */
				wd->status = RDMA_FAILED;
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				break;
			}

			if (wd->cv_sig == 1) {
				/*
				 * Update completion status and notify poster
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}

				/* decrement the send ref count */
				rib_send_rele(qp);

				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}
1172 1172
/*
 * RCQ handler
 *
 * Client-side receive completion queue handler.  Drains the receive CQ;
 * for each successfully received message it decodes the RPC/RDMA header
 * (xid, version, credits, op), validates the protocol version, then
 * matches the xid against the qp's replylist to hand the buffer to the
 * waiting RPC caller.  Unmatched or malformed messages are dropped and
 * their receive buffers returned to the pool.  On any completion path
 * the recv wait id is freed and the qp's receive refcount released.
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct recv_wid	*rwid;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
		qp = rwid->qp;

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op, find_xid = 0;
			struct reply	*r;
			CONN *conn = qptoc(qp);
			uint32_t rdma_credit = 0;

			xdrs = &inxdrs;
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
			    wc.wc_bytes_xfer, XDR_DECODE);
			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)rwid->addr;

			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			(void) xdr_u_int(xdrs, &vers);
			(void) xdr_u_int(xdrs, &rdma_credit);
			(void) xdr_u_int(xdrs, &op);
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version. Cannot
				 * interoperate.  Set connection to
				 * ERROR state and bail out.
				 */
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
				rib_free_wid(rwid);
				rib_recv_rele(qp);
				continue;
			}

			mutex_enter(&qp->replylist_lock);
			for (r = qp->replylist; r != NULL; r = r->next) {
				if (r->xid == xid) {
					find_xid = 1;
					switch (op) {
					case RDMA_MSG:
					case RDMA_NOMSG:
					case RDMA_MSGP:
						r->status = RDMA_SUCCESS;
						r->vaddr_cq = rwid->addr;
						r->bytes_xfer =
						    wc.wc_bytes_xfer;
						cv_signal(&r->wait_cv);
						break;
					default:
						/* unknown op: drop buffer */
						rib_rbuf_free(qptoc(qp),
						    RECV_BUFFER,
						    (void *)(uintptr_t)
						    rwid->addr);
						break;
					}
					break;
				}
			}
			mutex_exit(&qp->replylist_lock);
			if (find_xid == 0) {
				/* RPC caller not waiting for reply */

				DTRACE_PROBE1(rpcib__i__nomatchxid1,
				    int, xid);

				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
			}
		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
			CONN *conn = qptoc(qp);

			/*
			 * Connection being flushed. Just free
			 * the posted buffer
			 */
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)rwid->addr);
		} else {
			CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code		Local state     Remote State
 *  ====================		===========     ============
 *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR               None            None
 */
			/*
			 * Channel in error state. Set connection
			 * in ERROR state.
			 */
			mutex_enter(&conn->c_lock);
			if (conn->c_state != C_DISCONN_PEND)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)rwid->addr);
		}
		rib_free_wid(rwid);
		rib_recv_rele(qp);
	}
}
1311 1311
/* Server side */
/*
 * Server-side receive completion queue handler.
 *
 * Drains the receive CQ.  For each successfully received RPC/RDMA
 * message: decode and validate the header (xid, version, credits, op);
 * handle RDMA_DONE notifications by waking the thread blocked on that
 * xid; otherwise, when the plugin is accepting and the connection is
 * up, wrap the buffer in an mblk and queue it to krpc via
 * svc_queuereq().  Malformed or unwanted messages are dropped and
 * their receive buffers returned to the pool.  Every iteration
 * releases the qp's receive refcount and frees the svc_recv record.
 */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rdma_recv_data_t *rdp;
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct svc_recv	*s_recvp;
	CONN		*conn;
	mblk_t		*mp;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
		qp = s_recvp->qp;
		conn = qptoc(qp);

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op;
			uint32_t rdma_credit;

			xdrs = &inxdrs;
			/* s_recvp->vaddr stores data */
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
			    wc.wc_bytes_xfer, XDR_DECODE);

			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			if (!xdr_u_int(xdrs, &vers) ||
			    !xdr_u_int(xdrs, &rdma_credit) ||
			    !xdr_u_int(xdrs, &op)) {
				/* Truncated/garbled header: drop message. */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				XDR_DESTROY(xdrs);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version.
				 * Drop rpc rdma message.
				 */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			/*
			 * Is this for RDMA_DONE?
			 */
			if (op == RDMA_DONE) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				/*
				 * Wake up the thread waiting on
				 * a RDMA_DONE for xid
				 */
				mutex_enter(&qp->rdlist_lock);
				rdma_done_notify(qp, xid);
				mutex_exit(&qp->rdlist_lock);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}

			mutex_enter(&plugin_state_lock);
			mutex_enter(&conn->c_lock);
			if ((plugin_state == ACCEPT) &&
			    (conn->c_state == C_CONNECTED)) {
				conn->c_ref++;
				mutex_exit(&conn->c_lock);
				/* allocb can't fail; wait for memory */
				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
				    == NULL)
					(void) strwaitbuf(
					    sizeof (*rdp), BPRI_LO);
				/*
				 * Plugin is in accept state, hence the master
				 * transport queue for this is still accepting
				 * requests. Hence we can call svc_queuereq to
				 * queue this received msg.
				 */
				rdp = (rdma_recv_data_t *)mp->b_rptr;
				rdp->conn = conn;
				rdp->rpcmsg.addr =
				    (caddr_t)(uintptr_t)s_recvp->vaddr;
				rdp->rpcmsg.type = RECV_BUFFER;
				rdp->rpcmsg.len = wc.wc_bytes_xfer;
				rdp->status = wc.wc_status;
				mp->b_wptr += sizeof (*rdp);
				(void) svc_queuereq((queue_t *)rib_stat->q, mp,
				    FALSE);
				mutex_exit(&plugin_state_lock);
			} else {
				/*
				 * The master transport for this is going
				 * away and the queue is not accepting anymore
				 * requests for krpc, so don't do anything, just
				 * free the msg.
				 */
				mutex_exit(&conn->c_lock);
				mutex_exit(&plugin_state_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
			}
		} else {
			/* Completion error: just reclaim the buffer. */
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)s_recvp->vaddr);
		}
		rib_recv_rele(qp);
		(void) rib_free_svc_recv(s_recvp);
	}
}
1447 1447
/*
 * Open any newly available HCAs and (re)start the RDMA listening
 * endpoints.  Invoked from the async handler on IBT_HCA_ATTACH_EVENT;
 * serialized against other open attempts by rib_stat->open_hca_lock.
 */
static void
rib_attach_hca()
{
	mutex_enter(&rib_stat->open_hca_lock);
	(void) rpcib_open_hcas(rib_stat);
	rib_listen(NULL);
	mutex_exit(&rib_stat->open_hca_lock);
}
1456 1456
/*
 * IBTF asynchronous event handler.
 *
 * Handles DR events: IBT_HCA_ATTACH_EVENT opens the new HCA and starts
 * listeners; IBT_HCA_DETACH_EVENT tears the HCA down.  IBT_EVENT_PORT_UP
 * retries rib_listen() in case listening failed earlier because the port
 * was not yet up.  All remaining cases are DEBUG-only diagnostics; any
 * other event is deliberately ignored.
 */
/* ARGSUSED */
static void
rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
	ibt_async_code_t code, ibt_async_event_t *event)
{
	switch (code) {
	case IBT_HCA_ATTACH_EVENT:
		rib_attach_hca();
		break;
	case IBT_HCA_DETACH_EVENT:
		rib_detach_hca(hca_hdl);
#ifdef DEBUG
		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
#endif
		break;
	case IBT_EVENT_PORT_UP:
		/*
		 * A port is up. We should call rib_listen() since there is
		 * a chance that rib_listen() may have failed during
		 * rib_attach_hca() because the port had not been up yet.
		 */
		rib_listen(NULL);
#ifdef DEBUG
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
#endif
		break;
#ifdef DEBUG
	case IBT_EVENT_PATH_MIGRATED:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_EVENT_PATH_MIGRATED\n");
		break;
	case IBT_EVENT_SQD:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
		break;
	case IBT_EVENT_COM_EST:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
		break;
	case IBT_ERROR_CATASTROPHIC_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
		break;
	case IBT_ERROR_INVALID_REQUEST_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
		break;
	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
		break;
	case IBT_ERROR_PATH_MIGRATE_REQ:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
		break;
	case IBT_ERROR_CQ:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
		break;
	case IBT_ERROR_PORT_DOWN:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
		break;
	case IBT_ASYNC_OPAQUE1:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
		break;
	case IBT_ASYNC_OPAQUE2:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
		break;
	case IBT_ASYNC_OPAQUE3:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
		break;
	case IBT_ASYNC_OPAQUE4:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
		break;
#endif
	default:
		break;
	}
}
1536 1536
1537 1537 /*
1538 1538 * Client's reachable function.
1539 1539 */
1540 1540 static rdma_stat
1541 1541 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1542 1542 {
1543 1543 rdma_stat status;
1544 1544 rpcib_ping_t rpt;
1545 1545 struct netbuf saddr;
1546 1546 CONN *conn;
1547 1547
1548 1548 bzero(&saddr, sizeof (struct netbuf));
1549 1549 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1550 1550
1551 1551 if (status == RDMA_SUCCESS) {
1552 1552 *handle = (void *)rpt.hca;
1553 1553 /* release the reference */
1554 1554 (void) rib_conn_release(conn);
1555 1555 return (RDMA_SUCCESS);
1556 1556 } else {
1557 1557 *handle = NULL;
1558 1558 DTRACE_PROBE(rpcib__i__pingfailed);
1559 1559 return (RDMA_FAILED);
1560 1560 }
1561 1561 }
1562 1562
1563 1563 /* Client side qp creation */
1564 1564 static rdma_stat
1565 1565 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1566 1566 {
1567 1567 rib_qp_t *kqp = NULL;
1568 1568 CONN *conn;
1569 1569 rdma_clnt_cred_ctrl_t *cc_info;
1570 1570
1571 1571 ASSERT(qp != NULL);
1572 1572 *qp = NULL;
1573 1573
1574 1574 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1575 1575 conn = qptoc(kqp);
1576 1576 kqp->hca = hca;
1577 1577 kqp->rdmaconn.c_rdmamod = &rib_mod;
1578 1578 kqp->rdmaconn.c_private = (caddr_t)kqp;
1579 1579
1580 1580 kqp->mode = RIB_CLIENT;
1581 1581 kqp->chan_flags = IBT_BLOCKING;
1582 1582 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1583 1583 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1584 1584 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1585 1585 /*
1586 1586 * Initialize
1587 1587 */
1588 1588 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1589 1589 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1590 1590 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591 1591 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1592 1592 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593 1593 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594 1594 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1595 1595 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596 1596 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1597 1597 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1598 1598 /*
1599 1599 * Initialize the client credit control
1600 1600 * portion of the rdmaconn struct.
1601 1601 */
1602 1602 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1603 1603 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1604 1604 cc_info->clnt_cc_granted_ops = 0;
1605 1605 cc_info->clnt_cc_in_flight_ops = 0;
1606 1606 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1607 1607
1608 1608 *qp = kqp;
1609 1609 return (RDMA_SUCCESS);
1610 1610 }
1611 1611
1612 1612 /* Server side qp creation */
1613 1613 static rdma_stat
1614 1614 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1615 1615 {
1616 1616 rib_qp_t *kqp = NULL;
1617 1617 ibt_chan_sizes_t chan_sizes;
1618 1618 ibt_rc_chan_alloc_args_t qp_attr;
1619 1619 ibt_status_t ibt_status;
1620 1620 rdma_srv_cred_ctrl_t *cc_info;
1621 1621
1622 1622 *qp = NULL;
1623 1623
1624 1624 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1625 1625 kqp->hca = hca;
1626 1626 kqp->port_num = port;
1627 1627 kqp->rdmaconn.c_rdmamod = &rib_mod;
1628 1628 kqp->rdmaconn.c_private = (caddr_t)kqp;
1629 1629
1630 1630 /*
1631 1631 * Create the qp handle
1632 1632 */
1633 1633 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1634 1634 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1635 1635 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1636 1636 qp_attr.rc_pd = hca->pd_hdl;
1637 1637 qp_attr.rc_hca_port_num = port;
1638 1638 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1639 1639 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1640 1640 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1641 1641 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1642 1642 qp_attr.rc_clone_chan = NULL;
1643 1643 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1644 1644 qp_attr.rc_flags = IBT_WR_SIGNALED;
1645 1645
1646 1646 rw_enter(&hca->state_lock, RW_READER);
1647 1647 if (hca->state != HCA_DETACHED) {
1648 1648 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1649 1649 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1650 1650 &chan_sizes);
1651 1651 } else {
1652 1652 rw_exit(&hca->state_lock);
1653 1653 goto fail;
1654 1654 }
1655 1655 rw_exit(&hca->state_lock);
1656 1656
1657 1657 if (ibt_status != IBT_SUCCESS) {
1658 1658 DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1659 1659 int, ibt_status);
1660 1660 goto fail;
1661 1661 }
1662 1662
1663 1663 kqp->mode = RIB_SERVER;
1664 1664 kqp->chan_flags = IBT_BLOCKING;
1665 1665 kqp->q = q; /* server ONLY */
1666 1666
1667 1667 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1668 1668 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1669 1669 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1670 1670 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671 1671 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1672 1672 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 1673 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1674 1674 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675 1675 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1676 1676 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1677 1677 /*
1678 1678 * Set the private data area to qp to be used in callbacks
1679 1679 */
1680 1680 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1681 1681 kqp->rdmaconn.c_state = C_CONNECTED;
1682 1682
1683 1683 /*
1684 1684 * Initialize the server credit control
1685 1685 * portion of the rdmaconn struct.
1686 1686 */
1687 1687 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1688 1688 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1689 1689 cc_info->srv_cc_buffers_granted = preposted_rbufs;
1690 1690 cc_info->srv_cc_cur_buffers_used = 0;
1691 1691 cc_info->srv_cc_posted = preposted_rbufs;
1692 1692
1693 1693 *qp = kqp;
1694 1694
1695 1695 return (RDMA_SUCCESS);
1696 1696 fail:
1697 1697 if (kqp)
1698 1698 kmem_free(kqp, sizeof (rib_qp_t));
1699 1699
1700 1700 return (RDMA_FAILED);
1701 1701 }
1702 1702
/*
 * Client-side connection manager (CM) event handler.
 *
 * Only IBT_CM_EVENT_CONN_CLOSED is acted on.  Locally initiated closes
 * need no work here; for remote-initiated closes the connection is
 * moved to C_ERROR_CONN and either freed immediately (no references
 * outstanding) or flagged C_CLOSE_NOTNEEDED so the cleanup path frees
 * only the channel once the last reference is dropped.  Always returns
 * IBT_CM_ACCEPT.
 */
/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	rib_hca_t	*hca;

	hca = (rib_hca_t *)clnt_hdl;

	switch (event->cm_type) {

	/* got a connection close event */
	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN	*conn;
		rib_qp_t *qp;

		/* check reason why connection was closed */
		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases were
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				/* teardown already in progress */
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref is down to 0 already
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				rw_enter(&hca->state_lock, RW_READER);
				if (hca->state != HCA_DETACHED)
					(void) rib_disconnect_channel(conn,
					    &hca->cl_conn_list);
				rw_exit(&hca->state_lock);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
				    "(CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	default:
		break;
	}
	return (IBT_CM_ACCEPT);
}
1789 1789
1790 1790 /*
1791 1791 * Connect to the server.
1792 1792 */
1793 1793 rdma_stat
1794 1794 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1795 1795 {
1796 1796 ibt_chan_open_args_t chan_args; /* channel args */
1797 1797 ibt_chan_sizes_t chan_sizes;
1798 1798 ibt_rc_chan_alloc_args_t qp_attr;
1799 1799 ibt_status_t ibt_status;
1800 1800 ibt_rc_returns_t ret_args; /* conn reject info */
1801 1801 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
1802 1802 ibt_ip_cm_info_t ipcm_info;
1803 1803 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1804 1804
1805 1805
1806 1806 (void) bzero(&chan_args, sizeof (chan_args));
1807 1807 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1808 1808 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1809 1809
1810 1810 ipcm_info.src_addr.family = rptp->srcip.family;
1811 1811 switch (ipcm_info.src_addr.family) {
1812 1812 case AF_INET:
1813 1813 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1814 1814 break;
1815 1815 case AF_INET6:
1816 1816 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1817 1817 break;
1818 1818 }
1819 1819
1820 1820 ipcm_info.dst_addr.family = rptp->srcip.family;
1821 1821 switch (ipcm_info.dst_addr.family) {
1822 1822 case AF_INET:
1823 1823 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1824 1824 break;
1825 1825 case AF_INET6:
1826 1826 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1827 1827 break;
1828 1828 }
1829 1829
1830 1830 ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1831 1831
1832 1832 ibt_status = ibt_format_ip_private_data(&ipcm_info,
1833 1833 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1834 1834
1835 1835 if (ibt_status != IBT_SUCCESS) {
1836 1836 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1837 1837 return (-1);
1838 1838 }
1839 1839
1840 1840 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1841 1841 /* Alloc a RC channel */
1842 1842 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1843 1843 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1844 1844 qp_attr.rc_pd = hca->pd_hdl;
1845 1845 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1846 1846 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1847 1847 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1848 1848 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1849 1849 qp_attr.rc_clone_chan = NULL;
1850 1850 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1851 1851 qp_attr.rc_flags = IBT_WR_SIGNALED;
1852 1852
1853 1853 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1854 1854 chan_args.oc_path = &rptp->path;
1855 1855
1856 1856 chan_args.oc_cm_handler = rib_clnt_cm_handler;
1857 1857 chan_args.oc_cm_clnt_private = (void *)hca;
1858 1858 chan_args.oc_rdma_ra_out = 4;
1859 1859 chan_args.oc_rdma_ra_in = 4;
1860 1860 chan_args.oc_path_retry_cnt = 2;
1861 1861 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1862 1862 chan_args.oc_priv_data = cmp_ip_pvt;
1863 1863 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1864 1864
1865 1865 refresh:
1866 1866 rw_enter(&hca->state_lock, RW_READER);
1867 1867 if (hca->state != HCA_DETACHED) {
1868 1868 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1869 1869 IBT_ACHAN_NO_FLAGS,
1870 1870 &qp_attr, &qp->qp_hdl,
1871 1871 &chan_sizes);
1872 1872 } else {
1873 1873 rw_exit(&hca->state_lock);
1874 1874 return (RDMA_FAILED);
1875 1875 }
1876 1876 rw_exit(&hca->state_lock);
1877 1877
1878 1878 if (ibt_status != IBT_SUCCESS) {
1879 1879 DTRACE_PROBE1(rpcib__i_conntosrv,
1880 1880 int, ibt_status);
1881 1881 return (RDMA_FAILED);
1882 1882 }
1883 1883
1884 1884 /* Connect to the Server */
1885 1885 (void) bzero(&ret_args, sizeof (ret_args));
1886 1886 mutex_enter(&qp->cb_lock);
1887 1887 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1888 1888 IBT_BLOCKING, &chan_args, &ret_args);
1889 1889 if (ibt_status != IBT_SUCCESS) {
1890 1890 DTRACE_PROBE2(rpcib__i_openrctosrv,
1891 1891 int, ibt_status, int, ret_args.rc_status);
1892 1892
1893 1893 (void) ibt_free_channel(qp->qp_hdl);
1894 1894 qp->qp_hdl = NULL;
1895 1895 mutex_exit(&qp->cb_lock);
1896 1896 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1897 1897 ret_args.rc_status == IBT_CM_CONN_STALE) {
1898 1898 /*
1899 1899 * Got IBT_CM_CONN_STALE probably because of stale
1900 1900 * data on the passive end of a channel that existed
1901 1901 * prior to reboot. Retry establishing a channel
1902 1902 * REFRESH_ATTEMPTS times, during which time the
1903 1903 * stale conditions on the server might clear up.
1904 1904 */
1905 1905 goto refresh;
1906 1906 }
1907 1907 return (RDMA_FAILED);
1908 1908 }
1909 1909 mutex_exit(&qp->cb_lock);
1910 1910 /*
1911 1911 * Set the private data area to qp to be used in callbacks
1912 1912 */
1913 1913 ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1914 1914 return (RDMA_SUCCESS);
1915 1915 }
1916 1916
1917 1917 rdma_stat
1918 1918 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1919 1919 {
1920 1920 uint_t i, addr_count;
1921 1921 ibt_status_t ibt_status;
1922 1922 uint8_t num_paths_p;
1923 1923 ibt_ip_path_attr_t ipattr;
1924 1924 ibt_path_ip_src_t srcip;
1925 1925 rpcib_ipaddrs_t addrs4;
1926 1926 rpcib_ipaddrs_t addrs6;
1927 1927 struct sockaddr_in *sinp;
1928 1928 struct sockaddr_in6 *sin6p;
1929 1929 rdma_stat retval = RDMA_FAILED;
1930 1930 rib_hca_t *hca;
1931 1931
1932 1932 if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1933 1933 return (RDMA_INVAL);
1934 1934 ASSERT(raddr->buf != NULL);
1935 1935
1936 1936 bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1937 1937
1938 1938 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1939 1939 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1940 1940 retval = RDMA_FAILED;
1941 1941 goto done2;
1942 1942 }
1943 1943
1944 1944 if (addr_type == AF_INET) {
1945 1945 addr_count = addrs4.ri_count;
1946 1946 sinp = (struct sockaddr_in *)raddr->buf;
1947 1947 rptp->dstip.family = AF_INET;
1948 1948 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1949 1949 sinp = addrs4.ri_list;
1950 1950 } else {
1951 1951 addr_count = addrs6.ri_count;
1952 1952 sin6p = (struct sockaddr_in6 *)raddr->buf;
1953 1953 rptp->dstip.family = AF_INET6;
1954 1954 rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1955 1955 sin6p = addrs6.ri_list;
1956 1956 }
1957 1957
1958 1958 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1959 1959 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1960 1960 rw_enter(&hca->state_lock, RW_READER);
1961 1961 if (hca->state == HCA_DETACHED) {
1962 1962 rw_exit(&hca->state_lock);
1963 1963 continue;
1964 1964 }
1965 1965
1966 1966 ipattr.ipa_dst_ip = &rptp->dstip;
1967 1967 ipattr.ipa_hca_guid = hca->hca_guid;
1968 1968 ipattr.ipa_ndst = 1;
1969 1969 ipattr.ipa_max_paths = 1;
1970 1970 ipattr.ipa_src_ip.family = rptp->dstip.family;
1971 1971 for (i = 0; i < addr_count; i++) {
1972 1972 num_paths_p = 0;
1973 1973 if (addr_type == AF_INET) {
1974 1974 ipattr.ipa_src_ip.un.ip4addr =
1975 1975 sinp[i].sin_addr.s_addr;
1976 1976 } else {
1977 1977 ipattr.ipa_src_ip.un.ip6addr =
1978 1978 sin6p[i].sin6_addr;
1979 1979 }
1980 1980 bzero(&srcip, sizeof (ibt_path_ip_src_t));
1981 1981
1982 1982 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1983 1983 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1984 1984 &num_paths_p, &srcip);
1985 1985 if (ibt_status == IBT_SUCCESS &&
1986 1986 num_paths_p != 0 &&
1987 1987 rptp->path.pi_hca_guid == hca->hca_guid) {
1988 1988 rptp->hca = hca;
1989 1989 rw_exit(&hca->state_lock);
1990 1990 if (addr_type == AF_INET) {
1991 1991 rptp->srcip.family = AF_INET;
1992 1992 rptp->srcip.un.ip4addr =
1993 1993 srcip.ip_primary.un.ip4addr;
1994 1994 } else {
1995 1995 rptp->srcip.family = AF_INET6;
1996 1996 rptp->srcip.un.ip6addr =
1997 1997 srcip.ip_primary.un.ip6addr;
1998 1998
1999 1999 }
2000 2000 retval = RDMA_SUCCESS;
2001 2001 goto done1;
2002 2002 }
2003 2003 }
2004 2004 rw_exit(&hca->state_lock);
2005 2005 }
2006 2006 done1:
2007 2007 rw_exit(&rib_stat->hcas_list_lock);
2008 2008 done2:
2009 2009 if (addrs4.ri_size > 0)
2010 2010 kmem_free(addrs4.ri_list, addrs4.ri_size);
2011 2011 if (addrs6.ri_size > 0)
2012 2012 kmem_free(addrs6.ri_list, addrs6.ri_size);
2013 2013 return (retval);
2014 2014 }
2015 2015
2016 2016 /*
2017 2017 * Close channel, remove from connection list and
2018 2018 * free up resources allocated for that channel.
2019 2019 */
2020 2020 rdma_stat
2021 2021 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2022 2022 {
2023 2023 rib_qp_t *qp = ctoqp(conn);
2024 2024 rib_hca_t *hca;
2025 2025
2026 2026 mutex_enter(&conn->c_lock);
2027 2027 if (conn->c_timeout != NULL) {
2028 2028 mutex_exit(&conn->c_lock);
2029 2029 (void) untimeout(conn->c_timeout);
2030 2030 mutex_enter(&conn->c_lock);
2031 2031 }
2032 2032
2033 2033 while (conn->c_flags & C_CLOSE_PENDING) {
2034 2034 cv_wait(&conn->c_cv, &conn->c_lock);
2035 2035 }
2036 2036 mutex_exit(&conn->c_lock);
2037 2037
2038 2038 /*
2039 2039 * c_ref == 0 and connection is in C_DISCONN_PEND
2040 2040 */
2041 2041 hca = qp->hca;
2042 2042 if (conn_list != NULL)
2043 2043 (void) rib_rm_conn(conn, conn_list);
2044 2044
2045 2045 /*
2046 2046 * There is only one case where we get here with
2047 2047 * qp_hdl = NULL, which is during connection setup on
2048 2048 * the client. In such a case there are no posted
2049 2049 * send/recv buffers.
2050 2050 */
2051 2051 if (qp->qp_hdl != NULL) {
2052 2052 mutex_enter(&qp->posted_rbufs_lock);
2053 2053 while (qp->n_posted_rbufs)
2054 2054 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2055 2055 mutex_exit(&qp->posted_rbufs_lock);
2056 2056
2057 2057 mutex_enter(&qp->send_rbufs_lock);
2058 2058 while (qp->n_send_rbufs)
2059 2059 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2060 2060 mutex_exit(&qp->send_rbufs_lock);
2061 2061
2062 2062 (void) ibt_free_channel(qp->qp_hdl);
2063 2063 qp->qp_hdl = NULL;
2064 2064 }
2065 2065
2066 2066 ASSERT(qp->rdlist == NULL);
2067 2067
2068 2068 if (qp->replylist != NULL) {
2069 2069 (void) rib_rem_replylist(qp);
2070 2070 }
2071 2071
2072 2072 cv_destroy(&qp->cb_conn_cv);
2073 2073 cv_destroy(&qp->posted_rbufs_cv);
2074 2074 cv_destroy(&qp->send_rbufs_cv);
2075 2075 mutex_destroy(&qp->cb_lock);
2076 2076 mutex_destroy(&qp->replylist_lock);
2077 2077 mutex_destroy(&qp->posted_rbufs_lock);
2078 2078 mutex_destroy(&qp->send_rbufs_lock);
2079 2079 mutex_destroy(&qp->rdlist_lock);
2080 2080
2081 2081 cv_destroy(&conn->c_cv);
2082 2082 mutex_destroy(&conn->c_lock);
2083 2083
2084 2084 if (conn->c_raddr.buf != NULL) {
2085 2085 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2086 2086 }
2087 2087 if (conn->c_laddr.buf != NULL) {
2088 2088 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2089 2089 }
2090 2090 if (conn->c_netid != NULL) {
2091 2091 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2092 2092 }
2093 2093 if (conn->c_addrmask.buf != NULL) {
2094 2094 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2095 2095 }
2096 2096
2097 2097 /*
2098 2098 * Credit control cleanup.
2099 2099 */
2100 2100 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2101 2101 rdma_clnt_cred_ctrl_t *cc_info;
2102 2102 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2103 2103 cv_destroy(&cc_info->clnt_cc_cv);
2104 2104 }
2105 2105
2106 2106 kmem_free(qp, sizeof (rib_qp_t));
2107 2107
2108 2108 /*
2109 2109 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2110 2110 * then the hca is no longer being used.
2111 2111 */
2112 2112 if (conn_list != NULL) {
2113 2113 rw_enter(&hca->state_lock, RW_READER);
2114 2114 if (hca->state == HCA_DETACHED) {
2115 2115 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2116 2116 if (hca->srv_conn_list.conn_hd == NULL) {
2117 2117 rw_enter(&hca->cl_conn_list.conn_lock,
2118 2118 RW_READER);
2119 2119
2120 2120 if (hca->cl_conn_list.conn_hd == NULL) {
2121 2121 mutex_enter(&hca->inuse_lock);
2122 2122 hca->inuse = FALSE;
2123 2123 cv_signal(&hca->cb_cv);
2124 2124 mutex_exit(&hca->inuse_lock);
2125 2125 }
2126 2126 rw_exit(&hca->cl_conn_list.conn_lock);
2127 2127 }
2128 2128 rw_exit(&hca->srv_conn_list.conn_lock);
2129 2129 }
2130 2130 rw_exit(&hca->state_lock);
2131 2131 }
2132 2132
2133 2133 return (RDMA_SUCCESS);
2134 2134 }
2135 2135
2136 2136 /*
2137 2137 * All sends are done under the protection of
2138 2138 * the wdesc->sendwait_lock. n_send_rbufs count
2139 2139 * is protected using the send_rbufs_lock.
2140 2140 * lock ordering is:
2141 2141 * sendwait_lock -> send_rbufs_lock
2142 2142 */
2143 2143
2144 2144 void
2145 2145 rib_send_hold(rib_qp_t *qp)
2146 2146 {
2147 2147 mutex_enter(&qp->send_rbufs_lock);
2148 2148 qp->n_send_rbufs++;
2149 2149 mutex_exit(&qp->send_rbufs_lock);
2150 2150 }
2151 2151
2152 2152 void
2153 2153 rib_send_rele(rib_qp_t *qp)
2154 2154 {
2155 2155 mutex_enter(&qp->send_rbufs_lock);
2156 2156 qp->n_send_rbufs--;
2157 2157 if (qp->n_send_rbufs == 0)
2158 2158 cv_signal(&qp->send_rbufs_cv);
2159 2159 mutex_exit(&qp->send_rbufs_lock);
2160 2160 }
2161 2161
2162 2162 void
2163 2163 rib_recv_rele(rib_qp_t *qp)
2164 2164 {
2165 2165 mutex_enter(&qp->posted_rbufs_lock);
2166 2166 qp->n_posted_rbufs--;
2167 2167 if (qp->n_posted_rbufs == 0)
2168 2168 cv_signal(&qp->posted_rbufs_cv);
2169 2169 mutex_exit(&qp->posted_rbufs_lock);
2170 2170 }
2171 2171
2172 2172 /*
2173 2173 * Wait for a send completion notification. The send_wid is freed
2174 2174 * only after a completion notification is received, whether it
2175 2175 * indicates success or an error.
2176 2176 */
2177 2177 static rdma_stat
2178 2178 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2179 2179 {
↓ open down ↓ |
2179 lines elided |
↑ open up ↑ |
2180 2180 clock_t timout, cv_wait_ret;
2181 2181 rdma_stat error = RDMA_SUCCESS;
2182 2182 int i;
2183 2183
2184 2184 /*
2185 2185 * Wait for send to complete
2186 2186 */
2187 2187 ASSERT(wd != NULL);
2188 2188 mutex_enter(&wd->sendwait_lock);
2189 2189 if (wd->status == (uint_t)SEND_WAIT) {
2190 - timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2190 + timout = drv_sectohz(SEND_WAIT_TIME) +
2191 2191 ddi_get_lbolt();
2192 2192
2193 2193 if (qp->mode == RIB_SERVER) {
2194 2194 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2195 2195 &wd->sendwait_lock, timout)) > 0 &&
2196 2196 wd->status == (uint_t)SEND_WAIT)
2197 2197 ;
2198 2198 switch (cv_wait_ret) {
2199 2199 case -1: /* timeout */
2200 2200 DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2201 2201
2202 2202 wd->cv_sig = 0; /* no signal needed */
2203 2203 error = RDMA_TIMEDOUT;
2204 2204 break;
2205 2205 default: /* got send completion */
2206 2206 break;
2207 2207 }
2208 2208 } else {
2209 2209 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2210 2210 &wd->sendwait_lock, timout)) > 0 &&
2211 2211 wd->status == (uint_t)SEND_WAIT)
2212 2212 ;
2213 2213 switch (cv_wait_ret) {
2214 2214 case -1: /* timeout */
2215 2215 DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2216 2216
2217 2217 wd->cv_sig = 0; /* no signal needed */
2218 2218 error = RDMA_TIMEDOUT;
2219 2219 break;
2220 2220 case 0: /* interrupted */
2221 2221 DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2222 2222
2223 2223 wd->cv_sig = 0; /* no signal needed */
2224 2224 error = RDMA_INTR;
2225 2225 break;
2226 2226 default: /* got send completion */
2227 2227 break;
2228 2228 }
2229 2229 }
2230 2230 }
2231 2231
2232 2232 if (wd->status != (uint_t)SEND_WAIT) {
2233 2233 /* got send completion */
2234 2234 if (wd->status != RDMA_SUCCESS) {
2235 2235 switch (wd->status) {
2236 2236 case RDMA_CONNLOST:
2237 2237 error = RDMA_CONNLOST;
2238 2238 break;
2239 2239 default:
2240 2240 error = RDMA_FAILED;
2241 2241 break;
2242 2242 }
2243 2243 }
2244 2244 for (i = 0; i < wd->nsbufs; i++) {
2245 2245 rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2246 2246 (void *)(uintptr_t)wd->sbufaddr[i]);
2247 2247 }
2248 2248
2249 2249 rib_send_rele(qp);
2250 2250
2251 2251 mutex_exit(&wd->sendwait_lock);
2252 2252 (void) rib_free_sendwait(wd);
2253 2253
2254 2254 } else {
2255 2255 mutex_exit(&wd->sendwait_lock);
2256 2256 }
2257 2257 return (error);
2258 2258 }
2259 2259
2260 2260 static struct send_wid *
2261 2261 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2262 2262 {
2263 2263 struct send_wid *wd;
2264 2264
2265 2265 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2266 2266 wd->xid = xid;
2267 2267 wd->cv_sig = cv_sig;
2268 2268 wd->qp = qp;
2269 2269 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2270 2270 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2271 2271 wd->status = (uint_t)SEND_WAIT;
2272 2272
2273 2273 return (wd);
2274 2274 }
2275 2275
2276 2276 static int
2277 2277 rib_free_sendwait(struct send_wid *wdesc)
2278 2278 {
2279 2279 cv_destroy(&wdesc->wait_cv);
2280 2280 mutex_destroy(&wdesc->sendwait_lock);
2281 2281 kmem_free(wdesc, sizeof (*wdesc));
2282 2282
2283 2283 return (0);
2284 2284 }
2285 2285
2286 2286 static rdma_stat
2287 2287 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2288 2288 {
2289 2289 mutex_enter(&qp->replylist_lock);
2290 2290 if (rep != NULL) {
2291 2291 (void) rib_remreply(qp, rep);
2292 2292 mutex_exit(&qp->replylist_lock);
2293 2293 return (RDMA_SUCCESS);
2294 2294 }
2295 2295 mutex_exit(&qp->replylist_lock);
2296 2296 return (RDMA_FAILED);
2297 2297 }
2298 2298
2299 2299 /*
2300 2300 * Send buffers are freed here only in case of error in posting
2301 2301 * on QP. If the post succeeded, the send buffers are freed upon
2302 2302 * send completion in rib_sendwait() or in the scq_handler.
2303 2303 */
2304 2304 rdma_stat
2305 2305 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2306 2306 int send_sig, int cv_sig, caddr_t *swid)
2307 2307 {
2308 2308 struct send_wid *wdesc;
2309 2309 struct clist *clp;
2310 2310 ibt_status_t ibt_status = IBT_SUCCESS;
2311 2311 rdma_stat ret = RDMA_SUCCESS;
2312 2312 ibt_send_wr_t tx_wr;
2313 2313 int i, nds;
2314 2314 ibt_wr_ds_t sgl[DSEG_MAX];
2315 2315 uint_t total_msg_size;
2316 2316 rib_qp_t *qp;
2317 2317
2318 2318 qp = ctoqp(conn);
2319 2319
2320 2320 ASSERT(cl != NULL);
2321 2321
2322 2322 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2323 2323
2324 2324 nds = 0;
2325 2325 total_msg_size = 0;
2326 2326 clp = cl;
2327 2327 while (clp != NULL) {
2328 2328 if (nds >= DSEG_MAX) {
2329 2329 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2330 2330 return (RDMA_FAILED);
2331 2331 }
2332 2332 sgl[nds].ds_va = clp->w.c_saddr;
2333 2333 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2334 2334 sgl[nds].ds_len = clp->c_len;
2335 2335 total_msg_size += clp->c_len;
2336 2336 clp = clp->c_next;
2337 2337 nds++;
2338 2338 }
2339 2339
2340 2340 if (send_sig) {
2341 2341 /* Set SEND_SIGNAL flag. */
2342 2342 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2343 2343 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2344 2344 *swid = (caddr_t)wdesc;
2345 2345 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2346 2346 mutex_enter(&wdesc->sendwait_lock);
2347 2347 wdesc->nsbufs = nds;
2348 2348 for (i = 0; i < nds; i++) {
2349 2349 wdesc->sbufaddr[i] = sgl[i].ds_va;
2350 2350 }
2351 2351 } else {
2352 2352 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2353 2353 *swid = NULL;
2354 2354 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2355 2355 }
2356 2356
2357 2357 tx_wr.wr_opcode = IBT_WRC_SEND;
2358 2358 tx_wr.wr_trans = IBT_RC_SRV;
2359 2359 tx_wr.wr_nds = nds;
2360 2360 tx_wr.wr_sgl = sgl;
2361 2361
2362 2362 mutex_enter(&conn->c_lock);
2363 2363 if (conn->c_state == C_CONNECTED) {
2364 2364 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2365 2365 }
2366 2366 if (conn->c_state != C_CONNECTED ||
2367 2367 ibt_status != IBT_SUCCESS) {
2368 2368 if (conn->c_state != C_DISCONN_PEND)
2369 2369 conn->c_state = C_ERROR_CONN;
2370 2370 mutex_exit(&conn->c_lock);
2371 2371 if (send_sig) {
2372 2372 for (i = 0; i < nds; i++) {
2373 2373 rib_rbuf_free(conn, SEND_BUFFER,
2374 2374 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2375 2375 }
2376 2376 mutex_exit(&wdesc->sendwait_lock);
2377 2377 (void) rib_free_sendwait(wdesc);
2378 2378 }
2379 2379 return (RDMA_CONNLOST);
2380 2380 }
2381 2381
2382 2382 mutex_exit(&conn->c_lock);
2383 2383
2384 2384 if (send_sig) {
2385 2385 rib_send_hold(qp);
2386 2386 mutex_exit(&wdesc->sendwait_lock);
2387 2387 if (cv_sig) {
2388 2388 /*
2389 2389 * cv_wait for send to complete.
2390 2390 * We can fail due to a timeout or signal or
2391 2391 * unsuccessful send.
2392 2392 */
2393 2393 ret = rib_sendwait(qp, wdesc);
2394 2394
2395 2395 return (ret);
2396 2396 }
2397 2397 }
2398 2398
2399 2399 return (RDMA_SUCCESS);
2400 2400 }
2401 2401
2402 2402
2403 2403 rdma_stat
2404 2404 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2405 2405 {
2406 2406 rdma_stat ret;
2407 2407 caddr_t wd;
2408 2408
2409 2409 /* send-wait & cv_signal */
2410 2410 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2411 2411 return (ret);
2412 2412 }
2413 2413
2414 2414 /*
2415 2415 * Deprecated/obsolete interface; not currently used,
2416 2416 * but formerly used for the READ-READ protocol.
2417 2417 * Send RPC reply and wait for RDMA_DONE.
2418 2418 */
2419 2419 rdma_stat
2420 2420 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2421 2421 {
2422 2422 rdma_stat ret = RDMA_SUCCESS;
2423 2423 struct rdma_done_list *rd;
2424 2424 clock_t cv_wait_ret;
2425 2425 caddr_t *wid = NULL;
2426 2426 rib_qp_t *qp = ctoqp(conn);
2427 2427
2428 2428 mutex_enter(&qp->rdlist_lock);
2429 2429 rd = rdma_done_add(qp, msgid);
2430 2430
↓ open down ↓ |
230 lines elided |
↑ open up ↑ |
2431 2431 /* No cv_signal (whether send-wait or no-send-wait) */
2432 2432 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2433 2433
2434 2434 if (ret != RDMA_SUCCESS) {
2435 2435 rdma_done_rm(qp, rd);
2436 2436 } else {
2437 2437 /*
2438 2438 * Wait for RDMA_DONE from remote end
2439 2439 */
2440 2440 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2441 - &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2441 + &qp->rdlist_lock, drv_sectohz(REPLY_WAIT_TIME),
2442 2442 TR_CLOCK_TICK);
2443 2443
2444 2444 rdma_done_rm(qp, rd);
2445 2445
2446 2446 if (cv_wait_ret < 0) {
2447 2447 ret = RDMA_TIMEDOUT;
2448 2448 }
2449 2449 }
2450 2450
2451 2451 mutex_exit(&qp->rdlist_lock);
2452 2452 return (ret);
2453 2453 }
2454 2454
2455 2455 static struct recv_wid *
2456 2456 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2457 2457 {
2458 2458 struct recv_wid *rwid;
2459 2459
2460 2460 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2461 2461 rwid->xid = msgid;
2462 2462 rwid->addr = sgl->ds_va;
2463 2463 rwid->qp = qp;
2464 2464
2465 2465 return (rwid);
2466 2466 }
2467 2467
2468 2468 static void
2469 2469 rib_free_wid(struct recv_wid *rwid)
2470 2470 {
2471 2471 kmem_free(rwid, sizeof (struct recv_wid));
2472 2472 }
2473 2473
2474 2474 rdma_stat
2475 2475 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2476 2476 {
2477 2477 rib_qp_t *qp = ctoqp(conn);
2478 2478 struct clist *clp = cl;
2479 2479 struct reply *rep;
2480 2480 struct recv_wid *rwid;
2481 2481 int nds;
2482 2482 ibt_wr_ds_t sgl[DSEG_MAX];
2483 2483 ibt_recv_wr_t recv_wr;
2484 2484 rdma_stat ret;
2485 2485 ibt_status_t ibt_status;
2486 2486
2487 2487 /*
2488 2488 * rdma_clnt_postrecv uses RECV_BUFFER.
2489 2489 */
2490 2490
2491 2491 nds = 0;
2492 2492 while (cl != NULL) {
2493 2493 if (nds >= DSEG_MAX) {
2494 2494 ret = RDMA_FAILED;
2495 2495 goto done;
2496 2496 }
2497 2497 sgl[nds].ds_va = cl->w.c_saddr;
2498 2498 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2499 2499 sgl[nds].ds_len = cl->c_len;
2500 2500 cl = cl->c_next;
2501 2501 nds++;
2502 2502 }
2503 2503
2504 2504 if (nds != 1) {
2505 2505 ret = RDMA_FAILED;
2506 2506 goto done;
2507 2507 }
2508 2508
2509 2509 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2510 2510 recv_wr.wr_nds = nds;
2511 2511 recv_wr.wr_sgl = sgl;
2512 2512
2513 2513 rwid = rib_create_wid(qp, &sgl[0], msgid);
2514 2514 if (rwid) {
2515 2515 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2516 2516 } else {
2517 2517 ret = RDMA_NORESOURCE;
2518 2518 goto done;
2519 2519 }
2520 2520 rep = rib_addreplylist(qp, msgid);
2521 2521 if (!rep) {
2522 2522 rib_free_wid(rwid);
2523 2523 ret = RDMA_NORESOURCE;
2524 2524 goto done;
2525 2525 }
2526 2526
2527 2527 mutex_enter(&conn->c_lock);
2528 2528
2529 2529 if (conn->c_state == C_CONNECTED) {
2530 2530 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2531 2531 }
2532 2532
2533 2533 if (conn->c_state != C_CONNECTED ||
2534 2534 ibt_status != IBT_SUCCESS) {
2535 2535 if (conn->c_state != C_DISCONN_PEND)
2536 2536 conn->c_state = C_ERROR_CONN;
2537 2537 mutex_exit(&conn->c_lock);
2538 2538 rib_free_wid(rwid);
2539 2539 (void) rib_rem_rep(qp, rep);
2540 2540 ret = RDMA_CONNLOST;
2541 2541 goto done;
2542 2542 }
2543 2543
2544 2544 mutex_enter(&qp->posted_rbufs_lock);
2545 2545 qp->n_posted_rbufs++;
2546 2546 mutex_exit(&qp->posted_rbufs_lock);
2547 2547
2548 2548 mutex_exit(&conn->c_lock);
2549 2549 return (RDMA_SUCCESS);
2550 2550
2551 2551 done:
2552 2552 while (clp != NULL) {
2553 2553 rib_rbuf_free(conn, RECV_BUFFER,
2554 2554 (void *)(uintptr_t)clp->w.c_saddr3);
2555 2555 clp = clp->c_next;
2556 2556 }
2557 2557 return (ret);
2558 2558 }
2559 2559
2560 2560 rdma_stat
2561 2561 rib_svc_post(CONN* conn, struct clist *cl)
2562 2562 {
2563 2563 rib_qp_t *qp = ctoqp(conn);
2564 2564 struct svc_recv *s_recvp;
2565 2565 int nds;
2566 2566 ibt_wr_ds_t sgl[DSEG_MAX];
2567 2567 ibt_recv_wr_t recv_wr;
2568 2568 ibt_status_t ibt_status;
2569 2569
2570 2570 nds = 0;
2571 2571 while (cl != NULL) {
2572 2572 if (nds >= DSEG_MAX) {
2573 2573 return (RDMA_FAILED);
2574 2574 }
2575 2575 sgl[nds].ds_va = cl->w.c_saddr;
2576 2576 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2577 2577 sgl[nds].ds_len = cl->c_len;
2578 2578 cl = cl->c_next;
2579 2579 nds++;
2580 2580 }
2581 2581
2582 2582 if (nds != 1) {
2583 2583 rib_rbuf_free(conn, RECV_BUFFER,
2584 2584 (caddr_t)(uintptr_t)sgl[0].ds_va);
2585 2585
2586 2586 return (RDMA_FAILED);
2587 2587 }
2588 2588
2589 2589 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2590 2590 recv_wr.wr_nds = nds;
2591 2591 recv_wr.wr_sgl = sgl;
2592 2592
2593 2593 s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2594 2594 /* Use s_recvp's addr as wr id */
2595 2595 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2596 2596 mutex_enter(&conn->c_lock);
2597 2597 if (conn->c_state == C_CONNECTED) {
2598 2598 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2599 2599 }
2600 2600 if (conn->c_state != C_CONNECTED ||
2601 2601 ibt_status != IBT_SUCCESS) {
2602 2602 if (conn->c_state != C_DISCONN_PEND)
2603 2603 conn->c_state = C_ERROR_CONN;
2604 2604 mutex_exit(&conn->c_lock);
2605 2605 rib_rbuf_free(conn, RECV_BUFFER,
2606 2606 (caddr_t)(uintptr_t)sgl[0].ds_va);
2607 2607 (void) rib_free_svc_recv(s_recvp);
2608 2608
2609 2609 return (RDMA_CONNLOST);
2610 2610 }
2611 2611 mutex_exit(&conn->c_lock);
2612 2612
2613 2613 return (RDMA_SUCCESS);
2614 2614 }
2615 2615
2616 2616 /* Client */
2617 2617 rdma_stat
2618 2618 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2619 2619 {
2620 2620 return (rib_clnt_post(conn, cl, msgid));
2621 2621 }
2622 2622
2623 2623 /* Client */
2624 2624 rdma_stat
2625 2625 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2626 2626 {
2627 2627 rib_qp_t *qp = ctoqp(conn);
2628 2628 struct reply *rep;
2629 2629
2630 2630 mutex_enter(&qp->replylist_lock);
2631 2631 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2632 2632 if (rep->xid == msgid) {
2633 2633 if (rep->vaddr_cq) {
2634 2634 rib_rbuf_free(conn, RECV_BUFFER,
2635 2635 (caddr_t)(uintptr_t)rep->vaddr_cq);
2636 2636 }
2637 2637 (void) rib_remreply(qp, rep);
2638 2638 break;
2639 2639 }
2640 2640 }
2641 2641 mutex_exit(&qp->replylist_lock);
2642 2642
2643 2643 return (RDMA_SUCCESS);
2644 2644 }
2645 2645
2646 2646 /* Server */
2647 2647 rdma_stat
2648 2648 rib_post_recv(CONN *conn, struct clist *cl)
2649 2649 {
2650 2650 rib_qp_t *qp = ctoqp(conn);
2651 2651
2652 2652 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2653 2653 mutex_enter(&qp->posted_rbufs_lock);
2654 2654 qp->n_posted_rbufs++;
2655 2655 mutex_exit(&qp->posted_rbufs_lock);
2656 2656 return (RDMA_SUCCESS);
2657 2657 }
2658 2658 return (RDMA_FAILED);
2659 2659 }
2660 2660
2661 2661 /*
2662 2662 * Client side only interface to "recv" the rpc reply buf
2663 2663 * posted earlier by rib_post_resp(conn, cl, msgid).
2664 2664 */
2665 2665 rdma_stat
2666 2666 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2667 2667 {
2668 2668 struct reply *rep = NULL;
2669 2669 clock_t timout, cv_wait_ret;
2670 2670 rdma_stat ret = RDMA_SUCCESS;
2671 2671 rib_qp_t *qp = ctoqp(conn);
2672 2672
2673 2673 /*
2674 2674 * Find the reply structure for this msgid
2675 2675 */
2676 2676 mutex_enter(&qp->replylist_lock);
2677 2677
2678 2678 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
↓ open down ↓ |
227 lines elided |
↑ open up ↑ |
2679 2679 if (rep->xid == msgid)
2680 2680 break;
2681 2681 }
2682 2682
2683 2683 if (rep != NULL) {
2684 2684 /*
2685 2685 * If message not yet received, wait.
2686 2686 */
2687 2687 if (rep->status == (uint_t)REPLY_WAIT) {
2688 2688 timout = ddi_get_lbolt() +
2689 - drv_usectohz(REPLY_WAIT_TIME * 1000000);
2689 + drv_sectohz(REPLY_WAIT_TIME);
2690 2690
2691 2691 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2692 2692 &qp->replylist_lock, timout)) > 0 &&
2693 2693 rep->status == (uint_t)REPLY_WAIT)
2694 2694 ;
2695 2695
2696 2696 switch (cv_wait_ret) {
2697 2697 case -1: /* timeout */
2698 2698 ret = RDMA_TIMEDOUT;
2699 2699 break;
2700 2700 case 0:
2701 2701 ret = RDMA_INTR;
2702 2702 break;
2703 2703 default:
2704 2704 break;
2705 2705 }
2706 2706 }
2707 2707
2708 2708 if (rep->status == RDMA_SUCCESS) {
2709 2709 struct clist *cl = NULL;
2710 2710
2711 2711 /*
2712 2712 * Got message successfully
2713 2713 */
2714 2714 clist_add(&cl, 0, rep->bytes_xfer, NULL,
2715 2715 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2716 2716 *clp = cl;
2717 2717 } else {
2718 2718 if (rep->status != (uint_t)REPLY_WAIT) {
2719 2719 /*
2720 2720 * Got error in reply message. Free
2721 2721 * recv buffer here.
2722 2722 */
2723 2723 ret = rep->status;
2724 2724 rib_rbuf_free(conn, RECV_BUFFER,
2725 2725 (caddr_t)(uintptr_t)rep->vaddr_cq);
2726 2726 }
2727 2727 }
2728 2728 (void) rib_remreply(qp, rep);
2729 2729 } else {
2730 2730 /*
2731 2731 * No matching reply structure found for given msgid on the
2732 2732 * reply wait list.
2733 2733 */
2734 2734 ret = RDMA_INVAL;
2735 2735 DTRACE_PROBE(rpcib__i__nomatchxid2);
2736 2736 }
2737 2737
2738 2738 /*
2739 2739 * Done.
2740 2740 */
2741 2741 mutex_exit(&qp->replylist_lock);
2742 2742 return (ret);
2743 2743 }
2744 2744
2745 2745 /*
2746 2746 * RDMA write a buffer to the remote address.
2747 2747 */
2748 2748 rdma_stat
2749 2749 rib_write(CONN *conn, struct clist *cl, int wait)
2750 2750 {
2751 2751 ibt_send_wr_t tx_wr;
2752 2752 int cv_sig;
2753 2753 ibt_wr_ds_t sgl[DSEG_MAX];
2754 2754 struct send_wid *wdesc;
2755 2755 ibt_status_t ibt_status;
2756 2756 rdma_stat ret = RDMA_SUCCESS;
2757 2757 rib_qp_t *qp = ctoqp(conn);
2758 2758 uint64_t n_writes = 0;
2759 2759
2760 2760 if (cl == NULL) {
2761 2761 return (RDMA_FAILED);
2762 2762 }
2763 2763
2764 2764 while ((cl != NULL)) {
2765 2765 if (cl->c_len > 0) {
2766 2766 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2767 2767 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2768 2768 tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2769 2769 cl->c_dmemhandle.mrc_rmr; /* rkey */
2770 2770 sgl[0].ds_va = cl->w.c_saddr;
2771 2771 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2772 2772 sgl[0].ds_len = cl->c_len;
2773 2773
2774 2774 if (wait) {
2775 2775 cv_sig = 1;
2776 2776 } else {
2777 2777 if (n_writes > max_unsignaled_rws) {
2778 2778 n_writes = 0;
2779 2779 cv_sig = 1;
2780 2780 } else {
2781 2781 cv_sig = 0;
2782 2782 }
2783 2783 }
2784 2784
2785 2785 if (cv_sig) {
2786 2786 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2787 2787 wdesc = rib_init_sendwait(0, cv_sig, qp);
2788 2788 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2789 2789 mutex_enter(&wdesc->sendwait_lock);
2790 2790 } else {
2791 2791 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2792 2792 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2793 2793 }
2794 2794 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2795 2795 tx_wr.wr_trans = IBT_RC_SRV;
2796 2796 tx_wr.wr_nds = 1;
2797 2797 tx_wr.wr_sgl = sgl;
2798 2798
2799 2799 mutex_enter(&conn->c_lock);
2800 2800 if (conn->c_state == C_CONNECTED) {
2801 2801 ibt_status =
2802 2802 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2803 2803 }
2804 2804 if (conn->c_state != C_CONNECTED ||
2805 2805 ibt_status != IBT_SUCCESS) {
2806 2806 if (conn->c_state != C_DISCONN_PEND)
2807 2807 conn->c_state = C_ERROR_CONN;
2808 2808 mutex_exit(&conn->c_lock);
2809 2809 if (cv_sig) {
2810 2810 mutex_exit(&wdesc->sendwait_lock);
2811 2811 (void) rib_free_sendwait(wdesc);
2812 2812 }
2813 2813 return (RDMA_CONNLOST);
2814 2814 }
2815 2815
2816 2816 mutex_exit(&conn->c_lock);
2817 2817
2818 2818 /*
2819 2819 * Wait for send to complete
2820 2820 */
2821 2821 if (cv_sig) {
2822 2822
2823 2823 rib_send_hold(qp);
2824 2824 mutex_exit(&wdesc->sendwait_lock);
2825 2825
2826 2826 ret = rib_sendwait(qp, wdesc);
2827 2827 if (ret != 0)
2828 2828 return (ret);
2829 2829 }
2830 2830 n_writes ++;
2831 2831 }
2832 2832 cl = cl->c_next;
2833 2833 }
2834 2834 return (RDMA_SUCCESS);
2835 2835 }
2836 2836
2837 2837 /*
2838 2838 * RDMA Read a buffer from the remote address.
2839 2839 */
2840 2840 rdma_stat
2841 2841 rib_read(CONN *conn, struct clist *cl, int wait)
2842 2842 {
2843 2843 ibt_send_wr_t rx_wr;
2844 2844 int cv_sig = 0;
2845 2845 ibt_wr_ds_t sgl;
2846 2846 struct send_wid *wdesc;
2847 2847 ibt_status_t ibt_status = IBT_SUCCESS;
2848 2848 rdma_stat ret = RDMA_SUCCESS;
2849 2849 rib_qp_t *qp = ctoqp(conn);
2850 2850
2851 2851 if (cl == NULL) {
2852 2852 return (RDMA_FAILED);
2853 2853 }
2854 2854
2855 2855 while (cl != NULL) {
2856 2856 bzero(&rx_wr, sizeof (ibt_send_wr_t));
2857 2857 /*
2858 2858 * Remote address is at the head chunk item in list.
2859 2859 */
2860 2860 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2861 2861 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2862 2862
2863 2863 sgl.ds_va = cl->u.c_daddr;
2864 2864 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2865 2865 sgl.ds_len = cl->c_len;
2866 2866
2867 2867 /*
2868 2868 * If there are multiple chunks to be read, and
2869 2869 * wait is set, ask for signal only for the last chunk
2870 2870 * and wait only on the last chunk. The completion of
2871 2871 * RDMA_READ on last chunk ensures that reads on all
2872 2872 * previous chunks are also completed.
2873 2873 */
2874 2874 if (wait && (cl->c_next == NULL)) {
2875 2875 cv_sig = 1;
2876 2876 wdesc = rib_init_sendwait(0, cv_sig, qp);
2877 2877 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2878 2878 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2879 2879 mutex_enter(&wdesc->sendwait_lock);
2880 2880 } else {
2881 2881 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2882 2882 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2883 2883 }
2884 2884 rx_wr.wr_opcode = IBT_WRC_RDMAR;
2885 2885 rx_wr.wr_trans = IBT_RC_SRV;
2886 2886 rx_wr.wr_nds = 1;
2887 2887 rx_wr.wr_sgl = &sgl;
2888 2888
2889 2889 mutex_enter(&conn->c_lock);
2890 2890 if (conn->c_state == C_CONNECTED) {
2891 2891 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2892 2892 }
2893 2893 if (conn->c_state != C_CONNECTED ||
2894 2894 ibt_status != IBT_SUCCESS) {
2895 2895 if (conn->c_state != C_DISCONN_PEND)
2896 2896 conn->c_state = C_ERROR_CONN;
2897 2897 mutex_exit(&conn->c_lock);
2898 2898 if (wait && (cl->c_next == NULL)) {
2899 2899 mutex_exit(&wdesc->sendwait_lock);
2900 2900 (void) rib_free_sendwait(wdesc);
2901 2901 }
2902 2902 return (RDMA_CONNLOST);
2903 2903 }
2904 2904
2905 2905 mutex_exit(&conn->c_lock);
2906 2906
2907 2907 /*
2908 2908 * Wait for send to complete if this is the
2909 2909 * last item in the list.
2910 2910 */
2911 2911 if (wait && cl->c_next == NULL) {
2912 2912 rib_send_hold(qp);
2913 2913 mutex_exit(&wdesc->sendwait_lock);
2914 2914
2915 2915 ret = rib_sendwait(qp, wdesc);
2916 2916
2917 2917 if (ret != 0)
2918 2918 return (ret);
2919 2919 }
2920 2920 cl = cl->c_next;
2921 2921 }
2922 2922 return (RDMA_SUCCESS);
2923 2923 }
2924 2924
/*
 * rib_srv_cm_handler()
 * Connection Manager callback to handle RC connection requests.
 *
 * Registered with IBTF via rib_register_service(); 'any' is the
 * rib_hca_t the service was registered on.  For REQ_RCV events it
 * creates the server channel, pre-posts receive buffers, records the
 * peer/local IP addresses on the CONN, and returns IBT_CM_ACCEPT or
 * IBT_CM_REJECT.  Unknown events are returned to the CM with
 * IBT_CM_DEFAULT.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf = {0};
	void		*buf = NULL;
	CONN		*conn;
	ibt_ip_cm_info_t	ipinfo;
	struct sockaddr_in	*s;
	struct sockaddr_in6	*s6;
	int		sin_size = sizeof (struct sockaddr_in);
	int		in_size = sizeof (struct in_addr);
	int		sin6_size = sizeof (struct sockaddr_in6);

	ASSERT(any != NULL);
	ASSERT(event != NULL);

	hca = (rib_hca_t *)any;

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
		 */
		mutex_enter(&plugin_state_lock);
		if (plugin_state == NO_ACCEPT) {
			mutex_exit(&plugin_state_lock);
			return (IBT_CM_REJECT);
		}
		mutex_exit(&plugin_state_lock);

		/*
		 * Need to send a MRA MAD to CM so that it does not
		 * timeout on us.
		 */
		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
		    event->cm_event.req.req_timeout * 8, NULL, 0);

		mutex_enter(&rib_stat->open_hca_lock);
		q = rib_stat->q;
		mutex_exit(&rib_stat->open_hca_lock);

		status = rib_svc_create_chan(hca, (caddr_t)q,
		    event->cm_event.req.req_prim_hca_port, &qp);

		if (status) {
			return (IBT_CM_REJECT);
		}

		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
			bzero(&rdbuf, sizeof (rdbuf));
			rdbuf.type = RECV_BUFFER;
			buf = rib_rbuf_alloc(conn, &rdbuf);
			if (buf == NULL) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}

			bzero(&cl, sizeof (cl));
			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
			cl.c_len = rdbuf.len;
			cl.c_smemhandle.mrc_lmr =
			    rdbuf.handle.mrc_lmr; /* lkey */
			cl.c_next = NULL;
			status = rib_post_recv(conn, &cl);
			if (status != RDMA_SUCCESS) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}
		}
		(void) rib_add_connlist(conn, &hca->srv_conn_list);

		/*
		 * Get the address translation
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			return (IBT_CM_REJECT);
		}
		rw_exit(&hca->state_lock);

		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));

		if (ibt_get_ip_data(event->cm_priv_data_len,
		    event->cm_priv_data,
		    &ipinfo) != IBT_SUCCESS) {

			return (IBT_CM_REJECT);
		}

		switch (ipinfo.src_addr.family) {
		case AF_INET:
			/*
			 * Record the netid and the remote (c_raddr) and
			 * local (c_laddr) IPv4 addresses on the CONN;
			 * c_addrmask is an all-ones match-everything mask.
			 */
			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
			    KM_SLEEP);
			(void) strcpy(conn->c_netid, RIBNETID_TCP);

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin_size;
			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_raddr.buf;
			s->sin_family = AF_INET;
			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			conn->c_laddr.maxlen =
			    conn->c_laddr.len = sin_size;
			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_laddr.buf;
			s->sin_family = AF_INET;
			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			conn->c_addrmask.maxlen = conn->c_addrmask.len =
			    sizeof (struct sockaddr_in);
			conn->c_addrmask.buf =
			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
			((struct sockaddr_in *)
			    conn->c_addrmask.buf)->sin_addr.s_addr =
			    (uint32_t)~0;
			((struct sockaddr_in *)
			    conn->c_addrmask.buf)->sin_family =
			    (sa_family_t)~0;
			break;

		case AF_INET6:
			/* Same bookkeeping as AF_INET, with IPv6 addrs. */
			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
			    KM_SLEEP);
			(void) strcpy(conn->c_netid, RIBNETID_TCP6);

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin6_size;
			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			conn->c_laddr.maxlen =
			    conn->c_laddr.len = sin6_size;
			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			conn->c_addrmask.maxlen = conn->c_addrmask.len =
			    sizeof (struct sockaddr_in6);
			conn->c_addrmask.buf =
			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
			(void) memset(&((struct sockaddr_in6 *)
			    conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
			    sizeof (struct in6_addr));
			((struct sockaddr_in6 *)
			    conn->c_addrmask.buf)->sin6_family =
			    (sa_family_t)~0;
			break;

		default:
			return (IBT_CM_REJECT);
		}

		break;

	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN		*conn;
		rib_qp_t	*qp;

		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}
			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref goes down to 0
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
				    &hca->srv_conn_list);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
			break;
		}
		break;
	}
	case IBT_CM_EVENT_CONN_EST:
		/*
		 * RTU received, hence connection established.
		 */
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "(CONN_EST) channel established");
		break;

	default:
		if (rib_debug > 2) {
			/* Let CM handle the following events. */
			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
			}
		}
		return (IBT_CM_DEFAULT);
	}

	/* accept all other CM messages (i.e. let the CM handle them) */
	return (IBT_CM_ACCEPT);
}
3227 3227
/*
 * rib_register_service()
 * Register the given service type with the CM (obtaining a service id
 * keyed off protocol_num/dst_port) if not already registered, then bind
 * that service to every active port/full-member pkey of 'hca' that is
 * not already bound.  Puts the plugin into ACCEPT state when at least
 * one bind succeeds.  Returns RDMA_SUCCESS or RDMA_FAILED.
 */
static rdma_stat
rib_register_service(rib_hca_t *hca, int service_type,
    uint8_t protocol_num, in_port_t dst_port)
{
	ibt_srv_desc_t		sdesc;
	ibt_hca_portinfo_t	*port_infop;
	ib_svc_id_t		srv_id;
	ibt_srv_hdl_t		srv_hdl;
	uint_t			port_size;
	uint_t			pki, i, num_ports, nbinds;
	ibt_status_t		ibt_status;
	rib_service_t		*service;
	ib_pkey_t		pkey;

	/*
	 * Query all ports for the given HCA
	 */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
		    &num_ports, &port_size);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	if (ibt_status != IBT_SUCCESS) {
		return (RDMA_FAILED);
	}

	DTRACE_PROBE1(rpcib__i__regservice_numports,
	    int, num_ports);

	/* Trace each port's link state; only active ports get bound below. */
	for (i = 0; i < num_ports; i++) {
		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
			    int, i+1);
		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portactive,
			    int, i+1);
		}
	}

	/*
	 * Get all the IP addresses on this system to register the
	 * given "service type" on all DNS recognized IP addrs.
	 * Each service type such as NFS will have all the systems
	 * IP addresses as its different names. For now the only
	 * type of service we support in RPCIB is NFS.
	 */
	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
	/*
	 * Start registering and binding the service to the
	 * active ports on this HCA.
	 */
	nbinds = 0;
	for (service = rib_stat->service_list;
	    service && (service->srv_type != service_type);
	    service = service->next)
		;

	if (service == NULL) {
		/*
		 * We use IP addresses as the service names for
		 * service registration. Register each of them
		 * with CM to obtain a svc_id and svc_hdl. We do not
		 * register the service with machine's loopback address.
		 */
		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
		sdesc.sd_handler = rib_srv_cm_handler;
		sdesc.sd_flags = 0;
		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
		    1, &srv_hdl, &srv_id);
		if ((ibt_status != IBT_SUCCESS) &&
		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
			rw_exit(&rib_stat->service_list_lock);
			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
			    int, ibt_status);
			ibt_free_portinfo(port_infop, port_size);
			return (RDMA_FAILED);
		}

		/*
		 * Allocate and prepare a service entry
		 */
		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);

		service->srv_type = service_type;
		service->srv_hdl = srv_hdl;
		service->srv_id = srv_id;

		service->next = rib_stat->service_list;
		rib_stat->service_list = service;
		DTRACE_PROBE1(rpcib__i__regservice__new__service,
		    int, service->srv_type);
	} else {
		srv_hdl = service->srv_hdl;
		srv_id = service->srv_id;
		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
		    int, service->srv_type);
	}

	for (i = 0; i < num_ports; i++) {
		ibt_sbind_hdl_t		sbp;
		rib_hca_service_t	*hca_srv;
		ib_gid_t		gid;

		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
			continue;

		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
			pkey = port_infop[i].p_pkey_tbl[pki];

			/* Skip GIDs this service is already bound to. */
			rw_enter(&hca->bound_services_lock, RW_READER);
			gid = port_infop[i].p_sgid_tbl[0];
			for (hca_srv = hca->bound_services; hca_srv;
			    hca_srv = hca_srv->next) {
				if ((hca_srv->srv_id == service->srv_id) &&
				    (hca_srv->gid.gid_prefix ==
				    gid.gid_prefix) &&
				    (hca_srv->gid.gid_guid == gid.gid_guid))
					break;
			}
			rw_exit(&hca->bound_services_lock);
			if (hca_srv != NULL) {
				/*
				 * port is already bound to the service
				 */
				DTRACE_PROBE1(
				    rpcib__i__regservice__already__bound,
				    int, i+1);
				nbinds++;
				continue;
			}

			/* Bind only full-member, valid pkeys. */
			if ((pkey & IBSRM_HB) &&
			    (pkey != IB_PKEY_INVALID_FULL)) {

				sbp = NULL;
				ibt_status = ibt_bind_service(srv_hdl,
				    gid, NULL, hca, &sbp);

				if (ibt_status == IBT_SUCCESS) {
					hca_srv = kmem_zalloc(
					    sizeof (rib_hca_service_t),
					    KM_SLEEP);
					hca_srv->srv_id = srv_id;
					hca_srv->gid = gid;
					hca_srv->sbind_hdl = sbp;

					rw_enter(&hca->bound_services_lock,
					    RW_WRITER);
					hca_srv->next = hca->bound_services;
					hca->bound_services = hca_srv;
					rw_exit(&hca->bound_services_lock);
					nbinds++;
				}

				DTRACE_PROBE1(rpcib__i__regservice__bindres,
				    int, ibt_status);
			}
		}
	}
	rw_exit(&rib_stat->service_list_lock);

	ibt_free_portinfo(port_infop, port_size);

	if (nbinds == 0) {
		return (RDMA_FAILED);
	} else {
		/*
		 * Put this plugin into accept state, since at least
		 * one registration was successful.
		 */
		mutex_enter(&plugin_state_lock);
		plugin_state = ACCEPT;
		mutex_exit(&plugin_state_lock);
		return (RDMA_SUCCESS);
	}
}
3411 3411
/*
 * rib_listen()
 * Start NFS/RDMA service listeners on every attached, initialized HCA.
 * Called with rd == NULL when a newly attached HCA should join the
 * listening state already set up by an earlier call from RDMATF; in
 * that case rib_stat->q must already be valid.  When rd is non-NULL,
 * rd->active and rd->err_code are set to reflect whether at least one
 * HCA is now listening.
 */
void
rib_listen(struct rdma_svc_data *rd)
{
	rdma_stat status;
	int n_listening = 0;
	rib_hca_t *hca;

	mutex_enter(&rib_stat->listen_lock);
	/*
	 * if rd parameter is NULL then it means that rib_stat->q is
	 * already initialized by a call from RDMA and we just want to
	 * add a newly attached HCA to the same listening state as other
	 * HCAs.
	 */
	if (rd == NULL) {
		if (rib_stat->q == NULL) {
			mutex_exit(&rib_stat->listen_lock);
			return;
		}
	} else {
		rib_stat->q = &rd->q;
	}
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		/*
		 * First check if a hca is still attached
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state != HCA_INITED) {
			rw_exit(&hca->state_lock);
			continue;
		}
		rw_exit(&hca->state_lock);

		/*
		 * Right now the only service type is NFS. Hence
		 * force feed this value. Ideally to communicate
		 * the service type it should be passed down in
		 * rdma_svc_data.
		 */
		status = rib_register_service(hca, NFS,
		    IPPROTO_TCP, nfs_rdma_port);
		if (status == RDMA_SUCCESS)
			n_listening++;
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * Service active on an HCA, check rd->err_code for more
	 * detailed error information.
	 */
	if (rd) {
		if (n_listening > 0) {
			rd->active = 1;
			rd->err_code = RDMA_SUCCESS;
		} else {
			rd->active = 0;
			rd->err_code = RDMA_FAILED;
		}
	}
	mutex_exit(&rib_stat->listen_lock);
}
3474 3474
/*
 * rib_listen_stop()
 * Stop all RDMA-IB listeners: put the plugin into NO_ACCEPT state so
 * rib_srv_cm_handler() rejects new requests, then close server-side
 * channels and unbind/deregister services on every attached HCA.
 */
/* XXXX */
/* ARGSUSED */
static void
rib_listen_stop(struct rdma_svc_data *svcdata)
{
	rib_hca_t *hca;

	mutex_enter(&rib_stat->listen_lock);
	/*
	 * KRPC called the RDMATF to stop the listeners, this means
	 * stop sending incoming or received requests to KRPC master
	 * transport handle for RDMA-IB. This also means that the
	 * master transport handle, responsible for us, is going away.
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	if (svcdata != NULL)
		svcdata->active = 0;
	mutex_exit(&plugin_state_lock);

	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		/*
		 * First check if a hca is still attached
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			continue;
		}
		rib_close_channels(&hca->srv_conn_list);
		rib_stop_services(hca);
		rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * Avoid rib_listen() using the stale q field.
	 * This could happen if a port goes up after all services
	 * are already unregistered.
	 */
	rib_stat->q = NULL;
	mutex_exit(&rib_stat->listen_lock);
}
3519 3519
/*
 * Traverse the HCA's service list to unbind and deregister services.
 * For each bound service of HCA to be removed, first find the corresponding
 * service handle (srv_hdl) and then unbind the service by calling
 * ibt_unbind_service().
 */
static void
rib_stop_services(rib_hca_t *hca)
{
	rib_hca_service_t *srv_list, *to_remove;

	/*
	 * unbind and deregister the services for this service type.
	 * Right now there is only one service type. In future it will
	 * be passed down to this function.
	 */
	/* Detach the whole bound-services list under the lock, then walk it. */
	rw_enter(&hca->bound_services_lock, RW_READER);
	srv_list = hca->bound_services;
	hca->bound_services = NULL;
	rw_exit(&hca->bound_services_lock);

	while (srv_list != NULL) {
		rib_service_t *sc;

		to_remove = srv_list;
		srv_list = to_remove->next;
		/* Look up the service handle for this bound entry's srv_id. */
		rw_enter(&rib_stat->service_list_lock, RW_READER);
		for (sc = rib_stat->service_list;
		    sc && (sc->srv_id != to_remove->srv_id);
		    sc = sc->next)
			;
		/*
		 * if sc is NULL then the service doesn't exist anymore,
		 * probably just removed completely through rib_stat.
		 */
		if (sc != NULL)
			(void) ibt_unbind_service(sc->srv_hdl,
			    to_remove->sbind_hdl);
		rw_exit(&rib_stat->service_list_lock);
		kmem_free(to_remove, sizeof (rib_hca_service_t));
	}
}
3562 3562
3563 3563 static struct svc_recv *
3564 3564 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3565 3565 {
3566 3566 struct svc_recv *recvp;
3567 3567
3568 3568 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3569 3569 recvp->vaddr = sgl->ds_va;
3570 3570 recvp->qp = qp;
3571 3571 recvp->bytes_xfer = 0;
3572 3572 return (recvp);
3573 3573 }
3574 3574
3575 3575 static int
3576 3576 rib_free_svc_recv(struct svc_recv *recvp)
3577 3577 {
3578 3578 kmem_free(recvp, sizeof (*recvp));
3579 3579
3580 3580 return (0);
3581 3581 }
3582 3582
3583 3583 static struct reply *
3584 3584 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3585 3585 {
3586 3586 struct reply *rep;
3587 3587
3588 3588
3589 3589 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3590 3590 if (rep == NULL) {
3591 3591 DTRACE_PROBE(rpcib__i__addrreply__nomem);
3592 3592 return (NULL);
3593 3593 }
3594 3594 rep->xid = msgid;
3595 3595 rep->vaddr_cq = NULL;
3596 3596 rep->bytes_xfer = 0;
3597 3597 rep->status = (uint_t)REPLY_WAIT;
3598 3598 rep->prev = NULL;
3599 3599 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3600 3600
3601 3601 mutex_enter(&qp->replylist_lock);
3602 3602 if (qp->replylist) {
3603 3603 rep->next = qp->replylist;
3604 3604 qp->replylist->prev = rep;
3605 3605 }
3606 3606 qp->rep_list_size++;
3607 3607
3608 3608 DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3609 3609 int, qp->rep_list_size);
3610 3610
3611 3611 qp->replylist = rep;
3612 3612 mutex_exit(&qp->replylist_lock);
3613 3613
3614 3614 return (rep);
3615 3615 }
3616 3616
3617 3617 static rdma_stat
3618 3618 rib_rem_replylist(rib_qp_t *qp)
3619 3619 {
3620 3620 struct reply *r, *n;
3621 3621
3622 3622 mutex_enter(&qp->replylist_lock);
3623 3623 for (r = qp->replylist; r != NULL; r = n) {
3624 3624 n = r->next;
3625 3625 (void) rib_remreply(qp, r);
3626 3626 }
3627 3627 mutex_exit(&qp->replylist_lock);
3628 3628
3629 3629 return (RDMA_SUCCESS);
3630 3630 }
3631 3631
3632 3632 static int
3633 3633 rib_remreply(rib_qp_t *qp, struct reply *rep)
3634 3634 {
3635 3635
3636 3636 ASSERT(MUTEX_HELD(&qp->replylist_lock));
3637 3637 if (rep->prev) {
3638 3638 rep->prev->next = rep->next;
3639 3639 }
3640 3640 if (rep->next) {
3641 3641 rep->next->prev = rep->prev;
3642 3642 }
3643 3643 if (qp->replylist == rep)
3644 3644 qp->replylist = rep->next;
3645 3645
3646 3646 cv_destroy(&rep->wait_cv);
3647 3647 qp->rep_list_size--;
3648 3648
3649 3649 DTRACE_PROBE1(rpcib__i__remreply__listsize,
3650 3650 int, qp->rep_list_size);
3651 3651
3652 3652 kmem_free(rep, sizeof (*rep));
3653 3653
3654 3654 return (0);
3655 3655 }
3656 3656
3657 3657 rdma_stat
3658 3658 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3659 3659 struct mrc *buf_handle)
3660 3660 {
3661 3661 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3662 3662 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3663 3663 rdma_stat status;
3664 3664 rib_hca_t *hca = (ctoqp(conn))->hca;
3665 3665
3666 3666 /*
3667 3667 * Note: ALL buffer pools use the same memory type RDMARW.
3668 3668 */
3669 3669 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3670 3670 if (status == RDMA_SUCCESS) {
3671 3671 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3672 3672 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3673 3673 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3674 3674 } else {
3675 3675 buf_handle->mrc_linfo = NULL;
3676 3676 buf_handle->mrc_lmr = 0;
3677 3677 buf_handle->mrc_rmr = 0;
3678 3678 }
3679 3679 return (status);
3680 3680 }
3681 3681
3682 3682 static rdma_stat
3683 3683 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3684 3684 ibt_mr_flags_t spec,
3685 3685 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3686 3686 {
3687 3687 ibt_mr_attr_t mem_attr;
3688 3688 ibt_status_t ibt_status;
3689 3689 mem_attr.mr_vaddr = (uintptr_t)buf;
3690 3690 mem_attr.mr_len = (ib_msglen_t)size;
3691 3691 mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3692 3692 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3693 3693 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3694 3694 IBT_MR_ENABLE_WINDOW_BIND | spec;
3695 3695
3696 3696 rw_enter(&hca->state_lock, RW_READER);
3697 3697 if (hca->state != HCA_DETACHED) {
3698 3698 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3699 3699 &mem_attr, mr_hdlp, mr_descp);
3700 3700 rw_exit(&hca->state_lock);
3701 3701 } else {
3702 3702 rw_exit(&hca->state_lock);
3703 3703 return (RDMA_FAILED);
3704 3704 }
3705 3705
3706 3706 if (ibt_status != IBT_SUCCESS) {
3707 3707 return (RDMA_FAILED);
3708 3708 }
3709 3709 return (RDMA_SUCCESS);
3710 3710 }
3711 3711
/*
 * rib_registermemsync()
 * Register memory for non-coherent (explicitly synced) use.  When a
 * long-reply cache entry 'lrc' is supplied and already registered, the
 * cached keys are returned without re-registering; otherwise the whole
 * cached buffer (or the raw buf/buflen) is registered and, for a cache
 * entry, the result is remembered for reuse.  *sync_handle receives the
 * MR handle for later ibt_sync_mr() calls.
 */
rdma_stat
rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	rib_lrc_entry_t *l;
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	/*
	 * Non-coherent memory registration.
	 */
	l = (rib_lrc_entry_t *)lrc;
	if (l) {
		if (l->registered) {
			/* Cache hit: hand back the previously saved keys. */
			buf_handle->mrc_linfo =
			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
			buf_handle->mrc_lmr =
			    (uint32_t)l->lrc_mhandle.mrc_lmr;
			buf_handle->mrc_rmr =
			    (uint32_t)l->lrc_mhandle.mrc_rmr;
			*sync_handle = (RIB_SYNCMEM_HANDLE)
			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
			return (RDMA_SUCCESS);
		} else {
			/* Always register the whole buffer */
			buf = (caddr_t)l->lrc_buf;
			buflen = l->lrc_len;
		}
	}
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);

	if (status == RDMA_SUCCESS) {
		if (l) {
			/* Remember the registration on the cache entry. */
			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
			l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
			l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
			l->registered = TRUE;
		}
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}
3763 3763
3764 3764 /* ARGSUSED */
3765 3765 rdma_stat
3766 3766 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3767 3767 {
3768 3768 rib_hca_t *hca = (ctoqp(conn))->hca;
3769 3769 /*
3770 3770 * Allow memory deregistration even if HCA is
3771 3771 * getting detached. Need all outstanding
3772 3772 * memory registrations to be deregistered
3773 3773 * before HCA_DETACH_EVENT can be accepted.
3774 3774 */
3775 3775 (void) ibt_deregister_mr(hca->hca_hdl,
3776 3776 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3777 3777 return (RDMA_SUCCESS);
3778 3778 }
3779 3779
3780 3780 /* ARGSUSED */
3781 3781 rdma_stat
3782 3782 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3783 3783 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3784 3784 {
3785 3785 rib_lrc_entry_t *l;
3786 3786 l = (rib_lrc_entry_t *)lrc;
3787 3787 if (l)
3788 3788 if (l->registered)
3789 3789 return (RDMA_SUCCESS);
3790 3790
3791 3791 (void) rib_deregistermem(conn, buf, buf_handle);
3792 3792
3793 3793 return (RDMA_SUCCESS);
3794 3794 }
3795 3795
3796 3796 /* ARGSUSED */
3797 3797 rdma_stat
3798 3798 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3799 3799 int len, int cpu)
3800 3800 {
3801 3801 ibt_status_t status;
3802 3802 rib_hca_t *hca = (ctoqp(conn))->hca;
3803 3803 ibt_mr_sync_t mr_segment;
3804 3804
3805 3805 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3806 3806 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3807 3807 mr_segment.ms_len = (ib_memlen_t)len;
3808 3808 if (cpu) {
3809 3809 /* make incoming data visible to memory */
3810 3810 mr_segment.ms_flags = IBT_SYNC_WRITE;
3811 3811 } else {
3812 3812 /* make memory changes visible to IO */
3813 3813 mr_segment.ms_flags = IBT_SYNC_READ;
3814 3814 }
3815 3815 rw_enter(&hca->state_lock, RW_READER);
3816 3816 if (hca->state != HCA_DETACHED) {
3817 3817 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3818 3818 rw_exit(&hca->state_lock);
3819 3819 } else {
3820 3820 rw_exit(&hca->state_lock);
3821 3821 return (RDMA_FAILED);
3822 3822 }
3823 3823
3824 3824 if (status == IBT_SUCCESS)
3825 3825 return (RDMA_SUCCESS);
3826 3826 else {
3827 3827 return (RDMA_FAILED);
3828 3828 }
3829 3829 }
3830 3830
3831 3831 /*
3832 3832 * XXXX ????
3833 3833 */
3834 3834 static rdma_stat
3835 3835 rib_getinfo(rdma_info_t *info)
3836 3836 {
3837 3837 /*
3838 3838 * XXXX Hack!
3839 3839 */
3840 3840 info->addrlen = 16;
3841 3841 info->mts = 1000000;
3842 3842 info->mtu = 1000000;
3843 3843
3844 3844 return (RDMA_SUCCESS);
3845 3845 }
3846 3846
/*
 * rib_rbufpool_create()
 * Allocate and register a pool of 'num' fixed-size buffers of the
 * given type (SEND_BUFFER or RECV_BUFFER) on 'hca'.  Each buffer is
 * registered as its own memory region.  Returns the pool, or NULL on
 * failure (any partially registered regions are deregistered first).
 */
rib_bufpool_t *
rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
{
	rib_bufpool_t	*rbp = NULL;
	bufpool_t	*bp = NULL;
	caddr_t		buf;
	ibt_mr_attr_t	mem_attr;
	ibt_status_t	ibt_status;
	int		i, j;

	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);

	/* bufpool_t is followed by the 'num'-entry buflist[] array. */
	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
	    num * sizeof (void *), KM_SLEEP);

	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
	bp->numelems = num;


	/* Buffer size and MR flags depend on the pool type. */
	switch (ptype) {
	case SEND_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		bp->rsize = RPC_MSG_SZ;
		break;
	case RECV_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		bp->rsize = RPC_BUF_SIZE;
		break;
	default:
		goto fail;
	}

	/*
	 * Register the pool.
	 */
	bp->bufsize = num * bp->rsize;
	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
	    sizeof (ibt_mr_desc_t), KM_SLEEP);
	rw_enter(&hca->state_lock, RW_READER);

	if (hca->state == HCA_DETACHED) {
		rw_exit(&hca->state_lock);
		goto fail;
	}

	/* Register one MR per buffer; unwind all on any failure. */
	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
		mem_attr.mr_vaddr = (uintptr_t)buf;
		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
		mem_attr.mr_as = NULL;
		ibt_status = ibt_register_mr(hca->hca_hdl,
		    hca->pd_hdl, &mem_attr,
		    &rbp->mr_hdl[i],
		    &rbp->mr_desc[i]);
		if (ibt_status != IBT_SUCCESS) {
			for (j = 0; j < i; j++) {
				(void) ibt_deregister_mr(hca->hca_hdl,
				    rbp->mr_hdl[j]);
			}
			rw_exit(&hca->state_lock);
			goto fail;
		}
	}
	rw_exit(&hca->state_lock);
	buf = (caddr_t)bp->buf;
	for (i = 0; i < num; i++, buf += bp->rsize) {
		bp->buflist[i] = (void *)buf;
	}
	bp->buffree = num - 1;	/* no. of free buffers */
	rbp->bpool = bp;

	return (rbp);
fail:
	if (bp) {
		if (bp->buf)
			kmem_free(bp->buf, bp->bufsize);
		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
	}
	if (rbp) {
		if (rbp->mr_hdl)
			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
		if (rbp->mr_desc)
			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
		kmem_free(rbp, sizeof (rib_bufpool_t));
	}
	return (NULL);
}
3937 3937
3938 3938 static void
3939 3939 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3940 3940 {
3941 3941 int i;
3942 3942 rib_bufpool_t *rbp = NULL;
3943 3943 bufpool_t *bp;
3944 3944
3945 3945 /*
3946 3946 * Obtain pool address based on type of pool
3947 3947 */
3948 3948 switch (ptype) {
3949 3949 case SEND_BUFFER:
3950 3950 rbp = hca->send_pool;
3951 3951 break;
3952 3952 case RECV_BUFFER:
3953 3953 rbp = hca->recv_pool;
3954 3954 break;
3955 3955 default:
3956 3956 return;
3957 3957 }
3958 3958 if (rbp == NULL)
3959 3959 return;
3960 3960
3961 3961 bp = rbp->bpool;
3962 3962
3963 3963 /*
3964 3964 * Deregister the pool memory and free it.
3965 3965 */
3966 3966 for (i = 0; i < bp->numelems; i++) {
3967 3967 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3968 3968 }
3969 3969 }
3970 3970
3971 3971 static void
3972 3972 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3973 3973 {
3974 3974
3975 3975 rib_bufpool_t *rbp = NULL;
3976 3976 bufpool_t *bp;
3977 3977
3978 3978 /*
3979 3979 * Obtain pool address based on type of pool
3980 3980 */
3981 3981 switch (ptype) {
3982 3982 case SEND_BUFFER:
3983 3983 rbp = hca->send_pool;
3984 3984 break;
3985 3985 case RECV_BUFFER:
3986 3986 rbp = hca->recv_pool;
3987 3987 break;
3988 3988 default:
3989 3989 return;
3990 3990 }
3991 3991 if (rbp == NULL)
3992 3992 return;
3993 3993
3994 3994 bp = rbp->bpool;
3995 3995
3996 3996 /*
3997 3997 * Free the pool memory.
3998 3998 */
3999 3999 if (rbp->mr_hdl)
4000 4000 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4001 4001
4002 4002 if (rbp->mr_desc)
4003 4003 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4004 4004 if (bp->buf)
4005 4005 kmem_free(bp->buf, bp->bufsize);
4006 4006 mutex_destroy(&bp->buflock);
4007 4007 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4008 4008 kmem_free(rbp, sizeof (rib_bufpool_t));
4009 4009 }
4010 4010
/*
 * Tear down a buffer pool of the given type on the given HCA:
 * first drop every buffer's HCA memory registration, then free the
 * pool's memory and bookkeeping structures.
 */
void
rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
{
	/*
	 * Deregister the pool memory and free it.
	 */
	rib_rbufpool_deregister(hca, ptype);
	rib_rbufpool_free(hca, ptype);
}
4020 4020
4021 4021 /*
4022 4022 * Fetch a buffer from the pool of type specified in rdbuf->type.
4023 4023 */
4024 4024 static rdma_stat
4025 4025 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4026 4026 {
4027 4027 rib_lrc_entry_t *rlep;
4028 4028
4029 4029 if (rdbuf->type == RDMA_LONG_BUFFER) {
4030 4030 rlep = rib_get_cache_buf(conn, rdbuf->len);
4031 4031 rdbuf->rb_private = (caddr_t)rlep;
4032 4032 rdbuf->addr = rlep->lrc_buf;
4033 4033 rdbuf->handle = rlep->lrc_mhandle;
4034 4034 return (RDMA_SUCCESS);
4035 4035 }
4036 4036
4037 4037 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4038 4038 if (rdbuf->addr) {
4039 4039 switch (rdbuf->type) {
4040 4040 case SEND_BUFFER:
4041 4041 rdbuf->len = RPC_MSG_SZ; /* 1K */
4042 4042 break;
4043 4043 case RECV_BUFFER:
4044 4044 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4045 4045 break;
4046 4046 default:
4047 4047 rdbuf->len = 0;
4048 4048 }
4049 4049 return (RDMA_SUCCESS);
4050 4050 } else
4051 4051 return (RDMA_FAILED);
4052 4052 }
4053 4053
4054 4054 /*
4055 4055 * Fetch a buffer of specified type.
4056 4056 * Note that rdbuf->handle is mw's rkey.
4057 4057 */
4058 4058 static void *
4059 4059 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4060 4060 {
4061 4061 rib_qp_t *qp = ctoqp(conn);
4062 4062 rib_hca_t *hca = qp->hca;
4063 4063 rdma_btype ptype = rdbuf->type;
4064 4064 void *buf;
4065 4065 rib_bufpool_t *rbp = NULL;
4066 4066 bufpool_t *bp;
4067 4067 int i;
4068 4068
4069 4069 /*
4070 4070 * Obtain pool address based on type of pool
4071 4071 */
4072 4072 switch (ptype) {
4073 4073 case SEND_BUFFER:
4074 4074 rbp = hca->send_pool;
4075 4075 break;
4076 4076 case RECV_BUFFER:
4077 4077 rbp = hca->recv_pool;
4078 4078 break;
4079 4079 default:
4080 4080 return (NULL);
4081 4081 }
4082 4082 if (rbp == NULL)
4083 4083 return (NULL);
4084 4084
4085 4085 bp = rbp->bpool;
4086 4086
4087 4087 mutex_enter(&bp->buflock);
4088 4088 if (bp->buffree < 0) {
4089 4089 mutex_exit(&bp->buflock);
4090 4090 return (NULL);
4091 4091 }
4092 4092
4093 4093 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4094 4094 buf = bp->buflist[bp->buffree];
4095 4095 rdbuf->addr = buf;
4096 4096 rdbuf->len = bp->rsize;
4097 4097 for (i = bp->numelems - 1; i >= 0; i--) {
4098 4098 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4099 4099 rdbuf->handle.mrc_rmr =
4100 4100 (uint32_t)rbp->mr_desc[i].md_rkey;
4101 4101 rdbuf->handle.mrc_linfo =
4102 4102 (uintptr_t)rbp->mr_hdl[i];
4103 4103 rdbuf->handle.mrc_lmr =
4104 4104 (uint32_t)rbp->mr_desc[i].md_lkey;
4105 4105 bp->buffree--;
4106 4106
4107 4107 mutex_exit(&bp->buflock);
4108 4108
4109 4109 return (buf);
4110 4110 }
4111 4111 }
4112 4112
4113 4113 mutex_exit(&bp->buflock);
4114 4114
4115 4115 return (NULL);
4116 4116 }
4117 4117
4118 4118 static void
4119 4119 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4120 4120 {
4121 4121
4122 4122 if (rdbuf->type == RDMA_LONG_BUFFER) {
4123 4123 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4124 4124 rdbuf->rb_private = NULL;
4125 4125 return;
4126 4126 }
4127 4127 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4128 4128 }
4129 4129
4130 4130 static void
4131 4131 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4132 4132 {
4133 4133 rib_qp_t *qp = ctoqp(conn);
4134 4134 rib_hca_t *hca = qp->hca;
4135 4135 rib_bufpool_t *rbp = NULL;
4136 4136 bufpool_t *bp;
4137 4137
4138 4138 /*
4139 4139 * Obtain pool address based on type of pool
4140 4140 */
4141 4141 switch (ptype) {
4142 4142 case SEND_BUFFER:
4143 4143 rbp = hca->send_pool;
4144 4144 break;
4145 4145 case RECV_BUFFER:
4146 4146 rbp = hca->recv_pool;
4147 4147 break;
4148 4148 default:
4149 4149 return;
4150 4150 }
4151 4151 if (rbp == NULL)
4152 4152 return;
4153 4153
4154 4154 bp = rbp->bpool;
4155 4155
4156 4156 mutex_enter(&bp->buflock);
4157 4157 if (++bp->buffree >= bp->numelems) {
4158 4158 /*
4159 4159 * Should never happen
4160 4160 */
4161 4161 bp->buffree--;
4162 4162 } else {
4163 4163 bp->buflist[bp->buffree] = buf;
4164 4164 }
4165 4165 mutex_exit(&bp->buflock);
4166 4166 }
4167 4167
4168 4168 static rdma_stat
4169 4169 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4170 4170 {
4171 4171 rw_enter(&connlist->conn_lock, RW_WRITER);
4172 4172 if (connlist->conn_hd) {
4173 4173 cn->c_next = connlist->conn_hd;
4174 4174 connlist->conn_hd->c_prev = cn;
4175 4175 }
4176 4176 connlist->conn_hd = cn;
4177 4177 rw_exit(&connlist->conn_lock);
4178 4178
4179 4179 return (RDMA_SUCCESS);
4180 4180 }
4181 4181
4182 4182 static rdma_stat
4183 4183 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4184 4184 {
4185 4185 rw_enter(&connlist->conn_lock, RW_WRITER);
4186 4186 if (cn->c_prev) {
4187 4187 cn->c_prev->c_next = cn->c_next;
4188 4188 }
4189 4189 if (cn->c_next) {
4190 4190 cn->c_next->c_prev = cn->c_prev;
4191 4191 }
4192 4192 if (connlist->conn_hd == cn)
4193 4193 connlist->conn_hd = cn->c_next;
4194 4194 rw_exit(&connlist->conn_lock);
4195 4195
4196 4196 return (RDMA_SUCCESS);
4197 4197 }
4198 4198
4199 4199 /* ARGSUSED */
4200 4200 static rdma_stat
4201 4201 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4202 4202 int addr_type, void *handle, CONN **conn)
4203 4203 {
4204 4204 rdma_stat status;
4205 4205 rpcib_ping_t rpt;
4206 4206
4207 4207 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4208 4208 return (status);
4209 4209 }
4210 4210
/*
 * rib_find_hca_connection
 *
 * if there is an existing connection to the specified address then
 * it will be returned in conn, otherwise conn will be set to NULL.
 * Also cleans up any connection that is in error state.
 *
 * Returns RDMA_SUCCESS with *conn held (c_ref bumped) when a usable
 * connection is found; RDMA_INTR / RDMA_TIMEDOUT when the wait on a
 * pending connection is interrupted or expires; RDMA_FAILED when no
 * matching connection exists on this HCA.
 */
static int
rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
    struct netbuf *d_svcaddr, CONN **conn)
{
	CONN *cn;
	clock_t cv_stat, timout;

	*conn = NULL;
again:
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	cn = hca->cl_conn_list.conn_hd;
	while (cn != NULL) {
		/*
		 * First, clear up any connection in the ERROR state
		 */
		mutex_enter(&cn->c_lock);
		if (cn->c_state == C_ERROR_CONN) {
			if (cn->c_ref == 0) {
				/*
				 * Remove connection from list and destroy it.
				 */
				cn->c_state = C_DISCONN_PEND;
				mutex_exit(&cn->c_lock);
				rw_exit(&hca->cl_conn_list.conn_lock);
				rib_conn_close((void *)cn);
				/* List may have changed; restart the scan. */
				goto again;
			}
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if (cn->c_state == C_DISCONN_PEND) {
			/* Being torn down by another thread; skip it. */
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}

		/*
		 * source address is only checked for if there is one,
		 * this is the case for retries.
		 */
		if ((cn->c_raddr.len == d_svcaddr->len) &&
		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
		    d_svcaddr->len) == 0) &&
		    ((s_svcaddr->len == 0) ||
		    ((cn->c_laddr.len == s_svcaddr->len) &&
		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
		    s_svcaddr->len) == 0)))) {
			/*
			 * Our connection. Give up conn list lock
			 * as we are done traversing the list.
			 */
			rw_exit(&hca->cl_conn_list.conn_lock);
			if (cn->c_state == C_CONNECTED) {
				cn->c_ref++;	/* sharing a conn */
				mutex_exit(&cn->c_lock);
				*conn = cn;
				return (RDMA_SUCCESS);
			}
			if (cn->c_state == C_CONN_PEND) {
				/*
				 * Hold a reference to this conn before
				 * we give up the lock.
				 */
				cn->c_ref++;
				timout = ddi_get_lbolt() +
				    drv_sectohz(CONN_WAIT_TIME);
				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
				    &cn->c_lock, timout)) > 0 &&
				    cn->c_state == C_CONN_PEND)
					;
				/* cv_stat == 0: interrupted by a signal. */
				if (cv_stat == 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_INTR);
				}
				/* cv_stat < 0: wait timed out. */
				if (cv_stat < 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
				if (cn->c_state == C_CONNECTED) {
					*conn = cn;
					mutex_exit(&cn->c_lock);
					return (RDMA_SUCCESS);
				} else {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
			}
		}
		mutex_exit(&cn->c_lock);
		cn = cn->c_next;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	*conn = NULL;
	return (RDMA_FAILED);
}
4314 4314
/*
 * Connection management.
 * IBTF does not support recycling of channels. So connections are only
 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
 * C_DISCONN_PEND state. No C_IDLE state.
 * C_CONN_PEND state: Connection establishment in progress to the server.
 * C_CONNECTED state: A connection when created is in C_CONNECTED state.
 * It has an RC channel associated with it. ibt_post_send/recv are allowed
 * only in this state.
 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
 * c_ref drops to 0 (this indicates that RPC has no more references to this
 * connection), the connection should be destroyed. A connection transitions
 * into this state when it is being destroyed.
 */
/* ARGSUSED */
static rdma_stat
rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
    int addr_type, rpcib_ping_t *rpt, CONN **conn)
{
	CONN *cn;
	int status;
	rib_hca_t *hca;
	rib_qp_t *qp;
	int s_addr_len;
	char *s_addr_buf;

	/*
	 * First try to reuse an existing connection on any attached HCA.
	 */
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state != HCA_DETACHED) {
			status = rib_find_hca_connection(hca, s_svcaddr,
			    d_svcaddr, conn);
			rw_exit(&hca->state_lock);
			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
				rw_exit(&rib_stat->hcas_list_lock);
				return (status);
			}
		} else
			rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * No existing connection found, establish a new connection.
	 */
	bzero(rpt, sizeof (rpcib_ping_t));

	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
	if (status != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	hca = rpt->hca;

	/* Pick up the local source address the ping resolved. */
	if (rpt->srcip.family == AF_INET) {
		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
	} else if (rpt->srcip.family == AF_INET6) {
		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
	} else {
		return (RDMA_FAILED);
	}

	/*
	 * Channel to server doesn't exist yet, create one.
	 */
	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	cn = qptoc(qp);
	cn->c_state = C_CONN_PEND;
	cn->c_ref = 1;

	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;

	/* Record the netid and an all-ones address mask for the family. */
	if (rpt->srcip.family == AF_INET) {
		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
		(void) strcpy(cn->c_netid, RIBNETID_TCP);

		cn->c_addrmask.len = cn->c_addrmask.maxlen =
		    sizeof (struct sockaddr_in);
		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);

		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
		    (uint32_t)~0;
		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
		    (ushort_t)~0;

	} else {
		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
		(void) strcpy(cn->c_netid, RIBNETID_TCP6);

		cn->c_addrmask.len = cn->c_addrmask.maxlen =
		    sizeof (struct sockaddr_in6);
		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);

		(void) memset(
		    &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
		    (uchar_t)~0, sizeof (struct in6_addr));
		((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
		    (sa_family_t)~0;
	}

	/*
	 * Add to conn list.
	 * We had given up the READER lock. In the time since then,
	 * another thread might have created the connection we are
	 * trying here. But for now, that is quiet alright - there
	 * might be two connections between a pair of hosts instead
	 * of one. If we really want to close that window,
	 * then need to check the list after acquiring the
	 * WRITER lock.
	 */
	(void) rib_add_connlist(cn, &hca->cl_conn_list);
	status = rib_conn_to_srv(hca, qp, rpt);
	mutex_enter(&cn->c_lock);

	if (cn->c_flags & C_CLOSE_PENDING) {
		/*
		 * This handles a case where the module or
		 * HCA detached in the time a connection is
		 * established. In such a case close the
		 * connection immediately if this is the
		 * only reference.
		 */
		if (cn->c_ref == 1) {
			cn->c_ref--;
			cn->c_state = C_DISCONN_PEND;
			mutex_exit(&cn->c_lock);
			rib_conn_close((void *)cn);
			return (RDMA_FAILED);
		}

		/*
		 * Connection to be closed later when c_ref = 0
		 */
		status = RDMA_FAILED;
	}

	if (status == RDMA_SUCCESS) {
		cn->c_state = C_CONNECTED;
		*conn = cn;
	} else {
		cn->c_state = C_ERROR_CONN;
		cn->c_ref--;
	}
	/* Wake any thread in rib_find_hca_connection() waiting on us. */
	cv_signal(&cn->c_cv);
	mutex_exit(&cn->c_lock);
	return (status);
}
4470 4470
/*
 * Close a connection's RC channel (if not already being closed by
 * another thread) and then disconnect it from its HCA's client or
 * server connection list.  Also runs as a taskq job dispatched from
 * rib_conn_timeout_call().
 */
static void
rib_conn_close(void *rarg)
{
	CONN *conn = (CONN *)rarg;
	rib_qp_t *qp = ctoqp(conn);

	mutex_enter(&conn->c_lock);
	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

		/*
		 * Mark the close in progress so no other thread starts
		 * a second close on this connection.
		 */
		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

		/*
		 * Live connection in CONNECTED state.
		 */
		if (conn->c_state == C_CONNECTED) {
			conn->c_state = C_ERROR_CONN;
		}
		mutex_exit(&conn->c_lock);

		/* c_lock is dropped across the blocking channel close. */
		rib_close_a_channel(conn);

		mutex_enter(&conn->c_lock);
		conn->c_flags &= ~C_CLOSE_PENDING;
	}

	mutex_exit(&conn->c_lock);

	if (qp->mode == RIB_SERVER)
		(void) rib_disconnect_channel(conn,
		    &qp->hca->srv_conn_list);
	else
		(void) rib_disconnect_channel(conn,
		    &qp->hca->cl_conn_list);
}
4505 4505
/*
 * timeout() callback that reaps an idle connection.  If the conn has
 * been re-referenced or is already being torn down, just clear the
 * pending timeout.  If there was activity since the timeout was
 * armed, re-arm it for the remaining idle interval; otherwise
 * dispatch the close to the HCA's cleanup taskq, retrying after
 * RDMA_CONN_REAP_RETRY seconds if the dispatch fails.
 */
static void
rib_conn_timeout_call(void *carg)
{
	time_t idle_time;
	CONN *conn = (CONN *)carg;
	rib_hca_t *hca = ctoqp(conn)->hca;
	int error;

	mutex_enter(&conn->c_lock);
	if ((conn->c_ref > 0) ||
	    (conn->c_state == C_DISCONN_PEND)) {
		conn->c_timeout = NULL;
		mutex_exit(&conn->c_lock);
		return;
	}

	idle_time = (gethrestime_sec() - conn->c_last_used);

	if ((idle_time <= rib_conn_timeout) &&
	    (conn->c_state != C_ERROR_CONN)) {
		/*
		 * There was activity after the last timeout.
		 * Extend the conn life. Unless the conn is
		 * already in error state.
		 */
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout - idle_time));
		mutex_exit(&conn->c_lock);
		return;
	}

	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
	    (void *)conn, DDI_NOSLEEP);

	/*
	 * If taskq dispatch fails above, then reset the timeout
	 * to try again after 10 secs.
	 */

	if (error != DDI_SUCCESS) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
		mutex_exit(&conn->c_lock);
		return;
	}

	/* Close dispatched; mark the conn as being torn down. */
	conn->c_state = C_DISCONN_PEND;
	mutex_exit(&conn->c_lock);
}
4555 4555
/*
 * Drop one reference on the connection.  Takes c_lock and delegates
 * to rib_conn_release_locked(), which releases the lock.
 */
static rdma_stat
rib_conn_release(CONN *conn)
{
	mutex_enter(&conn->c_lock);
	return (rib_conn_release_locked(conn));
}
4562 4562
/*
 * Expects conn->c_lock to be held on entry.
 * c_lock released on return
 *
 * Drops one reference.  When the count reaches zero, a connection in
 * C_ERROR_CONN state is closed immediately; otherwise an idle-reap
 * timeout is armed (if one is not already pending).
 */
static rdma_stat
rib_conn_release_locked(CONN *conn)
{
	conn->c_ref--;

	/* Remember the last use for the idle-reap heuristic. */
	conn->c_last_used = gethrestime_sec();
	if (conn->c_ref > 0) {
		mutex_exit(&conn->c_lock);
		return (RDMA_SUCCESS);
	}

	/*
	 * If a conn is C_ERROR_CONN, close the channel.
	 */
	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
		conn->c_state = C_DISCONN_PEND;
		mutex_exit(&conn->c_lock);
		rib_conn_close((void *)conn);
		return (RDMA_SUCCESS);
	}

	/*
	 * c_ref == 0, set a timeout for conn release
	 */

	if (conn->c_timeout == NULL) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout));
	}

	mutex_exit(&conn->c_lock);
	return (RDMA_SUCCESS);
}
4600 4600
4601 4601 /*
4602 4602 * Add at front of list
4603 4603 */
4604 4604 static struct rdma_done_list *
4605 4605 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4606 4606 {
4607 4607 struct rdma_done_list *rd;
4608 4608
4609 4609 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4610 4610
4611 4611 rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4612 4612 rd->xid = xid;
4613 4613 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4614 4614
4615 4615 rd->prev = NULL;
4616 4616 rd->next = qp->rdlist;
4617 4617 if (qp->rdlist != NULL)
4618 4618 qp->rdlist->prev = rd;
4619 4619 qp->rdlist = rd;
4620 4620
4621 4621 return (rd);
4622 4622 }
4623 4623
4624 4624 static void
4625 4625 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4626 4626 {
4627 4627 struct rdma_done_list *r;
4628 4628
4629 4629 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4630 4630
4631 4631 r = rd->next;
4632 4632 if (r != NULL) {
4633 4633 r->prev = rd->prev;
4634 4634 }
4635 4635
4636 4636 r = rd->prev;
4637 4637 if (r != NULL) {
4638 4638 r->next = rd->next;
4639 4639 } else {
4640 4640 qp->rdlist = rd->next;
4641 4641 }
4642 4642
4643 4643 cv_destroy(&rd->rdma_done_cv);
4644 4644 kmem_free(rd, sizeof (*rd));
4645 4645 }
4646 4646
4647 4647 static void
4648 4648 rdma_done_rem_list(rib_qp_t *qp)
4649 4649 {
4650 4650 struct rdma_done_list *r, *n;
4651 4651
4652 4652 mutex_enter(&qp->rdlist_lock);
4653 4653 for (r = qp->rdlist; r != NULL; r = n) {
4654 4654 n = r->next;
4655 4655 rdma_done_rm(qp, r);
4656 4656 }
4657 4657 mutex_exit(&qp->rdlist_lock);
4658 4658 }
4659 4659
4660 4660 static void
4661 4661 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4662 4662 {
4663 4663 struct rdma_done_list *r = qp->rdlist;
4664 4664
4665 4665 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4666 4666
4667 4667 while (r) {
4668 4668 if (r->xid == xid) {
4669 4669 cv_signal(&r->rdma_done_cv);
4670 4670 return;
4671 4671 } else {
4672 4672 r = r->next;
4673 4673 }
4674 4674 }
4675 4675 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4676 4676 int, xid);
4677 4677 }
4678 4678
4679 4679 /*
4680 4680 * Expects conn->c_lock to be held by the caller.
4681 4681 */
4682 4682
4683 4683 static void
4684 4684 rib_close_a_channel(CONN *conn)
4685 4685 {
4686 4686 rib_qp_t *qp;
4687 4687 qp = ctoqp(conn);
4688 4688
4689 4689 if (qp->qp_hdl == NULL) {
4690 4690 /* channel already freed */
4691 4691 return;
4692 4692 }
4693 4693
4694 4694 /*
4695 4695 * Call ibt_close_rc_channel in blocking mode
4696 4696 * with no callbacks.
4697 4697 */
4698 4698 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4699 4699 NULL, 0, NULL, NULL, 0);
4700 4700 }
4701 4701
/*
 * Goes through all connections and closes the channel
 * This will cause all the WRs on those channels to be
 * flushed.
 */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
	CONN *conn, *tmp;

	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);
		/* c_next is sampled before c_lock is dropped below. */
		tmp = conn->c_next;
		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

			if (conn->c_state == C_CONN_PEND) {
				/*
				 * Still connecting: flag the close so
				 * the connecting thread handles it.
				 */
				conn->c_flags |= C_CLOSE_PENDING;
				goto next;
			}

			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

			/*
			 * Live connection in CONNECTED state.
			 */
			if (conn->c_state == C_CONNECTED)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);

			rib_close_a_channel(conn);

			mutex_enter(&conn->c_lock);
			conn->c_flags &= ~C_CLOSE_PENDING;
			/* Signal a pending rib_disconnect_channel() */
			cv_signal(&conn->c_cv);
		}
next:
		mutex_exit(&conn->c_lock);
		conn = tmp;
	}
	rw_exit(&connlist->conn_lock);
}
4746 4746
/*
 * Frees up all connections that are no longer being referenced
 * (c_ref == 0 and not already in C_DISCONN_PEND).  The scan restarts
 * from the head each time a connection is culled, since the list
 * lock is dropped across rib_disconnect_channel().
 */
static void
rib_purge_connlist(rib_conn_list_t *connlist)
{
	CONN *conn;

top:
	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);

		/*
		 * At this point connection is either in ERROR
		 * or DISCONN_PEND state. If in DISCONN_PEND state
		 * then some other thread is culling that connection.
		 * If not and if c_ref is 0, then destroy the connection.
		 */
		if (conn->c_ref == 0 &&
		    conn->c_state != C_DISCONN_PEND) {
			/*
			 * Cull the connection
			 */
			conn->c_state = C_DISCONN_PEND;
			mutex_exit(&conn->c_lock);
			rw_exit(&connlist->conn_lock);
			(void) rib_disconnect_channel(conn, connlist);
			goto top;
		} else {
			/*
			 * conn disconnect already scheduled or will
			 * happen from conn_release when c_ref drops to 0.
			 */
			mutex_exit(&conn->c_lock);
		}
		conn = conn->c_next;
	}
	rw_exit(&connlist->conn_lock);

	/*
	 * At this point, only connections with c_ref != 0 are on the list
	 */
}
4792 4792
/*
 * Free all the HCA resources and close
 * the hca.
 */

static void
rib_free_hca(rib_hca_t *hca)
{
	/* Free the client and server send/recv completion queues. */
	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);

	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));

	/*
	 * Buffer pools are torn down before the protection domain
	 * they were registered under is freed.
	 */
	rib_rbufpool_destroy(hca, RECV_BUFFER);
	rib_rbufpool_destroy(hca, SEND_BUFFER);
	rib_destroy_cache(hca);
	/* Last HCA gone: withdraw the plugin from RDMATF. */
	if (rib_mod.rdma_count == 0)
		(void) rdma_unregister_mod(&rib_mod);
	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
	(void) ibt_close_hca(hca->hca_hdl);
	hca->hca_hdl = NULL;
}
4820 4820
4821 4821
/*
 * Quiesce and release everything a single HCA is using: bound
 * services, client and server channels, connection lists, and --
 * once no connections and no in-flight callbacks remain -- the HCA
 * resources themselves via rib_free_hca().
 */
static void
rib_stop_hca_services(rib_hca_t *hca)
{
	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	/* Remove the cache kstat once the last HCA is gone. */
	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
		stats_enabled = FALSE;
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_free_hca(hca);
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		/* Wait out any in-flight callbacks before the final free. */
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rib_free_hca(hca);
	}
	rw_destroy(&hca->bound_services_lock);

	if (hca->cleanup_helper != NULL) {
		ddi_taskq_destroy(hca->cleanup_helper);
		hca->cleanup_helper = NULL;
	}
}
4866 4866
/*
 * Cleans and closes up all uses of the HCA
 */
static void
rib_detach_hca(ibt_hca_hdl_t hca_hdl)
{
	rib_hca_t *hca = NULL;
	rib_hca_t **hcap;

	/* Find the matching rib_hca_t on the global list. */
	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
		hca = *hcap;
		rw_enter(&hca->state_lock, RW_WRITER);
		if (hca->hca_hdl == hca_hdl) {
			/*
			 * Mark as detached and remove from
			 * hca list.
			 */
			hca->state = HCA_DETACHED;
			*hcap = hca->next;
			rib_stat->nhca_inited--;
			rib_mod.rdma_count--;
			rw_exit(&hca->state_lock);
			break;
		}
		rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	if (hca == NULL)
		return;
	ASSERT(hca->hca_hdl == hca_hdl);

	/*
	 * Stop all services on the HCA
	 * Go through cl_conn_list and close all rc_channels
	 * Go through svr_conn_list and close all rc_channels
	 * Free connections whose c_ref has dropped to 0
	 * Destroy all CQs
	 * Deregister and released all buffer pool memory after all
	 * connections are destroyed
	 * Free the protection domain
	 * ibt_close_hca()
	 */
	rib_stop_hca_services(hca);

	kmem_free(hca, sizeof (*hca));
}
4915 4915
/*
 * kmem-cache reclaim callback for the server-side buffer cache.
 * Under memory pressure this tears down the entire AVL tree: every
 * cached buffer is deregistered (if registered), freed, and its tree
 * node destroyed.  Runs with avl_rw_lock held as WRITER throughout.
 */
static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t    *rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		/* Drain the circular buffer queue hanging off this node. */
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			/*
			 * NOTE(review): cache_allocation is updated here
			 * without cache_allocation_lock; presumably the
			 * WRITER hold on avl_rw_lock is considered
			 * sufficient protection -- confirm.
			 */
			hca->cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		/* Advance to the next tree node, unlinking it first. */
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
4949 4949
/*
 * Taskq callback (dispatched by rib_force_cleanup()) that trims the
 * server-side buffer cache back under cache_limit.  Starting from the
 * largest node (avl_last), it frees whole size-class nodes until the
 * total allocation drops below the limit or the tree is empty.
 */
static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t    *rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	/* Nothing to do unless the cache has grown past the limit. */
	mutex_enter(&hca->cache_allocation_lock);
	if (hca->cache_allocation < cache_limit) {
		mutex_exit(&hca->cache_allocation_lock);
		return;
	}
	mutex_exit(&hca->cache_allocation_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	/* Begin with the largest size class. */
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		/* Free every buffer queued on this node. */
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			/*
			 * NOTE(review): cache_allocation read/modified
			 * without cache_allocation_lock; the WRITER hold
			 * on avl_rw_lock is presumably the protection
			 * here -- confirm.
			 */
			hca->cache_allocation -= rb->lrc_len;

			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}

		/* Stop as soon as we are back under the limit. */
		if (hca->cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
4999 4999
5000 5000 static int
5001 5001 avl_compare(const void *t1, const void *t2)
5002 5002 {
5003 5003 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5004 5004 return (0);
5005 5005
5006 5006 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5007 5007 return (-1);
5008 5008
5009 5009 return (1);
5010 5010 }
5011 5011
5012 5012 static void
5013 5013 rib_destroy_cache(rib_hca_t *hca)
5014 5014 {
5015 5015 if (hca->avl_init) {
5016 5016 rib_server_side_cache_reclaim((void *)hca);
5017 5017 if (hca->server_side_cache) {
5018 5018 kmem_cache_destroy(hca->server_side_cache);
5019 5019 hca->server_side_cache = NULL;
5020 5020 }
5021 5021 avl_destroy(&hca->avl_tree);
5022 5022 mutex_destroy(&hca->cache_allocation_lock);
5023 5023 rw_destroy(&hca->avl_rw_lock);
5024 5024 }
5025 5025 hca->avl_init = FALSE;
5026 5026 }
5027 5027
5028 5028 static void
5029 5029 rib_force_cleanup(void *hca)
5030 5030 {
5031 5031 if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5032 5032 (void) ddi_taskq_dispatch(
5033 5033 ((rib_hca_t *)hca)->cleanup_helper,
5034 5034 rib_server_side_cache_cleanup,
5035 5035 (void *)hca, DDI_NOSLEEP);
5036 5036 }
5037 5037
5038 5038 static rib_lrc_entry_t *
5039 5039 rib_get_cache_buf(CONN *conn, uint32_t len)
5040 5040 {
5041 5041 cache_avl_struct_t cas, *rcas;
5042 5042 rib_hca_t *hca = (ctoqp(conn))->hca;
5043 5043 rib_lrc_entry_t *reply_buf;
5044 5044 avl_index_t where = NULL;
5045 5045 uint64_t c_alloc = 0;
5046 5046
5047 5047 if (!hca->avl_init)
5048 5048 goto error_alloc;
5049 5049
5050 5050 cas.len = len;
5051 5051
5052 5052 rw_enter(&hca->avl_rw_lock, RW_READER);
5053 5053
5054 5054 mutex_enter(&hca->cache_allocation_lock);
5055 5055 c_alloc = hca->cache_allocation;
5056 5056 mutex_exit(&hca->cache_allocation_lock);
5057 5057
5058 5058 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5059 5059 &where)) == NULL) {
5060 5060 /* Am I above the cache limit */
5061 5061 if ((c_alloc + len) >= cache_limit) {
5062 5062 rib_force_cleanup((void *)hca);
5063 5063 rw_exit(&hca->avl_rw_lock);
5064 5064 mutex_enter(&hca->cache_allocation_lock);
5065 5065 hca->cache_misses_above_the_limit ++;
5066 5066 mutex_exit(&hca->cache_allocation_lock);
5067 5067
5068 5068 /* Allocate and register the buffer directly */
5069 5069 goto error_alloc;
5070 5070 }
5071 5071
5072 5072 rw_exit(&hca->avl_rw_lock);
5073 5073 rw_enter(&hca->avl_rw_lock, RW_WRITER);
5074 5074
5075 5075 /* Recheck to make sure no other thread added the entry in */
5076 5076 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5077 5077 &cas, &where)) == NULL) {
5078 5078 /* Allocate an avl tree entry */
5079 5079 rcas = (cache_avl_struct_t *)
5080 5080 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5081 5081
5082 5082 bzero(rcas, sizeof (cache_avl_struct_t));
5083 5083 rcas->elements = 0;
5084 5084 rcas->r.forw = &rcas->r;
5085 5085 rcas->r.back = &rcas->r;
5086 5086 rcas->len = len;
5087 5087 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5088 5088 avl_insert(&hca->avl_tree, rcas, where);
5089 5089 }
5090 5090 }
5091 5091
5092 5092 mutex_enter(&rcas->node_lock);
5093 5093
5094 5094 if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5095 5095 reply_buf = rcas->r.forw;
5096 5096 remque(reply_buf);
5097 5097 rcas->elements--;
5098 5098 mutex_exit(&rcas->node_lock);
5099 5099 rw_exit(&hca->avl_rw_lock);
5100 5100
5101 5101 mutex_enter(&hca->cache_allocation_lock);
5102 5102 hca->cache_hits++;
5103 5103 hca->cache_allocation -= len;
5104 5104 mutex_exit(&hca->cache_allocation_lock);
5105 5105 } else {
5106 5106 /* Am I above the cache limit */
5107 5107 mutex_exit(&rcas->node_lock);
5108 5108 if ((c_alloc + len) >= cache_limit) {
5109 5109 rib_force_cleanup((void *)hca);
5110 5110 rw_exit(&hca->avl_rw_lock);
5111 5111
5112 5112 mutex_enter(&hca->cache_allocation_lock);
5113 5113 hca->cache_misses_above_the_limit++;
5114 5114 mutex_exit(&hca->cache_allocation_lock);
5115 5115 /* Allocate and register the buffer directly */
5116 5116 goto error_alloc;
5117 5117 }
5118 5118 rw_exit(&hca->avl_rw_lock);
5119 5119 mutex_enter(&hca->cache_allocation_lock);
5120 5120 hca->cache_misses++;
5121 5121 mutex_exit(&hca->cache_allocation_lock);
5122 5122 /* Allocate a reply_buf entry */
5123 5123 reply_buf = (rib_lrc_entry_t *)
5124 5124 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5125 5125 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5126 5126 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5127 5127 reply_buf->lrc_len = len;
5128 5128 reply_buf->registered = FALSE;
5129 5129 reply_buf->avl_node = (void *)rcas;
5130 5130 }
5131 5131
5132 5132 return (reply_buf);
5133 5133
5134 5134 error_alloc:
5135 5135 reply_buf = (rib_lrc_entry_t *)
5136 5136 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5137 5137 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5138 5138 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5139 5139 reply_buf->lrc_len = len;
5140 5140 reply_buf->registered = FALSE;
5141 5141 reply_buf->avl_node = NULL;
5142 5142
5143 5143 return (reply_buf);
5144 5144 }
5145 5145
5146 5146 /*
5147 5147 * Return a pre-registered back to the cache (without
5148 5148 * unregistering the buffer)..
5149 5149 */
5150 5150
5151 5151 static void
5152 5152 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5153 5153 {
5154 5154 cache_avl_struct_t cas, *rcas;
5155 5155 avl_index_t where = NULL;
5156 5156 rib_hca_t *hca = (ctoqp(conn))->hca;
5157 5157
5158 5158 if (!hca->avl_init)
5159 5159 goto error_free;
5160 5160
5161 5161 cas.len = reg_buf->lrc_len;
5162 5162 rw_enter(&hca->avl_rw_lock, RW_READER);
5163 5163 if ((rcas = (cache_avl_struct_t *)
5164 5164 avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5165 5165 rw_exit(&hca->avl_rw_lock);
5166 5166 goto error_free;
5167 5167 } else {
5168 5168 cas.len = reg_buf->lrc_len;
5169 5169 mutex_enter(&rcas->node_lock);
5170 5170 insque(reg_buf, &rcas->r);
5171 5171 rcas->elements ++;
5172 5172 mutex_exit(&rcas->node_lock);
5173 5173 rw_exit(&hca->avl_rw_lock);
5174 5174 mutex_enter(&hca->cache_allocation_lock);
5175 5175 hca->cache_allocation += cas.len;
5176 5176 mutex_exit(&hca->cache_allocation_lock);
5177 5177 }
5178 5178
5179 5179 return;
5180 5180
5181 5181 error_free:
5182 5182
5183 5183 if (reg_buf->registered)
5184 5184 (void) rib_deregistermem_via_hca(hca,
5185 5185 reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5186 5186 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5187 5187 kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5188 5188 }
5189 5189
5190 5190 static rdma_stat
5191 5191 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5192 5192 uint_t buflen, struct mrc *buf_handle)
5193 5193 {
5194 5194 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
5195 5195 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
5196 5196 rdma_stat status;
5197 5197
5198 5198
5199 5199 /*
5200 5200 * Note: ALL buffer pools use the same memory type RDMARW.
5201 5201 */
5202 5202 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5203 5203 if (status == RDMA_SUCCESS) {
5204 5204 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5205 5205 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5206 5206 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5207 5207 } else {
5208 5208 buf_handle->mrc_linfo = NULL;
5209 5209 buf_handle->mrc_lmr = 0;
5210 5210 buf_handle->mrc_rmr = 0;
5211 5211 }
5212 5212 return (status);
5213 5213 }
5214 5214
5215 5215 /* ARGSUSED */
5216 5216 static rdma_stat
5217 5217 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5218 5218 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5219 5219 {
5220 5220
5221 5221 (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5222 5222 return (RDMA_SUCCESS);
5223 5223 }
5224 5224
5225 5225 /* ARGSUSED */
5226 5226 static rdma_stat
5227 5227 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5228 5228 {
5229 5229
5230 5230 (void) ibt_deregister_mr(hca->hca_hdl,
5231 5231 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5232 5232 return (RDMA_SUCCESS);
5233 5233 }
5234 5234
5235 5235 /*
5236 5236 * Check if the IP interface named by `lifrp' is RDMA-capable.
5237 5237 */
5238 5238 static boolean_t
5239 5239 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5240 5240 {
5241 5241 char ifname[LIFNAMSIZ];
5242 5242 char *cp;
5243 5243
5244 5244 if (lifrp->lifr_type == IFT_IB)
5245 5245 return (B_TRUE);
5246 5246
5247 5247 /*
5248 5248 * Strip off the logical interface portion before getting
5249 5249 * intimate with the name.
5250 5250 */
5251 5251 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5252 5252 if ((cp = strchr(ifname, ':')) != NULL)
5253 5253 *cp = '\0';
5254 5254
5255 5255 return (strcmp("lo0", ifname) == 0);
5256 5256 }
5257 5257
5258 5258 static int
5259 5259 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5260 5260 {
5261 5261 vnode_t *kkvp, *vp;
5262 5262 TIUSER *tiptr;
5263 5263 struct strioctl iocb;
5264 5264 k_sigset_t smask;
5265 5265 int err = 0;
5266 5266
5267 5267 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5268 5268 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5269 5269 &tiptr, CRED()) == 0) {
5270 5270 vp = tiptr->fp->f_vnode;
5271 5271 } else {
5272 5272 VN_RELE(kkvp);
5273 5273 return (EPROTO);
5274 5274 }
5275 5275 } else {
5276 5276 return (EPROTO);
5277 5277 }
5278 5278
5279 5279 iocb.ic_cmd = cmd;
5280 5280 iocb.ic_timout = 0;
5281 5281 iocb.ic_len = len;
5282 5282 iocb.ic_dp = (caddr_t)arg;
5283 5283 sigintr(&smask, 0);
5284 5284 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5285 5285 sigunintr(&smask);
5286 5286 (void) t_kclose(tiptr, 0);
5287 5287 VN_RELE(kkvp);
5288 5288 return (err);
5289 5289 }
5290 5290
5291 5291 /*
5292 5292 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5293 5293 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5294 5294 */
5295 5295 static int
5296 5296 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5297 5297 {
5298 5298 int err;
5299 5299 struct lifnum lifn;
5300 5300
5301 5301 bzero(&lifn, sizeof (struct lifnum));
5302 5302 lifn.lifn_family = AF_UNSPEC;
5303 5303
5304 5304 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5305 5305 if (err != 0)
5306 5306 return (err);
5307 5307
5308 5308 /*
5309 5309 * Pad the interface count to account for additional interfaces that
5310 5310 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5311 5311 */
5312 5312 lifn.lifn_count += 4;
5313 5313
5314 5314 bzero(lifcp, sizeof (struct lifconf));
5315 5315 lifcp->lifc_family = AF_UNSPEC;
5316 5316 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5317 5317 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5318 5318
5319 5319 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5320 5320 if (err != 0) {
5321 5321 kmem_free(lifcp->lifc_buf, *bufsizep);
5322 5322 return (err);
5323 5323 }
5324 5324 return (0);
5325 5325 }
5326 5326
5327 5327 static boolean_t
5328 5328 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5329 5329 {
5330 5330 uint_t i, nifs;
5331 5331 uint_t bufsize;
5332 5332 struct lifconf lifc;
5333 5333 struct lifreq *lifrp;
5334 5334 struct sockaddr_in *sinp;
5335 5335 struct sockaddr_in6 *sin6p;
5336 5336
5337 5337 bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5338 5338 bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5339 5339
5340 5340 if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5341 5341 return (B_FALSE);
5342 5342
5343 5343 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5344 5344 kmem_free(lifc.lifc_buf, bufsize);
5345 5345 return (B_FALSE);
5346 5346 }
5347 5347
5348 5348 /*
5349 5349 * Worst case is that all of the addresses are IB-capable and have
5350 5350 * the same address family, so size our buffers accordingly.
5351 5351 */
5352 5352 addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5353 5353 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5354 5354 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5355 5355 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5356 5356
5357 5357 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5358 5358 if (!rpcib_rdma_capable_interface(lifrp))
5359 5359 continue;
5360 5360
5361 5361 if (lifrp->lifr_addr.ss_family == AF_INET) {
5362 5362 sinp = addrs4->ri_list;
5363 5363 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5364 5364 sizeof (struct sockaddr_in));
5365 5365 } else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5366 5366 sin6p = addrs6->ri_list;
5367 5367 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5368 5368 sizeof (struct sockaddr_in6));
5369 5369 }
5370 5370 }
5371 5371
5372 5372 kmem_free(lifc.lifc_buf, bufsize);
5373 5373 return (B_TRUE);
5374 5374 }
5375 5375
5376 5376 /* ARGSUSED */
5377 5377 static int
5378 5378 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5379 5379 {
5380 5380 rib_hca_t *hca;
5381 5381
5382 5382 if (KSTAT_WRITE == rw) {
5383 5383 return (EACCES);
5384 5384 }
5385 5385
5386 5386 rpcib_kstat.cache_limit.value.ui64 =
5387 5387 (uint64_t)cache_limit;
5388 5388 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5389 5389 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5390 5390 rpcib_kstat.cache_allocation.value.ui64 +=
5391 5391 (uint64_t)hca->cache_allocation;
5392 5392 rpcib_kstat.cache_hits.value.ui64 +=
5393 5393 (uint64_t)hca->cache_hits;
5394 5394 rpcib_kstat.cache_misses.value.ui64 +=
5395 5395 (uint64_t)hca->cache_misses;
5396 5396 rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5397 5397 (uint64_t)hca->cache_misses_above_the_limit;
5398 5398 }
5399 5399 rw_exit(&rib_stat->hcas_list_lock);
5400 5400 return (0);
5401 5401 }
↓ open down ↓ |
1107 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX