Print this page
XXXX introduce drv_sectohz
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c
+++ new/usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24 /*
25 25 * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
26 26 *
27 27 * This software is available to you under a choice of one of two
28 28 * licenses. You may choose to be licensed under the terms of the GNU
29 29 * General Public License (GPL) Version 2, available from the file
30 30 * COPYING in the main directory of this source tree, or the
31 31 * OpenIB.org BSD license below:
32 32 *
33 33 * Redistribution and use in source and binary forms, with or
34 34 * without modification, are permitted provided that the following
35 35 * conditions are met:
36 36 *
37 37 * - Redistributions of source code must retain the above
38 38 * copyright notice, this list of conditions and the following
39 39 * disclaimer.
40 40 *
41 41 * - Redistributions in binary form must reproduce the above
42 42 * copyright notice, this list of conditions and the following
43 43 * disclaimer in the documentation and/or other materials
44 44 * provided with the distribution.
45 45 *
46 46 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
47 47 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
48 48 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
49 49 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
50 50 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
51 51 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
52 52 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
53 53 * SOFTWARE.
54 54 *
55 55 */
56 56 /*
57 57 * Sun elects to include this software in Sun product
58 58 * under the OpenIB BSD license.
59 59 *
60 60 *
61 61 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
62 62 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64 64 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
65 65 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
66 66 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
67 67 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
68 68 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
69 69 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
70 70 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
71 71 * POSSIBILITY OF SUCH DAMAGE.
72 72 */
73 73
74 74 #include <sys/types.h>
75 75 #include <sys/ddi.h>
76 76 #include <sys/sunddi.h>
77 77 #include <sys/ib/clients/rds/rdsib_cm.h>
78 78 #include <sys/ib/clients/rds/rdsib_ib.h>
79 79 #include <sys/ib/clients/rds/rdsib_buf.h>
80 80 #include <sys/ib/clients/rds/rdsib_ep.h>
81 81 #include <sys/ib/clients/rds/rds_kstat.h>
82 82
83 83 static void rds_async_handler(void *clntp, ibt_hca_hdl_t hdl,
84 84 ibt_async_code_t code, ibt_async_event_t *event);
85 85
86 86 static struct ibt_clnt_modinfo_s rds_ib_modinfo = {
87 87 IBTI_V_CURR,
88 88 IBT_NETWORK,
89 89 rds_async_handler,
90 90 NULL,
91 91 "RDS"
92 92 };
93 93
94 94 /* performance tunables */
95 95 uint_t rds_no_interrupts = 0;
96 96 uint_t rds_poll_percent_full = 25;
97 97 uint_t rds_wc_signal = IBT_NEXT_SOLICITED;
98 98 uint_t rds_waittime_ms = 100; /* ms */
99 99
100 100 extern dev_info_t *rdsib_dev_info;
101 101 extern void rds_close_sessions();
102 102
/*
 * Clamp the global buffer-count tunables (MaxDataSendBuffers,
 * MaxDataRecvBuffers, MaxCtrlSendBuffers, MaxCtrlRecvBuffers, NDataRX)
 * so that no queue depth or registered-memory size exceeds what this
 * HCA reports in its attributes (hca_max_chan_sz, hca_max_cq_sz,
 * hca_max_memr_len).  Each violation is logged and the tunable is
 * lowered to the largest supported value.
 */
static void
rdsib_validate_chan_sizes(ibt_hca_attr_t *hattrp)
{
	/*
	 * The SQ size should not be more than that supported by the HCA.
	 * The data SQ also holds RDS_NUM_ACKS ACK WRs, so the ACKs are
	 * included in the limit check and subtracted when clamping.
	 */
	if (((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_chan_sz) ||
	    ((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_cq_sz)) {
		RDS_DPRINTF2("RDSIB", "MaxDataSendBuffers + %d is greater "
		    "than that supported by the HCA driver "
		    "(%d + %d > %d or %d), lowering it to a supported value.",
		    RDS_NUM_ACKS, MaxDataSendBuffers, RDS_NUM_ACKS,
		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);

		/* clamp against the smaller of the channel and CQ limits */
		MaxDataSendBuffers = (hattrp->hca_max_chan_sz >
		    hattrp->hca_max_cq_sz) ?
		    hattrp->hca_max_cq_sz - RDS_NUM_ACKS :
		    hattrp->hca_max_chan_sz - RDS_NUM_ACKS;
	}

	/* The RQ size should not be more than that supported by the HCA */
	if ((MaxDataRecvBuffers > hattrp->hca_max_chan_sz) ||
	    (MaxDataRecvBuffers > hattrp->hca_max_cq_sz)) {
		RDS_DPRINTF2("RDSIB", "MaxDataRecvBuffers is greater than that "
		    "supported by the HCA driver (%d > %d or %d), lowering it "
		    "to a supported value.", MaxDataRecvBuffers,
		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);

		MaxDataRecvBuffers = (hattrp->hca_max_chan_sz >
		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
		    hattrp->hca_max_chan_sz;
	}

	/* The SQ size should not be more than that supported by the HCA */
	if ((MaxCtrlSendBuffers > hattrp->hca_max_chan_sz) ||
	    (MaxCtrlSendBuffers > hattrp->hca_max_cq_sz)) {
		RDS_DPRINTF2("RDSIB", "MaxCtrlSendBuffers is greater than that "
		    "supported by the HCA driver (%d > %d or %d), lowering it "
		    "to a supported value.", MaxCtrlSendBuffers,
		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);

		MaxCtrlSendBuffers = (hattrp->hca_max_chan_sz >
		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
		    hattrp->hca_max_chan_sz;
	}

	/* The RQ size should not be more than that supported by the HCA */
	if ((MaxCtrlRecvBuffers > hattrp->hca_max_chan_sz) ||
	    (MaxCtrlRecvBuffers > hattrp->hca_max_cq_sz)) {
		RDS_DPRINTF2("RDSIB", "MaxCtrlRecvBuffers is greater than that "
		    "supported by the HCA driver (%d > %d or %d), lowering it "
		    "to a supported value.", MaxCtrlRecvBuffers,
		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);

		MaxCtrlRecvBuffers = (hattrp->hca_max_chan_sz >
		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
		    hattrp->hca_max_chan_sz;
	}

	/* The MaxRecvMemory should be less than that supported by the HCA */
	if ((NDataRX * RdsPktSize) > hattrp->hca_max_memr_len) {
		RDS_DPRINTF2("RDSIB", "MaxRecvMemory is greater than that "
		    "supported by the HCA driver (%d > %d), lowering it to %d",
		    NDataRX * RdsPktSize, hattrp->hca_max_memr_len,
		    hattrp->hca_max_memr_len);

		/* reduce the packet count so the total fits in one MR */
		NDataRX = hattrp->hca_max_memr_len/RdsPktSize;
	}
}
170 170
171 171 /* Return hcap, given the hca guid */
172 172 rds_hca_t *
173 173 rds_lkup_hca(ib_guid_t hca_guid)
174 174 {
175 175 rds_hca_t *hcap;
176 176
177 177 RDS_DPRINTF4("rds_lkup_hca", "Enter: statep: 0x%p "
178 178 "guid: %llx", rdsib_statep, hca_guid);
179 179
180 180 rw_enter(&rdsib_statep->rds_hca_lock, RW_READER);
181 181
182 182 hcap = rdsib_statep->rds_hcalistp;
183 183 while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
184 184 hcap = hcap->hca_nextp;
185 185 }
186 186
187 187 rw_exit(&rdsib_statep->rds_hca_lock);
188 188
189 189 RDS_DPRINTF4("rds_lkup_hca", "return");
190 190
191 191 return (hcap);
192 192 }
193 193
194 194 void rds_randomize_qps(rds_hca_t *hcap);
195 195
196 196 static rds_hca_t *
197 197 rdsib_init_hca(ib_guid_t hca_guid)
198 198 {
199 199 rds_hca_t *hcap;
200 200 boolean_t alloc = B_FALSE;
201 201 int ret;
202 202
203 203 RDS_DPRINTF2("rdsib_init_hca", "enter: HCA 0x%llx", hca_guid);
204 204
205 205 /* Do a HCA lookup */
206 206 hcap = rds_lkup_hca(hca_guid);
207 207
208 208 if (hcap != NULL && hcap->hca_hdl != NULL) {
209 209 /*
210 210 * This can happen if we get IBT_HCA_ATTACH_EVENT on an HCA
211 211 * that we have already opened. Just return NULL so that
212 212 * we'll not end up reinitializing the HCA again.
213 213 */
214 214 RDS_DPRINTF2("rdsib_init_hca", "HCA already initialized");
215 215 return (NULL);
216 216 }
217 217
218 218 if (hcap == NULL) {
219 219 RDS_DPRINTF2("rdsib_init_hca", "New HCA is added");
220 220 hcap = (rds_hca_t *)kmem_zalloc(sizeof (rds_hca_t), KM_SLEEP);
221 221 alloc = B_TRUE;
222 222 }
223 223
224 224 hcap->hca_guid = hca_guid;
225 225 ret = ibt_open_hca(rdsib_statep->rds_ibhdl, hca_guid,
226 226 &hcap->hca_hdl);
227 227 if (ret != IBT_SUCCESS) {
228 228 if (ret == IBT_HCA_IN_USE) {
229 229 RDS_DPRINTF2("rdsib_init_hca",
230 230 "ibt_open_hca: 0x%llx returned IBT_HCA_IN_USE",
231 231 hca_guid);
232 232 } else {
233 233 RDS_DPRINTF2("rdsib_init_hca",
234 234 "ibt_open_hca: 0x%llx failed: %d", hca_guid, ret);
235 235 }
236 236 if (alloc == B_TRUE) {
237 237 kmem_free(hcap, sizeof (rds_hca_t));
238 238 }
239 239 return (NULL);
240 240 }
241 241
242 242 ret = ibt_query_hca(hcap->hca_hdl, &hcap->hca_attr);
243 243 if (ret != IBT_SUCCESS) {
244 244 RDS_DPRINTF2("rdsib_init_hca",
245 245 "Query HCA: 0x%llx failed: %d", hca_guid, ret);
246 246 ret = ibt_close_hca(hcap->hca_hdl);
247 247 ASSERT(ret == IBT_SUCCESS);
248 248 if (alloc == B_TRUE) {
249 249 kmem_free(hcap, sizeof (rds_hca_t));
250 250 } else {
251 251 hcap->hca_hdl = NULL;
252 252 }
253 253 return (NULL);
254 254 }
255 255
256 256 ret = ibt_query_hca_ports(hcap->hca_hdl, 0,
257 257 &hcap->hca_pinfop, &hcap->hca_nports, &hcap->hca_pinfo_sz);
258 258 if (ret != IBT_SUCCESS) {
259 259 RDS_DPRINTF2("rdsib_init_hca",
260 260 "Query HCA 0x%llx ports failed: %d", hca_guid,
261 261 ret);
262 262 ret = ibt_close_hca(hcap->hca_hdl);
263 263 hcap->hca_hdl = NULL;
264 264 ASSERT(ret == IBT_SUCCESS);
265 265 if (alloc == B_TRUE) {
266 266 kmem_free(hcap, sizeof (rds_hca_t));
267 267 } else {
268 268 hcap->hca_hdl = NULL;
269 269 }
270 270 return (NULL);
271 271 }
272 272
273 273 /* Only one PD per HCA is allocated, so do it here */
274 274 ret = ibt_alloc_pd(hcap->hca_hdl, IBT_PD_NO_FLAGS,
275 275 &hcap->hca_pdhdl);
276 276 if (ret != IBT_SUCCESS) {
277 277 RDS_DPRINTF2("rdsib_init_hca",
278 278 "ibt_alloc_pd 0x%llx failed: %d", hca_guid, ret);
279 279 (void) ibt_free_portinfo(hcap->hca_pinfop,
280 280 hcap->hca_pinfo_sz);
281 281 ret = ibt_close_hca(hcap->hca_hdl);
282 282 ASSERT(ret == IBT_SUCCESS);
283 283 hcap->hca_hdl = NULL;
284 284 if (alloc == B_TRUE) {
285 285 kmem_free(hcap, sizeof (rds_hca_t));
286 286 } else {
287 287 hcap->hca_hdl = NULL;
288 288 }
289 289 return (NULL);
290 290 }
291 291
292 292 rdsib_validate_chan_sizes(&hcap->hca_attr);
293 293
294 294 /* To minimize stale connections after ungraceful reboots */
295 295 rds_randomize_qps(hcap);
296 296
297 297 rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
298 298 hcap->hca_state = RDS_HCA_STATE_OPEN;
299 299 if (alloc == B_TRUE) {
300 300 /* this is a new HCA, add it to the list */
301 301 rdsib_statep->rds_nhcas++;
302 302 hcap->hca_nextp = rdsib_statep->rds_hcalistp;
303 303 rdsib_statep->rds_hcalistp = hcap;
304 304 }
305 305 rw_exit(&rdsib_statep->rds_hca_lock);
306 306
307 307 RDS_DPRINTF2("rdsib_init_hca", "return: HCA 0x%llx", hca_guid);
308 308
309 309 return (hcap);
310 310 }
311 311
312 312 /*
313 313 * Called from attach
314 314 */
315 315 int
316 316 rdsib_initialize_ib()
317 317 {
318 318 ib_guid_t *guidp;
319 319 rds_hca_t *hcap;
320 320 uint_t ix, hcaix, nhcas;
321 321 int ret;
322 322
323 323 RDS_DPRINTF2("rdsib_initialize_ib", "enter: statep %p", rdsib_statep);
324 324
325 325 ASSERT(rdsib_statep != NULL);
326 326 if (rdsib_statep == NULL) {
327 327 RDS_DPRINTF1("rdsib_initialize_ib",
328 328 "RDS Statep not initialized");
329 329 return (-1);
330 330 }
331 331
332 332 /* How many hcas are there? */
333 333 nhcas = ibt_get_hca_list(&guidp);
334 334 if (nhcas == 0) {
335 335 RDS_DPRINTF2("rdsib_initialize_ib", "No IB HCAs Available");
336 336 return (-1);
337 337 }
338 338
339 339 RDS_DPRINTF3("rdsib_initialize_ib", "Number of HCAs: %d", nhcas);
340 340
341 341 /* Register with IBTF */
342 342 ret = ibt_attach(&rds_ib_modinfo, rdsib_dev_info, rdsib_statep,
343 343 &rdsib_statep->rds_ibhdl);
344 344 if (ret != IBT_SUCCESS) {
345 345 RDS_DPRINTF2("rdsib_initialize_ib", "ibt_attach failed: %d",
346 346 ret);
347 347 (void) ibt_free_hca_list(guidp, nhcas);
348 348 return (-1);
349 349 }
350 350
351 351 /*
352 352 * Open each HCA and gather its information. Don't care about HCAs
353 353 * that cannot be opened. It is OK as long as atleast one HCA can be
354 354 * opened.
355 355 * Initialize a HCA only if all the information is available.
356 356 */
357 357 for (ix = 0, hcaix = 0; ix < nhcas; ix++) {
358 358 RDS_DPRINTF3(LABEL, "Open HCA: 0x%llx", guidp[ix]);
359 359
360 360 hcap = rdsib_init_hca(guidp[ix]);
361 361 if (hcap != NULL) hcaix++;
362 362 }
363 363
364 364 /* free the HCA list, we are done with it */
365 365 (void) ibt_free_hca_list(guidp, nhcas);
366 366
367 367 if (hcaix == 0) {
368 368 /* Failed to Initialize even one HCA */
369 369 RDS_DPRINTF2("rdsib_initialize_ib", "No HCAs are initialized");
370 370 (void) ibt_detach(rdsib_statep->rds_ibhdl);
371 371 rdsib_statep->rds_ibhdl = NULL;
372 372 return (-1);
373 373 }
374 374
375 375 if (hcaix < nhcas) {
376 376 RDS_DPRINTF2("rdsib_open_ib", "HCAs %d/%d failed to initialize",
377 377 (nhcas - hcaix), nhcas);
378 378 }
379 379
380 380 RDS_DPRINTF2("rdsib_initialize_ib", "return: statep %p", rdsib_statep);
381 381
382 382 return (0);
383 383 }
384 384
/*
 * Called from detach.
 *
 * Tear down everything rdsib_initialize_ib() set up: close all RDS
 * sessions, release each HCA's resources (PD, port info, HCA handle),
 * free the HCA entries, and finally detach from IBTF.
 */
void
rdsib_deinitialize_ib()
{
	rds_hca_t *hcap, *nextp;
	int ret;

	RDS_DPRINTF2("rdsib_deinitialize_ib", "enter: statep %p", rdsib_statep);

	/* close and destroy all the sessions */
	rds_close_sessions(NULL);

	/*
	 * Release all HCA resources.  Detach the whole list under the
	 * writer lock first so no other thread can find these entries
	 * while they are torn down below, lock-free.
	 */
	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
	RDS_DPRINTF2("rdsib_deinitialize_ib", "HCA List: %p, NHCA: %d",
	    rdsib_statep->rds_hcalistp, rdsib_statep->rds_nhcas);
	hcap = rdsib_statep->rds_hcalistp;
	rdsib_statep->rds_hcalistp = NULL;
	rdsib_statep->rds_nhcas = 0;
	rw_exit(&rdsib_statep->rds_hca_lock);

	while (hcap != NULL) {
		nextp = hcap->hca_nextp;

		/* hca_hdl is NULL for entries whose open/init failed */
		if (hcap->hca_hdl != NULL) {
			ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
			ASSERT(ret == IBT_SUCCESS);

			(void) ibt_free_portinfo(hcap->hca_pinfop,
			    hcap->hca_pinfo_sz);

			ret = ibt_close_hca(hcap->hca_hdl);
			ASSERT(ret == IBT_SUCCESS);
		}

		kmem_free(hcap, sizeof (rds_hca_t));
		hcap = nextp;
	}

	/* Deregister with IBTF */
	if (rdsib_statep->rds_ibhdl != NULL) {
		(void) ibt_detach(rdsib_statep->rds_ibhdl);
		rdsib_statep->rds_ibhdl = NULL;
	}

	RDS_DPRINTF2("rdsib_deinitialize_ib", "return: statep %p",
	    rdsib_statep);
}
435 435
436 436 /*
437 437 * Called on open of first RDS socket
438 438 */
439 439 int
440 440 rdsib_open_ib()
441 441 {
442 442 int ret;
443 443
444 444 RDS_DPRINTF2("rdsib_open_ib", "enter: statep %p", rdsib_statep);
445 445
446 446 /* Enable incoming connection requests */
447 447 if (rdsib_statep->rds_srvhdl == NULL) {
448 448 rdsib_statep->rds_srvhdl =
449 449 rds_register_service(rdsib_statep->rds_ibhdl);
450 450 if (rdsib_statep->rds_srvhdl == NULL) {
451 451 RDS_DPRINTF2("rdsib_open_ib",
452 452 "Service registration failed");
453 453 return (-1);
454 454 } else {
455 455 /* bind the service on all available ports */
456 456 ret = rds_bind_service(rdsib_statep);
457 457 if (ret != 0) {
458 458 RDS_DPRINTF2("rdsib_open_ib",
459 459 "Bind service failed: %d", ret);
460 460 }
461 461 }
462 462 }
463 463
464 464 RDS_DPRINTF2("rdsib_open_ib", "return: statep %p", rdsib_statep);
465 465
466 466 return (0);
467 467 }
468 468
469 469 /*
470 470 * Called when all ports are closed.
471 471 */
472 472 void
473 473 rdsib_close_ib()
474 474 {
475 475 int ret;
476 476
477 477 RDS_DPRINTF2("rdsib_close_ib", "enter: statep %p", rdsib_statep);
478 478
479 479 /* Disable incoming connection requests */
480 480 if (rdsib_statep->rds_srvhdl != NULL) {
481 481 ret = ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
482 482 if (ret != 0) {
483 483 RDS_DPRINTF2("rdsib_close_ib",
484 484 "ibt_unbind_all_services failed: %d\n", ret);
485 485 }
486 486 ret = ibt_deregister_service(rdsib_statep->rds_ibhdl,
487 487 rdsib_statep->rds_srvhdl);
488 488 if (ret != 0) {
489 489 RDS_DPRINTF2("rdsib_close_ib",
490 490 "ibt_deregister_service failed: %d\n", ret);
491 491 } else {
492 492 rdsib_statep->rds_srvhdl = NULL;
493 493 }
494 494 }
495 495
496 496 RDS_DPRINTF2("rdsib_close_ib", "return: statep %p", rdsib_statep);
497 497 }
498 498
499 499 /* Return hcap, given the hca guid */
500 500 rds_hca_t *
501 501 rds_get_hcap(rds_state_t *statep, ib_guid_t hca_guid)
502 502 {
503 503 rds_hca_t *hcap;
504 504
505 505 RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
506 506 "guid: %llx", statep, hca_guid);
507 507
508 508 rw_enter(&statep->rds_hca_lock, RW_READER);
509 509
510 510 hcap = statep->rds_hcalistp;
511 511 while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
512 512 hcap = hcap->hca_nextp;
513 513 }
514 514
515 515 /*
516 516 * don't let anyone use this HCA until the RECV memory
517 517 * is registered with this HCA
518 518 */
519 519 if ((hcap != NULL) &&
520 520 (hcap->hca_state == RDS_HCA_STATE_MEM_REGISTERED)) {
521 521 ASSERT(hcap->hca_mrhdl != NULL);
522 522 rw_exit(&statep->rds_hca_lock);
523 523 return (hcap);
524 524 }
525 525
526 526 RDS_DPRINTF2("rds_get_hcap",
527 527 "HCA (0x%p, 0x%llx) is not initialized", hcap, hca_guid);
528 528 rw_exit(&statep->rds_hca_lock);
529 529
530 530 RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");
531 531
532 532 return (NULL);
533 533 }
534 534
/*
 * Return hcap, given a gid.
 *
 * Scan every HCA on statep's list and, for each HCA whose RECV memory
 * is registered, compare the first sgid-table entry of each port
 * against the supplied gid (both prefix and guid must match).
 * Returns the owning hcap, or NULL if no usable HCA has that gid.
 *
 * NOTE(review): the returned pointer is used after rds_hca_lock is
 * dropped; this relies on HCA entries not being freed while sessions
 * are active (list teardown happens in rdsib_deinitialize_ib after
 * rds_close_sessions) — confirm.
 */
rds_hca_t *
rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
{
	rds_hca_t *hcap;
	uint_t ix;

	RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
	    statep, gid.gid_prefix, gid.gid_guid);

	rw_enter(&statep->rds_hca_lock, RW_READER);

	hcap = statep->rds_hcalistp;
	while (hcap != NULL) {

		/*
		 * don't let anyone use this HCA until the RECV memory
		 * is registered with this HCA
		 */
		if (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED) {
			RDS_DPRINTF3("rds_gid_to_hcap",
			    "HCA (0x%p, 0x%llx) is not initialized",
			    hcap, gid.gid_guid);
			hcap = hcap->hca_nextp;
			continue;
		}

		/* only sgid-table entry 0 of each port is considered */
		for (ix = 0; ix < hcap->hca_nports; ix++) {
			if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
			    gid.gid_prefix) &&
			    (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
			    gid.gid_guid)) {
				RDS_DPRINTF4("rds_gid_to_hcap",
				    "gid found in hcap: 0x%p", hcap);
				rw_exit(&statep->rds_hca_lock);
				return (hcap);
			}
		}
		hcap = hcap->hca_nextp;
	}

	rw_exit(&statep->rds_hca_lock);

	return (NULL);
}
580 580
/*
 * This is called from the send CQ handler on completion of a
 * previously posted ACK.  If more messages were received since that
 * ACK was built — i.e. the latest received buffer id (ep_rbufid)
 * differs from the id stored in the ACK payload at ep_ackds.ds_va —
 * the payload is rewritten with the newest id and another ACK is
 * posted.  Otherwise ep_rdmacnt is decremented: nothing more to ACK.
 *
 * NOTE(review): this assumes at most one ACK WR is in flight per EP
 * (ASSERT(ep_rdmacnt != 0) on entry, decrement on post failure or
 * when all messages are ACKed) — confirm against the code that
 * increments ep_rdmacnt.
 */
void
rds_send_acknowledgement(rds_ep_t *ep)
{
	int ret;
	uint_t ix;

	RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep);

	mutex_enter(&ep->ep_lock);

	ASSERT(ep->ep_rdmacnt != 0);

	/*
	 * The previous ACK completed successfully, send the next one
	 * if more messages were received after sending the last ACK
	 */
	if (ep->ep_rbufid != *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va) {
		/* stash the newest received buffer id into the ACK payload */
		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
		mutex_exit(&ep->ep_lock);

		/* send acknowledgement */
		RDS_INCR_TXACKS();
		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rds_send_acknowledgement",
			    "EP(%p): ibt_post_send for acknowledgement "
			    "failed: %d, SQ depth: %d",
			    ep, ret, ep->ep_sndpool.pool_nbusy);
			/* post failed: no ACK outstanding any more */
			mutex_enter(&ep->ep_lock);
			ep->ep_rdmacnt--;
			mutex_exit(&ep->ep_lock);
		}
	} else {
		/* ACKed all messages, no more to ACK */
		ep->ep_rdmacnt--;
		mutex_exit(&ep->ep_lock);
		return;
	}

	RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep);
}
623 623
/*
 * Poll a single completion from a control-channel receive CQ.
 *
 * Returns the ibt_poll_cq() status so the caller can loop until
 * IBT_CQ_EMPTY.  For a successful completion the receive-queue level
 * is decremented (dispatching a taskq refill via rds_post_recv_buf
 * when the level falls to the low-water mark and no refill is already
 * pending), the control packet is handed to
 * rds_handle_control_message(), and the buffer is returned to the
 * pool.  On a completion error the buffer is freed and the error is
 * logged unless it is a flush (normal at channel teardown).
 */
static int
rds_poll_ctrl_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
{
	ibt_wc_t wc;
	uint_t npolled;
	rds_buf_t *bp;
	rds_ctrl_pkt_t *cpkt;
	rds_qp_t *recvqp;
	int ret = IBT_SUCCESS;

	RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep);

	bzero(&wc, sizeof (ibt_wc_t));
	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
	if (ret != IBT_SUCCESS) {
		if (ret != IBT_CQ_EMPTY) {
			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: %d", ep, cq, ret);
		} else {
			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: IBT_CQ_EMPTY", ep, cq);
		}
		return (ret);
	}

	/* the WR id carries the rds_buf_t that was posted */
	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;

	if (wc.wc_status != IBT_WC_SUCCESS) {
		mutex_enter(&ep->ep_recvqp.qp_lock);
		ep->ep_recvqp.qp_level--;
		mutex_exit(&ep->ep_recvqp.qp_lock);

		/* Free the buffer */
		bp->buf_state = RDS_RCVBUF_FREE;
		rds_free_recv_buf(bp, 1);

		/* Receive completion failure (flushes are expected) */
		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
			RDS_DPRINTF2("rds_poll_ctrl_completions",
			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
			    ep, cq, wc.wc_id, wc.wc_status);
		}
		return (ret);
	}

	/* there is one less in the RQ */
	recvqp = &ep->ep_recvqp;
	mutex_enter(&recvqp->qp_lock);
	recvqp->qp_level--;
	if ((recvqp->qp_taskqpending == B_FALSE) &&
	    (recvqp->qp_level <= recvqp->qp_lwm)) {
		/* Time to post more buffers into the RQ */
		recvqp->qp_taskqpending = B_TRUE;
		mutex_exit(&recvqp->qp_lock);

		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			/* dispatch failed; allow a future attempt */
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		mutex_exit(&recvqp->qp_lock);
	}

	/* hand the control packet to the session-level handler */
	cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
	rds_handle_control_message(ep->ep_sp, cpkt);

	bp->buf_state = RDS_RCVBUF_FREE;
	rds_free_recv_buf(bp, 1);

	RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep);

	return (ret);
}
702 702
#define	RDS_POST_FEW_ATATIME	100
/* Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented */
/*
 * Taskq callback that refills an endpoint's receive queue: take up to
 * (qp_depth - qp_level) buffers from the data or control pool, stamp
 * each with this HCA's lkey and the receive length, and post them to
 * the channel in batches of RDS_POST_FEW_ATATIME.  qp_taskqpending
 * serializes refills: it is set by the dispatcher and must be cleared
 * on this path before a new refill can be scheduled.
 *
 * NOTE(review): the early returns on "HCA not found" and "session not
 * in active state" leave qp_taskqpending set, which would prevent any
 * future refill of this QP — confirm these paths are only reachable
 * during teardown.
 * NOTE(review): on a partial ibt_post_recv() failure the unposted
 * buffers (already marked RDS_RCVBUF_POSTED) are neither reposted nor
 * returned to the pool — verify against the buffer-pool accounting.
 */
void
rds_post_recv_buf(void *arg)
{
	ibt_channel_hdl_t chanhdl;
	rds_ep_t *ep;
	rds_session_t *sp;
	rds_qp_t *recvqp;
	rds_bufpool_t *gp;
	rds_buf_t *bp, *bp1;
	ibt_recv_wr_t *wrp, wr[RDS_POST_FEW_ATATIME];
	rds_hca_t *hcap;
	uint_t npost, nspace, rcv_len;
	uint_t ix, jx, kx;
	int ret;

	chanhdl = (ibt_channel_hdl_t)arg;
	RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl);
	RDS_INCR_POST_RCV_BUF_CALLS();

	ep = (rds_ep_t *)ibt_get_chan_private(chanhdl);
	ASSERT(ep != NULL);
	sp = ep->ep_sp;
	recvqp = &ep->ep_recvqp;

	RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep);

	/* get the hcap for the HCA hosting this channel */
	hcap = rds_lkup_hca(ep->ep_hca_guid);
	if (hcap == NULL) {
		RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
		    ep->ep_hca_guid);
		return;
	}

	/* Make sure the session is still connected */
	rw_enter(&sp->session_lock, RW_READER);
	if ((sp->session_state != RDS_SESSION_STATE_INIT) &&
	    (sp->session_state != RDS_SESSION_STATE_CONNECTED) &&
	    (sp->session_state != RDS_SESSION_STATE_HCA_CLOSING)) {
		RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
		    "in active state (%d)", ep, sp->session_state);
		rw_exit(&sp->session_lock);
		return;
	}
	rw_exit(&sp->session_lock);

	/* how many can be posted */
	mutex_enter(&recvqp->qp_lock);
	nspace = recvqp->qp_depth - recvqp->qp_level;
	if (nspace == 0) {
		RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
		recvqp->qp_taskqpending = B_FALSE;
		mutex_exit(&recvqp->qp_lock);
		return;
	}
	mutex_exit(&recvqp->qp_lock);

	/* pick the pool and receive length for this EP type */
	if (ep->ep_type == RDS_EP_TYPE_DATA) {
		gp = &rds_dpool;
		rcv_len = RdsPktSize;
	} else {
		gp = &rds_cpool;
		rcv_len = RDS_CTRLPKT_SIZE;
	}

	bp = rds_get_buf(gp, nspace, &jx);
	if (bp == NULL) {
		RDS_DPRINTF2(LABEL, "EP(%p): No Recv buffers available", ep);
		/* try again later */
		ret = ddi_taskq_dispatch(rds_taskq, rds_post_recv_buf,
		    (void *)chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
		return;
	}

	/* rds_get_buf may return fewer buffers than requested */
	if (jx != nspace) {
		RDS_DPRINTF2(LABEL, "EP(%p): Recv buffers "
		    "needed: %d available: %d", ep, nspace, jx);
		nspace = jx;
	}

	/* stamp every buffer with this EP, HCA key and receive length */
	bp1 = bp;
	for (ix = 0; ix < nspace; ix++) {
		bp1->buf_ep = ep;
		ASSERT(bp1->buf_state == RDS_RCVBUF_FREE);
		bp1->buf_state = RDS_RCVBUF_POSTED;
		bp1->buf_ds.ds_key = hcap->hca_lkey;
		bp1->buf_ds.ds_len = rcv_len;
		bp1 = bp1->buf_nextp;
	}

#if 0
	wrp = kmem_zalloc(RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t),
	    KM_SLEEP);
#else
	wrp = &wr[0];
#endif

	/* post in batches of at most RDS_POST_FEW_ATATIME WRs */
	npost = nspace;
	while (npost) {
		jx = (npost > RDS_POST_FEW_ATATIME) ?
		    RDS_POST_FEW_ATATIME : npost;
		for (ix = 0; ix < jx; ix++) {
			wrp[ix].wr_id = (uintptr_t)bp;
			wrp[ix].wr_nds = 1;
			wrp[ix].wr_sgl = &bp->buf_ds;
			bp = bp->buf_nextp;
		}

		ret = ibt_post_recv(chanhdl, wrp, jx, &kx);
		if ((ret != IBT_SUCCESS) || (kx != jx)) {
			RDS_DPRINTF2(LABEL, "ibt_post_recv for %d WRs failed: "
			    "%d", npost, ret);
			/* count only what actually got posted */
			npost -= kx;
			break;
		}

		npost -= jx;
	}

	mutex_enter(&recvqp->qp_lock);
	if (npost != 0) {
		RDS_DPRINTF2("rds_post_recv_buf",
		    "EP(%p) Failed to post %d WRs", ep, npost);
		recvqp->qp_level += (nspace - npost);
	} else {
		recvqp->qp_level += nspace;
	}

	/*
	 * sometimes, the recv WRs can get consumed as soon as they are
	 * posted. In that case, taskq thread to post more WRs to the RQ will
	 * not be scheduled as the taskqpending flag is still set.
	 */
	if (recvqp->qp_level == 0) {
		mutex_exit(&recvqp->qp_lock);
		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2("rds_post_recv_buf",
			    "ddi_taskq_dispatch failed: %d", ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		recvqp->qp_taskqpending = B_FALSE;
		mutex_exit(&recvqp->qp_lock);
	}

#if 0
	kmem_free(wrp, RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t));
#endif

	RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep);
}
867 867
/*
 * Poll a single completion from a data-channel receive CQ.
 *
 * Returns the ibt_poll_cq() status so the caller loops until
 * IBT_CQ_EMPTY.  A successful completion decrements the RQ level
 * (dispatching a taskq refill at the low-water mark), then either
 * delivers the packet directly (single-packet message) or links it
 * into the per-EP reassembly chain (ep_segfbp = first, ep_seglbp =
 * last) using the dh_npkts/dh_psn fields: psn 0 starts a message and
 * dh_npkts == 1 marks the final packet, at which point the whole
 * chain is handed to rds_received_msg().
 */
static int
rds_poll_data_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
{
	ibt_wc_t wc;
	rds_buf_t *bp;
	rds_data_hdr_t *pktp;
	rds_qp_t *recvqp;
	uint_t npolled;
	int ret = IBT_SUCCESS;


	RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep);

	bzero(&wc, sizeof (ibt_wc_t));
	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
	if (ret != IBT_SUCCESS) {
		if (ret != IBT_CQ_EMPTY) {
			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: %d", ep, cq, ret);
		} else {
			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: IBT_CQ_EMPTY", ep, cq);
		}
		return (ret);
	}

	/* the WR id carries the rds_buf_t that was posted */
	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
	ASSERT(bp->buf_state == RDS_RCVBUF_POSTED);
	bp->buf_state = RDS_RCVBUF_ONSOCKQ;
	bp->buf_nextp = NULL;

	if (wc.wc_status != IBT_WC_SUCCESS) {
		mutex_enter(&ep->ep_recvqp.qp_lock);
		ep->ep_recvqp.qp_level--;
		mutex_exit(&ep->ep_recvqp.qp_lock);

		/* free the buffer */
		bp->buf_state = RDS_RCVBUF_FREE;
		rds_free_recv_buf(bp, 1);

		/* Receive completion failure (flushes are expected) */
		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
			RDS_DPRINTF2("rds_poll_data_completions",
			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
			    ep, cq, wc.wc_id, wc.wc_status);
			RDS_INCR_RXERRS();
		}
		return (ret);
	}

	/* there is one less in the RQ */
	recvqp = &ep->ep_recvqp;
	mutex_enter(&recvqp->qp_lock);
	recvqp->qp_level--;
	if ((recvqp->qp_taskqpending == B_FALSE) &&
	    (recvqp->qp_level <= recvqp->qp_lwm)) {
		/* Time to post more buffers into the RQ */
		recvqp->qp_taskqpending = B_TRUE;
		mutex_exit(&recvqp->qp_lock);

		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			/* dispatch failed; allow a future attempt */
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		mutex_exit(&recvqp->qp_lock);
	}

	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
	ASSERT(pktp->dh_datalen != 0);

	RDS_DPRINTF5(LABEL, "Message Received: sendIP: 0x%x recvIP: 0x%x "
	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
	    pktp->dh_npkts, pktp->dh_psn);

	RDS_DPRINTF3(LABEL, "BP(%p): npkts: %d psn: %d", bp,
	    pktp->dh_npkts, pktp->dh_psn);

	if (pktp->dh_npkts == 1) {
		/* single pkt or last packet */
		if (pktp->dh_psn != 0) {
			/* last packet of a segmented message */
			ASSERT(ep->ep_seglbp != NULL);
			ep->ep_seglbp->buf_nextp = bp;
			ep->ep_seglbp = bp;
			/* deliver the whole reassembled chain */
			rds_received_msg(ep, ep->ep_segfbp);
			ep->ep_segfbp = NULL;
			ep->ep_seglbp = NULL;
		} else {
			/* single packet */
			rds_received_msg(ep, bp);
		}
	} else {
		/* multi-pkt msg */
		if (pktp->dh_psn == 0) {
			/* first packet */
			ASSERT(ep->ep_segfbp == NULL);
			ep->ep_segfbp = bp;
			ep->ep_seglbp = bp;
		} else {
			/* intermediate packet */
			ASSERT(ep->ep_segfbp != NULL);
			ep->ep_seglbp->buf_nextp = bp;
			ep->ep_seglbp = bp;
		}
	}

	RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep);

	return (ret);
}
985 985
986 986 void
987 987 rds_recvcq_handler(ibt_cq_hdl_t cq, void *arg)
988 988 {
989 989 rds_ep_t *ep;
990 990 int ret = IBT_SUCCESS;
991 991 int (*func)(ibt_cq_hdl_t, rds_ep_t *);
992 992
993 993 ep = (rds_ep_t *)arg;
994 994
995 995 RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep);
996 996
997 997 if (ep->ep_type == RDS_EP_TYPE_DATA) {
998 998 func = rds_poll_data_completions;
999 999 } else {
1000 1000 func = rds_poll_ctrl_completions;
1001 1001 }
1002 1002
1003 1003 do {
1004 1004 ret = func(cq, ep);
1005 1005 } while (ret != IBT_CQ_EMPTY);
1006 1006
1007 1007 /* enable the CQ */
1008 1008 ret = ibt_enable_cq_notify(cq, rds_wc_signal);
1009 1009 if (ret != IBT_SUCCESS) {
1010 1010 RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1011 1011 "failed: %d", ep, cq, ret);
1012 1012 return;
1013 1013 }
1014 1014
1015 1015 do {
1016 1016 ret = func(cq, ep);
1017 1017 } while (ret != IBT_CQ_EMPTY);
1018 1018
1019 1019 RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep);
1020 1020 }
1021 1021
/*
 * Drain the send CQ of an endpoint.
 *
 * Completed send buffers (successful, flushed, or failed) are chained on a
 * local list and returned to the send pool in one call at the end.  RDMA
 * write completions (IBT_WRC_RDMAW on success, wc_id == RDS_RDMAW_WRID on
 * error) carry no rds_buf_t and are handled separately.  On the first hard
 * (non-flush) error the session is marked RDS_SESSION_STATE_ERROR and made
 * the active end; any error also triggers rds_handle_send_error() after
 * the poll loop.  'lock' is passed through to rds_free_send_buf().
 */
void
rds_poll_send_completions(ibt_cq_hdl_t cq, rds_ep_t *ep, boolean_t lock)
{
	ibt_wc_t wc[RDS_NUM_DATA_SEND_WCS];
	uint_t npolled, nret, send_error = 0;
	rds_buf_t *headp, *tailp, *bp;
	int ret, ix;

	RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep);

	headp = NULL;
	tailp = NULL;
	npolled = 0;
	do {
		ret = ibt_poll_cq(cq, wc, RDS_NUM_DATA_SEND_WCS, &nret);
		if (ret != IBT_SUCCESS) {
			if (ret != IBT_CQ_EMPTY) {
				RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): "
				    "ibt_poll_cq returned: %d", ep, cq, ret);
			} else {
				RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): "
				    "ibt_poll_cq returned: IBT_CQ_EMPTY",
				    ep, cq);
			}

			break;
		}

		for (ix = 0; ix < nret; ix++) {
			if (wc[ix].wc_status == IBT_WC_SUCCESS) {
				/* RDMA write done: ack it, no buffer */
				if (wc[ix].wc_type == IBT_WRC_RDMAW) {
					rds_send_acknowledgement(ep);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			} else if (wc[ix].wc_status == IBT_WC_WR_FLUSHED_ERR) {
				/*
				 * Flush errors mean the channel is already
				 * going down; no session state change here.
				 */
				RDS_INCR_TXERRS();
				RDS_DPRINTF5("rds_poll_send_completions",
				    "EP(%p): WC ID: %p ERROR: %d", ep,
				    wc[ix].wc_id, wc[ix].wc_status);

				send_error = 1;

				/* RDMA write WRs carry no send buffer */
				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
					mutex_enter(&ep->ep_lock);
					ep->ep_rdmacnt--;
					mutex_exit(&ep->ep_lock);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			} else {
				/* hard send error: tear down the session */
				RDS_INCR_TXERRS();
				RDS_DPRINTF2("rds_poll_send_completions",
				    "EP(%p): WC ID: %p ERROR: %d", ep,
				    wc[ix].wc_id, wc[ix].wc_status);
				if (send_error == 0) {
					rds_session_t *sp = ep->ep_sp;

					/* don't let anyone send anymore */
					rw_enter(&sp->session_lock, RW_WRITER);
					if (sp->session_state !=
					    RDS_SESSION_STATE_ERROR) {
						sp->session_state =
						    RDS_SESSION_STATE_ERROR;
						/* Make this the active end */
						sp->session_type =
						    RDS_SESSION_ACTIVE;
					}
					rw_exit(&sp->session_lock);
				}

				send_error = 1;

				/* RDMA write WRs carry no send buffer */
				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
					mutex_enter(&ep->ep_lock);
					ep->ep_rdmacnt--;
					mutex_exit(&ep->ep_lock);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			}

			/* chain the reclaimed buffer on the local list */
			bp->buf_nextp = NULL;
			if (headp) {
				tailp->buf_nextp = bp;
				tailp = bp;
			} else {
				headp = bp;
				tailp = bp;
			}

			npolled++;
		}

		/* in no-interrupt (polling) mode, bound work per call */
		if (rds_no_interrupts && (npolled > 100)) {
			break;
		}

		if (rds_no_interrupts == 1) {
			break;
		}
	} while (ret != IBT_CQ_EMPTY);

	RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
	    npolled, send_error);

	/* put the buffers to the pool */
	if (npolled != 0) {
		rds_free_send_buf(ep, headp, tailp, npolled, lock);
	}

	if (send_error != 0) {
		rds_handle_send_error(ep);
	}

	RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep);
}
1148 1148
1149 1149 void
1150 1150 rds_sendcq_handler(ibt_cq_hdl_t cq, void *arg)
1151 1151 {
1152 1152 rds_ep_t *ep;
1153 1153 int ret;
1154 1154
1155 1155 ep = (rds_ep_t *)arg;
1156 1156
1157 1157 RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep);
1158 1158
1159 1159 /* enable the CQ */
1160 1160 ret = ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
1161 1161 if (ret != IBT_SUCCESS) {
1162 1162 RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1163 1163 "failed: %d", ep, cq, ret);
1164 1164 return;
1165 1165 }
1166 1166
1167 1167 rds_poll_send_completions(cq, ep, B_FALSE);
1168 1168
1169 1169 RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep);
1170 1170 }
1171 1171
1172 1172 void
1173 1173 rds_ep_free_rc_channel(rds_ep_t *ep)
1174 1174 {
1175 1175 int ret;
1176 1176
1177 1177 RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep);
1178 1178
1179 1179 ASSERT(mutex_owned(&ep->ep_lock));
1180 1180
1181 1181 /* free the QP */
1182 1182 if (ep->ep_chanhdl != NULL) {
1183 1183 /* wait until the RQ is empty */
1184 1184 (void) ibt_flush_channel(ep->ep_chanhdl);
1185 1185 (void) rds_is_recvq_empty(ep, B_TRUE);
1186 1186 ret = ibt_free_channel(ep->ep_chanhdl);
1187 1187 if (ret != IBT_SUCCESS) {
1188 1188 RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) "
1189 1189 "ibt_free_channel returned: %d", ep, ret);
1190 1190 }
1191 1191 ep->ep_chanhdl = NULL;
1192 1192 } else {
1193 1193 RDS_DPRINTF2("rds_ep_free_rc_channel",
1194 1194 "EP(%p) Channel is ALREADY FREE", ep);
1195 1195 }
1196 1196
1197 1197 /* free the Send CQ */
1198 1198 if (ep->ep_sendcq != NULL) {
1199 1199 ret = ibt_free_cq(ep->ep_sendcq);
1200 1200 if (ret != IBT_SUCCESS) {
1201 1201 RDS_DPRINTF2("rds_ep_free_rc_channel",
1202 1202 "EP(%p) - for sendcq, ibt_free_cq returned %d",
1203 1203 ep, ret);
1204 1204 }
1205 1205 ep->ep_sendcq = NULL;
1206 1206 } else {
1207 1207 RDS_DPRINTF2("rds_ep_free_rc_channel",
1208 1208 "EP(%p) SendCQ is ALREADY FREE", ep);
1209 1209 }
1210 1210
1211 1211 /* free the Recv CQ */
1212 1212 if (ep->ep_recvcq != NULL) {
1213 1213 ret = ibt_free_cq(ep->ep_recvcq);
1214 1214 if (ret != IBT_SUCCESS) {
1215 1215 RDS_DPRINTF2("rds_ep_free_rc_channel",
1216 1216 "EP(%p) - for recvcq, ibt_free_cq returned %d",
1217 1217 ep, ret);
1218 1218 }
1219 1219 ep->ep_recvcq = NULL;
1220 1220 } else {
1221 1221 RDS_DPRINTF2("rds_ep_free_rc_channel",
1222 1222 "EP(%p) RecvCQ is ALREADY FREE", ep);
1223 1223 }
1224 1224
1225 1225 RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep);
1226 1226 }
1227 1227
1228 1228 /* Allocate resources for RC channel */
1229 1229 ibt_channel_hdl_t
1230 1230 rds_ep_alloc_rc_channel(rds_ep_t *ep, uint8_t hca_port)
1231 1231 {
1232 1232 int ret = IBT_SUCCESS;
1233 1233 ibt_cq_attr_t scqattr, rcqattr;
1234 1234 ibt_rc_chan_alloc_args_t chanargs;
1235 1235 ibt_channel_hdl_t chanhdl;
1236 1236 rds_session_t *sp;
1237 1237 rds_hca_t *hcap;
1238 1238
1239 1239 RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
1240 1240 ep, hca_port);
1241 1241
1242 1242 /* Update the EP with the right IP address and HCA guid */
1243 1243 sp = ep->ep_sp;
1244 1244 ASSERT(sp != NULL);
1245 1245 rw_enter(&sp->session_lock, RW_READER);
1246 1246 mutex_enter(&ep->ep_lock);
1247 1247 ep->ep_myip = sp->session_myip;
1248 1248 ep->ep_remip = sp->session_remip;
1249 1249 hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
1250 1250 ep->ep_hca_guid = hcap->hca_guid;
1251 1251 mutex_exit(&ep->ep_lock);
1252 1252 rw_exit(&sp->session_lock);
1253 1253
1254 1254 /* reset taskqpending flag here */
1255 1255 ep->ep_recvqp.qp_taskqpending = B_FALSE;
1256 1256
1257 1257 if (ep->ep_type == RDS_EP_TYPE_CTRL) {
1258 1258 scqattr.cq_size = MaxCtrlSendBuffers;
1259 1259 scqattr.cq_sched = NULL;
1260 1260 scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1261 1261
1262 1262 rcqattr.cq_size = MaxCtrlRecvBuffers;
1263 1263 rcqattr.cq_sched = NULL;
1264 1264 rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1265 1265
1266 1266 chanargs.rc_sizes.cs_sq = MaxCtrlSendBuffers;
1267 1267 chanargs.rc_sizes.cs_rq = MaxCtrlRecvBuffers;
1268 1268 chanargs.rc_sizes.cs_sq_sgl = 1;
1269 1269 chanargs.rc_sizes.cs_rq_sgl = 1;
1270 1270 } else {
1271 1271 scqattr.cq_size = MaxDataSendBuffers + RDS_NUM_ACKS;
1272 1272 scqattr.cq_sched = NULL;
1273 1273 scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1274 1274
1275 1275 rcqattr.cq_size = MaxDataRecvBuffers;
1276 1276 rcqattr.cq_sched = NULL;
1277 1277 rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1278 1278
1279 1279 chanargs.rc_sizes.cs_sq = MaxDataSendBuffers + RDS_NUM_ACKS;
1280 1280 chanargs.rc_sizes.cs_rq = MaxDataRecvBuffers;
1281 1281 chanargs.rc_sizes.cs_sq_sgl = 1;
1282 1282 chanargs.rc_sizes.cs_rq_sgl = 1;
1283 1283 }
1284 1284
1285 1285 mutex_enter(&ep->ep_lock);
1286 1286 if (ep->ep_sendcq == NULL) {
1287 1287 /* returned size is always greater than the requested size */
1288 1288 ret = ibt_alloc_cq(hcap->hca_hdl, &scqattr,
1289 1289 &ep->ep_sendcq, NULL);
1290 1290 if (ret != IBT_SUCCESS) {
1291 1291 RDS_DPRINTF2(LABEL, "ibt_alloc_cq for sendCQ "
1292 1292 "failed, size = %d: %d", scqattr.cq_size, ret);
1293 1293 mutex_exit(&ep->ep_lock);
1294 1294 return (NULL);
1295 1295 }
1296 1296
1297 1297 (void) ibt_set_cq_handler(ep->ep_sendcq, rds_sendcq_handler,
1298 1298 ep);
1299 1299
1300 1300 if (rds_no_interrupts == 0) {
1301 1301 ret = ibt_enable_cq_notify(ep->ep_sendcq,
1302 1302 IBT_NEXT_COMPLETION);
1303 1303 if (ret != IBT_SUCCESS) {
1304 1304 RDS_DPRINTF2(LABEL,
1305 1305 "ibt_enable_cq_notify failed: %d", ret);
1306 1306 (void) ibt_free_cq(ep->ep_sendcq);
1307 1307 ep->ep_sendcq = NULL;
1308 1308 mutex_exit(&ep->ep_lock);
1309 1309 return (NULL);
1310 1310 }
1311 1311 }
1312 1312 }
1313 1313
1314 1314 if (ep->ep_recvcq == NULL) {
1315 1315 /* returned size is always greater than the requested size */
1316 1316 ret = ibt_alloc_cq(hcap->hca_hdl, &rcqattr,
1317 1317 &ep->ep_recvcq, NULL);
1318 1318 if (ret != IBT_SUCCESS) {
1319 1319 RDS_DPRINTF2(LABEL, "ibt_alloc_cq for recvCQ "
1320 1320 "failed, size = %d: %d", rcqattr.cq_size, ret);
1321 1321 (void) ibt_free_cq(ep->ep_sendcq);
1322 1322 ep->ep_sendcq = NULL;
1323 1323 mutex_exit(&ep->ep_lock);
1324 1324 return (NULL);
1325 1325 }
1326 1326
1327 1327 (void) ibt_set_cq_handler(ep->ep_recvcq, rds_recvcq_handler,
1328 1328 ep);
1329 1329
1330 1330 ret = ibt_enable_cq_notify(ep->ep_recvcq, rds_wc_signal);
1331 1331 if (ret != IBT_SUCCESS) {
1332 1332 RDS_DPRINTF2(LABEL,
1333 1333 "ibt_enable_cq_notify failed: %d", ret);
1334 1334 (void) ibt_free_cq(ep->ep_recvcq);
1335 1335 ep->ep_recvcq = NULL;
1336 1336 (void) ibt_free_cq(ep->ep_sendcq);
1337 1337 ep->ep_sendcq = NULL;
1338 1338 mutex_exit(&ep->ep_lock);
1339 1339 return (NULL);
1340 1340 }
1341 1341 }
1342 1342
1343 1343 chanargs.rc_flags = IBT_ALL_SIGNALED;
1344 1344 chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1345 1345 IBT_CEP_ATOMIC;
1346 1346 chanargs.rc_hca_port_num = hca_port;
1347 1347 chanargs.rc_scq = ep->ep_sendcq;
1348 1348 chanargs.rc_rcq = ep->ep_recvcq;
1349 1349 chanargs.rc_pd = hcap->hca_pdhdl;
1350 1350 chanargs.rc_srq = NULL;
1351 1351
1352 1352 ret = ibt_alloc_rc_channel(hcap->hca_hdl,
1353 1353 IBT_ACHAN_NO_FLAGS, &chanargs, &chanhdl, NULL);
1354 1354 if (ret != IBT_SUCCESS) {
1355 1355 RDS_DPRINTF2(LABEL, "ibt_alloc_rc_channel fail: %d",
1356 1356 ret);
1357 1357 (void) ibt_free_cq(ep->ep_recvcq);
1358 1358 ep->ep_recvcq = NULL;
1359 1359 (void) ibt_free_cq(ep->ep_sendcq);
1360 1360 ep->ep_sendcq = NULL;
1361 1361 mutex_exit(&ep->ep_lock);
1362 1362 return (NULL);
1363 1363 }
1364 1364 mutex_exit(&ep->ep_lock);
1365 1365
1366 1366 /* Chan private should contain the ep */
1367 1367 (void) ibt_set_chan_private(chanhdl, ep);
1368 1368
1369 1369 RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl);
1370 1370
1371 1371 return (chanhdl);
1372 1372 }
1373 1373
1374 1374
#if 0

/*
 * NOTE(review): dead code -- compiled out with #if 0 and has no callers
 * in this file; kept only for reference.
 */
/* Return node guid given a port gid */
ib_guid_t
rds_gid_to_node_guid(ib_gid_t gid)
{
	ibt_node_info_t nodeinfo;
	int ret;

	RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx",
	    gid.gid_prefix, gid.gid_guid);

	/* resolve the gid to node info via IBTF; return 0 guid on failure */
	ret = ibt_gid_to_node_info(gid, &nodeinfo);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx "
		    "failed", gid.gid_prefix, gid.gid_guid);
		return (0LL);
	}

	RDS_DPRINTF4("rds_gid_to_node_guid", "Return: Node guid: %llx",
	    nodeinfo.n_node_guid);

	return (nodeinfo.n_node_guid);
}

#endif
1401 1401
/*
 * Handle an IBT_EVENT_PORT_UP async event: refresh the cached portinfo
 * for the HCA and, if the RDS service is registered but was never bound
 * on this port, bind it now.  An existing bind persists across port
 * state changes and is not repeated.
 */
static void
rds_handle_portup_event(rds_state_t *statep, ibt_hca_hdl_t hdl,
    ibt_async_event_t *event)
{
	rds_hca_t *hcap;
	ibt_hca_portinfo_t *newpinfop, *oldpinfop;
	uint_t newsize, oldsize, nport;
	ib_gid_t gid;
	int ret;

	RDS_DPRINTF2("rds_handle_portup_event",
	    "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep);

	rw_enter(&statep->rds_hca_lock, RW_WRITER);

	/* find the HCA this event refers to */
	hcap = statep->rds_hcalistp;
	while ((hcap != NULL) && (hcap->hca_guid != event->ev_hca_guid)) {
		hcap = hcap->hca_nextp;
	}

	if (hcap == NULL) {
		RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
		    "not in our list", event->ev_hca_guid);
		rw_exit(&statep->rds_hca_lock);
		return;
	}

	/* query fresh port information for all ports of the HCA */
	ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2(LABEL, "ibt_query_hca_ports failed: %d", ret);
		rw_exit(&statep->rds_hca_lock);
		return;
	}

	/* swap in the new portinfo, then release the stale copy */
	oldpinfop = hcap->hca_pinfop;
	oldsize = hcap->hca_pinfo_sz;
	hcap->hca_pinfop = newpinfop;
	hcap->hca_pinfo_sz = newsize;

	(void) ibt_free_portinfo(oldpinfop, oldsize);

	/* If RDS service is not registered then no bind is needed */
	if (statep->rds_srvhdl == NULL) {
		RDS_DPRINTF2("rds_handle_portup_event",
		    "RDS Service is not registered, so no action needed");
		rw_exit(&statep->rds_hca_lock);
		return;
	}

	/*
	 * If the service was previously bound on this port and
	 * if this port has changed state down and now up, we do not
	 * need to bind the service again. The bind is expected to
	 * persist across state changes. If the service was never bound
	 * before then we bind it this time.
	 */
	if (hcap->hca_bindhdl[event->ev_port - 1] == NULL) {

		/* structure copy */
		gid = newpinfop[event->ev_port - 1].p_sgid_tbl[0];

		/* bind RDS service on the port, pass statep as cm_private */
		ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep,
		    &hcap->hca_bindhdl[event->ev_port - 1]);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rds_handle_portup_event",
			    "Bind service for HCA: 0x%llx Port: %d "
			    "gid %llx:%llx returned: %d", event->ev_hca_guid,
			    event->ev_port, gid.gid_prefix, gid.gid_guid, ret);
		}
	}

	rw_exit(&statep->rds_hca_lock);

	RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx",
	    event->ev_hca_guid);
}
1479 1479
1480 1480 static void
1481 1481 rdsib_add_hca(ib_guid_t hca_guid)
1482 1482 {
1483 1483 rds_hca_t *hcap;
1484 1484 ibt_mr_attr_t mem_attr;
1485 1485 ibt_mr_desc_t mem_desc;
1486 1486 int ret;
1487 1487
1488 1488 RDS_DPRINTF2("rdsib_add_hca", "Enter: GUID: 0x%llx", hca_guid);
1489 1489
1490 1490 hcap = rdsib_init_hca(hca_guid);
1491 1491 if (hcap == NULL)
1492 1492 return;
1493 1493
1494 1494 /* register the recv memory with this hca */
1495 1495 mutex_enter(&rds_dpool.pool_lock);
1496 1496 if (rds_dpool.pool_memp == NULL) {
1497 1497 /* no memory to register */
1498 1498 RDS_DPRINTF2("rdsib_add_hca", "No memory to register");
1499 1499 mutex_exit(&rds_dpool.pool_lock);
1500 1500 return;
1501 1501 }
1502 1502
1503 1503 mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)rds_dpool.pool_memp;
1504 1504 mem_attr.mr_len = rds_dpool.pool_memsize;
1505 1505 mem_attr.mr_as = NULL;
1506 1506 mem_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
1507 1507
1508 1508 ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr,
1509 1509 &hcap->hca_mrhdl, &mem_desc);
1510 1510
1511 1511 mutex_exit(&rds_dpool.pool_lock);
1512 1512
1513 1513 if (ret != IBT_SUCCESS) {
1514 1514 RDS_DPRINTF2("rdsib_add_hca", "ibt_register_mr failed: %d",
1515 1515 ret);
1516 1516 } else {
1517 1517 rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
1518 1518 hcap->hca_state = RDS_HCA_STATE_MEM_REGISTERED;
1519 1519 hcap->hca_lkey = mem_desc.md_lkey;
1520 1520 hcap->hca_rkey = mem_desc.md_rkey;
1521 1521 rw_exit(&rdsib_statep->rds_hca_lock);
1522 1522 }
1523 1523
1524 1524 RDS_DPRINTF2("rdsib_add_hca", "Retrun: GUID: 0x%llx", hca_guid);
1525 1525 }
1526 1526
1527 1527 void rds_close_this_session(rds_session_t *sp, uint8_t wait);
1528 1528 int rds_post_control_message(rds_session_t *sp, uint8_t code, in_port_t port);
1529 1529
1530 1530 static void
1531 1531 rdsib_del_hca(rds_state_t *statep, ib_guid_t hca_guid)
1532 1532 {
1533 1533 rds_session_t *sp;
1534 1534 rds_hca_t *hcap;
1535 1535 rds_hca_state_t saved_state;
1536 1536 int ret, ix;
1537 1537
1538 1538 RDS_DPRINTF2("rdsib_del_hca", "Enter: GUID: 0x%llx", hca_guid);
1539 1539
1540 1540 /*
1541 1541 * This should be a write lock as we don't want anyone to get access
1542 1542 * to the hcap while we are modifing its contents
1543 1543 */
1544 1544 rw_enter(&statep->rds_hca_lock, RW_WRITER);
1545 1545
1546 1546 hcap = statep->rds_hcalistp;
1547 1547 while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
1548 1548 hcap = hcap->hca_nextp;
1549 1549 }
1550 1550
1551 1551 /* Prevent initiating any new activity on this HCA */
1552 1552 ASSERT(hcap != NULL);
1553 1553 saved_state = hcap->hca_state;
1554 1554 hcap->hca_state = RDS_HCA_STATE_STOPPING;
1555 1555
1556 1556 rw_exit(&statep->rds_hca_lock);
1557 1557
1558 1558 /*
1559 1559 * stop the outgoing traffic and close any active sessions on this hca.
1560 1560 * Any pending messages in the SQ will be allowed to complete.
1561 1561 */
1562 1562 rw_enter(&statep->rds_sessionlock, RW_READER);
1563 1563 sp = statep->rds_sessionlistp;
1564 1564 while (sp) {
1565 1565 if (sp->session_hca_guid != hca_guid) {
1566 1566 sp = sp->session_nextp;
1567 1567 continue;
1568 1568 }
1569 1569
1570 1570 rw_enter(&sp->session_lock, RW_WRITER);
1571 1571 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1572 1572 sp->session_state);
1573 1573 /*
1574 1574 * We are changing the session state in advance. This prevents
1575 1575 * further messages to be posted to the SQ. We then
1576 1576 * send a control message to the remote and tell it close
1577 1577 * the session.
1578 1578 */
1579 1579 sp->session_state = RDS_SESSION_STATE_HCA_CLOSING;
1580 1580 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
1581 1581 "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
1582 1582 rw_exit(&sp->session_lock);
1583 1583
1584 1584 /*
1585 1585 * wait until the sendq is empty then tell the remote to
1586 1586 * close this session. This enables for graceful shutdown of
1587 1587 * the session
1588 1588 */
1589 1589 (void) rds_is_sendq_empty(&sp->session_dataep, 2);
1590 1590 (void) rds_post_control_message(sp,
1591 1591 RDS_CTRL_CODE_CLOSE_SESSION, 0);
1592 1592
1593 1593 sp = sp->session_nextp;
1594 1594 }
1595 1595
1596 1596 /* wait until all the sessions are off this HCA */
1597 1597 sp = statep->rds_sessionlistp;
1598 1598 while (sp) {
1599 1599 if (sp->session_hca_guid != hca_guid) {
1600 1600 sp = sp->session_nextp;
1601 1601 continue;
1602 1602 }
↓ open down ↓ |
1602 lines elided |
↑ open up ↑ |
1603 1603
1604 1604 rw_enter(&sp->session_lock, RW_READER);
1605 1605 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1606 1606 sp->session_state);
1607 1607
1608 1608 while ((sp->session_state == RDS_SESSION_STATE_HCA_CLOSING) ||
1609 1609 (sp->session_state == RDS_SESSION_STATE_ERROR) ||
1610 1610 (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING) ||
1611 1611 (sp->session_state == RDS_SESSION_STATE_CLOSED)) {
1612 1612 rw_exit(&sp->session_lock);
1613 - delay(drv_usectohz(1000000));
1613 + delay(drv_sectohz(1));
1614 1614 rw_enter(&sp->session_lock, RW_READER);
1615 1615 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1616 1616 sp->session_state);
1617 1617 }
1618 1618
1619 1619 rw_exit(&sp->session_lock);
1620 1620
1621 1621 sp = sp->session_nextp;
1622 1622 }
1623 1623 rw_exit(&statep->rds_sessionlock);
1624 1624
1625 1625 /*
1626 1626 * if rdsib_close_ib was called before this, then that would have
1627 1627 * unbound the service on all ports. In that case, the HCA structs
1628 1628 * will contain stale bindhdls. Hence, we do not call unbind unless
1629 1629 * the service is still registered.
1630 1630 */
1631 1631 if (statep->rds_srvhdl != NULL) {
1632 1632 /* unbind RDS service on all ports on this HCA */
1633 1633 for (ix = 0; ix < hcap->hca_nports; ix++) {
1634 1634 if (hcap->hca_bindhdl[ix] == NULL) {
1635 1635 continue;
1636 1636 }
1637 1637
1638 1638 RDS_DPRINTF2("rdsib_del_hca",
1639 1639 "Unbinding Service: port: %d, bindhdl: %p",
1640 1640 ix + 1, hcap->hca_bindhdl[ix]);
1641 1641 (void) ibt_unbind_service(rdsib_statep->rds_srvhdl,
1642 1642 hcap->hca_bindhdl[ix]);
1643 1643 hcap->hca_bindhdl[ix] = NULL;
1644 1644 }
1645 1645 }
1646 1646
1647 1647 RDS_DPRINTF2("rdsib_del_hca", "HCA(%p) State: %d", hcap,
1648 1648 hcap->hca_state);
1649 1649
1650 1650 switch (saved_state) {
1651 1651 case RDS_HCA_STATE_MEM_REGISTERED:
1652 1652 ASSERT(hcap->hca_mrhdl != NULL);
1653 1653 ret = ibt_deregister_mr(hcap->hca_hdl, hcap->hca_mrhdl);
1654 1654 if (ret != IBT_SUCCESS) {
1655 1655 RDS_DPRINTF2("rdsib_del_hca",
1656 1656 "ibt_deregister_mr failed: %d", ret);
1657 1657 return;
1658 1658 }
1659 1659 hcap->hca_mrhdl = NULL;
1660 1660 /* FALLTHRU */
1661 1661 case RDS_HCA_STATE_OPEN:
1662 1662 ASSERT(hcap->hca_hdl != NULL);
1663 1663 ASSERT(hcap->hca_pdhdl != NULL);
1664 1664
1665 1665
1666 1666 ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
1667 1667 if (ret != IBT_SUCCESS) {
1668 1668 RDS_DPRINTF2("rdsib_del_hca",
1669 1669 "ibt_free_pd failed: %d", ret);
1670 1670 }
1671 1671
1672 1672 (void) ibt_free_portinfo(hcap->hca_pinfop, hcap->hca_pinfo_sz);
1673 1673
1674 1674 ret = ibt_close_hca(hcap->hca_hdl);
1675 1675 if (ret != IBT_SUCCESS) {
1676 1676 RDS_DPRINTF2("rdsib_del_hca",
1677 1677 "ibt_close_hca failed: %d", ret);
1678 1678 }
1679 1679
1680 1680 hcap->hca_hdl = NULL;
1681 1681 hcap->hca_pdhdl = NULL;
1682 1682 hcap->hca_lkey = 0;
1683 1683 hcap->hca_rkey = 0;
1684 1684 }
1685 1685
1686 1686 /*
1687 1687 * This should be a write lock as we don't want anyone to get access
1688 1688 * to the hcap while we are modifing its contents
1689 1689 */
1690 1690 rw_enter(&statep->rds_hca_lock, RW_WRITER);
1691 1691 hcap->hca_state = RDS_HCA_STATE_REMOVED;
1692 1692 rw_exit(&statep->rds_hca_lock);
1693 1693
1694 1694 RDS_DPRINTF2("rdsib_del_hca", "Return: GUID: 0x%llx", hca_guid);
1695 1695 }
1696 1696
1697 1697 static void
1698 1698 rds_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1699 1699 ibt_async_event_t *event)
1700 1700 {
1701 1701 rds_state_t *statep = (rds_state_t *)clntp;
1702 1702
1703 1703 RDS_DPRINTF2("rds_async_handler", "Async code: %d", code);
1704 1704
1705 1705 switch (code) {
1706 1706 case IBT_EVENT_PORT_UP:
1707 1707 rds_handle_portup_event(statep, hdl, event);
1708 1708 break;
1709 1709 case IBT_HCA_ATTACH_EVENT:
1710 1710 /*
1711 1711 * NOTE: In some error recovery paths, it is possible to
1712 1712 * receive IBT_HCA_ATTACH_EVENTs on already known HCAs.
1713 1713 */
1714 1714 (void) rdsib_add_hca(event->ev_hca_guid);
1715 1715 break;
1716 1716 case IBT_HCA_DETACH_EVENT:
1717 1717 (void) rdsib_del_hca(statep, event->ev_hca_guid);
1718 1718 break;
1719 1719
1720 1720 default:
1721 1721 RDS_DPRINTF2(LABEL, "Async event: %d not handled", code);
1722 1722 }
1723 1723
1724 1724 RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code);
1725 1725 }
1726 1726
1727 1727 /*
1728 1728 * This routine exists to minimize stale connections across ungraceful
1729 1729 * reboots of nodes in a cluster.
1730 1730 */
1731 1731 void
1732 1732 rds_randomize_qps(rds_hca_t *hcap)
1733 1733 {
1734 1734 ibt_cq_attr_t cqattr;
1735 1735 ibt_rc_chan_alloc_args_t chanargs;
1736 1736 ibt_channel_hdl_t qp1, qp2;
1737 1737 ibt_cq_hdl_t cq_hdl;
1738 1738 hrtime_t nsec;
1739 1739 uint8_t i, j, rand1, rand2;
1740 1740 int ret;
1741 1741
1742 1742 bzero(&cqattr, sizeof (ibt_cq_attr_t));
1743 1743 cqattr.cq_size = 1;
1744 1744 cqattr.cq_sched = NULL;
1745 1745 cqattr.cq_flags = IBT_CQ_NO_FLAGS;
1746 1746 ret = ibt_alloc_cq(hcap->hca_hdl, &cqattr, &cq_hdl, NULL);
1747 1747 if (ret != IBT_SUCCESS) {
1748 1748 RDS_DPRINTF2("rds_randomize_qps",
1749 1749 "ibt_alloc_cq failed: %d", ret);
1750 1750 return;
1751 1751 }
1752 1752
1753 1753 bzero(&chanargs, sizeof (ibt_rc_chan_alloc_args_t));
1754 1754 chanargs.rc_flags = IBT_ALL_SIGNALED;
1755 1755 chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1756 1756 IBT_CEP_ATOMIC;
1757 1757 chanargs.rc_hca_port_num = 1;
1758 1758 chanargs.rc_scq = cq_hdl;
1759 1759 chanargs.rc_rcq = cq_hdl;
1760 1760 chanargs.rc_pd = hcap->hca_pdhdl;
1761 1761 chanargs.rc_srq = NULL;
1762 1762
1763 1763 nsec = gethrtime();
1764 1764 rand1 = (nsec & 0xF);
1765 1765 rand2 = (nsec >> 4) & 0xF;
1766 1766 RDS_DPRINTF2("rds_randomize_qps", "rand1: %d rand2: %d",
1767 1767 rand1, rand2);
1768 1768
1769 1769 for (i = 0; i < rand1 + 3; i++) {
1770 1770 if (ibt_alloc_rc_channel(hcap->hca_hdl,
1771 1771 IBT_ACHAN_NO_FLAGS, &chanargs, &qp1, NULL) !=
1772 1772 IBT_SUCCESS) {
1773 1773 RDS_DPRINTF2("rds_randomize_qps",
1774 1774 "Bailing at i: %d", i);
1775 1775 (void) ibt_free_cq(cq_hdl);
1776 1776 return;
1777 1777 }
1778 1778 for (j = 0; j < rand2 + 3; j++) {
1779 1779 if (ibt_alloc_rc_channel(hcap->hca_hdl,
1780 1780 IBT_ACHAN_NO_FLAGS, &chanargs, &qp2,
1781 1781 NULL) != IBT_SUCCESS) {
1782 1782 RDS_DPRINTF2("rds_randomize_qps",
1783 1783 "Bailing at i: %d j: %d", i, j);
1784 1784 (void) ibt_free_channel(qp1);
1785 1785 (void) ibt_free_cq(cq_hdl);
1786 1786 return;
1787 1787 }
1788 1788 (void) ibt_free_channel(qp2);
1789 1789 }
1790 1790 (void) ibt_free_channel(qp1);
1791 1791 }
1792 1792
1793 1793 (void) ibt_free_cq(cq_hdl);
1794 1794 }
↓ open down ↓ |
171 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX