Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c
+++ new/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c
1 1 /*
2 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3 3 */
4 4
5 5 /*
6 6 * This file contains code imported from the OFED rds source file ib_recv.c
7 7 * Oracle elects to have and use the contents of ib_recv.c under and governed
8 8 * by the OpenIB.org BSD license (see below for full license text). However,
9 9 * the following notice accompanied the original version of this file:
10 10 */
11 11
12 12 /*
13 13 * Copyright (c) 2006 Oracle. All rights reserved.
14 14 *
15 15 * This software is available to you under a choice of one of two
16 16 * licenses. You may choose to be licensed under the terms of the GNU
17 17 * General Public License (GPL) Version 2, available from the file
18 18 * COPYING in the main directory of this source tree, or the
19 19 * OpenIB.org BSD license below:
20 20 *
21 21 * Redistribution and use in source and binary forms, with or
22 22 * without modification, are permitted provided that the following
23 23 * conditions are met:
24 24 *
25 25 * - Redistributions of source code must retain the above
26 26 * copyright notice, this list of conditions and the following
27 27 * disclaimer.
28 28 *
29 29 * - Redistributions in binary form must reproduce the above
30 30 * copyright notice, this list of conditions and the following
31 31 * disclaimer in the documentation and/or other materials
32 32 * provided with the distribution.
33 33 *
34 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 41 * SOFTWARE.
42 42 *
43 43 */
44 44 #include <sys/types.h>
45 45 #include <sys/kmem.h>
46 46 #include <sys/cpuvar.h>
47 47 #include <sys/rds.h>
48 48
49 49 #include <sys/ib/clients/rdsv3/rdsv3.h>
50 50 #include <sys/ib/clients/rdsv3/ib.h>
51 51 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
52 52
53 53 static struct kmem_cache *rdsv3_ib_incoming_slab;
54 54 static atomic_t rdsv3_ib_allocation = ATOMIC_INIT(0);
55 55
56 56 void
57 57 rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
58 58 {
59 59 struct rdsv3_ib_recv_work *recv;
60 60 struct rdsv3_header *hdrp;
61 61 uint32_t i;
62 62
63 63 RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic);
64 64
65 65 hdrp = ic->i_recv_hdrs;
66 66 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
67 67 recv->r_ibinc = NULL;
68 68 recv->r_frag = NULL;
69 69
70 70 /* initialize the hdr sgl permanently */
71 71 recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
72 72 recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
73 73 recv->r_sge[0].ds_key = ic->i_mr->lkey;
74 74 }
75 75 }
76 76
77 77 static void
78 78 rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic,
79 79 struct rdsv3_ib_recv_work *recv)
80 80 {
81 81 RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p",
82 82 ic, recv);
83 83
84 84 if (recv->r_ibinc) {
85 85 rdsv3_inc_put(&recv->r_ibinc->ii_inc);
86 86 recv->r_ibinc = NULL;
87 87 }
88 88
89 89 if (recv->r_frag) {
90 90 kmem_cache_free(ic->rds_ibdev->ib_frag_slab, recv->r_frag);
91 91 recv->r_frag = NULL;
92 92 }
93 93
94 94 RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p",
95 95 ic, recv);
96 96 }
97 97
98 98 void
99 99 rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
100 100 {
101 101 uint32_t i;
102 102
103 103 RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic);
104 104
105 105 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
106 106 rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]);
107 107 }
108 108
109 109 extern int atomic_add_unless(atomic_t *, uint_t, ulong_t);
110 110
111 111 static int
112 112 rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
113 113 struct rdsv3_ib_recv_work *recv)
114 114 {
115 115 struct rdsv3_ib_connection *ic = conn->c_transport_data;
116 116 ibt_mi_hdl_t mi_hdl;
117 117 ibt_iov_attr_t iov_attr;
118 118 ibt_iov_t iov_arr[1];
119 119
120 120 RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p",
121 121 conn, recv);
↓ open down ↓ |
121 lines elided |
↑ open up ↑ |
122 122
123 123 if (!recv->r_ibinc) {
124 124 if (!atomic_add_unless(&rdsv3_ib_allocation, 1,
125 125 ic->i_max_recv_alloc)) {
126 126 rdsv3_ib_stats_inc(s_ib_rx_alloc_limit);
127 127 goto out;
128 128 }
129 129 recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab,
130 130 KM_NOSLEEP);
131 131 if (recv->r_ibinc == NULL) {
132 - atomic_add_32(&rdsv3_ib_allocation, -1);
132 + atomic_dec_32(&rdsv3_ib_allocation);
133 133 goto out;
134 134 }
135 135 rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
136 136 recv->r_ibinc->ii_ibdev = ic->rds_ibdev;
137 137 recv->r_ibinc->ii_pool = ic->rds_ibdev->inc_pool;
138 138 }
139 139
140 140 if (!recv->r_frag) {
141 141 recv->r_frag = kmem_cache_alloc(ic->rds_ibdev->ib_frag_slab,
142 142 KM_NOSLEEP);
143 143 if (!recv->r_frag)
144 144 goto out;
145 145 }
146 146
↓ open down ↓ |
4 lines elided |
↑ open up ↑ |
147 147 /* Data sge, structure copy */
148 148 recv->r_sge[1] = recv->r_frag->f_sge;
149 149
150 150 RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "Return: conn: %p, recv: %p",
151 151 conn, recv);
152 152
153 153 return (0);
154 154 out:
155 155 if (recv->r_ibinc) {
156 156 kmem_cache_free(rdsv3_ib_incoming_slab, recv->r_ibinc);
157 - atomic_add_32(&rdsv3_ib_allocation, -1);
157 + atomic_dec_32(&rdsv3_ib_allocation);
158 158 recv->r_ibinc = NULL;
159 159 }
160 160 return (-ENOMEM);
161 161 }
162 162
163 163 /*
164 164 * This tries to allocate and post unused work requests after making sure that
165 165 * they have all the allocations they need to queue received fragments into
166 166 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
167 167 * pairs don't go unmatched.
168 168 *
169 169 * -1 is returned if posting fails due to temporary resource exhaustion.
170 170 */
171 171 int
172 172 rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill)
173 173 {
174 174 struct rdsv3_ib_connection *ic = conn->c_transport_data;
175 175 struct rdsv3_ib_recv_work *recv;
176 176 unsigned int posted = 0;
177 177 int ret = 0, avail;
178 178 uint32_t pos, i;
179 179
180 180 RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d",
181 181 conn, prefill);
182 182
183 183 if (prefill || rdsv3_conn_up(conn)) {
184 184 uint_t w_nr = ic->i_recv_ring.w_nr;
185 185
186 186 avail = rdsv3_ib_ring_alloc(&ic->i_recv_ring, w_nr, &pos);
187 187 if ((avail <= 0) || (pos >= w_nr)) {
188 188 RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
189 189 "Argh - ring alloc returned pos=%u, avail: %d",
190 190 pos, avail);
191 191 return (-EINVAL);
192 192 }
193 193
194 194 /* populate the WRs */
195 195 for (i = 0; i < avail; i++) {
196 196 recv = &ic->i_recvs[pos];
197 197 ret = rdsv3_ib_recv_refill_one(conn, recv);
198 198 if (ret) {
199 199 rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
200 200 avail - i);
201 201 break;
202 202 }
203 203 ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)pos;
204 204 ic->i_recv_wrs[i].wr_nds = RDSV3_IB_RECV_SGE;
205 205 ic->i_recv_wrs[i].wr_sgl = &recv->r_sge[0];
206 206
207 207 pos = (pos + 1) % w_nr;
208 208 }
209 209
210 210 if (i) {
211 211 /* post the WRs at one shot */
212 212 ret = ibt_post_recv(ib_get_ibt_channel_hdl(ic->i_cm_id),
213 213 &ic->i_recv_wrs[0], i, &posted);
214 214 RDSV3_DPRINTF3("rdsv3_ib_recv_refill",
215 215 "attempted: %d posted: %d WRs ret %d",
216 216 i, posted, ret);
217 217 if (ret) {
218 218 RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
219 219 "disconnecting and reconnecting\n",
220 220 NIPQUAD(conn->c_faddr), ret);
221 221 rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
222 222 i - posted);
223 223 rdsv3_conn_drop(conn);
224 224 }
225 225 }
226 226 }
227 227
228 228 /* We're doing flow control - update the window. */
229 229 if (ic->i_flowctl && posted)
230 230 rdsv3_ib_advertise_credits(conn, posted);
231 231
232 232 RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d",
233 233 conn, posted);
234 234 return (ret);
235 235 }
236 236
/*
 * delayed freed incoming's: rdsv3_ib_inc_free() queues finished
 * incomings here and rdsv3_ib_drain_inclist() frees them later.
 */
struct rdsv3_inc_pool {
	list_t f_list; /* list of freed incoming */
	kmutex_t f_lock; /* lock of fmr pool */
	int32_t f_listcnt; /* number of entries currently on f_list */
};
245 245
246 246 void
247 247 rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *rds_ibdev)
248 248 {
249 249 struct rdsv3_inc_pool *pool = rds_ibdev->inc_pool;
250 250
251 251 if (pool) {
252 252 list_destroy(&pool->f_list);
253 253 kmem_free((void *) pool, sizeof (*pool));
254 254 }
255 255 }
256 256
257 257 int
258 258 rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *rds_ibdev)
259 259 {
260 260 struct rdsv3_inc_pool *pool;
261 261
262 262 pool = (struct rdsv3_inc_pool *)kmem_zalloc(sizeof (*pool), KM_NOSLEEP);
263 263 if (pool == NULL) {
264 264 return (-ENOMEM);
265 265 }
266 266 list_create(&pool->f_list, sizeof (struct rdsv3_ib_incoming),
267 267 offsetof(struct rdsv3_ib_incoming, ii_obj));
268 268 mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL);
269 269 rds_ibdev->inc_pool = pool;
270 270 return (0);
271 271 }
272 272
273 273 static void
274 274 rdsv3_ib_inc_drop(struct rdsv3_ib_incoming *ibinc)
275 275 {
276 276 struct rdsv3_page_frag *frag;
277 277 struct rdsv3_page_frag *pos;
278 278
279 279 RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
280 280 list_remove_node(&frag->f_item);
281 281 kmem_cache_free(ibinc->ii_ibdev->ib_frag_slab, frag);
282 282 }
283 283
284 284 ASSERT(list_is_empty(&ibinc->ii_frags));
285 285 kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
286 286 atomic_dec_uint(&rdsv3_ib_allocation);
287 287 }
288 288
289 289 void
290 290 rdsv3_ib_drain_inclist(void *data)
291 291 {
292 292 struct rdsv3_inc_pool *pool = (struct rdsv3_inc_pool *)data;
293 293 struct rdsv3_ib_incoming *ibinc;
294 294 list_t *listp = &pool->f_list;
295 295 kmutex_t *lockp = &pool->f_lock;
296 296 int i = 0;
297 297
298 298 for (;;) {
299 299 mutex_enter(lockp);
300 300 ibinc = (struct rdsv3_ib_incoming *)list_remove_head(listp);
301 301 if (ibinc)
302 302 pool->f_listcnt--;
303 303 mutex_exit(lockp);
304 304 if (!ibinc)
305 305 break;
306 306 i++;
307 307 rdsv3_ib_inc_drop(ibinc);
308 308 }
309 309 }
310 310
311 311 void
312 312 rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
313 313 {
314 314 struct rdsv3_ib_incoming *ibinc;
315 315 rdsv3_af_thr_t *af_thr;
316 316
317 317 RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc);
318 318
319 319 ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
320 320 /* save af_thr in a local as ib_inc might be freed at mutex_exit */
321 321 af_thr = ibinc->ii_ibdev->inc_soft_cq;
322 322
323 323 mutex_enter(&ibinc->ii_pool->f_lock);
324 324 list_insert_tail(&ibinc->ii_pool->f_list, ibinc);
325 325 ibinc->ii_pool->f_listcnt++;
326 326 mutex_exit(&ibinc->ii_pool->f_lock);
327 327
328 328 rdsv3_af_thr_fire(af_thr);
329 329 }
330 330
331 331 int
332 332 rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
333 333 size_t size)
334 334 {
335 335 struct rdsv3_ib_incoming *ibinc;
336 336 struct rdsv3_page_frag *frag;
337 337 unsigned long to_copy;
338 338 unsigned long frag_off = 0;
339 339 int copied = 0;
340 340 int ret;
341 341 uint32_t len;
342 342
343 343 ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
344 344 frag = list_head(&ibinc->ii_frags);
345 345 len = ntohl(inc->i_hdr.h_len);
346 346
347 347 RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", "inc: %p, size: %d len: %d",
348 348 inc, size, len);
349 349
350 350 while (copied < size && copied < len) {
351 351 if (frag_off == RDSV3_FRAG_SIZE) {
352 352 frag = list_next(&ibinc->ii_frags, frag);
353 353 frag_off = 0;
354 354 }
355 355
356 356 to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off);
357 357 to_copy = min(size - copied, to_copy);
358 358
359 359 RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
360 360 "%lu bytes to user %p from frag [%p, %u] + %lu",
361 361 to_copy, uiop,
362 362 frag->f_page, frag->f_offset, frag_off);
363 363
364 364 ret = uiomove((caddr_t)(frag->f_page +
365 365 frag->f_offset + frag_off),
366 366 to_copy, UIO_READ, uiop);
367 367 if (ret) {
368 368 RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
369 369 "uiomove (%d) returned: %d", to_copy, ret);
370 370 break;
371 371 }
372 372
373 373 frag_off += to_copy;
374 374 copied += to_copy;
375 375 }
376 376
377 377 RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
378 378 "Return: inc: %p, copied: %d", inc, copied);
379 379
380 380 return (copied);
381 381 }
382 382
383 383 /* ic starts out kmem_zalloc()ed */
384 384 void
385 385 rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
386 386 {
387 387 ibt_send_wr_t *wr = &ic->i_ack_wr;
388 388 ibt_wr_ds_t *sge = &ic->i_ack_sge;
389 389
390 390 RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic);
391 391
392 392 sge->ds_va = ic->i_ack_dma;
393 393 sge->ds_len = sizeof (struct rdsv3_header);
394 394 sge->ds_key = ic->i_mr->lkey;
395 395
396 396 wr->wr_sgl = sge;
397 397 wr->wr_nds = 1;
398 398 wr->wr_opcode = IBT_WRC_SEND;
399 399 wr->wr_id = RDSV3_IB_ACK_WR_ID;
400 400 wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
401 401 }
402 402
403 403 /*
404 404 * You'd think that with reliable IB connections you wouldn't need to ack
405 405 * messages that have been received. The problem is that IB hardware generates
406 406 * an ack message before it has DMAed the message into memory. This creates a
407 407 * potential message loss if the HCA is disabled for any reason between when it
408 408 * sends the ack and before the message is DMAed and processed. This is only a
409 409 * potential issue if another HCA is available for fail-over.
410 410 *
411 411 * When the remote host receives our ack they'll free the sent message from
412 412 * their send queue. To decrease the latency of this we always send an ack
413 413 * immediately after we've received messages.
414 414 *
415 415 * For simplicity, we only have one ack in flight at a time. This puts
416 416 * pressure on senders to have deep enough send queues to absorb the latency of
417 417 * a single ack frame being in flight. This might not be good enough.
418 418 *
419 419 * This is implemented by have a long-lived send_wr and sge which point to a
420 420 * statically allocated ack frame. This ack wr does not fall under the ring
421 421 * accounting that the tx and rx wrs do. The QP attribute specifically makes
422 422 * room for it beyond the ring size. Send completion notices its special
423 423 * wr_id and avoids working with the ring in that case.
424 424 */
425 425 void
426 426 rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
427 427 int ack_required)
428 428 {
429 429 RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
430 430 ic, seq, ack_required);
431 431
432 432 mutex_enter(&ic->i_ack_lock);
433 433 ic->i_ack_next = seq;
434 434 if (ack_required)
435 435 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
436 436 mutex_exit(&ic->i_ack_lock);
437 437 }
438 438
439 439 static uint64_t
440 440 rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
441 441 {
442 442 uint64_t seq;
443 443
444 444 RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);
445 445
446 446 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
447 447
448 448 mutex_enter(&ic->i_ack_lock);
449 449 seq = ic->i_ack_next;
450 450 mutex_exit(&ic->i_ack_lock);
451 451
452 452 return (seq);
453 453 }
454 454
455 455 static void
456 456 rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
457 457 {
458 458 struct rdsv3_header *hdr = ic->i_ack;
459 459 uint64_t seq;
460 460 int ret;
461 461
462 462 RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
463 463 ic, adv_credits);
464 464
465 465 seq = rdsv3_ib_get_ack(ic);
466 466
467 467 RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
468 468 ic, (unsigned long long) seq);
469 469 rdsv3_message_populate_header(hdr, 0, 0, 0);
470 470 hdr->h_ack = htonll(seq);
471 471 hdr->h_credit = adv_credits;
472 472 rdsv3_message_make_checksum(hdr);
473 473 ic->i_ack_queued = jiffies;
474 474
475 475 ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr, 1,
476 476 NULL);
477 477 if (ret) {
478 478 /*
479 479 * Failed to send. Release the WR, and
480 480 * force another ACK.
481 481 */
482 482 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
483 483 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
484 484 rdsv3_ib_stats_inc(s_ib_ack_send_failure);
485 485 RDSV3_DPRINTF2("rdsv3_ib_send_ack", "sending ack failed\n");
486 486 rdsv3_conn_drop(ic->conn);
487 487 } else {
488 488 rdsv3_ib_stats_inc(s_ib_ack_sent);
489 489 }
490 490 RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
491 491 ic, adv_credits);
492 492 }
493 493
494 494 /*
495 495 * There are 3 ways of getting acknowledgements to the peer:
496 496 * 1. We call rdsv3_ib_attempt_ack from the recv completion handler
497 497 * to send an ACK-only frame.
498 498 * However, there can be only one such frame in the send queue
499 499 * at any time, so we may have to postpone it.
500 500 * 2. When another (data) packet is transmitted while there's
501 501 * an ACK in the queue, we piggyback the ACK sequence number
502 502 * on the data packet.
503 503 * 3. If the ACK WR is done sending, we get called from the
504 504 * send queue completion handler, and check whether there's
505 505 * another ACK pending (postponed because the WR was on the
506 506 * queue). If so, we transmit it.
507 507 *
508 508 * We maintain 2 variables:
509 509 * - i_ack_flags, which keeps track of whether the ACK WR
510 510 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
511 511 * - i_ack_next, which is the last sequence number we received
512 512 *
513 513 * Potentially, send queue and receive queue handlers can run concurrently.
514 514 * It would be nice to not have to use a spinlock to synchronize things,
515 515 * but the one problem that rules this out is that 64bit updates are
516 516 * not atomic on all platforms. Things would be a lot simpler if
517 517 * we had atomic64 or maybe cmpxchg64 everywhere.
518 518 *
519 519 * Reconnecting complicates this picture just slightly. When we
520 520 * reconnect, we may be seeing duplicate packets. The peer
521 521 * is retransmitting them, because it hasn't seen an ACK for
522 522 * them. It is important that we ACK these.
523 523 *
524 524 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
525 525 * this flag set *MUST* be acknowledged immediately.
526 526 */
527 527
528 528 /*
529 529 * When we get here, we're called from the recv queue handler.
530 530 * Check whether we ought to transmit an ACK.
531 531 */
532 532 void
533 533 rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
534 534 {
535 535 unsigned int adv_credits;
536 536
537 537 RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic);
538 538
539 539 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
540 540 return;
541 541
542 542 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
543 543 rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
544 544 return;
545 545 }
546 546
547 547 /* Can we get a send credit? */
548 548 if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
549 549 rdsv3_ib_stats_inc(s_ib_tx_throttle);
550 550 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
551 551 return;
552 552 }
553 553
554 554 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
555 555 rdsv3_ib_send_ack(ic, adv_credits);
556 556
557 557 RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic);
558 558 }
559 559
560 560 /*
561 561 * We get here from the send completion handler, when the
562 562 * adapter tells us the ACK frame was sent.
563 563 */
564 564 void
565 565 rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
566 566 {
567 567 RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic);
568 568 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
569 569 rdsv3_ib_attempt_ack(ic);
570 570 }
571 571
572 572 /*
573 573 * This is called by the regular xmit code when it wants to piggyback
574 574 * an ACK on an outgoing frame.
575 575 */
576 576 uint64_t
577 577 rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
578 578 {
579 579 RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic);
580 580 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
581 581 rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
582 582 }
583 583 return (rdsv3_ib_get_ack(ic));
584 584 }
585 585
586 586 /*
587 587 * It's kind of lame that we're copying from the posted receive pages into
588 588 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
589 589 * them. But receiving new congestion bitmaps should be a *rare* event, so
590 590 * hopefully we won't need to invest that complexity in making it more
591 591 * efficient. By copying we can share a simpler core with TCP which has to
592 592 * copy.
593 593 */
/*
 * Copy a received congestion-map update from the message's fragments
 * into the connection's long-lived congestion bitmap, 64 bits at a
 * time, and notify listeners of ports that became uncongested.
 */
static void
rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_cong_map *map;
	unsigned int map_off;	/* byte offset within the current map page */
	unsigned int map_page;	/* index into map->m_page_addrs */
	struct rdsv3_page_frag *frag;
	unsigned long frag_off;	/* byte offset within the current fragment */
	unsigned long to_copy;
	unsigned long copied;
	uint64_t uncongested = 0;
	caddr_t addr;

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p",
	    conn, ibinc);

	/* catch completely corrupt packets */
	if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
		return;

	map = conn->c_fcong;
	map_page = 0;
	map_off = 0;

	frag = list_head(&ibinc->ii_frags);
	frag_off = 0;

	copied = 0;

	while (copied < RDSV3_CONG_MAP_BYTES) {
		uint64_t *src, *dst;
		unsigned int k;

		/* copy up to the end of the fragment or the map page */
		to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
		ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */

		addr = frag->f_page + frag->f_offset;

		src = (uint64_t *)(addr + frag_off);
		dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off);
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %d", src, dst, copied);
		for (k = 0; k < to_copy; k += 8) {
			/*
			 * Record ports that became uncongested, ie
			 * bits that changed from 0 to 1.
			 */
			uncongested |= ~(*src) & *dst;
			*dst++ = *src++;
		}

		copied += to_copy;
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %d", src, dst, copied);

		/* advance the destination map position */
		map_off += to_copy;
		if (map_off == PAGE_SIZE) {
			map_off = 0;
			map_page++;
		}

		/* advance the source fragment position */
		frag_off += to_copy;
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}
	}

	/*
	 * NOTE(review): the upstream code byte-swapped 'uncongested' here
	 * (map is little-endian on the wire); this was disabled when the
	 * file was ported — confirm whether it matters on big-endian.
	 */
#if 0
XXX
	/* the congestion map is in little endian order */
	uncongested = le64_to_cpu(uncongested);
#endif

	rdsv3_cong_map_updated(map, uncongested);

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p",
	    conn, ibinc);
}
674 674
/*
 * Process one successfully completed receive: validate the RDS header,
 * handle the piggybacked ACK/credit fields, then either start a new
 * incoming message with this fragment or append it to the message in
 * progress.  When the final fragment arrives the message is delivered
 * (or treated as a congestion-map update) and *state is filled in for
 * the caller's ACK machinery.
 */
static void
rdsv3_ib_process_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv, uint32_t data_len,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_incoming *ibinc = ic->i_ibinc;
	struct rdsv3_header *ihdr, *hdr;

	/* XXX shut down the connection if port 0,0 are seen? */

	RDSV3_DPRINTF5("rdsv3_ib_process_recv",
	    "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);

	/* a completion shorter than the RDS header is unusable */
	if (data_len < sizeof (struct rdsv3_header)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv",
		    "incoming message from %u.%u.%u.%u didn't include a "
		    "header, disconnecting and reconnecting",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		return;
	}
	data_len -= sizeof (struct rdsv3_header);

	/* the header was received into the slot matching this ring entry */
	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];

	/* Validate the checksum. */
	if (!rdsv3_message_verify_checksum(ihdr)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
		    "from %u.%u.%u.%u has corrupted header - "
		    "forcing a reconnect",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		rdsv3_stats_inc(s_recv_drop_bad_checksum);
		return;
	}

	/* Process the ACK sequence which comes with every packet */
	state->ack_recv = ntohll(ihdr->h_ack);
	state->ack_recv_valid = 1;

	/* Process the credits update if there was one */
	if (ihdr->h_credit)
		rdsv3_ib_send_add_credits(conn, ihdr->h_credit);

	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
		/*
		 * This is an ACK-only packet. The fact that it gets
		 * special treatment here is that historically, ACKs
		 * were rather special beasts.
		 */
		rdsv3_ib_stats_inc(s_ib_ack_received);
		return;
	}

	/*
	 * If we don't already have an inc on the connection then this
	 * fragment has a header and starts a message.. copy its header
	 * into the inc and save the inc so we can hang upcoming fragments
	 * off its list.
	 */
	if (!ibinc) {
		/* take ownership of the preallocated inc on this ring entry */
		ibinc = recv->r_ibinc;
		recv->r_ibinc = NULL;
		ic->i_ibinc = ibinc;

		hdr = &ibinc->ii_inc.i_hdr;
		(void) memcpy(hdr, ihdr, sizeof (*hdr));
		ic->i_recv_data_rem = ntohl(hdr->h_len);

		RDSV3_DPRINTF5("rdsv3_ib_process_recv",
		    "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc,
		    ic->i_recv_data_rem, hdr->h_flags);
	} else {
		hdr = &ibinc->ii_inc.i_hdr;
		/*
		 * We can't just use memcmp here; fragments of a
		 * single message may carry different ACKs
		 */
		if (hdr->h_sequence != ihdr->h_sequence ||
		    hdr->h_len != ihdr->h_len ||
		    hdr->h_sport != ihdr->h_sport ||
		    hdr->h_dport != ihdr->h_dport) {
			RDSV3_DPRINTF2("rdsv3_ib_process_recv",
			    "fragment header mismatch; forcing reconnect");
			rdsv3_conn_drop(conn);
			return;
		}
	}

	/* hand this fragment over to the message; the ring entry refills */
	list_insert_tail(&ibinc->ii_frags, recv->r_frag);
	recv->r_frag = NULL;

	if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE)
		ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
	else {
		/* last fragment: the message is complete */
		ic->i_recv_data_rem = 0;
		ic->i_ibinc = NULL;

		if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP)
			rdsv3_ib_cong_recv(conn, ibinc);
		else {
			rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
			    &ibinc->ii_inc, KM_NOSLEEP);
			state->ack_next = ntohll(hdr->h_sequence);
			state->ack_next_valid = 1;
		}

		/*
		 * Evaluate the ACK_REQUIRED flag *after* we received
		 * the complete frame, and after bumping the next_rx
		 * sequence.
		 */
		if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) {
			rdsv3_stats_inc(s_recv_ack_required);
			state->ack_required = 1;
		}

		/* drop our reference; the upper layer holds its own now */
		rdsv3_inc_put(&ibinc->ii_inc);
	}

	RDSV3_DPRINTF4("rdsv3_ib_process_recv",
	    "Return: conn: %p recv: %p len: %d state: %p",
	    conn, recv, data_len, state);
}
800 800
801 801 void
802 802 rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
803 803 struct rdsv3_ib_ack_state *state)
804 804 {
805 805 struct rdsv3_connection *conn = ic->conn;
806 806 struct rdsv3_ib_recv_work *recv;
807 807 struct rdsv3_ib_work_ring *recv_ringp = &ic->i_recv_ring;
808 808
809 809 RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler",
810 810 "rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
811 811 (unsigned long long)wc->wc_id, wc->wc_status,
812 812 wc->wc_bytes_xfer, ntohl(wc->wc_immed_data));
813 813
814 814 rdsv3_ib_stats_inc(s_ib_rx_cq_event);
815 815
816 816 recv = &ic->i_recvs[rdsv3_ib_ring_oldest(recv_ringp)];
817 817
818 818 /*
819 819 * Also process recvs in connecting state because it is possible
820 820 * to get a recv completion _before_ the rdmacm ESTABLISHED
821 821 * event is processed.
822 822 */
823 823 if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) {
824 824 /* We expect errors as the qp is drained during shutdown */
825 825 if (wc->wc_status == IBT_WC_SUCCESS) {
826 826 rdsv3_ib_process_recv(conn, recv,
827 827 wc->wc_bytes_xfer, state);
828 828 } else {
829 829 RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler",
830 830 "recv completion on "
831 831 "%u.%u.%u.%u had status %u, "
832 832 "disconnecting and reconnecting\n",
833 833 NIPQUAD(conn->c_faddr),
834 834 wc->wc_status);
835 835 rdsv3_conn_drop(conn);
836 836 }
837 837 }
838 838
839 839 rdsv3_ib_ring_free(recv_ringp, 1);
840 840
841 841 /*
842 842 * If we ever end up with a really empty receive ring, we're
843 843 * in deep trouble, as the sender will definitely see RNR
844 844 * timeouts.
845 845 */
846 846 if (rdsv3_ib_ring_empty(recv_ringp))
847 847 rdsv3_ib_stats_inc(s_ib_rx_ring_empty);
848 848
849 849 if (rdsv3_ib_ring_low(recv_ringp)) {
850 850 rdsv3_af_thr_fire(ic->i_refill_rq);
851 851 }
852 852 }
853 853
854 854 int
855 855 rdsv3_ib_recv(struct rdsv3_connection *conn)
856 856 {
857 857 struct rdsv3_ib_connection *ic = conn->c_transport_data;
858 858 int ret = 0;
859 859
860 860 RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn);
861 861
862 862 if (rdsv3_conn_up(conn))
863 863 rdsv3_ib_attempt_ack(ic);
864 864
865 865 RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn);
866 866
867 867 return (ret);
868 868 }
869 869
870 870 extern int rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags);
871 871 extern void rdsv3_ib_inc_destructor(void *buf, void *arg);
872 872
873 873 int
874 874 rdsv3_ib_recv_init(void)
875 875 {
876 876 RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter");
877 877
878 878 rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
879 879 sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor,
880 880 rdsv3_ib_inc_destructor, NULL, NULL, NULL, 0);
881 881 if (!rdsv3_ib_incoming_slab) {
882 882 RDSV3_DPRINTF2("rdsv3_ib_recv_init", "kmem_cache_create "
883 883 "failed");
884 884 return (-ENOMEM);
885 885 }
886 886
887 887 RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return");
888 888 return (0);
889 889 }
890 890
891 891 void
892 892 rdsv3_ib_recv_exit(void)
893 893 {
894 894 RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter");
895 895 kmem_cache_destroy(rdsv3_ib_incoming_slab);
896 896 RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return");
897 897 }
↓ open down ↓ |
730 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX