/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file rdma.c
 * Oracle elects to have and use the contents of rdma.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2007 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/ib/clients/of/rdma/ib_verbs.h>
#include <sys/ib/clients/of/rdma/ib_addr.h>
#include <sys/ib/clients/of/rdma/rdma_cm.h>

#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdma.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

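/* Direction hints passed to the transport's sync_mr() hook */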
#define DMA_TO_DEVICE 0
#define DMA_FROM_DEVICE 1
/* Mark an AVL node as unlinked by pointing its parent link at itself */
#define RB_CLEAR_NODE(nodep) AVL_SETPARENT(nodep, nodep)

/*
 * XXX
 *  - build with sparse
 *  - should we limit the size of a mr region?  let transport return failure?
 *  - should we detect duplicate keys on a socket?  hmm.
 *  - an rdma is an mlock, apply rlimit?
 */

/*
 * Get the number of pages by looking at the page indices that the start
 * and end addresses fall in.
 *
 * Returns 0 if the vec is invalid.  It is invalid if the number of bytes
 * causes the address to wrap or overflows an unsigned int.  This comes
 * from being stored in the 'length' member of 'struct rdsv3_scatterlist'.
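 *
 * For example, with 4K pages an iovec with addr 0x1ff0 and bytes 0x20
 * ends at byte 0x200f, straddling pages 1 and 2, so this returns 2.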
 */
static unsigned int
rdsv3_pages_in_vec(struct rds_iovec *vec)
{
        if ((vec->addr + vec->bytes <= vec->addr) ||
            (vec->bytes > (uint64_t)UINT_MAX)) {
                return (0);
        }

        return (((vec->addr + vec->bytes + PAGESIZE - 1) >>
            PAGESHIFT) - (vec->addr >> PAGESHIFT));
}

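/*
 * Look up the MR with R_Key 'key' in the socket's AVL tree.  If no match
 * is found and 'insert' is non-NULL, link 'insert' in at the spot found
 * by avl_find() and take a reference on it; NULL is returned in that
 * case so the caller knows the insert succeeded.
 */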
static struct rdsv3_mr *
rdsv3_mr_tree_walk(struct avl_tree *root, uint32_t key,
        struct rdsv3_mr *insert)
{
        struct rdsv3_mr *mr;
        avl_index_t where;

        mr = avl_find(root, &key, &where);
        if ((mr == NULL) && (insert != NULL)) {
                avl_insert(root, (void *)insert, where);
                atomic_inc_32(&insert->r_refcount);
                return (NULL);
        }

        return (mr);
}

/*
 * Destroy the transport-specific part of an MR.
 */
static void
rdsv3_destroy_mr(struct rdsv3_mr *mr)
{
        struct rdsv3_sock *rs = mr->r_sock;
        void *trans_private = NULL;
        avl_node_t *np;

        RDSV3_DPRINTF5("rdsv3_destroy_mr",
            "RDS: destroy mr key is %x refcnt %u",
            mr->r_key, atomic_get(&mr->r_refcount));

        if (test_and_set_bit(RDSV3_MR_DEAD, &mr->r_state))
                return;

        mutex_enter(&rs->rs_rdma_lock);
        np = &mr->r_rb_node;
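        /*
         * RB_CLEAR_NODE() marks an unlinked node by pointing its parent
         * link at itself, so only remove the node if it is still linked
         * into the tree.
         */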
        if (AVL_XPARENT(np) != np)
                avl_remove(&rs->rs_rdma_keys, mr);
        trans_private = mr->r_trans_private;
        mr->r_trans_private = NULL;
        mutex_exit(&rs->rs_rdma_lock);

        if (trans_private)
                mr->r_trans->free_mr(trans_private, mr->r_invalidate);
}

void
__rdsv3_put_mr_final(struct rdsv3_mr *mr)
{
        rdsv3_destroy_mr(mr);
        kmem_free(mr, sizeof (*mr));
}

/*
 * By the time this is called we can't have any more ioctls called on
 * the socket, so we don't need to worry about racing with others.
 */
void
rdsv3_rdma_drop_keys(struct rdsv3_sock *rs)
{
        struct rdsv3_mr *mr;
        struct avl_node *node;

        /* Release any MRs associated with this socket */
        mutex_enter(&rs->rs_rdma_lock);
        while ((node = avl_first(&rs->rs_rdma_keys))) {
                mr = container_of(node, struct rdsv3_mr, r_rb_node);
                if (mr->r_trans == rs->rs_transport)
                        mr->r_invalidate = 0;
                avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
                RB_CLEAR_NODE(&mr->r_rb_node);
                mutex_exit(&rs->rs_rdma_lock);
                rdsv3_destroy_mr(mr);
                rdsv3_mr_put(mr);
                mutex_enter(&rs->rs_rdma_lock);
        }
        mutex_exit(&rs->rs_rdma_lock);

        if (rs->rs_transport && rs->rs_transport->flush_mrs)
                rs->rs_transport->flush_mrs();
}

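/*
 * Common worker for rdsv3_get_mr(), rdsv3_get_mr_for_dest() and
 * rdsv3_cmsg_rdma_map(): obtain a transport-specific MR for args->vec,
 * publish it in the socket's key tree, and hand back the <R_Key, offset>
 * cookie (and, optionally, a referenced pointer to the MR itself).
 */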
static int
__rdsv3_rdma_map(struct rdsv3_sock *rs, struct rds_get_mr_args *args,
        uint64_t *cookie_ret, struct rdsv3_mr **mr_ret)
{
        struct rdsv3_mr *mr = NULL, *found;
        void *trans_private;
        rds_rdma_cookie_t cookie;
        unsigned int nents = 0;
        int ret;

        if (rs->rs_bound_addr == 0) {
                ret = -ENOTCONN; /* XXX not a great errno */
                goto out;
        }

        if (!rs->rs_transport->get_mr) {
                ret = -EOPNOTSUPP;
                goto out;
        }

        mr = kmem_zalloc(sizeof (struct rdsv3_mr), KM_NOSLEEP);
        if (!mr) {
                ret = -ENOMEM;
                goto out;
        }

        mr->r_refcount = 1;
        RB_CLEAR_NODE(&mr->r_rb_node);
        mr->r_trans = rs->rs_transport;
        mr->r_sock = rs;

        if (args->flags & RDS_RDMA_USE_ONCE)
                mr->r_use_once = 1;
        if (args->flags & RDS_RDMA_INVALIDATE)
                mr->r_invalidate = 1;
        if (args->flags & RDS_RDMA_READWRITE)
                mr->r_write = 1;

        /*
         * Obtain a transport-specific MR. If this succeeds, the
         * s/g list is now owned by the MR.
         * Note that dma_map() implies that pending writes are
         * flushed to RAM, so no dma_sync is needed here.
         */
        trans_private = rs->rs_transport->get_mr(&args->vec, nents, rs,
            &mr->r_key);

        if (IS_ERR(trans_private)) {
                ret = PTR_ERR(trans_private);
                goto out;
        }

        mr->r_trans_private = trans_private;

        /*
         * The user may pass us an unaligned address, but we can only
         * map page-aligned regions. So we keep the offset, and build
         * a 64-bit cookie containing <R_Key, offset> and pass that
         * around.
         */
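        /*
         * A minimal sketch of the packing, assuming the helper in
         * rdsv3/rdma.h follows the usual RDS layout, with the R_Key in
         * the low 32 bits and the byte offset in the high 32 bits:
         *
         *      cookie = r_key | ((uint64_t)offset << 32);
         */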
        cookie = rdsv3_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGEMASK);
        if (cookie_ret)
                *cookie_ret = cookie;

        /*
         * Copy the cookie out to the user address given in
         * args->cookie_addr.
         */
        if (args->cookie_addr) {
                ret = ddi_copyout((void *)&cookie,
                    (void *)((intptr_t)args->cookie_addr),
                    sizeof (rds_rdma_cookie_t), 0);
                if (ret != 0) {
                        ret = -EFAULT;
                        goto out;
                }
        }

        RDSV3_DPRINTF5("__rdsv3_rdma_map",
            "RDS: get_mr mr 0x%p addr 0x%llx key 0x%x",
            mr, args->vec.addr, mr->r_key);
        /*
         * Inserting the new MR into the rbtree bumps its
         * reference count.
         */
        mutex_enter(&rs->rs_rdma_lock);
        found = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
        mutex_exit(&rs->rs_rdma_lock);

        ASSERT(!(found && found != mr));

        if (mr_ret) {
                atomic_inc_32(&mr->r_refcount);
                *mr_ret = mr;
        }

        ret = 0;
out:
        if (mr)
                rdsv3_mr_put(mr);
        return (ret);
}

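/*
 * Socket-option handler: map the iovec described by optval and return
 * the resulting cookie to the user through args.cookie_addr.
 */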
int
rdsv3_get_mr(struct rdsv3_sock *rs, const void *optval, int optlen)
{
        struct rds_get_mr_args args;

        if (optlen != sizeof (struct rds_get_mr_args))
                return (-EINVAL);

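        /*
         * optval is evidently already an in-kernel copy of the option
         * value (the bcopy() path below is the one compiled in), so no
         * ddi_copyin() is required; the alternative is kept for reference.
         */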
#if 1
        bcopy((struct rds_get_mr_args *)optval, &args,
            sizeof (struct rds_get_mr_args));
#else
        if (ddi_copyin(optval, &args, optlen, 0))
                return (-EFAULT);
#endif

        return (__rdsv3_rdma_map(rs, &args, NULL, NULL));
}

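/*
 * Like rdsv3_get_mr(), except the argument structure also carries the
 * intended destination, which is currently ignored (see the note below).
 */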
int
rdsv3_get_mr_for_dest(struct rdsv3_sock *rs, const void *optval,
    int optlen)
{
        struct rds_get_mr_for_dest_args args;
        struct rds_get_mr_args new_args;

        if (optlen != sizeof (struct rds_get_mr_for_dest_args))
                return (-EINVAL);

#if 1
        bcopy((struct rds_get_mr_for_dest_args *)optval, &args,
            sizeof (struct rds_get_mr_for_dest_args));
#else
        if (ddi_copyin(optval, &args, optlen, 0))
                return (-EFAULT);
#endif

        /*
         * Initially, just behave like get_mr().
         * TODO: Implement get_mr as a wrapper around this
         *       and deprecate it.
         */
        new_args.vec = args.vec;
        new_args.cookie_addr = args.cookie_addr;
        new_args.flags = args.flags;

        return (__rdsv3_rdma_map(rs, &new_args, NULL, NULL));
}

/*
 * Free the MR indicated by the given R_Key
 */
int
rdsv3_free_mr(struct rdsv3_sock *rs, const void *optval, int optlen)
{
        struct rds_free_mr_args args;
        struct rdsv3_mr *mr;

        if (optlen != sizeof (struct rds_free_mr_args))
                return (-EINVAL);

#if 1
        bcopy((struct rds_free_mr_args *)optval, &args,
            sizeof (struct rds_free_mr_args));
#else
        if (ddi_copyin((struct rds_free_mr_args *)optval, &args,
            sizeof (struct rds_free_mr_args), 0))
                return (-EFAULT);
#endif

        /* Special case - a null cookie means flush all unused MRs */
        if (args.cookie == 0) {
                if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
                        return (-EINVAL);
                rs->rs_transport->flush_mrs();
                return (0);
        }

        /*
         * Look up the MR given its R_key and remove it from the rbtree
         * so nobody else finds it.
         * This should also prevent races with rdsv3_rdma_unuse.
         */
        mutex_enter(&rs->rs_rdma_lock);
        mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys,
            rdsv3_rdma_cookie_key(args.cookie), NULL);
        if (mr) {
                avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
                RB_CLEAR_NODE(&mr->r_rb_node);
                if (args.flags & RDS_RDMA_INVALIDATE)
                        mr->r_invalidate = 1;
        }
        mutex_exit(&rs->rs_rdma_lock);

        if (!mr)
                return (-EINVAL);

        /*
         * Call rdsv3_destroy_mr() ourselves so that we're sure it's done
         * by the time we return.  If we let rdsv3_mr_put() do it, it might
         * not happen until someone else drops their ref.
         */
        rdsv3_destroy_mr(mr);
        rdsv3_mr_put(mr);
        return (0);
}

/*
 * This is called when we receive an extension header that
 * tells us this MR was used. It allows us to implement
 * use_once semantics.
 */
void
rdsv3_rdma_unuse(struct rdsv3_sock *rs, uint32_t r_key, int force)
{
        struct rdsv3_mr *mr;
        int zot_me = 0;

        RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Enter rkey: 0x%x", r_key);

        mutex_enter(&rs->rs_rdma_lock);
        mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
        if (!mr) {
                RDSV3_DPRINTF4("rdsv3_rdma_unuse",
                    "rdsv3: trying to unuse MR with unknown r_key %u!", r_key);
                mutex_exit(&rs->rs_rdma_lock);
                return;
        }

        if (mr->r_use_once || force) {
                avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
                RB_CLEAR_NODE(&mr->r_rb_node);
                zot_me = 1;
        } else {
                atomic_inc_32(&mr->r_refcount);
        }
        mutex_exit(&rs->rs_rdma_lock);

        /*
         * May have to issue a dma_sync on this memory region.
         * Note we could avoid this if the operation was an RDMA READ,
         * but at this point we can't tell.
         */
        if (mr->r_trans->sync_mr)
                mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);

        /*
         * If the MR was marked as invalidate, this will
         * trigger an async flush.
         */
        if (zot_me)
                rdsv3_destroy_mr(mr);
        rdsv3_mr_put(mr);
        RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Return");
}

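/*
 * Release everything rdsv3_rdma_prepare() set up: unpin the user pages
 * behind each scatter/gather entry, then free the notifier (if any)
 * and the op itself.
 */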
void
rdsv3_rdma_free_op(struct rdsv3_rdma_op *ro)
{
        unsigned int i;

        /* deallocate RDMA resources on rdsv3_message */
        for (i = 0; i < ro->r_nents; i++) {
                ddi_umem_unlock(ro->r_rdma_sg[i].umem_cookie);
        }

        if (ro->r_notifier)
                kmem_free(ro->r_notifier, sizeof (*ro->r_notifier));
        kmem_free(ro, sizeof (*ro));
}

/*
 * args points to an in-kernel copy of the struct rds_rdma_args taken
 * from the sendmsg cmsg.
 */
static struct rdsv3_rdma_op *
rdsv3_rdma_prepare(struct rdsv3_sock *rs, struct rds_rdma_args *args)
{
        struct rds_iovec vec;
        struct rdsv3_rdma_op *op = NULL;
        unsigned int nr_bytes;
        struct rds_iovec *local_vec;
        unsigned int nr;
        unsigned int i;
        ddi_umem_cookie_t umem_cookie;
        size_t umem_len;
        caddr_t umem_addr;
        int ret;

        if (rs->rs_bound_addr == 0) {
                ret = -ENOTCONN; /* XXX not a great errno */
                goto out;
        }

        if (args->nr_local > (uint64_t)UINT_MAX) {
                ret = -EMSGSIZE;
                goto out;
        }

        op = kmem_zalloc(offsetof(struct rdsv3_rdma_op,
            r_rdma_sg[args->nr_local]), KM_NOSLEEP);
        if (op == NULL) {
                ret = -ENOMEM;
                goto out;
        }

        op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
        op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
        op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
        op->r_recverr = rs->rs_recverr;

        if (op->r_notify || op->r_recverr) {
                /*
                 * We allocate an uninitialized notifier here, because
                 * we don't want to do that in the completion handler. We
                 * would have to use GFP_ATOMIC there, and don't want to deal
                 * with failed allocations.
                 */
                op->r_notifier = kmem_alloc(sizeof (struct rdsv3_notifier),
                    KM_NOSLEEP);
                if (!op->r_notifier) {
                        ret = -ENOMEM;
                        goto out;
                }
                op->r_notifier->n_user_token = args->user_token;
                op->r_notifier->n_status = RDS_RDMA_SUCCESS;
        }

        /*
         * The cookie contains the R_Key of the remote memory region, and
         * optionally an offset into it. This is how we implement RDMA into
         * unaligned memory.
         * When setting up the RDMA, we need to add that offset to the
         * destination address (which is really an offset into the MR).
         * FIXME: We may want to move this into ib_rdma.c
         */
        op->r_key = rdsv3_rdma_cookie_key(args->cookie);
        op->r_remote_addr = args->remote_vec.addr +
            rdsv3_rdma_cookie_offset(args->cookie);

        nr_bytes = 0;

        RDSV3_DPRINTF5("rdsv3_rdma_prepare",
            "RDS: rdma prepare nr_local %llu rva %llx rkey %x",
            (unsigned long long)args->nr_local,
            (unsigned long long)args->remote_vec.addr,
            op->r_key);

        local_vec = (struct rds_iovec *)(unsigned long) args->local_vec_addr;

        /* pin the scatter list of user buffers */
        for (i = 0; i < args->nr_local; i++) {
                if (ddi_copyin(&local_vec[i], &vec,
                    sizeof (struct rds_iovec), 0)) {
                        ret = -EFAULT;
                        goto out;
                }

                nr = rdsv3_pages_in_vec(&vec);
                if (nr == 0) {
                        RDSV3_DPRINTF2("rdsv3_rdma_prepare",
                            "rdsv3_pages_in_vec returned 0");
                        ret = -EINVAL;
                        goto out;
                }

                rs->rs_user_addr = vec.addr;
                rs->rs_user_bytes = vec.bytes;

                /* pin user memory pages */
                umem_len = ptob(btopr(vec.bytes +
                    ((uintptr_t)vec.addr & PAGEOFFSET)));
                umem_addr = (caddr_t)((uintptr_t)vec.addr & ~PAGEOFFSET);
                ret = umem_lockmemory(umem_addr, umem_len,
                    DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ,
                    &umem_cookie, NULL, NULL);
                if (ret != 0) {
                        RDSV3_DPRINTF2("rdsv3_rdma_prepare",
                            "umem_lockmemory() returned %d", ret);
                        ret = -EFAULT;
                        goto out;
                }
                op->r_rdma_sg[i].umem_cookie = umem_cookie;
                op->r_rdma_sg[i].iovec = vec;
                /*
                 * Count the entry now, so that if a later iteration
                 * fails, the error path's rdsv3_rdma_free_op() unlocks
                 * everything pinned so far (previously r_nents was only
                 * set after the loop, leaking the locks on error).
                 */
                op->r_nents++;
                nr_bytes += vec.bytes;

                RDSV3_DPRINTF5("rdsv3_rdma_prepare",
                    "RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx",
                    nr_bytes, nr, vec.bytes, vec.addr);
        }

        if (nr_bytes > args->remote_vec.bytes) {
                RDSV3_DPRINTF2("rdsv3_rdma_prepare",
                    "RDS nr_bytes %u remote_bytes %u do not match",
                    nr_bytes, (unsigned int) args->remote_vec.bytes);
                ret = -EINVAL;
                goto out;
        }
        op->r_bytes = nr_bytes;

        ret = 0;
out:
        if (ret) {
                if (op)
                        rdsv3_rdma_free_op(op);
                op = ERR_PTR(ret);
        }
        return (op);
}

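/* CEIL(x, y): number of y-sized units needed to cover x, i.e. ceil(x/y) */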
#define CEIL(x, y)      (((x) + (y) - 1) / (y))

/*
 * The application asks for an RDMA transfer.
 * Extract all arguments and set up the rdma_op.
 */
int
rdsv3_cmsg_rdma_args(struct rdsv3_sock *rs, struct rdsv3_message *rm,
        struct cmsghdr *cmsg)
{
        struct rdsv3_rdma_op *op;
        /* uint64_t alignment on the buffer */
        uint64_t buf[CEIL(CMSG_LEN(sizeof (struct rds_rdma_args)),
            sizeof (uint64_t))];

        if (cmsg->cmsg_len != CMSG_LEN(sizeof (struct rds_rdma_args)) ||
            rm->m_rdma_op != NULL)
                return (-EINVAL);

        ASSERT(sizeof (buf) >= cmsg->cmsg_len && ((uintptr_t)buf & 0x7) == 0);

        bcopy(CMSG_DATA(cmsg), (char *)buf, cmsg->cmsg_len);
        op = rdsv3_rdma_prepare(rs, (struct rds_rdma_args *)buf);

        if (IS_ERR(op))
                return (PTR_ERR(op));
        rdsv3_stats_inc(s_send_rdma);
        rm->m_rdma_op = op;
        return (0);
}

/*
 * The application wants us to pass an RDMA destination (aka MR)
 * to the remote.
 */
int
rdsv3_cmsg_rdma_dest(struct rdsv3_sock *rs, struct rdsv3_message *rm,
        struct cmsghdr *cmsg)
{
        struct rdsv3_mr *mr;
        uint32_t r_key;
        int err = 0;

        if (cmsg->cmsg_len != CMSG_LEN(sizeof (rds_rdma_cookie_t)) ||
            rm->m_rdma_cookie != 0)
                return (-EINVAL);

        (void) memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg),
            sizeof (rm->m_rdma_cookie));

        /*
         * We are reusing a previously mapped MR here. Most likely, the
         * application has written to the buffer, so we need to explicitly
         * flush those writes to RAM. Otherwise the HCA may not see them
         * when doing a DMA from that buffer.
         */
        r_key = rdsv3_rdma_cookie_key(rm->m_rdma_cookie);

        mutex_enter(&rs->rs_rdma_lock);
        mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
        if (!mr)
                err = -EINVAL;  /* invalid r_key */
        else
                atomic_inc_32(&mr->r_refcount);
        mutex_exit(&rs->rs_rdma_lock);

        if (mr) {
                /*
                 * Guard against transports without a sync_mr hook, as
                 * rdsv3_rdma_unuse() does.
                 */
                if (mr->r_trans->sync_mr)
                        mr->r_trans->sync_mr(mr->r_trans_private,
                            DMA_TO_DEVICE);
                rm->m_rdma_mr = mr;
        }
        return (err);
}

/*
 * The application passes us an address range it wants to enable RDMA
 * to/from. We map the area, and save the <R_Key,offset> pair
 * in rm->m_rdma_cookie. This causes it to be sent along to the peer
 * in an extension header.
 */
int
rdsv3_cmsg_rdma_map(struct rdsv3_sock *rs, struct rdsv3_message *rm,
        struct cmsghdr *cmsg)
{
        /* uint64_t alignment on the buffer */
        uint64_t buf[CEIL(CMSG_LEN(sizeof (struct rds_get_mr_args)),
            sizeof (uint64_t))];
        int status;

        if (cmsg->cmsg_len != CMSG_LEN(sizeof (struct rds_get_mr_args)) ||
            rm->m_rdma_cookie != 0)
                return (-EINVAL);

        ASSERT(sizeof (buf) >= cmsg->cmsg_len && ((uintptr_t)buf & 0x7) == 0);

        bcopy(CMSG_DATA(cmsg), (char *)buf, cmsg->cmsg_len);
        status = __rdsv3_rdma_map(rs, (struct rds_get_mr_args *)buf,
            &rm->m_rdma_cookie, &rm->m_rdma_mr);

        return (status);
}