1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
  27  *
  28  * This software is available to you under a choice of one of two
  29  * licenses.  You may choose to be licensed under the terms of the GNU
  30  * General Public License (GPL) Version 2, available from the file
  31  * COPYING in the main directory of this source tree, or the
  32  * OpenIB.org BSD license below:
  33  *
  34  *     Redistribution and use in source and binary forms, with or
  35  *     without modification, are permitted provided that the following
  36  *     conditions are met:
  37  *
  38  *      - Redistributions of source code must retain the above
  39  *        copyright notice, this list of conditions and the following
  40  *        disclaimer.
  41  *
  42  *      - Redistributions in binary form must reproduce the above
  43  *        copyright notice, this list of conditions and the following
  44  *        disclaimer in the documentation and/or other materials
  45  *        provided with the distribution.
  46  *
  47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  54  * SOFTWARE.
  55  *
  56  */
  57 /*
  58  * Sun elects to include this software in Sun product
  59  * under the OpenIB BSD license.
  60  *
  61  *
  62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  72  * POSSIBILITY OF SUCH DAMAGE.
  73  */
  74 
  75 #include <sys/ib/clients/rds/rdsib_cm.h>
  76 #include <sys/ib/clients/rds/rdsib_ib.h>
  77 #include <sys/ib/clients/rds/rdsib_buf.h>
  78 #include <sys/ib/clients/rds/rdsib_ep.h>
  79 
  80 /*
  81  * This file contains CM related work:
  82  *
  83  * Service registration/deregistration
  84  * Path lookup
  85  * CM connection callbacks
  86  * CM active and passive connection establishment
  87  * Connection failover
  88  */
  89 
/*
 * Shorthand member paths for the IPv4 source and destination addresses
 * inside an ibt_ip_cm_info_t (used as ipcm_info.SRCIP / ipcm_info.DSTIP).
 */
#define SRCIP   src_addr.un.ip4addr
#define DSTIP   dst_addr.un.ip4addr
  92 
  93 /*
  94  * Handle an incoming CM REQ
  95  */
  96 /* ARGSUSED */
  97 static ibt_cm_status_t
  98 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
  99     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
 100 {
 101         ibt_cm_req_rcv_t        *reqp;
 102         ib_gid_t                lgid, rgid;
 103         rds_cm_private_data_t   cmp;
 104         rds_session_t           *sp;
 105         rds_ep_t                *ep;
 106         ibt_channel_hdl_t       chanhdl;
 107         ibt_ip_cm_info_t        ipcm_info;
 108         uint8_t                 save_state, save_type;
 109         int                     ret;
 110 
 111         RDS_DPRINTF2("rds_handle_cm_req", "Enter");
 112 
 113         reqp = &evp->cm_event.req;
 114         rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
 115         lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
 116 
 117         RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
 118             rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
 119 
 120         /*
 121          * CM private data brings IP information
 122          * Private data received is a stream of bytes and may not be properly
 123          * aligned. So, bcopy the data onto the stack before accessing it.
 124          */
 125         bcopy((uint8_t *)evp->cm_priv_data, &cmp,
 126             sizeof (rds_cm_private_data_t));
 127 
 128         /* extract the CM IP info */
 129         ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
 130             &ipcm_info);
 131         if (ret != IBT_SUCCESS) {
 132                 RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
 133                     ret);
 134                 return (IBT_CM_REJECT);
 135         }
 136 
 137         RDS_DPRINTF2("rds_handle_cm_req",
 138             "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
 139             ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype);
 140 
 141         if (cmp.cmp_version != RDS_VERSION) {
 142                 RDS_DPRINTF2(LABEL, "Version Mismatch: Local version: %d "
 143                     "Remote version: %d", RDS_VERSION, cmp.cmp_version);
 144                 return (IBT_CM_REJECT);
 145         }
 146 
 147         /* RDS supports V4 addresses only */
 148         if ((ipcm_info.src_addr.family != AF_INET) ||
 149             (ipcm_info.dst_addr.family != AF_INET)) {
 150                 RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
 151                     "src: %d dst: %d", ipcm_info.src_addr.family,
 152                     ipcm_info.dst_addr.family);
 153                 return (IBT_CM_REJECT);
 154         }
 155 
 156         if (cmp.cmp_arch != RDS_THIS_ARCH) {
 157                 RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
 158                     cmp.cmp_arch, RDS_THIS_ARCH);
 159                 return (IBT_CM_REJECT);
 160         }
 161 
 162         if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
 163             (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
 164                 RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
 165                 return (IBT_CM_REJECT);
 166         }
 167 
 168         /* user_buffer_size should be same on all nodes */
 169         if (cmp.cmp_user_buffer_size != UserBufferSize) {
 170                 RDS_DPRINTF2(LABEL,
 171                     "UserBufferSize Mismatch, this node: %d remote node: %d",
 172                     UserBufferSize, cmp.cmp_user_buffer_size);
 173                 return (IBT_CM_REJECT);
 174         }
 175 
 176         /*
 177          * RDS needs more time to process a failover REQ so send an MRA.
 178          * Otherwise, the remote may retry the REQ and fail the connection.
 179          */
 180         if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
 181                 RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
 182                 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
 183                     10000000 /* 10 sec */, NULL, 0);
 184         }
 185 
 186         /* Is there a session to the destination node? */
 187         rw_enter(&statep->rds_sessionlock, RW_READER);
 188         sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid);
 189         rw_exit(&statep->rds_sessionlock);
 190 
 191         if (sp == NULL) {
 192                 /*
 193                  * currently there is no session to the destination
 194                  * remote ip in the private data is the local ip and vice
 195                  * versa
 196                  */
 197                 sp = rds_session_create(statep, ipcm_info.DSTIP,
 198                     ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE);
 199                 if (sp == NULL) {
 200                         /* Check the list anyway. */
 201                         rw_enter(&statep->rds_sessionlock, RW_READER);
 202                         sp = rds_session_lkup(statep, ipcm_info.SRCIP,
 203                             rgid.gid_guid);
 204                         rw_exit(&statep->rds_sessionlock);
 205                         if (sp == NULL) {
 206                                 /*
 207                                  * The only way this can fail is due to lack
 208                                  * of kernel resources
 209                                  */
 210                                 return (IBT_CM_REJECT);
 211                         }
 212                 }
 213         }
 214 
 215         rw_enter(&sp->session_lock, RW_WRITER);
 216 
 217         /* catch peer-to-peer case as soon as possible */
 218         if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
 219             (sp->session_state == RDS_SESSION_STATE_INIT)) {
 220                 /* Check possible peer-to-peer case here */
 221                 if (sp->session_type != RDS_SESSION_PASSIVE) {
 222                         RDS_DPRINTF2("rds_handle_cm_req",
 223                             "SP(%p) Peer-peer connection handling", sp);
 224                         if (lgid.gid_guid > rgid.gid_guid) {
 225                                 /* this node is active so reject this request */
 226                                 rw_exit(&sp->session_lock);
 227                                 return (IBT_CM_REJECT);
 228                         } else {
 229                                 /* this node is passive, change the session */
 230                                 sp->session_type = RDS_SESSION_PASSIVE;
 231                                 sp->session_lgid = lgid;
 232                                 sp->session_rgid = rgid;
 233                         }
 234                 }
 235         }
 236 
 237         RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
 238         save_state = sp->session_state;
 239         save_type = sp->session_type;
 240 
 241         switch (sp->session_state) {
 242         case RDS_SESSION_STATE_CONNECTED:
 243                 RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
 244                 sp->session_state = RDS_SESSION_STATE_ERROR;
 245                 RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
 246                     "RDS_SESSION_STATE_ERROR", sp);
 247 
 248                 /* FALLTHRU */
 249         case RDS_SESSION_STATE_ERROR:
 250         case RDS_SESSION_STATE_PASSIVE_CLOSING:
 251                 /*
 252                  * Some other thread must be processing this session,
 253                  * this thread must wait until the other thread finishes.
 254                  */
 255                 sp->session_type = RDS_SESSION_PASSIVE;
 256                 rw_exit(&sp->session_lock);
 257 
 258                 /* Handling this will take some time, so send an MRA */
 259                 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
 260                     10000000 /* 10 sec */, NULL, 0);
 261 
 262                 /*
 263                  * Any pending completions don't get flushed until the channel
 264                  * is closed. So, passing 0 here will not wait for pending
 265                  * completions in rds_session_close before closing the channel
 266                  */
 267                 rds_session_close(sp, IBT_NOCALLBACKS, 0);
 268 
 269                 rw_enter(&sp->session_lock, RW_WRITER);
 270 
 271                 /*
 272                  * If the session was in ERROR, then either a failover thread
 273                  * or event_failure thread would be processing this session.
 274                  * This thread should wait for event_failure thread to
 275                  * complete. This need not wait for failover thread.
 276                  */
 277                 if ((save_state != RDS_SESSION_STATE_CONNECTED) &&
 278                     (save_type == RDS_SESSION_PASSIVE)) {
 279                                 /*
 280                                  * The other thread is event_failure thread,
 281                                  * wait until it finishes.
 282                                  */
 283                                 while (!((sp->session_state ==
 284                                     RDS_SESSION_STATE_FAILED) ||
 285                                     (sp->session_state ==
 286                                     RDS_SESSION_STATE_FINI))) {
 287                                         rw_exit(&sp->session_lock);
 288                                         delay(drv_usectohz(1000000));
 289                                         rw_enter(&sp->session_lock, RW_WRITER);
 290                                 }
 291                 }
 292 
 293                 /* move the session to init state */
 294                 if ((sp->session_state == RDS_SESSION_STATE_ERROR) ||
 295                     (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING)) {
 296                         ret = rds_session_reinit(sp, lgid);
 297                         sp->session_myip = ipcm_info.DSTIP;
 298                         sp->session_lgid = lgid;
 299                         sp->session_rgid = rgid;
 300                         if (ret != 0) {
 301                                 rds_session_fini(sp);
 302                                 sp->session_state = RDS_SESSION_STATE_FAILED;
 303                                 RDS_DPRINTF3("rds_handle_cm_req",
 304                                     "SP(%p) State RDS_SESSION_STATE_FAILED",
 305                                     sp);
 306                                 rw_exit(&sp->session_lock);
 307                                 return (IBT_CM_REJECT);
 308                         } else {
 309                                 sp->session_state = RDS_SESSION_STATE_INIT;
 310                                 RDS_DPRINTF3("rds_handle_cm_req",
 311                                     "SP(%p) State RDS_SESSION_STATE_INIT", sp);
 312                         }
 313 
 314                         if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
 315                                 ep = &sp->session_ctrlep;
 316                         } else {
 317                                 ep = &sp->session_dataep;
 318                         }
 319                         break;
 320                 }
 321 
 322                 /* FALLTHRU */
 323         case RDS_SESSION_STATE_CREATED:
 324         case RDS_SESSION_STATE_FAILED:
 325         case RDS_SESSION_STATE_FINI:
 326                 /*
 327                  * Initialize both channels, we accept this connection
 328                  * only if both channels are initialized
 329                  */
 330                 sp->session_type = RDS_SESSION_PASSIVE;
 331                 sp->session_lgid = lgid;
 332                 sp->session_rgid = rgid;
 333                 sp->session_state = RDS_SESSION_STATE_CREATED;
 334                 RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
 335                     "RDS_SESSION_STATE_CREATED", sp);
 336                 ret = rds_session_init(sp);
 337                 if (ret != 0) {
 338                         /* Seems like there are not enough resources */
 339                         sp->session_state = RDS_SESSION_STATE_FAILED;
 340                         RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
 341                             "RDS_SESSION_STATE_FAILED", sp);
 342                         rw_exit(&sp->session_lock);
 343                         return (IBT_CM_REJECT);
 344                 }
 345                 sp->session_state = RDS_SESSION_STATE_INIT;
 346                 RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
 347                     "RDS_SESSION_STATE_INIT", sp);
 348 
 349                 /* FALLTHRU */
 350         case RDS_SESSION_STATE_INIT:
 351                 /*
 352                  * When re-using an existing session, make sure the
 353                  * session is still through the same HCA. Otherwise, the
 354                  * memory registrations have to moved to the new HCA.
 355                  */
 356                 if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
 357                         if (sp->session_lgid.gid_guid != lgid.gid_guid) {
 358                                 RDS_DPRINTF2("rds_handle_cm_req",
 359                                     "Existing Session but different gid "
 360                                     "existing: 0x%llx, new: 0x%llx, "
 361                                     "sending an MRA",
 362                                     sp->session_lgid.gid_guid, lgid.gid_guid);
 363                                 (void) ibt_cm_delay(IBT_CM_DELAY_REQ,
 364                                     evp->cm_session_id, 10000000 /* 10 sec */,
 365                                     NULL, 0);
 366                                 ret = rds_session_reinit(sp, lgid);
 367                                 if (ret != 0) {
 368                                         rds_session_fini(sp);
 369                                         sp->session_state =
 370                                             RDS_SESSION_STATE_FAILED;
 371                                         sp->session_failover = 0;
 372                                         RDS_DPRINTF3("rds_failover_session",
 373                                             "SP(%p) State "
 374                                             "RDS_SESSION_STATE_FAILED", sp);
 375                                         rw_exit(&sp->session_lock);
 376                                         return (IBT_CM_REJECT);
 377                                 }
 378                         }
 379                         ep = &sp->session_dataep;
 380                 } else {
 381                         ep = &sp->session_ctrlep;
 382                 }
 383 
 384                 break;
 385         default:
 386                 RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
 387                     "state: %d", sp, sp->session_state);
 388                 rw_exit(&sp->session_lock);
 389                 return (IBT_CM_REJECT);
 390         }
 391 
 392         sp->session_failover = 0; /* reset any previous value */
 393         if (cmp.cmp_failover) {
 394                 RDS_DPRINTF2("rds_handle_cm_req",
 395                     "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
 396                 sp->session_failover = 1;
 397         }
 398 
 399         mutex_enter(&ep->ep_lock);
 400         if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
 401                 ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
 402                 sp->session_type = RDS_SESSION_PASSIVE;
 403                 rw_exit(&sp->session_lock);
 404         } else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
 405                 rw_exit(&sp->session_lock);
 406                 /*
 407                  * Peer to peer connection. There is an active
 408                  * connection pending on this ep. The one with
 409                  * greater port guid becomes active and the
 410                  * other becomes passive.
 411                  */
 412                 RDS_DPRINTF2("rds_handle_cm_req",
 413                     "EP(%p) Peer-peer connection handling", ep);
 414                 if (lgid.gid_guid > rgid.gid_guid) {
 415                         /* this node is active so reject this request */
 416                         mutex_exit(&ep->ep_lock);
 417                         RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
 418                             "Rejecting passive in favor of active", sp, ep);
 419                         return (IBT_CM_REJECT);
 420                 } else {
 421                         /*
 422                          * This session is not the active end, change it
 423                          * to passive end.
 424                          */
 425                         ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
 426 
 427                         rw_enter(&sp->session_lock, RW_WRITER);
 428                         sp->session_type = RDS_SESSION_PASSIVE;
 429                         sp->session_lgid = lgid;
 430                         sp->session_rgid = rgid;
 431                         rw_exit(&sp->session_lock);
 432                 }
 433         } else {
 434                 rw_exit(&sp->session_lock);
 435         }
 436 
 437         ep->ep_lbufid = cmp.cmp_last_bufid;
 438         ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
 439         ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
 440         cmp.cmp_last_bufid = ep->ep_rbufid;
 441         cmp.cmp_ack_addr = ep->ep_ack_addr;
 442         cmp.cmp_ack_rkey = ep->ep_ack_rkey;
 443         mutex_exit(&ep->ep_lock);
 444 
 445         /* continue with accepting the connection request for this channel */
 446         chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
 447         if (chanhdl == NULL) {
 448                 mutex_enter(&ep->ep_lock);
 449                 ep->ep_state = RDS_EP_STATE_UNCONNECTED;
 450                 mutex_exit(&ep->ep_lock);
 451                 return (IBT_CM_REJECT);
 452         }
 453 
 454         /* pre-post recv buffers in the RQ */
 455         rds_post_recv_buf((void *)chanhdl);
 456 
 457         rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
 458         bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
 459         rargsp->cm_ret.rep.cm_channel = chanhdl;
 460         rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
 461         rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
 462         rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
 463 
 464         RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
 465             sp, ep, chanhdl);
 466 
 467         return (IBT_CM_ACCEPT);
 468 }
 469 
 470 /*
 471  * Handle an incoming CM REP
 472  * Pre-post recv buffers for the QP
 473  */
 474 /* ARGSUSED */
 475 static ibt_cm_status_t
 476 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
 477     void *rcmp, ibt_priv_data_len_t rcmp_len)
 478 {
 479         rds_ep_t        *ep;
 480         rds_cm_private_data_t   cmp;
 481 
 482         RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
 483 
 484         /* pre-post recv buffers in the RQ */
 485         rds_post_recv_buf((void *)evp->cm_channel);
 486 
 487         ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
 488         bcopy((uint8_t *)evp->cm_priv_data, &cmp,
 489             sizeof (rds_cm_private_data_t));
 490         ep->ep_lbufid = cmp.cmp_last_bufid;
 491         ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
 492         ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
 493 
 494         rargsp->cm_ret_len = 0;
 495 
 496         RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
 497 
 498         return (IBT_CM_ACCEPT);
 499 }
 500 
 501 /*
 502  * Handle CONN EST
 503  */
 504 static ibt_cm_status_t
 505 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
 506 {
 507         rds_session_t   *sp;
 508         rds_ep_t        *ep;
 509 
 510         ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
 511 
 512         RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
 513             ep->ep_state);
 514 
 515         mutex_enter(&ep->ep_lock);
 516         ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
 517             (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
 518         ep->ep_state = RDS_EP_STATE_CONNECTED;
 519         ep->ep_chanhdl = evp->cm_channel;
 520         sp = ep->ep_sp;
 521         mutex_exit(&ep->ep_lock);
 522 
 523         (void) rds_session_active(sp);
 524 
 525         RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
 526         return (IBT_CM_ACCEPT);
 527 }
 528 
 529 /*
 530  * Handle CONN CLOSED
 531  */
 532 static ibt_cm_status_t
 533 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
 534 {
 535         rds_ep_t        *ep;
 536         rds_session_t   *sp;
 537 
 538         /* Catch DREQs but ignore DREPs */
 539         if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
 540                 RDS_DPRINTF2("rds_handle_cm_conn_closed",
 541                     "Ignoring Event: %d received", evp->cm_event.closed);
 542                 return (IBT_CM_ACCEPT);
 543         }
 544 
 545         ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
 546         sp = ep->ep_sp;
 547         RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Chan(%p) Enter",
 548             ep, evp->cm_channel);
 549 
 550         mutex_enter(&ep->ep_lock);
 551         if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
 552                 /* Ignore this DREQ */
 553                 RDS_DPRINTF2("rds_handle_cm_conn_closed",
 554                     "EP(%p) not connected, state: %d", ep, ep->ep_state);
 555                 mutex_exit(&ep->ep_lock);
 556                 return (IBT_CM_ACCEPT);
 557         }
 558         ep->ep_state = RDS_EP_STATE_CLOSING;
 559         mutex_exit(&ep->ep_lock);
 560 
 561         rw_enter(&sp->session_lock, RW_WRITER);
 562         RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
 563             sp->session_state);
 564 
 565         switch (sp->session_state) {
 566         case RDS_SESSION_STATE_CONNECTED:
 567         case RDS_SESSION_STATE_HCA_CLOSING:
 568                 sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
 569                 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
 570                     "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
 571                 break;
 572 
 573         case RDS_SESSION_STATE_PASSIVE_CLOSING:
 574                 sp->session_state = RDS_SESSION_STATE_CLOSED;
 575                 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
 576                     "RDS_SESSION_STATE_CLOSED", sp);
 577                 rds_passive_session_fini(sp);
 578                 sp->session_state = RDS_SESSION_STATE_FINI;
 579                 RDS_DPRINTF3("rds_handle_cm_conn_closed",
 580                     "SP(%p) State RDS_SESSION_STATE_FINI", sp);
 581                 break;
 582 
 583         case RDS_SESSION_STATE_ACTIVE_CLOSING:
 584         case RDS_SESSION_STATE_ERROR:
 585         case RDS_SESSION_STATE_CLOSED:
 586                 break;
 587 
 588         case RDS_SESSION_STATE_INIT:
 589                 sp->session_state = RDS_SESSION_STATE_ERROR;
 590                 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
 591                     "RDS_SESSION_STATE_ERROR", sp);
 592                 rds_passive_session_fini(sp);
 593                 sp->session_state = RDS_SESSION_STATE_FAILED;
 594                 RDS_DPRINTF3("rds_handle_cm_conn_closed",
 595                     "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
 596                 break;
 597 
 598         default:
 599                 RDS_DPRINTF2("rds_handle_cm_conn_closed",
 600                     "SP(%p) - Unexpected state: %d", sp, sp->session_state);
 601                 rds_passive_session_fini(sp);
 602                 sp->session_state = RDS_SESSION_STATE_FAILED;
 603                 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
 604                     "RDS_SESSION_STATE_FAILED", sp);
 605         }
 606         rw_exit(&sp->session_lock);
 607 
 608         mutex_enter(&ep->ep_lock);
 609         ep->ep_state = RDS_EP_STATE_CLOSED;
 610         mutex_exit(&ep->ep_lock);
 611 
 612         RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
 613         return (IBT_CM_ACCEPT);
 614 }
 615 
 616 /*
 617  * Handle EVENT FAILURE
 618  */
 619 static ibt_cm_status_t
 620 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
 621 {
 622         rds_ep_t        *ep;
 623         rds_session_t   *sp;
 624         int             ret;
 625 
 626         RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
 627             "Code: %d msg: %d reason: %d", evp->cm_channel,
 628             evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
 629             evp->cm_event.failed.cf_reason);
 630 
 631         if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
 632                 RDS_DPRINTF2(LABEL,
 633                     "Received REJ with reason IBT_CM_INVALID_SID: "
 634                     "RDS may not be loaded on the remote system");
 635         }
 636 
 637         if (evp->cm_channel == NULL) {
 638                 return (IBT_CM_ACCEPT);
 639         }
 640 
 641         if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) &&
 642             (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) {
 643                 /*
 644                  * This end is active, just ignore, ibt_open_rc_channel()
 645                  * caller will take care of cleanup.
 646                  */
 647                 RDS_DPRINTF2("rds_handle_cm_event_failure",
 648                     "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel);
 649                 return (IBT_CM_ACCEPT);
 650         }
 651 
 652         ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
 653         sp = ep->ep_sp;
 654 
 655         rw_enter(&sp->session_lock, RW_WRITER);
 656         if (sp->session_type == RDS_SESSION_PASSIVE) {
 657                 RDS_DPRINTF2("rds_handle_cm_event_failure",
 658                     "SP(%p) - state: %d", sp, sp->session_state);
 659                 if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
 660                     (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
 661                         sp->session_state = RDS_SESSION_STATE_ERROR;
 662                         RDS_DPRINTF3("rds_handle_cm_event_failure",
 663                             "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
 664 
 665                         /*
 666                          * Store the cm_channel for freeing later
 667                          * Active side frees it on ibt_open_rc_channel
 668                          * failure
 669                          */
 670                         if (ep->ep_chanhdl == NULL) {
 671                                 ep->ep_chanhdl = evp->cm_channel;
 672                         }
 673                         rw_exit(&sp->session_lock);
 674 
 675                         /*
 676                          * rds_passive_session_fini should not be called
 677                          * directly in the CM handler. It will cause a deadlock.
 678                          */
 679                         ret = ddi_taskq_dispatch(rds_taskq,
 680                             rds_cleanup_passive_session, (void *)sp,
 681                             DDI_NOSLEEP);
 682                         if (ret != DDI_SUCCESS) {
 683                                 RDS_DPRINTF2("rds_handle_cm_event_failure",
 684                                     "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
 685                         }
 686                         return (IBT_CM_ACCEPT);
 687                 }
 688         }
 689         rw_exit(&sp->session_lock);
 690 
 691         RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
 692         return (IBT_CM_ACCEPT);
 693 }
 694 
 695 /*
 696  * CM Handler
 697  *
 698  * Called by IBCM
 699  * The cm_private type differs for active and passive events.
 700  */
 701 ibt_cm_status_t
 702 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
 703     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
 704     ibt_priv_data_len_t ret_len_max)
 705 {
 706         ibt_cm_status_t         ret = IBT_CM_ACCEPT;
 707 
 708         RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
 709 
 710         switch (eventp->cm_type) {
 711         case IBT_CM_EVENT_REQ_RCV:
 712                 ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
 713                     ret_args, ret_priv_data, ret_len_max);
 714                 break;
 715         case IBT_CM_EVENT_REP_RCV:
 716                 ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
 717                     ret_len_max);
 718                 break;
 719         case IBT_CM_EVENT_MRA_RCV:
 720                 /* Not supported */
 721                 break;
 722         case IBT_CM_EVENT_CONN_EST:
 723                 ret = rds_handle_cm_conn_est(eventp);
 724                 break;
 725         case IBT_CM_EVENT_CONN_CLOSED:
 726                 ret = rds_handle_cm_conn_closed(eventp);
 727                 break;
 728         case IBT_CM_EVENT_FAILURE:
 729                 ret = rds_handle_cm_event_failure(eventp);
 730                 break;
 731         case IBT_CM_EVENT_LAP_RCV:
 732                 /* Not supported */
 733                 RDS_DPRINTF2(LABEL, "LAP message received");
 734                 break;
 735         case IBT_CM_EVENT_APR_RCV:
 736                 /* Not supported */
 737                 RDS_DPRINTF2(LABEL, "APR message received");
 738                 break;
 739         default:
 740                 break;
 741         }
 742 
 743         RDS_DPRINTF2("rds_cm_handler", "Return");
 744 
 745         return (ret);
 746 }
 747 
/*
 * IP port number used by RDS. This is based on OFED Linux RDS.
 *
 * In this file the value is handed to ibt_get_ip_sid() to obtain the RDS
 * service id, and it is stored in the src_port field of the IP CM
 * information that is formatted into the private data when a channel is
 * opened.
 */
#define RDS_PORT_NUM    6556
 750 
 751 /*
 752  * Register the wellknown service with service id: RDS_SERVICE_ID
 753  * Incoming connection requests should arrive on this service id.
 754  */
 755 ibt_srv_hdl_t
 756 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
 757 {
 758         ibt_srv_hdl_t   srvhdl;
 759         ibt_srv_desc_t  srvdesc;
 760         int             ret;
 761 
 762         RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
 763 
 764         bzero(&srvdesc, sizeof (ibt_srv_desc_t));
 765         srvdesc.sd_handler = rds_cm_handler;
 766         srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
 767 
 768         /*
 769          * This is the new service id as per:
 770          * Annex A11: RDMA IP CM Service
 771          */
 772         rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
 773             RDS_PORT_NUM);
 774         ret = ibt_register_service(rds_ibhdl, &srvdesc,
 775             rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
 776         if (ret != IBT_SUCCESS) {
 777                 RDS_DPRINTF2(LABEL,
 778                     "RDS Service (0x%llx) Registration Failed: %d",
 779                     rdsib_statep->rds_service_id, ret);
 780                 return (NULL);
 781         }
 782 
 783         RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
 784         return (srvhdl);
 785 }
 786 
 787 /* Bind the RDS service on all ports */
 788 int
 789 rds_bind_service(rds_state_t *statep)
 790 {
 791         rds_hca_t       *hcap;
 792         ib_gid_t        gid;
 793         uint_t          jx, nbinds = 0, nports = 0;
 794         int             ret;
 795 
 796         RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
 797 
 798         rw_enter(&statep->rds_hca_lock, RW_READER);
 799 
 800         hcap = statep->rds_hcalistp;
 801         while (hcap != NULL) {
 802 
 803                 /* skip the HCAs that are not fully online */
 804                 if ((hcap->hca_state != RDS_HCA_STATE_OPEN) &&
 805                     (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED)) {
 806                         RDS_DPRINTF2("rds_bind_service",
 807                             "Skipping HCA: 0x%llx, state: %d",
 808                             hcap->hca_guid, hcap->hca_state);
 809                         hcap = hcap->hca_nextp;
 810                         continue;
 811                 }
 812 
 813                 /* currently, we have space for only 4 bindhdls */
 814                 ASSERT(hcap->hca_nports < 4);
 815                 for (jx = 0; jx < hcap->hca_nports; jx++) {
 816                         nports++;
 817                         if (hcap->hca_pinfop[jx].p_linkstate !=
 818                             IBT_PORT_ACTIVE) {
 819                                 /*
 820                                  * service bind will be called in the async
 821                                  * handler when the port comes up. Clear any
 822                                  * stale bind handle.
 823                                  */
 824                                 hcap->hca_bindhdl[jx] = NULL;
 825                                 continue;
 826                         }
 827 
 828                         gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
 829                         RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
 830                             "gid: %llx:%llx", hcap->hca_guid,
 831                             hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
 832                             gid.gid_guid);
 833 
 834                         /* pass statep as cm_private */
 835                         ret = ibt_bind_service(statep->rds_srvhdl, gid,
 836                             NULL, statep, &hcap->hca_bindhdl[jx]);
 837                         if (ret != IBT_SUCCESS) {
 838                                 RDS_DPRINTF2(LABEL, "Bind service for "
 839                                     "HCA: 0x%llx Port: %d gid %llx:%llx "
 840                                     "failed: %d", hcap->hca_guid,
 841                                     hcap->hca_pinfop[jx].p_port_num,
 842                                     gid.gid_prefix, gid.gid_guid, ret);
 843                                 continue;
 844                         }
 845 
 846                         nbinds++;
 847                 }
 848                 hcap = hcap->hca_nextp;
 849         }
 850 
 851         rw_exit(&statep->rds_hca_lock);
 852 
 853         RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
 854             nbinds, nports);
 855 
 856 #if 0
 857         if (nbinds == 0) {
 858                 return (-1);
 859         }
 860 #endif
 861 
 862         RDS_DPRINTF2("rds_bind_service", "Return");
 863 
 864         return (0);
 865 }
 866 
 867 /* Open an RC connection */
 868 int
 869 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
 870     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
 871 {
 872         rds_session_t           *sp;
 873         ibt_chan_open_args_t    ocargs;
 874         ibt_rc_returns_t        ocrets;
 875         rds_cm_private_data_t   cmp;
 876         uint8_t                 hca_port;
 877         ibt_channel_hdl_t       hdl;
 878         ibt_status_t            ret = 0;
 879         ibt_ip_cm_info_t        ipcm_info;
 880 
 881         RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
 882 
 883         sp = ep->ep_sp;
 884 
 885         bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
 886         ipcm_info.src_addr.family = AF_INET;
 887         ipcm_info.SRCIP = sp->session_myip;
 888         ipcm_info.dst_addr.family = AF_INET;
 889         ipcm_info.DSTIP = sp->session_remip;
 890         ipcm_info.src_port = RDS_PORT_NUM;
 891         ret = ibt_format_ip_private_data(&ipcm_info,
 892             sizeof (rds_cm_private_data_t), &cmp);
 893         if (ret != IBT_SUCCESS) {
 894                 RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
 895                     "failed: %d", sp, ep, ret);
 896                 return (-1);
 897         }
 898 
 899         hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
 900 
 901         hdl = rds_ep_alloc_rc_channel(ep, hca_port);
 902         if (hdl == NULL) {
 903                 return (-1);
 904         }
 905 
 906         cmp.cmp_version = RDS_VERSION;
 907         cmp.cmp_arch = RDS_THIS_ARCH;
 908         cmp.cmp_eptype = ep->ep_type;
 909         cmp.cmp_failover = sp->session_failover;
 910         cmp.cmp_last_bufid = ep->ep_rbufid;
 911         cmp.cmp_user_buffer_size = UserBufferSize;
 912         cmp.cmp_ack_addr = ep->ep_ack_addr;
 913         cmp.cmp_ack_rkey = ep->ep_ack_rkey;
 914 
 915         bzero(&ocargs, sizeof (ibt_chan_open_args_t));
 916         bzero(&ocrets, sizeof (ibt_rc_returns_t));
 917         ocargs.oc_path = pinfo;
 918         ocargs.oc_cm_handler = rds_cm_handler;
 919         ocargs.oc_cm_clnt_private = NULL;
 920         ocargs.oc_rdma_ra_out = 4;
 921         ocargs.oc_rdma_ra_in = 4;
 922         ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
 923         ocargs.oc_priv_data = &cmp;
 924         ocargs.oc_path_retry_cnt = IBPathRetryCount;
 925         ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
 926         ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
 927             mode, &ocargs, &ocrets);
 928         if (ret != IBT_SUCCESS) {
 929                 RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
 930                     "failed: %d", sp, ep, ret);
 931                 (void) ibt_flush_channel(hdl);
 932                 (void) ibt_free_channel(hdl);
 933 
 934                 mutex_enter(&ep->ep_lock);
 935                 /* don't cleanup if this failure is due to peer-peer race */
 936                 if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
 937                         /* cleanup stuff allocated in rds_ep_alloc_rc_channel */
 938                         ep->ep_state = RDS_EP_STATE_ERROR;
 939                         rds_ep_free_rc_channel(ep);
 940                 }
 941                 mutex_exit(&ep->ep_lock);
 942 
 943                 return (-1);
 944         }
 945 
 946         *chanhdl = hdl;
 947 
 948         RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
 949             *chanhdl);
 950 
 951         return (0);
 952 }
 953 
 954 int
 955 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
 956 {
 957         int     ret;
 958 
 959         RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
 960             chanhdl, mode);
 961 
 962         ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
 963 
 964         RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
 965 
 966         return (ret);
 967 }