1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
  26  *
  27  * This software is available to you under a choice of one of two
  28  * licenses.  You may choose to be licensed under the terms of the GNU
  29  * General Public License (GPL) Version 2, available from the file
  30  * COPYING in the main directory of this source tree, or the
  31  * OpenIB.org BSD license below:
  32  *
  33  *     Redistribution and use in source and binary forms, with or
  34  *     without modification, are permitted provided that the following
  35  *     conditions are met:
  36  *
  37  *      - Redistributions of source code must retain the above
  38  *        copyright notice, this list of conditions and the following
  39  *        disclaimer.
  40  *
  41  *      - Redistributions in binary form must reproduce the above
  42  *        copyright notice, this list of conditions and the following
  43  *        disclaimer in the documentation and/or other materials
  44  *        provided with the distribution.
  45  *
  46  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  47  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  48  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  49  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  50  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  51  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  52  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  53  * SOFTWARE.
  54  *
  55  */
  56 /*
  57  * Sun elects to include this software in Sun product
  58  * under the OpenIB BSD license.
  59  *
  60  *
  61  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  62  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  64  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  65  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  66  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  67  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  68  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  69  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  70  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  71  * POSSIBILITY OF SUCH DAMAGE.
  72  */
  73 
  74 #include <sys/types.h>
  75 #include <sys/ddi.h>
  76 #include <sys/sunddi.h>
  77 #include <sys/ib/clients/rds/rdsib_cm.h>
  78 #include <sys/ib/clients/rds/rdsib_ib.h>
  79 #include <sys/ib/clients/rds/rdsib_buf.h>
  80 #include <sys/ib/clients/rds/rdsib_ep.h>
  81 #include <sys/ib/clients/rds/rds_kstat.h>
  82 
  83 static void rds_async_handler(void *clntp, ibt_hca_hdl_t hdl,
  84     ibt_async_code_t code, ibt_async_event_t *event);
  85 
  86 static struct ibt_clnt_modinfo_s rds_ib_modinfo = {
  87         IBTI_V_CURR,
  88         IBT_NETWORK,
  89         rds_async_handler,
  90         NULL,
  91         "RDS"
  92 };
  93 
  94 /* performance tunables */
  95 uint_t          rds_no_interrupts = 0;
  96 uint_t          rds_poll_percent_full = 25;
  97 uint_t          rds_wc_signal = IBT_NEXT_SOLICITED;
  98 uint_t          rds_waittime_ms = 100; /* ms */
  99 
 100 extern dev_info_t *rdsib_dev_info;
 101 extern void rds_close_sessions();
 102 
 103 static void
 104 rdsib_validate_chan_sizes(ibt_hca_attr_t *hattrp)
 105 {
 106         /* The SQ size should not be more than that supported by the HCA */
 107         if (((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_chan_sz) ||
 108             ((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_cq_sz)) {
 109                 RDS_DPRINTF2("RDSIB", "MaxDataSendBuffers + %d is greater "
 110                     "than that supported by the HCA driver "
 111                     "(%d + %d > %d or %d), lowering it to a supported value.",
 112                     RDS_NUM_ACKS, MaxDataSendBuffers, RDS_NUM_ACKS,
 113                     hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
 114 
 115                 MaxDataSendBuffers = (hattrp->hca_max_chan_sz >
 116                     hattrp->hca_max_cq_sz) ?
 117                     hattrp->hca_max_cq_sz - RDS_NUM_ACKS :
 118                     hattrp->hca_max_chan_sz - RDS_NUM_ACKS;
 119         }
 120 
 121         /* The RQ size should not be more than that supported by the HCA */
 122         if ((MaxDataRecvBuffers > hattrp->hca_max_chan_sz) ||
 123             (MaxDataRecvBuffers > hattrp->hca_max_cq_sz)) {
 124                 RDS_DPRINTF2("RDSIB", "MaxDataRecvBuffers is greater than that "
 125                     "supported by the HCA driver (%d > %d or %d), lowering it "
 126                     "to a supported value.", MaxDataRecvBuffers,
 127                     hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
 128 
 129                 MaxDataRecvBuffers = (hattrp->hca_max_chan_sz >
 130                     hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
 131                     hattrp->hca_max_chan_sz;
 132         }
 133 
 134         /* The SQ size should not be more than that supported by the HCA */
 135         if ((MaxCtrlSendBuffers > hattrp->hca_max_chan_sz) ||
 136             (MaxCtrlSendBuffers > hattrp->hca_max_cq_sz)) {
 137                 RDS_DPRINTF2("RDSIB", "MaxCtrlSendBuffers is greater than that "
 138                     "supported by the HCA driver (%d > %d or %d), lowering it "
 139                     "to a supported value.", MaxCtrlSendBuffers,
 140                     hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
 141 
 142                 MaxCtrlSendBuffers = (hattrp->hca_max_chan_sz >
 143                     hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
 144                     hattrp->hca_max_chan_sz;
 145         }
 146 
 147         /* The RQ size should not be more than that supported by the HCA */
 148         if ((MaxCtrlRecvBuffers > hattrp->hca_max_chan_sz) ||
 149             (MaxCtrlRecvBuffers > hattrp->hca_max_cq_sz)) {
 150                 RDS_DPRINTF2("RDSIB", "MaxCtrlRecvBuffers is greater than that "
 151                     "supported by the HCA driver (%d > %d or %d), lowering it "
 152                     "to a supported value.", MaxCtrlRecvBuffers,
 153                     hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
 154 
 155                 MaxCtrlRecvBuffers = (hattrp->hca_max_chan_sz >
 156                     hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
 157                     hattrp->hca_max_chan_sz;
 158         }
 159 
 160         /* The MaxRecvMemory should be less than that supported by the HCA */
 161         if ((NDataRX * RdsPktSize) > hattrp->hca_max_memr_len) {
 162                 RDS_DPRINTF2("RDSIB", "MaxRecvMemory is greater than that "
 163                     "supported by the HCA driver (%d > %d), lowering it to %d",
 164                     NDataRX * RdsPktSize, hattrp->hca_max_memr_len,
 165                     hattrp->hca_max_memr_len);
 166 
 167                 NDataRX = hattrp->hca_max_memr_len/RdsPktSize;
 168         }
 169 }
 170 
 171 /* Return hcap, given the hca guid */
 172 rds_hca_t *
 173 rds_lkup_hca(ib_guid_t hca_guid)
 174 {
 175         rds_hca_t       *hcap;
 176 
 177         RDS_DPRINTF4("rds_lkup_hca", "Enter: statep: 0x%p "
 178             "guid: %llx", rdsib_statep, hca_guid);
 179 
 180         rw_enter(&rdsib_statep->rds_hca_lock, RW_READER);
 181 
 182         hcap = rdsib_statep->rds_hcalistp;
 183         while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
 184                 hcap = hcap->hca_nextp;
 185         }
 186 
 187         rw_exit(&rdsib_statep->rds_hca_lock);
 188 
 189         RDS_DPRINTF4("rds_lkup_hca", "return");
 190 
 191         return (hcap);
 192 }
 193 
 194 void rds_randomize_qps(rds_hca_t *hcap);
 195 
 196 static rds_hca_t *
 197 rdsib_init_hca(ib_guid_t hca_guid)
 198 {
 199         rds_hca_t       *hcap;
 200         boolean_t       alloc = B_FALSE;
 201         int             ret;
 202 
 203         RDS_DPRINTF2("rdsib_init_hca", "enter: HCA 0x%llx", hca_guid);
 204 
 205         /* Do a HCA lookup */
 206         hcap = rds_lkup_hca(hca_guid);
 207 
 208         if (hcap != NULL && hcap->hca_hdl != NULL) {
 209                 /*
 210                  * This can happen if we get IBT_HCA_ATTACH_EVENT on an HCA
 211                  * that we have already opened. Just return NULL so that
 212                  * we'll not end up reinitializing the HCA again.
 213                  */
 214                 RDS_DPRINTF2("rdsib_init_hca", "HCA already initialized");
 215                 return (NULL);
 216         }
 217 
 218         if (hcap == NULL) {
 219                 RDS_DPRINTF2("rdsib_init_hca", "New HCA is added");
 220                 hcap = (rds_hca_t *)kmem_zalloc(sizeof (rds_hca_t), KM_SLEEP);
 221                 alloc = B_TRUE;
 222         }
 223 
 224         hcap->hca_guid = hca_guid;
 225         ret = ibt_open_hca(rdsib_statep->rds_ibhdl, hca_guid,
 226             &hcap->hca_hdl);
 227         if (ret != IBT_SUCCESS) {
 228                 if (ret == IBT_HCA_IN_USE) {
 229                         RDS_DPRINTF2("rdsib_init_hca",
 230                             "ibt_open_hca: 0x%llx returned IBT_HCA_IN_USE",
 231                             hca_guid);
 232                 } else {
 233                         RDS_DPRINTF2("rdsib_init_hca",
 234                             "ibt_open_hca: 0x%llx failed: %d", hca_guid, ret);
 235                 }
 236                 if (alloc == B_TRUE) {
 237                         kmem_free(hcap, sizeof (rds_hca_t));
 238                 }
 239                 return (NULL);
 240         }
 241 
 242         ret = ibt_query_hca(hcap->hca_hdl, &hcap->hca_attr);
 243         if (ret != IBT_SUCCESS) {
 244                 RDS_DPRINTF2("rdsib_init_hca",
 245                     "Query HCA: 0x%llx failed:  %d", hca_guid, ret);
 246                 ret = ibt_close_hca(hcap->hca_hdl);
 247                 ASSERT(ret == IBT_SUCCESS);
 248                 if (alloc == B_TRUE) {
 249                         kmem_free(hcap, sizeof (rds_hca_t));
 250                 } else {
 251                         hcap->hca_hdl = NULL;
 252                 }
 253                 return (NULL);
 254         }
 255 
 256         ret = ibt_query_hca_ports(hcap->hca_hdl, 0,
 257             &hcap->hca_pinfop, &hcap->hca_nports, &hcap->hca_pinfo_sz);
 258         if (ret != IBT_SUCCESS) {
 259                 RDS_DPRINTF2("rdsib_init_hca",
 260                     "Query HCA 0x%llx ports failed: %d", hca_guid,
 261                     ret);
 262                 ret = ibt_close_hca(hcap->hca_hdl);
 263                 hcap->hca_hdl = NULL;
 264                 ASSERT(ret == IBT_SUCCESS);
 265                 if (alloc == B_TRUE) {
 266                         kmem_free(hcap, sizeof (rds_hca_t));
 267                 } else {
 268                         hcap->hca_hdl = NULL;
 269                 }
 270                 return (NULL);
 271         }
 272 
 273         /* Only one PD per HCA is allocated, so do it here */
 274         ret = ibt_alloc_pd(hcap->hca_hdl, IBT_PD_NO_FLAGS,
 275             &hcap->hca_pdhdl);
 276         if (ret != IBT_SUCCESS) {
 277                 RDS_DPRINTF2("rdsib_init_hca",
 278                     "ibt_alloc_pd 0x%llx failed: %d", hca_guid, ret);
 279                 (void) ibt_free_portinfo(hcap->hca_pinfop,
 280                     hcap->hca_pinfo_sz);
 281                 ret = ibt_close_hca(hcap->hca_hdl);
 282                 ASSERT(ret == IBT_SUCCESS);
 283                 hcap->hca_hdl = NULL;
 284                 if (alloc == B_TRUE) {
 285                         kmem_free(hcap, sizeof (rds_hca_t));
 286                 } else {
 287                         hcap->hca_hdl = NULL;
 288                 }
 289                 return (NULL);
 290         }
 291 
 292         rdsib_validate_chan_sizes(&hcap->hca_attr);
 293 
 294         /* To minimize stale connections after ungraceful reboots */
 295         rds_randomize_qps(hcap);
 296 
 297         rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
 298         hcap->hca_state = RDS_HCA_STATE_OPEN;
 299         if (alloc == B_TRUE) {
 300                 /* this is a new HCA, add it to the list */
 301                 rdsib_statep->rds_nhcas++;
 302                 hcap->hca_nextp = rdsib_statep->rds_hcalistp;
 303                 rdsib_statep->rds_hcalistp = hcap;
 304         }
 305         rw_exit(&rdsib_statep->rds_hca_lock);
 306 
 307         RDS_DPRINTF2("rdsib_init_hca", "return: HCA 0x%llx", hca_guid);
 308 
 309         return (hcap);
 310 }
 311 
 312 /*
 313  * Called from attach
 314  */
 315 int
 316 rdsib_initialize_ib()
 317 {
 318         ib_guid_t       *guidp;
 319         rds_hca_t       *hcap;
 320         uint_t          ix, hcaix, nhcas;
 321         int             ret;
 322 
 323         RDS_DPRINTF2("rdsib_initialize_ib", "enter: statep %p", rdsib_statep);
 324 
 325         ASSERT(rdsib_statep != NULL);
 326         if (rdsib_statep == NULL) {
 327                 RDS_DPRINTF1("rdsib_initialize_ib",
 328                     "RDS Statep not initialized");
 329                 return (-1);
 330         }
 331 
 332         /* How many hcas are there? */
 333         nhcas = ibt_get_hca_list(&guidp);
 334         if (nhcas == 0) {
 335                 RDS_DPRINTF2("rdsib_initialize_ib", "No IB HCAs Available");
 336                 return (-1);
 337         }
 338 
 339         RDS_DPRINTF3("rdsib_initialize_ib", "Number of HCAs: %d", nhcas);
 340 
 341         /* Register with IBTF */
 342         ret = ibt_attach(&rds_ib_modinfo, rdsib_dev_info, rdsib_statep,
 343             &rdsib_statep->rds_ibhdl);
 344         if (ret != IBT_SUCCESS) {
 345                 RDS_DPRINTF2("rdsib_initialize_ib", "ibt_attach failed: %d",
 346                     ret);
 347                 (void) ibt_free_hca_list(guidp, nhcas);
 348                 return (-1);
 349         }
 350 
 351         /*
 352          * Open each HCA and gather its information. Don't care about HCAs
 353          * that cannot be opened. It is OK as long as atleast one HCA can be
 354          * opened.
 355          * Initialize a HCA only if all the information is available.
 356          */
 357         for (ix = 0, hcaix = 0; ix < nhcas; ix++) {
 358                 RDS_DPRINTF3(LABEL, "Open HCA: 0x%llx", guidp[ix]);
 359 
 360                 hcap = rdsib_init_hca(guidp[ix]);
 361                 if (hcap != NULL) hcaix++;
 362         }
 363 
 364         /* free the HCA list, we are done with it */
 365         (void) ibt_free_hca_list(guidp, nhcas);
 366 
 367         if (hcaix == 0) {
 368                 /* Failed to Initialize even one HCA */
 369                 RDS_DPRINTF2("rdsib_initialize_ib", "No HCAs are initialized");
 370                 (void) ibt_detach(rdsib_statep->rds_ibhdl);
 371                 rdsib_statep->rds_ibhdl = NULL;
 372                 return (-1);
 373         }
 374 
 375         if (hcaix < nhcas) {
 376                 RDS_DPRINTF2("rdsib_open_ib", "HCAs %d/%d failed to initialize",
 377                     (nhcas - hcaix), nhcas);
 378         }
 379 
 380         RDS_DPRINTF2("rdsib_initialize_ib", "return: statep %p", rdsib_statep);
 381 
 382         return (0);
 383 }
 384 
 385 /*
 386  * Called from detach
 387  */
 388 void
 389 rdsib_deinitialize_ib()
 390 {
 391         rds_hca_t       *hcap, *nextp;
 392         int             ret;
 393 
 394         RDS_DPRINTF2("rdsib_deinitialize_ib", "enter: statep %p", rdsib_statep);
 395 
 396         /* close and destroy all the sessions */
 397         rds_close_sessions(NULL);
 398 
 399         /* Release all HCA resources */
 400         rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
 401         RDS_DPRINTF2("rdsib_deinitialize_ib", "HCA List: %p, NHCA: %d",
 402             rdsib_statep->rds_hcalistp, rdsib_statep->rds_nhcas);
 403         hcap = rdsib_statep->rds_hcalistp;
 404         rdsib_statep->rds_hcalistp = NULL;
 405         rdsib_statep->rds_nhcas = 0;
 406         rw_exit(&rdsib_statep->rds_hca_lock);
 407 
 408         while (hcap != NULL) {
 409                 nextp = hcap->hca_nextp;
 410 
 411                 if (hcap->hca_hdl != NULL) {
 412                         ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
 413                         ASSERT(ret == IBT_SUCCESS);
 414 
 415                         (void) ibt_free_portinfo(hcap->hca_pinfop,
 416                             hcap->hca_pinfo_sz);
 417 
 418                         ret = ibt_close_hca(hcap->hca_hdl);
 419                         ASSERT(ret == IBT_SUCCESS);
 420                 }
 421 
 422                 kmem_free(hcap, sizeof (rds_hca_t));
 423                 hcap = nextp;
 424         }
 425 
 426         /* Deregister with IBTF */
 427         if (rdsib_statep->rds_ibhdl != NULL) {
 428                 (void) ibt_detach(rdsib_statep->rds_ibhdl);
 429                 rdsib_statep->rds_ibhdl = NULL;
 430         }
 431 
 432         RDS_DPRINTF2("rdsib_deinitialize_ib", "return: statep %p",
 433             rdsib_statep);
 434 }
 435 
 436 /*
 437  * Called on open of first RDS socket
 438  */
 439 int
 440 rdsib_open_ib()
 441 {
 442         int     ret;
 443 
 444         RDS_DPRINTF2("rdsib_open_ib", "enter: statep %p", rdsib_statep);
 445 
 446         /* Enable incoming connection requests */
 447         if (rdsib_statep->rds_srvhdl == NULL) {
 448                 rdsib_statep->rds_srvhdl =
 449                     rds_register_service(rdsib_statep->rds_ibhdl);
 450                 if (rdsib_statep->rds_srvhdl == NULL) {
 451                         RDS_DPRINTF2("rdsib_open_ib",
 452                             "Service registration failed");
 453                         return (-1);
 454                 } else {
 455                         /* bind the service on all available ports */
 456                         ret = rds_bind_service(rdsib_statep);
 457                         if (ret != 0) {
 458                                 RDS_DPRINTF2("rdsib_open_ib",
 459                                     "Bind service failed: %d", ret);
 460                         }
 461                 }
 462         }
 463 
 464         RDS_DPRINTF2("rdsib_open_ib", "return: statep %p", rdsib_statep);
 465 
 466         return (0);
 467 }
 468 
 469 /*
 470  * Called when all ports are closed.
 471  */
 472 void
 473 rdsib_close_ib()
 474 {
 475         int     ret;
 476 
 477         RDS_DPRINTF2("rdsib_close_ib", "enter: statep %p", rdsib_statep);
 478 
 479         /* Disable incoming connection requests */
 480         if (rdsib_statep->rds_srvhdl != NULL) {
 481                 ret = ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
 482                 if (ret != 0) {
 483                         RDS_DPRINTF2("rdsib_close_ib",
 484                             "ibt_unbind_all_services failed: %d\n", ret);
 485                 }
 486                 ret = ibt_deregister_service(rdsib_statep->rds_ibhdl,
 487                     rdsib_statep->rds_srvhdl);
 488                 if (ret != 0) {
 489                         RDS_DPRINTF2("rdsib_close_ib",
 490                             "ibt_deregister_service failed: %d\n", ret);
 491                 } else {
 492                         rdsib_statep->rds_srvhdl = NULL;
 493                 }
 494         }
 495 
 496         RDS_DPRINTF2("rdsib_close_ib", "return: statep %p", rdsib_statep);
 497 }
 498 
 499 /* Return hcap, given the hca guid */
 500 rds_hca_t *
 501 rds_get_hcap(rds_state_t *statep, ib_guid_t hca_guid)
 502 {
 503         rds_hca_t       *hcap;
 504 
 505         RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
 506             "guid: %llx", statep, hca_guid);
 507 
 508         rw_enter(&statep->rds_hca_lock, RW_READER);
 509 
 510         hcap = statep->rds_hcalistp;
 511         while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
 512                 hcap = hcap->hca_nextp;
 513         }
 514 
 515         /*
 516          * don't let anyone use this HCA until the RECV memory
 517          * is registered with this HCA
 518          */
 519         if ((hcap != NULL) &&
 520             (hcap->hca_state == RDS_HCA_STATE_MEM_REGISTERED)) {
 521                 ASSERT(hcap->hca_mrhdl != NULL);
 522                 rw_exit(&statep->rds_hca_lock);
 523                 return (hcap);
 524         }
 525 
 526         RDS_DPRINTF2("rds_get_hcap",
 527             "HCA (0x%p, 0x%llx) is not initialized", hcap, hca_guid);
 528         rw_exit(&statep->rds_hca_lock);
 529 
 530         RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");
 531 
 532         return (NULL);
 533 }
 534 
 535 /* Return hcap, given a gid */
 536 rds_hca_t *
 537 rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
 538 {
 539         rds_hca_t       *hcap;
 540         uint_t          ix;
 541 
 542         RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
 543             statep, gid.gid_prefix, gid.gid_guid);
 544 
 545         rw_enter(&statep->rds_hca_lock, RW_READER);
 546 
 547         hcap = statep->rds_hcalistp;
 548         while (hcap != NULL) {
 549 
 550                 /*
 551                  * don't let anyone use this HCA until the RECV memory
 552                  * is registered with this HCA
 553                  */
 554                 if (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED) {
 555                         RDS_DPRINTF3("rds_gid_to_hcap",
 556                             "HCA (0x%p, 0x%llx) is not initialized",
 557                             hcap, gid.gid_guid);
 558                         hcap = hcap->hca_nextp;
 559                         continue;
 560                 }
 561 
 562                 for (ix = 0; ix < hcap->hca_nports; ix++) {
 563                         if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
 564                             gid.gid_prefix) &&
 565                             (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
 566                             gid.gid_guid)) {
 567                                 RDS_DPRINTF4("rds_gid_to_hcap",
 568                                     "gid found in hcap: 0x%p", hcap);
 569                                 rw_exit(&statep->rds_hca_lock);
 570                                 return (hcap);
 571                         }
 572                 }
 573                 hcap = hcap->hca_nextp;
 574         }
 575 
 576         rw_exit(&statep->rds_hca_lock);
 577 
 578         return (NULL);
 579 }
 580 
 581 /* This is called from the send CQ handler */
 582 void
 583 rds_send_acknowledgement(rds_ep_t *ep)
 584 {
 585         int     ret;
 586         uint_t  ix;
 587 
 588         RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep);
 589 
 590         mutex_enter(&ep->ep_lock);
 591 
 592         ASSERT(ep->ep_rdmacnt != 0);
 593 
 594         /*
 595          * The previous ACK completed successfully, send the next one
 596          * if more messages were received after sending the last ACK
 597          */
 598         if (ep->ep_rbufid != *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va) {
 599                 *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
 600                 mutex_exit(&ep->ep_lock);
 601 
 602                 /* send acknowledgement */
 603                 RDS_INCR_TXACKS();
 604                 ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
 605                 if (ret != IBT_SUCCESS) {
 606                         RDS_DPRINTF2("rds_send_acknowledgement",
 607                             "EP(%p): ibt_post_send for acknowledgement "
 608                             "failed: %d, SQ depth: %d",
 609                             ep, ret, ep->ep_sndpool.pool_nbusy);
 610                         mutex_enter(&ep->ep_lock);
 611                         ep->ep_rdmacnt--;
 612                         mutex_exit(&ep->ep_lock);
 613                 }
 614         } else {
 615                 /* ACKed all messages, no more to ACK */
 616                 ep->ep_rdmacnt--;
 617                 mutex_exit(&ep->ep_lock);
 618                 return;
 619         }
 620 
 621         RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep);
 622 }
 623 
 624 static int
 625 rds_poll_ctrl_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
 626 {
 627         ibt_wc_t        wc;
 628         uint_t          npolled;
 629         rds_buf_t       *bp;
 630         rds_ctrl_pkt_t  *cpkt;
 631         rds_qp_t        *recvqp;
 632         int             ret = IBT_SUCCESS;
 633 
 634         RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep);
 635 
 636         bzero(&wc, sizeof (ibt_wc_t));
 637         ret = ibt_poll_cq(cq, &wc, 1, &npolled);
 638         if (ret != IBT_SUCCESS) {
 639                 if (ret != IBT_CQ_EMPTY) {
 640                         RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
 641                             "returned: %d", ep, cq, ret);
 642                 } else {
 643                         RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
 644                             "returned: IBT_CQ_EMPTY", ep, cq);
 645                 }
 646                 return (ret);
 647         }
 648 
 649         bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
 650 
 651         if (wc.wc_status != IBT_WC_SUCCESS) {
 652                 mutex_enter(&ep->ep_recvqp.qp_lock);
 653                 ep->ep_recvqp.qp_level--;
 654                 mutex_exit(&ep->ep_recvqp.qp_lock);
 655 
 656                 /* Free the buffer */
 657                 bp->buf_state = RDS_RCVBUF_FREE;
 658                 rds_free_recv_buf(bp, 1);
 659 
 660                 /* Receive completion failure */
 661                 if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
 662                         RDS_DPRINTF2("rds_poll_ctrl_completions",
 663                             "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
 664                             ep, cq, wc.wc_id, wc.wc_status);
 665                 }
 666                 return (ret);
 667         }
 668 
 669         /* there is one less in the RQ */
 670         recvqp = &ep->ep_recvqp;
 671         mutex_enter(&recvqp->qp_lock);
 672         recvqp->qp_level--;
 673         if ((recvqp->qp_taskqpending == B_FALSE) &&
 674             (recvqp->qp_level <= recvqp->qp_lwm)) {
 675                 /* Time to post more buffers into the RQ */
 676                 recvqp->qp_taskqpending = B_TRUE;
 677                 mutex_exit(&recvqp->qp_lock);
 678 
 679                 ret = ddi_taskq_dispatch(rds_taskq,
 680                     rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
 681                 if (ret != DDI_SUCCESS) {
 682                         RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
 683                             ret);
 684                         mutex_enter(&recvqp->qp_lock);
 685                         recvqp->qp_taskqpending = B_FALSE;
 686                         mutex_exit(&recvqp->qp_lock);
 687                 }
 688         } else {
 689                 mutex_exit(&recvqp->qp_lock);
 690         }
 691 
 692         cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
 693         rds_handle_control_message(ep->ep_sp, cpkt);
 694 
 695         bp->buf_state = RDS_RCVBUF_FREE;
 696         rds_free_recv_buf(bp, 1);
 697 
 698         RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep);
 699 
 700         return (ret);
 701 }
 702 
 703 #define RDS_POST_FEW_ATATIME    100
 704 /* Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented */
 705 void
 706 rds_post_recv_buf(void *arg)
 707 {
 708         ibt_channel_hdl_t       chanhdl;
 709         rds_ep_t                *ep;
 710         rds_session_t           *sp;
 711         rds_qp_t                *recvqp;
 712         rds_bufpool_t           *gp;
 713         rds_buf_t               *bp, *bp1;
 714         ibt_recv_wr_t           *wrp, wr[RDS_POST_FEW_ATATIME];
 715         rds_hca_t               *hcap;
 716         uint_t                  npost, nspace, rcv_len;
 717         uint_t                  ix, jx, kx;
 718         int                     ret;
 719 
 720         chanhdl = (ibt_channel_hdl_t)arg;
 721         RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl);
 722         RDS_INCR_POST_RCV_BUF_CALLS();
 723 
 724         ep = (rds_ep_t *)ibt_get_chan_private(chanhdl);
 725         ASSERT(ep != NULL);
 726         sp = ep->ep_sp;
 727         recvqp = &ep->ep_recvqp;
 728 
 729         RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep);
 730 
 731         /* get the hcap for the HCA hosting this channel */
 732         hcap = rds_lkup_hca(ep->ep_hca_guid);
 733         if (hcap == NULL) {
 734                 RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
 735                     ep->ep_hca_guid);
 736                 return;
 737         }
 738 
 739         /* Make sure the session is still connected */
 740         rw_enter(&sp->session_lock, RW_READER);
 741         if ((sp->session_state != RDS_SESSION_STATE_INIT) &&
 742             (sp->session_state != RDS_SESSION_STATE_CONNECTED) &&
 743             (sp->session_state != RDS_SESSION_STATE_HCA_CLOSING)) {
 744                 RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
 745                     "in active state (%d)", ep, sp->session_state);
 746                 rw_exit(&sp->session_lock);
 747                 return;
 748         }
 749         rw_exit(&sp->session_lock);
 750 
 751         /* how many can be posted */
 752         mutex_enter(&recvqp->qp_lock);
 753         nspace = recvqp->qp_depth - recvqp->qp_level;
 754         if (nspace == 0) {
 755                 RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
 756                 recvqp->qp_taskqpending = B_FALSE;
 757                 mutex_exit(&recvqp->qp_lock);
 758                 return;
 759         }
 760         mutex_exit(&recvqp->qp_lock);
 761 
 762         if (ep->ep_type == RDS_EP_TYPE_DATA) {
 763                 gp = &rds_dpool;
 764                 rcv_len = RdsPktSize;
 765         } else {
 766                 gp = &rds_cpool;
 767                 rcv_len = RDS_CTRLPKT_SIZE;
 768         }
 769 
 770         bp = rds_get_buf(gp, nspace, &jx);
 771         if (bp == NULL) {
 772                 RDS_DPRINTF2(LABEL, "EP(%p): No Recv buffers available", ep);
 773                 /* try again later */
 774                 ret = ddi_taskq_dispatch(rds_taskq, rds_post_recv_buf,
 775                     (void *)chanhdl, DDI_NOSLEEP);
 776                 if (ret != DDI_SUCCESS) {
 777                         RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
 778                             ret);
 779                         mutex_enter(&recvqp->qp_lock);
 780                         recvqp->qp_taskqpending = B_FALSE;
 781                         mutex_exit(&recvqp->qp_lock);
 782                 }
 783                 return;
 784         }
 785 
 786         if (jx != nspace) {
 787                 RDS_DPRINTF2(LABEL, "EP(%p): Recv buffers "
 788                     "needed: %d available: %d", ep, nspace, jx);
 789                 nspace = jx;
 790         }
 791 
 792         bp1 = bp;
 793         for (ix = 0; ix < nspace; ix++) {
 794                 bp1->buf_ep = ep;
 795                 ASSERT(bp1->buf_state == RDS_RCVBUF_FREE);
 796                 bp1->buf_state = RDS_RCVBUF_POSTED;
 797                 bp1->buf_ds.ds_key = hcap->hca_lkey;
 798                 bp1->buf_ds.ds_len = rcv_len;
 799                 bp1 = bp1->buf_nextp;
 800         }
 801 
 802 #if 0
 803         wrp = kmem_zalloc(RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t),
 804             KM_SLEEP);
 805 #else
 806         wrp = &wr[0];
 807 #endif
 808 
 809         npost = nspace;
 810         while (npost) {
 811                 jx = (npost > RDS_POST_FEW_ATATIME) ?
 812                     RDS_POST_FEW_ATATIME : npost;
 813                 for (ix = 0; ix < jx; ix++) {
 814                         wrp[ix].wr_id = (uintptr_t)bp;
 815                         wrp[ix].wr_nds = 1;
 816                         wrp[ix].wr_sgl = &bp->buf_ds;
 817                         bp = bp->buf_nextp;
 818                 }
 819 
 820                 ret = ibt_post_recv(chanhdl, wrp, jx, &kx);
 821                 if ((ret != IBT_SUCCESS) || (kx != jx)) {
 822                         RDS_DPRINTF2(LABEL, "ibt_post_recv for %d WRs failed: "
 823                             "%d", npost, ret);
 824                         npost -= kx;
 825                         break;
 826                 }
 827 
 828                 npost -= jx;
 829         }
 830 
 831         mutex_enter(&recvqp->qp_lock);
 832         if (npost != 0) {
 833                 RDS_DPRINTF2("rds_post_recv_buf",
 834                     "EP(%p) Failed to post %d WRs", ep, npost);
 835                 recvqp->qp_level += (nspace - npost);
 836         } else {
 837                 recvqp->qp_level += nspace;
 838         }
 839 
 840         /*
 841          * sometimes, the recv WRs can get consumed as soon as they are
 842          * posted. In that case, taskq thread to post more WRs to the RQ will
 843          * not be scheduled as the taskqpending flag is still set.
 844          */
 845         if (recvqp->qp_level == 0) {
 846                 mutex_exit(&recvqp->qp_lock);
 847                 ret = ddi_taskq_dispatch(rds_taskq,
 848                     rds_post_recv_buf, (void *)chanhdl, DDI_NOSLEEP);
 849                 if (ret != DDI_SUCCESS) {
 850                         RDS_DPRINTF2("rds_post_recv_buf",
 851                             "ddi_taskq_dispatch failed: %d", ret);
 852                         mutex_enter(&recvqp->qp_lock);
 853                         recvqp->qp_taskqpending = B_FALSE;
 854                         mutex_exit(&recvqp->qp_lock);
 855                 }
 856         } else {
 857                 recvqp->qp_taskqpending = B_FALSE;
 858                 mutex_exit(&recvqp->qp_lock);
 859         }
 860 
 861 #if 0
 862         kmem_free(wrp, RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t));
 863 #endif
 864 
 865         RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep);
 866 }
 867 
 868 static int
 869 rds_poll_data_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
 870 {
 871         ibt_wc_t        wc;
 872         rds_buf_t       *bp;
 873         rds_data_hdr_t  *pktp;
 874         rds_qp_t        *recvqp;
 875         uint_t          npolled;
 876         int             ret = IBT_SUCCESS;
 877 
 878 
 879         RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep);
 880 
 881         bzero(&wc, sizeof (ibt_wc_t));
 882         ret = ibt_poll_cq(cq, &wc, 1, &npolled);
 883         if (ret != IBT_SUCCESS) {
 884                 if (ret != IBT_CQ_EMPTY) {
 885                         RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
 886                             "returned: %d", ep, cq, ret);
 887                 } else {
 888                         RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
 889                             "returned: IBT_CQ_EMPTY", ep, cq);
 890                 }
 891                 return (ret);
 892         }
 893 
 894         bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
 895         ASSERT(bp->buf_state == RDS_RCVBUF_POSTED);
 896         bp->buf_state = RDS_RCVBUF_ONSOCKQ;
 897         bp->buf_nextp = NULL;
 898 
 899         if (wc.wc_status != IBT_WC_SUCCESS) {
 900                 mutex_enter(&ep->ep_recvqp.qp_lock);
 901                 ep->ep_recvqp.qp_level--;
 902                 mutex_exit(&ep->ep_recvqp.qp_lock);
 903 
 904                 /* free the buffer */
 905                 bp->buf_state = RDS_RCVBUF_FREE;
 906                 rds_free_recv_buf(bp, 1);
 907 
 908                 /* Receive completion failure */
 909                 if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
 910                         RDS_DPRINTF2("rds_poll_data_completions",
 911                             "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
 912                             ep, cq, wc.wc_id, wc.wc_status);
 913                         RDS_INCR_RXERRS();
 914                 }
 915                 return (ret);
 916         }
 917 
 918         /* there is one less in the RQ */
 919         recvqp = &ep->ep_recvqp;
 920         mutex_enter(&recvqp->qp_lock);
 921         recvqp->qp_level--;
 922         if ((recvqp->qp_taskqpending == B_FALSE) &&
 923             (recvqp->qp_level <= recvqp->qp_lwm)) {
 924                 /* Time to post more buffers into the RQ */
 925                 recvqp->qp_taskqpending = B_TRUE;
 926                 mutex_exit(&recvqp->qp_lock);
 927 
 928                 ret = ddi_taskq_dispatch(rds_taskq,
 929                     rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
 930                 if (ret != DDI_SUCCESS) {
 931                         RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
 932                             ret);
 933                         mutex_enter(&recvqp->qp_lock);
 934                         recvqp->qp_taskqpending = B_FALSE;
 935                         mutex_exit(&recvqp->qp_lock);
 936                 }
 937         } else {
 938                 mutex_exit(&recvqp->qp_lock);
 939         }
 940 
 941         pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
 942         ASSERT(pktp->dh_datalen != 0);
 943 
 944         RDS_DPRINTF5(LABEL, "Message Received: sendIP: 0x%x recvIP: 0x%x "
 945             "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
 946             ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
 947             pktp->dh_npkts, pktp->dh_psn);
 948 
 949         RDS_DPRINTF3(LABEL, "BP(%p): npkts: %d psn: %d", bp,
 950             pktp->dh_npkts, pktp->dh_psn);
 951 
 952         if (pktp->dh_npkts == 1) {
 953                 /* single pkt or last packet */
 954                 if (pktp->dh_psn != 0) {
 955                         /* last packet of a segmented message */
 956                         ASSERT(ep->ep_seglbp != NULL);
 957                         ep->ep_seglbp->buf_nextp = bp;
 958                         ep->ep_seglbp = bp;
 959                         rds_received_msg(ep, ep->ep_segfbp);
 960                         ep->ep_segfbp = NULL;
 961                         ep->ep_seglbp = NULL;
 962                 } else {
 963                         /* single packet */
 964                         rds_received_msg(ep, bp);
 965                 }
 966         } else {
 967                 /* multi-pkt msg */
 968                 if (pktp->dh_psn == 0) {
 969                         /* first packet */
 970                         ASSERT(ep->ep_segfbp == NULL);
 971                         ep->ep_segfbp = bp;
 972                         ep->ep_seglbp = bp;
 973                 } else {
 974                         /* intermediate packet */
 975                         ASSERT(ep->ep_segfbp != NULL);
 976                         ep->ep_seglbp->buf_nextp = bp;
 977                         ep->ep_seglbp = bp;
 978                 }
 979         }
 980 
 981         RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep);
 982 
 983         return (ret);
 984 }
 985 
 986 void
 987 rds_recvcq_handler(ibt_cq_hdl_t cq, void *arg)
 988 {
 989         rds_ep_t        *ep;
 990         int             ret = IBT_SUCCESS;
 991         int             (*func)(ibt_cq_hdl_t, rds_ep_t *);
 992 
 993         ep = (rds_ep_t *)arg;
 994 
 995         RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep);
 996 
 997         if (ep->ep_type == RDS_EP_TYPE_DATA) {
 998                 func = rds_poll_data_completions;
 999         } else {
1000                 func = rds_poll_ctrl_completions;
1001         }
1002 
1003         do {
1004                 ret = func(cq, ep);
1005         } while (ret != IBT_CQ_EMPTY);
1006 
1007         /* enable the CQ */
1008         ret = ibt_enable_cq_notify(cq, rds_wc_signal);
1009         if (ret != IBT_SUCCESS) {
1010                 RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1011                     "failed: %d", ep, cq, ret);
1012                 return;
1013         }
1014 
1015         do {
1016                 ret = func(cq, ep);
1017         } while (ret != IBT_CQ_EMPTY);
1018 
1019         RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep);
1020 }
1021 
1022 void
1023 rds_poll_send_completions(ibt_cq_hdl_t cq, rds_ep_t *ep, boolean_t lock)
1024 {
1025         ibt_wc_t        wc[RDS_NUM_DATA_SEND_WCS];
1026         uint_t          npolled, nret, send_error = 0;
1027         rds_buf_t       *headp, *tailp, *bp;
1028         int             ret, ix;
1029 
1030         RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep);
1031 
1032         headp = NULL;
1033         tailp = NULL;
1034         npolled = 0;
1035         do {
1036                 ret = ibt_poll_cq(cq, wc, RDS_NUM_DATA_SEND_WCS, &nret);
1037                 if (ret != IBT_SUCCESS) {
1038                         if (ret != IBT_CQ_EMPTY) {
1039                                 RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): "
1040                                     "ibt_poll_cq returned: %d", ep, cq, ret);
1041                         } else {
1042                                 RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): "
1043                                     "ibt_poll_cq returned: IBT_CQ_EMPTY",
1044                                     ep, cq);
1045                         }
1046 
1047                         break;
1048                 }
1049 
1050                 for (ix = 0; ix < nret; ix++) {
1051                         if (wc[ix].wc_status == IBT_WC_SUCCESS) {
1052                                 if (wc[ix].wc_type == IBT_WRC_RDMAW) {
1053                                         rds_send_acknowledgement(ep);
1054                                         continue;
1055                                 }
1056 
1057                                 bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1058                                 ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1059                                 bp->buf_state = RDS_SNDBUF_FREE;
1060                         } else if (wc[ix].wc_status == IBT_WC_WR_FLUSHED_ERR) {
1061                                 RDS_INCR_TXERRS();
1062                                 RDS_DPRINTF5("rds_poll_send_completions",
1063                                     "EP(%p): WC ID: %p ERROR: %d", ep,
1064                                     wc[ix].wc_id, wc[ix].wc_status);
1065 
1066                                 send_error = 1;
1067 
1068                                 if (wc[ix].wc_id == RDS_RDMAW_WRID) {
1069                                         mutex_enter(&ep->ep_lock);
1070                                         ep->ep_rdmacnt--;
1071                                         mutex_exit(&ep->ep_lock);
1072                                         continue;
1073                                 }
1074 
1075                                 bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1076                                 ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1077                                 bp->buf_state = RDS_SNDBUF_FREE;
1078                         } else {
1079                                 RDS_INCR_TXERRS();
1080                                 RDS_DPRINTF2("rds_poll_send_completions",
1081                                     "EP(%p): WC ID: %p ERROR: %d", ep,
1082                                     wc[ix].wc_id, wc[ix].wc_status);
1083                                 if (send_error == 0) {
1084                                         rds_session_t   *sp = ep->ep_sp;
1085 
1086                                         /* don't let anyone send anymore */
1087                                         rw_enter(&sp->session_lock, RW_WRITER);
1088                                         if (sp->session_state !=
1089                                             RDS_SESSION_STATE_ERROR) {
1090                                                 sp->session_state =
1091                                                     RDS_SESSION_STATE_ERROR;
1092                                                 /* Make this the active end */
1093                                                 sp->session_type =
1094                                                     RDS_SESSION_ACTIVE;
1095                                         }
1096                                         rw_exit(&sp->session_lock);
1097                                 }
1098 
1099                                 send_error = 1;
1100 
1101                                 if (wc[ix].wc_id == RDS_RDMAW_WRID) {
1102                                         mutex_enter(&ep->ep_lock);
1103                                         ep->ep_rdmacnt--;
1104                                         mutex_exit(&ep->ep_lock);
1105                                         continue;
1106                                 }
1107 
1108                                 bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1109                                 ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1110                                 bp->buf_state = RDS_SNDBUF_FREE;
1111                         }
1112 
1113                         bp->buf_nextp = NULL;
1114                         if (headp) {
1115                                 tailp->buf_nextp = bp;
1116                                 tailp = bp;
1117                         } else {
1118                                 headp = bp;
1119                                 tailp = bp;
1120                         }
1121 
1122                         npolled++;
1123                 }
1124 
1125                 if (rds_no_interrupts && (npolled > 100)) {
1126                         break;
1127                 }
1128 
1129                 if (rds_no_interrupts == 1) {
1130                         break;
1131                 }
1132         } while (ret != IBT_CQ_EMPTY);
1133 
1134         RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
1135             npolled, send_error);
1136 
1137         /* put the buffers to the pool */
1138         if (npolled != 0) {
1139                 rds_free_send_buf(ep, headp, tailp, npolled, lock);
1140         }
1141 
1142         if (send_error != 0) {
1143                 rds_handle_send_error(ep);
1144         }
1145 
1146         RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep);
1147 }
1148 
1149 void
1150 rds_sendcq_handler(ibt_cq_hdl_t cq, void *arg)
1151 {
1152         rds_ep_t        *ep;
1153         int             ret;
1154 
1155         ep = (rds_ep_t *)arg;
1156 
1157         RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep);
1158 
1159         /* enable the CQ */
1160         ret = ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
1161         if (ret != IBT_SUCCESS) {
1162                 RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1163                     "failed: %d", ep, cq, ret);
1164                 return;
1165         }
1166 
1167         rds_poll_send_completions(cq, ep, B_FALSE);
1168 
1169         RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep);
1170 }
1171 
1172 void
1173 rds_ep_free_rc_channel(rds_ep_t *ep)
1174 {
1175         int ret;
1176 
1177         RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep);
1178 
1179         ASSERT(mutex_owned(&ep->ep_lock));
1180 
1181         /* free the QP */
1182         if (ep->ep_chanhdl != NULL) {
1183                 /* wait until the RQ is empty */
1184                 (void) ibt_flush_channel(ep->ep_chanhdl);
1185                 (void) rds_is_recvq_empty(ep, B_TRUE);
1186                 ret = ibt_free_channel(ep->ep_chanhdl);
1187                 if (ret != IBT_SUCCESS) {
1188                         RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) "
1189                             "ibt_free_channel returned: %d", ep, ret);
1190                 }
1191                 ep->ep_chanhdl = NULL;
1192         } else {
1193                 RDS_DPRINTF2("rds_ep_free_rc_channel",
1194                     "EP(%p) Channel is ALREADY FREE", ep);
1195         }
1196 
1197         /* free the Send CQ */
1198         if (ep->ep_sendcq != NULL) {
1199                 ret = ibt_free_cq(ep->ep_sendcq);
1200                 if (ret != IBT_SUCCESS) {
1201                         RDS_DPRINTF2("rds_ep_free_rc_channel",
1202                             "EP(%p) - for sendcq, ibt_free_cq returned %d",
1203                             ep, ret);
1204                 }
1205                 ep->ep_sendcq = NULL;
1206         } else {
1207                 RDS_DPRINTF2("rds_ep_free_rc_channel",
1208                     "EP(%p) SendCQ is ALREADY FREE", ep);
1209         }
1210 
1211         /* free the Recv CQ */
1212         if (ep->ep_recvcq != NULL) {
1213                 ret = ibt_free_cq(ep->ep_recvcq);
1214                 if (ret != IBT_SUCCESS) {
1215                         RDS_DPRINTF2("rds_ep_free_rc_channel",
1216                             "EP(%p) - for recvcq, ibt_free_cq returned %d",
1217                             ep, ret);
1218                 }
1219                 ep->ep_recvcq = NULL;
1220         } else {
1221                 RDS_DPRINTF2("rds_ep_free_rc_channel",
1222                     "EP(%p) RecvCQ is ALREADY FREE", ep);
1223         }
1224 
1225         RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep);
1226 }
1227 
1228 /* Allocate resources for RC channel */
1229 ibt_channel_hdl_t
1230 rds_ep_alloc_rc_channel(rds_ep_t *ep, uint8_t hca_port)
1231 {
1232         int                             ret = IBT_SUCCESS;
1233         ibt_cq_attr_t                   scqattr, rcqattr;
1234         ibt_rc_chan_alloc_args_t        chanargs;
1235         ibt_channel_hdl_t               chanhdl;
1236         rds_session_t                   *sp;
1237         rds_hca_t                       *hcap;
1238 
1239         RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
1240             ep, hca_port);
1241 
1242         /* Update the EP with the right IP address and HCA guid */
1243         sp = ep->ep_sp;
1244         ASSERT(sp != NULL);
1245         rw_enter(&sp->session_lock, RW_READER);
1246         mutex_enter(&ep->ep_lock);
1247         ep->ep_myip = sp->session_myip;
1248         ep->ep_remip = sp->session_remip;
1249         hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
1250         ep->ep_hca_guid = hcap->hca_guid;
1251         mutex_exit(&ep->ep_lock);
1252         rw_exit(&sp->session_lock);
1253 
1254         /* reset taskqpending flag here */
1255         ep->ep_recvqp.qp_taskqpending = B_FALSE;
1256 
1257         if (ep->ep_type == RDS_EP_TYPE_CTRL) {
1258                 scqattr.cq_size = MaxCtrlSendBuffers;
1259                 scqattr.cq_sched = NULL;
1260                 scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1261 
1262                 rcqattr.cq_size = MaxCtrlRecvBuffers;
1263                 rcqattr.cq_sched = NULL;
1264                 rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1265 
1266                 chanargs.rc_sizes.cs_sq = MaxCtrlSendBuffers;
1267                 chanargs.rc_sizes.cs_rq = MaxCtrlRecvBuffers;
1268                 chanargs.rc_sizes.cs_sq_sgl = 1;
1269                 chanargs.rc_sizes.cs_rq_sgl = 1;
1270         } else {
1271                 scqattr.cq_size = MaxDataSendBuffers + RDS_NUM_ACKS;
1272                 scqattr.cq_sched = NULL;
1273                 scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1274 
1275                 rcqattr.cq_size = MaxDataRecvBuffers;
1276                 rcqattr.cq_sched = NULL;
1277                 rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1278 
1279                 chanargs.rc_sizes.cs_sq = MaxDataSendBuffers + RDS_NUM_ACKS;
1280                 chanargs.rc_sizes.cs_rq = MaxDataRecvBuffers;
1281                 chanargs.rc_sizes.cs_sq_sgl = 1;
1282                 chanargs.rc_sizes.cs_rq_sgl = 1;
1283         }
1284 
1285         mutex_enter(&ep->ep_lock);
1286         if (ep->ep_sendcq == NULL) {
1287                 /* returned size is always greater than the requested size */
1288                 ret = ibt_alloc_cq(hcap->hca_hdl, &scqattr,
1289                     &ep->ep_sendcq, NULL);
1290                 if (ret != IBT_SUCCESS) {
1291                         RDS_DPRINTF2(LABEL, "ibt_alloc_cq for sendCQ "
1292                             "failed, size = %d: %d", scqattr.cq_size, ret);
1293                         mutex_exit(&ep->ep_lock);
1294                         return (NULL);
1295                 }
1296 
1297                 (void) ibt_set_cq_handler(ep->ep_sendcq, rds_sendcq_handler,
1298                     ep);
1299 
1300                 if (rds_no_interrupts == 0) {
1301                         ret = ibt_enable_cq_notify(ep->ep_sendcq,
1302                             IBT_NEXT_COMPLETION);
1303                         if (ret != IBT_SUCCESS) {
1304                                 RDS_DPRINTF2(LABEL,
1305                                     "ibt_enable_cq_notify failed: %d", ret);
1306                                 (void) ibt_free_cq(ep->ep_sendcq);
1307                                 ep->ep_sendcq = NULL;
1308                                 mutex_exit(&ep->ep_lock);
1309                                 return (NULL);
1310                         }
1311                 }
1312         }
1313 
1314         if (ep->ep_recvcq == NULL) {
1315                 /* returned size is always greater than the requested size */
1316                 ret = ibt_alloc_cq(hcap->hca_hdl, &rcqattr,
1317                     &ep->ep_recvcq, NULL);
1318                 if (ret != IBT_SUCCESS) {
1319                         RDS_DPRINTF2(LABEL, "ibt_alloc_cq for recvCQ "
1320                             "failed, size = %d: %d", rcqattr.cq_size, ret);
1321                         (void) ibt_free_cq(ep->ep_sendcq);
1322                         ep->ep_sendcq = NULL;
1323                         mutex_exit(&ep->ep_lock);
1324                         return (NULL);
1325                 }
1326 
1327                 (void) ibt_set_cq_handler(ep->ep_recvcq, rds_recvcq_handler,
1328                     ep);
1329 
1330                 ret = ibt_enable_cq_notify(ep->ep_recvcq, rds_wc_signal);
1331                 if (ret != IBT_SUCCESS) {
1332                         RDS_DPRINTF2(LABEL,
1333                             "ibt_enable_cq_notify failed: %d", ret);
1334                         (void) ibt_free_cq(ep->ep_recvcq);
1335                         ep->ep_recvcq = NULL;
1336                         (void) ibt_free_cq(ep->ep_sendcq);
1337                         ep->ep_sendcq = NULL;
1338                         mutex_exit(&ep->ep_lock);
1339                         return (NULL);
1340                 }
1341         }
1342 
1343         chanargs.rc_flags = IBT_ALL_SIGNALED;
1344         chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1345             IBT_CEP_ATOMIC;
1346         chanargs.rc_hca_port_num = hca_port;
1347         chanargs.rc_scq = ep->ep_sendcq;
1348         chanargs.rc_rcq = ep->ep_recvcq;
1349         chanargs.rc_pd = hcap->hca_pdhdl;
1350         chanargs.rc_srq = NULL;
1351 
1352         ret = ibt_alloc_rc_channel(hcap->hca_hdl,
1353             IBT_ACHAN_NO_FLAGS, &chanargs, &chanhdl, NULL);
1354         if (ret != IBT_SUCCESS) {
1355                 RDS_DPRINTF2(LABEL, "ibt_alloc_rc_channel fail: %d",
1356                     ret);
1357                 (void) ibt_free_cq(ep->ep_recvcq);
1358                 ep->ep_recvcq = NULL;
1359                 (void) ibt_free_cq(ep->ep_sendcq);
1360                 ep->ep_sendcq = NULL;
1361                 mutex_exit(&ep->ep_lock);
1362                 return (NULL);
1363         }
1364         mutex_exit(&ep->ep_lock);
1365 
1366         /* Chan private should contain the ep */
1367         (void) ibt_set_chan_private(chanhdl, ep);
1368 
1369         RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl);
1370 
1371         return (chanhdl);
1372 }
1373 
1374 
#if 0

/*
 * Return node guid given a port gid
 *
 * Compiled out by the surrounding #if 0. Returns 0 when
 * ibt_gid_to_node_info() fails.
 */
ib_guid_t
rds_gid_to_node_guid(ib_gid_t gid)
{
        ibt_node_info_t ninfo;
        int             status;

        RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx",
            gid.gid_prefix, gid.gid_guid);

        status = ibt_gid_to_node_info(gid, &ninfo);
        if (status == IBT_SUCCESS) {
                RDS_DPRINTF4("rds_gid_to_node_guid",
                    "Return: Node guid: %llx", ninfo.n_node_guid);

                return (ninfo.n_node_guid);
        }

        RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx "
            "failed", gid.gid_prefix, gid.gid_guid);

        return (0LL);
}

#endif
1401 
1402 static void
1403 rds_handle_portup_event(rds_state_t *statep, ibt_hca_hdl_t hdl,
1404     ibt_async_event_t *event)
1405 {
1406         rds_hca_t               *hcap;
1407         ibt_hca_portinfo_t      *newpinfop, *oldpinfop;
1408         uint_t                  newsize, oldsize, nport;
1409         ib_gid_t                gid;
1410         int                     ret;
1411 
1412         RDS_DPRINTF2("rds_handle_portup_event",
1413             "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep);
1414 
1415         rw_enter(&statep->rds_hca_lock, RW_WRITER);
1416 
1417         hcap = statep->rds_hcalistp;
1418         while ((hcap != NULL) && (hcap->hca_guid != event->ev_hca_guid)) {
1419                 hcap = hcap->hca_nextp;
1420         }
1421 
1422         if (hcap == NULL) {
1423                 RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
1424                     "not in our list", event->ev_hca_guid);
1425                 rw_exit(&statep->rds_hca_lock);
1426                 return;
1427         }
1428 
1429         ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize);
1430         if (ret != IBT_SUCCESS) {
1431                 RDS_DPRINTF2(LABEL, "ibt_query_hca_ports failed: %d", ret);
1432                 rw_exit(&statep->rds_hca_lock);
1433                 return;
1434         }
1435 
1436         oldpinfop = hcap->hca_pinfop;
1437         oldsize = hcap->hca_pinfo_sz;
1438         hcap->hca_pinfop = newpinfop;
1439         hcap->hca_pinfo_sz = newsize;
1440 
1441         (void) ibt_free_portinfo(oldpinfop, oldsize);
1442 
1443         /* If RDS service is not registered then no bind is needed */
1444         if (statep->rds_srvhdl == NULL) {
1445                 RDS_DPRINTF2("rds_handle_portup_event",
1446                     "RDS Service is not registered, so no action needed");
1447                 rw_exit(&statep->rds_hca_lock);
1448                 return;
1449         }
1450 
1451         /*
1452          * If the service was previously bound on this port and
1453          * if this port has changed state down and now up, we do not
1454          * need to bind the service again. The bind is expected to
1455          * persist across state changes. If the service was never bound
1456          * before then we bind it this time.
1457          */
1458         if (hcap->hca_bindhdl[event->ev_port - 1] == NULL) {
1459 
1460                 /* structure copy */
1461                 gid = newpinfop[event->ev_port - 1].p_sgid_tbl[0];
1462 
1463                 /* bind RDS service on the port, pass statep as cm_private */
1464                 ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep,
1465                     &hcap->hca_bindhdl[event->ev_port - 1]);
1466                 if (ret != IBT_SUCCESS) {
1467                         RDS_DPRINTF2("rds_handle_portup_event",
1468                             "Bind service for HCA: 0x%llx Port: %d "
1469                             "gid %llx:%llx returned: %d", event->ev_hca_guid,
1470                             event->ev_port, gid.gid_prefix, gid.gid_guid, ret);
1471                 }
1472         }
1473 
1474         rw_exit(&statep->rds_hca_lock);
1475 
1476         RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx",
1477             event->ev_hca_guid);
1478 }
1479 
1480 static void
1481 rdsib_add_hca(ib_guid_t hca_guid)
1482 {
1483         rds_hca_t       *hcap;
1484         ibt_mr_attr_t   mem_attr;
1485         ibt_mr_desc_t   mem_desc;
1486         int             ret;
1487 
1488         RDS_DPRINTF2("rdsib_add_hca", "Enter: GUID: 0x%llx", hca_guid);
1489 
1490         hcap = rdsib_init_hca(hca_guid);
1491         if (hcap == NULL)
1492                 return;
1493 
1494         /* register the recv memory with this hca */
1495         mutex_enter(&rds_dpool.pool_lock);
1496         if (rds_dpool.pool_memp == NULL) {
1497                 /* no memory to register */
1498                 RDS_DPRINTF2("rdsib_add_hca", "No memory to register");
1499                 mutex_exit(&rds_dpool.pool_lock);
1500                 return;
1501         }
1502 
1503         mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)rds_dpool.pool_memp;
1504         mem_attr.mr_len = rds_dpool.pool_memsize;
1505         mem_attr.mr_as = NULL;
1506         mem_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
1507 
1508         ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr,
1509             &hcap->hca_mrhdl, &mem_desc);
1510 
1511         mutex_exit(&rds_dpool.pool_lock);
1512 
1513         if (ret != IBT_SUCCESS) {
1514                 RDS_DPRINTF2("rdsib_add_hca", "ibt_register_mr failed: %d",
1515                     ret);
1516         } else {
1517                 rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
1518                 hcap->hca_state = RDS_HCA_STATE_MEM_REGISTERED;
1519                 hcap->hca_lkey = mem_desc.md_lkey;
1520                 hcap->hca_rkey = mem_desc.md_rkey;
1521                 rw_exit(&rdsib_statep->rds_hca_lock);
1522         }
1523 
1524         RDS_DPRINTF2("rdsib_add_hca", "Retrun: GUID: 0x%llx", hca_guid);
1525 }
1526 
/*
 * Forward declarations of session helpers.
 * NOTE(review): presumably needed by the HCA removal code that follows and
 * defined elsewhere - confirm, and consider moving them to a shared header.
 */
void rds_close_this_session(rds_session_t *sp, uint8_t wait);
int rds_post_control_message(rds_session_t *sp, uint8_t code, in_port_t port);
1529 
1530 static void
1531 rdsib_del_hca(rds_state_t *statep, ib_guid_t hca_guid)
1532 {
1533         rds_session_t   *sp;
1534         rds_hca_t       *hcap;
1535         rds_hca_state_t saved_state;
1536         int             ret, ix;
1537 
1538         RDS_DPRINTF2("rdsib_del_hca", "Enter: GUID: 0x%llx", hca_guid);
1539 
1540         /*
1541          * This should be a write lock as we don't want anyone to get access
1542          * to the hcap while we are modifing its contents
1543          */
1544         rw_enter(&statep->rds_hca_lock, RW_WRITER);
1545 
1546         hcap = statep->rds_hcalistp;
1547         while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
1548                 hcap = hcap->hca_nextp;
1549         }
1550 
1551         /* Prevent initiating any new activity on this HCA */
1552         ASSERT(hcap != NULL);
1553         saved_state = hcap->hca_state;
1554         hcap->hca_state = RDS_HCA_STATE_STOPPING;
1555 
1556         rw_exit(&statep->rds_hca_lock);
1557 
1558         /*
1559          * stop the outgoing traffic and close any active sessions on this hca.
1560          * Any pending messages in the SQ will be allowed to complete.
1561          */
1562         rw_enter(&statep->rds_sessionlock, RW_READER);
1563         sp = statep->rds_sessionlistp;
1564         while (sp) {
1565                 if (sp->session_hca_guid != hca_guid) {
1566                         sp = sp->session_nextp;
1567                         continue;
1568                 }
1569 
1570                 rw_enter(&sp->session_lock, RW_WRITER);
1571                 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1572                     sp->session_state);
1573                 /*
1574                  * We are changing the session state in advance. This prevents
1575                  * further messages to be posted to the SQ. We then
1576                  * send a control message to the remote and tell it close
1577                  * the session.
1578                  */
1579                 sp->session_state = RDS_SESSION_STATE_HCA_CLOSING;
1580                 RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
1581                     "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
1582                 rw_exit(&sp->session_lock);
1583 
1584                 /*
1585                  * wait until the sendq is empty then tell the remote to
1586                  * close this session. This enables for graceful shutdown of
1587                  * the session
1588                  */
1589                 (void) rds_is_sendq_empty(&sp->session_dataep, 2);
1590                 (void) rds_post_control_message(sp,
1591                     RDS_CTRL_CODE_CLOSE_SESSION, 0);
1592 
1593                 sp = sp->session_nextp;
1594         }
1595 
1596         /* wait until all the sessions are off this HCA */
1597         sp = statep->rds_sessionlistp;
1598         while (sp) {
1599                 if (sp->session_hca_guid != hca_guid) {
1600                         sp = sp->session_nextp;
1601                         continue;
1602                 }
1603 
1604                 rw_enter(&sp->session_lock, RW_READER);
1605                 RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1606                     sp->session_state);
1607 
1608                 while ((sp->session_state == RDS_SESSION_STATE_HCA_CLOSING) ||
1609                     (sp->session_state == RDS_SESSION_STATE_ERROR) ||
1610                     (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING) ||
1611                     (sp->session_state == RDS_SESSION_STATE_CLOSED)) {
1612                         rw_exit(&sp->session_lock);
1613                         delay(drv_sectohz(1));
1614                         rw_enter(&sp->session_lock, RW_READER);
1615                         RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1616                             sp->session_state);
1617                 }
1618 
1619                 rw_exit(&sp->session_lock);
1620 
1621                 sp = sp->session_nextp;
1622         }
1623         rw_exit(&statep->rds_sessionlock);
1624 
1625         /*
1626          * if rdsib_close_ib was called before this, then that would have
1627          * unbound the service on all ports. In that case, the HCA structs
1628          * will contain stale bindhdls. Hence, we do not call unbind unless
1629          * the service is still registered.
1630          */
1631         if (statep->rds_srvhdl != NULL) {
1632                 /* unbind RDS service on all ports on this HCA */
1633                 for (ix = 0; ix < hcap->hca_nports; ix++) {
1634                         if (hcap->hca_bindhdl[ix] == NULL) {
1635                                 continue;
1636                         }
1637 
1638                         RDS_DPRINTF2("rdsib_del_hca",
1639                             "Unbinding Service: port: %d, bindhdl: %p",
1640                             ix + 1, hcap->hca_bindhdl[ix]);
1641                         (void) ibt_unbind_service(rdsib_statep->rds_srvhdl,
1642                             hcap->hca_bindhdl[ix]);
1643                         hcap->hca_bindhdl[ix] = NULL;
1644                 }
1645         }
1646 
1647         RDS_DPRINTF2("rdsib_del_hca", "HCA(%p) State: %d", hcap,
1648             hcap->hca_state);
1649 
1650         switch (saved_state) {
1651         case RDS_HCA_STATE_MEM_REGISTERED:
1652                 ASSERT(hcap->hca_mrhdl != NULL);
1653                 ret = ibt_deregister_mr(hcap->hca_hdl, hcap->hca_mrhdl);
1654                 if (ret != IBT_SUCCESS) {
1655                         RDS_DPRINTF2("rdsib_del_hca",
1656                             "ibt_deregister_mr failed: %d", ret);
1657                         return;
1658                 }
1659                 hcap->hca_mrhdl = NULL;
1660                 /* FALLTHRU */
1661         case RDS_HCA_STATE_OPEN:
1662                 ASSERT(hcap->hca_hdl != NULL);
1663                 ASSERT(hcap->hca_pdhdl != NULL);
1664 
1665 
1666                 ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
1667                 if (ret != IBT_SUCCESS) {
1668                         RDS_DPRINTF2("rdsib_del_hca",
1669                             "ibt_free_pd failed: %d", ret);
1670                 }
1671 
1672                 (void) ibt_free_portinfo(hcap->hca_pinfop, hcap->hca_pinfo_sz);
1673 
1674                 ret = ibt_close_hca(hcap->hca_hdl);
1675                 if (ret != IBT_SUCCESS) {
1676                         RDS_DPRINTF2("rdsib_del_hca",
1677                             "ibt_close_hca failed: %d", ret);
1678                 }
1679 
1680                 hcap->hca_hdl = NULL;
1681                 hcap->hca_pdhdl = NULL;
1682                 hcap->hca_lkey = 0;
1683                 hcap->hca_rkey = 0;
1684         }
1685 
1686         /*
1687          * This should be a write lock as we don't want anyone to get access
1688          * to the hcap while we are modifing its contents
1689          */
1690         rw_enter(&statep->rds_hca_lock, RW_WRITER);
1691         hcap->hca_state = RDS_HCA_STATE_REMOVED;
1692         rw_exit(&statep->rds_hca_lock);
1693 
1694         RDS_DPRINTF2("rdsib_del_hca", "Return: GUID: 0x%llx", hca_guid);
1695 }
1696 
1697 static void
1698 rds_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1699     ibt_async_event_t *event)
1700 {
1701         rds_state_t             *statep = (rds_state_t *)clntp;
1702 
1703         RDS_DPRINTF2("rds_async_handler", "Async code: %d", code);
1704 
1705         switch (code) {
1706         case IBT_EVENT_PORT_UP:
1707                 rds_handle_portup_event(statep, hdl, event);
1708                 break;
1709         case IBT_HCA_ATTACH_EVENT:
1710                 /*
1711                  * NOTE: In some error recovery paths, it is possible to
1712                  * receive IBT_HCA_ATTACH_EVENTs on already known HCAs.
1713                  */
1714                 (void) rdsib_add_hca(event->ev_hca_guid);
1715                 break;
1716         case IBT_HCA_DETACH_EVENT:
1717                 (void) rdsib_del_hca(statep, event->ev_hca_guid);
1718                 break;
1719 
1720         default:
1721                 RDS_DPRINTF2(LABEL, "Async event: %d not handled", code);
1722         }
1723 
1724         RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code);
1725 }
1726 
1727 /*
1728  * This routine exists to minimize stale connections across ungraceful
1729  * reboots of nodes in a cluster.
1730  */
1731 void
1732 rds_randomize_qps(rds_hca_t *hcap)
1733 {
1734         ibt_cq_attr_t                   cqattr;
1735         ibt_rc_chan_alloc_args_t        chanargs;
1736         ibt_channel_hdl_t               qp1, qp2;
1737         ibt_cq_hdl_t                    cq_hdl;
1738         hrtime_t                        nsec;
1739         uint8_t                         i, j, rand1, rand2;
1740         int                             ret;
1741 
1742         bzero(&cqattr, sizeof (ibt_cq_attr_t));
1743         cqattr.cq_size = 1;
1744         cqattr.cq_sched = NULL;
1745         cqattr.cq_flags = IBT_CQ_NO_FLAGS;
1746         ret = ibt_alloc_cq(hcap->hca_hdl, &cqattr, &cq_hdl, NULL);
1747         if (ret != IBT_SUCCESS) {
1748                 RDS_DPRINTF2("rds_randomize_qps",
1749                     "ibt_alloc_cq failed: %d", ret);
1750                 return;
1751         }
1752 
1753         bzero(&chanargs, sizeof (ibt_rc_chan_alloc_args_t));
1754         chanargs.rc_flags = IBT_ALL_SIGNALED;
1755         chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1756             IBT_CEP_ATOMIC;
1757         chanargs.rc_hca_port_num = 1;
1758         chanargs.rc_scq = cq_hdl;
1759         chanargs.rc_rcq = cq_hdl;
1760         chanargs.rc_pd = hcap->hca_pdhdl;
1761         chanargs.rc_srq = NULL;
1762 
1763         nsec = gethrtime();
1764         rand1 = (nsec & 0xF);
1765         rand2 = (nsec >> 4) & 0xF;
1766         RDS_DPRINTF2("rds_randomize_qps", "rand1: %d rand2: %d",
1767             rand1, rand2);
1768 
1769         for (i = 0; i < rand1 + 3; i++) {
1770                 if (ibt_alloc_rc_channel(hcap->hca_hdl,
1771                     IBT_ACHAN_NO_FLAGS, &chanargs, &qp1, NULL) !=
1772                     IBT_SUCCESS) {
1773                         RDS_DPRINTF2("rds_randomize_qps",
1774                             "Bailing at i: %d", i);
1775                         (void) ibt_free_cq(cq_hdl);
1776                         return;
1777                 }
1778                 for (j = 0; j < rand2 + 3; j++) {
1779                         if (ibt_alloc_rc_channel(hcap->hca_hdl,
1780                             IBT_ACHAN_NO_FLAGS, &chanargs, &qp2,
1781                             NULL) != IBT_SUCCESS) {
1782                                 RDS_DPRINTF2("rds_randomize_qps",
1783                                     "Bailing at i: %d j: %d", i, j);
1784                                 (void) ibt_free_channel(qp1);
1785                                 (void) ibt_free_cq(cq_hdl);
1786                                 return;
1787                         }
1788                         (void) ibt_free_channel(qp2);
1789                 }
1790                 (void) ibt_free_channel(qp1);
1791         }
1792 
1793         (void) ibt_free_cq(cq_hdl);
1794 }