/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
 */
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>          /* for HCK_FULLCKSUM */
#include <sys/atomic.h>         /* for atomic_add*() */
#include <sys/ethernet.h>       /* for ETHERTYPE_IP */
#include <netinet/in.h>         /* for netinet/ip.h below */
#include <netinet/ip.h>         /* for struct ip */
#include <inet/common.h>        /* for inet/ip.h below */
#include <inet/ip.h>            /* for ipha_t */
#include <inet/ip_if.h>         /* for ETHERTYPE_IPV6 */
#include <inet/ip6.h>           /* for ip6_t */
#include <netinet/icmp6.h>      /* for icmp6_t */

#include <sys/ib/clients/ibd/ibd.h>

extern ibd_global_state_t ibd_gstate;
extern int ibd_rc_conn_timeout;
uint_t ibd_rc_tx_softintr = 1;
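/*
 * Tunable: when ibd_rc_tx_softintr is 1 (the default), each Tx channel
 * registers a low-priority soft interrupt in ibd_rc_alloc_chan() that
 * runs ibd_rc_tx_recycle() to recycle Tx resources.
 */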
/*
 * If the number of WRs in the receive queue of an RC connection drops
 * below IBD_RC_RX_WR_THRESHOLD, we post more receive WRs into it.
 */
#define IBD_RC_RX_WR_THRESHOLD          0x20

/*
 * If the number of free SWQEs (or large Tx buffers) is greater than or
 * equal to IBD_RC_TX_FREE_THRESH, we call mac_tx_update() to notify GLD
 * to continue transmitting packets.
 */
#define IBD_RC_TX_FREE_THRESH           8

#define IBD_RC_QPN_TO_SID(qpn) \
        ((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))

/* For interop with legacy OFED */
#define IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
        ((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))
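/*
 * Both macros merge the low 24 bits of the QPN into a service-ID prefix;
 * for example, a QPN of 0x1012345 yields (IBD_RC_SERVICE_ID | 0x012345).
 */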

/*
 * Internet Header + 64 bits of Data Datagram (see RFC 792). Note that
 * this implementation actually returns up to 64 bytes of the offending
 * datagram, not 64 bits.
 */
#define IBD_RC_IP_ICMP_RETURN_DATA_BYTES        64


/* Functions for Reliable Connected Mode */
/* Connection Setup/Close Functions */
static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static void ibd_rc_act_close(ibd_rc_chan_t *, boolean_t);

static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);
static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
    ibd_rc_chan_list_t *);
static inline ibd_rc_chan_t *ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);

/* CQ handlers */
static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);

/* Receive Functions */
static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
static void ibd_rc_srq_freemsg_cb(char *);
static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *);

static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_freemsg_cb(char *);
static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);


/* Send Functions */
static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
static int ibd_rc_init_txlist(ibd_rc_chan_t *);
static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
static uint_t ibd_rc_tx_recycle(caddr_t);


void
ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
{
        ibd_rc_chan_t *rc_chan = req->rq_ptr;
        ibd_ace_t *ace;

        while (rc_chan != NULL) {
                ace = rc_chan->ace;
                ASSERT(ace != NULL);
                /* Close old RC channel */
                ibd_rc_act_close(rc_chan, B_TRUE);
                mutex_enter(&state->id_ac_mutex);
                ASSERT(ace->ac_ref != 0);
                atomic_dec_32(&ace->ac_ref);
                ace->ac_chan = NULL;
                if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
                        IBD_ACACHE_INSERT_FREE(state, ace);
                        ace->ac_ref = 0;
                } else {
                        ace->ac_ref |= CYCLEVAL;
                        state->rc_delay_ace_recycle++;
                }
                mutex_exit(&state->id_ac_mutex);
                rc_chan = ibd_rc_rm_header_chan_list(
                    &state->rc_obs_act_chan_list);
        }
}

void
ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
{
        ibd_ace_t *ace = req->rq_ptr;
        ibd_rc_chan_t *rc_chan;

        ASSERT(ace != NULL);
        rc_chan = ace->ac_chan;
        ASSERT(rc_chan != NULL);
        /* Close old RC channel */
        ibd_rc_act_close(rc_chan, B_TRUE);
        mutex_enter(&state->id_ac_mutex);
        ASSERT(ace->ac_ref != 0);
        atomic_dec_32(&ace->ac_ref);
        ace->ac_chan = NULL;
        if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
                IBD_ACACHE_INSERT_FREE(state, ace);
                ace->ac_ref = 0;
        } else {
                ace->ac_ref |= CYCLEVAL;
                state->rc_delay_ace_recycle++;
        }
        mutex_exit(&state->id_ac_mutex);
        mutex_enter(&state->rc_ace_recycle_lock);
        state->rc_ace_recycle = NULL;
        mutex_exit(&state->rc_ace_recycle_lock);
}

/* Simple ICMP IP Header Template */
static const ipha_t icmp_ipha = {
        IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
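/*
 * Only the version/header-length and protocol fields are preset in this
 * template; ibd_async_rc_process_too_big() fills in the addresses, TTL,
 * length and checksums of each ICMP "fragmentation needed" packet.
 */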

/* Packet is too big. Send ICMP packet to GLD to request a smaller MTU */
void
ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
{
        mblk_t *mp = req->rq_ptr;
        ibd_ace_t *ace = req->rq_ptr2;
        uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
        uint_t  len_needed;
        size_t  msg_len;
        mblk_t  *pmtu_mp;
        ushort_t        sap;
        ib_header_info_t *ibha; /* ib header for pmtu_pkt */
        /*
         * ipha: IP header for pmtu_pkt
         * old_ipha: IP header for old packet
         */
        ipha_t *ipha, *old_ipha;
        icmph_t *icmph;

        sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);

        if (!pullupmsg(mp, -1)) {
                DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
                goto too_big_fail;
        }
        /* move to the IP header */
        mp->b_rptr += IPOIB_HDRSIZE;
        old_ipha = (ipha_t *)mp->b_rptr;

        len_needed = IPH_HDR_LENGTH(old_ipha);
        if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
                len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
                    len_needed));
        } else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
                ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
                    + len_needed);
                len_needed += ip_hdr_length_v6(mp, ip6h);
        }
        len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
        msg_len = msgdsize(mp);
        if (msg_len > len_needed) {
                (void) adjmsg(mp, len_needed - msg_len);
                msg_len = len_needed;
        }

        if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
            + sizeof (icmph_t), BPRI_MED)) == NULL) {
                DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
                goto too_big_fail;
        }
        pmtu_mp->b_cont = mp;
        pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
            + sizeof (ipha_t) + sizeof (icmph_t);
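
        /*
         * Resulting message layout (the new header mblk, with the
         * truncated original packet chained via b_cont):
         *
         *   ib_header_info_t | ipha_t | icmph_t --> original IP header(s)
         *   plus up to IBD_RC_IP_ICMP_RETURN_DATA_BYTES of payload
         */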

        ibha = (ib_header_info_t *)pmtu_mp->b_rptr;

        /* Fill IB header */
        bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
        /*
         * If the GRH is not valid, indicate to GLDv3 by setting
         * the VerTcFlow field to 0.
         */
        ibha->ib_grh.ipoib_vertcflow = 0;
        ibha->ipib_rhdr.ipoib_type = htons(sap);
        ibha->ipib_rhdr.ipoib_mbz = 0;

        /* Fill IP header */
        ipha = (ipha_t *)&ibha[1];
        *ipha = icmp_ipha;
        ipha->ipha_src = old_ipha->ipha_dst;
        ipha->ipha_dst = old_ipha->ipha_src;
        ipha->ipha_ttl = old_ipha->ipha_ttl;
        msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
        if (msg_len > IP_MAXPACKET) {
                ibd_print_warn(state, "ibd_async_rc_process_too_big: "
                    "msg_len(%d) > IP_MAXPACKET", (uint32_t)msg_len);
                (void) adjmsg(mp, IP_MAXPACKET - msg_len);
                msg_len = IP_MAXPACKET;
        }
        ipha->ipha_length = htons((uint16_t)msg_len);
        ipha->ipha_hdr_checksum = 0;
        ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);

        /* Fill ICMP body */
        icmph = (icmph_t *)&ipha[1];
        bzero(icmph, sizeof (icmph_t));
        icmph->icmph_type = ICMP_DEST_UNREACHABLE;
        icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
        icmph->icmph_du_mtu = htons(mtu);
        icmph->icmph_checksum = 0;
        icmph->icmph_checksum = IP_CSUM(pmtu_mp,
            (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);

        (void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
            HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);

        DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
            "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
            sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
            len_needed, (uint32_t)msg_len);

        mac_rx(state->id_mh, state->id_rh, pmtu_mp);

        mutex_enter(&ace->tx_too_big_mutex);
        ace->tx_too_big_ongoing = B_FALSE;
        mutex_exit(&ace->tx_too_big_mutex);
        return;

too_big_fail:
        /* Drop packet */
        freemsg(mp);
        mutex_enter(&ace->tx_too_big_mutex);
        ace->tx_too_big_ongoing = B_FALSE;
        mutex_exit(&ace->tx_too_big_mutex);
}

/*
 * Check all active/passive channels. A channel whose is_used flag is
 * still B_FALSE from the previous scan (i.e. it has not carried traffic
 * for a full timeout interval) is closed; otherwise the flag is cleared
 * so the channel becomes a close candidate on the next scan.
 */
void
ibd_rc_conn_timeout_call(void *carg)
{
        ibd_state_t *state = carg;
        ibd_ace_t *ace, *pre_ace;
        ibd_rc_chan_t *chan, *pre_chan, *next_chan;
        ibd_req_t *req;

        /* Check all active channels. If chan->is_used == B_FALSE, close it */
        mutex_enter(&state->id_ac_mutex);
        ace = list_head(&state->id_ah_active);
        while ((pre_ace = ace) != NULL) {
                ace = list_next(&state->id_ah_active, ace);
                if (pre_ace->ac_chan != NULL) {
                        chan = pre_ace->ac_chan;
                        ASSERT(state->id_enable_rc == B_TRUE);
                        if (chan->chan_state == IBD_RC_STATE_ACT_ESTAB) {
                                if (chan->is_used == B_FALSE) {
                                        state->rc_timeout_act++;
                                        INC_REF(pre_ace, 1);
                                        IBD_ACACHE_PULLOUT_ACTIVE(state,
                                            pre_ace);
                                        chan->chan_state =
                                            IBD_RC_STATE_ACT_CLOSING;
                                        ibd_rc_signal_act_close(state, pre_ace);
                                } else {
                                        chan->is_used = B_FALSE;
                                }
                        }
                }
        }
        mutex_exit(&state->id_ac_mutex);

        /* Check all passive channels. If chan->is_used == B_FALSE, close it */
        mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
        next_chan = state->rc_pass_chan_list.chan_list;
        pre_chan = NULL;
        while ((chan = next_chan) != NULL) {
                next_chan = chan->next;
                if (chan->is_used == B_FALSE) {
                        req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
                        if (req != NULL) {
                                /* remove it */
                                state->rc_timeout_pas++;
                                req->rq_ptr = chan;
                                ibd_queue_work_slot(state, req,
                                    IBD_ASYNC_RC_CLOSE_PAS_CHAN);
                        } else {
                                ibd_print_warn(state,
                                    "ibd_rc_conn_timeout_call: "
                                    "alloc ibd_req_t fail");
                                if (pre_chan == NULL) {
                                        state->rc_pass_chan_list.chan_list =
                                            chan;
                                } else {
                                        pre_chan->next = chan;
                                }
                                pre_chan = chan;
                        }
                } else {
                        if (pre_chan == NULL) {
                                state->rc_pass_chan_list.chan_list = chan;
                        } else {
                                pre_chan->next = chan;
                        }
                        pre_chan = chan;
                        chan->is_used = B_FALSE;
                }
        }
        if (pre_chan != NULL) {
                pre_chan->next = NULL;
        } else {
                state->rc_pass_chan_list.chan_list = NULL;
        }
        mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);

        mutex_enter(&state->rc_timeout_lock);
        if (state->rc_timeout_start == B_TRUE) {
                state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
                    SEC_TO_TICK(ibd_rc_conn_timeout));
        }
        mutex_exit(&state->rc_timeout_lock);
}

#ifdef DEBUG
/*
 * ibd_rc_update_stats - update driver private kstat counters
 *
 * This routine dumps the internal statistics counters for ibd's
 * Reliable Connected Mode into the kernel statistics area.
 */
static int
ibd_rc_update_stats(kstat_t *ksp, int rw)
{
        ibd_state_t *state;
        ibd_rc_stat_t *ibd_rc_ksp;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        state = (ibd_state_t *)ksp->ks_private;
        ASSERT(state != NULL);
        ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

        ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
        ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
        ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
        ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
        ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;

        ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;

        ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;

        ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
        ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
        ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
            state->rc_xmt_fragmented_pkt;
        ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
        ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
        ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;

        ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
        ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
        ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
        ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
        ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
        ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
            state->rc_xmt_buf_mac_update;

        ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
        ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
        ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
        ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;

        ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
        ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
        ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
        ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
            state->rc_act_close_simultaneous;
        ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;
        ibd_rc_ksp->rc_timeout_act.value.ul = state->rc_timeout_act;
        ibd_rc_ksp->rc_timeout_pas.value.ul = state->rc_timeout_pas;

        return (0);
}


/*
 * ibd_rc_init_stats - initialize kstat data structures
 *
 * This routine creates and initializes the driver private
 * statistics counters.
 */
int
ibd_rc_init_stats(ibd_state_t *state)
{
        kstat_t *ksp;
        ibd_rc_stat_t *ibd_rc_ksp;
        char stat_name[KSTAT_STRLEN];
        int inst;

        /*
         * Create and init kstat
         */
        inst = ddi_get_instance(state->id_dip);
        (void) snprintf(stat_name, KSTAT_STRLEN, "statistics%d_%x_%u", inst,
            state->id_pkey, state->id_plinkid);
        ksp = kstat_create("ibd", 0, stat_name, "net", KSTAT_TYPE_NAMED,
            sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);

        if (ksp == NULL) {
                ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
                    "kernel statistics");
                return (DDI_FAILURE);
        }

        state->rc_ksp = ksp; /* Fill in the ksp of ibd over RC mode */

        ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

        /*
         * Initialize all the statistics
         */
        kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
            "transfer mode", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
            "transfer mode", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
            "copy mode", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
            "copy mode", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
            KSTAT_DATA_ULONG);

        kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
            KSTAT_DATA_ULONG);

        kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
            KSTAT_DATA_ULONG);

        kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
            KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
            "RC: Tx pkt small size", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
            "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
            "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
            "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
            KSTAT_DATA_ULONG);

        kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
            "recycle", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
            "after recycle", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
            KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
            "#, swqe available", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
            "ibd_send", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
            "mac_tx_update #, buf available", KSTAT_DATA_ULONG);

        kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
            KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
            KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
            "pkt", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
            "state", KSTAT_DATA_ULONG);

        kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
            KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
            KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
            "recycle", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
            "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
            KSTAT_DATA_ULONG);
        /*
         * The timeout counters must be initialized on their own fields;
         * ibd_rc_update_stats() copies rc_timeout_act/rc_timeout_pas into
         * them.
         */
        kstat_named_init(&ibd_rc_ksp->rc_timeout_act, "RC: timeout act side",
            KSTAT_DATA_ULONG);
        kstat_named_init(&ibd_rc_ksp->rc_timeout_pas, "RC: timeout pas side",
            KSTAT_DATA_ULONG);

        /*
         * Function to provide kernel stat update on demand
         */
        ksp->ks_update = ibd_rc_update_stats;

        /*
         * Pointer into provider's raw statistics
         */
        ksp->ks_private = (void *)state;

        /*
         * Add kstat to the system's kstat chain
         */
        kstat_install(ksp);

        return (DDI_SUCCESS);
}
#endif

static ibt_status_t
ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
    boolean_t is_tx_chan)
{
        ibt_status_t result;
        ibd_rc_chan_t *chan;
        ibt_rc_chan_alloc_args_t alloc_args;
        ibt_chan_alloc_flags_t alloc_flags;
        ibt_chan_sizes_t sizes;
        ibt_cq_attr_t cq_atts;
        int rv;

        chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);

        chan->state = state;
        mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);

        /* Allocate IB structures for a new RC channel. */
        if (is_tx_chan) {
                chan->scq_size = state->id_rc_num_swqe;
                chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
        } else {
                chan->scq_size = IBD_RC_MIN_CQ_SIZE;
                chan->rcq_size = state->id_rc_num_rwqe;
        }
        cq_atts.cq_size = chan->scq_size;
        cq_atts.cq_sched = NULL;
        cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
        result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
            &chan->scq_size);
        if (result != IBT_SUCCESS) {
                DPRINT(40, "ibd_rc_alloc_chan: error <%d> creating "
                    "send completion queue (size <%d>)",
                    result, chan->scq_size);
                goto alloc_scq_err;
        }

        if (ibt_modify_cq(chan->scq_hdl, state->id_rc_tx_comp_count,
            state->id_rc_tx_comp_usec, 0) != IBT_SUCCESS) {
                DPRINT(30, "ibd_rc_alloc_chan: Send CQ "
                    "interrupt moderation failed");
        }

        ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
        ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
            (void *) (uintptr_t)chan);

        cq_atts.cq_size = chan->rcq_size;
        cq_atts.cq_sched = NULL;
        cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
        result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
            &chan->rcq_size);
        if (result != IBT_SUCCESS) {
                ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
                    "rx completion queue (size <%d>)", result, chan->rcq_size);
                goto alloc_rcq_err;
        }

        if (ibt_modify_cq(chan->rcq_hdl, state->id_rc_rx_comp_count,
            state->id_rc_rx_comp_usec, 0) != IBT_SUCCESS) {
                DPRINT(30, "ibd_rc_alloc_chan: Receive CQ "
                    "interrupt moderation failed");
        }

        ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
        ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
            (void *)(uintptr_t)chan);

        if (is_tx_chan) {
                chan->is_tx_chan = B_TRUE;
                if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
                        ibd_print_warn(state, "ibd_rc_alloc_chan: "
                            "ibd_rc_init_txlist failed");
                        goto init_txlist_err;
                }
                if (ibd_rc_tx_softintr == 1) {
                        if ((rv = ddi_add_softintr(state->id_dip,
                            DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
                            ibd_rc_tx_recycle, (caddr_t)chan)) !=
                            DDI_SUCCESS) {
                                DPRINT(10, "ibd_rc_alloc_chan: failed in "
                                    "ddi_add_softintr(scq_softintr), ret=%d",
                                    rv);
                                goto alloc_softintr_err;
                        }
                }
        } else {
                chan->is_tx_chan = B_FALSE;
        }

        /*
         * enable completions
         */
        result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
        if (result != IBT_SUCCESS) {
                ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
                    "(scq) failed: status %d\n", result);
                goto alloc_scq_enable_err;
        }

        /* We will enable chan->rcq_hdl later. */

        /* alloc an RC channel */
        bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
        bzero(&sizes, sizeof (ibt_chan_sizes_t));

        alloc_args.rc_flags = IBT_WR_SIGNALED;
        alloc_args.rc_control = IBT_CEP_NO_FLAGS;

        alloc_args.rc_scq = chan->scq_hdl;
        alloc_args.rc_rcq = chan->rcq_hdl;
        alloc_args.rc_pd = state->id_pd_hdl;

        alloc_args.rc_hca_port_num = state->id_port;
        alloc_args.rc_clone_chan = NULL;

        /* scatter/gather */
        alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;

        /*
         * Use a single SGL element on the receive side, since the ibd
         * driver posts one whole block of memory with each
         * ibt_post_recv().
         */
        alloc_args.rc_sizes.cs_rq_sgl = 1;

        /* The send queue size and the receive queue size */
        alloc_args.rc_sizes.cs_sq = chan->scq_size;
        alloc_args.rc_sizes.cs_rq = chan->rcq_size;

        if (state->id_hca_res_lkey_capab) {
                alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
        } else {
                DPRINT(40, "ibd_rc_alloc_chan: reserved lkey not supported");
        }

        if (state->rc_enable_srq) {
                alloc_flags = IBT_ACHAN_USES_SRQ;
                alloc_args.rc_srq = state->rc_srq_hdl;
        } else {
                alloc_flags = IBT_ACHAN_NO_FLAGS;
        }

        result = ibt_alloc_rc_channel(state->id_hca_hdl,
            alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
        if (result != IBT_SUCCESS) {
                ibd_print_warn(state, "ibd_rc_alloc_chan: "
                    "ibt_alloc_rc_channel fail:<%d>", result);
                goto alloc_scq_enable_err;
        }

        if (is_tx_chan)
                atomic_inc_32(&state->rc_num_tx_chan);
        else
                atomic_inc_32(&state->rc_num_rx_chan);

        /* For the connection reaper routine ibd_rc_conn_timeout_call() */
        chan->is_used = B_TRUE;
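        /*
         * ibd_rc_conn_timeout_call() clears is_used on every scan and
         * closes channels that are still clear on the following scan, so
         * a fresh channel gets at least one full timeout interval before
         * it can be reaped.
         */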

        *ret_chan = chan;
        return (IBT_SUCCESS);

alloc_scq_enable_err:
        if (is_tx_chan) {
                if (ibd_rc_tx_softintr == 1) {
                        ddi_remove_softintr(chan->scq_softintr);
                }
        }
alloc_softintr_err:
        if (is_tx_chan) {
                ibd_rc_fini_txlist(chan);
        }
init_txlist_err:
        (void) ibt_free_cq(chan->rcq_hdl);
alloc_rcq_err:
        (void) ibt_free_cq(chan->scq_hdl);
alloc_scq_err:
        mutex_destroy(&chan->tx_poll_lock);
        mutex_destroy(&chan->tx_post_lock);
        mutex_destroy(&chan->tx_rel_list.dl_mutex);
        mutex_destroy(&chan->tx_wqe_list.dl_mutex);
        mutex_destroy(&chan->rx_free_list.dl_mutex);
        mutex_destroy(&chan->rx_wqe_list.dl_mutex);
        kmem_free(chan, sizeof (ibd_rc_chan_t));
        return (result);
}

static void
ibd_rc_free_chan(ibd_rc_chan_t *chan)
{
        ibt_status_t ret;

        /* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */

        if (chan->chan_hdl != NULL) {
                ret = ibt_free_channel(chan->chan_hdl);
                if (ret != IBT_SUCCESS) {
                        DPRINT(40, "ibd_rc_free_chan: ibt_free_channel failed, "
                            "chan=%p, returned: %d", chan, ret);
                        return;
                }
                chan->chan_hdl = NULL;
        }

        if (chan->rcq_hdl != NULL) {
                ret = ibt_free_cq(chan->rcq_hdl);
                if (ret != IBT_SUCCESS) {
                        DPRINT(40, "ibd_rc_free_chan: ibt_free_cq(rcq) failed, "
                            "chan=%p, returned: %d", chan, ret);
                        return;
                }
                chan->rcq_hdl = NULL;
        }

        if (chan->scq_hdl != NULL) {
                ret = ibt_free_cq(chan->scq_hdl);
                if (ret != IBT_SUCCESS) {
                        DPRINT(40, "ibd_rc_free_chan: ibt_free_cq(scq) failed, "
                            "chan=%p, returned: %d", chan, ret);
                        return;
                }
                chan->scq_hdl = NULL;
        }

        /* Free buffers */
        if (chan->is_tx_chan) {
                ibd_rc_fini_txlist(chan);
                if (ibd_rc_tx_softintr == 1) {
                        ddi_remove_softintr(chan->scq_softintr);
                }
                atomic_dec_32(&chan->state->rc_num_tx_chan);
        } else {
                if (!chan->state->rc_enable_srq) {
                        ibd_rc_fini_rxlist(chan);
                }
                atomic_dec_32(&chan->state->rc_num_rx_chan);
        }

        mutex_destroy(&chan->tx_poll_lock);
        mutex_destroy(&chan->tx_post_lock);
        mutex_destroy(&chan->tx_rel_list.dl_mutex);
        mutex_destroy(&chan->tx_wqe_list.dl_mutex);
        mutex_destroy(&chan->rx_free_list.dl_mutex);
        mutex_destroy(&chan->rx_wqe_list.dl_mutex);

        /*
         * If this is a passive channel, the caller must ensure it has
         * already been removed from chan->state->rc_pass_chan_list.
         */
        kmem_free(chan, sizeof (ibd_rc_chan_t));
}

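/*
 * An ibd_rc_chan_list_t is a singly linked list protected by its
 * chan_list_mutex: ibd_rc_add_to_chan_list() pushes at the head,
 * ibd_rc_rm_header_chan_list() pops the head, and
 * ibd_rc_rm_from_chan_list() unlinks a specific channel, returning NULL
 * if it was not on the list.
 */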
/* Add an RC channel */
static inline void
ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
{
        mutex_enter(&list->chan_list_mutex);
        if (list->chan_list == NULL) {
                list->chan_list = chan;
                chan->next = NULL;
        } else {
                chan->next = list->chan_list;
                list->chan_list = chan;
        }
        mutex_exit(&list->chan_list_mutex);
}

static boolean_t
ibd_rc_re_add_to_pas_chan_list(ibd_rc_chan_t *chan)
{
        ibd_state_t *state = chan->state;

        mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
        if ((state->id_mac_state & IBD_DRV_STARTED) == 0) {
                mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
                return (B_FALSE);
        } else {
                if (state->rc_pass_chan_list.chan_list == NULL) {
                        state->rc_pass_chan_list.chan_list = chan;
                        chan->next = NULL;
                } else {
                        chan->next = state->rc_pass_chan_list.chan_list;
                        state->rc_pass_chan_list.chan_list = chan;
                }
                mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
                return (B_TRUE);
        }
}

/* Remove an RC channel */
static inline ibd_rc_chan_t *
ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
{
        ibd_rc_chan_t *pre_chan;

        mutex_enter(&list->chan_list_mutex);
        if (list->chan_list == chan) {
                DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
                    " in chan_list", chan);
                list->chan_list = chan->next;
        } else {
                pre_chan = list->chan_list;
                while (pre_chan != NULL) {
                        if (pre_chan->next == chan) {
                                DPRINT(30, "ibd_rc_rm_from_chan_list"
                                    "(middle): found chan(%p)", chan);
                                pre_chan->next = chan->next;
                                break;
                        }
                        pre_chan = pre_chan->next;
                }
                if (pre_chan == NULL)
                        chan = NULL;
        }
        mutex_exit(&list->chan_list_mutex);
        return (chan);
}

static inline ibd_rc_chan_t *
ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
{
        ibd_rc_chan_t *rc_chan;

        mutex_enter(&list->chan_list_mutex);
        rc_chan = list->chan_list;
        if (rc_chan != NULL) {
                list->chan_list = rc_chan->next;
        }
        mutex_exit(&list->chan_list_mutex);
        return (rc_chan);
}

static int
ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
{
        ibt_mr_attr_t mem_attr;
        uint_t rc_rx_bufs_sz;

        /*
         * Allocate one big chunk for all regular rx copy bufs
         */
        rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;

        state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);

        state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
            sizeof (ibd_rwqe_t), KM_SLEEP);

        /*
         * Do one memory registration on the entire rxbuf area
         */
        mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
        mem_attr.mr_len = rc_rx_bufs_sz;
        mem_attr.mr_as = NULL;
        mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
        if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
            &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
            != IBT_SUCCESS) {
                DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
                    "failed");
                kmem_free(state->rc_srq_rwqes,
                    state->rc_srq_size * sizeof (ibd_rwqe_t));
                kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
                state->rc_srq_rx_bufs = NULL;
                state->rc_srq_rwqes = NULL;
                return (DDI_FAILURE);
        }

        return (DDI_SUCCESS);
}

static void
ibd_rc_free_srq_copybufs(ibd_state_t *state)
{
        uint_t rc_rx_buf_sz;

        /*
         * state->rc_mtu must not change between the call to
         * ibd_rc_alloc_srq_copybufs() and the call to
         * ibd_rc_free_srq_copybufs(), or this size computation will not
         * match the original allocation.
         */
        rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;

        /*
         * Unregister rxbuf mr
         */
        if (ibt_deregister_mr(state->id_hca_hdl,
            state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
                DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
                    " failed");
        }
        state->rc_srq_rx_mr_hdl = NULL;

        /*
         * Free rxbuf memory
         */
        kmem_free(state->rc_srq_rwqes,
            state->rc_srq_size * sizeof (ibd_rwqe_t));
        kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
        state->rc_srq_rwqes = NULL;
        state->rc_srq_rx_bufs = NULL;
}

/*
 * Allocate and post a certain number of SRQ receive buffers and WRs.
 */
int
ibd_rc_init_srq_list(ibd_state_t *state)
{
        ibd_rwqe_t *rwqe;
        ibt_lkey_t lkey;
        int i;
        uint_t len;
        uint8_t *bufaddr;
        ibt_srq_sizes_t srq_sizes;
        ibt_srq_sizes_t srq_real_sizes;
        ibt_status_t ret;

        srq_sizes.srq_sgl_sz = 1;
        srq_sizes.srq_wr_sz = state->id_rc_num_srq;
        ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
            state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
        if (ret != IBT_SUCCESS) {
                /*
                 * The following code is for CR 6932460 (can't configure ibd
                 * interface on 32-bit x86 systems). A 32-bit x86 system has
                 * fewer memory resources than a 64-bit one, so if the
                 * original request can't be satisfied, retry with
                 * successively smaller requests.
                 */
                len = state->id_rc_num_srq;
                while ((ret == IBT_HCA_WR_EXCEEDED) &&
                    (len >= 2 * IBD_RC_MIN_CQ_SIZE)) {
                        len = len/2;
                        srq_sizes.srq_sgl_sz = 1;
                        srq_sizes.srq_wr_sz = len;
                        ret = ibt_alloc_srq(state->id_hca_hdl,
                            IBT_SRQ_NO_FLAGS, state->id_pd_hdl, &srq_sizes,
                            &state->rc_srq_hdl, &srq_real_sizes);
                }
                if (ret != IBT_SUCCESS) {
                        DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
                            " req_sgl_sz=%d, req_wr_sz=0x%x, final_req_wr_sz="
                            "0x%x, ret=%d", srq_sizes.srq_sgl_sz,
                            srq_sizes.srq_wr_sz, len, ret);
                        return (DDI_FAILURE);
                }
                state->id_rc_num_srq = len;
                state->id_rc_num_rwqe = state->id_rc_num_srq + 1;
        }

        state->rc_srq_size = srq_real_sizes.srq_wr_sz;
        if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
                ret = ibt_free_srq(state->rc_srq_hdl);
                if (ret != IBT_SUCCESS) {
                        ibd_print_warn(state, "ibd_rc_init_srq_list: "
                            "ibt_free_srq fail, ret=%d", ret);
                }
                return (DDI_FAILURE);
        }

        /*
         * Allocate and setup the rwqe list
         */
        lkey = state->rc_srq_rx_mr_desc.md_lkey;
        rwqe = state->rc_srq_rwqes;
        bufaddr = state->rc_srq_rx_bufs;
        len = state->rc_mtu + IPOIB_GRH_SIZE;
        state->rc_srq_rwqe_list.dl_cnt = 0;
        state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
        for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
                rwqe->w_state = state;
                rwqe->w_freeing_wqe = B_FALSE;
                rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
                rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
                rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;

                if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
                    &rwqe->w_freemsg_cb)) == NULL) {
                        DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
                        rwqe->rwqe_copybuf.ic_bufaddr = NULL;
                        if (atomic_dec_32_nv(&state->id_running) != 0) {
                                cmn_err(CE_WARN, "ibd_rc_init_srq_list: "
                                    "id_running was not 1\n");
                        }
                        ibd_rc_fini_srq_list(state);
                        atomic_inc_32(&state->id_running);
                        return (DDI_FAILURE);
                }

                rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
                /* Leave IPOIB_GRH_SIZE space */
                rwqe->rwqe_copybuf.ic_sgl.ds_va =
                    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
                rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
                rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
                rwqe->w_rwr.wr_nds = 1;
                rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
                (void) ibd_rc_post_srq(state, rwqe);
        }

        mutex_enter(&state->rc_srq_free_list.dl_mutex);
        state->rc_srq_free_list.dl_head = NULL;
        state->rc_srq_free_list.dl_cnt = 0;
        mutex_exit(&state->rc_srq_free_list.dl_mutex);

        return (DDI_SUCCESS);
}

/*
 * Free the statically allocated Rx buffer list for SRQ.
 */
void
ibd_rc_fini_srq_list(ibd_state_t *state)
{
        ibd_rwqe_t *rwqe;
        int i;
        ibt_status_t ret;

        ASSERT(state->id_running == 0);
        ret = ibt_free_srq(state->rc_srq_hdl);
        if (ret != IBT_SUCCESS) {
                ibd_print_warn(state, "ibd_rc_fini_srq_list: "
                    "ibt_free_srq fail, ret=%d", ret);
        }

        mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
        rwqe = state->rc_srq_rwqes;
        for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
                if (rwqe->rwqe_im_mblk != NULL) {
                        rwqe->w_freeing_wqe = B_TRUE;
                        freemsg(rwqe->rwqe_im_mblk);
                }
        }
        mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);

        ibd_rc_free_srq_copybufs(state);
}

/* Repost the elements in state->rc_srq_free_list */
int
ibd_rc_repost_srq_free_list(ibd_state_t *state)
{
        ibd_rwqe_t *rwqe;
        ibd_wqe_t *list;
        uint_t len;

        mutex_enter(&state->rc_srq_free_list.dl_mutex);
        if (state->rc_srq_free_list.dl_head != NULL) {
                /* repost them */
                len = state->rc_mtu + IPOIB_GRH_SIZE;
                list = state->rc_srq_free_list.dl_head;
                state->rc_srq_free_list.dl_head = NULL;
                state->rc_srq_free_list.dl_cnt = 0;
                mutex_exit(&state->rc_srq_free_list.dl_mutex);
                while (list != NULL) {
                        rwqe = WQE_TO_RWQE(list);
                        if ((rwqe->rwqe_im_mblk == NULL) &&
                            ((rwqe->rwqe_im_mblk = desballoc(
                            rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
                            &rwqe->w_freemsg_cb)) == NULL)) {
                                DPRINT(40, "ibd_rc_repost_srq_free_list: "
                                    "failed in desballoc()");
                                do {
                                        ibd_rc_srq_free_rwqe(state, rwqe);
                                        list = list->w_next;
                                        rwqe = WQE_TO_RWQE(list);
                                } while (list != NULL);
                                return (DDI_FAILURE);
                        }
                        if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
                                ibd_rc_srq_free_rwqe(state, rwqe);
                        }
                        list = list->w_next;
                }
                return (DDI_SUCCESS);
        }
        mutex_exit(&state->rc_srq_free_list.dl_mutex);
        return (DDI_SUCCESS);
}

/*
 * Free an allocated recv wqe.
 */
static void
ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
        /*
         * desballoc() failed (no memory) or the posting of the rwqe
         * failed.
         *
         * The rwqe is placed on the SRQ free list so that it can be
         * reinstated later; see ibd_rc_repost_srq_free_list() above.
         */
        mutex_enter(&state->rc_srq_free_list.dl_mutex);
        state->rc_srq_free_list.dl_cnt++;
        rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
        state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
        mutex_exit(&state->rc_srq_free_list.dl_mutex);
}

static void
ibd_rc_srq_freemsg_cb(char *arg)
{
        ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
        ibd_state_t *state = rwqe->w_state;

        ASSERT(state->rc_enable_srq);

        /*
         * If the driver is stopped, just free the rwqe.
         */
        if (atomic_add_32_nv(&state->id_running, 0) == 0) {
                if (!rwqe->w_freeing_wqe) {
                        atomic_dec_32(
                            &state->rc_srq_rwqe_list.dl_bufs_outstanding);
                        DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed");
                        rwqe->rwqe_im_mblk = NULL;
                        ibd_rc_srq_free_rwqe(state, rwqe);
                }
                return;
        }

        atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding);

        ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
        ASSERT(!rwqe->w_freeing_wqe);

        /*
         * The upper layer has released the held mblk, so we have no more
         * use for the old pointer in our rwqe; allocate a fresh mblk
         * around the copy buffer and repost it.
         */
        rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
            state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
        if (rwqe->rwqe_im_mblk == NULL) {
                DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
                ibd_rc_srq_free_rwqe(state, rwqe);
                return;
        }

        if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
                ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq"
                    " failed");
                ibd_rc_srq_free_rwqe(state, rwqe);
                return;
        }
}

/*
 * Post a rwqe to the hardware and add it to the Rx list.
 */
static int
ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
        /*
         * Increment dl_cnt before posting the recv WR, so that dl_cnt is
         * guaranteed to be up to date by the time the corresponding
         * ibd_rc_process_rx() runs.
         */
        ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
        atomic_add_32(&state->rc_srq_rwqe_list.dl_cnt, 1);
        if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
            IBT_SUCCESS) {
                atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
                DPRINT(40, "ibd_rc_post_srq: ibt_post_srq() failed");
                return (DDI_FAILURE);
        }

        return (DDI_SUCCESS);
}

/*
 * Post a rwqe to the hardware and add it to the Rx list.
 */
static int
ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
{
        /*
         * Increment dl_cnt before posting the recv WR, for the same
         * reason as in ibd_rc_post_srq() above.
         */
        atomic_add_32(&chan->rx_wqe_list.dl_cnt, 1);
        if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
            IBT_SUCCESS) {
                atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
                DPRINT(40, "ibd_rc_post_rwqe: ibt_post_recv() failed");
                return (DDI_FAILURE);
        }
        return (DDI_SUCCESS);
}

static int
ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
{
        ibd_state_t *state = chan->state;
        ibt_mr_attr_t mem_attr;
        uint_t rc_rx_bufs_sz;

        /*
         * Allocate one big chunk for all regular rx copy bufs
         */
        rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;

        chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);

        chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
            sizeof (ibd_rwqe_t), KM_SLEEP);

        /*
         * Do one memory registration on the entire rxbuf area
         */
        mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
        mem_attr.mr_len = rc_rx_bufs_sz;
        mem_attr.mr_as = NULL;
        mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
        if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
            &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
                DPRINT(40, "ibd_rc_alloc_rx_copybufs: ibt_register_mr failed");
                kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
                kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
                chan->rx_bufs = NULL;
                chan->rx_rwqes = NULL;
                return (DDI_FAILURE);
        }

        return (DDI_SUCCESS);
}

static void
ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
{
        ibd_state_t *state = chan->state;
        uint_t rc_rx_buf_sz;

        ASSERT(!state->rc_enable_srq);
        ASSERT(chan->rx_rwqes != NULL);
        ASSERT(chan->rx_bufs != NULL);

        /*
         * state->rc_mtu must not change between the call to
         * ibd_rc_alloc_rx_copybufs() and the call to
         * ibd_rc_free_rx_copybufs(), or this size computation will not
         * match the original allocation.
         */
        rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;

        /*
         * Unregister rxbuf mr
         */
        if (ibt_deregister_mr(state->id_hca_hdl,
            chan->rx_mr_hdl) != IBT_SUCCESS) {
                DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
        }
        chan->rx_mr_hdl = NULL;

        /*
         * Free rxbuf memory
         */
        kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
        chan->rx_rwqes = NULL;

        kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
        chan->rx_bufs = NULL;
}
1342 
1343 /*
1344  * Post a certain number of receive buffers and WRs on a RC channel.
1345  */
1346 static int
1347 ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
1348 {
1349         ibd_state_t *state = chan->state;
1350         ibd_rwqe_t *rwqe;
1351         ibt_lkey_t lkey;
1352         int i;
1353         uint_t len;
1354         uint8_t *bufaddr;
1355 
1356         ASSERT(!state->rc_enable_srq);
1357         if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
1358                 return (DDI_FAILURE);
1359 
1360         /*
1361          * Allocate and setup the rwqe list
1362          */
1363         lkey = chan->rx_mr_desc.md_lkey;
1364         rwqe = chan->rx_rwqes;
1365         bufaddr = chan->rx_bufs;
1366         len = state->rc_mtu + IPOIB_GRH_SIZE;
1367         for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
1368                 rwqe->w_state = state;
1369                 rwqe->w_chan = chan;
1370                 rwqe->w_freeing_wqe = B_FALSE;
1371                 rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
1372                 rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1373                 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1374 
1375                 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1376                     &rwqe->w_freemsg_cb)) == NULL) {
                        DPRINT(40, "ibd_rc_init_rxlist: desballoc() failed");
1378                         rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1379                         ibd_rc_fini_rxlist(chan);
1380                         return (DDI_FAILURE);
1381                 }
1382 
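                /*
                 * Only the data portion is described to the HCA; the
                 * first IPOIB_GRH_SIZE bytes of each slot are reserved
                 * for the pseudo GRH header that ibd_rc_process_rx()
                 * fills in before the mblk is handed to GLD.
                 */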
1383                 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1384                 rwqe->rwqe_copybuf.ic_sgl.ds_va =
1385                     (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1386                 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1387                 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1388                 rwqe->w_rwr.wr_nds = 1;
1389                 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1390                 (void) ibd_rc_post_rwqe(chan, rwqe);
1391         }
1392 
1393         return (DDI_SUCCESS);
1394 }
1395 
1396 /*
 * Free the statically allocated Rx buffer list of an RC channel.
1398  */
1399 static void
1400 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
1401 {
1402         ibd_rwqe_t *rwqe;
1403         int i;
1404 
1405         if (chan->rx_bufs == NULL) {
1406                 DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
1407                 return;
1408         }
1409 
1410         /* bufs_outstanding must be 0 */
1411         ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
1412             (chan->rx_wqe_list.dl_bufs_outstanding == 0));
1413 
1414         mutex_enter(&chan->rx_wqe_list.dl_mutex);
1415         rwqe = chan->rx_rwqes;
1416         for (i = 0; i < chan->rcq_size; i++, rwqe++) {
1417                 if (rwqe->rwqe_im_mblk != NULL) {
1418                         rwqe->w_freeing_wqe = B_TRUE;
1419                         freemsg(rwqe->rwqe_im_mblk);
1420                 }
1421         }
1422         mutex_exit(&chan->rx_wqe_list.dl_mutex);
1423 
1424         ibd_rc_free_rx_copybufs(chan);
1425 }
1426 
1427 /*
1428  * Free an allocated recv wqe.
1429  */
1430 static void
1431 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1432 {
1433         /*
1434          * desballoc() failed (no memory) or the posting of rwqe failed.
1435          *
1436          * This rwqe is placed on a free list so that it
1437          * can be reinstated in future.
1438          *
1439          * NOTE: no code currently exists to reinstate
1440          * these "lost" rwqes.
1441          */
1442         mutex_enter(&chan->rx_free_list.dl_mutex);
1443         chan->rx_free_list.dl_cnt++;
1444         rwqe->rwqe_next = chan->rx_free_list.dl_head;
1445         chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
1446         mutex_exit(&chan->rx_free_list.dl_mutex);
1447 }
1448 
1449 /*
 * Processing to be done after receipt of a packet; hand the packet
 * off to GLD in the format it expects.
1452  */
1453 static void
1454 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
1455 {
1456         ibd_state_t *state = chan->state;
1457         ib_header_info_t *phdr;
1458         ipoib_hdr_t *ipibp;
1459         mblk_t *mp;
1460         mblk_t *mpc;
1461         int rxcnt;
1462         ip6_t *ip6h;
1463         int len;
1464 
1465         /*
         * Track the number of rwqes handed to the upper layer and the
         * number still available for receiving packets.
1468          */
1469         if (state->rc_enable_srq) {
1470                 rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
1471         } else {
1472                 rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
1473         }
1474 
        /*
         * It cannot be an IBA multicast packet.
         */
        ASSERT(!(wc->wc_flags & IBT_WC_GRH_PRESENT));
1479 
1480         /* For the connection reaper routine ibd_rc_conn_timeout_call() */
1481         chan->is_used = B_TRUE;
1482 
1483 #ifdef DEBUG
1484         if (rxcnt < state->id_rc_rx_rwqe_thresh) {
1485                 state->rc_rwqe_short++;
1486         }
1487 #endif
1488 
        /*
         * Pass the Rx buffer itself upstream for a large packet when
         * enough rwqes remain; otherwise copy it out and repost.
         */
1492         if ((rxcnt >= state->id_rc_rx_rwqe_thresh) &&
1493             (wc->wc_bytes_xfer > state->id_rc_rx_copy_thresh)) {
1494                 atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
1495                 atomic_inc_64(&state->rc_rcv_trans_pkt);
1496 
                /*
                 * Record how many rwqes are currently held by the
                 * upper network layer.
                 */
1501                 if (state->rc_enable_srq) {
1502                         atomic_add_32(&state->rc_srq_rwqe_list.
1503                             dl_bufs_outstanding, 1);
1504                 } else {
1505                         atomic_add_32(&chan->rx_wqe_list.
1506                             dl_bufs_outstanding, 1);
1507                 }
1508                 mp = rwqe->rwqe_im_mblk;
1509         } else {
1510                 atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
1511                 atomic_inc_64(&state->rc_rcv_copy_pkt);
1512 
1513                 if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
1514                     BPRI_HI)) == NULL) {        /* no memory */
1515                         DPRINT(40, "ibd_rc_process_rx: allocb() failed");
1516                         state->rc_rcv_alloc_fail++;
1517                         if (state->rc_enable_srq) {
1518                                 if (ibd_rc_post_srq(state, rwqe) ==
1519                                     DDI_FAILURE) {
1520                                         ibd_rc_srq_free_rwqe(state, rwqe);
1521                                 }
1522                         } else {
1523                                 if (ibd_rc_post_rwqe(chan, rwqe) ==
1524                                     DDI_FAILURE) {
1525                                         ibd_rc_free_rwqe(chan, rwqe);
1526                                 }
1527                         }
1528                         return;
1529                 }
1530 
1531                 bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
1532                     mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);
1533 
1534                 if (state->rc_enable_srq) {
1535                         if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1536                                 ibd_rc_srq_free_rwqe(state, rwqe);
1537                         }
1538                 } else {
1539                         if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1540                                 ibd_rc_free_rwqe(chan, rwqe);
1541                         }
1542                 }
1543         }
1544 
1545         ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
1546         if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
1547                 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
1548                 len = ntohs(ip6h->ip6_plen);
1549                 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1550                         /* LINTED: E_CONSTANT_CONDITION */
1551                         IBD_PAD_NSNA(ip6h, len, IBD_RECV);
1552                 }
1553         }
1554 
1555         phdr = (ib_header_info_t *)mp->b_rptr;
1556         phdr->ib_grh.ipoib_vertcflow = 0;
1557         ovbcopy(&state->id_macaddr, &phdr->ib_dst,
1558             sizeof (ipoib_mac_t));
        mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer + IPOIB_GRH_SIZE;
1560 
1561         /*
1562          * Can RC mode in IB guarantee its checksum correctness?
1563          *
1564          *      (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
1565          *          HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
1566          */
1567 
1568         /*
1569          * Make sure this is NULL or we're in trouble.
1570          */
1571         if (mp->b_next != NULL) {
1572                 ibd_print_warn(state,
1573                     "ibd_rc_process_rx: got duplicate mp from rcq?");
1574                 mp->b_next = NULL;
1575         }
1576 
        /*
         * Add this mp to the list of processed mps to send to
         * the network layer.
         */
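        /*
         * Packets are chained here and handed to mac_rx() in batches
         * of up to IBD_MAX_RX_MP_LEN to amortize the per-call cost;
         * any partial chain left over is flushed by
         * ibd_rc_rcq_handler() once it finishes polling the CQ.
         */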
1581         if (state->rc_enable_srq) {
1582                 mutex_enter(&state->rc_rx_lock);
1583                 if (state->rc_rx_mp) {
1584                         ASSERT(state->rc_rx_mp_tail != NULL);
1585                         state->rc_rx_mp_tail->b_next = mp;
1586                 } else {
1587                         ASSERT(state->rc_rx_mp_tail == NULL);
1588                         state->rc_rx_mp = mp;
1589                 }
1590 
1591                 state->rc_rx_mp_tail = mp;
1592                 state->rc_rx_mp_len++;
1593 
                if (state->rc_rx_mp_len >= IBD_MAX_RX_MP_LEN) {
1595                         mpc = state->rc_rx_mp;
1596 
1597                         state->rc_rx_mp = NULL;
1598                         state->rc_rx_mp_tail = NULL;
1599                         state->rc_rx_mp_len = 0;
1600                         mutex_exit(&state->rc_rx_lock);
1601                         mac_rx(state->id_mh, NULL, mpc);
1602                 } else {
1603                         mutex_exit(&state->rc_rx_lock);
1604                 }
1605         } else {
1606                 mutex_enter(&chan->rx_lock);
1607                 if (chan->rx_mp) {
1608                         ASSERT(chan->rx_mp_tail != NULL);
1609                         chan->rx_mp_tail->b_next = mp;
1610                 } else {
1611                         ASSERT(chan->rx_mp_tail == NULL);
1612                         chan->rx_mp = mp;
1613                 }
1614 
1615                 chan->rx_mp_tail = mp;
1616                 chan->rx_mp_len++;
1617 
                if (chan->rx_mp_len >= IBD_MAX_RX_MP_LEN) {
1619                         mpc = chan->rx_mp;
1620 
1621                         chan->rx_mp = NULL;
1622                         chan->rx_mp_tail = NULL;
1623                         chan->rx_mp_len = 0;
1624                         mutex_exit(&chan->rx_lock);
1625                         mac_rx(state->id_mh, NULL, mpc);
1626                 } else {
1627                         mutex_exit(&chan->rx_lock);
1628                 }
1629         }
1630 }
1631 
1632 /*
 * Callback code invoked from STREAMS when the recv data buffer is free
1634  * for recycling.
1635  */
1636 static void
1637 ibd_rc_freemsg_cb(char *arg)
1638 {
1639         ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1640         ibd_rc_chan_t *chan = rwqe->w_chan;
1641         ibd_state_t *state = rwqe->w_state;
1642 
1643         /*
         * If the wqe is being torn down, do not attempt recycling.
1645          */
1646         if (rwqe->w_freeing_wqe == B_TRUE) {
1647                 return;
1648         }
1649 
1650         ASSERT(!state->rc_enable_srq);
1651         ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);
1652 
1653         rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1654             state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1655         if (rwqe->rwqe_im_mblk == NULL) {
1656                 DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
1657                 ibd_rc_free_rwqe(chan, rwqe);
1658                 return;
1659         }
1660 
        /*
         * Post back to h/w. We could actually have more than
         * rcq_size WQEs on the list if there were multiple
         * ibd_rc_freemsg_cb() calls outstanding (since the lock is
         * not held the entire time). This will start getting
         * corrected over subsequent ibd_rc_freemsg_cb() calls.
         */
1668         if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1669                 ibd_rc_free_rwqe(chan, rwqe);
1670                 return;
1671         }
1672         atomic_add_32(&chan->rx_wqe_list.dl_bufs_outstanding, -1);
1673 }
1674 
1675 /*
1676  * Common code for interrupt handling as well as for polling
 * for all completed wqes while detaching.
1678  */
1679 static void
1680 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
1681 {
1682         ibd_wqe_t *wqe;
1683         ibt_wc_t *wc, *wcs;
1684         uint_t numwcs, real_numwcs;
1685         int i;
1686 
1687         wcs = chan->rx_wc;
1688         numwcs = IBD_RC_MAX_CQ_WC;
1689 
1690         while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
1691                 for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
1692                         wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
1693                         if (wc->wc_status != IBT_WC_SUCCESS) {
1694                                 chan->state->rc_rcq_err++;
                                /*
                                 * The channel is most likely being
                                 * torn down.
                                 */
1698                                 DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
1699                                     "SUCC, chan=%p", wc->wc_status, chan);
1700                                 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
1701                                         /*
1702                                          * Do not invoke Rx handler because
1703                                          * it might add buffers to the Rx pool
1704                                          * when we are trying to deinitialize.
1705                                          */
1706                                         continue;
1707                                 }
1708                         }
1709                         ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
1710                 }
1711         }
1712 }
1713 
1714 /* Receive CQ handler */
1715 /* ARGSUSED */
1716 static void
1717 ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1718 {
1719         ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
1720         ibd_state_t *state = chan->state;
1721 
1722         atomic_inc_32(&chan->rcq_invoking);
1723         ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);
1724 
1725         /*
1726          * Poll for completed entries; the CQ will not interrupt any
         * more for incoming packets until notifications are re-armed.
1728          */
1729         ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1730 
1731         /*
         * Now enable CQ notifications; all packets that arrive from
         * here on will cause new interrupts.
1734          */
1735         if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
1736             IBT_SUCCESS) {
1737                 /*
1738                  * We do not expect a failure here.
1739                  */
1740                 DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
1741         }
1742 
1743         /*
1744          * Repoll to catch all packets that might have arrived after
1745          * we finished the first poll loop and before interrupts got
1746          * armed.
1747          */
1748         ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1749 
1750         if (state->rc_enable_srq) {
1751                 mutex_enter(&state->rc_rx_lock);
1752 
1753                 if (state->rc_rx_mp != NULL) {
1754                         mblk_t *mpc;
1755                         mpc = state->rc_rx_mp;
1756 
1757                         state->rc_rx_mp = NULL;
1758                         state->rc_rx_mp_tail = NULL;
1759                         state->rc_rx_mp_len = 0;
1760 
1761                         mutex_exit(&state->rc_rx_lock);
1762                         mac_rx(state->id_mh, NULL, mpc);
1763                 } else {
1764                         mutex_exit(&state->rc_rx_lock);
1765                 }
1766         } else {
1767                 mutex_enter(&chan->rx_lock);
1768 
1769                 if (chan->rx_mp != NULL) {
1770                         mblk_t *mpc;
1771                         mpc = chan->rx_mp;
1772 
1773                         chan->rx_mp = NULL;
1774                         chan->rx_mp_tail = NULL;
1775                         chan->rx_mp_len = 0;
1776 
1777                         mutex_exit(&chan->rx_lock);
1778                         mac_rx(state->id_mh, NULL, mpc);
1779                 } else {
1780                         mutex_exit(&chan->rx_lock);
1781                 }
1782         }
1783         atomic_dec_32(&chan->rcq_invoking);
1784 }
1785 
1786 /*
 * Allocate and register the statically allocated Tx large-buffer list.
1788  */
1789 int
1790 ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
1791 {
1792         ibd_rc_tx_largebuf_t *lbufp;
1793         ibd_rc_tx_largebuf_t *tail;
1794         uint8_t *memp;
1795         ibt_mr_attr_t mem_attr;
1796         uint32_t num_swqe;
1797         size_t  mem_size;
1798         int i;
1799 
1800         num_swqe = state->id_rc_num_swqe - 1;
1801 
1802         /*
1803          * Allocate one big chunk for all Tx large copy bufs
1804          */
        /* The IPOIB_GRH_SIZE (40 byte) header is not transferred on Tx */
1806         mem_size = num_swqe * state->rc_mtu;
1807         state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);
1808 
1809         mem_attr.mr_len = mem_size;
1810         mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
1811         mem_attr.mr_as = NULL;
1812         mem_attr.mr_flags = IBT_MR_SLEEP;
1813         if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1814             &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
1815                 DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
1816                     "failed");
1817                 kmem_free(state->rc_tx_mr_bufs, mem_size);
1818                 state->rc_tx_mr_bufs = NULL;
1819                 return (DDI_FAILURE);
1820         }
1821 
1822         state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
1823             sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);
1824 
1825         /*
1826          * Set up the buf chain
1827          */
1828         memp = state->rc_tx_mr_bufs;
1829         mutex_enter(&state->rc_tx_large_bufs_lock);
1830         lbufp = state->rc_tx_largebuf_desc_base;
1831         for (i = 0; i < num_swqe; i++) {
1832                 lbufp->lb_buf = memp;
1833                 lbufp->lb_next = lbufp + 1;
1834 
1835                 tail = lbufp;
1836 
1837                 memp += state->rc_mtu;
1838                 lbufp++;
1839         }
1840         tail->lb_next = NULL;
1841 
1842         /*
1843          * Set up the buffer information in ibd state
1844          */
1845         state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
1846         state->rc_tx_largebuf_nfree = num_swqe;
1847         mutex_exit(&state->rc_tx_large_bufs_lock);
1848         return (DDI_SUCCESS);
1849 }
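
/*
 * The free list built above is consumed by the send path. A minimal
 * sketch of the take side (the actual consumer lives in the Tx path,
 * outside this section) mirrors the return path in ibd_rc_tx_cleanup():
 *
 *	mutex_enter(&state->rc_tx_large_bufs_lock);
 *	if ((lbufp = state->rc_tx_largebuf_free_head) != NULL) {
 *		state->rc_tx_largebuf_free_head = lbufp->lb_next;
 *		state->rc_tx_largebuf_nfree--;
 *	}
 *	mutex_exit(&state->rc_tx_large_bufs_lock);
 */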
1850 
1851 void
1852 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
1853 {
1854         uint32_t num_swqe;
1855 
1856         num_swqe = state->id_rc_num_swqe - 1;
1857 
1858         if (ibt_deregister_mr(state->id_hca_hdl,
1859             state->rc_tx_mr_hdl) != IBT_SUCCESS) {
1860                 DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
1861                     "failed");
1862         }
1863         state->rc_tx_mr_hdl = NULL;
1864 
1865         kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
1866         state->rc_tx_mr_bufs = NULL;
1867 
1868         kmem_free(state->rc_tx_largebuf_desc_base,
1869             num_swqe * sizeof (ibd_rc_tx_largebuf_t));
1870         state->rc_tx_largebuf_desc_base = NULL;
1871 }
1872 
1873 static int
1874 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
1875 {
1876         ibt_mr_attr_t mem_attr;
1877         ibd_state_t *state;
1878 
1879         state = chan->state;
1880         ASSERT(state != NULL);
1881 
1882         /*
1883          * Allocate one big chunk for all regular tx copy bufs
1884          */
1885         mem_attr.mr_len = chan->scq_size * state->id_rc_tx_copy_thresh;
1886 
1887         chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);
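
        /*
         * Each of the scq_size swqes owns a fixed slot of
         * id_rc_tx_copy_thresh bytes in this area (ibd_rc_init_txlist()
         * points each swqe's SGL at its slot); the send path is
         * expected to bcopy packets at or below the copy threshold
         * into their slot rather than mapping them for DMA.
         */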
1888 
1889         /*
1890          * Do one memory registration on the entire txbuf area
1891          */
1892         mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
1893         mem_attr.mr_as = NULL;
1894         mem_attr.mr_flags = IBT_MR_SLEEP;
1895         if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1896             &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
1897                 DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
1898                 ASSERT(mem_attr.mr_len ==
1899                     chan->scq_size * state->id_rc_tx_copy_thresh);
1900                 kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
1901                 chan->tx_mr_bufs = NULL;
1902                 return (DDI_FAILURE);
1903         }
1904 
1905         return (DDI_SUCCESS);
1906 }
1907 
1908 /*
 * Allocate and set up the per-channel Tx copybufs and swqe list.
1910  */
1911 static int
1912 ibd_rc_init_txlist(ibd_rc_chan_t *chan)
1913 {
1914         ibd_swqe_t *swqe;
1915         int i;
1916         ibt_lkey_t lkey;
1917         ibd_state_t *state = chan->state;
1918 
1919         if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
1920                 return (DDI_FAILURE);
1921 
1922         /*
1923          * Allocate and setup the swqe list
1924          */
1925         lkey = chan->tx_mr_desc.md_lkey;
1926         chan->tx_wqes = kmem_zalloc(chan->scq_size *
1927             sizeof (ibd_swqe_t), KM_SLEEP);
1928         swqe = chan->tx_wqes;
1929         for (i = 0; i < chan->scq_size; i++, swqe++) {
1930                 swqe->swqe_next = NULL;
1931                 swqe->swqe_im_mblk = NULL;
1932 
1933                 swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
1934                 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
1935 
1936                 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
1937                 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
1938                 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
1939                     (chan->tx_mr_bufs + i * state->id_rc_tx_copy_thresh);
1940                 swqe->w_swr.wr_trans = IBT_RC_SRV;
1941 
1942                 /* Add to list */
1943                 mutex_enter(&chan->tx_wqe_list.dl_mutex);
1944                 chan->tx_wqe_list.dl_cnt++;
1945                 swqe->swqe_next = chan->tx_wqe_list.dl_head;
1946                 chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
1947                 mutex_exit(&chan->tx_wqe_list.dl_mutex);
1948         }
1949 
1950         return (DDI_SUCCESS);
1951 }
1952 
1953 /*
1954  * Free the statically allocated Tx buffer list.
1955  */
1956 static void
1957 ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
1958 {
1959         ibd_state_t *state = chan->state;
1960         if (chan->tx_mr_hdl != NULL) {
1961                 if (ibt_deregister_mr(chan->state->id_hca_hdl,
1962                     chan->tx_mr_hdl) != IBT_SUCCESS) {
1963                         DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
1964                             "failed");
1965                 }
1966                 chan->tx_mr_hdl = NULL;
1967         }
1968 
1969         if (chan->tx_mr_bufs != NULL) {
1970                 kmem_free(chan->tx_mr_bufs, chan->scq_size *
1971                     state->id_rc_tx_copy_thresh);
1972                 chan->tx_mr_bufs = NULL;
1973         }
1974 
1975         if (chan->tx_wqes != NULL) {
1976                 kmem_free(chan->tx_wqes, chan->scq_size *
1977                     sizeof (ibd_swqe_t));
1978                 chan->tx_wqes = NULL;
1979         }
1980 }
1981 
1982 /*
 * Acquire a send wqe from the free list.
 * Returns the acquired swqe, or NULL if none is available.
1985  */
1986 ibd_swqe_t *
1987 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
1988 {
1989         ibd_swqe_t *wqe;
1990 
1991         mutex_enter(&chan->tx_rel_list.dl_mutex);
1992         if (chan->tx_rel_list.dl_head != NULL) {
                /* transfer tx_rel_list to tx_wqe_list */
1994                 chan->tx_wqe_list.dl_head =
1995                     chan->tx_rel_list.dl_head;
1996                 chan->tx_wqe_list.dl_cnt =
1997                     chan->tx_rel_list.dl_cnt;
1998                 chan->tx_wqe_list.dl_pending_sends = B_FALSE;
1999 
                /* clear tx_rel_list */
2001                 chan->tx_rel_list.dl_head = NULL;
2002                 chan->tx_rel_list.dl_cnt = 0;
2003                 mutex_exit(&chan->tx_rel_list.dl_mutex);
2004 
2005                 wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
2006                 chan->tx_wqe_list.dl_cnt -= 1;
2007                 chan->tx_wqe_list.dl_head = wqe->swqe_next;
2008         } else {        /* no free swqe */
2009                 mutex_exit(&chan->tx_rel_list.dl_mutex);
2010                 chan->tx_wqe_list.dl_pending_sends = B_TRUE;
2011                 wqe = NULL;
2012         }
2013         return (wqe);
2014 }
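
/*
 * Design note: keeping two lists (tx_wqe_list for the send path,
 * tx_rel_list for completions) lets ibd_rc_tx_cleanup() return swqes
 * without contending with senders on every packet; the lists are only
 * merged above, when the send path runs dry.
 */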
2015 
2016 /*
2017  * Release send wqe back into free list.
2018  */
2019 static void
2020 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
2021 {
2022         /*
2023          * Add back on Tx list for reuse.
2024          */
2025         swqe->swqe_next = NULL;
2026         mutex_enter(&chan->tx_rel_list.dl_mutex);
2027         chan->tx_rel_list.dl_pending_sends = B_FALSE;
2028         swqe->swqe_next = chan->tx_rel_list.dl_head;
2029         chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
2030         chan->tx_rel_list.dl_cnt++;
2031         mutex_exit(&chan->tx_rel_list.dl_mutex);
2032 }
2033 
2034 void
2035 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
2036 {
2037         uint_t          i;
2038         uint_t          num_posted;
2039         uint_t          n_wrs;
2040         ibt_status_t    ibt_status;
2041         ibt_send_wr_t   wrs[IBD_MAX_TX_POST_MULTIPLE];
2042         ibd_swqe_t      *tx_head, *elem;
2043         ibd_swqe_t      *nodes[IBD_MAX_TX_POST_MULTIPLE];
2044 
2045         /* post the one request, then check for more */
2046         ibt_status = ibt_post_send(chan->chan_hdl,
2047             &node->w_swr, 1, NULL);
2048         if (ibt_status != IBT_SUCCESS) {
                ibd_print_warn(chan->state, "ibd_rc_post_send: "
                    "posting one wr failed: ret=%d", ibt_status);
2051                 ibd_rc_tx_cleanup(node);
2052         }
2053 
2054         tx_head = NULL;
2055         for (;;) {
2056                 if (tx_head == NULL) {
2057                         mutex_enter(&chan->tx_post_lock);
2058                         tx_head = chan->tx_head;
2059                         if (tx_head == NULL) {
2060                                 chan->tx_busy = 0;
2061                                 mutex_exit(&chan->tx_post_lock);
2062                                 return;
2063                         }
2064                         chan->tx_head = NULL;
2065                         mutex_exit(&chan->tx_post_lock);
2066                 }
2067 
2068                 /*
2069                  * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
2070                  * at a time if possible, and keep posting them.
2071                  */
2072                 for (n_wrs = 0, elem = tx_head;
2073                     (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
2074                     elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
2075                         nodes[n_wrs] = elem;
2076                         wrs[n_wrs] = elem->w_swr;
2077                 }
2078                 tx_head = elem;
2079 
2080                 ASSERT(n_wrs != 0);
2081 
2082                 /*
2083                  * If posting fails for some reason, we'll never receive
2084                  * completion intimation, so we'll need to cleanup. But
2085                  * we need to make sure we don't clean up nodes whose
2086                  * wrs have been successfully posted. We assume that the
2087                  * hca driver returns on the first failure to post and
2088                  * therefore the first 'num_posted' entries don't need
2089                  * cleanup here.
2090                  */
2091                 num_posted = 0;
2092                 ibt_status = ibt_post_send(chan->chan_hdl,
2093                     wrs, n_wrs, &num_posted);
2094                 if (ibt_status != IBT_SUCCESS) {
                        ibd_print_warn(chan->state, "ibd_rc_post_send: "
2096                             "posting multiple wrs failed: "
2097                             "requested=%d, done=%d, ret=%d",
2098                             n_wrs, num_posted, ibt_status);
2099 
2100                         for (i = num_posted; i < n_wrs; i++)
2101                                 ibd_rc_tx_cleanup(nodes[i]);
2102                 }
2103         }
2104 }
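
/*
 * Note on the loop above: chan->tx_busy serializes posting so that a
 * single thread drains chan->tx_head. Other senders (in the Tx path,
 * outside this section) queue onto tx_head and return; the draining
 * thread keeps posting until the queue is empty and clears tx_busy
 * under tx_post_lock before leaving.
 */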
2105 
2106 /*
 * Common code that deals with cleanup after a successful or
2108  * erroneous transmission attempt.
2109  */
2110 void
2111 ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
2112 {
2113         ibd_ace_t *ace = swqe->w_ahandle;
2114         ibd_state_t *state;
2115 
2116         ASSERT(ace != NULL);
2117         ASSERT(ace->ac_chan != NULL);
2118 
2119         state = ace->ac_chan->state;
2120 
2121         /*
2122          * If this was a dynamic registration in ibd_send(),
2123          * deregister now.
2124          */
2125         if (swqe->swqe_im_mblk != NULL) {
2126                 ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
2127                 if (swqe->w_buftype == IBD_WQE_MAPPED) {
2128                         ibd_unmap_mem(state, swqe);
2129                 }
2130                 freemsg(swqe->swqe_im_mblk);
2131                 swqe->swqe_im_mblk = NULL;
2132         } else {
2133                 ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
2134         }
2135 
2136         if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
2137                 ibd_rc_tx_largebuf_t *lbufp;
2138 
2139                 lbufp = swqe->w_rc_tx_largebuf;
2140                 ASSERT(lbufp != NULL);
2141 
2142                 mutex_enter(&state->rc_tx_large_bufs_lock);
2143                 lbufp->lb_next = state->rc_tx_largebuf_free_head;
2144                 state->rc_tx_largebuf_free_head = lbufp;
                state->rc_tx_largebuf_nfree++;
2146                 mutex_exit(&state->rc_tx_large_bufs_lock);
2147                 swqe->w_rc_tx_largebuf = NULL;
2148         }
2149 
2151         /*
2152          * Release the send wqe for reuse.
2153          */
2154         ibd_rc_release_swqe(ace->ac_chan, swqe);
2155 
        /*
         * Drop the reference count on the AH; it can be reused
         * now for a different destination if there are no more
         * posted sends that will use it. This can be eliminated
         * if we can always associate each Tx buffer with an AH.
         * Note that ace is known to be non-NULL here; it is
         * asserted on entry to this function.
         */
2164         ibd_dec_ref_ace(state, ace);
2165 }
2166 
2167 void
2168 ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
2169 {
2170         ibd_state_t *state = chan->state;
2171         ibd_wqe_t *wqe;
2172         ibt_wc_t *wc, *wcs;
2173         ibd_ace_t *ace;
2174         uint_t numwcs, real_numwcs;
2175         int i;
2176         boolean_t encount_error;
2177 
2178         wcs = chan->tx_wc;
2179         numwcs = IBD_RC_MAX_CQ_WC;
2180         encount_error = B_FALSE;
2181 
2182         while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
2183                 for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
2184                         wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
2185                         if (wc->wc_status != IBT_WC_SUCCESS) {
2186                                 if (encount_error == B_FALSE) {
2187                                         /*
                                         * This RC channel is in an error
                                         * state; remove it.
2190                                          */
2191                                         encount_error = B_TRUE;
2192                                         mutex_enter(&state->id_ac_mutex);
2193                                         if ((chan->chan_state ==
2194                                             IBD_RC_STATE_ACT_ESTAB) &&
2195                                             (chan->state->id_link_state ==
2196                                             LINK_STATE_UP) &&
2197                                             ((ace = ibd_acache_find(state,
2198                                             &chan->ace->ac_mac, B_FALSE, 0))
2199                                             != NULL) && (ace == chan->ace)) {
2200                                                 ASSERT(ace->ac_mce == NULL);
2201                                                 INC_REF(ace, 1);
2202                                                 IBD_ACACHE_PULLOUT_ACTIVE(
2203                                                     state, ace);
2204                                                 chan->chan_state =
2205                                                     IBD_RC_STATE_ACT_CLOSING;
2206                                                 mutex_exit(&state->id_ac_mutex);
2207                                                 state->rc_reset_cnt++;
2208                                                 DPRINT(30, "ibd_rc_drain_scq: "
2209                                                     "wc_status(%d) != SUCC, "
2210                                                     "chan=%p, ace=%p, "
                                                    "link_state=%d, "
2212                                                     "reset RC channel",
2213                                                     wc->wc_status, chan,
2214                                                     chan->ace, chan->state->
2215                                                     id_link_state);
2216                                                 ibd_rc_signal_act_close(
2217                                                     state, ace);
2218                                         } else {
2219                                                 mutex_exit(&state->id_ac_mutex);
2220                                                 state->
2221                                                     rc_act_close_simultaneous++;
2222                                                 DPRINT(40, "ibd_rc_drain_scq: "
2223                                                     "wc_status(%d) != SUCC, "
                                                    "chan=%p, chan_state=%d, "
                                                    "ace=%p, link_state=%d. "
2226                                                     "other thread is closing "
2227                                                     "it", wc->wc_status, chan,
2228                                                     chan->chan_state, chan->ace,
2229                                                     chan->state->id_link_state);
2230                                         }
2231                                 }
2232                         }
2233                         ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
2234                 }
2235 
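                /*
                 * The send path may have flagged itself as blocked on
                 * a resource (RC swqes, large Tx bufs or UD swqes).
                 * Now that completions have returned some resources,
                 * call mac_tx_update() so GLD resumes transmission
                 * once the relevant free count climbs back over its
                 * threshold.
                 */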
2236                 mutex_enter(&state->id_sched_lock);
2237                 if (state->id_sched_needed == 0) {
2238                         mutex_exit(&state->id_sched_lock);
2239                 } else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
2240                         mutex_enter(&chan->tx_wqe_list.dl_mutex);
2241                         mutex_enter(&chan->tx_rel_list.dl_mutex);
2242                         if ((chan->tx_rel_list.dl_cnt +
2243                             chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
2244                                 state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
2245                                 mutex_exit(&chan->tx_rel_list.dl_mutex);
2246                                 mutex_exit(&chan->tx_wqe_list.dl_mutex);
2247                                 mutex_exit(&state->id_sched_lock);
2248                                 state->rc_swqe_mac_update++;
2249                                 mac_tx_update(state->id_mh);
2250                         } else {
2251                                 state->rc_scq_no_swqe++;
2252                                 mutex_exit(&chan->tx_rel_list.dl_mutex);
2253                                 mutex_exit(&chan->tx_wqe_list.dl_mutex);
2254                                 mutex_exit(&state->id_sched_lock);
2255                         }
2256                 } else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
2257                         mutex_enter(&state->rc_tx_large_bufs_lock);
2258                         if (state->rc_tx_largebuf_nfree >
2259                             IBD_RC_TX_FREE_THRESH) {
2260                                 ASSERT(state->rc_tx_largebuf_free_head != NULL);
2261                                 state->id_sched_needed &=
2262                                     ~IBD_RSRC_RC_TX_LARGEBUF;
2263                                 mutex_exit(&state->rc_tx_large_bufs_lock);
2264                                 mutex_exit(&state->id_sched_lock);
2265                                 state->rc_xmt_buf_mac_update++;
2266                                 mac_tx_update(state->id_mh);
2267                         } else {
2268                                 state->rc_scq_no_largebuf++;
2269                                 mutex_exit(&state->rc_tx_large_bufs_lock);
2270                                 mutex_exit(&state->id_sched_lock);
2271                         }
2272                 } else if (state->id_sched_needed & IBD_RSRC_SWQE) {
2273                         mutex_enter(&state->id_tx_list.dl_mutex);
2274                         mutex_enter(&state->id_tx_rel_list.dl_mutex);
2275                         if ((state->id_tx_list.dl_cnt +
2276                             state->id_tx_rel_list.dl_cnt)
2277                             > IBD_FREE_SWQES_THRESH) {
2278                                 state->id_sched_needed &= ~IBD_RSRC_SWQE;
2279                                 state->id_sched_cnt++;
2280                                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
2281                                 mutex_exit(&state->id_tx_list.dl_mutex);
2282                                 mutex_exit(&state->id_sched_lock);
2283                                 mac_tx_update(state->id_mh);
2284                         } else {
2285                                 mutex_exit(&state->id_tx_rel_list.dl_mutex);
2286                                 mutex_exit(&state->id_tx_list.dl_mutex);
2287                                 mutex_exit(&state->id_sched_lock);
2288                         }
2289                 } else {
2290                         mutex_exit(&state->id_sched_lock);
2291                 }
2292         }
2293 }
2294 
/* Send CQ handler; triggers ibd_rc_tx_recycle() to recycle Tx buffers */
2296 /* ARGSUSED */
2297 static void
2298 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2299 {
2300         ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2301 
2302         if (ibd_rc_tx_softintr == 1) {
2303                 mutex_enter(&chan->tx_poll_lock);
2304                 if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2305                         chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2306                         mutex_exit(&chan->tx_poll_lock);
2307                         return;
2308                 } else {
2309                         mutex_exit(&chan->tx_poll_lock);
2310                         ddi_trigger_softintr(chan->scq_softintr);
2311                 }
2312         } else
2313                 (void) ibd_rc_tx_recycle(arg);
2314 }
2315 
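/*
 * Soft interrupt / polling body for the send CQ. If a poll is already
 * in progress, ibd_rc_scq_handler() merely sets IBD_REDO_CQ_POLLING;
 * the loop below notices the flag and drains the CQ again after
 * re-arming notifications, so no completion is lost in the window
 * between the final poll and the next interrupt.
 */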
2316 static uint_t
2317 ibd_rc_tx_recycle(caddr_t arg)
2318 {
2319         ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2320         ibd_state_t *state = chan->state;
2321         int flag, redo_flag;
2322         int redo = 1;
2323 
2324         flag = IBD_CQ_POLLING;
2325         redo_flag = IBD_REDO_CQ_POLLING;
2326 
2327         mutex_enter(&chan->tx_poll_lock);
2328         if (chan->tx_poll_busy & flag) {
2329                 ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
2330                     "threads");
2331                 chan->tx_poll_busy |= redo_flag;
2332                 mutex_exit(&chan->tx_poll_lock);
2333                 return (DDI_INTR_CLAIMED);
2334         }
2335         chan->tx_poll_busy |= flag;
2336         mutex_exit(&chan->tx_poll_lock);
2337 
2338         /*
2339          * Poll for completed entries; the CQ will not interrupt any
2340          * more for completed packets.
2341          */
2342         ibd_rc_drain_scq(chan, chan->scq_hdl);
2343 
2344         /*
2345          * Now enable CQ notifications; all completions originating now
2346          * will cause new interrupts.
2347          */
2348         do {
2349                 if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
2350                     IBT_SUCCESS) {
2351                         /*
2352                          * We do not expect a failure here.
2353                          */
                        DPRINT(40, "ibd_rc_tx_recycle: ibt_enable_cq_notify()"
                            " failed");
2356                 }
2357 
2358                 ibd_rc_drain_scq(chan, chan->scq_hdl);
2359 
2360                 mutex_enter(&chan->tx_poll_lock);
2361                 if (chan->tx_poll_busy & redo_flag)
2362                         chan->tx_poll_busy &= ~redo_flag;
2363                 else {
2364                         chan->tx_poll_busy &= ~flag;
2365                         redo = 0;
2366                 }
2367                 mutex_exit(&chan->tx_poll_lock);
2368 
2369         } while (redo);
2370 
2371         return (DDI_INTR_CLAIMED);
2372 }
2373 
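/*
 * Service registrations are shared: a per-SID reference count in
 * ibd_gstate lets multiple callers listen on the same service ID while
 * registering it with IBTF only once. The registration is torn down
 * when the last reference is dropped in ibd_deregister_service().
 */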
2374 static ibt_status_t
2375 ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
2376     int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
2377 {
2378         ibd_service_t *p;
2379         ibt_status_t status;
2380 
2381         mutex_enter(&ibd_gstate.ig_mutex);
2382         for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
2383                 if (p->is_sid == sid) {
2384                         p->is_ref_cnt++;
2385                         *srv_hdl = p->is_srv_hdl;
2386                         *ret_sid = sid;
2387                         mutex_exit(&ibd_gstate.ig_mutex);
2388                         return (IBT_SUCCESS);
2389                 }
2390         }
2391         status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
2392             num_sids, srv_hdl, ret_sid);
2393         if (status == IBT_SUCCESS) {
2394                 p = kmem_alloc(sizeof (*p), KM_SLEEP);
2395                 p->is_srv_hdl = *srv_hdl;
2396                 p->is_sid = sid;
2397                 p->is_ref_cnt = 1;
2398                 p->is_link = ibd_gstate.ig_service_list;
2399                 ibd_gstate.ig_service_list = p;
2400         }
2401         mutex_exit(&ibd_gstate.ig_mutex);
2402         return (status);
2403 }
2404 
2405 static ibt_status_t
2406 ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
2407 {
2408         ibd_service_t *p, **pp;
2409         ibt_status_t status;
2410 
2411         mutex_enter(&ibd_gstate.ig_mutex);
2412         for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
2413             pp = &((*pp)->is_link)) {
2414                 p = *pp;
2415                 if (p->is_srv_hdl == srv_hdl) {      /* Found it */
2416                         if (--p->is_ref_cnt == 0) {
2417                                 status = ibt_deregister_service(
2418                                     ibd_gstate.ig_ibt_hdl, srv_hdl);
2419                                 *pp = p->is_link; /* link prev to next */
2420                                 kmem_free(p, sizeof (*p));
2421                         } else {
2422                                 status = IBT_SUCCESS;
2423                         }
2424                         mutex_exit(&ibd_gstate.ig_mutex);
2425                         return (status);
2426                 }
2427         }
2428         /* Should not ever get here */
2429         mutex_exit(&ibd_gstate.ig_mutex);
2430         return (IBT_FAILURE);
2431 }
2432 
/* Listen on the corresponding service IDs */
2434 ibt_status_t
2435 ibd_rc_listen(ibd_state_t *state)
2436 {
2437         ibt_srv_desc_t srvdesc;
2438         ib_svc_id_t ret_sid;
2439         ibt_status_t status;
2440         ib_gid_t gid;
2441 
2442         if (state->rc_listen_hdl != NULL) {
2443                 DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
2444                 return (IBT_FAILURE);
2445         }
2446 
2447         bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2448         srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2449         srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2450 
2451         /*
         * Register the service with its service ID; incoming
         * connection requests should arrive on this service ID.
2454          */
2455         status = ibd_register_service(&srvdesc,
2456             IBD_RC_QPN_TO_SID(state->id_qpnum),
2457             1, &state->rc_listen_hdl, &ret_sid);
2458         if (status != IBT_SUCCESS) {
2459                 DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
2460                     "ret=%d", status);
2461                 return (status);
2462         }
2463 
2464         gid = state->id_sgid;
2465 
2466         /* pass state as cm_private */
2467         status = ibt_bind_service(state->rc_listen_hdl,
2468             gid, NULL, state, &state->rc_listen_bind);
2469         if (status != IBT_SUCCESS) {
                DPRINT(40, "ibd_rc_listen:"
                    " failed to bind port: <%d>", status);
2472                 (void) ibd_deregister_service(state->rc_listen_hdl);
2473                 return (status);
2474         }
2475 
2476         /*
         * Legacy OFED used an incorrect service ID (one additional zero
         * digit) for many years. To interoperate with legacy OFED, we
         * also support this incorrect service ID here.
2480          */
2481         ASSERT(state->rc_listen_hdl_OFED_interop == NULL);
2482 
2483         bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2484         srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2485         srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2486 
2487         /*
         * Register the service with its service ID; incoming
         * connection requests should arrive on this service ID.
2490          */
2491         status = ibd_register_service(&srvdesc,
2492             IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
2493             1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
2494         if (status != IBT_SUCCESS) {
2495                 DPRINT(40,
2496                     "ibd_rc_listen: Service Registration for Legacy OFED "
2497                     "Failed %d", status);
2498                 (void) ibt_unbind_service(state->rc_listen_hdl,
2499                     state->rc_listen_bind);
2500                 (void) ibd_deregister_service(state->rc_listen_hdl);
2501                 return (status);
2502         }
2503 
2504         gid = state->id_sgid;
2505 
2506         /* pass state as cm_private */
2507         status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
2508             gid, NULL, state, &state->rc_listen_bind_OFED_interop);
2509         if (status != IBT_SUCCESS) {
                DPRINT(40, "ibd_rc_listen: failed to bind port: <%d> for "
2511                     "Legacy OFED listener", status);
2512                 (void) ibd_deregister_service(
2513                     state->rc_listen_hdl_OFED_interop);
2514                 (void) ibt_unbind_service(state->rc_listen_hdl,
2515                     state->rc_listen_bind);
2516                 (void) ibd_deregister_service(state->rc_listen_hdl);
2517                 return (status);
2518         }
2519 
2520         return (IBT_SUCCESS);
2521 }
2522 
2523 void
2524 ibd_rc_stop_listen(ibd_state_t *state)
2525 {
2526         int ret;
2527 
2528         /* Disable incoming connection requests */
2529         if (state->rc_listen_hdl != NULL) {
2530                 ret = ibt_unbind_all_services(state->rc_listen_hdl);
2531                 if (ret != 0) {
2532                         DPRINT(40, "ibd_rc_stop_listen:"
2533                             "ibt_unbind_all_services() failed, ret=%d", ret);
2534                 }
2535                 ret = ibd_deregister_service(state->rc_listen_hdl);
2536                 if (ret != 0) {
2537                         DPRINT(40, "ibd_rc_stop_listen:"
2538                             "ibd_deregister_service() failed, ret=%d", ret);
2539                 } else {
2540                         state->rc_listen_hdl = NULL;
2541                 }
2542         }
2543 
2544         /* Disable incoming connection requests */
2545         if (state->rc_listen_hdl_OFED_interop != NULL) {
2546                 ret = ibt_unbind_all_services(
2547                     state->rc_listen_hdl_OFED_interop);
2548                 if (ret != 0) {
2549                         DPRINT(40, "ibd_rc_stop_listen:"
2550                             "ibt_unbind_all_services() failed: %d", ret);
2551                 }
2552                 ret = ibd_deregister_service(state->rc_listen_hdl_OFED_interop);
2553                 if (ret != 0) {
2554                         DPRINT(40, "ibd_rc_stop_listen:"
2555                             "ibd_deregister_service() failed: %d", ret);
2556                 } else {
2557                         state->rc_listen_hdl_OFED_interop = NULL;
2558                 }
2559         }
2560 }
2561 
2562 void
2563 ibd_rc_close_all_chan(ibd_state_t *state)
2564 {
2565         ibd_rc_chan_t *rc_chan;
2566         ibd_ace_t *ace, *pre_ace;
2567         uint_t attempts;
2568 
2569         /* Disable all Rx routines */
2570         mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2571         rc_chan = state->rc_pass_chan_list.chan_list;
2572         while (rc_chan != NULL) {
2573                 ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
2574                 rc_chan = rc_chan->next;
2575         }
2576         mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2577 
2578         if (state->rc_enable_srq) {
2579                 attempts = 10;
2580                 while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
2581                         DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
2582                         delay(drv_usectohz(100000));
2583                         if (--attempts == 0) {
                                /*
                                 * Some buffers are still held by the
                                 * network layer and we have no choice
                                 * but to stop waiting; break out and
                                 * proceed with the teardown anyway.
                                 */
2592                                 break;
2593                         }
2594                 }
2595         }
2596 
2597         /* Close all passive RC channels */
2598         rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2599         while (rc_chan != NULL) {
2600                 (void) ibd_rc_pas_close(rc_chan, B_TRUE, B_FALSE);
2601                 rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2602         }
2603 
2604         /* Close all active RC channels */
2605         mutex_enter(&state->id_ac_mutex);
2606         state->id_ac_hot_ace = NULL;
2607         ace = list_head(&state->id_ah_active);
2608         while ((pre_ace = ace) != NULL) {
2609                 ace = list_next(&state->id_ah_active, ace);
2610                 if (pre_ace->ac_chan != NULL) {
2611                         INC_REF(pre_ace, 1);
2612                         IBD_ACACHE_PULLOUT_ACTIVE(state, pre_ace);
2613                         pre_ace->ac_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
2614                         ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
2615                             pre_ace->ac_chan);
2616                 }
2617         }
2618         mutex_exit(&state->id_ac_mutex);
2619 
2620         rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
2621         while (rc_chan != NULL) {
2622                 ace = rc_chan->ace;
2623                 ibd_rc_act_close(rc_chan, B_TRUE);
2624                 if (ace != NULL) {
2625                         mutex_enter(&state->id_ac_mutex);
2626                         ASSERT(ace->ac_ref != 0);
2627                         atomic_dec_32(&ace->ac_ref);
2628                         ace->ac_chan = NULL;
2629                         if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
2630                                 IBD_ACACHE_INSERT_FREE(state, ace);
2631                                 ace->ac_ref = 0;
2632                         } else {
2633                                 ace->ac_ref |= CYCLEVAL;
2634                                 state->rc_delay_ace_recycle++;
2635                         }
2636                         mutex_exit(&state->id_ac_mutex);
2637                 }
2638                 rc_chan = ibd_rc_rm_header_chan_list(
2639                     &state->rc_obs_act_chan_list);
2640         }
2641 
2642         attempts = 400;
2643         while (((state->rc_num_tx_chan != 0) ||
2644             (state->rc_num_rx_chan != 0)) && (attempts > 0)) {
                /* Another thread is closing a CM channel; wait for it */
2646                 delay(drv_usectohz(100000));
2647                 attempts--;
2648         }
2649 }
2650 
2651 void
ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path)
2653 {
2654         ibt_status_t status;
2655 
2656         if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2657                 return;
2658 
2659         status = ibd_rc_connect(state, ace, path,
2660             IBD_RC_SERVICE_ID_OFED_INTEROP);
2661 
2662         if (status != IBT_SUCCESS) {
                /* wait for the peer to remove the stale channel */
2664                 delay(drv_usectohz(10000));
2665                 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2666                         return;
2667                 status = ibd_rc_connect(state, ace, path,
2668                     IBD_RC_SERVICE_ID_OFED_INTEROP);
2669         }
2670 
2671         if (status != IBT_SUCCESS) {
                /* wait for the peer to remove the stale channel */
2673                 delay(drv_usectohz(10000));
2674                 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2675                         return;
2676                 (void) ibd_rc_connect(state, ace, path,
2677                     IBD_RC_SERVICE_ID);
2678         }
2679 }
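
/*
 * Note: the connection is attempted twice with the legacy OFED service
 * ID before falling back to the IETF-compliant IBD_RC_SERVICE_ID,
 * with a short delay between attempts to give the peer time to remove
 * a stale channel.
 */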
2680 
2681 /*
 * Allocate an RC channel, set ace->ac_chan to it, and open the
 * channel.
2684  */
2685 ibt_status_t
ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path,
2687     uint64_t ietf_cm_service_id)
2688 {
2689         ibt_status_t status = 0;
2690         ibt_rc_returns_t open_returns;
2691         ibt_chan_open_args_t open_args;
2692         ibd_rc_msg_hello_t hello_req_msg;
2693         ibd_rc_msg_hello_t *hello_ack_msg;
2694         ibd_rc_chan_t *chan;
2695         ibt_ud_dest_query_attr_t dest_attrs;
2696 
2697         ASSERT(ace != NULL);
2698         ASSERT(ace->ac_mce == NULL);
2699         ASSERT(ace->ac_chan == NULL);
2700 
2701         if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
2702                 DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
2703                 return (status);
2704         }
2705 
2706         ace->ac_chan = chan;
2707         chan->state = state;
2708         chan->ace = ace;
2709 
2710         ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);
2711 
2712         hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);
2713 
        /*
         * Open the channel.
         */
2717         bzero(&open_args, sizeof (ibt_chan_open_args_t));
2718         bzero(&open_returns, sizeof (ibt_rc_returns_t));
2719 
2720         open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
2721         open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;
2722 
        /*
         * Update the path record with the SID.
         */
        if ((status = ibt_query_ud_dest(ace->ac_dest, &dest_attrs))
            != IBT_SUCCESS) {
                DPRINT(40, "ibd_rc_connect: ibt_query_ud_dest() failed, "
                    "ret=%d", status);
                kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
                ibd_rc_free_chan(chan);
                ace->ac_chan = NULL;
                return (status);
        }
2732 
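        /*
         * Embed the low 24 bits of the peer's UD QPN in the CM service
         * ID, so the REQ targets the per-QP service the peer registered.
         */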
2733         path->pi_sid =
2734             ietf_cm_service_id | ((dest_attrs.ud_dst_qpn) & 0xffffff);
2735 
2737         /* pre-allocate memory for hello ack message */
2738         open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2739         open_returns.rc_priv_data = hello_ack_msg;
2740 
2741         open_args.oc_path = path;
2742 
2743         open_args.oc_path_rnr_retry_cnt = 1;
2744         open_args.oc_path_retry_cnt = 1;
2745 
2746         /* We don't do RDMA */
2747         open_args.oc_rdma_ra_out = 0;
2748         open_args.oc_rdma_ra_in = 0;
2749 
2750         hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
2751         hello_req_msg.rx_mtu = htonl(state->rc_mtu);
2752         open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2753         open_args.oc_priv_data = (void *)(&hello_req_msg);
2754 
2755         ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
2756         ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
2757         ASSERT(open_args.oc_cm_handler != NULL);
2758 
2759         status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
2760             IBT_BLOCKING, &open_args, &open_returns);
2761 
2762         if (status == IBT_SUCCESS) {
                /* Success */
                DPRINT(2, "ibd_rc_connect: ibt_open_rc_channel() succeeded");
2765                 state->rc_conn_succ++;
2766                 kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2767                 return (IBT_SUCCESS);
2768         }
2769 
2770         /* failure */
2771         (void) ibt_flush_channel(chan->chan_hdl);
2772         ibd_rc_free_chan(chan);
2773         ace->ac_chan = NULL;
2774 
        /* Failure: log the error details reported in open_returns */
        DPRINT(30, "ibd_rc_connect: ibt_open_rc_channel() failed, "
2777             "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
2778             " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
2779             hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
2780             dest_attrs.ud_dst_qpn);
2781         kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2782         return (status);
2783 }
2784 
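/*
 * Hand the closing of an active channel off to the async thread. If an
 * ibd_req_t cannot be allocated, park the channel on
 * rc_obs_act_chan_list so that a later sweep of that list can reap it.
 */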
2785 void
2786 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
2787 {
2788         ibd_req_t *req;
2789 
2790         req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2791         if (req == NULL) {
                ibd_print_warn(state, "ibd_rc_signal_act_close: failed to "
                    "allocate ibd_req_t");
2794                 mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
2795                 ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
2796                 state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
2797                 mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
2798         } else {
2799                 req->rq_ptr = ace->ac_chan;
2800                 ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
2801         }
2802 }
2803 
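/*
 * Schedule asynchronous recycling of an active ace. Only one ace may be
 * pending recycle at a time (rc_ace_recycle); if one is already pending,
 * or no ibd_req_t can be allocated, return without doing anything.
 */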
2804 void
2805 ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
2806 {
2807         ibd_req_t *req;
2808 
2809         mutex_enter(&state->rc_ace_recycle_lock);
2810         if (state->rc_ace_recycle != NULL) {
2811                 mutex_exit(&state->rc_ace_recycle_lock);
2812                 return;
2813         }
2814 
2815         req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2816         if (req == NULL) {
2817                 mutex_exit(&state->rc_ace_recycle_lock);
2818                 return;
2819         }
2820 
2821         state->rc_ace_recycle = ace;
2822         mutex_exit(&state->rc_ace_recycle_lock);
2823         ASSERT(ace->ac_mce == NULL);
2824         INC_REF(ace, 1);
2825         IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2826         req->rq_ptr = ace;
2827         ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
2828 }
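
/*
 * A minimal sketch (an assumption, not the actual handler code) of what
 * the IBD_ASYNC_RC_RECYCLE_ACE handler is expected to do with the ace
 * queued above: close its channel and release the reference taken here,
 * roughly
 *
 *	ace = (ibd_ace_t *)req->rq_ptr;
 *	ibd_rc_act_close(ace->ac_chan, B_TRUE);
 *	(then drop the INC_REF() reference and clear rc_ace_recycle)
 */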
2829 
2830 /*
2831  * Close an active channel
2832  *
2833  * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
2834  */
2835 static void
2836 ibd_rc_act_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan)
2837 {
2838         ibd_state_t *state;
2839         ibd_ace_t *ace;
2840         uint_t times;
2841         ibt_status_t ret;
2842 
2843         ASSERT(chan != NULL);
2844 
2845         chan->state->rc_act_close++;
2846         switch (chan->chan_state) {
2847         case IBD_RC_STATE_ACT_CLOSING:  /* stale, close it */
2848         case IBD_RC_STATE_ACT_ESTAB:
2849                 DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
2850                     "act_state=%d, chan=%p", chan->chan_state, chan);
2851                 chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2852                 ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
                /*
                 * Wait for the send queue to drain. The previous timeout
                 * was 50 iterations (5 seconds), but experiments showed
                 * that 5 seconds is not always enough for IBTL to return
                 * all buffers and ace->ac_ref references; 25 seconds
                 * worked reliably. Consistent with that, IBTL was observed
                 * to take about 17 seconds each time it cleaned up a stale
                 * RC channel.
                 */
2860                 times = 250;
2861                 ace = chan->ace;
2862                 ASSERT(ace != NULL);
2863                 state = chan->state;
2864                 ASSERT(state != NULL);
2865                 mutex_enter(&state->id_ac_mutex);
2866                 mutex_enter(&chan->tx_wqe_list.dl_mutex);
2867                 mutex_enter(&chan->tx_rel_list.dl_mutex);
2868                 while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
2869                     != chan->scq_size) || ((ace->ac_ref != 1) &&
2870                     (ace->ac_ref != (CYCLEVAL+1)))) {
2871                         mutex_exit(&chan->tx_rel_list.dl_mutex);
2872                         mutex_exit(&chan->tx_wqe_list.dl_mutex);
2873                         mutex_exit(&state->id_ac_mutex);
2874                         times--;
2875                         if (times == 0) {
2876                                 state->rc_act_close_not_clean++;
2877                                 DPRINT(40, "ibd_rc_act_close: dl_cnt(tx_wqe_"
2878                                     "list=%d, tx_rel_list=%d) != chan->"
2879                                     "scq_size=%d, OR ac_ref(=%d) not clean",
2880                                     chan->tx_wqe_list.dl_cnt,
2881                                     chan->tx_rel_list.dl_cnt,
2882                                     chan->scq_size, ace->ac_ref);
2883                                 break;
2884                         }
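                        /*
                         * Poll the send CQ ourselves (unless another
                         * thread is already polling it) so that pending
                         * Tx completions return SWQEs and drop their ace
                         * references.
                         */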
2885                         mutex_enter(&chan->tx_poll_lock);
2886                         if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2887                                 DPRINT(40, "ibd_rc_act_close: multiple "
2888                                     "polling threads");
2889                                 mutex_exit(&chan->tx_poll_lock);
2890                         } else {
2891                                 chan->tx_poll_busy = IBD_CQ_POLLING;
2892                                 mutex_exit(&chan->tx_poll_lock);
2893                                 ibd_rc_drain_scq(chan, chan->scq_hdl);
2894                                 mutex_enter(&chan->tx_poll_lock);
2895                                 chan->tx_poll_busy = 0;
2896                                 mutex_exit(&chan->tx_poll_lock);
2897                         }
2898                         delay(drv_usectohz(100000));
2899                         mutex_enter(&state->id_ac_mutex);
2900                         mutex_enter(&chan->tx_wqe_list.dl_mutex);
2901                         mutex_enter(&chan->tx_rel_list.dl_mutex);
2902                 }
2903                 if (times != 0) {
2904                         mutex_exit(&chan->tx_rel_list.dl_mutex);
2905                         mutex_exit(&chan->tx_wqe_list.dl_mutex);
2906                         mutex_exit(&state->id_ac_mutex);
2907                 }
2908 
2909                 ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2910                 if (is_close_rc_chan) {
2911                         ret = ibt_close_rc_channel(chan->chan_hdl,
2912                             IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
2913                             0);
2914                         if (ret != IBT_SUCCESS) {
2915                                 DPRINT(40, "ibd_rc_act_close: ibt_close_rc_"
2916                                     "channel fail, chan=%p, ret=%d",
2917                                     chan, ret);
2918                         } else {
2919                                 DPRINT(30, "ibd_rc_act_close: ibt_close_rc_"
2920                                     "channel succ, chan=%p", chan);
2921                         }
2922                 }
2923 
2924                 ibd_rc_free_chan(chan);
2925                 break;
2926         case IBD_RC_STATE_ACT_REP_RECV:
2927                 chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2928                 (void) ibt_flush_channel(chan->chan_hdl);
2929                 ibd_rc_free_chan(chan);
2930                 break;
2931         case IBD_RC_STATE_ACT_ERROR:
2932                 DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
2933                 break;
2934         default:
2935                 DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
2936                     "chan=%p", chan->chan_state, chan);
2937         }
2938 }
2939 
2940 /*
2941  * Close a passive channel
2942  *
2943  * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
2944  *
 * is_timeout_close: if B_TRUE, this function is called by the connection
 * reaper (refer to ibd_rc_conn_timeout_call). When the reaper calls
 * ibd_rc_pas_close() and finds that dl_bufs_outstanding or
 * chan->rcq_invoking is non-zero, it simply puts the channel back on the
 * passive channel list and moves on, since that may indicate the channel
 * became active again by the time its cleanup started. It is costlier to
 * do the cleanup and then re-initiate channel establishment, so it pays
 * to be conservative here.
2954  */
2955 int
2956 ibd_rc_pas_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan,
2957     boolean_t is_timeout_close)
2958 {
2959         uint_t times;
2960         ibt_status_t ret;
2961 
2962         ASSERT(chan != NULL);
2963         chan->state->rc_pas_close++;
2964 
2965         switch (chan->chan_state) {
2966         case IBD_RC_STATE_PAS_ESTAB:
2967                 if (is_timeout_close) {
2968                         if ((chan->rcq_invoking != 0) ||
2969                             ((!chan->state->rc_enable_srq) &&
2970                             (chan->rx_wqe_list.dl_bufs_outstanding > 0))) {
2971                                 if (ibd_rc_re_add_to_pas_chan_list(chan)) {
2972                                         return (DDI_FAILURE);
2973                                 }
2974                         }
2975                 }
                /*
                 * First, disable the receive CQ handler; this stops the
                 * channel from passing buffers up to higher layers. Then
                 * wait for outstanding receive buffers to be returned,
                 * giving up after 5 seconds.
                 */
2982                 ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2983                 /* Wait 0.01 second to let ibt_set_cq_handler() take effect */
2984                 delay(drv_usectohz(10000));
2985                 if (!chan->state->rc_enable_srq) {
2986                         times = 50;
2987                         while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
2988                                 delay(drv_usectohz(100000));
2989                                 if (--times == 0) {
2990                                         DPRINT(40, "ibd_rc_pas_close : "
2991                                             "reclaiming failed");
2992                                         ibd_rc_poll_rcq(chan, chan->rcq_hdl);
2993                                         ibt_set_cq_handler(chan->rcq_hdl,
2994                                             ibd_rc_rcq_handler,
2995                                             (void *)(uintptr_t)chan);
2996                                         return (DDI_FAILURE);
2997                                 }
2998                         }
2999                 }
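                /* Wait up to 5 seconds for a running rcq handler to exit */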
3000                 times = 50;
3001                 while (chan->rcq_invoking != 0) {
3002                         delay(drv_usectohz(100000));
3003                         if (--times == 0) {
3004                                 DPRINT(40, "ibd_rc_pas_close : "
3005                                     "rcq handler is being invoked");
3006                                 chan->state->rc_pas_close_rcq_invoking++;
3007                                 break;
3008                         }
3009                 }
3010                 ibt_set_cq_handler(chan->scq_hdl, 0, 0);
3011                 chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
3012                 DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
3013                     "chan_state=%d, chan=%p", chan->chan_state, chan);
3014                 if (is_close_rc_chan) {
3015                         ret = ibt_close_rc_channel(chan->chan_hdl,
3016                             IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
3017                             0);
3018                         if (ret != IBT_SUCCESS) {
3019                                 DPRINT(40, "ibd_rc_pas_close: ibt_close_rc_"
3020                                     "channel() fail, chan=%p, ret=%d", chan,
3021                                     ret);
3022                         } else {
3023                                 DPRINT(30, "ibd_rc_pas_close: ibt_close_rc_"
3024                                     "channel() succ, chan=%p", chan);
3025                         }
3026                 }
3027                 ibd_rc_free_chan(chan);
3028                 break;
3029         case IBD_RC_STATE_PAS_REQ_RECV:
3030                 chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
3031                 (void) ibt_flush_channel(chan->chan_hdl);
3032                 ibd_rc_free_chan(chan);
3033                 break;
3034         default:
3035                 DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
3036                     chan->chan_state, chan);
3037         }
3038         return (DDI_SUCCESS);
3039 }
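
/*
 * A minimal sketch (an assumption, not the actual reaper code) of how
 * the connection reaper (ibd_rc_conn_timeout_call) might drive the
 * timeout path above. A DDI_FAILURE return means the channel was put
 * back on the passive channel list, so the reaper simply moves on and
 * retries on a later pass:
 *
 *	chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
 *	while (chan != NULL) {
 *		(void) ibd_rc_pas_close(chan, B_TRUE, B_TRUE);
 *		chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
 *	}
 */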
3040 
3041 /*
3042  * Passive Side:
3043  *      Handle an incoming CM REQ from active side.
3044  *
 *      On success, this function allocates an ibd_rc_chan_t and assigns
 * it to "*ret_conn".
3047  */
3048 static ibt_cm_status_t
3049 ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
3050     ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
3051     void *ret_priv_data)
3052 {
3053         ibd_rc_msg_hello_t *hello_msg;
3054         ibd_state_t *state = (ibd_state_t *)arg;
3055         ibd_rc_chan_t *chan;
3056 
3057         if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
3058                 DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
3059                 return (IBT_CM_REJECT);
3060         }
3061 
3062         ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);
3063 
3064         ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);
3065 
3066         if (!state->rc_enable_srq) {
3067                 if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
3068                         ibd_rc_free_chan(chan);
3069                         DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
3070                             "failed");
3071                         return (IBT_CM_REJECT);
3072                 }
3073         }
3074 
3075         ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;
3076 
3077         /* We don't do RDMA */
3078         ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
3079         ret_args->cm_ret.rep.cm_rdma_ra_in = 0;
3080 
3081         ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
3082         ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);
3083 
3084         hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
3085         DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
3086             ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));
3087 
3088         hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
3089         hello_msg->reserved_qpn = htonl(state->id_qpnum);
3090         hello_msg->rx_mtu = htonl(state->rc_mtu);
3091 
3092         chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;        /* ready to receive */
3093         *ret_conn = chan;
3094 
3095         return (IBT_CM_ACCEPT);
3096 }
3097 
3098 /*
3099  * ibd_rc_handle_act_estab -- handler for connection established completion
3100  * for active side.
3101  */
3102 static ibt_cm_status_t
3103 ibd_rc_handle_act_estab(ibd_ace_t *ace)
3104 {
3105         ibt_status_t result;
3106 
3107         switch (ace->ac_chan->chan_state) {
3108                 case IBD_RC_STATE_ACT_REP_RECV:
3109                         ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
3110                         result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
3111                             IBT_NEXT_COMPLETION);
3112                         if (result != IBT_SUCCESS) {
3113                                 DPRINT(40, "ibd_rc_handle_act_estab: "
3114                                     "ibt_enable_cq_notify(rcq) "
3115                                     "failed: status %d", result);
3116                                 return (IBT_CM_REJECT);
3117                         }
3118                         break;
3119                 default:
3120                         DPRINT(40, "ibd_rc_handle_act_estab: default "
3121                             "branch, act_state=%d", ace->ac_chan->chan_state);
3122                         return (IBT_CM_REJECT);
3123         }
3124         return (IBT_CM_ACCEPT);
3125 }
3126 
3127 /*
3128  * ibd_rc_handle_pas_estab -- handler for connection established completion
3129  * for passive side.
3130  */
3131 static ibt_cm_status_t
3132 ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
3133 {
3134         ibt_status_t result;
3135 
3136         switch (chan->chan_state) {
3137                 case IBD_RC_STATE_PAS_REQ_RECV:
3138                         chan->chan_state = IBD_RC_STATE_PAS_ESTAB;
3139 
3140                         result = ibt_enable_cq_notify(chan->rcq_hdl,
3141                             IBT_NEXT_COMPLETION);
3142                         if (result != IBT_SUCCESS) {
3143                                 DPRINT(40, "ibd_rc_handle_pas_estab: "
3144                                     "ibt_enable_cq_notify(rcq) "
3145                                     "failed: status %d", result);
3146                                 return (IBT_CM_REJECT);
3147                         }
3148                         break;
3149                 default:
3150                         DPRINT(40, "ibd_rc_handle_pas_estab: default "
3151                             "branch, chan_state=%d", chan->chan_state);
3152                         return (IBT_CM_REJECT);
3153         }
3154         return (IBT_CM_ACCEPT);
3155 }
3156 
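/*
 * CM event handler for channels opened from this node (active side).
 * Handles REP receipt, connection establishment, connection close and
 * failure events for the channel attached to the given ace.
 */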
3157 /* ARGSUSED */
3158 static ibt_cm_status_t
3159 ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3160     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3161     ibt_priv_data_len_t ret_len_max)
3162 {
3163         ibt_cm_status_t result = IBT_CM_ACCEPT;
3164         ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
3165         ibd_rc_chan_t *rc_chan;
3166         ibd_state_t *state;
3167         ibd_rc_msg_hello_t *hello_ack;
3168 
3169         switch (ibt_cm_event->cm_type) {
3170         case IBT_CM_EVENT_REP_RCV:
3171                 ASSERT(ace->ac_chan != NULL);
3172                 ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
3173                 hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
                DPRINT(30, "ibd_rc_dispatch_actv_mad: hello_ack->mtu=0x%x, "
                    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
                    ntohl(hello_ack->reserved_qpn));
3177                 ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
3178                 break;
3179 
3180         case IBT_CM_EVENT_CONN_EST:
3181                 ASSERT(ace->ac_chan != NULL);
3182                 DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
3183                     "ace=%p, act_state=%d, chan=%p",
3184                     ace, ace->ac_chan->chan_state, ace->ac_chan);
3185                 result = ibd_rc_handle_act_estab(ace);
3186                 break;
3187 
3188         case IBT_CM_EVENT_CONN_CLOSED:
3189                 rc_chan = ace->ac_chan;
3190                 if (rc_chan == NULL) {
3191                         DPRINT(40, "ibd_rc_dispatch_actv_mad: "
3192                             "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
3193                         return (IBT_CM_ACCEPT);
3194                 }
3195                 state = rc_chan->state;
3196                 mutex_enter(&state->id_ac_mutex);
3197                 if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
3198                     ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
3199                     != NULL) && (ace == rc_chan->ace)) {
3200                         rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
3201                         ASSERT(ace->ac_mce == NULL);
3202                         INC_REF(ace, 1);
3203                         IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
3204                         mutex_exit(&state->id_ac_mutex);
3205                         DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3206                             "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
3207                             "reason=%d", ace, rc_chan,
3208                             ibt_cm_event->cm_event.closed);
3209                 } else {
3210                         mutex_exit(&state->id_ac_mutex);
3211                         state->rc_act_close_simultaneous++;
3212                         DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
3213                             "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
3214                             "chan_state=%d", rc_chan->chan_state);
3215                         return (IBT_CM_ACCEPT);
3216                 }
3217                 ibd_rc_act_close(rc_chan, B_FALSE);
3218                 mutex_enter(&state->id_ac_mutex);
3219                 ace->ac_chan = NULL;
3220                 ASSERT(ace->ac_ref != 0);
3221                 atomic_dec_32(&ace->ac_ref);
3222                 if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
3223                         IBD_ACACHE_INSERT_FREE(state, ace);
3224                         ace->ac_ref = 0;
3225                 } else {
3226                         ace->ac_ref |= CYCLEVAL;
3227                         state->rc_delay_ace_recycle++;
3228                 }
3229                 mutex_exit(&state->id_ac_mutex);
3230                 break;
3231 
3232         case IBT_CM_EVENT_FAILURE:
3233                 DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
3234                     "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
3235                     ace, ace->ac_chan,
3236                     ibt_cm_event->cm_event.failed.cf_code,
3237                     ibt_cm_event->cm_event.failed.cf_msg,
3238                     ibt_cm_event->cm_event.failed.cf_reason);
                /*
                 * No need to free resources here; they are freed in
                 * ibd_rc_connect().
                 */
3243                 break;
3244 
3245         case IBT_CM_EVENT_MRA_RCV:
3246                 DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
3247                 break;
3248         case IBT_CM_EVENT_LAP_RCV:
3249                 DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
3250                 break;
3251         case IBT_CM_EVENT_APR_RCV:
3252                 DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
3253                 break;
3254         default:
3255                 DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
3256                     "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
3257                 break;
3258         }
3259 
3260         return (result);
3261 }
3262 
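/*
 * CM event handler for incoming connections (passive side). A REQ
 * arrives before any channel exists; every other event is mapped to its
 * ibd_rc_chan_t through the channel private data.
 */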
3263 /* ARGSUSED */
3264 static ibt_cm_status_t
3265 ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3266     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3267     ibt_priv_data_len_t ret_len_max)
3268 {
3269         ibt_cm_status_t result = IBT_CM_ACCEPT;
3270         ibd_rc_chan_t *chan;
3271 
3272         if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
3273                 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
3274                     "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
3275                 /* Receive an incoming CM REQ from active side */
3276                 result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
3277                     ret_priv_data);
3278                 return (result);
3279         }
3280 
3281         if (ibt_cm_event->cm_channel == 0) {
3282                 DPRINT(30, "ibd_rc_dispatch_pass_mad: "
3283                     "ERROR ibt_cm_event->cm_channel == 0");
3284                 return (IBT_CM_REJECT);
3285         }
3286 
3287         chan =
3288             (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
3289         if (chan == NULL) {
                DPRINT(40, "ibd_rc_dispatch_pass_mad: chan == NULL");
3291                 return (IBT_CM_REJECT);
3292         }
3293 
3294         switch (ibt_cm_event->cm_type) {
3295         case IBT_CM_EVENT_CONN_EST:
3296                 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
3297                     "chan=%p", chan);
3298                 result = ibd_rc_handle_pas_estab(chan);
3299                 break;
3300         case IBT_CM_EVENT_CONN_CLOSED:
3301                 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
3302                     " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
3303                 chan = ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list,
3304                     chan);
3305                 if (chan != NULL)
3306                         (void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
3307                 break;
3308         case IBT_CM_EVENT_FAILURE:
3309                 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
3310                     " chan=%p, code: %d, msg: %d, reason=%d", chan,
3311                     ibt_cm_event->cm_event.failed.cf_code,
3312                     ibt_cm_event->cm_event.failed.cf_msg,
3313                     ibt_cm_event->cm_event.failed.cf_reason);
3314                 chan = ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list,
3315                     chan);
3316                 if (chan != NULL)
3317                         (void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
3318                 return (IBT_CM_ACCEPT);
3319         case IBT_CM_EVENT_MRA_RCV:
3320                 DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
3321                 break;
3322         case IBT_CM_EVENT_LAP_RCV:
3323                 DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
3324                 break;
3325         case IBT_CM_EVENT_APR_RCV:
3326                 DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
3327                 break;
3328         default:
3329                 DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
3330                     "chan=%p", ibt_cm_event->cm_type, chan);
3331                 break;
3332         }
3333 
3334         return (result);
3335 }