/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * This file is part of the Chelsio T4 support code.
 *
 * Copyright (C) 2010-2013 Chelsio Communications.  All rights reserved.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the LICENSE file included in this
 * release for licensing terms and conditions.
 */

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/atomic.h>
#include <sys/dlpi.h>
#include <sys/pattr.h>
#include <sys/strsubr.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/ethernet.h>
#include <inet/ip.h>
#include <inet/ipclassifier.h>
#include <inet/tcp.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "t4_l2t.h"

/* identifies sync vs async L2T_WRITE_REQs */
#define S_SYNC_WR       12
#define V_SYNC_WR(x)    ((x) << S_SYNC_WR)
#define F_SYNC_WR       V_SYNC_WR(1)
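
/*
 * The sync bit travels in the TID field of the L2T_WRITE_REQ and is echoed
 * back by the hardware in the matching L2T_WRITE_RPL; do_l2t_write_rpl()
 * below tests it (via F_SYNC_WR) to tell synchronous writes apart from
 * asynchronous ones.
 */
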
#define VLAN_NONE       0xfff

/*
 * jhash.h: Jenkins hash support.
 *
 * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
 *
 * http://burtleburtle.net/bob/hash/
 *
 * These are the credits from Bob's sources:
 *
 * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
 * hash(), hash2(), hash3, and mix() are externally useful functions.
 * Routines to test the hash are included if SELF_TEST is defined.
 * You can use this free for any purpose.  It has no warranty.
 */

/* NOTE: Arguments are modified. */
#define __jhash_mix(a, b, c) \
{ \
        a -= b; a -= c; a ^= (c>>13); \
        b -= c; b -= a; b ^= (a<<8); \
        c -= a; c -= b; c ^= (b>>13); \
        a -= b; a -= c; a ^= (c>>12); \
        b -= c; b -= a; b ^= (a<<16); \
        c -= a; c -= b; c ^= (b>>5); \
        a -= b; a -= c; a ^= (c>>3); \
        b -= c; b -= a; b ^= (a<<10); \
        c -= a; c -= b; c ^= (b>>15); \
}

/* The golden ratio: an arbitrary value (0x9e3779b9 = floor(2^32 / phi)) */
#define JHASH_GOLDEN_RATIO      0x9e3779b9

/*
 * Special ultra-optimized versions that know they are hashing exactly
 * 3, 2 or 1 word(s).
 *
 * NOTE: In particular the "c += length; __jhash_mix(a,b,c);" normally
 *       done at the end is not done here.
 */
static inline u32
jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
        a += JHASH_GOLDEN_RATIO;
        b += JHASH_GOLDEN_RATIO;
        c += initval;

        __jhash_mix(a, b, c);

        return (c);
}

static inline u32
jhash_2words(u32 a, u32 b, u32 initval)
{
        return (jhash_3words(a, b, 0, initval));
}

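/*
 * Recover a pointer to an enclosing structure from a pointer to one of
 * its members by subtracting the member's offset within the structure.
 */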
#ifndef container_of
#define container_of(p, s, f) ((s *)(((uint8_t *)(p)) - offsetof(s, f)))
#endif

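/*
 * Branch-prediction hints; on GCC-compatible compilers these tell the
 * compiler which way a conditional is expected to go.
 */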
#if defined(__GNUC__)
#define likely(x)       __builtin_expect((x), 1)
#define unlikely(x)     __builtin_expect((x), 0)
#else
#define likely(x)       (x)
#define unlikely(x)     (x)
#endif /* defined(__GNUC__) */

enum {
        L2T_STATE_VALID,        /* entry is up to date */
        L2T_STATE_STALE,        /* entry may be used but needs revalidation */
        L2T_STATE_RESOLVING,    /* entry needs address resolution */
        L2T_STATE_SYNC_WRITE,   /* synchronous write of entry underway */

        /* when the state is one of the below, the entry is not hashed */
        L2T_STATE_SWITCHING,    /* entry is being used by a switching filter */
        L2T_STATE_UNUSED        /* entry not in use */
};
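
/*
 * Typical entry lifecycle: t4_l2t_get() allocates an entry in
 * L2T_STATE_RESOLVING; once the neighbor layer resolves the address,
 * t4_l2t_update() writes it to hardware (L2T_STATE_SYNC_WRITE) and the
 * entry becomes L2T_STATE_VALID (do_l2t_write_rpl() also flushes any
 * queued packets and marks sync-written entries valid).  A later
 * neighbor change on an unreferenced entry demotes it to
 * L2T_STATE_STALE.
 */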

struct l2t_data {
        krwlock_t lock;
        volatile uint_t nfree;   /* number of free entries */
        struct l2t_entry *rover; /* starting point for next allocation */
        struct l2t_entry l2tab[L2T_SIZE];
};

#define SA(x)           ((struct sockaddr *)(x))
#define SIN(x)          ((struct sockaddr_in *)(x))
#define SINADDR(x)      (SIN(x)->sin_addr.s_addr)
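/* Atomic load: atomically adding 0 returns the current value unchanged. */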
#define atomic_read(x) atomic_add_int_nv(x, 0)

/*
 * Allocate a free L2T entry.
 * Must be called with l2t_data.lock held as writer.
 */
static struct l2t_entry *
alloc_l2e(struct l2t_data *d)
{
        struct l2t_entry *end, *e, **p;

        ASSERT(rw_write_held(&d->lock));

        if (!atomic_read(&d->nfree))
                return (NULL);

        /* there's definitely a free entry */
        for (e = d->rover, end = &d->l2tab[L2T_SIZE]; e != end; ++e)
                if (atomic_read(&e->refcnt) == 0)
                        goto found;

        for (e = d->l2tab; atomic_read(&e->refcnt); ++e)
                /* */;
found:
        d->rover = e + 1;
        atomic_dec_uint(&d->nfree);

        /*
         * The entry we found may be an inactive entry that is
         * presently in the hash table.  We need to remove it.
         */
        if (e->state < L2T_STATE_SWITCHING) {
                for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) {
                        if (*p == e) {
                                *p = e->next;
                                e->next = NULL;
                                break;
                        }
                }
        }

        e->state = L2T_STATE_UNUSED;
        return (e);
}

/*
 * Write an L2T entry.  Must be called with the entry locked.
 * The write may be synchronous or asynchronous.
 */
static int
write_l2e(adapter_t *sc, struct l2t_entry *e, int sync)
{
        mblk_t *m;
        struct cpl_l2t_write_req *req;

        ASSERT(MUTEX_HELD(&e->lock));

        if ((m = allocb(sizeof (*req), BPRI_HI)) == NULL)
                return (ENOMEM);

        /* LINTED: E_BAD_PTR_CAST_ALIGN */
        req = (struct cpl_l2t_write_req *)m->b_wptr;

        /* LINTED: E_CONSTANT_CONDITION */
        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx |
            V_SYNC_WR(sync) | V_TID_QID(sc->sge.fwq.abs_id)));
        req->params = htons(V_L2T_W_PORT(e->lport) | V_L2T_W_NOREPLY(!sync));
        req->l2t_idx = htons(e->idx);
        req->vlan = htons(e->vlan);
        (void) memcpy(req->dst_mac, e->dmac, sizeof (req->dst_mac));

        m->b_wptr += sizeof (*req);

        (void) t4_mgmt_tx(sc, m);

        if (sync && e->state != L2T_STATE_SWITCHING)
                e->state = L2T_STATE_SYNC_WRITE;

        return (0);
}

struct l2t_data *
t4_init_l2t(struct adapter *sc)
{
        int i;
        struct l2t_data *d;

        d = kmem_zalloc(sizeof (*d), KM_SLEEP);

        d->rover = d->l2tab;
        (void) atomic_swap_uint(&d->nfree, L2T_SIZE);
        rw_init(&d->lock, NULL, RW_DRIVER, NULL);

        for (i = 0; i < L2T_SIZE; i++) {
                /* LINTED: E_ASSIGN_NARROW_CONV */
                d->l2tab[i].idx = i;
                d->l2tab[i].state = L2T_STATE_UNUSED;
                mutex_init(&d->l2tab[i].lock, NULL, MUTEX_DRIVER, NULL);
                (void) atomic_swap_uint(&d->l2tab[i].refcnt, 0);
        }

        (void) t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl);

        return (d);
}

int
t4_free_l2t(struct l2t_data *d)
{
        int i;

        for (i = 0; i < L2T_SIZE; i++)
                mutex_destroy(&d->l2tab[i].lock);
        rw_destroy(&d->lock);
        kmem_free(d, sizeof (*d));

        return (0);
}

#ifndef TCP_OFFLOAD_DISABLE
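/*
 * Take a reference on an entry; the 0 -> 1 transition removes the entry
 * from the free pool.  This is the inverse of t4_l2t_release() below.
 */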
static inline void
l2t_hold(struct l2t_data *d, struct l2t_entry *e)
{
        if (atomic_inc_uint_nv(&e->refcnt) == 1)  /* 0 -> 1 transition */
                atomic_dec_uint(&d->nfree);
}

/*
 * To avoid having to check address families we do not allow v4 and v6
 * neighbors to be on the same hash chain.  We keep v4 entries in the first
 * half of available hash buckets and v6 in the second.
 */
enum {
        L2T_SZ_HALF = L2T_SIZE / 2,
        L2T_HASH_MASK = L2T_SZ_HALF - 1
};
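
/*
 * For example (illustrative only; the actual L2T_SIZE comes from the
 * common headers): with an L2T_SIZE of 256, arp_hash() below would yield
 * buckets 0-127 for IPv4 and ipv6_hash() buckets 128-255 for IPv6.
 */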

static inline unsigned int
arp_hash(const uint32_t *key, int ifindex)
{
        return (jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK);
}

static inline unsigned int
ipv6_hash(const uint32_t *key, int ifindex)
{
        uint32_t xor = key[0] ^ key[1] ^ key[2] ^ key[3];

        return (L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK));
}

static inline unsigned int
addr_hash(const uint32_t *addr, int addr_len, int ifindex)
{
        return (addr_len == 4 ? arp_hash(addr, ifindex) :
            ipv6_hash(addr, ifindex));
}

/*
 * Checks if an L2T entry is for the given IP/IPv6 address.  It does not check
 * whether the L2T entry and the address are of the same address family.
 * Callers ensure an address is only checked against L2T entries of the same
 * family, something made trivial by the separation of IP and IPv6 hash chains
 * mentioned above.  Returns 0 if there's a match, nonzero otherwise; the
 * XOR/OR form compares all address words without branching.
 */
static inline int
addreq(const struct l2t_entry *e, const uint32_t *addr)
{
        if (e->v6 != 0)
                return ((e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) |
                    (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]));
        return (e->addr[0] ^ addr[0]);
}

/*
 * Add a packet to an L2T entry's queue of packets awaiting resolution.
 * Must be called with the entry's lock held.
 */
static inline void
arpq_enqueue(struct l2t_entry *e, mblk_t *m)
{
        ASSERT(MUTEX_HELD(&e->lock));

        ASSERT(m->b_next == NULL);
        if (e->arpq_head != NULL)
                e->arpq_tail->b_next = m;
        else
                e->arpq_head = m;
        e->arpq_tail = m;
}

static inline void
send_pending(struct adapter *sc, struct l2t_entry *e)
{
        mblk_t *m, *next;

        ASSERT(MUTEX_HELD(&e->lock));

        for (m = e->arpq_head; m; m = next) {
                next = m->b_next;
                m->b_next = NULL;
                (void) t4_wrq_tx(sc, MBUF_EQ(m), m);
        }
        e->arpq_head = e->arpq_tail = NULL;
}

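/*
 * Transmit a packet through an L2T entry.  On the fast path (VALID or
 * STALE) the packet is sent immediately; otherwise it is queued on the
 * entry's arpq and address resolution is kicked off.
 */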
int
t4_l2t_send(struct adapter *sc, mblk_t *m, struct l2t_entry *e)
{
        sin_t *sin;
        ip2mac_t ip2m;

        ASSERT(e->v6 == 0);     /* this path handles IPv4 only */
again:
        switch (e->state) {
        case L2T_STATE_STALE:   /* entry is stale, kick off revalidation */

        /* Fall through */
        case L2T_STATE_VALID:   /* fast-path, send the packet on */
                (void) t4_wrq_tx(sc, MBUF_EQ(m), m);
                return (0);

        case L2T_STATE_RESOLVING:
        case L2T_STATE_SYNC_WRITE:
                mutex_enter(&e->lock);
                if (e->state != L2T_STATE_SYNC_WRITE &&
                    e->state != L2T_STATE_RESOLVING) {
                        /* state changed by the time we got here */
                        mutex_exit(&e->lock);
                        goto again;
                }
                arpq_enqueue(e, m);
                mutex_exit(&e->lock);

                bzero(&ip2m, sizeof (ip2m));
                sin = (sin_t *)&ip2m.ip2mac_pa;
                sin->sin_family = AF_INET;
                sin->sin_addr.s_addr = e->in_addr;
                ip2m.ip2mac_ifindex = e->ifindex;

                if (e->state == L2T_STATE_RESOLVING) {
                        (void) ip2mac(IP2MAC_RESOLVE, &ip2m, t4_l2t_update, e,
                            0);
                        if (ip2m.ip2mac_err == EINPROGRESS)
                                ASSERT(0);
                        else if (ip2m.ip2mac_err == 0)
                                t4_l2t_update(&ip2m, e);
                        else
                                ASSERT(0);
                }
        }

        return (0);
}

/*
 * Called when an L2T entry has no more users.  The entry is left in the hash
 * table since it is likely to be reused but we also bump nfree to indicate
 * that the entry can be reallocated for a different neighbor.  We also drop
 * the existing neighbor reference in case the neighbor is going away and is
 * waiting on our reference.
 *
 * Because entries can be reallocated to other neighbors once their ref count
 * drops to 0 we need to take the entry's lock to avoid races with a new
 * incarnation.
 */
static void
t4_l2e_free(struct l2t_entry *e)
{
        struct l2t_data *d;

        mutex_enter(&e->lock);
        /* LINTED: E_NOP_IF_STMT */
        if (atomic_read(&e->refcnt) == 0) {  /* hasn't been recycled */
                /*
                 * Don't need to worry about the arpq, an L2T entry can't be
                 * released if any packets are waiting for resolution as we
                 * need to be able to communicate with the device to close a
                 * connection.
                 */
        }
        mutex_exit(&e->lock);

        d = container_of(e, struct l2t_data, l2tab[e->idx]);
        atomic_inc_uint(&d->nfree);
}

void
t4_l2t_release(struct l2t_entry *e)
{
        if (atomic_dec_uint_nv(&e->refcnt) == 0)
                t4_l2e_free(e);
}

/* ARGSUSED */
int
do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss, mblk_t *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(rpl);
        unsigned int idx = tid & (L2T_SIZE - 1);

        if (unlikely(rpl->status != CPL_ERR_NONE)) {
                cxgb_printf(sc->dip, CE_WARN,
                    "Unexpected L2T_WRITE_RPL status %u for entry %u",
                    rpl->status, idx);
                return (-EINVAL);
        }

        if (tid & F_SYNC_WR) {
                struct l2t_entry *e = &sc->l2t->l2tab[idx];

                mutex_enter(&e->lock);
                if (e->state != L2T_STATE_SWITCHING) {
                        send_pending(sc, e);
                        e->state = L2T_STATE_VALID;
                }
                mutex_exit(&e->lock);
        }

        return (0);
}

/*
 * The TOE wants an L2 table entry that it can use to reach the next hop over
 * the specified port.  Produce such an entry; create one if needed.
 *
 * Note that the interface could be a pseudo-device like if_vlan, if_lagg,
 * etc. on top of the real cxgbe interface.
 */
struct l2t_entry *
t4_l2t_get(struct port_info *pi, conn_t *connp)
{
        struct l2t_entry *e;
        struct l2t_data *d = pi->adapter->l2t;
        int addr_len;
        uint32_t *addr;
        int hash;
        int index =
            connp->conn_ixa->ixa_ire->ire_ill->ill_phyint->phyint_ifindex;
        unsigned int smt_idx = pi->port_id;

        addr = (uint32_t *)&connp->conn_faddr_v4;
        addr_len = sizeof (connp->conn_faddr_v4);

        hash = addr_hash(addr, addr_len, index);

        rw_enter(&d->lock, RW_WRITER);
        for (e = d->l2tab[hash].first; e; e = e->next) {
                if (!addreq(e, addr) && e->smt_idx == smt_idx) {
                        l2t_hold(d, e);
                        goto done;
                }
        }

        /* Need to allocate a new entry */
        e = alloc_l2e(d);
        if (e != NULL) {
                mutex_enter(&e->lock);   /* avoid race with t4_l2e_free */
                e->state = L2T_STATE_RESOLVING;
                (void) memcpy(e->addr, addr, addr_len);
                e->in_addr = connp->conn_faddr_v4;
                e->ifindex = index;
                /* LINTED: E_ASSIGN_NARROW_CONV */
                e->smt_idx = smt_idx;
                /* LINTED: E_ASSIGN_NARROW_CONV */
                e->hash = hash;
                e->lport = pi->lport;
                e->arpq_head = e->arpq_tail = NULL;
                e->v6 = (addr_len == 16);
                e->sc = pi->adapter;
                (void) atomic_swap_uint(&e->refcnt, 1);
                e->vlan = VLAN_NONE;
                e->next = d->l2tab[hash].first;
                d->l2tab[hash].first = e;
                mutex_exit(&e->lock);
        } else {
                ASSERT(0);
        }

done:
        rw_exit(&d->lock);
        return (e);
}

/*
 * Called when the host's neighbor layer makes a change to some entry that is
 * loaded into the HW L2 table.
 */
void
t4_l2t_update(ip2mac_t *ip2macp, void *arg)
{
        struct l2t_entry *e = (struct l2t_entry *)arg;
        struct adapter *sc = e->sc;
        uchar_t *cp;

        if (ip2macp->ip2mac_err != 0) {
                ASSERT(0); /* Don't know what to do. Needs to be investigated */
        }

        mutex_enter(&e->lock);
        if (atomic_read(&e->refcnt) != 0)
                goto found;
        e->state = L2T_STATE_STALE;
        mutex_exit(&e->lock);

        /* The TOE has no interest in this LLE */
        return;

found:
        if (atomic_read(&e->refcnt) != 0) {
                /* Entry is referenced by at least 1 offloaded connection. */
                cp = (uchar_t *)LLADDR(&ip2macp->ip2mac_ha);
                bcopy(cp, e->dmac, 6);
                (void) write_l2e(sc, e, 1);
                e->state = L2T_STATE_VALID;
        }
        mutex_exit(&e->lock);
}
#endif /* !TCP_OFFLOAD_DISABLE */