Print this page
patch nuke-the-dbuf-hash
patch make-the-merge-easy

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dbuf.c
          +++ new/usr/src/uts/common/fs/zfs/dbuf.c
↓ open down ↓ 76 lines elided ↑ open up ↑
  77   77  /* ARGSUSED */
  78   78  static void
  79   79  dbuf_dest(void *vdb, void *unused)
  80   80  {
  81   81          dmu_buf_impl_t *db = vdb;
  82   82          mutex_destroy(&db->db_mtx);
  83   83          cv_destroy(&db->db_changed);
  84   84          refcount_destroy(&db->db_holds);
  85   85  }
  86   86  
  87      -/*
  88      - * dbuf hash table routines
  89      - */
  90      -#pragma align 64(dbuf_hash_table)
  91      -static dbuf_hash_table_t dbuf_hash_table;
  92      -
  93      -static uint64_t dbuf_hash_count;
  94      -
  95      -static uint64_t
  96      -dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
  97      -{
  98      -        uintptr_t osv = (uintptr_t)os;
  99      -        uint64_t crc = -1ULL;
 100      -
 101      -        ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 102      -        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
 103      -        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
 104      -        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
 105      -        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
 106      -        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
 107      -        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
 108      -
 109      -        crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
 110      -
 111      -        return (crc);
 112      -}
 113      -
 114      -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
 115      -
 116      -#define DBUF_EQUAL(dbuf, os, obj, level, blkid)         \
 117      -        ((dbuf)->db.db_object == (obj) &&               \
 118      -        (dbuf)->db_objset == (os) &&                    \
 119      -        (dbuf)->db_level == (level) &&                  \
 120      -        (dbuf)->db_blkid == (blkid))
 121      -
 122   87  dmu_buf_impl_t *
 123   88  dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
 124   89  {
 125      -        dbuf_hash_table_t *h = &dbuf_hash_table;
 126   90          objset_t *os = dn->dn_objset;
 127   91          uint64_t obj = dn->dn_object;
 128      -        uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 129      -        uint64_t idx = hv & h->hash_table_mask;
 130   92          dmu_buf_impl_t *db;
       93 +        dmu_buf_impl_t key;
       94 +        avl_index_t where;
 131   95  
 132      -        mutex_enter(DBUF_HASH_MUTEX(h, idx));
 133      -        for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 134      -                if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 135      -                        mutex_enter(&db->db_mtx);
 136      -                        if (db->db_state != DB_EVICTING) {
 137      -                                mutex_exit(DBUF_HASH_MUTEX(h, idx));
 138      -                                return (db);
 139      -                        }
 140      -                        mutex_exit(&db->db_mtx);
 141      -                }
 142      -        }
 143      -        mutex_exit(DBUF_HASH_MUTEX(h, idx));
 144      -        return (NULL);
 145      -}
       96 +        key.db_level = level;
       97 +        key.db_blkid = blkid;
       98 +        key.db_state = DB_SEARCH;
 146   99  
 147      -/*
 148      - * Insert an entry into the hash table.  If there is already an element
 149      - * equal to elem in the hash table, then the already existing element
 150      - * will be returned and the new element will not be inserted.
 151      - * Otherwise returns NULL.
 152      - */
 153      -static dmu_buf_impl_t *
 154      -dbuf_hash_insert(dmu_buf_impl_t *db)
 155      -{
 156      -        dbuf_hash_table_t *h = &dbuf_hash_table;
 157      -        objset_t *os = db->db_objset;
 158      -        uint64_t obj = db->db.db_object;
 159      -        int level = db->db_level;
 160      -        uint64_t blkid = db->db_blkid;
 161      -        uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 162      -        uint64_t idx = hv & h->hash_table_mask;
 163      -        dmu_buf_impl_t *dbf;
 164      -
 165      -        mutex_enter(DBUF_HASH_MUTEX(h, idx));
 166      -        for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
 167      -                if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
 168      -                        mutex_enter(&dbf->db_mtx);
 169      -                        if (dbf->db_state != DB_EVICTING) {
 170      -                                mutex_exit(DBUF_HASH_MUTEX(h, idx));
 171      -                                return (dbf);
 172      -                        }
 173      -                        mutex_exit(&dbf->db_mtx);
      100 +        mutex_enter(&dn->dn_dbufs_mtx);
      101 +        db = avl_find(&dn->dn_dbufs, &key, &where);
      102 +        ASSERT3P(db, ==, NULL);
      103 +        db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
      104 +
      105 +        for (; db; db = AVL_NEXT(&dn->dn_dbufs, db)) {
      106 +                if ((db->db_level != level) || (db->db_blkid != blkid))
      107 +                        break;
      108 +
      109 +                mutex_enter(&db->db_mtx);
      110 +                if (db->db_state != DB_EVICTING) {
      111 +                        mutex_exit(&dn->dn_dbufs_mtx);
      112 +                        return (db);
 174  113                  }
      114 +                mutex_exit(&db->db_mtx);
 175  115          }
 176  116  
 177      -        mutex_enter(&db->db_mtx);
 178      -        db->db_hash_next = h->hash_table[idx];
 179      -        h->hash_table[idx] = db;
 180      -        mutex_exit(DBUF_HASH_MUTEX(h, idx));
 181      -        atomic_inc_64(&dbuf_hash_count);
 182      -
      117 +        mutex_exit(&dn->dn_dbufs_mtx);
 183  118          return (NULL);
 184  119  }
 185  120  
 186      -/*
 187      - * Remove an entry from the hash table.  It must be in the EVICTING state.
 188      - */
 189      -static void
 190      -dbuf_hash_remove(dmu_buf_impl_t *db)
 191      -{
 192      -        dbuf_hash_table_t *h = &dbuf_hash_table;
 193      -        uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
 194      -            db->db_level, db->db_blkid);
 195      -        uint64_t idx = hv & h->hash_table_mask;
 196      -        dmu_buf_impl_t *dbf, **dbp;
 197      -
 198      -        /*
 199      -         * We musn't hold db_mtx to maintain lock ordering:
 200      -         * DBUF_HASH_MUTEX > db_mtx.
 201      -         */
 202      -        ASSERT(refcount_is_zero(&db->db_holds));
 203      -        ASSERT(db->db_state == DB_EVICTING);
 204      -        ASSERT(!MUTEX_HELD(&db->db_mtx));
 205      -
 206      -        mutex_enter(DBUF_HASH_MUTEX(h, idx));
 207      -        dbp = &h->hash_table[idx];
 208      -        while ((dbf = *dbp) != db) {
 209      -                dbp = &dbf->db_hash_next;
 210      -                ASSERT(dbf != NULL);
 211      -        }
 212      -        *dbp = db->db_hash_next;
 213      -        db->db_hash_next = NULL;
 214      -        mutex_exit(DBUF_HASH_MUTEX(h, idx));
 215      -        atomic_dec_64(&dbuf_hash_count);
 216      -}
 217      -
 218  121  static arc_evict_func_t dbuf_do_evict;
 219  122  
 220  123  static void
 221  124  dbuf_evict_user(dmu_buf_impl_t *db)
 222  125  {
 223  126          ASSERT(MUTEX_HELD(&db->db_mtx));
 224  127  
 225  128          if (db->db_level != 0 || db->db_evict_func == NULL)
 226  129                  return;
 227  130  
↓ open down ↓ 28 lines elided ↑ open up ↑
 256  159          ASSERT(db->db_buf == NULL);
 257  160          ASSERT(db->db_data_pending == NULL);
 258  161  
 259  162          dbuf_clear(db);
 260  163          dbuf_destroy(db);
 261  164  }
 262  165  
 263  166  void
 264  167  dbuf_init(void)
 265  168  {
 266      -        uint64_t hsize = 1ULL << 16;
 267      -        dbuf_hash_table_t *h = &dbuf_hash_table;
 268      -        int i;
 269      -
 270      -        /*
 271      -         * The hash table is big enough to fill all of physical memory
 272      -         * with an average 4K block size.  The table will take up
 273      -         * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
 274      -         */
 275      -        while (hsize * 4096 < physmem * PAGESIZE)
 276      -                hsize <<= 1;
 277      -
 278      -retry:
 279      -        h->hash_table_mask = hsize - 1;
 280      -        h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
 281      -        if (h->hash_table == NULL) {
 282      -                /* XXX - we should really return an error instead of assert */
 283      -                ASSERT(hsize > (1ULL << 10));
 284      -                hsize >>= 1;
 285      -                goto retry;
 286      -        }
 287      -
 288  169          dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 289  170              sizeof (dmu_buf_impl_t),
 290  171              0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 291      -
 292      -        for (i = 0; i < DBUF_MUTEXES; i++)
 293      -                mutex_init(DBUF_HASH_MUTEX(h, i), NULL, MUTEX_DEFAULT, NULL);
 294  172  }
 295  173  
 296  174  void
 297  175  dbuf_fini(void)
 298  176  {
 299      -        dbuf_hash_table_t *h = &dbuf_hash_table;
 300      -        int i;
 301      -
 302      -        for (i = 0; i < DBUF_MUTEXES; i++)
 303      -                mutex_destroy(DBUF_HASH_MUTEX(h, i));
 304      -        kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
 305  177          kmem_cache_destroy(dbuf_cache);
 306  178  }
 307  179  
 308  180  /*
 309  181   * Other stuff.
 310  182   */
 311  183  
 312  184  #ifdef ZFS_DEBUG
 313  185  static void
 314  186  dbuf_verify(dmu_buf_impl_t *db)
↓ open down ↓ 1418 lines elided ↑ open up ↑
1733 1605                  return (0);
1734 1606          }
1735 1607  }
1736 1608  
1737 1609  static dmu_buf_impl_t *
1738 1610  dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1739 1611      dmu_buf_impl_t *parent, blkptr_t *blkptr)
1740 1612  {
1741 1613          objset_t *os = dn->dn_objset;
1742 1614          dmu_buf_impl_t *db, *odb;
     1615 +        avl_index_t where;
1743 1616  
1744 1617          ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1745 1618          ASSERT(dn->dn_type != DMU_OT_NONE);
1746 1619  
1747 1620          db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1748 1621  
1749 1622          db->db_objset = os;
1750 1623          db->db.db_object = dn->dn_object;
1751 1624          db->db_level = level;
1752 1625          db->db_blkid = blkid;
↓ open down ↓ 9 lines elided ↑ open up ↑
1762 1635          db->db_immediate_evict = 0;
1763 1636          db->db_freed_in_flight = 0;
1764 1637  
1765 1638          if (blkid == DMU_BONUS_BLKID) {
1766 1639                  ASSERT3P(parent, ==, dn->dn_dbuf);
1767 1640                  db->db.db_size = DN_MAX_BONUSLEN -
1768 1641                      (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1769 1642                  ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1770 1643                  db->db.db_offset = DMU_BONUS_BLKID;
1771 1644                  db->db_state = DB_UNCACHED;
1772      -                /* the bonus dbuf is not placed in the hash table */
     1645 +                /* the bonus dbuf is not placed into the dnode's dbuf tree */
1773 1646                  arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1774 1647                  return (db);
1775 1648          } else if (blkid == DMU_SPILL_BLKID) {
1776 1649                  db->db.db_size = (blkptr != NULL) ?
1777 1650                      BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1778 1651                  db->db.db_offset = 0;
1779 1652          } else {
1780 1653                  int blocksize =
1781 1654                      db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1782 1655                  db->db.db_size = blocksize;
1783 1656                  db->db.db_offset = db->db_blkid * blocksize;
1784 1657          }
1785 1658  
1786      -        /*
1787      -         * Hold the dn_dbufs_mtx while we get the new dbuf
1788      -         * in the hash table *and* added to the dbufs list.
1789      -         * This prevents a possible deadlock with someone
1790      -         * trying to look up this dbuf before its added to the
1791      -         * dn_dbufs list.
1792      -         */
1793 1659          mutex_enter(&dn->dn_dbufs_mtx);
     1660 +        mutex_enter(&db->db_mtx);
1794 1661          db->db_state = DB_EVICTING;
1795      -        if ((odb = dbuf_hash_insert(db)) != NULL) {
     1662 +        if ((odb = avl_find(&dn->dn_dbufs, db, &where))) {
1796 1663                  /* someone else inserted it first */
     1664 +                mutex_exit(&db->db_mtx);
1797 1665                  kmem_cache_free(dbuf_cache, db);
     1666 +                mutex_enter(&odb->db_mtx);
1798 1667                  mutex_exit(&dn->dn_dbufs_mtx);
1799 1668                  return (odb);
1800 1669          }
1801      -        avl_add(&dn->dn_dbufs, db);
     1670 +        avl_insert(&dn->dn_dbufs, db, where);
1802 1671          if (db->db_level == 0 && db->db_blkid >=
1803 1672              dn->dn_unlisted_l0_blkid)
1804 1673                  dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1805 1674          db->db_state = DB_UNCACHED;
1806 1675          mutex_exit(&dn->dn_dbufs_mtx);
1807 1676          arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1808 1677  
1809 1678          if (parent && parent != dn->dn_dbuf)
1810 1679                  dbuf_add_ref(parent, db);
1811 1680  
↓ open down ↓ 51 lines elided ↑ open up ↑
1863 1732                          DB_DNODE_EXIT(db);
1864 1733                          /*
1865 1734                           * Decrementing the dbuf count means that the hold
1866 1735                           * corresponding to the removed dbuf is no longer
1867 1736                           * discounted in dnode_move(), so the dnode cannot be
1868 1737                           * moved until after we release the hold.
1869 1738                           */
1870 1739                          dnode_rele(dn, db);
1871 1740                          db->db_dnode_handle = NULL;
1872 1741                  }
1873      -                dbuf_hash_remove(db);
1874 1742          }
1875 1743          db->db_parent = NULL;
1876 1744          db->db_buf = NULL;
1877 1745  
1878 1746          ASSERT(db->db.db_data == NULL);
1879      -        ASSERT(db->db_hash_next == NULL);
1880 1747          ASSERT(db->db_blkptr == NULL);
1881 1748          ASSERT(db->db_data_pending == NULL);
1882 1749  
1883 1750          kmem_cache_free(dbuf_cache, db);
1884 1751          arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1885 1752  }
1886 1753  
1887 1754  void
1888 1755  dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1889 1756  {
↓ open down ↓ 999 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX