Print this page
patch nuke-the-dbuf-hash
patch make-the-merge-easy

@@ -82,141 +82,44 @@
         mutex_destroy(&db->db_mtx);
         cv_destroy(&db->db_changed);
         refcount_destroy(&db->db_holds);
 }
 
-/*
- * dbuf hash table routines
- */
-#pragma align 64(dbuf_hash_table)
-static dbuf_hash_table_t dbuf_hash_table;
-
-static uint64_t dbuf_hash_count;
-
-static uint64_t
-dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
-{
-        uintptr_t osv = (uintptr_t)os;
-        uint64_t crc = -1ULL;
-
-        ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
-        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
-        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
-        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
-        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
-        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
-
-        crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
-
-        return (crc);
-}
-
-#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
-
-#define DBUF_EQUAL(dbuf, os, obj, level, blkid)         \
-        ((dbuf)->db.db_object == (obj) &&               \
-        (dbuf)->db_objset == (os) &&                    \
-        (dbuf)->db_level == (level) &&                  \
-        (dbuf)->db_blkid == (blkid))
-
 dmu_buf_impl_t *
 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
 {
-        dbuf_hash_table_t *h = &dbuf_hash_table;
         objset_t *os = dn->dn_objset;
         uint64_t obj = dn->dn_object;
-        uint64_t hv = DBUF_HASH(os, obj, level, blkid);
-        uint64_t idx = hv & h->hash_table_mask;
         dmu_buf_impl_t *db;
+        dmu_buf_impl_t key;
+        avl_index_t where;
+
+        key.db_level = level;
+        key.db_blkid = blkid;
+        key.db_state = DB_SEARCH;
+
+        mutex_enter(&dn->dn_dbufs_mtx);
+        db = avl_find(&dn->dn_dbufs, &key, &where);
+        ASSERT3P(db, ==, NULL);
+        db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+        for (; db; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+                if ((db->db_level != level) || (db->db_blkid != blkid))
+                        break;
 
-        mutex_enter(DBUF_HASH_MUTEX(h, idx));
-        for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
-                if (DBUF_EQUAL(db, os, obj, level, blkid)) {
                         mutex_enter(&db->db_mtx);
                         if (db->db_state != DB_EVICTING) {
-                                mutex_exit(DBUF_HASH_MUTEX(h, idx));
+                        mutex_exit(&dn->dn_dbufs_mtx);
                                 return (db);
                         }
                         mutex_exit(&db->db_mtx);
                 }
-        }
-        mutex_exit(DBUF_HASH_MUTEX(h, idx));
-        return (NULL);
-}
-
-/*
- * Insert an entry into the hash table.  If there is already an element
- * equal to elem in the hash table, then the already existing element
- * will be returned and the new element will not be inserted.
- * Otherwise returns NULL.
- */
-static dmu_buf_impl_t *
-dbuf_hash_insert(dmu_buf_impl_t *db)
-{
-        dbuf_hash_table_t *h = &dbuf_hash_table;
-        objset_t *os = db->db_objset;
-        uint64_t obj = db->db.db_object;
-        int level = db->db_level;
-        uint64_t blkid = db->db_blkid;
-        uint64_t hv = DBUF_HASH(os, obj, level, blkid);
-        uint64_t idx = hv & h->hash_table_mask;
-        dmu_buf_impl_t *dbf;
-
-        mutex_enter(DBUF_HASH_MUTEX(h, idx));
-        for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
-                if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
-                        mutex_enter(&dbf->db_mtx);
-                        if (dbf->db_state != DB_EVICTING) {
-                                mutex_exit(DBUF_HASH_MUTEX(h, idx));
-                                return (dbf);
-                        }
-                        mutex_exit(&dbf->db_mtx);
-                }
-        }
-
-        mutex_enter(&db->db_mtx);
-        db->db_hash_next = h->hash_table[idx];
-        h->hash_table[idx] = db;
-        mutex_exit(DBUF_HASH_MUTEX(h, idx));
-        atomic_inc_64(&dbuf_hash_count);
 
+        mutex_exit(&dn->dn_dbufs_mtx);
         return (NULL);
 }
 
-/*
- * Remove an entry from the hash table.  It must be in the EVICTING state.
- */
-static void
-dbuf_hash_remove(dmu_buf_impl_t *db)
-{
-        dbuf_hash_table_t *h = &dbuf_hash_table;
-        uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
-            db->db_level, db->db_blkid);
-        uint64_t idx = hv & h->hash_table_mask;
-        dmu_buf_impl_t *dbf, **dbp;
-
-        /*
-         * We musn't hold db_mtx to maintain lock ordering:
-         * DBUF_HASH_MUTEX > db_mtx.
-         */
-        ASSERT(refcount_is_zero(&db->db_holds));
-        ASSERT(db->db_state == DB_EVICTING);
-        ASSERT(!MUTEX_HELD(&db->db_mtx));
-
-        mutex_enter(DBUF_HASH_MUTEX(h, idx));
-        dbp = &h->hash_table[idx];
-        while ((dbf = *dbp) != db) {
-                dbp = &dbf->db_hash_next;
-                ASSERT(dbf != NULL);
-        }
-        *dbp = db->db_hash_next;
-        db->db_hash_next = NULL;
-        mutex_exit(DBUF_HASH_MUTEX(h, idx));
-        atomic_dec_64(&dbuf_hash_count);
-}
-
 static arc_evict_func_t dbuf_do_evict;
 
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {

@@ -261,49 +164,18 @@
 }
 
 void
 dbuf_init(void)
 {
-        uint64_t hsize = 1ULL << 16;
-        dbuf_hash_table_t *h = &dbuf_hash_table;
-        int i;
-
-        /*
-         * The hash table is big enough to fill all of physical memory
-         * with an average 4K block size.  The table will take up
-         * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
-         */
-        while (hsize * 4096 < physmem * PAGESIZE)
-                hsize <<= 1;
-
-retry:
-        h->hash_table_mask = hsize - 1;
-        h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
-        if (h->hash_table == NULL) {
-                /* XXX - we should really return an error instead of assert */
-                ASSERT(hsize > (1ULL << 10));
-                hsize >>= 1;
-                goto retry;
-        }
-
         dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
             sizeof (dmu_buf_impl_t),
             0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
-
-        for (i = 0; i < DBUF_MUTEXES; i++)
-                mutex_init(DBUF_HASH_MUTEX(h, i), NULL, MUTEX_DEFAULT, NULL);
 }
 
 void
 dbuf_fini(void)
 {
-        dbuf_hash_table_t *h = &dbuf_hash_table;
-        int i;
-
-        for (i = 0; i < DBUF_MUTEXES; i++)
-                mutex_destroy(DBUF_HASH_MUTEX(h, i));
-        kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
         kmem_cache_destroy(dbuf_cache);
 }
 
 /*
  * Other stuff.

@@ -1738,10 +1610,11 @@
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     dmu_buf_impl_t *parent, blkptr_t *blkptr)
 {
         objset_t *os = dn->dn_objset;
         dmu_buf_impl_t *db, *odb;
+        avl_index_t where;
 
         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
         ASSERT(dn->dn_type != DMU_OT_NONE);
 
         db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

@@ -1767,11 +1640,11 @@
                 db->db.db_size = DN_MAX_BONUSLEN -
                     (dn->dn_nblkptr-1) * sizeof (blkptr_t);
                 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
                 db->db.db_offset = DMU_BONUS_BLKID;
                 db->db_state = DB_UNCACHED;
-                /* the bonus dbuf is not placed in the hash table */
+                /* the bonus dbuf is not placed into the dnode's dbuf tree */
                 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
                 return (db);
         } else if (blkid == DMU_SPILL_BLKID) {
                 db->db.db_size = (blkptr != NULL) ?
                     BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;

@@ -1781,26 +1654,22 @@
                     db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
                 db->db.db_size = blocksize;
                 db->db.db_offset = db->db_blkid * blocksize;
         }
 
-        /*
-         * Hold the dn_dbufs_mtx while we get the new dbuf
-         * in the hash table *and* added to the dbufs list.
-         * This prevents a possible deadlock with someone
-         * trying to look up this dbuf before its added to the
-         * dn_dbufs list.
-         */
         mutex_enter(&dn->dn_dbufs_mtx);
+        mutex_enter(&db->db_mtx);
         db->db_state = DB_EVICTING;
-        if ((odb = dbuf_hash_insert(db)) != NULL) {
+        if ((odb = avl_find(&dn->dn_dbufs, db, &where))) {
                 /* someone else inserted it first */
+                mutex_exit(&db->db_mtx);
                 kmem_cache_free(dbuf_cache, db);
+                mutex_enter(&odb->db_mtx);
                 mutex_exit(&dn->dn_dbufs_mtx);
                 return (odb);
         }
-        avl_add(&dn->dn_dbufs, db);
+        avl_insert(&dn->dn_dbufs, db, where);
         if (db->db_level == 0 && db->db_blkid >=
             dn->dn_unlisted_l0_blkid)
                 dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
         db->db_state = DB_UNCACHED;
         mutex_exit(&dn->dn_dbufs_mtx);

@@ -1868,17 +1737,15 @@
                          * moved until after we release the hold.
                          */
                         dnode_rele(dn, db);
                         db->db_dnode_handle = NULL;
                 }
-                dbuf_hash_remove(db);
         }
         db->db_parent = NULL;
         db->db_buf = NULL;
 
         ASSERT(db->db.db_data == NULL);
-        ASSERT(db->db_hash_next == NULL);
         ASSERT(db->db_blkptr == NULL);
         ASSERT(db->db_data_pending == NULL);
 
         kmem_cache_free(dbuf_cache, db);
         arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);