1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25  */
  26 
  27 /*
  28  * This file contains the top half of the zfs directory structure
  29  * implementation. The bottom half is in zap_leaf.c.
  30  *
  31  * The zdir is an extendable hash data structure. There is a table of
  32  * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
  33  * each a constant size and hold a variable number of directory entries.
  34  * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
  35  *
 * The pointer table holds a power-of-2 number of pointers
 * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
 * by the pointer at index i in the table holds entries whose hash value
 * has a zd_prefix_len-bit prefix of i.
  40  */
  41 
  42 #include <sys/spa.h>
  43 #include <sys/dmu.h>
  44 #include <sys/zfs_context.h>
  45 #include <sys/zfs_znode.h>
  46 #include <sys/fs/zfs.h>
  47 #include <sys/zap.h>
  48 #include <sys/refcount.h>
  49 #include <sys/zap_impl.h>
  50 #include <sys/zap_leaf.h>
  51 
/* Default log2 of the fat-zap block size. */
int fzap_default_block_shift = 14; /* 16k blocksize */

/*
 * NOTE(review): presumably zap_f_phys() is defined "inline" in a header;
 * this extern declaration would then make this file provide its
 * out-of-line definition -- confirm against the header.
 */
extern inline zap_phys_t *zap_f_phys(zap_t *zap);

/* Forward declaration: used by the table-growing code before its definition. */
static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
  57 
  58 void
  59 fzap_byteswap(void *vbuf, size_t size)
  60 {
  61         uint64_t block_type;
  62 
  63         block_type = *(uint64_t *)vbuf;
  64 
  65         if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
  66                 zap_leaf_byteswap(vbuf, size);
  67         else {
  68                 /* it's a ptrtbl block */
  69                 byteswap_uint64_array(vbuf, size);
  70         }
  71 }
  72 
/*
 * Initialize "zap" as an empty fat zap.
 *
 * The header block (zap->zap_dbuf) is zeroed and rewritten as a
 * ZBT_HEADER block whose embedded pointer table points every entry at
 * block 1, and block 1 is initialized as the first (empty) leaf.
 *
 * zap   - the zap to convert; zap_rwlock must be held as writer.
 * tx    - transaction used to dirty block 1.
 * flags - stored verbatim in the header's zap_flags.
 */
void
fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
{
        dmu_buf_t *db;
        zap_leaf_t *l;
        int i;
        zap_phys_t *zp;

        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
        zap->zap_ismicro = FALSE;

        /* switch the header dbuf's eviction callbacks */
        zap->zap_dbu.dbu_evict_func_prep = NULL;
        zap->zap_dbu.dbu_evict_func = zap_evict;

        mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
        /* block shift is log2 of the header block's size */
        zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;

        zp = zap_f_phys(zap);
        /*
         * explicitly zero it since it might be coming from an
         * initialized microzap
         */
        bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
        zp->zap_block_type = ZBT_HEADER;
        zp->zap_magic = ZAP_MAGIC;

        zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);

        zp->zap_freeblk = 2;         /* block 1 will be the first leaf */
        zp->zap_num_leafs = 1;
        zp->zap_num_entries = 0;
        /* salt and normalization flags are carried over from the in-core zap */
        zp->zap_salt = zap->zap_salt;
        zp->zap_normflags = zap->zap_normflags;
        zp->zap_flags = flags;

        /* block 1 will be the first leaf */
        for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
                ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;

        /*
         * set up block 1 - the first leaf
         * (byte offset 1<<blockshift is the start of block 1)
         */
        VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
            1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
        dmu_buf_will_dirty(db, tx);

        /*
         * A throw-away zap_leaf_t is used only so zap_leaf_init() has
         * something to operate on; it is freed again right below and is
         * never registered as the dbuf's user here.
         */
        l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
        l->l_dbuf = db;

        zap_leaf_init(l, zp->zap_normflags != 0);

        kmem_free(l, sizeof (zap_leaf_t));
        dmu_buf_rele(db, FTAG);
}
 127 
 128 static int
 129 zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
 130 {
 131         if (RW_WRITE_HELD(&zap->zap_rwlock))
 132                 return (1);
 133         if (rw_tryupgrade(&zap->zap_rwlock)) {
 134                 dmu_buf_will_dirty(zap->zap_dbuf, tx);
 135                 return (1);
 136         }
 137         return (0);
 138 }
 139 
 140 /*
 141  * Generic routines for dealing with the pointer & cookie tables.
 142  */
 143 
 144 static int
 145 zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
 146     void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
 147     dmu_tx_t *tx)
 148 {
 149         uint64_t b, newblk;
 150         dmu_buf_t *db_old, *db_new;
 151         int err;
 152         int bs = FZAP_BLOCK_SHIFT(zap);
 153         int hepb = 1<<(bs-4);
 154         /* hepb = half the number of entries in a block */
 155 
 156         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 157         ASSERT(tbl->zt_blk != 0);
 158         ASSERT(tbl->zt_numblks > 0);
 159 
 160         if (tbl->zt_nextblk != 0) {
 161                 newblk = tbl->zt_nextblk;
 162         } else {
 163                 newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 164                 tbl->zt_nextblk = newblk;
 165                 ASSERT0(tbl->zt_blks_copied);
 166                 dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
 167                     tbl->zt_blk << bs, tbl->zt_numblks << bs,
 168                     ZIO_PRIORITY_SYNC_READ);
 169         }
 170 
 171         /*
 172          * Copy the ptrtbl from the old to new location.
 173          */
 174 
 175         b = tbl->zt_blks_copied;
 176         err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 177             (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
 178         if (err)
 179                 return (err);
 180 
 181         /* first half of entries in old[b] go to new[2*b+0] */
 182         VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 183             (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 184         dmu_buf_will_dirty(db_new, tx);
 185         transfer_func(db_old->db_data, db_new->db_data, hepb);
 186         dmu_buf_rele(db_new, FTAG);
 187 
 188         /* second half of entries in old[b] go to new[2*b+1] */
 189         VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
 190             (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 191         dmu_buf_will_dirty(db_new, tx);
 192         transfer_func((uint64_t *)db_old->db_data + hepb,
 193             db_new->db_data, hepb);
 194         dmu_buf_rele(db_new, FTAG);
 195 
 196         dmu_buf_rele(db_old, FTAG);
 197 
 198         tbl->zt_blks_copied++;
 199 
 200         dprintf("copied block %llu of %llu\n",
 201             tbl->zt_blks_copied, tbl->zt_numblks);
 202 
 203         if (tbl->zt_blks_copied == tbl->zt_numblks) {
 204                 (void) dmu_free_range(zap->zap_objset, zap->zap_object,
 205                     tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
 206 
 207                 tbl->zt_blk = newblk;
 208                 tbl->zt_numblks *= 2;
 209                 tbl->zt_shift++;
 210                 tbl->zt_nextblk = 0;
 211                 tbl->zt_blks_copied = 0;
 212 
 213                 dprintf("finished; numblocks now %llu (%lluk entries)\n",
 214                     tbl->zt_numblks, 1<<(tbl->zt_shift-10));
 215         }
 216 
 217         return (0);
 218 }
 219 
 220 static int
 221 zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
 222     dmu_tx_t *tx)
 223 {
 224         int err;
 225         uint64_t blk, off;
 226         int bs = FZAP_BLOCK_SHIFT(zap);
 227         dmu_buf_t *db;
 228 
 229         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 230         ASSERT(tbl->zt_blk != 0);
 231 
 232         dprintf("storing %llx at index %llx\n", val, idx);
 233 
 234         blk = idx >> (bs-3);
 235         off = idx & ((1<<(bs-3))-1);
 236 
 237         err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 238             (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 239         if (err)
 240                 return (err);
 241         dmu_buf_will_dirty(db, tx);
 242 
 243         if (tbl->zt_nextblk != 0) {
 244                 uint64_t idx2 = idx * 2;
 245                 uint64_t blk2 = idx2 >> (bs-3);
 246                 uint64_t off2 = idx2 & ((1<<(bs-3))-1);
 247                 dmu_buf_t *db2;
 248 
 249                 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 250                     (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
 251                     DMU_READ_NO_PREFETCH);
 252                 if (err) {
 253                         dmu_buf_rele(db, FTAG);
 254                         return (err);
 255                 }
 256                 dmu_buf_will_dirty(db2, tx);
 257                 ((uint64_t *)db2->db_data)[off2] = val;
 258                 ((uint64_t *)db2->db_data)[off2+1] = val;
 259                 dmu_buf_rele(db2, FTAG);
 260         }
 261 
 262         ((uint64_t *)db->db_data)[off] = val;
 263         dmu_buf_rele(db, FTAG);
 264 
 265         return (0);
 266 }
 267 
 268 static int
 269 zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
 270 {
 271         uint64_t blk, off;
 272         int err;
 273         dmu_buf_t *db;
 274         int bs = FZAP_BLOCK_SHIFT(zap);
 275 
 276         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 277 
 278         blk = idx >> (bs-3);
 279         off = idx & ((1<<(bs-3))-1);
 280 
 281         err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 282             (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 283         if (err)
 284                 return (err);
 285         *valp = ((uint64_t *)db->db_data)[off];
 286         dmu_buf_rele(db, FTAG);
 287 
 288         if (tbl->zt_nextblk != 0) {
 289                 /*
 290                  * read the nextblk for the sake of i/o error checking,
 291                  * so that zap_table_load() will catch errors for
 292                  * zap_table_store.
 293                  */
 294                 blk = (idx*2) >> (bs-3);
 295 
 296                 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 297                     (tbl->zt_nextblk + blk) << bs, FTAG, &db,
 298                     DMU_READ_NO_PREFETCH);
 299                 if (err == 0)
 300                         dmu_buf_rele(db, FTAG);
 301         }
 302         return (err);
 303 }
 304 
 305 /*
 306  * Routines for growing the ptrtbl.
 307  */
 308 
 309 static void
 310 zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
 311 {
 312         int i;
 313         for (i = 0; i < n; i++) {
 314                 uint64_t lb = src[i];
 315                 dst[2*i+0] = lb;
 316                 dst[2*i+1] = lb;
 317         }
 318 }
 319 
 320 static int
 321 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 322 {
 323         /*
 324          * The pointer table should never use more hash bits than we
 325          * have (otherwise we'd be using useless zero bits to index it).
 326          * If we are within 2 bits of running out, stop growing, since
 327          * this is already an aberrant condition.
 328          */
 329         if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
 330                 return (SET_ERROR(ENOSPC));
 331 
 332         if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 333                 /*
 334                  * We are outgrowing the "embedded" ptrtbl (the one
 335                  * stored in the header block).  Give it its own entire
 336                  * block, which will double the size of the ptrtbl.
 337                  */
 338                 uint64_t newblk;
 339                 dmu_buf_t *db_new;
 340                 int err;
 341 
 342                 ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 343                     ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 344                 ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
 345 
 346                 newblk = zap_allocate_blocks(zap, 1);
 347                 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 348                     newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
 349                     DMU_READ_NO_PREFETCH);
 350                 if (err)
 351                         return (err);
 352                 dmu_buf_will_dirty(db_new, tx);
 353                 zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 354                     db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 355                 dmu_buf_rele(db_new, FTAG);
 356 
 357                 zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
 358                 zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
 359                 zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
 360 
 361                 ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 362                     zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
 363                     (FZAP_BLOCK_SHIFT(zap)-3));
 364 
 365                 return (0);
 366         } else {
 367                 return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
 368                     zap_ptrtbl_transfer, tx));
 369         }
 370 }
 371 
 372 static void
 373 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 374 {
 375         dmu_buf_will_dirty(zap->zap_dbuf, tx);
 376         mutex_enter(&zap->zap_f.zap_num_entries_mtx);
 377         ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
 378         zap_f_phys(zap)->zap_num_entries += delta;
 379         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 380 }
 381 
 382 static uint64_t
 383 zap_allocate_blocks(zap_t *zap, int nblocks)
 384 {
 385         uint64_t newblk;
 386         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 387         newblk = zap_f_phys(zap)->zap_freeblk;
 388         zap_f_phys(zap)->zap_freeblk += nblocks;
 389         return (newblk);
 390 }
 391 
 392 static void
 393 zap_leaf_pageout(void *dbu)
 394 {
 395         zap_leaf_t *l = dbu;
 396 
 397         rw_destroy(&l->l_rwlock);
 398         kmem_free(l, sizeof (zap_leaf_t));
 399 }
 400 
/*
 * Allocate a new block for a leaf, initialize it as an empty leaf and
 * bump the header's leaf count.
 *
 * The returned leaf is write-locked (l_rwlock) and its dbuf is held
 * with a NULL tag; both are dropped by zap_put_leaf().  The caller must
 * hold zap_rwlock as writer.
 */
static zap_leaf_t *
zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
{
        void *winner;
        zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);

        ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

        /* take the leaf lock before the leaf becomes visible via the dbuf */
        rw_init(&l->l_rwlock, 0, 0, 0);
        rw_enter(&l->l_rwlock, RW_WRITER);
        l->l_blkid = zap_allocate_blocks(zap, 1);
        l->l_dbuf = NULL;

        VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
            l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
            DMU_READ_NO_PREFETCH));
        /* register the leaf as the dbuf's user; zap_leaf_pageout frees it */
        dmu_buf_init_user(&l->l_dbu, NULL, zap_leaf_pageout, &l->l_dbuf);
        winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
        /* the block was just allocated, so nobody can have beaten us */
        ASSERT(winner == NULL);
        dmu_buf_will_dirty(l->l_dbuf, tx);

        zap_leaf_init(l, zap->zap_normflags != 0);

        zap_f_phys(zap)->zap_num_leafs++;

        return (l);
}
 428 
 429 int
 430 fzap_count(zap_t *zap, uint64_t *count)
 431 {
 432         ASSERT(!zap->zap_ismicro);
 433         mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
 434         *count = zap_f_phys(zap)->zap_num_entries;
 435         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 436         return (0);
 437 }
 438 
 439 /*
 440  * Routines for obtaining zap_leaf_t's
 441  */
 442 
/*
 * Release a leaf obtained from zap_create_leaf() or
 * zap_get_leaf_byblk(): drop the leaf's rwlock, then drop the dbuf
 * hold (taken with a NULL tag by those functions).
 */
void
zap_put_leaf(zap_leaf_t *l)
{
        rw_exit(&l->l_rwlock);
        dmu_buf_rele(l->l_dbuf, NULL);
}
 449 
/*
 * Create the in-core zap_leaf_t for the already-held dbuf "db" and
 * register it as the dbuf's user.
 *
 * If another thread registered a leaf first, ours is destroyed and the
 * winner is returned instead.  The returned leaf is NOT locked; the
 * caller takes l_rwlock itself.
 */
static zap_leaf_t *
zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
{
        zap_leaf_t *l, *winner;

        ASSERT(blkid != 0);

        l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
        rw_init(&l->l_rwlock, 0, 0, 0);
        /* hold the lock as writer while the leaf is being published */
        rw_enter(&l->l_rwlock, RW_WRITER);
        l->l_blkid = blkid;
        l->l_bs = highbit64(db->db_size) - 1;
        l->l_dbuf = db;

        dmu_buf_init_user(&l->l_dbu, NULL, zap_leaf_pageout, &l->l_dbuf);
        winner = dmu_buf_set_user(db, &l->l_dbu);

        rw_exit(&l->l_rwlock);
        if (winner != NULL) {
                /* someone else set it first */
                zap_leaf_pageout(&l->l_dbu);
                l = winner;
        }

        /*
         * lh_pad1 was previously used for the next leaf in the leaf
         * chain.  There should be no chained leafs (as we have removed
         * support for them).
         */
        ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);

        /*
         * There should be more hash entries than there can be
         * chunks to put in the hash table
         */
        ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);

        /* The chunks should begin at the end of the hash table */
        ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
            &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);

        /* The chunks should end at the end of the block */
        ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
            (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);

        return (l);
}
 497 
/*
 * Hold and lock the leaf stored in block "blkid".
 *
 * zap   - the zap; zap_rwlock must be held (reader or writer).
 * blkid - block id of the leaf; must not be 0.
 * tx    - transaction used to dirty the leaf when lt == RW_WRITER;
 *         not used for RW_READER.
 * lt    - how to lock the leaf's l_rwlock.
 * lp    - receives the leaf on success.
 *
 * On success the leaf's dbuf is held (NULL tag) and l_rwlock is held as
 * lt; the caller releases both with zap_put_leaf().  Returns 0 or the
 * dmu_buf_hold() error.
 */
static int
zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
    zap_leaf_t **lp)
{
        dmu_buf_t *db;
        zap_leaf_t *l;
        int bs = FZAP_BLOCK_SHIFT(zap);
        int err;

        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

        err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
            blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
        if (err)
                return (err);

        ASSERT3U(db->db_object, ==, zap->zap_object);
        ASSERT3U(db->db_offset, ==, blkid << bs);
        ASSERT3U(db->db_size, ==, 1 << bs);
        ASSERT(blkid != 0);

        /* reuse the in-core leaf attached to the dbuf, if there is one */
        l = dmu_buf_get_user(db);

        if (l == NULL)
                l = zap_open_leaf(blkid, db);

        rw_enter(&l->l_rwlock, lt);
        /*
         * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
         * causing ASSERT below to fail.
         */
        if (lt == RW_WRITER)
                dmu_buf_will_dirty(db, tx);
        ASSERT3U(l->l_blkid, ==, blkid);
        ASSERT3P(l->l_dbuf, ==, db);
        ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
        ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);

        *lp = l;
        return (0);
}
 539 
 540 static int
 541 zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
 542 {
 543         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 544 
 545         if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 546                 ASSERT3U(idx, <,
 547                     (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
 548                 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
 549                 return (0);
 550         } else {
 551                 return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
 552                     idx, valp));
 553         }
 554 }
 555 
 556 static int
 557 zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
 558 {
 559         ASSERT(tx != NULL);
 560         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 561 
 562         if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
 563                 ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
 564                 return (0);
 565         } else {
 566                 return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
 567                     idx, blk, tx));
 568         }
 569 }
 570 
 571 static int
 572 zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
 573 {
 574         uint64_t idx, blk;
 575         int err;
 576 
 577         ASSERT(zap->zap_dbuf == NULL ||
 578             zap_f_phys(zap) == zap->zap_dbuf->db_data);
 579         ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC);
 580         idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 581         err = zap_idx_to_blk(zap, idx, &blk);
 582         if (err != 0)
 583                 return (err);
 584         err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 585 
 586         ASSERT(err ||
 587             ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
 588             zap_leaf_phys(*lp)->l_hdr.lh_prefix);
 589         return (err);
 590 }
 591 
 592 static int
 593 zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
 594 {
 595         zap_t *zap = zn->zn_zap;
 596         uint64_t hash = zn->zn_hash;
 597         zap_leaf_t *nl;
 598         int prefix_diff, i, err;
 599         uint64_t sibling;
 600         int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 601 
 602         ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 603         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 604 
 605         ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 606             zap_leaf_phys(l)->l_hdr.lh_prefix);
 607 
 608         if (zap_tryupgradedir(zap, tx) == 0 ||
 609             old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 610                 /* We failed to upgrade, or need to grow the pointer table */
 611                 objset_t *os = zap->zap_objset;
 612                 uint64_t object = zap->zap_object;
 613 
 614                 zap_put_leaf(l);
 615                 zap_unlockdir(zap);
 616                 err = zap_lockdir(os, object, tx, RW_WRITER,
 617                     FALSE, FALSE, &zn->zn_zap);
 618                 zap = zn->zn_zap;
 619                 if (err)
 620                         return (err);
 621                 ASSERT(!zap->zap_ismicro);
 622 
 623                 while (old_prefix_len ==
 624                     zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 625                         err = zap_grow_ptrtbl(zap, tx);
 626                         if (err)
 627                                 return (err);
 628                 }
 629 
 630                 err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
 631                 if (err)
 632                         return (err);
 633 
 634                 if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
 635                         /* it split while our locks were down */
 636                         *lp = l;
 637                         return (0);
 638                 }
 639         }
 640         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 641         ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 642         ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 643             zap_leaf_phys(l)->l_hdr.lh_prefix);
 644 
 645         prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
 646             (old_prefix_len + 1);
 647         sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
 648 
 649         /* check for i/o errors before doing zap_leaf_split */
 650         for (i = 0; i < (1ULL<<prefix_diff); i++) {
 651                 uint64_t blk;
 652                 err = zap_idx_to_blk(zap, sibling+i, &blk);
 653                 if (err)
 654                         return (err);
 655                 ASSERT3U(blk, ==, l->l_blkid);
 656         }
 657 
 658         nl = zap_create_leaf(zap, tx);
 659         zap_leaf_split(l, nl, zap->zap_normflags != 0);
 660 
 661         /* set sibling pointers */
 662         for (i = 0; i < (1ULL << prefix_diff); i++) {
 663                 err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
 664                 ASSERT0(err); /* we checked for i/o errors above */
 665         }
 666 
 667         if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
 668                 /* we want the sibling */
 669                 zap_put_leaf(l);
 670                 *lp = nl;
 671         } else {
 672                 zap_put_leaf(nl);
 673                 *lp = l;
 674         }
 675 
 676         return (0);
 677 }
 678 
/*
 * Release leaf "l" and, as a best-effort side job, advance pointer
 * table growth.
 *
 * A grow step is attempted when a grow is already in progress
 * (zt_nextblk != 0) or when "l" already uses every pointer-table bit
 * and is running low on free chunks, i.e. it will soon need the table
 * to grow before it can split.
 *
 * Errors are deliberately ignored.  The directory lock may be dropped
 * and re-taken as writer, in which case zn->zn_zap is updated; if the
 * re-lock fails the function simply returns.
 */
static void
zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
{
        zap_t *zap = zn->zn_zap;
        int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
        /* must be evaluated before the leaf is released below */
        int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
            zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);

        zap_put_leaf(l);

        if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
                int err;

                /*
                 * We are in the middle of growing the pointer table, or
                 * this leaf will soon make us grow it.
                 */
                if (zap_tryupgradedir(zap, tx) == 0) {
                        objset_t *os = zap->zap_objset;
                        uint64_t zapobj = zap->zap_object;

                        zap_unlockdir(zap);
                        err = zap_lockdir(os, zapobj, tx,
                            RW_WRITER, FALSE, FALSE, &zn->zn_zap);
                        zap = zn->zn_zap;
                        if (err)
                                return;
                }

                /* could have finished growing while our locks were down */
                if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
                        (void) zap_grow_ptrtbl(zap, tx);
        }
}
 713 
 714 static int
 715 fzap_checkname(zap_name_t *zn)
 716 {
 717         if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
 718                 return (SET_ERROR(ENAMETOOLONG));
 719         return (0);
 720 }
 721 
 722 static int
 723 fzap_checksize(uint64_t integer_size, uint64_t num_integers)
 724 {
 725         /* Only integer sizes supported by C */
 726         switch (integer_size) {
 727         case 1:
 728         case 2:
 729         case 4:
 730         case 8:
 731                 break;
 732         default:
 733                 return (SET_ERROR(EINVAL));
 734         }
 735 
 736         if (integer_size * num_integers > ZAP_MAXVALUELEN)
 737                 return (E2BIG);
 738 
 739         return (0);
 740 }
 741 
 742 static int
 743 fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
 744 {
 745         int err;
 746 
 747         if ((err = fzap_checkname(zn)) != 0)
 748                 return (err);
 749         return (fzap_checksize(integer_size, num_integers));
 750 }
 751 
 752 /*
 753  * Routines for manipulating attributes.
 754  */
 755 int
 756 fzap_lookup(zap_name_t *zn,
 757     uint64_t integer_size, uint64_t num_integers, void *buf,
 758     char *realname, int rn_len, boolean_t *ncp)
 759 {
 760         zap_leaf_t *l;
 761         int err;
 762         zap_entry_handle_t zeh;
 763 
 764         if ((err = fzap_checkname(zn)) != 0)
 765                 return (err);
 766 
 767         err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 768         if (err != 0)
 769                 return (err);
 770         err = zap_leaf_lookup(l, zn, &zeh);
 771         if (err == 0) {
 772                 if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
 773                         zap_put_leaf(l);
 774                         return (err);
 775                 }
 776 
 777                 err = zap_entry_read(&zeh, integer_size, num_integers, buf);
 778                 (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
 779                 if (ncp) {
 780                         *ncp = zap_entry_normalization_conflict(&zeh,
 781                             zn, NULL, zn->zn_zap);
 782                 }
 783         }
 784 
 785         zap_put_leaf(l);
 786         return (err);
 787 }
 788 
/*
 * Add a new entry for zn with an explicit collision differentiator cd.
 *
 * The caller must hold zap_rwlock and must already have validated the
 * name and value shape (only asserted here).  If the target leaf is
 * full (zap_entry_create() returns EAGAIN) the leaf is split through
 * zap_expand_leaf() and the insertion is retried.
 *
 * Returns 0, EEXIST if the name is already present, or another error
 * from the lookup, entry creation or leaf expansion.
 *
 * zn->zn_zap may change, and may become NULL on failure; the final
 * leaf release is skipped in that case.
 */
int
fzap_add_cd(zap_name_t *zn,
    uint64_t integer_size, uint64_t num_integers,
    const void *val, uint32_t cd, dmu_tx_t *tx)
{
        zap_leaf_t *l;
        int err;
        zap_entry_handle_t zeh;
        zap_t *zap = zn->zn_zap;

        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
        ASSERT(!zap->zap_ismicro);
        ASSERT(fzap_check(zn, integer_size, num_integers) == 0);

        err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
        if (err != 0)
                return (err);
retry:
        /* the name must not exist yet */
        err = zap_leaf_lookup(l, zn, &zeh);
        if (err == 0) {
                err = SET_ERROR(EEXIST);
                goto out;
        }
        if (err != ENOENT)
                goto out;

        err = zap_entry_create(l, zn, cd,
            integer_size, num_integers, val, &zeh);

        if (err == 0) {
                zap_increment_num_entries(zap, 1, tx);
        } else if (err == EAGAIN) {
                /* leaf is full: split it and try again on the right half */
                err = zap_expand_leaf(zn, l, tx, &l);
                zap = zn->zn_zap;    /* zap_expand_leaf() may change zap */
                if (err == 0)
                        goto retry;
        }

out:
        /*
         * NOTE(review): this assumes that whenever zap is non-NULL here,
         * l is still a held, locked leaf -- confirm against the failure
         * paths of zap_expand_leaf().
         */
        if (zap != NULL)
                zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
        return (err);
}
 832 
 833 int
 834 fzap_add(zap_name_t *zn,
 835     uint64_t integer_size, uint64_t num_integers,
 836     const void *val, dmu_tx_t *tx)
 837 {
 838         int err = fzap_check(zn, integer_size, num_integers);
 839         if (err != 0)
 840                 return (err);
 841 
 842         return (fzap_add_cd(zn, integer_size, num_integers,
 843             val, ZAP_NEED_CD, tx));
 844 }
 845 
/*
 * Set the value of zn, creating the entry if it does not exist yet.
 *
 * The caller must hold zap_rwlock.  The name and value shape are
 * validated first.  If the leaf runs out of space (EAGAIN from entry
 * creation or update) it is split through zap_expand_leaf() and the
 * operation is retried.
 *
 * Returns 0 or an error from validation, entry creation/update or leaf
 * expansion.  zn->zn_zap may change, and may become NULL on failure;
 * the final leaf release is skipped in that case.
 */
int
fzap_update(zap_name_t *zn,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
        zap_leaf_t *l;
        int err, create;
        zap_entry_handle_t zeh;
        zap_t *zap = zn->zn_zap;

        ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
        err = fzap_check(zn, integer_size, num_integers);
        if (err != 0)
                return (err);

        err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
        if (err != 0)
                return (err);
retry:
        /* decide between "create" and "update in place" */
        err = zap_leaf_lookup(l, zn, &zeh);
        create = (err == ENOENT);
        ASSERT(err == 0 || err == ENOENT);

        if (create) {
                err = zap_entry_create(l, zn, ZAP_NEED_CD,
                    integer_size, num_integers, val, &zeh);
                if (err == 0)
                        zap_increment_num_entries(zap, 1, tx);
        } else {
                err = zap_entry_update(&zeh, integer_size, num_integers, val);
        }

        if (err == EAGAIN) {
                /* leaf is full: split it and try again on the right half */
                err = zap_expand_leaf(zn, l, tx, &l);
                zap = zn->zn_zap;    /* zap_expand_leaf() may change zap */
                if (err == 0)
                        goto retry;
        }

        /*
         * NOTE(review): this assumes that whenever zap is non-NULL here,
         * l is still a held, locked leaf -- confirm against the failure
         * paths of zap_expand_leaf().
         */
        if (zap != NULL)
                zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
        return (err);
}
 888 
 889 int
 890 fzap_length(zap_name_t *zn,
 891     uint64_t *integer_size, uint64_t *num_integers)
 892 {
 893         zap_leaf_t *l;
 894         int err;
 895         zap_entry_handle_t zeh;
 896 
 897         err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 898         if (err != 0)
 899                 return (err);
 900         err = zap_leaf_lookup(l, zn, &zeh);
 901         if (err != 0)
 902                 goto out;
 903 
 904         if (integer_size)
 905                 *integer_size = zeh.zeh_integer_size;
 906         if (num_integers)
 907                 *num_integers = zeh.zeh_num_integers;
 908 out:
 909         zap_put_leaf(l);
 910         return (err);
 911 }
 912 
 913 int
 914 fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
 915 {
 916         zap_leaf_t *l;
 917         int err;
 918         zap_entry_handle_t zeh;
 919 
 920         err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
 921         if (err != 0)
 922                 return (err);
 923         err = zap_leaf_lookup(l, zn, &zeh);
 924         if (err == 0) {
 925                 zap_entry_remove(&zeh);
 926                 zap_increment_num_entries(zn->zn_zap, -1, tx);
 927         }
 928         zap_put_leaf(l);
 929         return (err);
 930 }
 931 
 932 void
 933 fzap_prefetch(zap_name_t *zn)
 934 {
 935         uint64_t idx, blk;
 936         zap_t *zap = zn->zn_zap;
 937         int bs;
 938 
 939         idx = ZAP_HASH_IDX(zn->zn_hash,
 940             zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 941         if (zap_idx_to_blk(zap, idx, &blk) != 0)
 942                 return;
 943         bs = FZAP_BLOCK_SHIFT(zap);
 944         dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
 945             ZIO_PRIORITY_SYNC_READ);
 946 }
 947 
 948 /*
 949  * Helper functions for consumers.
 950  */
 951 
 952 uint64_t
 953 zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
 954     const char *name, dmu_tx_t *tx)
 955 {
 956         uint64_t new_obj;
 957 
 958         VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
 959         VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
 960             tx) == 0);
 961 
 962         return (new_obj);
 963 }
 964 
 965 int
 966 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
 967     char *name)
 968 {
 969         zap_cursor_t zc;
 970         zap_attribute_t *za;
 971         int err;
 972 
 973         if (mask == 0)
 974                 mask = -1ULL;
 975 
 976         za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 977         for (zap_cursor_init(&zc, os, zapobj);
 978             (err = zap_cursor_retrieve(&zc, za)) == 0;
 979             zap_cursor_advance(&zc)) {
 980                 if ((za->za_first_integer & mask) == (value & mask)) {
 981                         (void) strcpy(name, za->za_name);
 982                         break;
 983                 }
 984         }
 985         zap_cursor_fini(&zc);
 986         kmem_free(za, sizeof (zap_attribute_t));
 987         return (err);
 988 }
 989 
 990 int
 991 zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
 992 {
 993         zap_cursor_t zc;
 994         zap_attribute_t za;
 995         int err;
 996 
 997         err = 0;
 998         for (zap_cursor_init(&zc, os, fromobj);
 999             zap_cursor_retrieve(&zc, &za) == 0;
1000             (void) zap_cursor_advance(&zc)) {
1001                 if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1002                         err = SET_ERROR(EINVAL);
1003                         break;
1004                 }
1005                 err = zap_add(os, intoobj, za.za_name,
1006                     8, 1, &za.za_first_integer, tx);
1007                 if (err)
1008                         break;
1009         }
1010         zap_cursor_fini(&zc);
1011         return (err);
1012 }
1013 
1014 int
1015 zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
1016     uint64_t value, dmu_tx_t *tx)
1017 {
1018         zap_cursor_t zc;
1019         zap_attribute_t za;
1020         int err;
1021 
1022         err = 0;
1023         for (zap_cursor_init(&zc, os, fromobj);
1024             zap_cursor_retrieve(&zc, &za) == 0;
1025             (void) zap_cursor_advance(&zc)) {
1026                 if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1027                         err = SET_ERROR(EINVAL);
1028                         break;
1029                 }
1030                 err = zap_add(os, intoobj, za.za_name,
1031                     8, 1, &value, tx);
1032                 if (err)
1033                         break;
1034         }
1035         zap_cursor_fini(&zc);
1036         return (err);
1037 }
1038 
1039 int
1040 zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
1041     dmu_tx_t *tx)
1042 {
1043         zap_cursor_t zc;
1044         zap_attribute_t za;
1045         int err;
1046 
1047         err = 0;
1048         for (zap_cursor_init(&zc, os, fromobj);
1049             zap_cursor_retrieve(&zc, &za) == 0;
1050             (void) zap_cursor_advance(&zc)) {
1051                 uint64_t delta = 0;
1052 
1053                 if (za.za_integer_length != 8 || za.za_num_integers != 1) {
1054                         err = SET_ERROR(EINVAL);
1055                         break;
1056                 }
1057 
1058                 err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
1059                 if (err != 0 && err != ENOENT)
1060                         break;
1061                 delta += za.za_first_integer;
1062                 err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
1063                 if (err)
1064                         break;
1065         }
1066         zap_cursor_fini(&zc);
1067         return (err);
1068 }
1069 
1070 int
1071 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
1072 {
1073         char name[20];
1074 
1075         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1076         return (zap_add(os, obj, name, 8, 1, &value, tx));
1077 }
1078 
1079 int
1080 zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
1081 {
1082         char name[20];
1083 
1084         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1085         return (zap_remove(os, obj, name, tx));
1086 }
1087 
1088 int
1089 zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
1090 {
1091         char name[20];
1092 
1093         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
1094         return (zap_lookup(os, obj, name, 8, 1, &value));
1095 }
1096 
1097 int
1098 zap_add_int_key(objset_t *os, uint64_t obj,
1099     uint64_t key, uint64_t value, dmu_tx_t *tx)
1100 {
1101         char name[20];
1102 
1103         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1104         return (zap_add(os, obj, name, 8, 1, &value, tx));
1105 }
1106 
1107 int
1108 zap_update_int_key(objset_t *os, uint64_t obj,
1109     uint64_t key, uint64_t value, dmu_tx_t *tx)
1110 {
1111         char name[20];
1112 
1113         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1114         return (zap_update(os, obj, name, 8, 1, &value, tx));
1115 }
1116 
1117 int
1118 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
1119 {
1120         char name[20];
1121 
1122         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1123         return (zap_lookup(os, obj, name, 8, 1, valuep));
1124 }
1125 
1126 int
1127 zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
1128     dmu_tx_t *tx)
1129 {
1130         uint64_t value = 0;
1131         int err;
1132 
1133         if (delta == 0)
1134                 return (0);
1135 
1136         err = zap_lookup(os, obj, name, 8, 1, &value);
1137         if (err != 0 && err != ENOENT)
1138                 return (err);
1139         value += delta;
1140         if (value == 0)
1141                 err = zap_remove(os, obj, name, tx);
1142         else
1143                 err = zap_update(os, obj, name, 8, 1, &value, tx);
1144         return (err);
1145 }
1146 
1147 int
1148 zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
1149     dmu_tx_t *tx)
1150 {
1151         char name[20];
1152 
1153         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
1154         return (zap_increment(os, obj, name, delta, tx));
1155 }
1156 
1157 /*
1158  * Routines for iterating over the attributes.
1159  */
1160 
1161 int
1162 fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
1163 {
1164         int err = ENOENT;
1165         zap_entry_handle_t zeh;
1166         zap_leaf_t *l;
1167 
1168         /* retrieve the next entry at or after zc_hash/zc_cd */
1169         /* if no entry, return ENOENT */
1170 
1171         if (zc->zc_leaf &&
1172             (ZAP_HASH_IDX(zc->zc_hash,
1173             zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
1174             zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
1175                 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1176                 zap_put_leaf(zc->zc_leaf);
1177                 zc->zc_leaf = NULL;
1178         }
1179 
1180 again:
1181         if (zc->zc_leaf == NULL) {
1182                 err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
1183                     &zc->zc_leaf);
1184                 if (err != 0)
1185                         return (err);
1186         } else {
1187                 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1188         }
1189         l = zc->zc_leaf;
1190 
1191         err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
1192 
1193         if (err == ENOENT) {
1194                 uint64_t nocare =
1195                     (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
1196                 zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
1197                 zc->zc_cd = 0;
1198                 if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
1199                     zc->zc_hash == 0) {
1200                         zc->zc_hash = -1ULL;
1201                 } else {
1202                         zap_put_leaf(zc->zc_leaf);
1203                         zc->zc_leaf = NULL;
1204                         goto again;
1205                 }
1206         }
1207 
1208         if (err == 0) {
1209                 zc->zc_hash = zeh.zeh_hash;
1210                 zc->zc_cd = zeh.zeh_cd;
1211                 za->za_integer_length = zeh.zeh_integer_size;
1212                 za->za_num_integers = zeh.zeh_num_integers;
1213                 if (zeh.zeh_num_integers == 0) {
1214                         za->za_first_integer = 0;
1215                 } else {
1216                         err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
1217                         ASSERT(err == 0 || err == EOVERFLOW);
1218                 }
1219                 err = zap_entry_read_name(zap, &zeh,
1220                     sizeof (za->za_name), za->za_name);
1221                 ASSERT(err == 0);
1222 
1223                 za->za_normalization_conflict =
1224                     zap_entry_normalization_conflict(&zeh,
1225                     NULL, za->za_name, zap);
1226         }
1227         rw_exit(&zc->zc_leaf->l_rwlock);
1228         return (err);
1229 }
1230 
1231 static void
1232 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
1233 {
1234         int i, err;
1235         uint64_t lastblk = 0;
1236 
1237         /*
1238          * NB: if a leaf has more pointers than an entire ptrtbl block
1239          * can hold, then it'll be accounted for more than once, since
1240          * we won't have lastblk.
1241          */
1242         for (i = 0; i < len; i++) {
1243                 zap_leaf_t *l;
1244 
1245                 if (tbl[i] == lastblk)
1246                         continue;
1247                 lastblk = tbl[i];
1248 
1249                 err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
1250                 if (err == 0) {
1251                         zap_leaf_stats(zap, l, zs);
1252                         zap_put_leaf(l);
1253                 }
1254         }
1255 }
1256 
1257 void
1258 fzap_get_stats(zap_t *zap, zap_stats_t *zs)
1259 {
1260         int bs = FZAP_BLOCK_SHIFT(zap);
1261         zs->zs_blocksize = 1ULL << bs;
1262 
1263         /*
1264          * Set zap_phys_t fields
1265          */
1266         zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
1267         zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
1268         zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
1269         zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
1270         zs->zs_magic = zap_f_phys(zap)->zap_magic;
1271         zs->zs_salt = zap_f_phys(zap)->zap_salt;
1272 
1273         /*
1274          * Set zap_ptrtbl fields
1275          */
1276         zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
1277         zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
1278         zs->zs_ptrtbl_blks_copied =
1279             zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
1280         zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
1281         zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
1282         zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
1283 
1284         if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
1285                 /* the ptrtbl is entirely in the header block. */
1286                 zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1287                     1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
1288         } else {
1289                 int b;
1290 
1291                 dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
1292                     zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
1293                     zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
1294                     ZIO_PRIORITY_SYNC_READ);
1295 
1296                 for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
1297                     b++) {
1298                         dmu_buf_t *db;
1299                         int err;
1300 
1301                         err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
1302                             (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
1303                             FTAG, &db, DMU_READ_NO_PREFETCH);
1304                         if (err == 0) {
1305                                 zap_stats_ptrtbl(zap, db->db_data,
1306                                     1<<(bs-3), zs);
1307                                 dmu_buf_rele(db, FTAG);
1308                         }
1309                 }
1310         }
1311 }
1312 
1313 int
1314 fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
1315     uint64_t *tooverwrite)
1316 {
1317         zap_t *zap = zn->zn_zap;
1318         zap_leaf_t *l;
1319         int err;
1320 
1321         /*
1322          * Account for the header block of the fatzap.
1323          */
1324         if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
1325                 *tooverwrite += zap->zap_dbuf->db_size;
1326         } else {
1327                 *towrite += zap->zap_dbuf->db_size;
1328         }
1329 
1330         /*
1331          * Account for the pointer table blocks.
1332          * If we are adding we need to account for the following cases :
1333          * - If the pointer table is embedded, this operation could force an
1334          *   external pointer table.
1335          * - If this already has an external pointer table this operation
1336          *   could extend the table.
1337          */
1338         if (add) {
1339                 if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0)
1340                         *towrite += zap->zap_dbuf->db_size;
1341                 else
1342                         *towrite += (zap->zap_dbuf->db_size * 3);
1343         }
1344 
1345         /*
1346          * Now, check if the block containing leaf is freeable
1347          * and account accordingly.
1348          */
1349         err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
1350         if (err != 0) {
1351                 return (err);
1352         }
1353 
1354         if (!add && dmu_buf_freeable(l->l_dbuf)) {
1355                 *tooverwrite += l->l_dbuf->db_size;
1356         } else {
1357                 /*
1358                  * If this an add operation, the leaf block could split.
1359                  * Hence, we need to account for an additional leaf block.
1360                  */
1361                 *towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
1362         }
1363 
1364         zap_put_leaf(l);
1365         return (0);
1366 }