5047 don't use atomic_*_nv if you discard the return value
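The change in this webrev swaps atomic_dec_32_nv() for atomic_dec_32() at the one call site in dbuf_clear() that discards the return value. The _nv ("new value") variants of the atomic_ops(3C)/(9F) routines return the updated value, which can force a more expensive implementation (e.g. a locked xadd or compare-and-swap loop instead of a plain locked decrement) even when nobody reads the result, and it misstates the code's intent. A minimal sketch of the two forms (standalone illustration, not part of the diff; the counter and function names are made up):

	#include <sys/atomic.h>

	static volatile uint32_t refs;

	void
	release_ref_old(void)
	{
		/*
		 * Anti-pattern: atomic_dec_32_nv() computes and returns the
		 * new value, only for the cast to void to throw it away.
		 */
		(void) atomic_dec_32_nv(&refs);
	}

	void
	release_ref_new(void)
	{
		/* Preferred: atomic_dec_32() returns void. */
		atomic_dec_32(&refs);
	}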
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 27 */
28 28
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/dmu.h>
31 31 #include <sys/dmu_send.h>
32 32 #include <sys/dmu_impl.h>
33 33 #include <sys/dbuf.h>
34 34 #include <sys/dmu_objset.h>
35 35 #include <sys/dsl_dataset.h>
36 36 #include <sys/dsl_dir.h>
37 37 #include <sys/dmu_tx.h>
38 38 #include <sys/spa.h>
39 39 #include <sys/zio.h>
40 40 #include <sys/dmu_zfetch.h>
41 41 #include <sys/sa.h>
42 42 #include <sys/sa_impl.h>
43 43 #include <sys/zfeature.h>
44 44 #include <sys/blkptr.h>
45 45 #include <sys/range_tree.h>
46 46
47 47 /*
48 48 * Number of times that zfs_free_range() took the slow path while doing
49 49 * a zfs receive. A nonzero value indicates a potential performance problem.
50 50 */
51 51 uint64_t zfs_free_range_recv_miss;
52 52
53 53 static void dbuf_destroy(dmu_buf_impl_t *db);
54 54 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
55 55 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
56 56
57 57 /*
58 58 * Global data structures and functions for the dbuf cache.
59 59 */
60 60 static kmem_cache_t *dbuf_cache;
61 61
62 62 /* ARGSUSED */
63 63 static int
64 64 dbuf_cons(void *vdb, void *unused, int kmflag)
65 65 {
66 66 dmu_buf_impl_t *db = vdb;
67 67 bzero(db, sizeof (dmu_buf_impl_t));
68 68
69 69 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
70 70 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
71 71 refcount_create(&db->db_holds);
72 72 return (0);
73 73 }
74 74
75 75 /* ARGSUSED */
76 76 static void
77 77 dbuf_dest(void *vdb, void *unused)
78 78 {
79 79 dmu_buf_impl_t *db = vdb;
80 80 mutex_destroy(&db->db_mtx);
81 81 cv_destroy(&db->db_changed);
82 82 refcount_destroy(&db->db_holds);
83 83 }
84 84
85 85 /*
86 86 * dbuf hash table routines
87 87 */
88 88 static dbuf_hash_table_t dbuf_hash_table;
89 89
90 90 static uint64_t dbuf_hash_count;
91 91
92 92 static uint64_t
93 93 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
94 94 {
95 95 uintptr_t osv = (uintptr_t)os;
96 96 uint64_t crc = -1ULL;
97 97
98 98 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
99 99 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
100 100 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
101 101 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
102 102 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
103 103 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
104 104 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
105 105
106 106 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
107 107
108 108 return (crc);
109 109 }
110 110
111 111 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
112 112
113 113 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
114 114 ((dbuf)->db.db_object == (obj) && \
115 115 (dbuf)->db_objset == (os) && \
116 116 (dbuf)->db_level == (level) && \
117 117 (dbuf)->db_blkid == (blkid))
118 118
119 119 dmu_buf_impl_t *
120 120 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
121 121 {
122 122 dbuf_hash_table_t *h = &dbuf_hash_table;
123 123 objset_t *os = dn->dn_objset;
124 124 uint64_t obj = dn->dn_object;
125 125 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
126 126 uint64_t idx = hv & h->hash_table_mask;
127 127 dmu_buf_impl_t *db;
128 128
129 129 mutex_enter(DBUF_HASH_MUTEX(h, idx));
130 130 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
131 131 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
132 132 mutex_enter(&db->db_mtx);
133 133 if (db->db_state != DB_EVICTING) {
134 134 mutex_exit(DBUF_HASH_MUTEX(h, idx));
135 135 return (db);
136 136 }
137 137 mutex_exit(&db->db_mtx);
138 138 }
139 139 }
140 140 mutex_exit(DBUF_HASH_MUTEX(h, idx));
141 141 return (NULL);
142 142 }
143 143
144 144 /*
145 145 * Insert an entry into the hash table. If there is already an element
146 146 * equal to elem in the hash table, then the already existing element
147 147 * will be returned and the new element will not be inserted.
148 148 * Otherwise returns NULL.
149 149 */
150 150 static dmu_buf_impl_t *
151 151 dbuf_hash_insert(dmu_buf_impl_t *db)
152 152 {
153 153 dbuf_hash_table_t *h = &dbuf_hash_table;
154 154 objset_t *os = db->db_objset;
155 155 uint64_t obj = db->db.db_object;
156 156 int level = db->db_level;
157 157 uint64_t blkid = db->db_blkid;
158 158 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
159 159 uint64_t idx = hv & h->hash_table_mask;
160 160 dmu_buf_impl_t *dbf;
161 161
162 162 mutex_enter(DBUF_HASH_MUTEX(h, idx));
163 163 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
164 164 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
165 165 mutex_enter(&dbf->db_mtx);
166 166 if (dbf->db_state != DB_EVICTING) {
167 167 mutex_exit(DBUF_HASH_MUTEX(h, idx));
168 168 return (dbf);
169 169 }
170 170 mutex_exit(&dbf->db_mtx);
171 171 }
172 172 }
173 173
174 174 mutex_enter(&db->db_mtx);
175 175 db->db_hash_next = h->hash_table[idx];
176 176 h->hash_table[idx] = db;
177 177 mutex_exit(DBUF_HASH_MUTEX(h, idx));
178 178 atomic_inc_64(&dbuf_hash_count);
179 179
180 180 return (NULL);
181 181 }
182 182
183 183 /*
184 184 * Remove an entry from the hash table. This operation will
185 185 * fail if there are any existing holds on the db.
186 186 */
187 187 static void
188 188 dbuf_hash_remove(dmu_buf_impl_t *db)
189 189 {
190 190 dbuf_hash_table_t *h = &dbuf_hash_table;
191 191 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
192 192 db->db_level, db->db_blkid);
193 193 uint64_t idx = hv & h->hash_table_mask;
194 194 dmu_buf_impl_t *dbf, **dbp;
195 195
196 196 /*
197 197 * We mustn't hold db_mtx to maintain lock ordering:
198 198 * DBUF_HASH_MUTEX > db_mtx.
199 199 */
200 200 ASSERT(refcount_is_zero(&db->db_holds));
201 201 ASSERT(db->db_state == DB_EVICTING);
202 202 ASSERT(!MUTEX_HELD(&db->db_mtx));
203 203
204 204 mutex_enter(DBUF_HASH_MUTEX(h, idx));
205 205 dbp = &h->hash_table[idx];
206 206 while ((dbf = *dbp) != db) {
207 207 dbp = &dbf->db_hash_next;
208 208 ASSERT(dbf != NULL);
209 209 }
210 210 *dbp = db->db_hash_next;
211 211 db->db_hash_next = NULL;
212 212 mutex_exit(DBUF_HASH_MUTEX(h, idx));
213 213 atomic_dec_64(&dbuf_hash_count);
214 214 }
215 215
216 216 static arc_evict_func_t dbuf_do_evict;
217 217
218 218 static void
219 219 dbuf_evict_user(dmu_buf_impl_t *db)
220 220 {
221 221 ASSERT(MUTEX_HELD(&db->db_mtx));
222 222
223 223 if (db->db_level != 0 || db->db_evict_func == NULL)
224 224 return;
225 225
226 226 if (db->db_user_data_ptr_ptr)
227 227 *db->db_user_data_ptr_ptr = db->db.db_data;
228 228 db->db_evict_func(&db->db, db->db_user_ptr);
229 229 db->db_user_ptr = NULL;
230 230 db->db_user_data_ptr_ptr = NULL;
231 231 db->db_evict_func = NULL;
232 232 }
233 233
234 234 boolean_t
235 235 dbuf_is_metadata(dmu_buf_impl_t *db)
236 236 {
237 237 if (db->db_level > 0) {
238 238 return (B_TRUE);
239 239 } else {
240 240 boolean_t is_metadata;
241 241
242 242 DB_DNODE_ENTER(db);
243 243 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
244 244 DB_DNODE_EXIT(db);
245 245
246 246 return (is_metadata);
247 247 }
248 248 }
249 249
250 250 void
251 251 dbuf_evict(dmu_buf_impl_t *db)
252 252 {
253 253 ASSERT(MUTEX_HELD(&db->db_mtx));
254 254 ASSERT(db->db_buf == NULL);
255 255 ASSERT(db->db_data_pending == NULL);
256 256
257 257 dbuf_clear(db);
258 258 dbuf_destroy(db);
259 259 }
260 260
261 261 void
262 262 dbuf_init(void)
263 263 {
264 264 uint64_t hsize = 1ULL << 16;
265 265 dbuf_hash_table_t *h = &dbuf_hash_table;
266 266 int i;
267 267
268 268 /*
269 269 * The hash table is big enough to fill all of physical memory
270 270 * with an average 4K block size. The table will take up
271 271 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
272 272 */
273 273 while (hsize * 4096 < physmem * PAGESIZE)
274 274 hsize <<= 1;
275 275
276 276 retry:
277 277 h->hash_table_mask = hsize - 1;
278 278 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
279 279 if (h->hash_table == NULL) {
280 280 /* XXX - we should really return an error instead of assert */
281 281 ASSERT(hsize > (1ULL << 10));
282 282 hsize >>= 1;
283 283 goto retry;
284 284 }
285 285
286 286 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
287 287 sizeof (dmu_buf_impl_t),
288 288 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
289 289
290 290 for (i = 0; i < DBUF_MUTEXES; i++)
291 291 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
292 292 }
293 293
294 294 void
295 295 dbuf_fini(void)
296 296 {
297 297 dbuf_hash_table_t *h = &dbuf_hash_table;
298 298 int i;
299 299
300 300 for (i = 0; i < DBUF_MUTEXES; i++)
301 301 mutex_destroy(&h->hash_mutexes[i]);
302 302 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
303 303 kmem_cache_destroy(dbuf_cache);
304 304 }
305 305
306 306 /*
307 307 * Other stuff.
308 308 */
309 309
310 310 #ifdef ZFS_DEBUG
311 311 static void
312 312 dbuf_verify(dmu_buf_impl_t *db)
313 313 {
314 314 dnode_t *dn;
315 315 dbuf_dirty_record_t *dr;
316 316
317 317 ASSERT(MUTEX_HELD(&db->db_mtx));
318 318
319 319 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
320 320 return;
321 321
322 322 ASSERT(db->db_objset != NULL);
323 323 DB_DNODE_ENTER(db);
324 324 dn = DB_DNODE(db);
325 325 if (dn == NULL) {
326 326 ASSERT(db->db_parent == NULL);
327 327 ASSERT(db->db_blkptr == NULL);
328 328 } else {
329 329 ASSERT3U(db->db.db_object, ==, dn->dn_object);
330 330 ASSERT3P(db->db_objset, ==, dn->dn_objset);
331 331 ASSERT3U(db->db_level, <, dn->dn_nlevels);
332 332 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
333 333 db->db_blkid == DMU_SPILL_BLKID ||
334 334 !list_is_empty(&dn->dn_dbufs));
335 335 }
336 336 if (db->db_blkid == DMU_BONUS_BLKID) {
337 337 ASSERT(dn != NULL);
338 338 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
339 339 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
340 340 } else if (db->db_blkid == DMU_SPILL_BLKID) {
341 341 ASSERT(dn != NULL);
342 342 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
343 343 ASSERT0(db->db.db_offset);
344 344 } else {
345 345 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
346 346 }
347 347
348 348 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
349 349 ASSERT(dr->dr_dbuf == db);
350 350
351 351 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
352 352 ASSERT(dr->dr_dbuf == db);
353 353
354 354 /*
355 355 * We can't assert that db_size matches dn_datablksz because it
356 356 * can be momentarily different when another thread is doing
357 357 * dnode_set_blksz().
358 358 */
359 359 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
360 360 dr = db->db_data_pending;
361 361 /*
362 362 * It should only be modified in syncing context, so
363 363 * make sure we only have one copy of the data.
364 364 */
365 365 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
366 366 }
367 367
368 368 /* verify db->db_blkptr */
369 369 if (db->db_blkptr) {
370 370 if (db->db_parent == dn->dn_dbuf) {
371 371 /* db is pointed to by the dnode */
372 372 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
373 373 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
374 374 ASSERT(db->db_parent == NULL);
375 375 else
376 376 ASSERT(db->db_parent != NULL);
377 377 if (db->db_blkid != DMU_SPILL_BLKID)
378 378 ASSERT3P(db->db_blkptr, ==,
379 379 &dn->dn_phys->dn_blkptr[db->db_blkid]);
380 380 } else {
381 381 /* db is pointed to by an indirect block */
382 382 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
383 383 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
384 384 ASSERT3U(db->db_parent->db.db_object, ==,
385 385 db->db.db_object);
386 386 /*
387 387 * dnode_grow_indblksz() can make this fail if we don't
388 388 * have the struct_rwlock. XXX indblksz no longer
389 389 * grows. safe to do this now?
390 390 */
391 391 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
392 392 ASSERT3P(db->db_blkptr, ==,
393 393 ((blkptr_t *)db->db_parent->db.db_data +
394 394 db->db_blkid % epb));
395 395 }
396 396 }
397 397 }
398 398 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
399 399 (db->db_buf == NULL || db->db_buf->b_data) &&
400 400 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
401 401 db->db_state != DB_FILL && !dn->dn_free_txg) {
402 402 /*
403 403 * If the blkptr isn't set but they have nonzero data,
404 404 * it had better be dirty, otherwise we'll lose that
405 405 * data when we evict this buffer.
406 406 */
407 407 if (db->db_dirtycnt == 0) {
408 408 uint64_t *buf = db->db.db_data;
409 409 int i;
410 410
411 411 for (i = 0; i < db->db.db_size >> 3; i++) {
412 412 ASSERT(buf[i] == 0);
413 413 }
414 414 }
415 415 }
416 416 DB_DNODE_EXIT(db);
417 417 }
418 418 #endif
419 419
420 420 static void
421 421 dbuf_update_data(dmu_buf_impl_t *db)
422 422 {
423 423 ASSERT(MUTEX_HELD(&db->db_mtx));
424 424 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
425 425 ASSERT(!refcount_is_zero(&db->db_holds));
426 426 *db->db_user_data_ptr_ptr = db->db.db_data;
427 427 }
428 428 }
429 429
430 430 static void
431 431 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
432 432 {
433 433 ASSERT(MUTEX_HELD(&db->db_mtx));
434 434 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
435 435 db->db_buf = buf;
436 436 if (buf != NULL) {
437 437 ASSERT(buf->b_data != NULL);
438 438 db->db.db_data = buf->b_data;
439 439 if (!arc_released(buf))
440 440 arc_set_callback(buf, dbuf_do_evict, db);
441 441 dbuf_update_data(db);
442 442 } else {
443 443 dbuf_evict_user(db);
444 444 db->db.db_data = NULL;
445 445 if (db->db_state != DB_NOFILL)
446 446 db->db_state = DB_UNCACHED;
447 447 }
448 448 }
449 449
450 450 /*
451 451 * Loan out an arc_buf for read. Return the loaned arc_buf.
452 452 */
453 453 arc_buf_t *
454 454 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
455 455 {
456 456 arc_buf_t *abuf;
457 457
458 458 mutex_enter(&db->db_mtx);
459 459 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
460 460 int blksz = db->db.db_size;
461 461 spa_t *spa = db->db_objset->os_spa;
462 462
463 463 mutex_exit(&db->db_mtx);
464 464 abuf = arc_loan_buf(spa, blksz);
465 465 bcopy(db->db.db_data, abuf->b_data, blksz);
466 466 } else {
467 467 abuf = db->db_buf;
468 468 arc_loan_inuse_buf(abuf, db);
469 469 dbuf_set_data(db, NULL);
470 470 mutex_exit(&db->db_mtx);
471 471 }
472 472 return (abuf);
473 473 }
474 474
475 475 uint64_t
476 476 dbuf_whichblock(dnode_t *dn, uint64_t offset)
477 477 {
478 478 if (dn->dn_datablkshift) {
479 479 return (offset >> dn->dn_datablkshift);
480 480 } else {
481 481 ASSERT3U(offset, <, dn->dn_datablksz);
482 482 return (0);
483 483 }
484 484 }
485 485
486 486 static void
487 487 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
488 488 {
489 489 dmu_buf_impl_t *db = vdb;
490 490
491 491 mutex_enter(&db->db_mtx);
492 492 ASSERT3U(db->db_state, ==, DB_READ);
493 493 /*
494 494 * All reads are synchronous, so we must have a hold on the dbuf
495 495 */
496 496 ASSERT(refcount_count(&db->db_holds) > 0);
497 497 ASSERT(db->db_buf == NULL);
498 498 ASSERT(db->db.db_data == NULL);
499 499 if (db->db_level == 0 && db->db_freed_in_flight) {
500 500 /* we were freed in flight; disregard any error */
501 501 arc_release(buf, db);
502 502 bzero(buf->b_data, db->db.db_size);
503 503 arc_buf_freeze(buf);
504 504 db->db_freed_in_flight = FALSE;
505 505 dbuf_set_data(db, buf);
506 506 db->db_state = DB_CACHED;
507 507 } else if (zio == NULL || zio->io_error == 0) {
508 508 dbuf_set_data(db, buf);
509 509 db->db_state = DB_CACHED;
510 510 } else {
511 511 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
512 512 ASSERT3P(db->db_buf, ==, NULL);
513 513 VERIFY(arc_buf_remove_ref(buf, db));
514 514 db->db_state = DB_UNCACHED;
515 515 }
516 516 cv_broadcast(&db->db_changed);
517 517 dbuf_rele_and_unlock(db, NULL);
518 518 }
519 519
520 520 static void
521 521 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
522 522 {
523 523 dnode_t *dn;
524 524 zbookmark_phys_t zb;
525 525 uint32_t aflags = ARC_NOWAIT;
526 526
527 527 DB_DNODE_ENTER(db);
528 528 dn = DB_DNODE(db);
529 529 ASSERT(!refcount_is_zero(&db->db_holds));
530 530 /* We need the struct_rwlock to prevent db_blkptr from changing. */
531 531 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
532 532 ASSERT(MUTEX_HELD(&db->db_mtx));
533 533 ASSERT(db->db_state == DB_UNCACHED);
534 534 ASSERT(db->db_buf == NULL);
535 535
536 536 if (db->db_blkid == DMU_BONUS_BLKID) {
537 537 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
538 538
539 539 ASSERT3U(bonuslen, <=, db->db.db_size);
540 540 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
541 541 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
542 542 if (bonuslen < DN_MAX_BONUSLEN)
543 543 bzero(db->db.db_data, DN_MAX_BONUSLEN);
544 544 if (bonuslen)
545 545 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
546 546 DB_DNODE_EXIT(db);
547 547 dbuf_update_data(db);
548 548 db->db_state = DB_CACHED;
549 549 mutex_exit(&db->db_mtx);
550 550 return;
551 551 }
552 552
553 553 /*
554 554 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
555 555 * processes the delete record and clears the bp while we are waiting
556 556 * for the dn_mtx (resulting in a "no" from block_freed).
557 557 */
558 558 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
559 559 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
560 560 BP_IS_HOLE(db->db_blkptr)))) {
561 561 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
562 562
563 563 DB_DNODE_EXIT(db);
564 564 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
565 565 db->db.db_size, db, type));
566 566 bzero(db->db.db_data, db->db.db_size);
567 567 db->db_state = DB_CACHED;
568 568 *flags |= DB_RF_CACHED;
569 569 mutex_exit(&db->db_mtx);
570 570 return;
571 571 }
572 572
573 573 DB_DNODE_EXIT(db);
574 574
575 575 db->db_state = DB_READ;
576 576 mutex_exit(&db->db_mtx);
577 577
578 578 if (DBUF_IS_L2CACHEABLE(db))
579 579 aflags |= ARC_L2CACHE;
580 580 if (DBUF_IS_L2COMPRESSIBLE(db))
581 581 aflags |= ARC_L2COMPRESS;
582 582
583 583 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
584 584 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
585 585 db->db.db_object, db->db_level, db->db_blkid);
586 586
587 587 dbuf_add_ref(db, NULL);
588 588
589 589 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
590 590 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
591 591 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
592 592 &aflags, &zb);
593 593 if (aflags & ARC_CACHED)
594 594 *flags |= DB_RF_CACHED;
595 595 }
596 596
597 597 int
598 598 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
599 599 {
600 600 int err = 0;
601 601 boolean_t havepzio = (zio != NULL);
602 602 boolean_t prefetch;
603 603 dnode_t *dn;
604 604
605 605 /*
606 606 * We don't have to hold the mutex to check db_state because it
607 607 * can't be freed while we have a hold on the buffer.
608 608 */
609 609 ASSERT(!refcount_is_zero(&db->db_holds));
610 610
611 611 if (db->db_state == DB_NOFILL)
612 612 return (SET_ERROR(EIO));
613 613
614 614 DB_DNODE_ENTER(db);
615 615 dn = DB_DNODE(db);
616 616 if ((flags & DB_RF_HAVESTRUCT) == 0)
617 617 rw_enter(&dn->dn_struct_rwlock, RW_READER);
618 618
619 619 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
620 620 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
621 621 DBUF_IS_CACHEABLE(db);
622 622
623 623 mutex_enter(&db->db_mtx);
624 624 if (db->db_state == DB_CACHED) {
625 625 mutex_exit(&db->db_mtx);
626 626 if (prefetch)
627 627 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
628 628 db->db.db_size, TRUE);
629 629 if ((flags & DB_RF_HAVESTRUCT) == 0)
630 630 rw_exit(&dn->dn_struct_rwlock);
631 631 DB_DNODE_EXIT(db);
632 632 } else if (db->db_state == DB_UNCACHED) {
633 633 spa_t *spa = dn->dn_objset->os_spa;
634 634
635 635 if (zio == NULL)
636 636 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
637 637 dbuf_read_impl(db, zio, &flags);
638 638
639 639 /* dbuf_read_impl has dropped db_mtx for us */
640 640
641 641 if (prefetch)
642 642 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
643 643 db->db.db_size, flags & DB_RF_CACHED);
644 644
645 645 if ((flags & DB_RF_HAVESTRUCT) == 0)
646 646 rw_exit(&dn->dn_struct_rwlock);
647 647 DB_DNODE_EXIT(db);
648 648
649 649 if (!havepzio)
650 650 err = zio_wait(zio);
651 651 } else {
652 652 /*
653 653 * Another reader came in while the dbuf was in flight
654 654 * between UNCACHED and CACHED. Either a writer will finish
655 655 * writing the buffer (sending the dbuf to CACHED) or the
656 656 * first reader's request will reach the read_done callback
657 657 * and send the dbuf to CACHED. Otherwise, a failure
658 658 * occurred and the dbuf went to UNCACHED.
659 659 */
660 660 mutex_exit(&db->db_mtx);
661 661 if (prefetch)
662 662 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
663 663 db->db.db_size, TRUE);
664 664 if ((flags & DB_RF_HAVESTRUCT) == 0)
665 665 rw_exit(&dn->dn_struct_rwlock);
666 666 DB_DNODE_EXIT(db);
667 667
668 668 /* Skip the wait per the caller's request. */
669 669 mutex_enter(&db->db_mtx);
670 670 if ((flags & DB_RF_NEVERWAIT) == 0) {
671 671 while (db->db_state == DB_READ ||
672 672 db->db_state == DB_FILL) {
673 673 ASSERT(db->db_state == DB_READ ||
674 674 (flags & DB_RF_HAVESTRUCT) == 0);
675 675 cv_wait(&db->db_changed, &db->db_mtx);
676 676 }
677 677 if (db->db_state == DB_UNCACHED)
678 678 err = SET_ERROR(EIO);
679 679 }
680 680 mutex_exit(&db->db_mtx);
681 681 }
682 682
683 683 ASSERT(err || havepzio || db->db_state == DB_CACHED);
684 684 return (err);
685 685 }
686 686
687 687 static void
688 688 dbuf_noread(dmu_buf_impl_t *db)
689 689 {
690 690 ASSERT(!refcount_is_zero(&db->db_holds));
691 691 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
692 692 mutex_enter(&db->db_mtx);
693 693 while (db->db_state == DB_READ || db->db_state == DB_FILL)
694 694 cv_wait(&db->db_changed, &db->db_mtx);
695 695 if (db->db_state == DB_UNCACHED) {
696 696 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
697 697 spa_t *spa = db->db_objset->os_spa;
698 698
699 699 ASSERT(db->db_buf == NULL);
700 700 ASSERT(db->db.db_data == NULL);
701 701 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
702 702 db->db_state = DB_FILL;
703 703 } else if (db->db_state == DB_NOFILL) {
704 704 dbuf_set_data(db, NULL);
705 705 } else {
706 706 ASSERT3U(db->db_state, ==, DB_CACHED);
707 707 }
708 708 mutex_exit(&db->db_mtx);
709 709 }
710 710
711 711 /*
712 712 * This is our just-in-time copy function. It makes a copy of
713 713 * buffers that have been modified in a previous transaction
714 714 * group before we modify them in the current active group.
715 715 *
716 716 * This function is used in two places: when we are dirtying a
717 717 * buffer for the first time in a txg, and when we are freeing
718 718 * a range in a dnode that includes this buffer.
719 719 *
720 720 * Note that when we are called from dbuf_free_range() we do
721 721 * not put a hold on the buffer, we just traverse the active
722 722 * dbuf list for the dnode.
723 723 */
724 724 static void
725 725 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
726 726 {
727 727 dbuf_dirty_record_t *dr = db->db_last_dirty;
728 728
729 729 ASSERT(MUTEX_HELD(&db->db_mtx));
730 730 ASSERT(db->db.db_data != NULL);
731 731 ASSERT(db->db_level == 0);
732 732 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
733 733
734 734 if (dr == NULL ||
735 735 (dr->dt.dl.dr_data !=
736 736 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
737 737 return;
738 738
739 739 /*
740 740 * If the last dirty record for this dbuf has not yet synced
741 741 * and it's referencing the dbuf data, either:
742 742 * reset the reference to point to a new copy,
743 743 * or (if there are no active holders)
744 744 * just null out the current db_data pointer.
745 745 */
746 746 ASSERT(dr->dr_txg >= txg - 2);
747 747 if (db->db_blkid == DMU_BONUS_BLKID) {
748 748 /* Note that the data bufs here are zio_bufs */
749 749 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
750 750 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
751 751 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
752 752 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
753 753 int size = db->db.db_size;
754 754 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
755 755 spa_t *spa = db->db_objset->os_spa;
756 756
757 757 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
758 758 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
759 759 } else {
760 760 dbuf_set_data(db, NULL);
761 761 }
762 762 }
763 763
764 764 void
765 765 dbuf_unoverride(dbuf_dirty_record_t *dr)
766 766 {
767 767 dmu_buf_impl_t *db = dr->dr_dbuf;
768 768 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
769 769 uint64_t txg = dr->dr_txg;
770 770
771 771 ASSERT(MUTEX_HELD(&db->db_mtx));
772 772 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
773 773 ASSERT(db->db_level == 0);
774 774
775 775 if (db->db_blkid == DMU_BONUS_BLKID ||
776 776 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
777 777 return;
778 778
779 779 ASSERT(db->db_data_pending != dr);
780 780
781 781 /* free this block */
782 782 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
783 783 zio_free(db->db_objset->os_spa, txg, bp);
784 784
785 785 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
786 786 dr->dt.dl.dr_nopwrite = B_FALSE;
787 787
788 788 /*
789 789 * Release the already-written buffer, so we leave it in
790 790 * a consistent dirty state. Note that all callers are
791 791 * modifying the buffer, so they will immediately do
792 792 * another (redundant) arc_release(). Therefore, leave
793 793 * the buf thawed to save the effort of freezing &
794 794 * immediately re-thawing it.
795 795 */
796 796 arc_release(dr->dt.dl.dr_data, db);
797 797 }
798 798
799 799 /*
800 800 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
801 801 * data blocks in the free range, so that any future readers will find
802 802 * empty blocks.
803 803 *
804 804 * This is a no-op if the dataset is in the middle of an incremental
805 805 * receive; see comment below for details.
806 806 */
807 807 void
808 808 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
809 809 {
810 810 dmu_buf_impl_t *db, *db_next;
811 811 uint64_t txg = tx->tx_txg;
812 812
813 813 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
814 814 end = dn->dn_maxblkid;
815 815 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
816 816
817 817 mutex_enter(&dn->dn_dbufs_mtx);
818 818 if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
819 819 /* There can't be any dbufs in this range; no need to search. */
820 820 mutex_exit(&dn->dn_dbufs_mtx);
821 821 return;
822 822 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
823 823 /*
824 824 * If we are receiving, we expect there to be no dbufs in
825 825 * the range to be freed, because receive modifies each
826 826 * block at most once, and in offset order. If this is
827 827 * not the case, it can lead to performance problems,
828 828 * so note that we unexpectedly took the slow path.
829 829 */
830 830 atomic_inc_64(&zfs_free_range_recv_miss);
831 831 }
832 832
833 833 for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
834 834 db_next = list_next(&dn->dn_dbufs, db);
835 835 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
836 836
837 837 if (db->db_level != 0)
838 838 continue;
839 839 if (db->db_blkid < start || db->db_blkid > end)
840 840 continue;
841 841
842 842 /* found a level 0 buffer in the range */
843 843 mutex_enter(&db->db_mtx);
844 844 if (dbuf_undirty(db, tx)) {
845 845 /* mutex has been dropped and dbuf destroyed */
846 846 continue;
847 847 }
848 848
849 849 if (db->db_state == DB_UNCACHED ||
850 850 db->db_state == DB_NOFILL ||
851 851 db->db_state == DB_EVICTING) {
852 852 ASSERT(db->db.db_data == NULL);
853 853 mutex_exit(&db->db_mtx);
854 854 continue;
855 855 }
856 856 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
857 857 /* will be handled in dbuf_read_done or dbuf_rele */
858 858 db->db_freed_in_flight = TRUE;
859 859 mutex_exit(&db->db_mtx);
860 860 continue;
861 861 }
862 862 if (refcount_count(&db->db_holds) == 0) {
863 863 ASSERT(db->db_buf);
864 864 dbuf_clear(db);
865 865 continue;
866 866 }
867 867 /* The dbuf is referenced */
868 868
869 869 if (db->db_last_dirty != NULL) {
870 870 dbuf_dirty_record_t *dr = db->db_last_dirty;
871 871
872 872 if (dr->dr_txg == txg) {
873 873 /*
874 874 * This buffer is "in-use"; re-adjust the file
875 875 * size to reflect that this buffer may
876 876 * contain new data when we sync.
877 877 */
878 878 if (db->db_blkid != DMU_SPILL_BLKID &&
879 879 db->db_blkid > dn->dn_maxblkid)
880 880 dn->dn_maxblkid = db->db_blkid;
881 881 dbuf_unoverride(dr);
882 882 } else {
883 883 /*
884 884 * This dbuf is not dirty in the open context.
885 885 * Either uncache it (if it's not referenced in
886 886 * the open context) or reset its contents to
887 887 * empty.
888 888 */
889 889 dbuf_fix_old_data(db, txg);
890 890 }
891 891 }
892 892 /* clear the contents if it's cached */
893 893 if (db->db_state == DB_CACHED) {
894 894 ASSERT(db->db.db_data != NULL);
895 895 arc_release(db->db_buf, db);
896 896 bzero(db->db.db_data, db->db.db_size);
897 897 arc_buf_freeze(db->db_buf);
898 898 }
899 899
900 900 mutex_exit(&db->db_mtx);
901 901 }
902 902 mutex_exit(&dn->dn_dbufs_mtx);
903 903 }
904 904
905 905 static int
906 906 dbuf_block_freeable(dmu_buf_impl_t *db)
907 907 {
908 908 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
909 909 uint64_t birth_txg = 0;
910 910
911 911 /*
912 912 * We don't need any locking to protect db_blkptr:
913 913 * If it's syncing, then db_last_dirty will be set
914 914 * so we'll ignore db_blkptr.
915 915 *
916 916 * This logic ensures that only block births for
917 917 * filled blocks are considered.
918 918 */
919 919 ASSERT(MUTEX_HELD(&db->db_mtx));
920 920 if (db->db_last_dirty && (db->db_blkptr == NULL ||
921 921 !BP_IS_HOLE(db->db_blkptr))) {
922 922 birth_txg = db->db_last_dirty->dr_txg;
923 923 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
924 924 birth_txg = db->db_blkptr->blk_birth;
925 925 }
926 926
927 927 /*
928 928 * If this block doesn't exist or is in a snapshot, it can't be freed.
929 929 * Don't pass the bp to dsl_dataset_block_freeable() since we
930 930 * are holding the db_mtx lock and might deadlock if we are
931 931 * prefetching a dedup-ed block.
932 932 */
933 933 if (birth_txg != 0)
934 934 return (ds == NULL ||
935 935 dsl_dataset_block_freeable(ds, NULL, birth_txg));
936 936 else
937 937 return (B_FALSE);
938 938 }
939 939
940 940 void
941 941 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
942 942 {
943 943 arc_buf_t *buf, *obuf;
944 944 int osize = db->db.db_size;
945 945 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
946 946 dnode_t *dn;
947 947
948 948 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
949 949
950 950 DB_DNODE_ENTER(db);
951 951 dn = DB_DNODE(db);
952 952
953 953 /* XXX does *this* func really need the lock? */
954 954 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
955 955
956 956 /*
957 957 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
958 958 * is OK, because there can be no other references to the db
959 959 * when we are changing its size, so no concurrent DB_FILL can
960 960 * be happening.
961 961 */
962 962 /*
963 963 * XXX we should be doing a dbuf_read, checking the return
964 964 * value and returning that up to our callers
965 965 */
966 966 dmu_buf_will_dirty(&db->db, tx);
967 967
968 968 /* create the data buffer for the new block */
969 969 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
970 970
971 971 /* copy old block data to the new block */
972 972 obuf = db->db_buf;
973 973 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
974 974 /* zero the remainder */
975 975 if (size > osize)
976 976 bzero((uint8_t *)buf->b_data + osize, size - osize);
977 977
978 978 mutex_enter(&db->db_mtx);
979 979 dbuf_set_data(db, buf);
980 980 VERIFY(arc_buf_remove_ref(obuf, db));
981 981 db->db.db_size = size;
982 982
983 983 if (db->db_level == 0) {
984 984 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
985 985 db->db_last_dirty->dt.dl.dr_data = buf;
986 986 }
987 987 mutex_exit(&db->db_mtx);
988 988
989 989 dnode_willuse_space(dn, size-osize, tx);
990 990 DB_DNODE_EXIT(db);
991 991 }
992 992
993 993 void
994 994 dbuf_release_bp(dmu_buf_impl_t *db)
995 995 {
996 996 objset_t *os = db->db_objset;
997 997
998 998 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
999 999 ASSERT(arc_released(os->os_phys_buf) ||
1000 1000 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1001 1001 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1002 1002
1003 1003 (void) arc_release(db->db_buf, db);
1004 1004 }
1005 1005
1006 1006 dbuf_dirty_record_t *
1007 1007 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1008 1008 {
1009 1009 dnode_t *dn;
1010 1010 objset_t *os;
1011 1011 dbuf_dirty_record_t **drp, *dr;
1012 1012 int drop_struct_lock = FALSE;
1013 1013 boolean_t do_free_accounting = B_FALSE;
1014 1014 int txgoff = tx->tx_txg & TXG_MASK;
1015 1015
1016 1016 ASSERT(tx->tx_txg != 0);
1017 1017 ASSERT(!refcount_is_zero(&db->db_holds));
1018 1018 DMU_TX_DIRTY_BUF(tx, db);
1019 1019
1020 1020 DB_DNODE_ENTER(db);
1021 1021 dn = DB_DNODE(db);
1022 1022 /*
1023 1023 * Shouldn't dirty a regular buffer in syncing context. Private
1024 1024 * objects may be dirtied in syncing context, but only if they
1025 1025 * were already pre-dirtied in open context.
1026 1026 */
1027 1027 ASSERT(!dmu_tx_is_syncing(tx) ||
1028 1028 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1029 1029 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1030 1030 dn->dn_objset->os_dsl_dataset == NULL);
1031 1031 /*
1032 1032 * We make this assert for private objects as well, but after we
1033 1033 * check if we're already dirty. They are allowed to re-dirty
1034 1034 * in syncing context.
1035 1035 */
1036 1036 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1037 1037 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1038 1038 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1039 1039
1040 1040 mutex_enter(&db->db_mtx);
1041 1041 /*
1042 1042 * XXX make this true for indirects too? The problem is that
1043 1043 * transactions created with dmu_tx_create_assigned() from
1044 1044 * syncing context don't bother holding ahead.
1045 1045 */
1046 1046 ASSERT(db->db_level != 0 ||
1047 1047 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1048 1048 db->db_state == DB_NOFILL);
1049 1049
1050 1050 mutex_enter(&dn->dn_mtx);
1051 1051 /*
1052 1052 * Don't set dirtyctx to SYNC if we're just modifying this as we
1053 1053 * initialize the objset.
1054 1054 */
1055 1055 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1056 1056 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1057 1057 dn->dn_dirtyctx =
1058 1058 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1059 1059 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1060 1060 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1061 1061 }
1062 1062 mutex_exit(&dn->dn_mtx);
1063 1063
1064 1064 if (db->db_blkid == DMU_SPILL_BLKID)
1065 1065 dn->dn_have_spill = B_TRUE;
1066 1066
1067 1067 /*
1068 1068 * If this buffer is already dirty, we're done.
1069 1069 */
1070 1070 drp = &db->db_last_dirty;
1071 1071 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1072 1072 db->db.db_object == DMU_META_DNODE_OBJECT);
1073 1073 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1074 1074 drp = &dr->dr_next;
1075 1075 if (dr && dr->dr_txg == tx->tx_txg) {
1076 1076 DB_DNODE_EXIT(db);
1077 1077
1078 1078 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1079 1079 /*
1080 1080 * If this buffer has already been written out,
1081 1081 * we now need to reset its state.
1082 1082 */
1083 1083 dbuf_unoverride(dr);
1084 1084 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1085 1085 db->db_state != DB_NOFILL)
1086 1086 arc_buf_thaw(db->db_buf);
1087 1087 }
1088 1088 mutex_exit(&db->db_mtx);
1089 1089 return (dr);
1090 1090 }
1091 1091
1092 1092 /*
1093 1093 * Only valid if not already dirty.
1094 1094 */
1095 1095 ASSERT(dn->dn_object == 0 ||
1096 1096 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1097 1097 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1098 1098
1099 1099 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1100 1100 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1101 1101 dn->dn_phys->dn_nlevels > db->db_level ||
1102 1102 dn->dn_next_nlevels[txgoff] > db->db_level ||
1103 1103 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1104 1104 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1105 1105
1106 1106 /*
1107 1107 * We should only be dirtying in syncing context if it's the
1108 1108 * mos or we're initializing the os or it's a special object.
1109 1109 * However, we are allowed to dirty in syncing context provided
1110 1110 * we already dirtied it in open context. Hence we must make
1111 1111 * this assertion only if we're not already dirty.
1112 1112 */
1113 1113 os = dn->dn_objset;
1114 1114 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1115 1115 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1116 1116 ASSERT(db->db.db_size != 0);
1117 1117
1118 1118 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1119 1119
1120 1120 if (db->db_blkid != DMU_BONUS_BLKID) {
1121 1121 /*
1122 1122 * Update the accounting.
1123 1123 * Note: we delay "free accounting" until after we drop
1124 1124 * the db_mtx. This keeps us from grabbing other locks
1125 1125 * (and possibly deadlocking) in bp_get_dsize() while
1126 1126 * also holding the db_mtx.
1127 1127 */
1128 1128 dnode_willuse_space(dn, db->db.db_size, tx);
1129 1129 do_free_accounting = dbuf_block_freeable(db);
1130 1130 }
1131 1131
1132 1132 /*
1133 1133 * If this buffer is dirty in an old transaction group we need
1134 1134 * to make a copy of it so that the changes we make in this
1135 1135 * transaction group won't leak out when we sync the older txg.
1136 1136 */
1137 1137 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1138 1138 if (db->db_level == 0) {
1139 1139 void *data_old = db->db_buf;
1140 1140
1141 1141 if (db->db_state != DB_NOFILL) {
1142 1142 if (db->db_blkid == DMU_BONUS_BLKID) {
1143 1143 dbuf_fix_old_data(db, tx->tx_txg);
1144 1144 data_old = db->db.db_data;
1145 1145 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1146 1146 /*
1147 1147 * Release the data buffer from the cache so
1148 1148 * that we can modify it without impacting
1149 1149 * possible other users of this cached data
1150 1150 * block. Note that indirect blocks and
1151 1151 * private objects are not released until the
1152 1152 * syncing state (since they are only modified
1153 1153 * then).
1154 1154 */
1155 1155 arc_release(db->db_buf, db);
1156 1156 dbuf_fix_old_data(db, tx->tx_txg);
1157 1157 data_old = db->db_buf;
1158 1158 }
1159 1159 ASSERT(data_old != NULL);
1160 1160 }
1161 1161 dr->dt.dl.dr_data = data_old;
1162 1162 } else {
1163 1163 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1164 1164 list_create(&dr->dt.di.dr_children,
1165 1165 sizeof (dbuf_dirty_record_t),
1166 1166 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1167 1167 }
1168 1168 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1169 1169 dr->dr_accounted = db->db.db_size;
1170 1170 dr->dr_dbuf = db;
1171 1171 dr->dr_txg = tx->tx_txg;
1172 1172 dr->dr_next = *drp;
1173 1173 *drp = dr;
1174 1174
1175 1175 /*
1176 1176 * We could have been freed_in_flight between the dbuf_noread
1177 1177 * and dbuf_dirty. We win, as though the dbuf_noread() had
1178 1178 * happened after the free.
1179 1179 */
1180 1180 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1181 1181 db->db_blkid != DMU_SPILL_BLKID) {
1182 1182 mutex_enter(&dn->dn_mtx);
1183 1183 if (dn->dn_free_ranges[txgoff] != NULL) {
1184 1184 range_tree_clear(dn->dn_free_ranges[txgoff],
1185 1185 db->db_blkid, 1);
1186 1186 }
1187 1187 mutex_exit(&dn->dn_mtx);
1188 1188 db->db_freed_in_flight = FALSE;
1189 1189 }
1190 1190
1191 1191 /*
1192 1192 * This buffer is now part of this txg
1193 1193 */
1194 1194 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1195 1195 db->db_dirtycnt += 1;
1196 1196 ASSERT3U(db->db_dirtycnt, <=, 3);
1197 1197
1198 1198 mutex_exit(&db->db_mtx);
1199 1199
1200 1200 if (db->db_blkid == DMU_BONUS_BLKID ||
1201 1201 db->db_blkid == DMU_SPILL_BLKID) {
1202 1202 mutex_enter(&dn->dn_mtx);
1203 1203 ASSERT(!list_link_active(&dr->dr_dirty_node));
1204 1204 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1205 1205 mutex_exit(&dn->dn_mtx);
1206 1206 dnode_setdirty(dn, tx);
1207 1207 DB_DNODE_EXIT(db);
1208 1208 return (dr);
1209 1209 } else if (do_free_accounting) {
1210 1210 blkptr_t *bp = db->db_blkptr;
1211 1211 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1212 1212 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1213 1213 /*
1214 1214 * This is only a guess -- if the dbuf is dirty
1215 1215 * in a previous txg, we don't know how much
1216 1216 * space it will use on disk yet. We should
1217 1217 * really have the struct_rwlock to access
1218 1218 * db_blkptr, but since this is just a guess,
1219 1219 * it's OK if we get an odd answer.
1220 1220 */
1221 1221 ddt_prefetch(os->os_spa, bp);
1222 1222 dnode_willuse_space(dn, -willfree, tx);
1223 1223 }
1224 1224
1225 1225 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1226 1226 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1227 1227 drop_struct_lock = TRUE;
1228 1228 }
1229 1229
1230 1230 if (db->db_level == 0) {
1231 1231 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1232 1232 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1233 1233 }
1234 1234
1235 1235 if (db->db_level+1 < dn->dn_nlevels) {
1236 1236 dmu_buf_impl_t *parent = db->db_parent;
1237 1237 dbuf_dirty_record_t *di;
1238 1238 int parent_held = FALSE;
1239 1239
1240 1240 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1241 1241 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1242 1242
1243 1243 parent = dbuf_hold_level(dn, db->db_level+1,
1244 1244 db->db_blkid >> epbs, FTAG);
1245 1245 ASSERT(parent != NULL);
1246 1246 parent_held = TRUE;
1247 1247 }
1248 1248 if (drop_struct_lock)
1249 1249 rw_exit(&dn->dn_struct_rwlock);
1250 1250 ASSERT3U(db->db_level+1, ==, parent->db_level);
1251 1251 di = dbuf_dirty(parent, tx);
1252 1252 if (parent_held)
1253 1253 dbuf_rele(parent, FTAG);
1254 1254
1255 1255 mutex_enter(&db->db_mtx);
1256 1256 /*
1257 1257 * Since we've dropped the mutex, it's possible that
1258 1258 * dbuf_undirty() might have changed this out from under us.
1259 1259 */
1260 1260 if (db->db_last_dirty == dr ||
1261 1261 dn->dn_object == DMU_META_DNODE_OBJECT) {
1262 1262 mutex_enter(&di->dt.di.dr_mtx);
1263 1263 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1264 1264 ASSERT(!list_link_active(&dr->dr_dirty_node));
1265 1265 list_insert_tail(&di->dt.di.dr_children, dr);
1266 1266 mutex_exit(&di->dt.di.dr_mtx);
1267 1267 dr->dr_parent = di;
1268 1268 }
1269 1269 mutex_exit(&db->db_mtx);
1270 1270 } else {
1271 1271 ASSERT(db->db_level+1 == dn->dn_nlevels);
1272 1272 ASSERT(db->db_blkid < dn->dn_nblkptr);
1273 1273 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1274 1274 mutex_enter(&dn->dn_mtx);
1275 1275 ASSERT(!list_link_active(&dr->dr_dirty_node));
1276 1276 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1277 1277 mutex_exit(&dn->dn_mtx);
1278 1278 if (drop_struct_lock)
1279 1279 rw_exit(&dn->dn_struct_rwlock);
1280 1280 }
1281 1281
1282 1282 dnode_setdirty(dn, tx);
1283 1283 DB_DNODE_EXIT(db);
1284 1284 return (dr);
1285 1285 }
1286 1286
1287 1287 /*
1288 1288 * Undirty a buffer in the transaction group referenced by the given
1289 1289 * transaction. Return whether this evicted the dbuf.
1290 1290 */
1291 1291 static boolean_t
1292 1292 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1293 1293 {
1294 1294 dnode_t *dn;
1295 1295 uint64_t txg = tx->tx_txg;
1296 1296 dbuf_dirty_record_t *dr, **drp;
1297 1297
1298 1298 ASSERT(txg != 0);
1299 1299 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1300 1300 ASSERT0(db->db_level);
1301 1301 ASSERT(MUTEX_HELD(&db->db_mtx));
1302 1302
1303 1303 /*
1304 1304 * If this buffer is not dirty, we're done.
1305 1305 */
1306 1306 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1307 1307 if (dr->dr_txg <= txg)
1308 1308 break;
1309 1309 if (dr == NULL || dr->dr_txg < txg)
1310 1310 return (B_FALSE);
1311 1311 ASSERT(dr->dr_txg == txg);
1312 1312 ASSERT(dr->dr_dbuf == db);
1313 1313
1314 1314 DB_DNODE_ENTER(db);
1315 1315 dn = DB_DNODE(db);
1316 1316
1317 1317 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1318 1318
1319 1319 ASSERT(db->db.db_size != 0);
1320 1320
1321 1321 /*
1322 1322 * Any space we accounted for in dp_dirty_* will be cleaned up by
1323 1323 * dsl_pool_sync(). This is relatively rare so the discrepancy
1324 1324 * is not a big deal.
1325 1325 */
1326 1326
1327 1327 *drp = dr->dr_next;
1328 1328
1329 1329 /*
1330 1330 * Note that there are three places in dbuf_dirty()
1331 1331 * where this dirty record may be put on a list.
1332 1332 * Make sure to do a list_remove corresponding to
1333 1333 * every one of those list_insert calls.
1334 1334 */
1335 1335 if (dr->dr_parent) {
1336 1336 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1337 1337 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1338 1338 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1339 1339 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1340 1340 db->db_level+1 == dn->dn_nlevels) {
1341 1341 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1342 1342 mutex_enter(&dn->dn_mtx);
1343 1343 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1344 1344 mutex_exit(&dn->dn_mtx);
1345 1345 }
1346 1346 DB_DNODE_EXIT(db);
1347 1347
1348 1348 if (db->db_state != DB_NOFILL) {
1349 1349 dbuf_unoverride(dr);
1350 1350
1351 1351 ASSERT(db->db_buf != NULL);
1352 1352 ASSERT(dr->dt.dl.dr_data != NULL);
1353 1353 if (dr->dt.dl.dr_data != db->db_buf)
1354 1354 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1355 1355 }
1356 1356
1357 1357 if (db->db_level != 0) {
1358 1358 mutex_destroy(&dr->dt.di.dr_mtx);
1359 1359 list_destroy(&dr->dt.di.dr_children);
1360 1360 }
1361 1361
1362 1362 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1363 1363
1364 1364 ASSERT(db->db_dirtycnt > 0);
1365 1365 db->db_dirtycnt -= 1;
1366 1366
1367 1367 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1368 1368 arc_buf_t *buf = db->db_buf;
1369 1369
1370 1370 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1371 1371 dbuf_set_data(db, NULL);
1372 1372 VERIFY(arc_buf_remove_ref(buf, db));
1373 1373 dbuf_evict(db);
1374 1374 return (B_TRUE);
1375 1375 }
1376 1376
1377 1377 return (B_FALSE);
1378 1378 }
1379 1379
1380 1380 void
1381 1381 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1382 1382 {
1383 1383 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1384 1384 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1385 1385
1386 1386 ASSERT(tx->tx_txg != 0);
1387 1387 ASSERT(!refcount_is_zero(&db->db_holds));
1388 1388
1389 1389 DB_DNODE_ENTER(db);
1390 1390 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1391 1391 rf |= DB_RF_HAVESTRUCT;
1392 1392 DB_DNODE_EXIT(db);
1393 1393 (void) dbuf_read(db, NULL, rf);
1394 1394 (void) dbuf_dirty(db, tx);
1395 1395 }
1396 1396
1397 1397 void
1398 1398 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1399 1399 {
1400 1400 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1401 1401
1402 1402 db->db_state = DB_NOFILL;
1403 1403
1404 1404 dmu_buf_will_fill(db_fake, tx);
1405 1405 }
1406 1406
1407 1407 void
1408 1408 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1409 1409 {
1410 1410 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1411 1411
1412 1412 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1413 1413 ASSERT(tx->tx_txg != 0);
1414 1414 ASSERT(db->db_level == 0);
1415 1415 ASSERT(!refcount_is_zero(&db->db_holds));
1416 1416
1417 1417 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1418 1418 dmu_tx_private_ok(tx));
1419 1419
1420 1420 dbuf_noread(db);
1421 1421 (void) dbuf_dirty(db, tx);
1422 1422 }
1423 1423
1424 1424 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1425 1425 /* ARGSUSED */
1426 1426 void
1427 1427 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1428 1428 {
1429 1429 mutex_enter(&db->db_mtx);
1430 1430 DBUF_VERIFY(db);
1431 1431
1432 1432 if (db->db_state == DB_FILL) {
1433 1433 if (db->db_level == 0 && db->db_freed_in_flight) {
1434 1434 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1435 1435 /* we were freed while filling */
1436 1436 /* XXX dbuf_undirty? */
1437 1437 bzero(db->db.db_data, db->db.db_size);
1438 1438 db->db_freed_in_flight = FALSE;
1439 1439 }
1440 1440 db->db_state = DB_CACHED;
1441 1441 cv_broadcast(&db->db_changed);
1442 1442 }
1443 1443 mutex_exit(&db->db_mtx);
1444 1444 }
1445 1445
1446 1446 void
1447 1447 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1448 1448 bp_embedded_type_t etype, enum zio_compress comp,
1449 1449 int uncompressed_size, int compressed_size, int byteorder,
1450 1450 dmu_tx_t *tx)
1451 1451 {
1452 1452 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1453 1453 struct dirty_leaf *dl;
1454 1454 dmu_object_type_t type;
1455 1455
1456 1456 DB_DNODE_ENTER(db);
1457 1457 type = DB_DNODE(db)->dn_type;
1458 1458 DB_DNODE_EXIT(db);
1459 1459
1460 1460 ASSERT0(db->db_level);
1461 1461 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1462 1462
1463 1463 dmu_buf_will_not_fill(dbuf, tx);
1464 1464
1465 1465 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1466 1466 dl = &db->db_last_dirty->dt.dl;
1467 1467 encode_embedded_bp_compressed(&dl->dr_overridden_by,
1468 1468 data, comp, uncompressed_size, compressed_size);
1469 1469 BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1470 1470 BP_SET_TYPE(&dl->dr_overridden_by, type);
1471 1471 BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1472 1472 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1473 1473
1474 1474 dl->dr_override_state = DR_OVERRIDDEN;
1475 1475 dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1476 1476 }
1477 1477
1478 1478 /*
1479 1479 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1480 1480 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1481 1481 */
1482 1482 void
1483 1483 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1484 1484 {
1485 1485 ASSERT(!refcount_is_zero(&db->db_holds));
1486 1486 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1487 1487 ASSERT(db->db_level == 0);
1488 1488 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1489 1489 ASSERT(buf != NULL);
1490 1490 ASSERT(arc_buf_size(buf) == db->db.db_size);
1491 1491 ASSERT(tx->tx_txg != 0);
1492 1492
1493 1493 arc_return_buf(buf, db);
1494 1494 ASSERT(arc_released(buf));
1495 1495
1496 1496 mutex_enter(&db->db_mtx);
1497 1497
1498 1498 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1499 1499 cv_wait(&db->db_changed, &db->db_mtx);
1500 1500
1501 1501 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1502 1502
1503 1503 if (db->db_state == DB_CACHED &&
1504 1504 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1505 1505 mutex_exit(&db->db_mtx);
1506 1506 (void) dbuf_dirty(db, tx);
1507 1507 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1508 1508 VERIFY(arc_buf_remove_ref(buf, db));
1509 1509 xuio_stat_wbuf_copied();
1510 1510 return;
1511 1511 }
1512 1512
1513 1513 xuio_stat_wbuf_nocopy();
1514 1514 if (db->db_state == DB_CACHED) {
1515 1515 dbuf_dirty_record_t *dr = db->db_last_dirty;
1516 1516
1517 1517 ASSERT(db->db_buf != NULL);
1518 1518 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1519 1519 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1520 1520 if (!arc_released(db->db_buf)) {
1521 1521 ASSERT(dr->dt.dl.dr_override_state ==
1522 1522 DR_OVERRIDDEN);
1523 1523 arc_release(db->db_buf, db);
1524 1524 }
1525 1525 dr->dt.dl.dr_data = buf;
1526 1526 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1527 1527 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1528 1528 arc_release(db->db_buf, db);
1529 1529 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1530 1530 }
1531 1531 db->db_buf = NULL;
1532 1532 }
1533 1533 ASSERT(db->db_buf == NULL);
1534 1534 dbuf_set_data(db, buf);
1535 1535 db->db_state = DB_FILL;
1536 1536 mutex_exit(&db->db_mtx);
1537 1537 (void) dbuf_dirty(db, tx);
1538 1538 dmu_buf_fill_done(&db->db, tx);
1539 1539 }
1540 1540
1541 1541 /*
1542 1542 * "Clear" the contents of this dbuf. This will mark the dbuf
1543 1543 * EVICTING and clear *most* of its references. Unfortunately,
1544 1544 * when we are not holding the dn_dbufs_mtx, we can't clear the
1545 1545 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1546 1546 * in this case. For callers from the DMU we will usually see:
1547 1547 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1548 1548 * For the arc callback, we will usually see:
1549 1549 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1550 1550 * Sometimes, though, we will get a mix of these two:
1551 1551 * DMU: dbuf_clear()->arc_buf_evict()
1552 1552 * ARC: dbuf_do_evict()->dbuf_destroy()
1553 1553 */
1554 1554 void
1555 1555 dbuf_clear(dmu_buf_impl_t *db)
1556 1556 {
1557 1557 dnode_t *dn;
1558 1558 dmu_buf_impl_t *parent = db->db_parent;
1559 1559 dmu_buf_impl_t *dndb;
1560 1560 int dbuf_gone = FALSE;
1561 1561
1562 1562 ASSERT(MUTEX_HELD(&db->db_mtx));
1563 1563 ASSERT(refcount_is_zero(&db->db_holds));
1564 1564
1565 1565 dbuf_evict_user(db);
1566 1566
1567 1567 if (db->db_state == DB_CACHED) {
1568 1568 ASSERT(db->db.db_data != NULL);
1569 1569 if (db->db_blkid == DMU_BONUS_BLKID) {
1570 1570 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1571 1571 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1572 1572 }
1573 1573 db->db.db_data = NULL;
1574 1574 db->db_state = DB_UNCACHED;
1575 1575 }
1576 1576
1577 1577 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1578 1578 ASSERT(db->db_data_pending == NULL);
1579 1579
1580 1580 db->db_state = DB_EVICTING;
1581 1581 db->db_blkptr = NULL;
1582 1582
1583 1583 DB_DNODE_ENTER(db);
1584 1584 dn = DB_DNODE(db);
1585 1585 dndb = dn->dn_dbuf;
1586 1586 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1587 1587 list_remove(&dn->dn_dbufs, db);
1588 - (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1588 + atomic_dec_32(&dn->dn_dbufs_count);
1589 1589 membar_producer();
1590 1590 DB_DNODE_EXIT(db);
1591 1591 /*
1592 1592 * Decrementing the dbuf count means that the hold corresponding
1593 1593 * to the removed dbuf is no longer discounted in dnode_move(),
1594 1594 * so the dnode cannot be moved until after we release the hold.
1595 1595 * The membar_producer() ensures visibility of the decremented
1596 1596 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1597 1597 * release any lock.
1598 1598 */
1599 1599 dnode_rele(dn, db);
1600 1600 db->db_dnode_handle = NULL;
1601 1601 } else {
1602 1602 DB_DNODE_EXIT(db);
1603 1603 }
1604 1604
1605 1605 if (db->db_buf)
1606 1606 dbuf_gone = arc_buf_evict(db->db_buf);
1607 1607
1608 1608 if (!dbuf_gone)
1609 1609 mutex_exit(&db->db_mtx);
1610 1610
1611 1611 /*
1612 1612 * If this dbuf is referenced from an indirect dbuf,
1613 1613 * decrement the ref count on the indirect dbuf.
1614 1614 */
1615 1615 if (parent && parent != dndb)
1616 1616 dbuf_rele(parent, db);
1617 1617 }
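
The diff hunk in dbuf_clear() above is the heart of this change: the decremented value of dn_dbufs_count is discarded, so the _nv ("new value") form asks the platform for a result that nobody reads. A minimal standalone sketch of the distinction, using the illumos <atomic.h> interfaces (the counter and function names below are illustrative, not ZFS code):

	#include <atomic.h>
	#include <sys/types.h>

	static volatile uint32_t example_count;

	/*
	 * atomic_dec_32_nv() returns the post-decrement value; use it
	 * when that value drives a decision, e.g. tearing down state
	 * on the last release.
	 */
	static boolean_t
	example_rele(void)
	{
		return (atomic_dec_32_nv(&example_count) == 0);
	}

	/*
	 * When the new value is irrelevant, atomic_dec_32() documents
	 * that fact and avoids fetching a result that is thrown away.
	 */
	static void
	example_forget(void)
	{
		atomic_dec_32(&example_count);
	}
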
1618 1618
1619 1619 static int
1620 1620 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1621 1621 dmu_buf_impl_t **parentp, blkptr_t **bpp)
1622 1622 {
1623 1623 int nlevels, epbs;
1624 1624
1625 1625 *parentp = NULL;
1626 1626 *bpp = NULL;
1627 1627
1628 1628 ASSERT(blkid != DMU_BONUS_BLKID);
1629 1629
1630 1630 if (blkid == DMU_SPILL_BLKID) {
1631 1631 mutex_enter(&dn->dn_mtx);
1632 1632 if (dn->dn_have_spill &&
1633 1633 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1634 1634 *bpp = &dn->dn_phys->dn_spill;
1635 1635 else
1636 1636 *bpp = NULL;
1637 1637 dbuf_add_ref(dn->dn_dbuf, NULL);
1638 1638 *parentp = dn->dn_dbuf;
1639 1639 mutex_exit(&dn->dn_mtx);
1640 1640 return (0);
1641 1641 }
1642 1642
1643 1643 if (dn->dn_phys->dn_nlevels == 0)
1644 1644 nlevels = 1;
1645 1645 else
1646 1646 nlevels = dn->dn_phys->dn_nlevels;
1647 1647
1648 1648 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1649 1649
1650 1650 ASSERT3U(level * epbs, <, 64);
1651 1651 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1652 1652 if (level >= nlevels ||
1653 1653 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1654 1654 /* the buffer has no parent yet */
1655 1655 return (SET_ERROR(ENOENT));
1656 1656 } else if (level < nlevels-1) {
1657 1657 /* this block is referenced from an indirect block */
1658 1658 int err = dbuf_hold_impl(dn, level+1,
1659 1659 blkid >> epbs, fail_sparse, NULL, parentp);
1660 1660 if (err)
1661 1661 return (err);
1662 1662 err = dbuf_read(*parentp, NULL,
1663 1663 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1664 1664 if (err) {
1665 1665 dbuf_rele(*parentp, NULL);
1666 1666 *parentp = NULL;
1667 1667 return (err);
1668 1668 }
1669 1669 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1670 1670 (blkid & ((1ULL << epbs) - 1));
1671 1671 return (0);
1672 1672 } else {
1673 1673 /* the block is referenced from the dnode */
1674 1674 ASSERT3U(level, ==, nlevels-1);
1675 1675 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1676 1676 blkid < dn->dn_phys->dn_nblkptr);
1677 1677 if (dn->dn_dbuf) {
1678 1678 dbuf_add_ref(dn->dn_dbuf, NULL);
1679 1679 *parentp = dn->dn_dbuf;
1680 1680 }
1681 1681 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1682 1682 return (0);
1683 1683 }
1684 1684 }
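
The arithmetic in dbuf_findbp() maps a block id at one level to its parent one level up: epbs is log2 of the number of block pointers per indirect block, the parent's block id is blkid >> epbs, and the child's slot within the parent is blkid & ((1ULL << epbs) - 1). A small worked example (the indblkshift value is chosen for illustration):

	#include <stdio.h>
	#include <sys/types.h>

	#define	SPA_BLKPTRSHIFT	7	/* sizeof (blkptr_t) == 128 */

	int
	main(void)
	{
		int indblkshift = 14;			/* 16K indirect blocks */
		int epbs = indblkshift - SPA_BLKPTRSHIFT; /* 7: 128 bps/block */
		uint64_t blkid = 1000;

		/* 1000 >> 7 == 7: the parent is L1 block 7 ... */
		printf("parent blkid: %llu\n",
		    (unsigned long long)(blkid >> epbs));
		/* ... and 1000 & 127 == 104: slot 104 within that parent. */
		printf("slot in parent: %llu\n",
		    (unsigned long long)(blkid & ((1ULL << epbs) - 1)));
		return (0);
	}
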
1685 1685
1686 1686 static dmu_buf_impl_t *
1687 1687 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1688 1688 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1689 1689 {
1690 1690 objset_t *os = dn->dn_objset;
1691 1691 dmu_buf_impl_t *db, *odb;
1692 1692
1693 1693 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1694 1694 ASSERT(dn->dn_type != DMU_OT_NONE);
1695 1695
1696 1696 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1697 1697
1698 1698 db->db_objset = os;
1699 1699 db->db.db_object = dn->dn_object;
1700 1700 db->db_level = level;
1701 1701 db->db_blkid = blkid;
1702 1702 db->db_last_dirty = NULL;
1703 1703 db->db_dirtycnt = 0;
1704 1704 db->db_dnode_handle = dn->dn_handle;
1705 1705 db->db_parent = parent;
1706 1706 db->db_blkptr = blkptr;
1707 1707
1708 1708 db->db_user_ptr = NULL;
1709 1709 db->db_user_data_ptr_ptr = NULL;
1710 1710 db->db_evict_func = NULL;
1711 1711 db->db_immediate_evict = 0;
1712 1712 db->db_freed_in_flight = 0;
1713 1713
1714 1714 if (blkid == DMU_BONUS_BLKID) {
1715 1715 ASSERT3P(parent, ==, dn->dn_dbuf);
1716 1716 db->db.db_size = DN_MAX_BONUSLEN -
1717 1717 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1718 1718 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1719 1719 db->db.db_offset = DMU_BONUS_BLKID;
1720 1720 db->db_state = DB_UNCACHED;
1721 1721 /* the bonus dbuf is not placed in the hash table */
1722 1722 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1723 1723 return (db);
1724 1724 } else if (blkid == DMU_SPILL_BLKID) {
1725 1725 db->db.db_size = (blkptr != NULL) ?
1726 1726 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1727 1727 db->db.db_offset = 0;
1728 1728 } else {
1729 1729 int blocksize =
1730 1730 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1731 1731 db->db.db_size = blocksize;
1732 1732 db->db.db_offset = db->db_blkid * blocksize;
1733 1733 }
1734 1734
1735 1735 /*
1736 1736 	 * Hold the dn_dbufs_mtx while we insert the new dbuf
1737 1737 	 * in the hash table *and* add it to the dn_dbufs list.
1738 1738 	 * This prevents a possible deadlock with someone
1739 1739 	 * trying to look up this dbuf before it's added to the
1740 1740 * dn_dbufs list.
1741 1741 */
1742 1742 mutex_enter(&dn->dn_dbufs_mtx);
1743 1743 db->db_state = DB_EVICTING;
1744 1744 if ((odb = dbuf_hash_insert(db)) != NULL) {
1745 1745 /* someone else inserted it first */
1746 1746 kmem_cache_free(dbuf_cache, db);
1747 1747 mutex_exit(&dn->dn_dbufs_mtx);
1748 1748 return (odb);
1749 1749 }
1750 1750 list_insert_head(&dn->dn_dbufs, db);
1751 1751 if (db->db_level == 0 && db->db_blkid >=
1752 1752 dn->dn_unlisted_l0_blkid)
1753 1753 dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1754 1754 db->db_state = DB_UNCACHED;
1755 1755 mutex_exit(&dn->dn_dbufs_mtx);
1756 1756 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1757 1757
1758 1758 if (parent && parent != dn->dn_dbuf)
1759 1759 dbuf_add_ref(parent, db);
1760 1760
1761 1761 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1762 1762 refcount_count(&dn->dn_holds) > 0);
1763 1763 (void) refcount_add(&dn->dn_holds, db);
1764      -	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
     1764 +	atomic_inc_32(&dn->dn_dbufs_count);
1765 1765
1766 1766 dprintf_dbuf(db, "db=%p\n", db);
1767 1767
1768 1768 return (db);
1769 1769 }
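
When dbuf_hash_insert() reports that another thread won the race, dbuf_create() simply frees its own allocation and returns the winner. The same allocate-then-insert-or-adopt pattern, reduced to generic userland C (all names below are illustrative, not ZFS interfaces):

	#include <stdlib.h>
	#include <pthread.h>

	struct obj { int key; struct obj *next; };

	static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct obj *table;

	/* Return the object for key, creating it if no one else has. */
	static struct obj *
	obj_hold(int key)
	{
		struct obj *o;
		struct obj *nobj = malloc(sizeof (*nobj));

		if (nobj == NULL)
			return (NULL);
		nobj->key = key;
		pthread_mutex_lock(&table_lock);
		for (o = table; o != NULL; o = o->next) {
			if (o->key == key) {
				/* someone else inserted it first */
				pthread_mutex_unlock(&table_lock);
				free(nobj);
				return (o);
			}
		}
		nobj->next = table;
		table = nobj;
		pthread_mutex_unlock(&table_lock);
		return (nobj);
	}
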
1770 1770
1771 1771 static int
1772 1772 dbuf_do_evict(void *private)
1773 1773 {
1774 1774 arc_buf_t *buf = private;
1775 1775 dmu_buf_impl_t *db = buf->b_private;
1776 1776
1777 1777 if (!MUTEX_HELD(&db->db_mtx))
1778 1778 mutex_enter(&db->db_mtx);
1779 1779
1780 1780 ASSERT(refcount_is_zero(&db->db_holds));
1781 1781
1782 1782 if (db->db_state != DB_EVICTING) {
1783 1783 ASSERT(db->db_state == DB_CACHED);
1784 1784 DBUF_VERIFY(db);
1785 1785 db->db_buf = NULL;
1786 1786 dbuf_evict(db);
1787 1787 } else {
1788 1788 mutex_exit(&db->db_mtx);
1789 1789 dbuf_destroy(db);
1790 1790 }
1791 1791 return (0);
1792 1792 }
1793 1793
1794 1794 static void
1795 1795 dbuf_destroy(dmu_buf_impl_t *db)
1796 1796 {
1797 1797 ASSERT(refcount_is_zero(&db->db_holds));
1798 1798
1799 1799 if (db->db_blkid != DMU_BONUS_BLKID) {
1800 1800 /*
1801 1801 * If this dbuf is still on the dn_dbufs list,
1802 1802 * remove it from that list.
1803 1803 */
1804 1804 if (db->db_dnode_handle != NULL) {
1805 1805 dnode_t *dn;
1806 1806
1807 1807 DB_DNODE_ENTER(db);
1808 1808 dn = DB_DNODE(db);
1809 1809 mutex_enter(&dn->dn_dbufs_mtx);
1810 1810 list_remove(&dn->dn_dbufs, db);
1811 - (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1811 + atomic_dec_32(&dn->dn_dbufs_count);
1812 1812 mutex_exit(&dn->dn_dbufs_mtx);
1813 1813 DB_DNODE_EXIT(db);
1814 1814 /*
1815 1815 * Decrementing the dbuf count means that the hold
1816 1816 * corresponding to the removed dbuf is no longer
1817 1817 * discounted in dnode_move(), so the dnode cannot be
1818 1818 * moved until after we release the hold.
1819 1819 */
1820 1820 dnode_rele(dn, db);
1821 1821 db->db_dnode_handle = NULL;
1822 1822 }
1823 1823 dbuf_hash_remove(db);
1824 1824 }
1825 1825 db->db_parent = NULL;
1826 1826 db->db_buf = NULL;
1827 1827
1828 1828 ASSERT(!list_link_active(&db->db_link));
1829 1829 ASSERT(db->db.db_data == NULL);
1830 1830 ASSERT(db->db_hash_next == NULL);
1831 1831 ASSERT(db->db_blkptr == NULL);
1832 1832 ASSERT(db->db_data_pending == NULL);
1833 1833
1834 1834 kmem_cache_free(dbuf_cache, db);
1835 1835 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1836 1836 }
1837 1837
1838 1838 void
1839 1839 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1840 1840 {
1841 1841 dmu_buf_impl_t *db = NULL;
1842 1842 blkptr_t *bp = NULL;
1843 1843
1844 1844 ASSERT(blkid != DMU_BONUS_BLKID);
1845 1845 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1846 1846
1847 1847 if (dnode_block_freed(dn, blkid))
1848 1848 return;
1849 1849
1850 1850 /* dbuf_find() returns with db_mtx held */
1851 1851 if (db = dbuf_find(dn, 0, blkid)) {
1852 1852 /*
1853 1853 * This dbuf is already in the cache. We assume that
1854 1854 * it is already CACHED, or else about to be either
1855 1855 * read or filled.
1856 1856 */
1857 1857 mutex_exit(&db->db_mtx);
1858 1858 return;
1859 1859 }
1860 1860
1861 1861 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1862 1862 if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
1863 1863 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1864 1864 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1865 1865 zbookmark_phys_t zb;
1866 1866
1867 1867 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1868 1868 dn->dn_object, 0, blkid);
1869 1869
1870 1870 (void) arc_read(NULL, dn->dn_objset->os_spa,
1871 1871 bp, NULL, NULL, prio,
1872 1872 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1873 1873 &aflags, &zb);
1874 1874 }
1875 1875 if (db)
1876 1876 dbuf_rele(db, NULL);
1877 1877 }
1878 1878 }
1879 1879
1880 1880 /*
1881 1881 * Returns with db_holds incremented, and db_mtx not held.
1882 1882 * Note: dn_struct_rwlock must be held.
1883 1883 */
1884 1884 int
1885 1885 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1886 1886 void *tag, dmu_buf_impl_t **dbp)
1887 1887 {
1888 1888 dmu_buf_impl_t *db, *parent = NULL;
1889 1889
1890 1890 ASSERT(blkid != DMU_BONUS_BLKID);
1891 1891 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1892 1892 ASSERT3U(dn->dn_nlevels, >, level);
1893 1893
1894 1894 *dbp = NULL;
1895 1895 top:
1896 1896 /* dbuf_find() returns with db_mtx held */
1897 1897 db = dbuf_find(dn, level, blkid);
1898 1898
1899 1899 if (db == NULL) {
1900 1900 blkptr_t *bp = NULL;
1901 1901 int err;
1902 1902
1903 1903 ASSERT3P(parent, ==, NULL);
1904 1904 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1905 1905 if (fail_sparse) {
1906 1906 if (err == 0 && bp && BP_IS_HOLE(bp))
1907 1907 err = SET_ERROR(ENOENT);
1908 1908 if (err) {
1909 1909 if (parent)
1910 1910 dbuf_rele(parent, NULL);
1911 1911 return (err);
1912 1912 }
1913 1913 }
1914 1914 if (err && err != ENOENT)
1915 1915 return (err);
1916 1916 db = dbuf_create(dn, level, blkid, parent, bp);
1917 1917 }
1918 1918
1919 1919 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1920 1920 arc_buf_add_ref(db->db_buf, db);
1921 1921 if (db->db_buf->b_data == NULL) {
1922 1922 dbuf_clear(db);
1923 1923 if (parent) {
1924 1924 dbuf_rele(parent, NULL);
1925 1925 parent = NULL;
1926 1926 }
1927 1927 goto top;
1928 1928 }
1929 1929 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1930 1930 }
1931 1931
1932 1932 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1933 1933
1934 1934 /*
1935 1935 	 * If this buffer is currently syncing out, and we are
1936 1936 * still referencing it from db_data, we need to make a copy
1937 1937 * of it in case we decide we want to dirty it again in this txg.
1938 1938 */
1939 1939 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1940 1940 dn->dn_object != DMU_META_DNODE_OBJECT &&
1941 1941 db->db_state == DB_CACHED && db->db_data_pending) {
1942 1942 dbuf_dirty_record_t *dr = db->db_data_pending;
1943 1943
1944 1944 if (dr->dt.dl.dr_data == db->db_buf) {
1945 1945 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1946 1946
1947 1947 dbuf_set_data(db,
1948 1948 arc_buf_alloc(dn->dn_objset->os_spa,
1949 1949 db->db.db_size, db, type));
1950 1950 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1951 1951 db->db.db_size);
1952 1952 }
1953 1953 }
1954 1954
1955 1955 (void) refcount_add(&db->db_holds, tag);
1956 1956 dbuf_update_data(db);
1957 1957 DBUF_VERIFY(db);
1958 1958 mutex_exit(&db->db_mtx);
1959 1959
1960 1960 /* NOTE: we can't rele the parent until after we drop the db_mtx */
1961 1961 if (parent)
1962 1962 dbuf_rele(parent, NULL);
1963 1963
1964 1964 ASSERT3P(DB_DNODE(db), ==, dn);
1965 1965 ASSERT3U(db->db_blkid, ==, blkid);
1966 1966 ASSERT3U(db->db_level, ==, level);
1967 1967 *dbp = db;
1968 1968
1969 1969 return (0);
1970 1970 }
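
A hedged in-tree sketch of the contract stated in the comment above dbuf_hold_impl(): the caller holds dn_struct_rwlock across the call, gets the dbuf back with a hold taken and db_mtx dropped, and must balance the hold with dbuf_rele() under the same tag (the wrapper function below is illustrative):

	static int
	example_touch_block(dnode_t *dn, uint64_t blkid, void *tag)
	{
		dmu_buf_impl_t *db;
		int err;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
		rw_exit(&dn->dn_struct_rwlock);
		if (err != 0)
			return (err);

		/* db_holds is incremented and db_mtx is not held here. */
		/* ... use db ... */

		dbuf_rele(db, tag);
		return (0);
	}
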
1971 1971
1972 1972 dmu_buf_impl_t *
1973 1973 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1974 1974 {
1975 1975 dmu_buf_impl_t *db;
1976 1976 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1977 1977 return (err ? NULL : db);
1978 1978 }
1979 1979
1980 1980 dmu_buf_impl_t *
1981 1981 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1982 1982 {
1983 1983 dmu_buf_impl_t *db;
1984 1984 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1985 1985 return (err ? NULL : db);
1986 1986 }
1987 1987
1988 1988 void
1989 1989 dbuf_create_bonus(dnode_t *dn)
1990 1990 {
1991 1991 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1992 1992
1993 1993 ASSERT(dn->dn_bonus == NULL);
1994 1994 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1995 1995 }
1996 1996
1997 1997 int
1998 1998 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1999 1999 {
2000 2000 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2001 2001 dnode_t *dn;
2002 2002
2003 2003 if (db->db_blkid != DMU_SPILL_BLKID)
2004 2004 return (SET_ERROR(ENOTSUP));
2005 2005 if (blksz == 0)
2006 2006 blksz = SPA_MINBLOCKSIZE;
2007 2007 if (blksz > SPA_MAXBLOCKSIZE)
2008 2008 blksz = SPA_MAXBLOCKSIZE;
2009 2009 else
2010 2010 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2011 2011
2012 2012 DB_DNODE_ENTER(db);
2013 2013 dn = DB_DNODE(db);
2014 2014 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2015 2015 dbuf_new_size(db, blksz, tx);
2016 2016 rw_exit(&dn->dn_struct_rwlock);
2017 2017 DB_DNODE_EXIT(db);
2018 2018
2019 2019 return (0);
2020 2020 }
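
The clamping in dbuf_spill_set_blksz() rounds a requested spill block size up to a multiple of SPA_MINBLOCKSIZE (512 bytes). A small worked example of P2ROUNDUP as defined in illumos sys/sysmacros.h:

	#include <stdio.h>

	#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))
	#define	SPA_MINBLOCKSIZE	512

	int
	main(void)
	{
		/* 1000 is not 512-aligned; it rounds up to 1024. */
		printf("%d\n", P2ROUNDUP(1000, SPA_MINBLOCKSIZE));
		/* Already-aligned sizes are unchanged: 1536 stays 1536. */
		printf("%d\n", P2ROUNDUP(1536, SPA_MINBLOCKSIZE));
		return (0);
	}
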
2021 2021
2022 2022 void
2023 2023 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2024 2024 {
2025 2025 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2026 2026 }
2027 2027
2028 2028 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2029 2029 void
2030 2030 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2031 2031 {
2032 2032 int64_t holds = refcount_add(&db->db_holds, tag);
2033 2033 ASSERT(holds > 1);
2034 2034 }
2035 2035
2036 2036 /*
2037 2037 * If you call dbuf_rele() you had better not be referencing the dnode handle
2038 2038 * unless you have some other direct or indirect hold on the dnode. (An indirect
2039 2039 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2040 2040 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2041 2041 * dnode's parent dbuf evicting its dnode handles.
2042 2042 */
2043 2043 void
2044 2044 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2045 2045 {
2046 2046 mutex_enter(&db->db_mtx);
2047 2047 dbuf_rele_and_unlock(db, tag);
2048 2048 }
2049 2049
2050 2050 void
2051 2051 dmu_buf_rele(dmu_buf_t *db, void *tag)
2052 2052 {
2053 2053 dbuf_rele((dmu_buf_impl_t *)db, tag);
2054 2054 }
2055 2055
2056 2056 /*
2057 2057 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2058 2058 * db_dirtycnt and db_holds to be updated atomically.
2059 2059 */
2060 2060 void
2061 2061 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2062 2062 {
2063 2063 int64_t holds;
2064 2064
2065 2065 ASSERT(MUTEX_HELD(&db->db_mtx));
2066 2066 DBUF_VERIFY(db);
2067 2067
2068 2068 /*
2069 2069 * Remove the reference to the dbuf before removing its hold on the
2070 2070 * dnode so we can guarantee in dnode_move() that a referenced bonus
2071 2071 * buffer has a corresponding dnode hold.
2072 2072 */
2073 2073 holds = refcount_remove(&db->db_holds, tag);
2074 2074 ASSERT(holds >= 0);
2075 2075
2076 2076 /*
2077 2077 * We can't freeze indirects if there is a possibility that they
2078 2078 * may be modified in the current syncing context.
2079 2079 */
2080 2080 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2081 2081 arc_buf_freeze(db->db_buf);
2082 2082
2083 2083 if (holds == db->db_dirtycnt &&
2084 2084 db->db_level == 0 && db->db_immediate_evict)
2085 2085 dbuf_evict_user(db);
2086 2086
2087 2087 if (holds == 0) {
2088 2088 if (db->db_blkid == DMU_BONUS_BLKID) {
2089 2089 mutex_exit(&db->db_mtx);
2090 2090
2091 2091 /*
2092 2092 * If the dnode moves here, we cannot cross this barrier
2093 2093 * until the move completes.
2094 2094 */
2095 2095 DB_DNODE_ENTER(db);
2096 - (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2096 + atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count);
2097 2097 DB_DNODE_EXIT(db);
2098 2098 /*
2099 2099 * The bonus buffer's dnode hold is no longer discounted
2100 2100 * in dnode_move(). The dnode cannot move until after
2101 2101 * the dnode_rele().
2102 2102 */
2103 2103 dnode_rele(DB_DNODE(db), db);
2104 2104 } else if (db->db_buf == NULL) {
2105 2105 /*
2106 2106 * This is a special case: we never associated this
2107 2107 * dbuf with any data allocated from the ARC.
2108 2108 */
2109 2109 ASSERT(db->db_state == DB_UNCACHED ||
2110 2110 db->db_state == DB_NOFILL);
2111 2111 dbuf_evict(db);
2112 2112 } else if (arc_released(db->db_buf)) {
2113 2113 arc_buf_t *buf = db->db_buf;
2114 2114 /*
2115 2115 * This dbuf has anonymous data associated with it.
2116 2116 */
2117 2117 dbuf_set_data(db, NULL);
2118 2118 VERIFY(arc_buf_remove_ref(buf, db));
2119 2119 dbuf_evict(db);
2120 2120 } else {
2121 2121 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2122 2122
2123 2123 /*
2124 2124 * A dbuf will be eligible for eviction if either the
2125 2125 * 'primarycache' property is set or a duplicate
2126 2126 * copy of this buffer is already cached in the arc.
2127 2127 *
2128 2128 		 * In the case of the 'primarycache' property, a buffer
2129 2129 * is considered for eviction if it matches the
2130 2130 * criteria set in the property.
2131 2131 *
2132 2132 * To decide if our buffer is considered a
2133 2133 * duplicate, we must call into the arc to determine
2134 2134 * if multiple buffers are referencing the same
2135 2135 * block on-disk. If so, then we simply evict
2136 2136 * ourselves.
2137 2137 */
2138 2138 if (!DBUF_IS_CACHEABLE(db) ||
2139 2139 arc_buf_eviction_needed(db->db_buf))
2140 2140 dbuf_clear(db);
2141 2141 else
2142 2142 mutex_exit(&db->db_mtx);
2143 2143 }
2144 2144 } else {
2145 2145 mutex_exit(&db->db_mtx);
2146 2146 }
2147 2147 }
2148 2148
2149 2149 #pragma weak dmu_buf_refcount = dbuf_refcount
2150 2150 uint64_t
2151 2151 dbuf_refcount(dmu_buf_impl_t *db)
2152 2152 {
2153 2153 return (refcount_count(&db->db_holds));
2154 2154 }
2155 2155
2156 2156 void *
2157 2157 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2158 2158 dmu_buf_evict_func_t *evict_func)
2159 2159 {
2160 2160 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2161 2161 user_data_ptr_ptr, evict_func));
2162 2162 }
2163 2163
2164 2164 void *
2165 2165 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2166 2166 dmu_buf_evict_func_t *evict_func)
2167 2167 {
2168 2168 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2169 2169
2170 2170 db->db_immediate_evict = TRUE;
2171 2171 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2172 2172 user_data_ptr_ptr, evict_func));
2173 2173 }
2174 2174
2175 2175 void *
2176 2176 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2177 2177 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2178 2178 {
2179 2179 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2180 2180 ASSERT(db->db_level == 0);
2181 2181
2182 2182 ASSERT((user_ptr == NULL) == (evict_func == NULL));
2183 2183
2184 2184 mutex_enter(&db->db_mtx);
2185 2185
2186 2186 if (db->db_user_ptr == old_user_ptr) {
2187 2187 db->db_user_ptr = user_ptr;
2188 2188 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2189 2189 db->db_evict_func = evict_func;
2190 2190
2191 2191 dbuf_update_data(db);
2192 2192 } else {
2193 2193 old_user_ptr = db->db_user_ptr;
2194 2194 }
2195 2195
2196 2196 mutex_exit(&db->db_mtx);
2197 2197 return (old_user_ptr);
2198 2198 }
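
dmu_buf_update_user() behaves like a compare-and-swap on the dbuf's user pointer: the update is applied only if db_user_ptr still equals old_user_ptr, and the return value tells the caller who won. A hedged in-tree sketch of the registration pattern (the structure and callback below are illustrative):

	typedef struct my_user {
		int mu_refs;	/* illustrative payload */
	} my_user_t;

	/* ARGSUSED */
	static void
	my_user_evict(dmu_buf_t *db, void *user_ptr)
	{
		/* the dbuf is going away; free our per-buffer state */
		kmem_free(user_ptr, sizeof (my_user_t));
	}

	static my_user_t *
	my_user_attach(dmu_buf_t *db)
	{
		my_user_t *mu = kmem_zalloc(sizeof (*mu), KM_SLEEP);
		my_user_t *winner;

		/*
		 * dmu_buf_set_user() passes NULL as old_user_ptr, so it
		 * succeeds only if nothing is attached yet and returns
		 * NULL; otherwise it returns the existing user pointer.
		 */
		winner = dmu_buf_set_user(db, mu, NULL, my_user_evict);
		if (winner != NULL) {
			/* lost the race; adopt the existing state */
			kmem_free(mu, sizeof (*mu));
			return (winner);
		}
		return (mu);
	}
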
2199 2199
2200 2200 void *
2201 2201 dmu_buf_get_user(dmu_buf_t *db_fake)
2202 2202 {
2203 2203 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2204 2204 ASSERT(!refcount_is_zero(&db->db_holds));
2205 2205
2206 2206 return (db->db_user_ptr);
2207 2207 }
2208 2208
2209 2209 boolean_t
2210 2210 dmu_buf_freeable(dmu_buf_t *dbuf)
2211 2211 {
2212 2212 boolean_t res = B_FALSE;
2213 2213 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2214 2214
2215 2215 if (db->db_blkptr)
2216 2216 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2217 2217 db->db_blkptr, db->db_blkptr->blk_birth);
2218 2218
2219 2219 return (res);
2220 2220 }
2221 2221
2222 2222 blkptr_t *
2223 2223 dmu_buf_get_blkptr(dmu_buf_t *db)
2224 2224 {
2225 2225 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2226 2226 return (dbi->db_blkptr);
2227 2227 }
2228 2228
2229 2229 static void
2230 2230 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2231 2231 {
2232 2232 /* ASSERT(dmu_tx_is_syncing(tx) */
2233 2233 ASSERT(MUTEX_HELD(&db->db_mtx));
2234 2234
2235 2235 if (db->db_blkptr != NULL)
2236 2236 return;
2237 2237
2238 2238 if (db->db_blkid == DMU_SPILL_BLKID) {
2239 2239 db->db_blkptr = &dn->dn_phys->dn_spill;
2240 2240 BP_ZERO(db->db_blkptr);
2241 2241 return;
2242 2242 }
2243 2243 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2244 2244 /*
2245 2245 		 * This buffer was allocated at a time when there were
2246 2246 		 * no blkptrs available from the dnode, or it was
2247 2247 		 * inappropriate to hook it in (i.e., an nlevels mismatch).
2248 2248 */
2249 2249 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2250 2250 ASSERT(db->db_parent == NULL);
2251 2251 db->db_parent = dn->dn_dbuf;
2252 2252 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2253 2253 DBUF_VERIFY(db);
2254 2254 } else {
2255 2255 dmu_buf_impl_t *parent = db->db_parent;
2256 2256 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2257 2257
2258 2258 ASSERT(dn->dn_phys->dn_nlevels > 1);
2259 2259 if (parent == NULL) {
2260 2260 mutex_exit(&db->db_mtx);
2261 2261 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2262 2262 (void) dbuf_hold_impl(dn, db->db_level+1,
2263 2263 db->db_blkid >> epbs, FALSE, db, &parent);
2264 2264 rw_exit(&dn->dn_struct_rwlock);
2265 2265 mutex_enter(&db->db_mtx);
2266 2266 db->db_parent = parent;
2267 2267 }
2268 2268 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2269 2269 (db->db_blkid & ((1ULL << epbs) - 1));
2270 2270 DBUF_VERIFY(db);
2271 2271 }
2272 2272 }
2273 2273
2274 2274 static void
2275 2275 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2276 2276 {
2277 2277 dmu_buf_impl_t *db = dr->dr_dbuf;
2278 2278 dnode_t *dn;
2279 2279 zio_t *zio;
2280 2280
2281 2281 ASSERT(dmu_tx_is_syncing(tx));
2282 2282
2283 2283 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2284 2284
2285 2285 mutex_enter(&db->db_mtx);
2286 2286
2287 2287 ASSERT(db->db_level > 0);
2288 2288 DBUF_VERIFY(db);
2289 2289
2290 2290 /* Read the block if it hasn't been read yet. */
2291 2291 if (db->db_buf == NULL) {
2292 2292 mutex_exit(&db->db_mtx);
2293 2293 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2294 2294 mutex_enter(&db->db_mtx);
2295 2295 }
2296 2296 ASSERT3U(db->db_state, ==, DB_CACHED);
2297 2297 ASSERT(db->db_buf != NULL);
2298 2298
2299 2299 DB_DNODE_ENTER(db);
2300 2300 dn = DB_DNODE(db);
2301 2301 /* Indirect block size must match what the dnode thinks it is. */
2302 2302 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2303 2303 dbuf_check_blkptr(dn, db);
2304 2304 DB_DNODE_EXIT(db);
2305 2305
2306 2306 /* Provide the pending dirty record to child dbufs */
2307 2307 db->db_data_pending = dr;
2308 2308
2309 2309 mutex_exit(&db->db_mtx);
2310 2310 dbuf_write(dr, db->db_buf, tx);
2311 2311
2312 2312 zio = dr->dr_zio;
2313 2313 mutex_enter(&dr->dt.di.dr_mtx);
2314 2314 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2315 2315 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2316 2316 mutex_exit(&dr->dt.di.dr_mtx);
2317 2317 zio_nowait(zio);
2318 2318 }
2319 2319
2320 2320 static void
2321 2321 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2322 2322 {
2323 2323 arc_buf_t **datap = &dr->dt.dl.dr_data;
2324 2324 dmu_buf_impl_t *db = dr->dr_dbuf;
2325 2325 dnode_t *dn;
2326 2326 objset_t *os;
2327 2327 uint64_t txg = tx->tx_txg;
2328 2328
2329 2329 ASSERT(dmu_tx_is_syncing(tx));
2330 2330
2331 2331 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2332 2332
2333 2333 mutex_enter(&db->db_mtx);
2334 2334 /*
2335 2335 * To be synced, we must be dirtied. But we
2336 2336 	 * might have been freed after being dirtied.
2337 2337 */
2338 2338 if (db->db_state == DB_UNCACHED) {
2339 2339 /* This buffer has been freed since it was dirtied */
2340 2340 ASSERT(db->db.db_data == NULL);
2341 2341 } else if (db->db_state == DB_FILL) {
2342 2342 /* This buffer was freed and is now being re-filled */
2343 2343 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2344 2344 } else {
2345 2345 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2346 2346 }
2347 2347 DBUF_VERIFY(db);
2348 2348
2349 2349 DB_DNODE_ENTER(db);
2350 2350 dn = DB_DNODE(db);
2351 2351
2352 2352 if (db->db_blkid == DMU_SPILL_BLKID) {
2353 2353 mutex_enter(&dn->dn_mtx);
2354 2354 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2355 2355 mutex_exit(&dn->dn_mtx);
2356 2356 }
2357 2357
2358 2358 /*
2359 2359 * If this is a bonus buffer, simply copy the bonus data into the
2360 2360 * dnode. It will be written out when the dnode is synced (and it
2361 2361 * will be synced, since it must have been dirty for dbuf_sync to
2362 2362 * be called).
2363 2363 */
2364 2364 if (db->db_blkid == DMU_BONUS_BLKID) {
2365 2365 dbuf_dirty_record_t **drp;
2366 2366
2367 2367 ASSERT(*datap != NULL);
2368 2368 ASSERT0(db->db_level);
2369 2369 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2370 2370 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2371 2371 DB_DNODE_EXIT(db);
2372 2372
2373 2373 if (*datap != db->db.db_data) {
2374 2374 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2375 2375 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2376 2376 }
2377 2377 db->db_data_pending = NULL;
2378 2378 drp = &db->db_last_dirty;
2379 2379 while (*drp != dr)
2380 2380 drp = &(*drp)->dr_next;
2381 2381 ASSERT(dr->dr_next == NULL);
2382 2382 ASSERT(dr->dr_dbuf == db);
2383 2383 *drp = dr->dr_next;
2384 2384 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2385 2385 ASSERT(db->db_dirtycnt > 0);
2386 2386 db->db_dirtycnt -= 1;
2387 2387 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2388 2388 return;
2389 2389 }
2390 2390
2391 2391 os = dn->dn_objset;
2392 2392
2393 2393 /*
2394 2394 * This function may have dropped the db_mtx lock allowing a dmu_sync
2395 2395 * operation to sneak in. As a result, we need to ensure that we
2396 2396 * don't check the dr_override_state until we have returned from
2397 2397 * dbuf_check_blkptr.
2398 2398 */
2399 2399 dbuf_check_blkptr(dn, db);
2400 2400
2401 2401 /*
2402 2402 * If this buffer is in the middle of an immediate write,
2403 2403 * wait for the synchronous IO to complete.
2404 2404 */
2405 2405 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2406 2406 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2407 2407 cv_wait(&db->db_changed, &db->db_mtx);
2408 2408 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2409 2409 }
2410 2410
2411 2411 if (db->db_state != DB_NOFILL &&
2412 2412 dn->dn_object != DMU_META_DNODE_OBJECT &&
2413 2413 refcount_count(&db->db_holds) > 1 &&
2414 2414 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2415 2415 *datap == db->db_buf) {
2416 2416 /*
2417 2417 * If this buffer is currently "in use" (i.e., there
2418 2418 * are active holds and db_data still references it),
2419 2419 * then make a copy before we start the write so that
2420 2420 * any modifications from the open txg will not leak
2421 2421 * into this write.
2422 2422 *
2423 2423 * NOTE: this copy does not need to be made for
2424 2424 * objects only modified in the syncing context (e.g.
2425 2425 		 * DNODE blocks).
2426 2426 */
2427 2427 int blksz = arc_buf_size(*datap);
2428 2428 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2429 2429 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2430 2430 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2431 2431 }
2432 2432 db->db_data_pending = dr;
2433 2433
2434 2434 mutex_exit(&db->db_mtx);
2435 2435
2436 2436 dbuf_write(dr, *datap, tx);
2437 2437
2438 2438 ASSERT(!list_link_active(&dr->dr_dirty_node));
2439 2439 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2440 2440 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2441 2441 DB_DNODE_EXIT(db);
2442 2442 } else {
2443 2443 /*
2444 2444 * Although zio_nowait() does not "wait for an IO", it does
2445 2445 * initiate the IO. If this is an empty write it seems plausible
2446 2446 * that the IO could actually be completed before the nowait
2447 2447 * returns. We need to DB_DNODE_EXIT() first in case
2448 2448 * zio_nowait() invalidates the dbuf.
2449 2449 */
2450 2450 DB_DNODE_EXIT(db);
2451 2451 zio_nowait(dr->dr_zio);
2452 2452 }
2453 2453 }
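
The copy made in dbuf_sync_leaf() exists because holders in the open transaction group may keep modifying db_data while the syncing write is in flight. A generic userland analogy of the same snapshot-before-async-write idea (not ZFS code; start_async_write() is a hypothetical consumer that takes ownership of its buffer):

	#include <stdlib.h>
	#include <string.h>

	extern void start_async_write(void *data, size_t len);

	/*
	 * If other holders may still modify 'live' while the write is
	 * in flight, snapshot it first so their changes don't leak
	 * into the block being written for this txg.
	 */
	static void
	sync_buffer(void *live, size_t len, int in_use)
	{
		void *towrite = live;

		if (in_use) {
			towrite = malloc(len);	/* sketch: no error check */
			memcpy(towrite, live, len);
		}
		start_async_write(towrite, len);
	}
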
2454 2454
2455 2455 void
2456 2456 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2457 2457 {
2458 2458 dbuf_dirty_record_t *dr;
2459 2459
2460 2460 while (dr = list_head(list)) {
2461 2461 if (dr->dr_zio != NULL) {
2462 2462 /*
2463 2463 * If we find an already initialized zio then we
2464 2464 * are processing the meta-dnode, and we have finished.
2465 2465 * The dbufs for all dnodes are put back on the list
2466 2466 * during processing, so that we can zio_wait()
2467 2467 * these IOs after initiating all child IOs.
2468 2468 */
2469 2469 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2470 2470 DMU_META_DNODE_OBJECT);
2471 2471 break;
2472 2472 }
2473 2473 list_remove(list, dr);
2474 2474 if (dr->dr_dbuf->db_level > 0)
2475 2475 dbuf_sync_indirect(dr, tx);
2476 2476 else
2477 2477 dbuf_sync_leaf(dr, tx);
2478 2478 }
2479 2479 }
2480 2480
2481 2481 /* ARGSUSED */
2482 2482 static void
2483 2483 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2484 2484 {
2485 2485 dmu_buf_impl_t *db = vdb;
2486 2486 dnode_t *dn;
2487 2487 blkptr_t *bp = zio->io_bp;
2488 2488 blkptr_t *bp_orig = &zio->io_bp_orig;
2489 2489 spa_t *spa = zio->io_spa;
2490 2490 int64_t delta;
2491 2491 uint64_t fill = 0;
2492 2492 int i;
2493 2493
2494 2494 ASSERT3P(db->db_blkptr, ==, bp);
2495 2495
2496 2496 DB_DNODE_ENTER(db);
2497 2497 dn = DB_DNODE(db);
2498 2498 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2499 2499 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2500 2500 zio->io_prev_space_delta = delta;
2501 2501
2502 2502 if (bp->blk_birth != 0) {
2503 2503 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2504 2504 BP_GET_TYPE(bp) == dn->dn_type) ||
2505 2505 (db->db_blkid == DMU_SPILL_BLKID &&
2506 2506 BP_GET_TYPE(bp) == dn->dn_bonustype) ||
2507 2507 BP_IS_EMBEDDED(bp));
2508 2508 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2509 2509 }
2510 2510
2511 2511 mutex_enter(&db->db_mtx);
2512 2512
2513 2513 #ifdef ZFS_DEBUG
2514 2514 if (db->db_blkid == DMU_SPILL_BLKID) {
2515 2515 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2516 2516 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2517 2517 db->db_blkptr == &dn->dn_phys->dn_spill);
2518 2518 }
2519 2519 #endif
2520 2520
2521 2521 if (db->db_level == 0) {
2522 2522 mutex_enter(&dn->dn_mtx);
2523 2523 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2524 2524 db->db_blkid != DMU_SPILL_BLKID)
2525 2525 dn->dn_phys->dn_maxblkid = db->db_blkid;
2526 2526 mutex_exit(&dn->dn_mtx);
2527 2527
2528 2528 if (dn->dn_type == DMU_OT_DNODE) {
2529 2529 dnode_phys_t *dnp = db->db.db_data;
2530 2530 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2531 2531 i--, dnp++) {
2532 2532 if (dnp->dn_type != DMU_OT_NONE)
2533 2533 fill++;
2534 2534 }
2535 2535 } else {
2536 2536 if (BP_IS_HOLE(bp)) {
2537 2537 fill = 0;
2538 2538 } else {
2539 2539 fill = 1;
2540 2540 }
2541 2541 }
2542 2542 } else {
2543 2543 blkptr_t *ibp = db->db.db_data;
2544 2544 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2545 2545 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2546 2546 if (BP_IS_HOLE(ibp))
2547 2547 continue;
2548 2548 fill += BP_GET_FILL(ibp);
2549 2549 }
2550 2550 }
2551 2551 DB_DNODE_EXIT(db);
2552 2552
2553 2553 if (!BP_IS_EMBEDDED(bp))
2554 2554 bp->blk_fill = fill;
2555 2555
2556 2556 mutex_exit(&db->db_mtx);
2557 2557 }
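
The fill count computed in dbuf_write_ready() summarizes how much of the subtree under a block pointer is in use. For an indirect block, db_size >> SPA_BLKPTRSHIFT gives the number of child pointers to sum over (16384 >> 7 == 128 for a 16K block). A hedged sketch of the aggregation with a stand-in block-pointer type (struct bp and its macros below are illustrative, not blkptr_t):

	#include <stdint.h>

	#define	SPA_BLKPTRSHIFT	7	/* 128-byte block pointers */

	struct bp { uint64_t fill; };	/* stand-in for blkptr_t */
	#define	BP_IS_HOLE(bp)	((bp)->fill == 0)

	static uint64_t
	indirect_fill(const struct bp *ibp, uint64_t db_size)
	{
		uint64_t fill = 0;
		int i;

		for (i = db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->fill;	/* BP_GET_FILL() stand-in */
		}
		return (fill);
	}
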
2558 2558
2559 2559 /*
2560 2560 * The SPA will call this callback several times for each zio - once
2561 2561 * for every physical child i/o (zio->io_phys_children times). This
2562 2562 * allows the DMU to monitor the progress of each logical i/o. For example,
2563 2563 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2564 2564 * block. There may be a long delay before all copies/fragments are completed,
2565 2565 * so this callback allows us to retire dirty space gradually, as the physical
2566 2566 * i/os complete.
2567 2567 */
2568 2568 /* ARGSUSED */
2569 2569 static void
2570 2570 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2571 2571 {
2572 2572 dmu_buf_impl_t *db = arg;
2573 2573 objset_t *os = db->db_objset;
2574 2574 dsl_pool_t *dp = dmu_objset_pool(os);
2575 2575 dbuf_dirty_record_t *dr;
2576 2576 int delta = 0;
2577 2577
2578 2578 dr = db->db_data_pending;
2579 2579 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2580 2580
2581 2581 /*
2582 2582 * The callback will be called io_phys_children times. Retire one
2583 2583 * portion of our dirty space each time we are called. Any rounding
2584 2584 * error will be cleaned up by dsl_pool_sync()'s call to
2585 2585 * dsl_pool_undirty_space().
2586 2586 */
2587 2587 delta = dr->dr_accounted / zio->io_phys_children;
2588 2588 dsl_pool_undirty_space(dp, delta, zio->io_txg);
2589 2589 }
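
A worked instance of the retirement arithmetic above, with example values: if dr_accounted is 131072 bytes and io_phys_children is 3, each callback undirties 131072 / 3 == 43690 bytes, retiring 131070 in total; the 2-byte remainder is the rounding error that dsl_pool_sync() later cleans up via dsl_pool_undirty_space().

	#include <stdio.h>

	int
	main(void)
	{
		long accounted = 131072;	/* dr_accounted (example) */
		int children = 3;		/* zio->io_phys_children */
		long delta = accounted / children;

		/* three callbacks retire 3 * 43690 == 131070 bytes */
		printf("per-callback delta: %ld, remainder: %ld\n",
		    delta, accounted - delta * children);
		return (0);
	}
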
2590 2590
2591 2591 /* ARGSUSED */
2592 2592 static void
2593 2593 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2594 2594 {
2595 2595 dmu_buf_impl_t *db = vdb;
2596 2596 blkptr_t *bp_orig = &zio->io_bp_orig;
2597 2597 blkptr_t *bp = db->db_blkptr;
2598 2598 objset_t *os = db->db_objset;
2599 2599 dmu_tx_t *tx = os->os_synctx;
2600 2600 dbuf_dirty_record_t **drp, *dr;
2601 2601
2602 2602 ASSERT0(zio->io_error);
2603 2603 ASSERT(db->db_blkptr == bp);
2604 2604
2605 2605 /*
2606 2606 * For nopwrites and rewrites we ensure that the bp matches our
2607 2607 * original and bypass all the accounting.
2608 2608 */
2609 2609 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2610 2610 ASSERT(BP_EQUAL(bp, bp_orig));
2611 2611 } else {
2612 2612 dsl_dataset_t *ds = os->os_dsl_dataset;
2613 2613 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2614 2614 dsl_dataset_block_born(ds, bp, tx);
2615 2615 }
2616 2616
2617 2617 mutex_enter(&db->db_mtx);
2618 2618
2619 2619 DBUF_VERIFY(db);
2620 2620
2621 2621 drp = &db->db_last_dirty;
2622 2622 while ((dr = *drp) != db->db_data_pending)
2623 2623 drp = &dr->dr_next;
2624 2624 ASSERT(!list_link_active(&dr->dr_dirty_node));
2625 2625 ASSERT(dr->dr_dbuf == db);
2626 2626 ASSERT(dr->dr_next == NULL);
2627 2627 *drp = dr->dr_next;
2628 2628
2629 2629 #ifdef ZFS_DEBUG
2630 2630 if (db->db_blkid == DMU_SPILL_BLKID) {
2631 2631 dnode_t *dn;
2632 2632
2633 2633 DB_DNODE_ENTER(db);
2634 2634 dn = DB_DNODE(db);
2635 2635 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2636 2636 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2637 2637 db->db_blkptr == &dn->dn_phys->dn_spill);
2638 2638 DB_DNODE_EXIT(db);
2639 2639 }
2640 2640 #endif
2641 2641
2642 2642 if (db->db_level == 0) {
2643 2643 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2644 2644 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2645 2645 if (db->db_state != DB_NOFILL) {
2646 2646 if (dr->dt.dl.dr_data != db->db_buf)
2647 2647 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2648 2648 db));
2649 2649 else if (!arc_released(db->db_buf))
2650 2650 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2651 2651 }
2652 2652 } else {
2653 2653 dnode_t *dn;
2654 2654
2655 2655 DB_DNODE_ENTER(db);
2656 2656 dn = DB_DNODE(db);
2657 2657 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2658 2658 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
2659 2659 if (!BP_IS_HOLE(db->db_blkptr)) {
2660 2660 int epbs =
2661 2661 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2662 2662 ASSERT3U(db->db_blkid, <=,
2663 2663 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
2664 2664 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2665 2665 db->db.db_size);
2666 2666 if (!arc_released(db->db_buf))
2667 2667 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2668 2668 }
2669 2669 DB_DNODE_EXIT(db);
2670 2670 mutex_destroy(&dr->dt.di.dr_mtx);
2671 2671 list_destroy(&dr->dt.di.dr_children);
2672 2672 }
2673 2673 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2674 2674
2675 2675 cv_broadcast(&db->db_changed);
2676 2676 ASSERT(db->db_dirtycnt > 0);
2677 2677 db->db_dirtycnt -= 1;
2678 2678 db->db_data_pending = NULL;
2679 2679 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
2680 2680 }
2681 2681
2682 2682 static void
2683 2683 dbuf_write_nofill_ready(zio_t *zio)
2684 2684 {
2685 2685 dbuf_write_ready(zio, NULL, zio->io_private);
2686 2686 }
2687 2687
2688 2688 static void
2689 2689 dbuf_write_nofill_done(zio_t *zio)
2690 2690 {
2691 2691 dbuf_write_done(zio, NULL, zio->io_private);
2692 2692 }
2693 2693
2694 2694 static void
2695 2695 dbuf_write_override_ready(zio_t *zio)
2696 2696 {
2697 2697 dbuf_dirty_record_t *dr = zio->io_private;
2698 2698 dmu_buf_impl_t *db = dr->dr_dbuf;
2699 2699
2700 2700 dbuf_write_ready(zio, NULL, db);
2701 2701 }
2702 2702
2703 2703 static void
2704 2704 dbuf_write_override_done(zio_t *zio)
2705 2705 {
2706 2706 dbuf_dirty_record_t *dr = zio->io_private;
2707 2707 dmu_buf_impl_t *db = dr->dr_dbuf;
2708 2708 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2709 2709
2710 2710 mutex_enter(&db->db_mtx);
2711 2711 if (!BP_EQUAL(zio->io_bp, obp)) {
2712 2712 if (!BP_IS_HOLE(obp))
2713 2713 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2714 2714 arc_release(dr->dt.dl.dr_data, db);
2715 2715 }
2716 2716 mutex_exit(&db->db_mtx);
2717 2717
2718 2718 dbuf_write_done(zio, NULL, db);
2719 2719 }
2720 2720
2721 2721 /* Issue I/O to commit a dirty buffer to disk. */
2722 2722 static void
2723 2723 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2724 2724 {
2725 2725 dmu_buf_impl_t *db = dr->dr_dbuf;
2726 2726 dnode_t *dn;
2727 2727 objset_t *os;
2728 2728 dmu_buf_impl_t *parent = db->db_parent;
2729 2729 uint64_t txg = tx->tx_txg;
2730 2730 zbookmark_phys_t zb;
2731 2731 zio_prop_t zp;
2732 2732 zio_t *zio;
2733 2733 int wp_flag = 0;
2734 2734
2735 2735 DB_DNODE_ENTER(db);
2736 2736 dn = DB_DNODE(db);
2737 2737 os = dn->dn_objset;
2738 2738
2739 2739 if (db->db_state != DB_NOFILL) {
2740 2740 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2741 2741 /*
2742 2742 * Private object buffers are released here rather
2743 2743 * than in dbuf_dirty() since they are only modified
2744 2744 * in the syncing context and we don't want the
2745 2745 * overhead of making multiple copies of the data.
2746 2746 */
2747 2747 if (BP_IS_HOLE(db->db_blkptr)) {
2748 2748 arc_buf_thaw(data);
2749 2749 } else {
2750 2750 dbuf_release_bp(db);
2751 2751 }
2752 2752 }
2753 2753 }
2754 2754
2755 2755 if (parent != dn->dn_dbuf) {
2756 2756 /* Our parent is an indirect block. */
2757 2757 /* We have a dirty parent that has been scheduled for write. */
2758 2758 ASSERT(parent && parent->db_data_pending);
2759 2759 /* Our parent's buffer is one level closer to the dnode. */
2760 2760 ASSERT(db->db_level == parent->db_level-1);
2761 2761 /*
2762 2762 * We're about to modify our parent's db_data by modifying
2763 2763 * our block pointer, so the parent must be released.
2764 2764 */
2765 2765 ASSERT(arc_released(parent->db_buf));
2766 2766 zio = parent->db_data_pending->dr_zio;
2767 2767 } else {
2768 2768 /* Our parent is the dnode itself. */
2769 2769 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2770 2770 db->db_blkid != DMU_SPILL_BLKID) ||
2771 2771 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2772 2772 if (db->db_blkid != DMU_SPILL_BLKID)
2773 2773 ASSERT3P(db->db_blkptr, ==,
2774 2774 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2775 2775 zio = dn->dn_zio;
2776 2776 }
2777 2777
2778 2778 ASSERT(db->db_level == 0 || data == db->db_buf);
2779 2779 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2780 2780 ASSERT(zio);
2781 2781
2782 2782 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2783 2783 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2784 2784 db->db.db_object, db->db_level, db->db_blkid);
2785 2785
2786 2786 if (db->db_blkid == DMU_SPILL_BLKID)
2787 2787 wp_flag = WP_SPILL;
2788 2788 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2789 2789
2790 2790 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2791 2791 DB_DNODE_EXIT(db);
2792 2792
2793 2793 if (db->db_level == 0 &&
2794 2794 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2795 2795 /*
2796 2796 * The BP for this block has been provided by open context
2797 2797 * (by dmu_sync() or dmu_buf_write_embedded()).
2798 2798 */
2799 2799 void *contents = (data != NULL) ? data->b_data : NULL;
2800 2800
2801 2801 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2802 2802 db->db_blkptr, contents, db->db.db_size, &zp,
2803 2803 dbuf_write_override_ready, NULL, dbuf_write_override_done,
2804 2804 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2805 2805 mutex_enter(&db->db_mtx);
2806 2806 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2807 2807 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2808 2808 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2809 2809 mutex_exit(&db->db_mtx);
2810 2810 } else if (db->db_state == DB_NOFILL) {
2811 2811 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
2812 2812 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
2813 2813 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2814 2814 db->db_blkptr, NULL, db->db.db_size, &zp,
2815 2815 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2816 2816 ZIO_PRIORITY_ASYNC_WRITE,
2817 2817 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2818 2818 } else {
2819 2819 ASSERT(arc_released(data));
2820 2820 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2821 2821 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2822 2822 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2823 2823 dbuf_write_physdone, dbuf_write_done, db,
2824 2824 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2825 2825 }
2826 2826 }
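
dbuf_write() dispatches to one of three I/O setups; a compact restatement of that decision, with the conditions taken from the code above (the enum and function are illustrative, not ZFS interfaces):

	typedef enum write_path {
		WRITE_OVERRIDE,	/* level 0 and DR_OVERRIDDEN: the BP was
				   supplied from open context by dmu_sync()
				   or dmu_buf_write_embedded(); uses
				   zio_write() + zio_write_override() */
		WRITE_NOFILL,	/* DB_NOFILL: zio_write() with no data and
				   ZIO_FLAG_NODATA */
		WRITE_ARC	/* common case: arc_write() of the buffer */
	} write_path_t;

	static write_path_t
	choose_write_path(int level, int overridden, int nofill)
	{
		if (level == 0 && overridden)
			return (WRITE_OVERRIDE);
		if (nofill)
			return (WRITE_NOFILL);
		return (WRITE_ARC);
	}
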