patch first-pass
--- old/usr/src/uts/common/fs/zfs/dbuf.c
+++ new/usr/src/uts/common/fs/zfs/dbuf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 28 * Copyright (c) 2014 Integros [integros.com]
29 29 */
30 30
31 31 #include <sys/zfs_context.h>
32 32 #include <sys/dmu.h>
33 33 #include <sys/dmu_send.h>
34 34 #include <sys/dmu_impl.h>
35 35 #include <sys/dbuf.h>
36 36 #include <sys/dmu_objset.h>
37 37 #include <sys/dsl_dataset.h>
38 38 #include <sys/dsl_dir.h>
39 39 #include <sys/dmu_tx.h>
40 40 #include <sys/spa.h>
41 41 #include <sys/zio.h>
42 42 #include <sys/dmu_zfetch.h>
43 43 #include <sys/sa.h>
44 44 #include <sys/sa_impl.h>
45 45 #include <sys/zfeature.h>
46 46 #include <sys/blkptr.h>
47 47 #include <sys/range_tree.h>
48 48
49 49 /*
50 50 * Number of times that zfs_free_range() took the slow path while doing
51 51 * a zfs receive. A nonzero value indicates a potential performance problem.
52 52 */
53 53 uint64_t zfs_free_range_recv_miss;
54 54
55 55 static void dbuf_destroy(dmu_buf_impl_t *db);
56 56 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
57 57 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
58 58
59 59 #ifndef __lint
60 60 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
61 - dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
61 + dmu_buf_evict_func_t *evict_func_prep, dmu_buf_evict_func_t *evict_func,
62 + dmu_buf_t **clear_on_evict_dbufp);
62 63 #endif /* ! __lint */
63 64
64 65 /*
65 66 * Global data structures and functions for the dbuf cache.
66 67 */
67 68 static kmem_cache_t *dbuf_cache;
68 69 static taskq_t *dbu_evict_taskq;
69 70
70 71 /* ARGSUSED */
71 72 static int
72 73 dbuf_cons(void *vdb, void *unused, int kmflag)
73 74 {
74 75 dmu_buf_impl_t *db = vdb;
75 76 bzero(db, sizeof (dmu_buf_impl_t));
76 77
77 78 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
78 79 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
79 80 refcount_create(&db->db_holds);
80 81
81 82 return (0);
82 83 }
83 84
84 85 /* ARGSUSED */
85 86 static void
86 87 dbuf_dest(void *vdb, void *unused)
87 88 {
88 89 dmu_buf_impl_t *db = vdb;
89 90 mutex_destroy(&db->db_mtx);
90 91 cv_destroy(&db->db_changed);
91 92 refcount_destroy(&db->db_holds);
92 93 }
93 94
94 95 /*
95 96 * dbuf hash table routines
96 97 */
97 98 static dbuf_hash_table_t dbuf_hash_table;
98 99
99 100 static uint64_t dbuf_hash_count;
100 101
101 102 static uint64_t
102 103 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
103 104 {
104 105 uintptr_t osv = (uintptr_t)os;
105 106 uint64_t crc = -1ULL;
106 107
107 108 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
108 109 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
109 110 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
110 111 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
111 112 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
112 113 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
113 114 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
114 115
115 116 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
116 117
117 118 return (crc);
118 119 }
119 120
 120  121 #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
121 122
122 123 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
123 124 ((dbuf)->db.db_object == (obj) && \
124 125 (dbuf)->db_objset == (os) && \
125 126 (dbuf)->db_level == (level) && \
126 127 (dbuf)->db_blkid == (blkid))
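To make the mixing above concrete, here is a minimal user-space sketch of the same scheme (not part of this webrev): the table construction mirrors how spa_init() fills zfs_crc64_table from ZFS_CRC64_POLY, and every EXAMPLE_*/example_* name is local to the sketch.

#include <stdint.h>
#include <stdio.h>

#define	EXAMPLE_CRC64_POLY	0xC96C5795D7870F42ULL	/* ZFS_CRC64_POLY */

static uint64_t example_crc64_table[256];

static void
example_crc64_init(void)
{
	int i, j;

	for (i = 0; i < 256; i++) {
		uint64_t c = i;
		for (j = 0; j < 8; j++)
			c = (c >> 1) ^ (-(c & 1) & EXAMPLE_CRC64_POLY);
		example_crc64_table[i] = c;
	}
}

/* Same shape as dbuf_hash(): six CRC steps, then fold the high bits. */
static uint64_t
example_dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	crc = (crc >> 8) ^ example_crc64_table[(crc ^ lvl) & 0xFF];
	crc = (crc >> 8) ^ example_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ example_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ example_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ example_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ example_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);
	return (crc);
}

int
main(void)
{
	example_crc64_init();
	(void) printf("%016llx\n", (unsigned long long)
	    example_dbuf_hash((void *)0x1000, 42, 0, 7));
	return (0);
}

Only the low bytes of each field feed the CRC steps, which is why the final XOR folds in osv >> 14, obj >> 16, and blkid >> 16; without it, dbufs differing only in high bits would land in the same bucket.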
127 128
128 129 dmu_buf_impl_t *
129 130 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
130 131 {
131 132 dbuf_hash_table_t *h = &dbuf_hash_table;
132 133 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
133 134 uint64_t idx = hv & h->hash_table_mask;
134 135 dmu_buf_impl_t *db;
135 136
136 137 mutex_enter(DBUF_HASH_MUTEX(h, idx));
137 138 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
138 139 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
139 140 mutex_enter(&db->db_mtx);
140 141 if (db->db_state != DB_EVICTING) {
141 142 mutex_exit(DBUF_HASH_MUTEX(h, idx));
142 143 return (db);
143 144 }
144 145 mutex_exit(&db->db_mtx);
145 146 }
146 147 }
147 148 mutex_exit(DBUF_HASH_MUTEX(h, idx));
148 149 return (NULL);
149 150 }
150 151
151 152 static dmu_buf_impl_t *
152 153 dbuf_find_bonus(objset_t *os, uint64_t object)
153 154 {
154 155 dnode_t *dn;
155 156 dmu_buf_impl_t *db = NULL;
156 157
157 158 if (dnode_hold(os, object, FTAG, &dn) == 0) {
158 159 rw_enter(&dn->dn_struct_rwlock, RW_READER);
159 160 if (dn->dn_bonus != NULL) {
160 161 db = dn->dn_bonus;
161 162 mutex_enter(&db->db_mtx);
162 163 }
163 164 rw_exit(&dn->dn_struct_rwlock);
164 165 dnode_rele(dn, FTAG);
165 166 }
166 167 return (db);
167 168 }
168 169
169 170 /*
170 171 * Insert an entry into the hash table. If there is already an element
171 172 * equal to elem in the hash table, then the already existing element
172 173 * will be returned and the new element will not be inserted.
173 174 * Otherwise returns NULL.
174 175 */
175 176 static dmu_buf_impl_t *
176 177 dbuf_hash_insert(dmu_buf_impl_t *db)
177 178 {
178 179 dbuf_hash_table_t *h = &dbuf_hash_table;
179 180 objset_t *os = db->db_objset;
180 181 uint64_t obj = db->db.db_object;
181 182 int level = db->db_level;
182 183 uint64_t blkid = db->db_blkid;
183 184 uint64_t hv = DBUF_HASH(os, obj, level, blkid);
184 185 uint64_t idx = hv & h->hash_table_mask;
185 186 dmu_buf_impl_t *dbf;
186 187
187 188 mutex_enter(DBUF_HASH_MUTEX(h, idx));
188 189 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
189 190 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
190 191 mutex_enter(&dbf->db_mtx);
191 192 if (dbf->db_state != DB_EVICTING) {
192 193 mutex_exit(DBUF_HASH_MUTEX(h, idx));
193 194 return (dbf);
194 195 }
195 196 mutex_exit(&dbf->db_mtx);
196 197 }
197 198 }
198 199
199 200 mutex_enter(&db->db_mtx);
200 201 db->db_hash_next = h->hash_table[idx];
201 202 h->hash_table[idx] = db;
202 203 mutex_exit(DBUF_HASH_MUTEX(h, idx));
203 204 atomic_inc_64(&dbuf_hash_count);
204 205
205 206 return (NULL);
206 207 }
207 208
208 209 /*
209 210 * Remove an entry from the hash table. It must be in the EVICTING state.
210 211 */
211 212 static void
212 213 dbuf_hash_remove(dmu_buf_impl_t *db)
213 214 {
214 215 dbuf_hash_table_t *h = &dbuf_hash_table;
215 216 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
216 217 db->db_level, db->db_blkid);
217 218 uint64_t idx = hv & h->hash_table_mask;
218 219 dmu_buf_impl_t *dbf, **dbp;
219 220
220 221 /*
 221  222 	 * We mustn't hold db_mtx to maintain lock ordering:
222 223 * DBUF_HASH_MUTEX > db_mtx.
223 224 */
224 225 ASSERT(refcount_is_zero(&db->db_holds));
225 226 ASSERT(db->db_state == DB_EVICTING);
226 227 ASSERT(!MUTEX_HELD(&db->db_mtx));
227 228
228 229 mutex_enter(DBUF_HASH_MUTEX(h, idx));
229 230 dbp = &h->hash_table[idx];
230 231 while ((dbf = *dbp) != db) {
231 232 dbp = &dbf->db_hash_next;
232 233 ASSERT(dbf != NULL);
233 234 }
234 235 *dbp = db->db_hash_next;
235 236 db->db_hash_next = NULL;
236 237 mutex_exit(DBUF_HASH_MUTEX(h, idx));
237 238 atomic_dec_64(&dbuf_hash_count);
238 239 }
239 240
240 241 static arc_evict_func_t dbuf_do_evict;
241 242
242 243 typedef enum {
243 244 DBVU_EVICTING,
244 245 DBVU_NOT_EVICTING
245 246 } dbvu_verify_type_t;
246 247
247 248 static void
248 249 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
249 250 {
250 251 #ifdef ZFS_DEBUG
251 252 int64_t holds;
252 253
253 254 if (db->db_user == NULL)
254 255 return;
255 256
256 257 /* Only data blocks support the attachment of user data. */
257 258 ASSERT(db->db_level == 0);
258 259
259 260 /* Clients must resolve a dbuf before attaching user data. */
260 261 ASSERT(db->db.db_data != NULL);
261 262 ASSERT3U(db->db_state, ==, DB_CACHED);
262 263
263 264 holds = refcount_count(&db->db_holds);
264 265 if (verify_type == DBVU_EVICTING) {
265 266 /*
266 267 * Immediate eviction occurs when holds == dirtycnt.
267 268 * For normal eviction buffers, holds is zero on
268 269 * eviction, except when dbuf_fix_old_data() calls
269 270 * dbuf_clear_data(). However, the hold count can grow
270 271 * during eviction even though db_mtx is held (see
271 272 * dmu_bonus_hold() for an example), so we can only
272 273 * test the generic invariant that holds >= dirtycnt.
273 274 */
274 275 ASSERT3U(holds, >=, db->db_dirtycnt);
275 276 } else {
276 277 if (db->db_user_immediate_evict == TRUE)
277 278 ASSERT3U(holds, >=, db->db_dirtycnt);
278 279 else
279 280 ASSERT3U(holds, >, 0);
280 281 }
281 282 #endif
282 283 }
283 284
284 285 static void
285 286 dbuf_evict_user(dmu_buf_impl_t *db)
286 287 {
287 288 dmu_buf_user_t *dbu = db->db_user;
288 289
289 290 ASSERT(MUTEX_HELD(&db->db_mtx));
290 291
291 292 if (dbu == NULL)
292 293 return;
293 294
294 295 dbuf_verify_user(db, DBVU_EVICTING);
295 296 db->db_user = NULL;
296 297
297 298 #ifdef ZFS_DEBUG
298 299 if (dbu->dbu_clear_on_evict_dbufp != NULL)
299 300 *dbu->dbu_clear_on_evict_dbufp = NULL;
300 301 #endif
302 +
303 + if (dbu->dbu_evict_func_prep != NULL)
304 + dbu->dbu_evict_func_prep(dbu);
301 305
302 306 /*
303 307 * Invoke the callback from a taskq to avoid lock order reversals
304 308 * and limit stack depth.
305 309 */
306 310 taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
307 311 &dbu->dbu_tqent);
308 312 }
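The prep callback added by this patch runs synchronously with db_mtx still held, while dbu_evict_func itself is deferred to dbu_evict_taskq; a client that must capture state before the dbuf is torn down registers both. A minimal sketch of such a client follows; the example_* names are invented for illustration, and the registration call uses the new four-argument dmu_buf_init_user() signature shown above.

/* Sketch only -- not part of this webrev. */
typedef struct example_user {
	dmu_buf_user_t eu_dbu;	/* must be embedded in the client struct */
	void *eu_state;		/* captured by the prep phase */
} example_user_t;

static void
example_evict_prep(void *arg)
{
	example_user_t *eu = arg;

	/* Runs under db_mtx: only snapshot state, take no other locks. */
	eu->eu_state = NULL;	/* ... copy whatever eviction will need ... */
}

static void
example_evict(void *arg)
{
	example_user_t *eu = arg;

	/* Runs later from dbu_evict_taskq: safe to block or take locks. */
	kmem_free(eu, sizeof (example_user_t));
}

/*
 * Registration, per the new signature:
 *	dmu_buf_init_user(&eu->eu_dbu, example_evict_prep,
 *	    example_evict, NULL);
 */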
309 313
310 314 boolean_t
311 315 dbuf_is_metadata(dmu_buf_impl_t *db)
312 316 {
313 317 if (db->db_level > 0) {
314 318 return (B_TRUE);
315 319 } else {
316 320 boolean_t is_metadata;
317 321
318 322 DB_DNODE_ENTER(db);
319 323 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
320 324 DB_DNODE_EXIT(db);
321 325
322 326 return (is_metadata);
323 327 }
324 328 }
325 329
326 330 void
327 331 dbuf_evict(dmu_buf_impl_t *db)
328 332 {
329 333 ASSERT(MUTEX_HELD(&db->db_mtx));
330 334 ASSERT(db->db_buf == NULL);
331 335 ASSERT(db->db_data_pending == NULL);
332 336
333 337 dbuf_clear(db);
334 338 dbuf_destroy(db);
335 339 }
336 340
337 341 void
338 342 dbuf_init(void)
339 343 {
340 344 uint64_t hsize = 1ULL << 16;
341 345 dbuf_hash_table_t *h = &dbuf_hash_table;
342 346 int i;
343 347
344 348 /*
345 349 * The hash table is big enough to fill all of physical memory
346 350 * with an average 4K block size. The table will take up
347 351 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
348 352 */
349 353 while (hsize * 4096 < physmem * PAGESIZE)
350 354 hsize <<= 1;
351 355
352 356 retry:
353 357 h->hash_table_mask = hsize - 1;
354 358 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
355 359 if (h->hash_table == NULL) {
356 360 /* XXX - we should really return an error instead of assert */
357 361 ASSERT(hsize > (1ULL << 10));
358 362 hsize >>= 1;
359 363 goto retry;
360 364 }
361 365
362 366 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
363 367 sizeof (dmu_buf_impl_t),
364 368 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
365 369
366 370 for (i = 0; i < DBUF_MUTEXES; i++)
367 371 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
368 372
369 373 /*
370 374 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
371 375 * configuration is not required.
372 376 */
373 377 dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
374 378 }
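To check the sizing comment above with concrete numbers (machine parameters assumed for illustration): on a 16 GiB machine the loop stops at 2^22 buckets, and the bucket array costs 32 MiB, i.e. the 2 MB per GB noted in the comment.

/* Standalone sketch of the dbuf_init() sizing loop; values assumed. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t physmem_bytes = 16ULL << 30;	/* assume 16 GiB of RAM */
	uint64_t hsize = 1ULL << 16;

	while (hsize * 4096 < physmem_bytes)
		hsize <<= 1;

	/* Prints: 4194304 buckets, 32 MiB of hash_table pointers. */
	(void) printf("%llu buckets, %llu MiB\n", (unsigned long long)hsize,
	    (unsigned long long)(hsize * sizeof (void *) >> 20));
	return (0);
}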
375 379
376 380 void
377 381 dbuf_fini(void)
378 382 {
379 383 dbuf_hash_table_t *h = &dbuf_hash_table;
380 384 int i;
381 385
382 386 for (i = 0; i < DBUF_MUTEXES; i++)
383 387 mutex_destroy(&h->hash_mutexes[i]);
384 388 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
385 389 kmem_cache_destroy(dbuf_cache);
386 390 taskq_destroy(dbu_evict_taskq);
387 391 }
388 392
389 393 /*
390 394 * Other stuff.
391 395 */
392 396
393 397 #ifdef ZFS_DEBUG
394 398 static void
395 399 dbuf_verify(dmu_buf_impl_t *db)
396 400 {
397 401 dnode_t *dn;
398 402 dbuf_dirty_record_t *dr;
399 403
400 404 ASSERT(MUTEX_HELD(&db->db_mtx));
401 405
402 406 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
403 407 return;
404 408
405 409 ASSERT(db->db_objset != NULL);
406 410 DB_DNODE_ENTER(db);
407 411 dn = DB_DNODE(db);
408 412 if (dn == NULL) {
409 413 ASSERT(db->db_parent == NULL);
410 414 ASSERT(db->db_blkptr == NULL);
411 415 } else {
412 416 ASSERT3U(db->db.db_object, ==, dn->dn_object);
413 417 ASSERT3P(db->db_objset, ==, dn->dn_objset);
414 418 ASSERT3U(db->db_level, <, dn->dn_nlevels);
415 419 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
416 420 db->db_blkid == DMU_SPILL_BLKID ||
417 421 !avl_is_empty(&dn->dn_dbufs));
418 422 }
419 423 if (db->db_blkid == DMU_BONUS_BLKID) {
420 424 ASSERT(dn != NULL);
421 425 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
422 426 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
423 427 } else if (db->db_blkid == DMU_SPILL_BLKID) {
424 428 ASSERT(dn != NULL);
425 429 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
426 430 ASSERT0(db->db.db_offset);
427 431 } else {
428 432 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
429 433 }
430 434
431 435 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
432 436 ASSERT(dr->dr_dbuf == db);
433 437
434 438 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
435 439 ASSERT(dr->dr_dbuf == db);
436 440
437 441 /*
438 442 * We can't assert that db_size matches dn_datablksz because it
439 443 * can be momentarily different when another thread is doing
440 444 * dnode_set_blksz().
441 445 */
442 446 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
443 447 dr = db->db_data_pending;
444 448 /*
445 449 * It should only be modified in syncing context, so
446 450 * make sure we only have one copy of the data.
447 451 */
448 452 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
449 453 }
450 454
451 455 /* verify db->db_blkptr */
452 456 if (db->db_blkptr) {
453 457 if (db->db_parent == dn->dn_dbuf) {
454 458 /* db is pointed to by the dnode */
455 459 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
456 460 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
457 461 ASSERT(db->db_parent == NULL);
458 462 else
459 463 ASSERT(db->db_parent != NULL);
460 464 if (db->db_blkid != DMU_SPILL_BLKID)
461 465 ASSERT3P(db->db_blkptr, ==,
462 466 &dn->dn_phys->dn_blkptr[db->db_blkid]);
463 467 } else {
464 468 /* db is pointed to by an indirect block */
465 469 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
466 470 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
467 471 ASSERT3U(db->db_parent->db.db_object, ==,
468 472 db->db.db_object);
469 473 /*
470 474 * dnode_grow_indblksz() can make this fail if we don't
471 475 * have the struct_rwlock. XXX indblksz no longer
472 476 * grows. safe to do this now?
473 477 */
474 478 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
475 479 ASSERT3P(db->db_blkptr, ==,
476 480 ((blkptr_t *)db->db_parent->db.db_data +
477 481 db->db_blkid % epb));
478 482 }
479 483 }
480 484 }
481 485 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
482 486 (db->db_buf == NULL || db->db_buf->b_data) &&
483 487 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
484 488 db->db_state != DB_FILL && !dn->dn_free_txg) {
485 489 /*
486 490 * If the blkptr isn't set but they have nonzero data,
487 491 * it had better be dirty, otherwise we'll lose that
488 492 * data when we evict this buffer.
489 493 */
490 494 if (db->db_dirtycnt == 0) {
491 495 uint64_t *buf = db->db.db_data;
492 496 int i;
493 497
494 498 for (i = 0; i < db->db.db_size >> 3; i++) {
495 499 ASSERT(buf[i] == 0);
496 500 }
497 501 }
498 502 }
499 503 DB_DNODE_EXIT(db);
500 504 }
501 505 #endif
502 506
503 507 static void
504 508 dbuf_clear_data(dmu_buf_impl_t *db)
505 509 {
506 510 ASSERT(MUTEX_HELD(&db->db_mtx));
507 511 dbuf_evict_user(db);
508 512 db->db_buf = NULL;
509 513 db->db.db_data = NULL;
510 514 if (db->db_state != DB_NOFILL)
511 515 db->db_state = DB_UNCACHED;
512 516 }
513 517
514 518 static void
515 519 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
516 520 {
517 521 ASSERT(MUTEX_HELD(&db->db_mtx));
518 522 ASSERT(buf != NULL);
519 523
520 524 db->db_buf = buf;
521 525 ASSERT(buf->b_data != NULL);
522 526 db->db.db_data = buf->b_data;
523 527 if (!arc_released(buf))
524 528 arc_set_callback(buf, dbuf_do_evict, db);
525 529 }
526 530
527 531 /*
528 532 * Loan out an arc_buf for read. Return the loaned arc_buf.
529 533 */
530 534 arc_buf_t *
531 535 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
532 536 {
533 537 arc_buf_t *abuf;
534 538
535 539 mutex_enter(&db->db_mtx);
536 540 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
537 541 int blksz = db->db.db_size;
538 542 spa_t *spa = db->db_objset->os_spa;
539 543
540 544 mutex_exit(&db->db_mtx);
541 545 abuf = arc_loan_buf(spa, blksz);
542 546 bcopy(db->db.db_data, abuf->b_data, blksz);
543 547 } else {
544 548 abuf = db->db_buf;
545 549 arc_loan_inuse_buf(abuf, db);
546 550 dbuf_clear_data(db);
547 551 mutex_exit(&db->db_mtx);
548 552 }
549 553 return (abuf);
550 554 }
551 555
552 556 /*
553 557 * Calculate which level n block references the data at the level 0 offset
554 558 * provided.
555 559 */
556 560 uint64_t
557 561 dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
558 562 {
559 563 if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
560 564 /*
561 565 * The level n blkid is equal to the level 0 blkid divided by
562 566 * the number of level 0s in a level n block.
563 567 *
564 568 * The level 0 blkid is offset >> datablkshift =
565 569 * offset / 2^datablkshift.
566 570 *
567 571 * The number of level 0s in a level n is the number of block
568 572 * pointers in an indirect block, raised to the power of level.
569 573 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
570 574 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
571 575 *
572 576 * Thus, the level n blkid is: offset /
573 577 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
574 578 * = offset / 2^(datablkshift + level *
575 579 * (indblkshift - SPA_BLKPTRSHIFT))
576 580 * = offset >> (datablkshift + level *
577 581 * (indblkshift - SPA_BLKPTRSHIFT))
578 582 */
579 583 return (offset >> (dn->dn_datablkshift + level *
580 584 (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
581 585 } else {
582 586 ASSERT3U(offset, <, dn->dn_datablksz);
583 587 return (0);
584 588 }
585 589 }
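As a numeric check of the derivation above (shift values assumed for illustration): with 128K data blocks (datablkshift = 17) and 16K indirect blocks (indblkshift = 14, so each level contributes 14 - SPA_BLKPTRSHIFT = 7 bits of shift), byte offset 3 GiB maps to level-1 blkid 192.

/* Standalone sketch of the shift in dbuf_whichblock(); values assumed. */
#include <stdint.h>
#include <stdio.h>

#define	EX_SPA_BLKPTRSHIFT	7	/* sizeof (blkptr_t) == 128 == 1 << 7 */

int
main(void)
{
	int datablkshift = 17;		/* 128K data blocks */
	int indblkshift = 14;		/* 16K indirects: 128 blkptrs each */
	int64_t level = 1;
	uint64_t offset = 3ULL << 30;	/* byte offset 3 GiB */

	uint64_t blkid = offset >>
	    (datablkshift + level * (indblkshift - EX_SPA_BLKPTRSHIFT));

	/* 3 GiB / (128K data per L0 * 128 L0s per L1) = 192. */
	(void) printf("level-%lld blkid = %llu\n", (long long)level,
	    (unsigned long long)blkid);
	return (0);
}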
586 590
587 591 static void
588 592 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
589 593 {
590 594 dmu_buf_impl_t *db = vdb;
591 595
592 596 mutex_enter(&db->db_mtx);
593 597 ASSERT3U(db->db_state, ==, DB_READ);
594 598 /*
595 599 * All reads are synchronous, so we must have a hold on the dbuf
596 600 */
597 601 ASSERT(refcount_count(&db->db_holds) > 0);
598 602 ASSERT(db->db_buf == NULL);
599 603 ASSERT(db->db.db_data == NULL);
600 604 if (db->db_level == 0 && db->db_freed_in_flight) {
601 605 /* we were freed in flight; disregard any error */
602 606 arc_release(buf, db);
603 607 bzero(buf->b_data, db->db.db_size);
604 608 arc_buf_freeze(buf);
605 609 db->db_freed_in_flight = FALSE;
606 610 dbuf_set_data(db, buf);
607 611 db->db_state = DB_CACHED;
608 612 } else if (zio == NULL || zio->io_error == 0) {
609 613 dbuf_set_data(db, buf);
610 614 db->db_state = DB_CACHED;
611 615 } else {
612 616 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
613 617 ASSERT3P(db->db_buf, ==, NULL);
614 618 VERIFY(arc_buf_remove_ref(buf, db));
615 619 db->db_state = DB_UNCACHED;
616 620 }
617 621 cv_broadcast(&db->db_changed);
618 622 dbuf_rele_and_unlock(db, NULL);
619 623 }
620 624
621 625 static void
622 626 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
623 627 {
624 628 dnode_t *dn;
625 629 zbookmark_phys_t zb;
626 630 arc_flags_t aflags = ARC_FLAG_NOWAIT;
627 631
628 632 DB_DNODE_ENTER(db);
629 633 dn = DB_DNODE(db);
630 634 ASSERT(!refcount_is_zero(&db->db_holds));
631 635 /* We need the struct_rwlock to prevent db_blkptr from changing. */
632 636 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
633 637 ASSERT(MUTEX_HELD(&db->db_mtx));
634 638 ASSERT(db->db_state == DB_UNCACHED);
635 639 ASSERT(db->db_buf == NULL);
636 640
637 641 if (db->db_blkid == DMU_BONUS_BLKID) {
638 642 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
639 643
640 644 ASSERT3U(bonuslen, <=, db->db.db_size);
641 645 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
642 646 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
643 647 if (bonuslen < DN_MAX_BONUSLEN)
644 648 bzero(db->db.db_data, DN_MAX_BONUSLEN);
645 649 if (bonuslen)
646 650 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
647 651 DB_DNODE_EXIT(db);
648 652 db->db_state = DB_CACHED;
649 653 mutex_exit(&db->db_mtx);
650 654 return;
651 655 }
652 656
653 657 /*
654 658 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
655 659 * processes the delete record and clears the bp while we are waiting
656 660 * for the dn_mtx (resulting in a "no" from block_freed).
657 661 */
658 662 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
659 663 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
660 664 BP_IS_HOLE(db->db_blkptr)))) {
661 665 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
662 666
663 667 DB_DNODE_EXIT(db);
664 668 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
665 669 db->db.db_size, db, type));
666 670 bzero(db->db.db_data, db->db.db_size);
667 671 db->db_state = DB_CACHED;
668 672 mutex_exit(&db->db_mtx);
669 673 return;
670 674 }
671 675
672 676 DB_DNODE_EXIT(db);
673 677
674 678 db->db_state = DB_READ;
675 679 mutex_exit(&db->db_mtx);
676 680
677 681 if (DBUF_IS_L2CACHEABLE(db))
678 682 aflags |= ARC_FLAG_L2CACHE;
679 683 if (DBUF_IS_L2COMPRESSIBLE(db))
680 684 aflags |= ARC_FLAG_L2COMPRESS;
681 685
682 686 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
683 687 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
684 688 db->db.db_object, db->db_level, db->db_blkid);
685 689
686 690 dbuf_add_ref(db, NULL);
687 691
688 692 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
689 693 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
690 694 (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
691 695 &aflags, &zb);
692 696 }
693 697
694 698 int
695 699 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
696 700 {
697 701 int err = 0;
698 702 boolean_t havepzio = (zio != NULL);
699 703 boolean_t prefetch;
700 704 dnode_t *dn;
701 705
702 706 /*
703 707 * We don't have to hold the mutex to check db_state because it
704 708 * can't be freed while we have a hold on the buffer.
705 709 */
706 710 ASSERT(!refcount_is_zero(&db->db_holds));
707 711
708 712 if (db->db_state == DB_NOFILL)
709 713 return (SET_ERROR(EIO));
710 714
711 715 DB_DNODE_ENTER(db);
712 716 dn = DB_DNODE(db);
713 717 if ((flags & DB_RF_HAVESTRUCT) == 0)
714 718 rw_enter(&dn->dn_struct_rwlock, RW_READER);
715 719
716 720 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
717 721 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
718 722 DBUF_IS_CACHEABLE(db);
719 723
720 724 mutex_enter(&db->db_mtx);
721 725 if (db->db_state == DB_CACHED) {
722 726 mutex_exit(&db->db_mtx);
723 727 if (prefetch)
724 728 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
725 729 if ((flags & DB_RF_HAVESTRUCT) == 0)
726 730 rw_exit(&dn->dn_struct_rwlock);
727 731 DB_DNODE_EXIT(db);
728 732 } else if (db->db_state == DB_UNCACHED) {
729 733 spa_t *spa = dn->dn_objset->os_spa;
730 734
731 735 if (zio == NULL)
732 736 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
733 737 dbuf_read_impl(db, zio, flags);
734 738
735 739 /* dbuf_read_impl has dropped db_mtx for us */
736 740
737 741 if (prefetch)
738 742 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
739 743
740 744 if ((flags & DB_RF_HAVESTRUCT) == 0)
741 745 rw_exit(&dn->dn_struct_rwlock);
742 746 DB_DNODE_EXIT(db);
743 747
744 748 if (!havepzio)
745 749 err = zio_wait(zio);
746 750 } else {
747 751 /*
748 752 * Another reader came in while the dbuf was in flight
749 753 * between UNCACHED and CACHED. Either a writer will finish
750 754 * writing the buffer (sending the dbuf to CACHED) or the
751 755 * first reader's request will reach the read_done callback
752 756 * and send the dbuf to CACHED. Otherwise, a failure
753 757 * occurred and the dbuf went to UNCACHED.
754 758 */
755 759 mutex_exit(&db->db_mtx);
756 760 if (prefetch)
757 761 dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
758 762 if ((flags & DB_RF_HAVESTRUCT) == 0)
759 763 rw_exit(&dn->dn_struct_rwlock);
760 764 DB_DNODE_EXIT(db);
761 765
762 766 /* Skip the wait per the caller's request. */
763 767 mutex_enter(&db->db_mtx);
764 768 if ((flags & DB_RF_NEVERWAIT) == 0) {
765 769 while (db->db_state == DB_READ ||
766 770 db->db_state == DB_FILL) {
767 771 ASSERT(db->db_state == DB_READ ||
768 772 (flags & DB_RF_HAVESTRUCT) == 0);
769 773 DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
770 774 db, zio_t *, zio);
771 775 cv_wait(&db->db_changed, &db->db_mtx);
772 776 }
773 777 if (db->db_state == DB_UNCACHED)
774 778 err = SET_ERROR(EIO);
775 779 }
776 780 mutex_exit(&db->db_mtx);
777 781 }
778 782
779 783 ASSERT(err || havepzio || db->db_state == DB_CACHED);
780 784 return (err);
781 785 }
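For context on the havepzio branches: a caller with no parent zio lets dbuf_read() create a root zio and wait on it internally, while a batching caller passes its own and waits once. A hedged sketch of the second pattern, error handling elided (example_read_batch is an invented name; the zio_root()/zio_wait() pairing matches how the DMU issues grouped reads):

static int
example_read_batch(spa_t *spa, dmu_buf_impl_t **dbs, int ndbs)
{
	zio_t *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	int i;

	for (i = 0; i < ndbs; i++)
		(void) dbuf_read(dbs[i], zio, DB_RF_CANFAIL);

	/* Every dbuf_read_done() callback completes before this returns. */
	return (zio_wait(zio));
}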
782 786
783 787 static void
784 788 dbuf_noread(dmu_buf_impl_t *db)
785 789 {
786 790 ASSERT(!refcount_is_zero(&db->db_holds));
787 791 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
788 792 mutex_enter(&db->db_mtx);
789 793 while (db->db_state == DB_READ || db->db_state == DB_FILL)
790 794 cv_wait(&db->db_changed, &db->db_mtx);
791 795 if (db->db_state == DB_UNCACHED) {
792 796 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
793 797 spa_t *spa = db->db_objset->os_spa;
794 798
795 799 ASSERT(db->db_buf == NULL);
796 800 ASSERT(db->db.db_data == NULL);
797 801 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
798 802 db->db_state = DB_FILL;
799 803 } else if (db->db_state == DB_NOFILL) {
800 804 dbuf_clear_data(db);
801 805 } else {
802 806 ASSERT3U(db->db_state, ==, DB_CACHED);
803 807 }
804 808 mutex_exit(&db->db_mtx);
805 809 }
806 810
807 811 /*
808 812 * This is our just-in-time copy function. It makes a copy of
 809  813 	 * buffers that have been modified in a previous transaction
810 814 * group, before we modify them in the current active group.
811 815 *
812 816 * This function is used in two places: when we are dirtying a
813 817 * buffer for the first time in a txg, and when we are freeing
814 818 * a range in a dnode that includes this buffer.
815 819 *
816 820 * Note that when we are called from dbuf_free_range() we do
817 821 * not put a hold on the buffer, we just traverse the active
818 822 * dbuf list for the dnode.
819 823 */
820 824 static void
821 825 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
822 826 {
823 827 dbuf_dirty_record_t *dr = db->db_last_dirty;
824 828
825 829 ASSERT(MUTEX_HELD(&db->db_mtx));
826 830 ASSERT(db->db.db_data != NULL);
827 831 ASSERT(db->db_level == 0);
828 832 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
829 833
830 834 if (dr == NULL ||
831 835 (dr->dt.dl.dr_data !=
832 836 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
833 837 return;
834 838
835 839 /*
836 840 * If the last dirty record for this dbuf has not yet synced
 837  841 	 * and it's referencing the dbuf data, either:
838 842 * reset the reference to point to a new copy,
 839  843 	 * or (if there are no active holders)
840 844 * just null out the current db_data pointer.
841 845 */
842 846 ASSERT(dr->dr_txg >= txg - 2);
843 847 if (db->db_blkid == DMU_BONUS_BLKID) {
844 848 /* Note that the data bufs here are zio_bufs */
845 849 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
846 850 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
847 851 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
848 852 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
849 853 int size = db->db.db_size;
850 854 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
851 855 spa_t *spa = db->db_objset->os_spa;
852 856
853 857 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
854 858 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
855 859 } else {
856 860 dbuf_clear_data(db);
857 861 }
858 862 }
859 863
860 864 void
861 865 dbuf_unoverride(dbuf_dirty_record_t *dr)
862 866 {
863 867 dmu_buf_impl_t *db = dr->dr_dbuf;
864 868 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
865 869 uint64_t txg = dr->dr_txg;
866 870
867 871 ASSERT(MUTEX_HELD(&db->db_mtx));
868 872 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
869 873 ASSERT(db->db_level == 0);
870 874
871 875 if (db->db_blkid == DMU_BONUS_BLKID ||
872 876 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
873 877 return;
874 878
875 879 ASSERT(db->db_data_pending != dr);
876 880
877 881 /* free this block */
878 882 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
879 883 zio_free(db->db_objset->os_spa, txg, bp);
880 884
881 885 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
882 886 dr->dt.dl.dr_nopwrite = B_FALSE;
883 887
884 888 /*
885 889 * Release the already-written buffer, so we leave it in
886 890 * a consistent dirty state. Note that all callers are
887 891 * modifying the buffer, so they will immediately do
888 892 * another (redundant) arc_release(). Therefore, leave
889 893 * the buf thawed to save the effort of freezing &
890 894 * immediately re-thawing it.
891 895 */
892 896 arc_release(dr->dt.dl.dr_data, db);
893 897 }
894 898
895 899 /*
 896  900  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
897 901 * data blocks in the free range, so that any future readers will find
898 902 * empty blocks.
899 903 *
900 904 * This is a no-op if the dataset is in the middle of an incremental
901 905 * receive; see comment below for details.
902 906 */
903 907 void
904 908 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
905 909 dmu_tx_t *tx)
906 910 {
907 911 dmu_buf_impl_t db_search;
908 912 dmu_buf_impl_t *db, *db_next;
909 913 uint64_t txg = tx->tx_txg;
910 914 avl_index_t where;
911 915
912 916 if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
913 917 end_blkid = dn->dn_maxblkid;
914 918 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
915 919
916 920 db_search.db_level = 0;
917 921 db_search.db_blkid = start_blkid;
918 922 db_search.db_state = DB_SEARCH;
919 923
920 924 mutex_enter(&dn->dn_dbufs_mtx);
921 925 if (start_blkid >= dn->dn_unlisted_l0_blkid) {
922 926 /* There can't be any dbufs in this range; no need to search. */
923 927 #ifdef DEBUG
924 928 db = avl_find(&dn->dn_dbufs, &db_search, &where);
925 929 ASSERT3P(db, ==, NULL);
926 930 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
927 931 ASSERT(db == NULL || db->db_level > 0);
928 932 #endif
929 933 mutex_exit(&dn->dn_dbufs_mtx);
930 934 return;
931 935 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
932 936 /*
933 937 * If we are receiving, we expect there to be no dbufs in
934 938 * the range to be freed, because receive modifies each
935 939 * block at most once, and in offset order. If this is
936 940 * not the case, it can lead to performance problems,
937 941 * so note that we unexpectedly took the slow path.
938 942 */
939 943 atomic_inc_64(&zfs_free_range_recv_miss);
940 944 }
941 945
942 946 db = avl_find(&dn->dn_dbufs, &db_search, &where);
943 947 ASSERT3P(db, ==, NULL);
944 948 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
945 949
946 950 for (; db != NULL; db = db_next) {
947 951 db_next = AVL_NEXT(&dn->dn_dbufs, db);
948 952 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
949 953
950 954 if (db->db_level != 0 || db->db_blkid > end_blkid) {
951 955 break;
952 956 }
953 957 ASSERT3U(db->db_blkid, >=, start_blkid);
954 958
955 959 /* found a level 0 buffer in the range */
956 960 mutex_enter(&db->db_mtx);
957 961 if (dbuf_undirty(db, tx)) {
958 962 /* mutex has been dropped and dbuf destroyed */
959 963 continue;
960 964 }
961 965
962 966 if (db->db_state == DB_UNCACHED ||
963 967 db->db_state == DB_NOFILL ||
964 968 db->db_state == DB_EVICTING) {
965 969 ASSERT(db->db.db_data == NULL);
966 970 mutex_exit(&db->db_mtx);
967 971 continue;
968 972 }
969 973 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
970 974 /* will be handled in dbuf_read_done or dbuf_rele */
971 975 db->db_freed_in_flight = TRUE;
972 976 mutex_exit(&db->db_mtx);
973 977 continue;
974 978 }
975 979 if (refcount_count(&db->db_holds) == 0) {
976 980 ASSERT(db->db_buf);
977 981 dbuf_clear(db);
978 982 continue;
979 983 }
980 984 /* The dbuf is referenced */
981 985
982 986 if (db->db_last_dirty != NULL) {
983 987 dbuf_dirty_record_t *dr = db->db_last_dirty;
984 988
985 989 if (dr->dr_txg == txg) {
986 990 /*
987 991 * This buffer is "in-use", re-adjust the file
988 992 * size to reflect that this buffer may
989 993 * contain new data when we sync.
990 994 */
991 995 if (db->db_blkid != DMU_SPILL_BLKID &&
992 996 db->db_blkid > dn->dn_maxblkid)
993 997 dn->dn_maxblkid = db->db_blkid;
994 998 dbuf_unoverride(dr);
995 999 } else {
996 1000 /*
997 1001 * This dbuf is not dirty in the open context.
 998 1002 			 * Either uncache it (if it's not referenced in
999 1003 * the open context) or reset its contents to
1000 1004 * empty.
1001 1005 */
1002 1006 dbuf_fix_old_data(db, txg);
1003 1007 }
1004 1008 }
1005 1009 		/* clear the contents if it's cached */
1006 1010 if (db->db_state == DB_CACHED) {
1007 1011 ASSERT(db->db.db_data != NULL);
1008 1012 arc_release(db->db_buf, db);
1009 1013 bzero(db->db.db_data, db->db.db_size);
1010 1014 arc_buf_freeze(db->db_buf);
1011 1015 }
1012 1016
1013 1017 mutex_exit(&db->db_mtx);
1014 1018 }
1015 1019 mutex_exit(&dn->dn_dbufs_mtx);
1016 1020 }
1017 1021
1018 1022 static int
1019 1023 dbuf_block_freeable(dmu_buf_impl_t *db)
1020 1024 {
1021 1025 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
1022 1026 uint64_t birth_txg = 0;
1023 1027
1024 1028 /*
1025 1029 * We don't need any locking to protect db_blkptr:
1026 1030 * If it's syncing, then db_last_dirty will be set
1027 1031 * so we'll ignore db_blkptr.
1028 1032 *
1029 1033 * This logic ensures that only block births for
1030 1034 * filled blocks are considered.
1031 1035 */
1032 1036 ASSERT(MUTEX_HELD(&db->db_mtx));
1033 1037 if (db->db_last_dirty && (db->db_blkptr == NULL ||
1034 1038 !BP_IS_HOLE(db->db_blkptr))) {
1035 1039 birth_txg = db->db_last_dirty->dr_txg;
1036 1040 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
1037 1041 birth_txg = db->db_blkptr->blk_birth;
1038 1042 }
1039 1043
1040 1044 /*
1041 1045 	 * If this block doesn't exist or is in a snapshot, it can't be freed.
1042 1046 * Don't pass the bp to dsl_dataset_block_freeable() since we
1043 1047 * are holding the db_mtx lock and might deadlock if we are
1044 1048 * prefetching a dedup-ed block.
1045 1049 */
1046 1050 if (birth_txg != 0)
1047 1051 return (ds == NULL ||
1048 1052 dsl_dataset_block_freeable(ds, NULL, birth_txg));
1049 1053 else
1050 1054 return (B_FALSE);
1051 1055 }
1052 1056
1053 1057 void
1054 1058 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1055 1059 {
1056 1060 arc_buf_t *buf, *obuf;
1057 1061 int osize = db->db.db_size;
1058 1062 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1059 1063 dnode_t *dn;
1060 1064
1061 1065 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1062 1066
1063 1067 DB_DNODE_ENTER(db);
1064 1068 dn = DB_DNODE(db);
1065 1069
1066 1070 /* XXX does *this* func really need the lock? */
1067 1071 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1068 1072
1069 1073 /*
1070 1074 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
1071 1075 * is OK, because there can be no other references to the db
1072 1076 * when we are changing its size, so no concurrent DB_FILL can
1073 1077 * be happening.
1074 1078 */
1075 1079 /*
1076 1080 * XXX we should be doing a dbuf_read, checking the return
1077 1081 * value and returning that up to our callers
1078 1082 */
1079 1083 dmu_buf_will_dirty(&db->db, tx);
1080 1084
1081 1085 /* create the data buffer for the new block */
1082 1086 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
1083 1087
1084 1088 /* copy old block data to the new block */
1085 1089 obuf = db->db_buf;
1086 1090 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1087 1091 /* zero the remainder */
1088 1092 if (size > osize)
1089 1093 bzero((uint8_t *)buf->b_data + osize, size - osize);
1090 1094
1091 1095 mutex_enter(&db->db_mtx);
1092 1096 dbuf_set_data(db, buf);
1093 1097 VERIFY(arc_buf_remove_ref(obuf, db));
1094 1098 db->db.db_size = size;
1095 1099
1096 1100 if (db->db_level == 0) {
1097 1101 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1098 1102 db->db_last_dirty->dt.dl.dr_data = buf;
1099 1103 }
1100 1104 mutex_exit(&db->db_mtx);
1101 1105
1102 1106 dnode_willuse_space(dn, size-osize, tx);
1103 1107 DB_DNODE_EXIT(db);
1104 1108 }
1105 1109
1106 1110 void
1107 1111 dbuf_release_bp(dmu_buf_impl_t *db)
1108 1112 {
1109 1113 objset_t *os = db->db_objset;
1110 1114
1111 1115 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1112 1116 ASSERT(arc_released(os->os_phys_buf) ||
1113 1117 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1114 1118 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1115 1119
1116 1120 (void) arc_release(db->db_buf, db);
1117 1121 }
1118 1122
1119 1123 /*
1120 1124 * We already have a dirty record for this TXG, and we are being
1121 1125 * dirtied again.
1122 1126 */
1123 1127 static void
1124 1128 dbuf_redirty(dbuf_dirty_record_t *dr)
1125 1129 {
1126 1130 dmu_buf_impl_t *db = dr->dr_dbuf;
1127 1131
1128 1132 ASSERT(MUTEX_HELD(&db->db_mtx));
1129 1133
1130 1134 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1131 1135 /*
1132 1136 * If this buffer has already been written out,
1133 1137 * we now need to reset its state.
1134 1138 */
1135 1139 dbuf_unoverride(dr);
1136 1140 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1137 1141 db->db_state != DB_NOFILL) {
1138 1142 /* Already released on initial dirty, so just thaw. */
1139 1143 ASSERT(arc_released(db->db_buf));
1140 1144 arc_buf_thaw(db->db_buf);
1141 1145 }
1142 1146 }
1143 1147 }
1144 1148
1145 1149 dbuf_dirty_record_t *
1146 1150 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1147 1151 {
1148 1152 dnode_t *dn;
1149 1153 objset_t *os;
1150 1154 dbuf_dirty_record_t **drp, *dr;
1151 1155 int drop_struct_lock = FALSE;
1152 1156 boolean_t do_free_accounting = B_FALSE;
1153 1157 int txgoff = tx->tx_txg & TXG_MASK;
1154 1158
1155 1159 ASSERT(tx->tx_txg != 0);
1156 1160 ASSERT(!refcount_is_zero(&db->db_holds));
1157 1161 DMU_TX_DIRTY_BUF(tx, db);
1158 1162
1159 1163 DB_DNODE_ENTER(db);
1160 1164 dn = DB_DNODE(db);
1161 1165 /*
1162 1166 * Shouldn't dirty a regular buffer in syncing context. Private
1163 1167 * objects may be dirtied in syncing context, but only if they
1164 1168 * were already pre-dirtied in open context.
1165 1169 */
1166 1170 ASSERT(!dmu_tx_is_syncing(tx) ||
1167 1171 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1168 1172 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1169 1173 dn->dn_objset->os_dsl_dataset == NULL);
1170 1174 /*
1171 1175 * We make this assert for private objects as well, but after we
1172 1176 * check if we're already dirty. They are allowed to re-dirty
1173 1177 * in syncing context.
1174 1178 */
1175 1179 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1176 1180 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1177 1181 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1178 1182
1179 1183 mutex_enter(&db->db_mtx);
1180 1184 /*
1181 1185 * XXX make this true for indirects too? The problem is that
1182 1186 * transactions created with dmu_tx_create_assigned() from
1183 1187 * syncing context don't bother holding ahead.
1184 1188 */
1185 1189 ASSERT(db->db_level != 0 ||
1186 1190 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1187 1191 db->db_state == DB_NOFILL);
1188 1192
1189 1193 mutex_enter(&dn->dn_mtx);
1190 1194 /*
1191 1195 * Don't set dirtyctx to SYNC if we're just modifying this as we
1192 1196 * initialize the objset.
1193 1197 */
1194 1198 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1195 1199 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1196 1200 dn->dn_dirtyctx =
1197 1201 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1198 1202 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1199 1203 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1200 1204 }
1201 1205 mutex_exit(&dn->dn_mtx);
1202 1206
1203 1207 if (db->db_blkid == DMU_SPILL_BLKID)
1204 1208 dn->dn_have_spill = B_TRUE;
1205 1209
1206 1210 /*
1207 1211 * If this buffer is already dirty, we're done.
1208 1212 */
1209 1213 drp = &db->db_last_dirty;
1210 1214 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1211 1215 db->db.db_object == DMU_META_DNODE_OBJECT);
1212 1216 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1213 1217 drp = &dr->dr_next;
1214 1218 if (dr && dr->dr_txg == tx->tx_txg) {
1215 1219 DB_DNODE_EXIT(db);
1216 1220
1217 1221 dbuf_redirty(dr);
1218 1222 mutex_exit(&db->db_mtx);
1219 1223 return (dr);
1220 1224 }
1221 1225
1222 1226 /*
1223 1227 * Only valid if not already dirty.
1224 1228 */
1225 1229 ASSERT(dn->dn_object == 0 ||
1226 1230 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1227 1231 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1228 1232
1229 1233 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1230 1234 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1231 1235 dn->dn_phys->dn_nlevels > db->db_level ||
1232 1236 dn->dn_next_nlevels[txgoff] > db->db_level ||
1233 1237 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1234 1238 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1235 1239
1236 1240 /*
1237 1241 * We should only be dirtying in syncing context if it's the
1238 1242 * mos or we're initializing the os or it's a special object.
1239 1243 * However, we are allowed to dirty in syncing context provided
1240 1244 * we already dirtied it in open context. Hence we must make
1241 1245 * this assertion only if we're not already dirty.
1242 1246 */
1243 1247 os = dn->dn_objset;
1244 1248 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1245 1249 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1246 1250 ASSERT(db->db.db_size != 0);
1247 1251
1248 1252 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1249 1253
1250 1254 if (db->db_blkid != DMU_BONUS_BLKID) {
1251 1255 /*
1252 1256 * Update the accounting.
1253 1257 * Note: we delay "free accounting" until after we drop
1254 1258 * the db_mtx. This keeps us from grabbing other locks
1255 1259 * (and possibly deadlocking) in bp_get_dsize() while
1256 1260 * also holding the db_mtx.
1257 1261 */
1258 1262 dnode_willuse_space(dn, db->db.db_size, tx);
1259 1263 do_free_accounting = dbuf_block_freeable(db);
1260 1264 }
1261 1265
1262 1266 /*
1263 1267 * If this buffer is dirty in an old transaction group we need
1264 1268 * to make a copy of it so that the changes we make in this
1265 1269 * transaction group won't leak out when we sync the older txg.
1266 1270 */
1267 1271 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1268 1272 if (db->db_level == 0) {
1269 1273 void *data_old = db->db_buf;
1270 1274
1271 1275 if (db->db_state != DB_NOFILL) {
1272 1276 if (db->db_blkid == DMU_BONUS_BLKID) {
1273 1277 dbuf_fix_old_data(db, tx->tx_txg);
1274 1278 data_old = db->db.db_data;
1275 1279 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1276 1280 /*
1277 1281 * Release the data buffer from the cache so
1278 1282 * that we can modify it without impacting
1279 1283 * possible other users of this cached data
1280 1284 * block. Note that indirect blocks and
1281 1285 * private objects are not released until the
1282 1286 * syncing state (since they are only modified
1283 1287 * then).
1284 1288 */
1285 1289 arc_release(db->db_buf, db);
1286 1290 dbuf_fix_old_data(db, tx->tx_txg);
1287 1291 data_old = db->db_buf;
1288 1292 }
1289 1293 ASSERT(data_old != NULL);
1290 1294 }
1291 1295 dr->dt.dl.dr_data = data_old;
1292 1296 } else {
1293 1297 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1294 1298 list_create(&dr->dt.di.dr_children,
1295 1299 sizeof (dbuf_dirty_record_t),
1296 1300 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1297 1301 }
1298 1302 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1299 1303 dr->dr_accounted = db->db.db_size;
1300 1304 dr->dr_dbuf = db;
1301 1305 dr->dr_txg = tx->tx_txg;
1302 1306 dr->dr_next = *drp;
1303 1307 *drp = dr;
1304 1308
1305 1309 /*
1306 1310 * We could have been freed_in_flight between the dbuf_noread
1307 1311 * and dbuf_dirty. We win, as though the dbuf_noread() had
1308 1312 * happened after the free.
1309 1313 */
1310 1314 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1311 1315 db->db_blkid != DMU_SPILL_BLKID) {
1312 1316 mutex_enter(&dn->dn_mtx);
1313 1317 if (dn->dn_free_ranges[txgoff] != NULL) {
1314 1318 range_tree_clear(dn->dn_free_ranges[txgoff],
1315 1319 db->db_blkid, 1);
1316 1320 }
1317 1321 mutex_exit(&dn->dn_mtx);
1318 1322 db->db_freed_in_flight = FALSE;
1319 1323 }
1320 1324
1321 1325 /*
1322 1326 * This buffer is now part of this txg
1323 1327 */
1324 1328 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1325 1329 db->db_dirtycnt += 1;
1326 1330 ASSERT3U(db->db_dirtycnt, <=, 3);
1327 1331
1328 1332 mutex_exit(&db->db_mtx);
1329 1333
1330 1334 if (db->db_blkid == DMU_BONUS_BLKID ||
1331 1335 db->db_blkid == DMU_SPILL_BLKID) {
1332 1336 mutex_enter(&dn->dn_mtx);
1333 1337 ASSERT(!list_link_active(&dr->dr_dirty_node));
1334 1338 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1335 1339 mutex_exit(&dn->dn_mtx);
1336 1340 dnode_setdirty(dn, tx);
1337 1341 DB_DNODE_EXIT(db);
1338 1342 return (dr);
1339 1343 } else if (do_free_accounting) {
1340 1344 blkptr_t *bp = db->db_blkptr;
1341 1345 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1342 1346 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1343 1347 /*
1344 1348 * This is only a guess -- if the dbuf is dirty
1345 1349 * in a previous txg, we don't know how much
1346 1350 * space it will use on disk yet. We should
1347 1351 * really have the struct_rwlock to access
1348 1352 * db_blkptr, but since this is just a guess,
1349 1353 * it's OK if we get an odd answer.
1350 1354 */
1351 1355 ddt_prefetch(os->os_spa, bp);
1352 1356 dnode_willuse_space(dn, -willfree, tx);
1353 1357 }
1354 1358
1355 1359 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1356 1360 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1357 1361 drop_struct_lock = TRUE;
1358 1362 }
1359 1363
1360 1364 if (db->db_level == 0) {
1361 1365 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1362 1366 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1363 1367 }
1364 1368
1365 1369 if (db->db_level+1 < dn->dn_nlevels) {
1366 1370 dmu_buf_impl_t *parent = db->db_parent;
1367 1371 dbuf_dirty_record_t *di;
1368 1372 int parent_held = FALSE;
1369 1373
1370 1374 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1371 1375 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1372 1376
1373 1377 parent = dbuf_hold_level(dn, db->db_level+1,
1374 1378 db->db_blkid >> epbs, FTAG);
1375 1379 ASSERT(parent != NULL);
1376 1380 parent_held = TRUE;
1377 1381 }
1378 1382 if (drop_struct_lock)
1379 1383 rw_exit(&dn->dn_struct_rwlock);
1380 1384 ASSERT3U(db->db_level+1, ==, parent->db_level);
1381 1385 di = dbuf_dirty(parent, tx);
1382 1386 if (parent_held)
1383 1387 dbuf_rele(parent, FTAG);
1384 1388
1385 1389 mutex_enter(&db->db_mtx);
1386 1390 /*
1387 1391 * Since we've dropped the mutex, it's possible that
1388 1392 * dbuf_undirty() might have changed this out from under us.
1389 1393 */
1390 1394 if (db->db_last_dirty == dr ||
1391 1395 dn->dn_object == DMU_META_DNODE_OBJECT) {
1392 1396 mutex_enter(&di->dt.di.dr_mtx);
1393 1397 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1394 1398 ASSERT(!list_link_active(&dr->dr_dirty_node));
1395 1399 list_insert_tail(&di->dt.di.dr_children, dr);
1396 1400 mutex_exit(&di->dt.di.dr_mtx);
1397 1401 dr->dr_parent = di;
1398 1402 }
1399 1403 mutex_exit(&db->db_mtx);
1400 1404 } else {
1401 1405 ASSERT(db->db_level+1 == dn->dn_nlevels);
1402 1406 ASSERT(db->db_blkid < dn->dn_nblkptr);
1403 1407 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1404 1408 mutex_enter(&dn->dn_mtx);
1405 1409 ASSERT(!list_link_active(&dr->dr_dirty_node));
1406 1410 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1407 1411 mutex_exit(&dn->dn_mtx);
1408 1412 if (drop_struct_lock)
1409 1413 rw_exit(&dn->dn_struct_rwlock);
1410 1414 }
1411 1415
1412 1416 dnode_setdirty(dn, tx);
1413 1417 DB_DNODE_EXIT(db);
1414 1418 return (dr);
1415 1419 }
1416 1420
1417 1421 /*
1418 1422 * Undirty a buffer in the transaction group referenced by the given
1419 1423 * transaction. Return whether this evicted the dbuf.
1420 1424 */
1421 1425 static boolean_t
1422 1426 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1423 1427 {
1424 1428 dnode_t *dn;
1425 1429 uint64_t txg = tx->tx_txg;
1426 1430 dbuf_dirty_record_t *dr, **drp;
1427 1431
1428 1432 ASSERT(txg != 0);
1429 1433
1430 1434 /*
1431 1435 * Due to our use of dn_nlevels below, this can only be called
1432 1436 * in open context, unless we are operating on the MOS.
1433 1437 * From syncing context, dn_nlevels may be different from the
1434 1438 * dn_nlevels used when dbuf was dirtied.
1435 1439 */
1436 1440 ASSERT(db->db_objset ==
1437 1441 dmu_objset_pool(db->db_objset)->dp_meta_objset ||
1438 1442 txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
1439 1443 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1440 1444 ASSERT0(db->db_level);
1441 1445 ASSERT(MUTEX_HELD(&db->db_mtx));
1442 1446
1443 1447 /*
1444 1448 * If this buffer is not dirty, we're done.
1445 1449 */
1446 1450 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1447 1451 if (dr->dr_txg <= txg)
1448 1452 break;
1449 1453 if (dr == NULL || dr->dr_txg < txg)
1450 1454 return (B_FALSE);
1451 1455 ASSERT(dr->dr_txg == txg);
1452 1456 ASSERT(dr->dr_dbuf == db);
1453 1457
1454 1458 DB_DNODE_ENTER(db);
1455 1459 dn = DB_DNODE(db);
1456 1460
1457 1461 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1458 1462
1459 1463 ASSERT(db->db.db_size != 0);
1460 1464
1461 1465 dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
1462 1466 dr->dr_accounted, txg);
1463 1467
1464 1468 *drp = dr->dr_next;
1465 1469
1466 1470 /*
1467 1471 * Note that there are three places in dbuf_dirty()
1468 1472 * where this dirty record may be put on a list.
1469 1473 * Make sure to do a list_remove corresponding to
1470 1474 * every one of those list_insert calls.
1471 1475 */
1472 1476 if (dr->dr_parent) {
1473 1477 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1474 1478 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1475 1479 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1476 1480 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1477 1481 db->db_level + 1 == dn->dn_nlevels) {
1478 1482 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1479 1483 mutex_enter(&dn->dn_mtx);
1480 1484 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1481 1485 mutex_exit(&dn->dn_mtx);
1482 1486 }
1483 1487 DB_DNODE_EXIT(db);
1484 1488
1485 1489 if (db->db_state != DB_NOFILL) {
1486 1490 dbuf_unoverride(dr);
1487 1491
1488 1492 ASSERT(db->db_buf != NULL);
1489 1493 ASSERT(dr->dt.dl.dr_data != NULL);
1490 1494 if (dr->dt.dl.dr_data != db->db_buf)
1491 1495 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1492 1496 }
1493 1497
1494 1498 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1495 1499
1496 1500 ASSERT(db->db_dirtycnt > 0);
1497 1501 db->db_dirtycnt -= 1;
1498 1502
1499 1503 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1500 1504 arc_buf_t *buf = db->db_buf;
1501 1505
1502 1506 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1503 1507 dbuf_clear_data(db);
1504 1508 VERIFY(arc_buf_remove_ref(buf, db));
1505 1509 dbuf_evict(db);
1506 1510 return (B_TRUE);
1507 1511 }
1508 1512
1509 1513 return (B_FALSE);
1510 1514 }
1511 1515
1512 1516 void
1513 1517 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1514 1518 {
1515 1519 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1516 1520 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1517 1521
1518 1522 ASSERT(tx->tx_txg != 0);
1519 1523 ASSERT(!refcount_is_zero(&db->db_holds));
1520 1524
1521 1525 /*
1522 1526 	 * Quick check for dirtiness. For already dirty blocks, this
1523 1527 * reduces runtime of this function by >90%, and overall performance
1524 1528 * by 50% for some workloads (e.g. file deletion with indirect blocks
1525 1529 * cached).
1526 1530 */
1527 1531 mutex_enter(&db->db_mtx);
1528 1532 dbuf_dirty_record_t *dr;
1529 1533 for (dr = db->db_last_dirty;
1530 1534 dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
1531 1535 /*
1532 1536 * It's possible that it is already dirty but not cached,
1533 1537 * because there are some calls to dbuf_dirty() that don't
1534 1538 * go through dmu_buf_will_dirty().
1535 1539 */
1536 1540 if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
1537 1541 /* This dbuf is already dirty and cached. */
1538 1542 dbuf_redirty(dr);
1539 1543 mutex_exit(&db->db_mtx);
1540 1544 return;
1541 1545 }
1542 1546 }
1543 1547 mutex_exit(&db->db_mtx);
1544 1548
1545 1549 DB_DNODE_ENTER(db);
1546 1550 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1547 1551 rf |= DB_RF_HAVESTRUCT;
1548 1552 DB_DNODE_EXIT(db);
1549 1553 (void) dbuf_read(db, NULL, rf);
1550 1554 (void) dbuf_dirty(db, tx);
1551 1555 }
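For orientation, this is the function most open-context writers reach dbuf_dirty() through. A hedged sketch of the canonical pattern, error handling elided (example_modify and obj are illustrative; the dmu_tx_* calls are the standard DMU API):

static void
example_modify(objset_t *os, uint64_t obj, dmu_buf_t *db, uint64_t off,
    int len)
{
	dmu_tx_t *tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, obj, off, len);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));

	dmu_buf_will_dirty(db, tx);	/* read if needed, then dirty */
	/* ... modify db->db_data for [off, off + len) ... */

	dmu_tx_commit(tx);
}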
1552 1556
1553 1557 void
1554 1558 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1555 1559 {
1556 1560 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1557 1561
1558 1562 db->db_state = DB_NOFILL;
1559 1563
1560 1564 dmu_buf_will_fill(db_fake, tx);
1561 1565 }
1562 1566
1563 1567 void
1564 1568 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1565 1569 {
1566 1570 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1567 1571
1568 1572 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1569 1573 ASSERT(tx->tx_txg != 0);
1570 1574 ASSERT(db->db_level == 0);
1571 1575 ASSERT(!refcount_is_zero(&db->db_holds));
1572 1576
1573 1577 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1574 1578 dmu_tx_private_ok(tx));
1575 1579
1576 1580 dbuf_noread(db);
1577 1581 (void) dbuf_dirty(db, tx);
1578 1582 }
1579 1583
1580 1584 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1581 1585 /* ARGSUSED */
1582 1586 void
1583 1587 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1584 1588 {
1585 1589 mutex_enter(&db->db_mtx);
1586 1590 DBUF_VERIFY(db);
1587 1591
1588 1592 if (db->db_state == DB_FILL) {
1589 1593 if (db->db_level == 0 && db->db_freed_in_flight) {
1590 1594 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1591 1595 /* we were freed while filling */
1592 1596 /* XXX dbuf_undirty? */
1593 1597 bzero(db->db.db_data, db->db.db_size);
1594 1598 db->db_freed_in_flight = FALSE;
1595 1599 }
1596 1600 db->db_state = DB_CACHED;
1597 1601 cv_broadcast(&db->db_changed);
1598 1602 }
1599 1603 mutex_exit(&db->db_mtx);
1600 1604 }
1601 1605
1602 1606 void
1603 1607 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1604 1608 bp_embedded_type_t etype, enum zio_compress comp,
1605 1609 int uncompressed_size, int compressed_size, int byteorder,
1606 1610 dmu_tx_t *tx)
1607 1611 {
1608 1612 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1609 1613 struct dirty_leaf *dl;
1610 1614 dmu_object_type_t type;
1611 1615
1612 1616 if (etype == BP_EMBEDDED_TYPE_DATA) {
1613 1617 ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
1614 1618 SPA_FEATURE_EMBEDDED_DATA));
1615 1619 }
1616 1620
1617 1621 DB_DNODE_ENTER(db);
1618 1622 type = DB_DNODE(db)->dn_type;
1619 1623 DB_DNODE_EXIT(db);
1620 1624
1621 1625 ASSERT0(db->db_level);
1622 1626 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1623 1627
1624 1628 dmu_buf_will_not_fill(dbuf, tx);
1625 1629
1626 1630 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1627 1631 dl = &db->db_last_dirty->dt.dl;
1628 1632 encode_embedded_bp_compressed(&dl->dr_overridden_by,
1629 1633 data, comp, uncompressed_size, compressed_size);
1630 1634 BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1631 1635 BP_SET_TYPE(&dl->dr_overridden_by, type);
1632 1636 BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1633 1637 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1634 1638
1635 1639 dl->dr_override_state = DR_OVERRIDDEN;
1636 1640 dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1637 1641 }
1638 1642
1639 1643 /*
1640 1644 	 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1641 1645 	 * by anybody except our caller. Otherwise copy the arc buf's contents to the dbuf.
1642 1646 */
1643 1647 void
1644 1648 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1645 1649 {
1646 1650 ASSERT(!refcount_is_zero(&db->db_holds));
1647 1651 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1648 1652 ASSERT(db->db_level == 0);
1649 1653 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1650 1654 ASSERT(buf != NULL);
1651 1655 ASSERT(arc_buf_size(buf) == db->db.db_size);
1652 1656 ASSERT(tx->tx_txg != 0);
1653 1657
1654 1658 arc_return_buf(buf, db);
1655 1659 ASSERT(arc_released(buf));
1656 1660
1657 1661 mutex_enter(&db->db_mtx);
1658 1662
1659 1663 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1660 1664 cv_wait(&db->db_changed, &db->db_mtx);
1661 1665
1662 1666 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1663 1667
1664 1668 if (db->db_state == DB_CACHED &&
1665 1669 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1666 1670 mutex_exit(&db->db_mtx);
1667 1671 (void) dbuf_dirty(db, tx);
1668 1672 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1669 1673 VERIFY(arc_buf_remove_ref(buf, db));
1670 1674 xuio_stat_wbuf_copied();
1671 1675 return;
1672 1676 }
1673 1677
1674 1678 xuio_stat_wbuf_nocopy();
1675 1679 if (db->db_state == DB_CACHED) {
1676 1680 dbuf_dirty_record_t *dr = db->db_last_dirty;
1677 1681
1678 1682 ASSERT(db->db_buf != NULL);
1679 1683 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1680 1684 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1681 1685 if (!arc_released(db->db_buf)) {
1682 1686 ASSERT(dr->dt.dl.dr_override_state ==
1683 1687 DR_OVERRIDDEN);
1684 1688 arc_release(db->db_buf, db);
1685 1689 }
1686 1690 dr->dt.dl.dr_data = buf;
1687 1691 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1688 1692 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1689 1693 arc_release(db->db_buf, db);
1690 1694 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1691 1695 }
1692 1696 db->db_buf = NULL;
1693 1697 }
1694 1698 ASSERT(db->db_buf == NULL);
1695 1699 dbuf_set_data(db, buf);
1696 1700 db->db_state = DB_FILL;
1697 1701 mutex_exit(&db->db_mtx);
1698 1702 (void) dbuf_dirty(db, tx);
1699 1703 dmu_buf_fill_done(&db->db, tx);
1700 1704 }
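
/*
 * Sketch (not part of dbuf.c): dbuf_assign_arcbuf() is normally reached
 * through the DMU loaned-buffer interface in dmu.c, roughly as below.
 * The loaned buffer is filled outside any dbuf lock and then either
 * assigned in place or copied, per the rules above.  Error handling is
 * elided, and "bonus_db" stands for the object's bonus-buffer handle.
 */
#if 0	/* illustrative example only */
static void
example_loaned_write(dmu_buf_t *bonus_db, uint64_t offset, int blksz,
    const void *src, dmu_tx_t *tx)
{
	arc_buf_t *abuf = dmu_request_arcbuf(bonus_db, blksz);

	bcopy(src, abuf->b_data, blksz);
	dmu_assign_arcbuf(bonus_db, offset, abuf, tx);
}
#endif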
1701 1705
1702 1706 /*
1703 1707 * "Clear" the contents of this dbuf. This will mark the dbuf
1704 1708 * EVICTING and clear *most* of its references. Unfortunately,
1705 1709 * when we are not holding the dn_dbufs_mtx, we can't clear the
1706 1710 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1707 1711 * in this case. For callers from the DMU we will usually see:
1708 1712 * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
1709 1713 * For the arc callback, we will usually see:
1710 1714 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1711 1715 * Sometimes, though, we will get a mix of these two:
1712 1716 * DMU: dbuf_clear()->arc_clear_callback()
1713 1717 * ARC: dbuf_do_evict()->dbuf_destroy()
1714 1718 *
1715 1719 * This routine will dissociate the dbuf from the arc, by calling
1716 1720 * arc_clear_callback(), but will not evict the data from the ARC.
1717 1721 */
1718 1722 void
1719 1723 dbuf_clear(dmu_buf_impl_t *db)
1720 1724 {
1721 1725 dnode_t *dn;
1722 1726 dmu_buf_impl_t *parent = db->db_parent;
1723 1727 dmu_buf_impl_t *dndb;
1724 1728 boolean_t dbuf_gone = B_FALSE;
1725 1729
1726 1730 ASSERT(MUTEX_HELD(&db->db_mtx));
1727 1731 ASSERT(refcount_is_zero(&db->db_holds));
1728 1732
1729 1733 dbuf_evict_user(db);
1730 1734
1731 1735 if (db->db_state == DB_CACHED) {
1732 1736 ASSERT(db->db.db_data != NULL);
1733 1737 if (db->db_blkid == DMU_BONUS_BLKID) {
1734 1738 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1735 1739 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1736 1740 }
1737 1741 db->db.db_data = NULL;
1738 1742 db->db_state = DB_UNCACHED;
1739 1743 }
1740 1744
1741 1745 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1742 1746 ASSERT(db->db_data_pending == NULL);
1743 1747
1744 1748 db->db_state = DB_EVICTING;
1745 1749 db->db_blkptr = NULL;
1746 1750
1747 1751 DB_DNODE_ENTER(db);
1748 1752 dn = DB_DNODE(db);
1749 1753 dndb = dn->dn_dbuf;
1750 1754 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1751 1755 avl_remove(&dn->dn_dbufs, db);
1752 1756 atomic_dec_32(&dn->dn_dbufs_count);
1753 1757 membar_producer();
1754 1758 DB_DNODE_EXIT(db);
1755 1759 /*
1756 1760 * Decrementing the dbuf count means that the hold corresponding
1757 1761 * to the removed dbuf is no longer discounted in dnode_move(),
1758 1762 * so the dnode cannot be moved until after we release the hold.
1759 1763 * The membar_producer() ensures visibility of the decremented
1760 1764 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1761 1765 * release any lock.
1762 1766 */
1763 1767 dnode_rele(dn, db);
1764 1768 db->db_dnode_handle = NULL;
1765 1769 } else {
1766 1770 DB_DNODE_EXIT(db);
1767 1771 }
1768 1772
1769 1773 if (db->db_buf)
1770 1774 dbuf_gone = arc_clear_callback(db->db_buf);
1771 1775
1772 1776 if (!dbuf_gone)
1773 1777 mutex_exit(&db->db_mtx);
1774 1778
1775 1779 /*
1776 1780 * If this dbuf is referenced from an indirect dbuf,
1777 1781 * decrement the ref count on the indirect dbuf.
1778 1782 */
1779 1783 if (parent && parent != dndb)
1780 1784 dbuf_rele(parent, db);
1781 1785 }
1782 1786
1783 1787 /*
1784 1788 * Note: While bpp will always be updated if the function returns success,
1785 1789 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
1786 1790 * this happens when the dnode is the meta-dnode, or a userused or groupused
1787 1791 * object.
1788 1792 */
1789 1793 static int
1790 1794 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1791 1795 dmu_buf_impl_t **parentp, blkptr_t **bpp)
1792 1796 {
1793 1797 int nlevels, epbs;
1794 1798
1795 1799 *parentp = NULL;
1796 1800 *bpp = NULL;
1797 1801
1798 1802 ASSERT(blkid != DMU_BONUS_BLKID);
1799 1803
1800 1804 if (blkid == DMU_SPILL_BLKID) {
1801 1805 mutex_enter(&dn->dn_mtx);
1802 1806 if (dn->dn_have_spill &&
1803 1807 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1804 1808 *bpp = &dn->dn_phys->dn_spill;
1805 1809 else
1806 1810 *bpp = NULL;
1807 1811 dbuf_add_ref(dn->dn_dbuf, NULL);
1808 1812 *parentp = dn->dn_dbuf;
1809 1813 mutex_exit(&dn->dn_mtx);
1810 1814 return (0);
1811 1815 }
1812 1816
1813 1817 if (dn->dn_phys->dn_nlevels == 0)
1814 1818 nlevels = 1;
1815 1819 else
1816 1820 nlevels = dn->dn_phys->dn_nlevels;
1817 1821
1818 1822 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1819 1823
1820 1824 ASSERT3U(level * epbs, <, 64);
1821 1825 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1822 1826 if (level >= nlevels ||
1823 1827 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1824 1828 /* the buffer has no parent yet */
1825 1829 return (SET_ERROR(ENOENT));
1826 1830 } else if (level < nlevels-1) {
1827 1831 /* this block is referenced from an indirect block */
1828 1832 int err = dbuf_hold_impl(dn, level+1,
1829 1833 blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
1830 1834 if (err)
1831 1835 return (err);
1832 1836 err = dbuf_read(*parentp, NULL,
1833 1837 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1834 1838 if (err) {
1835 1839 dbuf_rele(*parentp, NULL);
1836 1840 *parentp = NULL;
1837 1841 return (err);
1838 1842 }
1839 1843 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1840 1844 (blkid & ((1ULL << epbs) - 1));
1841 1845 return (0);
1842 1846 } else {
1843 1847 /* the block is referenced from the dnode */
1844 1848 ASSERT3U(level, ==, nlevels-1);
1845 1849 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1846 1850 blkid < dn->dn_phys->dn_nblkptr);
1847 1851 if (dn->dn_dbuf) {
1848 1852 dbuf_add_ref(dn->dn_dbuf, NULL);
1849 1853 *parentp = dn->dn_dbuf;
1850 1854 }
1851 1855 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1852 1856 return (0);
1853 1857 }
1854 1858 }
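
/*
 * Worked example of the epbs arithmetic above (values assumed): with
 * dn_indblkshift == 17 (128K indirect blocks), epbs == 17 -
 * SPA_BLKPTRSHIFT (7) == 10, so each indirect block holds 1024 block
 * pointers.  For level == 0 and blkid == 1000000, the level-1 parent is
 * blkid >> 10 == 976, and the target bp sits in that parent at slot
 * 1000000 & 1023 == 576.
 */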
1855 1859
1856 1860 static dmu_buf_impl_t *
1857 1861 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1858 1862 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1859 1863 {
1860 1864 objset_t *os = dn->dn_objset;
1861 1865 dmu_buf_impl_t *db, *odb;
1862 1866
1863 1867 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1864 1868 ASSERT(dn->dn_type != DMU_OT_NONE);
1865 1869
1866 1870 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1867 1871
1868 1872 db->db_objset = os;
1869 1873 db->db.db_object = dn->dn_object;
1870 1874 db->db_level = level;
1871 1875 db->db_blkid = blkid;
1872 1876 db->db_last_dirty = NULL;
1873 1877 db->db_dirtycnt = 0;
1874 1878 db->db_dnode_handle = dn->dn_handle;
1875 1879 db->db_parent = parent;
1876 1880 db->db_blkptr = blkptr;
1877 1881
1878 1882 db->db_user = NULL;
1879 1883 db->db_user_immediate_evict = FALSE;
1880 1884 db->db_freed_in_flight = FALSE;
1881 1885 db->db_pending_evict = FALSE;
1882 1886
1883 1887 if (blkid == DMU_BONUS_BLKID) {
1884 1888 ASSERT3P(parent, ==, dn->dn_dbuf);
1885 1889 db->db.db_size = DN_MAX_BONUSLEN -
1886 1890 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1887 1891 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1888 1892 db->db.db_offset = DMU_BONUS_BLKID;
1889 1893 db->db_state = DB_UNCACHED;
1890 1894 /* the bonus dbuf is not placed in the hash table */
1891 1895 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1892 1896 return (db);
1893 1897 } else if (blkid == DMU_SPILL_BLKID) {
1894 1898 db->db.db_size = (blkptr != NULL) ?
1895 1899 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1896 1900 db->db.db_offset = 0;
1897 1901 } else {
1898 1902 int blocksize =
1899 1903 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1900 1904 db->db.db_size = blocksize;
1901 1905 db->db.db_offset = db->db_blkid * blocksize;
1902 1906 }
1903 1907
1904 1908 /*
1905 1909 	 * Hold the dn_dbufs_mtx while we add the new dbuf
1906 1910 	 * to the hash table *and* the dn_dbufs list.
1907 1911 	 * This prevents a possible deadlock with someone
1908 1912 	 * trying to look up this dbuf before it's added to the
1909 1913 	 * dn_dbufs list.
1910 1914 */
1911 1915 mutex_enter(&dn->dn_dbufs_mtx);
1912 1916 db->db_state = DB_EVICTING;
1913 1917 if ((odb = dbuf_hash_insert(db)) != NULL) {
1914 1918 /* someone else inserted it first */
1915 1919 kmem_cache_free(dbuf_cache, db);
1916 1920 mutex_exit(&dn->dn_dbufs_mtx);
1917 1921 return (odb);
1918 1922 }
1919 1923 avl_add(&dn->dn_dbufs, db);
1920 1924 if (db->db_level == 0 && db->db_blkid >=
1921 1925 dn->dn_unlisted_l0_blkid)
1922 1926 dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1923 1927 db->db_state = DB_UNCACHED;
1924 1928 mutex_exit(&dn->dn_dbufs_mtx);
1925 1929 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1926 1930
1927 1931 if (parent && parent != dn->dn_dbuf)
1928 1932 dbuf_add_ref(parent, db);
1929 1933
1930 1934 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1931 1935 refcount_count(&dn->dn_holds) > 0);
1932 1936 (void) refcount_add(&dn->dn_holds, db);
1933 1937 atomic_inc_32(&dn->dn_dbufs_count);
1934 1938
1935 1939 dprintf_dbuf(db, "db=%p\n", db);
1936 1940
1937 1941 return (db);
1938 1942 }
1939 1943
1940 1944 static int
1941 1945 dbuf_do_evict(void *private)
1942 1946 {
1943 1947 dmu_buf_impl_t *db = private;
1944 1948
1945 1949 if (!MUTEX_HELD(&db->db_mtx))
1946 1950 mutex_enter(&db->db_mtx);
1947 1951
1948 1952 ASSERT(refcount_is_zero(&db->db_holds));
1949 1953
1950 1954 if (db->db_state != DB_EVICTING) {
1951 1955 ASSERT(db->db_state == DB_CACHED);
1952 1956 DBUF_VERIFY(db);
1953 1957 db->db_buf = NULL;
1954 1958 dbuf_evict(db);
1955 1959 } else {
1956 1960 mutex_exit(&db->db_mtx);
1957 1961 dbuf_destroy(db);
1958 1962 }
1959 1963 return (0);
1960 1964 }
1961 1965
1962 1966 static void
1963 1967 dbuf_destroy(dmu_buf_impl_t *db)
1964 1968 {
1965 1969 ASSERT(refcount_is_zero(&db->db_holds));
1966 1970
1967 1971 if (db->db_blkid != DMU_BONUS_BLKID) {
1968 1972 /*
1969 1973 * If this dbuf is still on the dn_dbufs list,
1970 1974 * remove it from that list.
1971 1975 */
1972 1976 if (db->db_dnode_handle != NULL) {
1973 1977 dnode_t *dn;
1974 1978
1975 1979 DB_DNODE_ENTER(db);
1976 1980 dn = DB_DNODE(db);
1977 1981 mutex_enter(&dn->dn_dbufs_mtx);
1978 1982 avl_remove(&dn->dn_dbufs, db);
1979 1983 atomic_dec_32(&dn->dn_dbufs_count);
1980 1984 mutex_exit(&dn->dn_dbufs_mtx);
1981 1985 DB_DNODE_EXIT(db);
1982 1986 /*
1983 1987 * Decrementing the dbuf count means that the hold
1984 1988 * corresponding to the removed dbuf is no longer
1985 1989 * discounted in dnode_move(), so the dnode cannot be
1986 1990 * moved until after we release the hold.
1987 1991 */
1988 1992 dnode_rele(dn, db);
1989 1993 db->db_dnode_handle = NULL;
1990 1994 }
1991 1995 dbuf_hash_remove(db);
1992 1996 }
1993 1997 db->db_parent = NULL;
1994 1998 db->db_buf = NULL;
1995 1999
1996 2000 ASSERT(db->db.db_data == NULL);
1997 2001 ASSERT(db->db_hash_next == NULL);
1998 2002 ASSERT(db->db_blkptr == NULL);
1999 2003 ASSERT(db->db_data_pending == NULL);
2000 2004
2001 2005 kmem_cache_free(dbuf_cache, db);
2002 2006 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2003 2007 }
2004 2008
2005 2009 typedef struct dbuf_prefetch_arg {
2006 2010 spa_t *dpa_spa; /* The spa to issue the prefetch in. */
2007 2011 zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2008 2012 int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
2009 2013 int dpa_curlevel; /* The current level that we're reading */
2010 2014 zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
2011 2015 zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
2012 2016 arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
2013 2017 } dbuf_prefetch_arg_t;
2014 2018
2015 2019 /*
2016 2020 * Actually issue the prefetch read for the block given.
2017 2021 */
2018 2022 static void
2019 2023 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
2020 2024 {
2021 2025 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2022 2026 return;
2023 2027
2024 2028 arc_flags_t aflags =
2025 2029 dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
2026 2030
2027 2031 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2028 2032 ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
2029 2033 ASSERT(dpa->dpa_zio != NULL);
2030 2034 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
2031 2035 dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2032 2036 &aflags, &dpa->dpa_zb);
2033 2037 }
2034 2038
2035 2039 /*
2036 2040 * Called when an indirect block above our prefetch target is read in. This
2037 2041 * will either read in the next indirect block down the tree or issue the actual
2038 2042 * prefetch if the next block down is our target.
2039 2043 */
2040 2044 static void
2041 2045 dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
2042 2046 {
2043 2047 dbuf_prefetch_arg_t *dpa = private;
2044 2048
2045 2049 ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
2046 2050 ASSERT3S(dpa->dpa_curlevel, >, 0);
2047 2051 if (zio != NULL) {
2048 2052 ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
2049 2053 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
2050 2054 ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
2051 2055 }
2052 2056
2053 2057 dpa->dpa_curlevel--;
2054 2058
2055 2059 uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
2056 2060 (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
2057 2061 blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
2058 2062 P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
2059 2063 if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
2060 2064 kmem_free(dpa, sizeof (*dpa));
2061 2065 } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
2062 2066 ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
2063 2067 dbuf_issue_final_prefetch(dpa, bp);
2064 2068 kmem_free(dpa, sizeof (*dpa));
2065 2069 } else {
2066 2070 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2067 2071 zbookmark_phys_t zb;
2068 2072
2069 2073 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2070 2074
2071 2075 SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
2072 2076 dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
2073 2077
2074 2078 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2075 2079 bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
2076 2080 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2077 2081 &iter_aflags, &zb);
2078 2082 }
2079 2083 (void) arc_buf_remove_ref(abuf, private);
2080 2084 }
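
/*
 * Worked example of the nextblkid arithmetic above (values assumed):
 * target zb_level == 0, zb_blkid == 3000000, epbs == 10, starting from
 * a cached level-3 indirect.  Each callback steps down one level:
 *
 *	dpa_curlevel 2: nextblkid = 3000000 >> 20 == 2,    bp slot 2
 *	dpa_curlevel 1: nextblkid = 3000000 >> 10 == 2929, bp slot 881
 *	dpa_curlevel 0: nextblkid = 3000000,               bp slot 704
 *
 * where each slot is P2PHASE(nextblkid, 1024).  Once dpa_curlevel
 * reaches zb_level, the final prefetch is issued on the target bp.
 */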
2081 2085
2082 2086 /*
2083 2087 * Issue prefetch reads for the given block on the given level. If the indirect
2084 2088 * blocks above that block are not in memory, we will read them in
2085 2089 * asynchronously. As a result, this call never blocks waiting for a read to
2086 2090 * complete.
2087 2091 */
2088 2092 void
2089 2093 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
2090 2094 arc_flags_t aflags)
2091 2095 {
2092 2096 blkptr_t bp;
2093 2097 int epbs, nlevels, curlevel;
2094 2098 uint64_t curblkid;
2095 2099
2096 2100 ASSERT(blkid != DMU_BONUS_BLKID);
2097 2101 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2098 2102
2099 2103 if (blkid > dn->dn_maxblkid)
2100 2104 return;
2101 2105
2102 2106 if (dnode_block_freed(dn, blkid))
2103 2107 return;
2104 2108
2105 2109 /*
2106 2110 * This dnode hasn't been written to disk yet, so there's nothing to
2107 2111 * prefetch.
2108 2112 */
2109 2113 nlevels = dn->dn_phys->dn_nlevels;
2110 2114 if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
2111 2115 return;
2112 2116
2113 2117 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2114 2118 if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
2115 2119 return;
2116 2120
2117 2121 dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
2118 2122 level, blkid);
2119 2123 if (db != NULL) {
2120 2124 mutex_exit(&db->db_mtx);
2121 2125 /*
2122 2126 * This dbuf already exists. It is either CACHED, or
2123 2127 * (we assume) about to be read or filled.
2124 2128 */
2125 2129 return;
2126 2130 }
2127 2131
2128 2132 /*
2129 2133 * Find the closest ancestor (indirect block) of the target block
2130 2134 * that is present in the cache. In this indirect block, we will
2131 2135 * find the bp that is at curlevel, curblkid.
2132 2136 */
2133 2137 curlevel = level;
2134 2138 curblkid = blkid;
2135 2139 while (curlevel < nlevels - 1) {
2136 2140 int parent_level = curlevel + 1;
2137 2141 uint64_t parent_blkid = curblkid >> epbs;
2138 2142 dmu_buf_impl_t *db;
2139 2143
2140 2144 if (dbuf_hold_impl(dn, parent_level, parent_blkid,
2141 2145 FALSE, TRUE, FTAG, &db) == 0) {
2142 2146 blkptr_t *bpp = db->db_buf->b_data;
2143 2147 bp = bpp[P2PHASE(curblkid, 1 << epbs)];
2144 2148 dbuf_rele(db, FTAG);
2145 2149 break;
2146 2150 }
2147 2151
2148 2152 curlevel = parent_level;
2149 2153 curblkid = parent_blkid;
2150 2154 }
2151 2155
2152 2156 if (curlevel == nlevels - 1) {
2153 2157 /* No cached indirect blocks found. */
2154 2158 ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
2155 2159 bp = dn->dn_phys->dn_blkptr[curblkid];
2156 2160 }
2157 2161 if (BP_IS_HOLE(&bp))
2158 2162 return;
2159 2163
2160 2164 ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
2161 2165
2162 2166 zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
2163 2167 ZIO_FLAG_CANFAIL);
2164 2168
2165 2169 dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
2166 2170 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
2167 2171 SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2168 2172 dn->dn_object, level, blkid);
2169 2173 dpa->dpa_curlevel = curlevel;
2170 2174 dpa->dpa_prio = prio;
2171 2175 dpa->dpa_aflags = aflags;
2172 2176 dpa->dpa_spa = dn->dn_objset->os_spa;
2173 2177 dpa->dpa_epbs = epbs;
2174 2178 dpa->dpa_zio = pio;
2175 2179
2176 2180 /*
2177 2181 * If we have the indirect just above us, no need to do the asynchronous
2178 2182 * prefetch chain; we'll just run the last step ourselves. If we're at
2179 2183 * a higher level, though, we want to issue the prefetches for all the
2180 2184 * indirect blocks asynchronously, so we can go on with whatever we were
2181 2185 * doing.
2182 2186 */
2183 2187 if (curlevel == level) {
2184 2188 ASSERT3U(curblkid, ==, blkid);
2185 2189 dbuf_issue_final_prefetch(dpa, &bp);
2186 2190 kmem_free(dpa, sizeof (*dpa));
2187 2191 } else {
2188 2192 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2189 2193 zbookmark_phys_t zb;
2190 2194
2191 2195 SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2192 2196 dn->dn_object, curlevel, curblkid);
2193 2197 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2194 2198 &bp, dbuf_prefetch_indirect_done, dpa, prio,
2195 2199 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2196 2200 &iter_aflags, &zb);
2197 2201 }
2198 2202 /*
2199 2203 * We use pio here instead of dpa_zio since it's possible that
2200 2204 * dpa may have already been freed.
2201 2205 */
2202 2206 zio_nowait(pio);
2203 2207 }
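
/*
 * Sketch (not part of dbuf.c): issuing level-0 prefetches for a run of
 * blocks, roughly as the dmu_prefetch() path does.  Per the contract
 * above, dn_struct_rwlock must be held across the calls; the priority
 * and the zero aflags used here are illustrative.
 */
#if 0	/* illustrative example only */
static void
example_prefetch_run(dnode_t *dn, uint64_t start, uint64_t nblks)
{
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	for (uint64_t i = 0; i < nblks; i++)
		dbuf_prefetch(dn, 0, start + i, ZIO_PRIORITY_ASYNC_READ, 0);
	rw_exit(&dn->dn_struct_rwlock);
}
#endif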
2204 2208
2205 2209 /*
2206 2210 * Returns with db_holds incremented, and db_mtx not held.
2207 2211 * Note: dn_struct_rwlock must be held.
2208 2212 */
2209 2213 int
2210 2214 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
2211 2215 boolean_t fail_sparse, boolean_t fail_uncached,
2212 2216 void *tag, dmu_buf_impl_t **dbp)
2213 2217 {
2214 2218 dmu_buf_impl_t *db, *parent = NULL;
2215 2219
2216 2220 ASSERT(blkid != DMU_BONUS_BLKID);
2217 2221 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2218 2222 ASSERT3U(dn->dn_nlevels, >, level);
2219 2223
2220 2224 *dbp = NULL;
2221 2225 top:
2222 2226 /* dbuf_find() returns with db_mtx held */
2223 2227 db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
2224 2228
2225 2229 if (db == NULL) {
2226 2230 blkptr_t *bp = NULL;
2227 2231 int err;
2228 2232
2229 2233 if (fail_uncached)
2230 2234 return (SET_ERROR(ENOENT));
2231 2235
2232 2236 ASSERT3P(parent, ==, NULL);
2233 2237 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
2234 2238 if (fail_sparse) {
2235 2239 if (err == 0 && bp && BP_IS_HOLE(bp))
2236 2240 err = SET_ERROR(ENOENT);
2237 2241 if (err) {
2238 2242 if (parent)
2239 2243 dbuf_rele(parent, NULL);
2240 2244 return (err);
2241 2245 }
2242 2246 }
2243 2247 if (err && err != ENOENT)
2244 2248 return (err);
2245 2249 db = dbuf_create(dn, level, blkid, parent, bp);
2246 2250 }
2247 2251
2248 2252 if (fail_uncached && db->db_state != DB_CACHED) {
2249 2253 mutex_exit(&db->db_mtx);
2250 2254 return (SET_ERROR(ENOENT));
2251 2255 }
2252 2256
2253 2257 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
2254 2258 arc_buf_add_ref(db->db_buf, db);
2255 2259 if (db->db_buf->b_data == NULL) {
2256 2260 dbuf_clear(db);
2257 2261 if (parent) {
2258 2262 dbuf_rele(parent, NULL);
2259 2263 parent = NULL;
2260 2264 }
2261 2265 goto top;
2262 2266 }
2263 2267 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
2264 2268 }
2265 2269
2266 2270 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
2267 2271
2268 2272 /*
2269 2273 	 * If this buffer is currently syncing out, and we are
2270 2274 * still referencing it from db_data, we need to make a copy
2271 2275 * of it in case we decide we want to dirty it again in this txg.
2272 2276 */
2273 2277 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2274 2278 dn->dn_object != DMU_META_DNODE_OBJECT &&
2275 2279 db->db_state == DB_CACHED && db->db_data_pending) {
2276 2280 dbuf_dirty_record_t *dr = db->db_data_pending;
2277 2281
2278 2282 if (dr->dt.dl.dr_data == db->db_buf) {
2279 2283 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2280 2284
2281 2285 dbuf_set_data(db,
2282 2286 arc_buf_alloc(dn->dn_objset->os_spa,
2283 2287 db->db.db_size, db, type));
2284 2288 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2285 2289 db->db.db_size);
2286 2290 }
2287 2291 }
2288 2292
2289 2293 (void) refcount_add(&db->db_holds, tag);
2290 2294 DBUF_VERIFY(db);
2291 2295 mutex_exit(&db->db_mtx);
2292 2296
2293 2297 /* NOTE: we can't rele the parent until after we drop the db_mtx */
2294 2298 if (parent)
2295 2299 dbuf_rele(parent, NULL);
2296 2300
2297 2301 ASSERT3P(DB_DNODE(db), ==, dn);
2298 2302 ASSERT3U(db->db_blkid, ==, blkid);
2299 2303 ASSERT3U(db->db_level, ==, level);
2300 2304 *dbp = db;
2301 2305
2302 2306 return (0);
2303 2307 }
2304 2308
2305 2309 dmu_buf_impl_t *
2306 2310 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2307 2311 {
2308 2312 return (dbuf_hold_level(dn, 0, blkid, tag));
2309 2313 }
2310 2314
2311 2315 dmu_buf_impl_t *
2312 2316 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2313 2317 {
2314 2318 dmu_buf_impl_t *db;
2315 2319 int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
2316 2320 return (err ? NULL : db);
2317 2321 }
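
/*
 * Sketch (not part of dbuf.c) of the hold/release contract above:
 * dn_struct_rwlock is assumed held by the caller, dbuf_hold() returns
 * NULL on failure, and a successful hold must be released with the
 * same tag.  The EIO mapping here is illustrative.
 */
#if 0	/* illustrative example only */
static int
example_touch_block(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);

	if (db == NULL)
		return (SET_ERROR(EIO));
	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
	dbuf_rele(db, FTAG);
	return (0);
}
#endif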
2318 2322
2319 2323 void
2320 2324 dbuf_create_bonus(dnode_t *dn)
2321 2325 {
2322 2326 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2323 2327
2324 2328 ASSERT(dn->dn_bonus == NULL);
2325 2329 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
2326 2330 }
2327 2331
2328 2332 int
2329 2333 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2330 2334 {
2331 2335 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2332 2336 dnode_t *dn;
2333 2337
2334 2338 if (db->db_blkid != DMU_SPILL_BLKID)
2335 2339 return (SET_ERROR(ENOTSUP));
2336 2340 if (blksz == 0)
2337 2341 blksz = SPA_MINBLOCKSIZE;
2338 2342 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
2339 2343 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2340 2344
2341 2345 DB_DNODE_ENTER(db);
2342 2346 dn = DB_DNODE(db);
2343 2347 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2344 2348 dbuf_new_size(db, blksz, tx);
2345 2349 rw_exit(&dn->dn_struct_rwlock);
2346 2350 DB_DNODE_EXIT(db);
2347 2351
2348 2352 return (0);
2349 2353 }
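
/*
 * Worked example of the size normalization above (values assumed): a
 * requested blksz of 600 bytes rounds up to P2ROUNDUP(600,
 * SPA_MINBLOCKSIZE) == 1024, while a request of 0 first becomes
 * SPA_MINBLOCKSIZE (512) and is left at 512.
 */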
2350 2354
2351 2355 void
2352 2356 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2353 2357 {
2354 2358 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2355 2359 }
2356 2360
2357 2361 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2358 2362 void
2359 2363 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2360 2364 {
2361 2365 int64_t holds = refcount_add(&db->db_holds, tag);
2362 2366 ASSERT(holds > 1);
2363 2367 }
2364 2368
2365 2369 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
2366 2370 boolean_t
2367 2371 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
2368 2372 void *tag)
2369 2373 {
2370 2374 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2371 2375 dmu_buf_impl_t *found_db;
2372 2376 boolean_t result = B_FALSE;
2373 2377
2374 2378 if (db->db_blkid == DMU_BONUS_BLKID)
2375 2379 found_db = dbuf_find_bonus(os, obj);
2376 2380 else
2377 2381 found_db = dbuf_find(os, obj, 0, blkid);
2378 2382
2379 2383 if (found_db != NULL) {
2380 2384 if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
2381 2385 (void) refcount_add(&db->db_holds, tag);
2382 2386 result = B_TRUE;
2383 2387 }
2384 2388 mutex_exit(&db->db_mtx);
2385 2389 }
2386 2390 return (result);
2387 2391 }
2388 2392
2389 2393 /*
2390 2394 * If you call dbuf_rele() you had better not be referencing the dnode handle
2391 2395 * unless you have some other direct or indirect hold on the dnode. (An indirect
2392 2396 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2393 2397 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2394 2398 * dnode's parent dbuf evicting its dnode handles.
2395 2399 */
2396 2400 void
2397 2401 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2398 2402 {
2399 2403 mutex_enter(&db->db_mtx);
2400 2404 dbuf_rele_and_unlock(db, tag);
2401 2405 }
2402 2406
2403 2407 void
2404 2408 dmu_buf_rele(dmu_buf_t *db, void *tag)
2405 2409 {
2406 2410 dbuf_rele((dmu_buf_impl_t *)db, tag);
2407 2411 }
2408 2412
2409 2413 /*
2410 2414 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2411 2415 * db_dirtycnt and db_holds to be updated atomically.
2412 2416 */
2413 2417 void
2414 2418 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2415 2419 {
2416 2420 int64_t holds;
2417 2421
2418 2422 ASSERT(MUTEX_HELD(&db->db_mtx));
2419 2423 DBUF_VERIFY(db);
2420 2424
2421 2425 /*
2422 2426 * Remove the reference to the dbuf before removing its hold on the
2423 2427 * dnode so we can guarantee in dnode_move() that a referenced bonus
2424 2428 * buffer has a corresponding dnode hold.
2425 2429 */
2426 2430 holds = refcount_remove(&db->db_holds, tag);
2427 2431 ASSERT(holds >= 0);
2428 2432
2429 2433 /*
2430 2434 * We can't freeze indirects if there is a possibility that they
2431 2435 * may be modified in the current syncing context.
2432 2436 */
2433 2437 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2434 2438 arc_buf_freeze(db->db_buf);
2435 2439
2436 2440 if (holds == db->db_dirtycnt &&
2437 2441 db->db_level == 0 && db->db_user_immediate_evict)
2438 2442 dbuf_evict_user(db);
2439 2443
2440 2444 if (holds == 0) {
2441 2445 if (db->db_blkid == DMU_BONUS_BLKID) {
2442 2446 dnode_t *dn;
2443 2447 boolean_t evict_dbuf = db->db_pending_evict;
2444 2448
2445 2449 /*
2446 2450 * If the dnode moves here, we cannot cross this
2447 2451 * barrier until the move completes.
2448 2452 */
2449 2453 DB_DNODE_ENTER(db);
2450 2454
2451 2455 dn = DB_DNODE(db);
2452 2456 atomic_dec_32(&dn->dn_dbufs_count);
2453 2457
2454 2458 /*
2455 2459 * Decrementing the dbuf count means that the bonus
2456 2460 * buffer's dnode hold is no longer discounted in
2457 2461 * dnode_move(). The dnode cannot move until after
2458 2462 * the dnode_rele() below.
2459 2463 */
2460 2464 DB_DNODE_EXIT(db);
2461 2465
2462 2466 /*
2463 2467 * Do not reference db after its lock is dropped.
2464 2468 * Another thread may evict it.
2465 2469 */
2466 2470 mutex_exit(&db->db_mtx);
2467 2471
2468 2472 if (evict_dbuf)
2469 2473 dnode_evict_bonus(dn);
2470 2474
2471 2475 dnode_rele(dn, db);
2472 2476 } else if (db->db_buf == NULL) {
2473 2477 /*
2474 2478 * This is a special case: we never associated this
2475 2479 * dbuf with any data allocated from the ARC.
2476 2480 */
2477 2481 ASSERT(db->db_state == DB_UNCACHED ||
2478 2482 db->db_state == DB_NOFILL);
2479 2483 dbuf_evict(db);
2480 2484 } else if (arc_released(db->db_buf)) {
2481 2485 arc_buf_t *buf = db->db_buf;
2482 2486 /*
2483 2487 * This dbuf has anonymous data associated with it.
2484 2488 */
2485 2489 dbuf_clear_data(db);
2486 2490 VERIFY(arc_buf_remove_ref(buf, db));
2487 2491 dbuf_evict(db);
2488 2492 } else {
2489 2493 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2490 2494
2491 2495 /*
2492 2496 * A dbuf will be eligible for eviction if either the
2493 2497 * 'primarycache' property is set or a duplicate
2494 2498 * copy of this buffer is already cached in the arc.
2495 2499 *
2496 2500 * In the case of the 'primarycache' a buffer
2497 2501 * is considered for eviction if it matches the
2498 2502 * criteria set in the property.
2499 2503 *
2500 2504 * To decide if our buffer is considered a
2501 2505 * duplicate, we must call into the arc to determine
2502 2506 * if multiple buffers are referencing the same
2503 2507 * block on-disk. If so, then we simply evict
2504 2508 * ourselves.
2505 2509 */
2506 2510 if (!DBUF_IS_CACHEABLE(db)) {
2507 2511 if (db->db_blkptr != NULL &&
2508 2512 !BP_IS_HOLE(db->db_blkptr) &&
2509 2513 !BP_IS_EMBEDDED(db->db_blkptr)) {
2510 2514 spa_t *spa =
2511 2515 dmu_objset_spa(db->db_objset);
2512 2516 blkptr_t bp = *db->db_blkptr;
2513 2517 dbuf_clear(db);
2514 2518 arc_freed(spa, &bp);
2515 2519 } else {
2516 2520 dbuf_clear(db);
2517 2521 }
2518 2522 } else if (db->db_pending_evict ||
2519 2523 arc_buf_eviction_needed(db->db_buf)) {
2520 2524 dbuf_clear(db);
2521 2525 } else {
2522 2526 mutex_exit(&db->db_mtx);
2523 2527 }
2524 2528 }
2525 2529 } else {
2526 2530 mutex_exit(&db->db_mtx);
2527 2531 }
2528 2532 }
2529 2533
2530 2534 #pragma weak dmu_buf_refcount = dbuf_refcount
2531 2535 uint64_t
2532 2536 dbuf_refcount(dmu_buf_impl_t *db)
2533 2537 {
2534 2538 return (refcount_count(&db->db_holds));
2535 2539 }
2536 2540
2537 2541 void *
2538 2542 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2539 2543 dmu_buf_user_t *new_user)
2540 2544 {
2541 2545 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2542 2546
2543 2547 mutex_enter(&db->db_mtx);
2544 2548 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2545 2549 if (db->db_user == old_user)
2546 2550 db->db_user = new_user;
2547 2551 else
2548 2552 old_user = db->db_user;
2549 2553 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2550 2554 mutex_exit(&db->db_mtx);
2551 2555
2552 2556 return (old_user);
2553 2557 }
2554 2558
2555 2559 void *
2556 2560 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2557 2561 {
2558 2562 return (dmu_buf_replace_user(db_fake, NULL, user));
2559 2563 }
2560 2564
2561 2565 void *
2562 2566 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2563 2567 {
2564 2568 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2565 2569
2566 2570 db->db_user_immediate_evict = TRUE;
2567 2571 return (dmu_buf_set_user(db_fake, user));
2568 2572 }
2569 2573
2570 2574 void *
2571 2575 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2572 2576 {
2573 2577 return (dmu_buf_replace_user(db_fake, user, NULL));
2574 2578 }
2575 2579
2576 2580 void *
2577 2581 dmu_buf_get_user(dmu_buf_t *db_fake)
2578 2582 {
2579 2583 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2580 2584
2581 2585 dbuf_verify_user(db, DBVU_NOT_EVICTING);
2582 2586 return (db->db_user);
2583 2587 }
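
/*
 * Sketch (not part of dbuf.c) of the usual attach pattern for the user
 * API above, in the style of zap.c or sa.c: embed a dmu_buf_user_t in
 * the client state, register the eviction callback once, and defer to a
 * concurrent winner.  The my_ and example_ prefixed names are
 * hypothetical, and no prep callback is registered here.
 */
#if 0	/* illustrative example only */
typedef struct my_state {
	dmu_buf_user_t ms_dbu;
	dmu_buf_t *ms_db;
} my_state_t;

static my_state_t *
example_attach(dmu_buf_t *db, dmu_buf_evict_func_t *evict_cb)
{
	my_state_t *ms = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);
	my_state_t *winner;

	ms->ms_db = db;
	dmu_buf_init_user(&ms->ms_dbu, NULL, evict_cb, &ms->ms_db);
	winner = dmu_buf_set_user(db, &ms->ms_dbu);
	if (winner != NULL) {
		/* Another thread attached first; use its state. */
		kmem_free(ms, sizeof (my_state_t));
		return (winner);
	}
	return (ms);
}
#endif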
2584 2588
2585 2589 void
2586 2590 dmu_buf_user_evict_wait()
2587 2591 {
2588 2592 taskq_wait(dbu_evict_taskq);
2589 2593 }
2590 2594
2591 2595 boolean_t
2592 2596 dmu_buf_freeable(dmu_buf_t *dbuf)
2593 2597 {
2594 2598 boolean_t res = B_FALSE;
2595 2599 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2596 2600
2597 2601 if (db->db_blkptr)
2598 2602 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2599 2603 db->db_blkptr, db->db_blkptr->blk_birth);
2600 2604
2601 2605 return (res);
2602 2606 }
2603 2607
2604 2608 blkptr_t *
2605 2609 dmu_buf_get_blkptr(dmu_buf_t *db)
2606 2610 {
2607 2611 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2608 2612 return (dbi->db_blkptr);
2609 2613 }
2610 2614
2611 2615 static void
2612 2616 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2613 2617 {
2614 2618 	/* ASSERT(dmu_tx_is_syncing(tx)) */
2615 2619 ASSERT(MUTEX_HELD(&db->db_mtx));
2616 2620
2617 2621 if (db->db_blkptr != NULL)
2618 2622 return;
2619 2623
2620 2624 if (db->db_blkid == DMU_SPILL_BLKID) {
2621 2625 db->db_blkptr = &dn->dn_phys->dn_spill;
2622 2626 BP_ZERO(db->db_blkptr);
2623 2627 return;
2624 2628 }
2625 2629 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2626 2630 /*
2627 2631 		 * This buffer was allocated at a time when there were
2628 2632 		 * no available blkptrs from the dnode, or it was
2629 2633 		 * inappropriate to hook it in (i.e., nlevels mismatch).
2630 2634 */
2631 2635 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2632 2636 ASSERT(db->db_parent == NULL);
2633 2637 db->db_parent = dn->dn_dbuf;
2634 2638 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2635 2639 DBUF_VERIFY(db);
2636 2640 } else {
2637 2641 dmu_buf_impl_t *parent = db->db_parent;
2638 2642 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2639 2643
2640 2644 ASSERT(dn->dn_phys->dn_nlevels > 1);
2641 2645 if (parent == NULL) {
2642 2646 mutex_exit(&db->db_mtx);
2643 2647 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2644 2648 parent = dbuf_hold_level(dn, db->db_level + 1,
2645 2649 db->db_blkid >> epbs, db);
2646 2650 rw_exit(&dn->dn_struct_rwlock);
2647 2651 mutex_enter(&db->db_mtx);
2648 2652 db->db_parent = parent;
2649 2653 }
2650 2654 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2651 2655 (db->db_blkid & ((1ULL << epbs) - 1));
2652 2656 DBUF_VERIFY(db);
2653 2657 }
2654 2658 }
2655 2659
2656 2660 static void
2657 2661 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2658 2662 {
2659 2663 dmu_buf_impl_t *db = dr->dr_dbuf;
2660 2664 dnode_t *dn;
2661 2665 zio_t *zio;
2662 2666
2663 2667 ASSERT(dmu_tx_is_syncing(tx));
2664 2668
2665 2669 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2666 2670
2667 2671 mutex_enter(&db->db_mtx);
2668 2672
2669 2673 ASSERT(db->db_level > 0);
2670 2674 DBUF_VERIFY(db);
2671 2675
2672 2676 /* Read the block if it hasn't been read yet. */
2673 2677 if (db->db_buf == NULL) {
2674 2678 mutex_exit(&db->db_mtx);
2675 2679 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2676 2680 mutex_enter(&db->db_mtx);
2677 2681 }
2678 2682 ASSERT3U(db->db_state, ==, DB_CACHED);
2679 2683 ASSERT(db->db_buf != NULL);
2680 2684
2681 2685 DB_DNODE_ENTER(db);
2682 2686 dn = DB_DNODE(db);
2683 2687 /* Indirect block size must match what the dnode thinks it is. */
2684 2688 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2685 2689 dbuf_check_blkptr(dn, db);
2686 2690 DB_DNODE_EXIT(db);
2687 2691
2688 2692 /* Provide the pending dirty record to child dbufs */
2689 2693 db->db_data_pending = dr;
2690 2694
2691 2695 mutex_exit(&db->db_mtx);
2692 2696 dbuf_write(dr, db->db_buf, tx);
2693 2697
2694 2698 zio = dr->dr_zio;
2695 2699 mutex_enter(&dr->dt.di.dr_mtx);
2696 2700 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
2697 2701 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2698 2702 mutex_exit(&dr->dt.di.dr_mtx);
2699 2703 zio_nowait(zio);
2700 2704 }
2701 2705
2702 2706 static void
2703 2707 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2704 2708 {
2705 2709 arc_buf_t **datap = &dr->dt.dl.dr_data;
2706 2710 dmu_buf_impl_t *db = dr->dr_dbuf;
2707 2711 dnode_t *dn;
2708 2712 objset_t *os;
2709 2713 uint64_t txg = tx->tx_txg;
2710 2714
2711 2715 ASSERT(dmu_tx_is_syncing(tx));
2712 2716
2713 2717 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2714 2718
2715 2719 mutex_enter(&db->db_mtx);
2716 2720 /*
2717 2721 * To be synced, we must be dirtied. But we
2718 2722 * might have been freed after the dirty.
2719 2723 */
2720 2724 if (db->db_state == DB_UNCACHED) {
2721 2725 /* This buffer has been freed since it was dirtied */
2722 2726 ASSERT(db->db.db_data == NULL);
2723 2727 } else if (db->db_state == DB_FILL) {
2724 2728 /* This buffer was freed and is now being re-filled */
2725 2729 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2726 2730 } else {
2727 2731 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2728 2732 }
2729 2733 DBUF_VERIFY(db);
2730 2734
2731 2735 DB_DNODE_ENTER(db);
2732 2736 dn = DB_DNODE(db);
2733 2737
2734 2738 if (db->db_blkid == DMU_SPILL_BLKID) {
2735 2739 mutex_enter(&dn->dn_mtx);
2736 2740 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2737 2741 mutex_exit(&dn->dn_mtx);
2738 2742 }
2739 2743
2740 2744 /*
2741 2745 * If this is a bonus buffer, simply copy the bonus data into the
2742 2746 * dnode. It will be written out when the dnode is synced (and it
2743 2747 * will be synced, since it must have been dirty for dbuf_sync to
2744 2748 * be called).
2745 2749 */
2746 2750 if (db->db_blkid == DMU_BONUS_BLKID) {
2747 2751 dbuf_dirty_record_t **drp;
2748 2752
2749 2753 ASSERT(*datap != NULL);
2750 2754 ASSERT0(db->db_level);
2751 2755 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2752 2756 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2753 2757 DB_DNODE_EXIT(db);
2754 2758
2755 2759 if (*datap != db->db.db_data) {
2756 2760 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2757 2761 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2758 2762 }
2759 2763 db->db_data_pending = NULL;
2760 2764 drp = &db->db_last_dirty;
2761 2765 while (*drp != dr)
2762 2766 drp = &(*drp)->dr_next;
2763 2767 ASSERT(dr->dr_next == NULL);
2764 2768 ASSERT(dr->dr_dbuf == db);
2765 2769 *drp = dr->dr_next;
2766 2770 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2767 2771 ASSERT(db->db_dirtycnt > 0);
2768 2772 db->db_dirtycnt -= 1;
2769 2773 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2770 2774 return;
2771 2775 }
2772 2776
2773 2777 os = dn->dn_objset;
2774 2778
2775 2779 /*
2776 2780 * This function may have dropped the db_mtx lock allowing a dmu_sync
2777 2781 * operation to sneak in. As a result, we need to ensure that we
2778 2782 * don't check the dr_override_state until we have returned from
2779 2783 * dbuf_check_blkptr.
2780 2784 */
2781 2785 dbuf_check_blkptr(dn, db);
2782 2786
2783 2787 /*
2784 2788 * If this buffer is in the middle of an immediate write,
2785 2789 * wait for the synchronous IO to complete.
2786 2790 */
2787 2791 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2788 2792 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2789 2793 cv_wait(&db->db_changed, &db->db_mtx);
2790 2794 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2791 2795 }
2792 2796
2793 2797 if (db->db_state != DB_NOFILL &&
2794 2798 dn->dn_object != DMU_META_DNODE_OBJECT &&
2795 2799 refcount_count(&db->db_holds) > 1 &&
2796 2800 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2797 2801 *datap == db->db_buf) {
2798 2802 /*
2799 2803 * If this buffer is currently "in use" (i.e., there
2800 2804 * are active holds and db_data still references it),
2801 2805 * then make a copy before we start the write so that
2802 2806 * any modifications from the open txg will not leak
2803 2807 * into this write.
2804 2808 *
2805 2809 * NOTE: this copy does not need to be made for
2806 2810 * objects only modified in the syncing context (e.g.
2807 2811 		 * DMU_OT_DNODE blocks).
2808 2812 */
2809 2813 int blksz = arc_buf_size(*datap);
2810 2814 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2811 2815 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2812 2816 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2813 2817 }
2814 2818 db->db_data_pending = dr;
2815 2819
2816 2820 mutex_exit(&db->db_mtx);
2817 2821
2818 2822 dbuf_write(dr, *datap, tx);
2819 2823
2820 2824 ASSERT(!list_link_active(&dr->dr_dirty_node));
2821 2825 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2822 2826 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2823 2827 DB_DNODE_EXIT(db);
2824 2828 } else {
2825 2829 /*
2826 2830 * Although zio_nowait() does not "wait for an IO", it does
2827 2831 * initiate the IO. If this is an empty write it seems plausible
2828 2832 * that the IO could actually be completed before the nowait
2829 2833 * returns. We need to DB_DNODE_EXIT() first in case
2830 2834 * zio_nowait() invalidates the dbuf.
2831 2835 */
2832 2836 DB_DNODE_EXIT(db);
2833 2837 zio_nowait(dr->dr_zio);
2834 2838 }
2835 2839 }
2836 2840
2837 2841 void
2838 2842 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
2839 2843 {
2840 2844 dbuf_dirty_record_t *dr;
2841 2845
2842 2846 	while ((dr = list_head(list)) != NULL) {
2843 2847 if (dr->dr_zio != NULL) {
2844 2848 /*
2845 2849 * If we find an already initialized zio then we
2846 2850 * are processing the meta-dnode, and we have finished.
2847 2851 * The dbufs for all dnodes are put back on the list
2848 2852 * during processing, so that we can zio_wait()
2849 2853 * these IOs after initiating all child IOs.
2850 2854 */
2851 2855 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2852 2856 DMU_META_DNODE_OBJECT);
2853 2857 break;
2854 2858 }
2855 2859 if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
2856 2860 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
2857 2861 VERIFY3U(dr->dr_dbuf->db_level, ==, level);
2858 2862 }
2859 2863 list_remove(list, dr);
2860 2864 if (dr->dr_dbuf->db_level > 0)
2861 2865 dbuf_sync_indirect(dr, tx);
2862 2866 else
2863 2867 dbuf_sync_leaf(dr, tx);
2864 2868 }
2865 2869 }
2866 2870
2867 2871 /* ARGSUSED */
2868 2872 static void
2869 2873 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2870 2874 {
2871 2875 dmu_buf_impl_t *db = vdb;
2872 2876 dnode_t *dn;
2873 2877 blkptr_t *bp = zio->io_bp;
2874 2878 blkptr_t *bp_orig = &zio->io_bp_orig;
2875 2879 spa_t *spa = zio->io_spa;
2876 2880 int64_t delta;
2877 2881 uint64_t fill = 0;
2878 2882 int i;
2879 2883
2880 2884 ASSERT3P(db->db_blkptr, ==, bp);
2881 2885
2882 2886 DB_DNODE_ENTER(db);
2883 2887 dn = DB_DNODE(db);
2884 2888 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2885 2889 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2886 2890 zio->io_prev_space_delta = delta;
2887 2891
2888 2892 if (bp->blk_birth != 0) {
2889 2893 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2890 2894 BP_GET_TYPE(bp) == dn->dn_type) ||
2891 2895 (db->db_blkid == DMU_SPILL_BLKID &&
2892 2896 BP_GET_TYPE(bp) == dn->dn_bonustype) ||
2893 2897 BP_IS_EMBEDDED(bp));
2894 2898 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2895 2899 }
2896 2900
2897 2901 mutex_enter(&db->db_mtx);
2898 2902
2899 2903 #ifdef ZFS_DEBUG
2900 2904 if (db->db_blkid == DMU_SPILL_BLKID) {
2901 2905 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2902 2906 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2903 2907 db->db_blkptr == &dn->dn_phys->dn_spill);
2904 2908 }
2905 2909 #endif
2906 2910
2907 2911 if (db->db_level == 0) {
2908 2912 mutex_enter(&dn->dn_mtx);
2909 2913 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2910 2914 db->db_blkid != DMU_SPILL_BLKID)
2911 2915 dn->dn_phys->dn_maxblkid = db->db_blkid;
2912 2916 mutex_exit(&dn->dn_mtx);
2913 2917
2914 2918 if (dn->dn_type == DMU_OT_DNODE) {
2915 2919 dnode_phys_t *dnp = db->db.db_data;
2916 2920 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2917 2921 i--, dnp++) {
2918 2922 if (dnp->dn_type != DMU_OT_NONE)
2919 2923 fill++;
2920 2924 }
2921 2925 } else {
2922 2926 if (BP_IS_HOLE(bp)) {
2923 2927 fill = 0;
2924 2928 } else {
2925 2929 fill = 1;
2926 2930 }
2927 2931 }
2928 2932 } else {
2929 2933 blkptr_t *ibp = db->db.db_data;
2930 2934 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2931 2935 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2932 2936 if (BP_IS_HOLE(ibp))
2933 2937 continue;
2934 2938 fill += BP_GET_FILL(ibp);
2935 2939 }
2936 2940 }
2937 2941 DB_DNODE_EXIT(db);
2938 2942
2939 2943 if (!BP_IS_EMBEDDED(bp))
2940 2944 bp->blk_fill = fill;
2941 2945
2942 2946 mutex_exit(&db->db_mtx);
2943 2947 }
2944 2948
2945 2949 /*
2946 2950 * The SPA will call this callback several times for each zio - once
2947 2951 * for every physical child i/o (zio->io_phys_children times). This
2948 2952 * allows the DMU to monitor the progress of each logical i/o. For example,
2949 2953 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2950 2954 * block. There may be a long delay before all copies/fragments are completed,
2951 2955 * so this callback allows us to retire dirty space gradually, as the physical
2952 2956 * i/os complete.
2953 2957 */
2954 2958 /* ARGSUSED */
2955 2959 static void
2956 2960 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2957 2961 {
2958 2962 dmu_buf_impl_t *db = arg;
2959 2963 objset_t *os = db->db_objset;
2960 2964 dsl_pool_t *dp = dmu_objset_pool(os);
2961 2965 dbuf_dirty_record_t *dr;
2962 2966 int delta = 0;
2963 2967
2964 2968 dr = db->db_data_pending;
2965 2969 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2966 2970
2967 2971 /*
2968 2972 * The callback will be called io_phys_children times. Retire one
2969 2973 * portion of our dirty space each time we are called. Any rounding
2970 2974 * error will be cleaned up by dsl_pool_sync()'s call to
2971 2975 * dsl_pool_undirty_space().
2972 2976 */
2973 2977 delta = dr->dr_accounted / zio->io_phys_children;
2974 2978 dsl_pool_undirty_space(dp, delta, zio->io_txg);
2975 2979 }
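
/*
 * Worked example (values assumed): for a 128K logical write with
 * dr_accounted == 131072 and io_phys_children == 3 (e.g. a copies=3
 * block), each of the three callbacks retires 131072 / 3 == 43690
 * bytes; the 2-byte remainder is the rounding error that
 * dsl_pool_sync()'s call to dsl_pool_undirty_space() cleans up.
 */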
2976 2980
2977 2981 /* ARGSUSED */
2978 2982 static void
2979 2983 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2980 2984 {
2981 2985 dmu_buf_impl_t *db = vdb;
2982 2986 blkptr_t *bp_orig = &zio->io_bp_orig;
2983 2987 blkptr_t *bp = db->db_blkptr;
2984 2988 objset_t *os = db->db_objset;
2985 2989 dmu_tx_t *tx = os->os_synctx;
2986 2990 dbuf_dirty_record_t **drp, *dr;
2987 2991
2988 2992 ASSERT0(zio->io_error);
2989 2993 ASSERT(db->db_blkptr == bp);
2990 2994
2991 2995 /*
2992 2996 * For nopwrites and rewrites we ensure that the bp matches our
2993 2997 * original and bypass all the accounting.
2994 2998 */
2995 2999 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2996 3000 ASSERT(BP_EQUAL(bp, bp_orig));
2997 3001 } else {
2998 3002 dsl_dataset_t *ds = os->os_dsl_dataset;
2999 3003 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
3000 3004 dsl_dataset_block_born(ds, bp, tx);
3001 3005 }
3002 3006
3003 3007 mutex_enter(&db->db_mtx);
3004 3008
3005 3009 DBUF_VERIFY(db);
3006 3010
3007 3011 drp = &db->db_last_dirty;
3008 3012 while ((dr = *drp) != db->db_data_pending)
3009 3013 drp = &dr->dr_next;
3010 3014 ASSERT(!list_link_active(&dr->dr_dirty_node));
3011 3015 ASSERT(dr->dr_dbuf == db);
3012 3016 ASSERT(dr->dr_next == NULL);
3013 3017 *drp = dr->dr_next;
3014 3018
3015 3019 #ifdef ZFS_DEBUG
3016 3020 if (db->db_blkid == DMU_SPILL_BLKID) {
3017 3021 dnode_t *dn;
3018 3022
3019 3023 DB_DNODE_ENTER(db);
3020 3024 dn = DB_DNODE(db);
3021 3025 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
3022 3026 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
3023 3027 db->db_blkptr == &dn->dn_phys->dn_spill);
3024 3028 DB_DNODE_EXIT(db);
3025 3029 }
3026 3030 #endif
3027 3031
3028 3032 if (db->db_level == 0) {
3029 3033 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
3030 3034 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
3031 3035 if (db->db_state != DB_NOFILL) {
3032 3036 if (dr->dt.dl.dr_data != db->db_buf)
3033 3037 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
3034 3038 db));
3035 3039 else if (!arc_released(db->db_buf))
3036 3040 arc_set_callback(db->db_buf, dbuf_do_evict, db);
3037 3041 }
3038 3042 } else {
3039 3043 dnode_t *dn;
3040 3044
3041 3045 DB_DNODE_ENTER(db);
3042 3046 dn = DB_DNODE(db);
3043 3047 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3044 3048 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
3045 3049 if (!BP_IS_HOLE(db->db_blkptr)) {
3046 3050 int epbs =
3047 3051 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3048 3052 ASSERT3U(db->db_blkid, <=,
3049 3053 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
3050 3054 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
3051 3055 db->db.db_size);
3052 3056 if (!arc_released(db->db_buf))
3053 3057 arc_set_callback(db->db_buf, dbuf_do_evict, db);
3054 3058 }
3055 3059 DB_DNODE_EXIT(db);
3056 3060 mutex_destroy(&dr->dt.di.dr_mtx);
3057 3061 list_destroy(&dr->dt.di.dr_children);
3058 3062 }
3059 3063 kmem_free(dr, sizeof (dbuf_dirty_record_t));
3060 3064
3061 3065 cv_broadcast(&db->db_changed);
3062 3066 ASSERT(db->db_dirtycnt > 0);
3063 3067 db->db_dirtycnt -= 1;
3064 3068 db->db_data_pending = NULL;
3065 3069 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
3066 3070 }
3067 3071
3068 3072 static void
3069 3073 dbuf_write_nofill_ready(zio_t *zio)
3070 3074 {
3071 3075 dbuf_write_ready(zio, NULL, zio->io_private);
3072 3076 }
3073 3077
3074 3078 static void
3075 3079 dbuf_write_nofill_done(zio_t *zio)
3076 3080 {
3077 3081 dbuf_write_done(zio, NULL, zio->io_private);
3078 3082 }
3079 3083
3080 3084 static void
3081 3085 dbuf_write_override_ready(zio_t *zio)
3082 3086 {
3083 3087 dbuf_dirty_record_t *dr = zio->io_private;
3084 3088 dmu_buf_impl_t *db = dr->dr_dbuf;
3085 3089
3086 3090 dbuf_write_ready(zio, NULL, db);
3087 3091 }
3088 3092
3089 3093 static void
3090 3094 dbuf_write_override_done(zio_t *zio)
3091 3095 {
3092 3096 dbuf_dirty_record_t *dr = zio->io_private;
3093 3097 dmu_buf_impl_t *db = dr->dr_dbuf;
3094 3098 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
3095 3099
3096 3100 mutex_enter(&db->db_mtx);
3097 3101 if (!BP_EQUAL(zio->io_bp, obp)) {
3098 3102 if (!BP_IS_HOLE(obp))
3099 3103 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
3100 3104 arc_release(dr->dt.dl.dr_data, db);
3101 3105 }
3102 3106 mutex_exit(&db->db_mtx);
3103 3107
3104 3108 dbuf_write_done(zio, NULL, db);
3105 3109 }
3106 3110
3107 3111 /* Issue I/O to commit a dirty buffer to disk. */
3108 3112 static void
3109 3113 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
3110 3114 {
3111 3115 dmu_buf_impl_t *db = dr->dr_dbuf;
3112 3116 dnode_t *dn;
3113 3117 objset_t *os;
3114 3118 dmu_buf_impl_t *parent = db->db_parent;
3115 3119 uint64_t txg = tx->tx_txg;
3116 3120 zbookmark_phys_t zb;
3117 3121 zio_prop_t zp;
3118 3122 zio_t *zio;
3119 3123 int wp_flag = 0;
3120 3124
3121 3125 DB_DNODE_ENTER(db);
3122 3126 dn = DB_DNODE(db);
3123 3127 os = dn->dn_objset;
3124 3128
3125 3129 if (db->db_state != DB_NOFILL) {
3126 3130 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
3127 3131 /*
3128 3132 * Private object buffers are released here rather
3129 3133 * than in dbuf_dirty() since they are only modified
3130 3134 * in the syncing context and we don't want the
3131 3135 * overhead of making multiple copies of the data.
3132 3136 */
3133 3137 if (BP_IS_HOLE(db->db_blkptr)) {
3134 3138 arc_buf_thaw(data);
3135 3139 } else {
3136 3140 dbuf_release_bp(db);
3137 3141 }
3138 3142 }
3139 3143 }
3140 3144
3141 3145 if (parent != dn->dn_dbuf) {
3142 3146 /* Our parent is an indirect block. */
3143 3147 /* We have a dirty parent that has been scheduled for write. */
3144 3148 ASSERT(parent && parent->db_data_pending);
3145 3149 /* Our parent's buffer is one level closer to the dnode. */
3146 3150 ASSERT(db->db_level == parent->db_level-1);
3147 3151 /*
3148 3152 * We're about to modify our parent's db_data by modifying
3149 3153 * our block pointer, so the parent must be released.
3150 3154 */
3151 3155 ASSERT(arc_released(parent->db_buf));
3152 3156 zio = parent->db_data_pending->dr_zio;
3153 3157 } else {
3154 3158 /* Our parent is the dnode itself. */
3155 3159 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
3156 3160 db->db_blkid != DMU_SPILL_BLKID) ||
3157 3161 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
3158 3162 if (db->db_blkid != DMU_SPILL_BLKID)
3159 3163 ASSERT3P(db->db_blkptr, ==,
3160 3164 &dn->dn_phys->dn_blkptr[db->db_blkid]);
3161 3165 zio = dn->dn_zio;
3162 3166 }
3163 3167
3164 3168 ASSERT(db->db_level == 0 || data == db->db_buf);
3165 3169 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
3166 3170 ASSERT(zio);
3167 3171
3168 3172 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
3169 3173 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
3170 3174 db->db.db_object, db->db_level, db->db_blkid);
3171 3175
3172 3176 if (db->db_blkid == DMU_SPILL_BLKID)
3173 3177 wp_flag = WP_SPILL;
3174 3178 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
3175 3179
3176 3180 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
3177 3181 DB_DNODE_EXIT(db);
3178 3182
3179 3183 if (db->db_level == 0 &&
3180 3184 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
3181 3185 /*
3182 3186 * The BP for this block has been provided by open context
3183 3187 * (by dmu_sync() or dmu_buf_write_embedded()).
3184 3188 */
3185 3189 void *contents = (data != NULL) ? data->b_data : NULL;
3186 3190
3187 3191 dr->dr_zio = zio_write(zio, os->os_spa, txg,
3188 3192 db->db_blkptr, contents, db->db.db_size, &zp,
3189 3193 dbuf_write_override_ready, NULL, dbuf_write_override_done,
3190 3194 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3191 3195 mutex_enter(&db->db_mtx);
3192 3196 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
3193 3197 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
3194 3198 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
3195 3199 mutex_exit(&db->db_mtx);
3196 3200 } else if (db->db_state == DB_NOFILL) {
3197 3201 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
3198 3202 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
3199 3203 dr->dr_zio = zio_write(zio, os->os_spa, txg,
3200 3204 db->db_blkptr, NULL, db->db.db_size, &zp,
3201 3205 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
3202 3206 ZIO_PRIORITY_ASYNC_WRITE,
3203 3207 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
3204 3208 } else {
3205 3209 ASSERT(arc_released(data));
3206 3210 dr->dr_zio = arc_write(zio, os->os_spa, txg,
3207 3211 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
3208 3212 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
3209 3213 dbuf_write_physdone, dbuf_write_done, db,
3210 3214 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3211 3215 }
3212 3216 }
↓ open down ↓ |
2902 lines elided |
↑ open up ↑ |