1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 
  27 #include <sys/debug.h>
  28 #include <sys/types.h>
  29 #include <sys/file.h>
  30 #include <sys/errno.h>
  31 #include <sys/uio.h>
  32 #include <sys/open.h>
  33 #include <sys/cred.h>
  34 #include <sys/kmem.h>
  35 #include <sys/conf.h>
  36 #include <sys/cmn_err.h>
  37 #include <sys/modctl.h>
  38 #include <sys/disp.h>
  39 #include <sys/atomic.h>
  40 #include <sys/filio.h>
  41 #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
  42 #include <sys/kstat.h>
  43 
  44 #include <sys/ddi.h>
  45 #include <sys/devops.h>
  46 #include <sys/sunddi.h>
  47 #include <sys/esunddi.h>
  48 #include <sys/priv_names.h>
  49 
  50 #include <sys/fssnap.h>
  51 #include <sys/fssnap_if.h>
  52 
  53 /*
  54  * This module implements the file system snapshot code, which provides a
  55  * point-in-time image of a file system for the purposes of online backup.
  56  * There are essentially two parts to this project: the driver half and the
  57  * file system half.  The driver half is a pseudo device driver called
  58  * "fssnap" that represents the snapshot.  Each snapshot is assigned a
  59  * number that corresponds to the minor number of the device, and a control
  60  * device with a high minor number is used to initiate snapshot creation and
  61  * deletion.  For all practical purposes the driver half acts like a
  62  * read-only disk device whose contents are exactly the same as the master
  63  * file system at the time the snapshot was created.
  64  *
  65  * The file system half provides interfaces necessary for performing the
  66  * file system dependent operations required to create and delete snapshots
  67  * and a special driver strategy routine that must always be used by the file
  68  * system for snapshots to work correctly.
  69  *
  70  * When a snapshot is to be created, the user utility will send an ioctl to
  71  * the control device of the driver half specifying the file system to be
  72  * snapshotted, the file descriptor of a backing-store file which is used to
  73  * hold old data before it is overwritten, and other snapshot parameters.
  74  * This ioctl is passed on to the file system specified in the original
  75  * ioctl request.  The file system is expected to be able to flush
  76  * everything out to make the file system consistent and lock it to ensure
  77  * no changes occur while the snapshot is being created.  It then calls
  78  * fssnap_create() to create state for a new snapshot, from which an opaque
  79  * handle is returned with the snapshot locked.  Next, the file system must
  80  * populate the "candidate bitmap", which tells the snapshot code which
  81  * "chunks" should be considered for copy-on-write (a chunk is the unit of
  82  * granularity used for copy-on-write, which is independent of the device
  83  * and file system block sizes).  This is typically done by scanning the
  84  * file system allocation bitmaps to determine which chunks contain
  85  * allocated blocks in the file system at the time the snapshot was created.
  86  * If a chunk has no allocated blocks, it does not need to be copied before
  87  * being written to.  Once the candidate bitmap is populated with
  88  * fssnap_set_candidate(), the file system calls fssnap_create_done() to
  89  * complete the snapshot creation and unlock the snapshot.  The file system
  90  * may now be unlocked and modifications to it resumed.
  91  *
  92  * Once a snapshot is created, the file system must perform all writes
  93  * through a special strategy routine, fssnap_strategy().  This strategy
  94  * routine determines whether the chunks contained by the write must be
  95  * copied before being overwritten by consulting the candidate bitmap
  96  * described above, and the "hastrans bitmap" which tells it whether the chunk
  97  * has been copied already or not.  If the chunk is a candidate but has not
  98  * been copied, it reads the old data in and adds it to a queue.  The
 * old data can then be overwritten with the new data.  An asynchronous
 * task is dispatched for each old chunk that is read in, and that task
 * writes the old data to the backing file specified at snapshot creation
 * time.  The
 102  * backing file is a sparse file the same size as the file system that
 103  * contains the old data at the offset that data originally had in the
 104  * file system.  If the queue containing in-memory chunks gets too large,
 105  * writes to the file system may be throttled by a semaphore until the
 106  * task queues have a chance to push some of the chunks to the backing file.
 107  *
 108  * With the candidate bitmap, the hastrans bitmap, the data on the master
 109  * file system, and the old data in memory and in the backing file, the
 110  * snapshot pseudo-driver can piece together the original file system
 111  * information to satisfy read requests.  If the requested chunk is not a
 112  * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
 113  * has not been copied it reads it from the master file system.  If it is a
 114  * candidate and has been copied, it either copies the data from the
 115  * in-memory queue or it reads it in from the backing file.  The result is
 116  * a replication of the original file system that can be backed up, mounted,
 117  * or manipulated by other file system utilities that work on a read-only
 118  * device.
 119  *
 120  * This module is divided into three roughly logical sections:
 121  *
 122  *     - The snapshot driver, which is a character/block driver
 123  *       representing the snapshot itself.  These routines are
 124  *       prefixed with "snap_".
 125  *
 126  *     - The library routines that are defined in fssnap_if.h that
 127  *       are used by file systems that use this snapshot implementation.
 128  *       These functions are prefixed with "fssnap_" and are called through
 129  *       a function vector from the file system.
 130  *
 131  *     - The helper routines used by the snapshot driver and the fssnap
 132  *       library routines for managing the translation table and other
 133  *       useful functions.  These routines are all static and are
 134  *       prefixed with either "fssnap_" or "transtbl_" if they
 135  *       are specifically used for translation table activities.
 136  */
 137 
 138 static dev_info_t               *fssnap_dip = NULL;
 139 static struct snapshot_id       *snapshot = NULL;
 140 static struct snapshot_id       snap_ctl;
 141 static int                      num_snapshots = 0;
 142 static kmutex_t                 snapshot_mutex;
 143 static char                     snapname[] = SNAP_NAME;
 144 
 145 /* "tunable" parameters */
 146 static int              fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
 147 static uint_t           fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
 148 static int              fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
 149 
 150 /* static function prototypes */
 151 
 152 /* snapshot driver */
 153 static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 154 static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
 155 static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
 156 static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
 157 static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
 158 static int snap_strategy(struct buf *bp);
 159 static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
 160 static int snap_print(dev_t dev, char *str);
 161 static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
 162     cred_t *credp, int *rvalp);
 163 static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
 164     int flags, char *name, caddr_t valuep, int *lengthp);
 165 static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
 166     int offset, int len, char *buffer);
 167 
 168 
 169 /* fssnap interface implementations (see fssnap_if.h) */
 170 static void fssnap_strategy_impl(void *, struct buf *);
 171 static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
 172     struct vnode *, int, struct vnode **, char *, u_offset_t);
 173 static void fssnap_set_candidate_impl(void *, chunknumber_t);
 174 static int fssnap_is_candidate_impl(void *, u_offset_t);
 175 static int fssnap_create_done_impl(void *);
 176 static int fssnap_delete_impl(void *);
 177 
 178 /* fssnap interface support routines */
 179 static int  fssnap_translate(struct snapshot_id **, struct buf *);
 180 static void fssnap_write_taskq(void *);
 181 static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
 182     const char *);
 183 static int  fssnap_update_kstat_num(kstat_t *, int);
 184 static void fssnap_delete_kstats(struct cow_info *);
 185 
 186 /* translation table prototypes */
 187 static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
 188 static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
 189 static void transtbl_delete(cow_map_t *, cow_map_node_t *);
 190 static void transtbl_free(cow_map_t *);
 191 
 192 static kstat_t *fssnap_highwater_kstat;
 193 
 194 /* ************************************************************************ */
 195 
 196 /* Device and Module Structures */
 197 
 198 static struct cb_ops snap_cb_ops = {
 199         snap_open,
 200         snap_close,
 201         snap_strategy,
 202         snap_print,
 203         nodev,          /* no snap_dump */
 204         snap_read,
 205         nodev,          /* no snap_write */
 206         snap_ioctl,
 207         nodev,          /* no snap_devmap */
 208         nodev,          /* no snap_mmap   */
 209         nodev,          /* no snap_segmap */
 210         nochpoll,
 211         snap_prop_op,
 212         NULL,           /* streamtab */
 213         D_64BIT | D_NEW | D_MP, /* driver compatibility */
 214         CB_REV,
 215         nodev,          /* async I/O read entry point */
 216         nodev           /* async I/O write entry point */
 217 };
 218 
 219 static struct dev_ops snap_ops = {
 220         DEVO_REV,
 221         0,                      /* ref count */
 222         snap_getinfo,
 223         nulldev,                /* snap_identify obsolete */
 224         nulldev,                /* no snap_probe */
 225         snap_attach,
 226         snap_detach,
 227         nodev,                  /* no snap_reset */
 228         &snap_cb_ops,
 229         (struct bus_ops *)NULL,
 230         nulldev,                /* no snap_power() */
 231         ddi_quiesce_not_needed,         /* quiesce */
 232 };
 233 
 234 extern struct mod_ops mod_driverops;
 235 
 236 static struct modldrv md = {
 237         &mod_driverops, /* Type of module. This is a driver */
 238         "snapshot driver",      /* Name of the module */
 239         &snap_ops,
 240 };
 241 
 242 static struct modlinkage ml = {
 243         MODREV_1,
 244         &md,
 245         NULL
 246 };
 247 
 248 static void *statep;
 249 
 250 int
 251 _init(void)
 252 {
 253         int     error;
 254         kstat_t *ksp;
 255         kstat_named_t   *ksdata;
 256 
 257         error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
 258         if (error) {
 259                 cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
 260                 return (error);
 261         }
 262 
 263         error = mod_install(&ml);
 264 
 265         if (error) {
 266                 cmn_err(CE_WARN, "_init: failed to mod_install.");
 267                 ddi_soft_state_fini(&statep);
 268                 return (error);
 269         }
 270 
 271         /*
 272          * Fill in the snapshot operations vector for file systems
 273          * (defined in fssnap_if.c)
 274          */
 275 
 276         snapops.fssnap_create = fssnap_create_impl;
 277         snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
 278         snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
 279         snapops.fssnap_create_done = fssnap_create_done_impl;
 280         snapops.fssnap_delete = fssnap_delete_impl;
 281         snapops.fssnap_strategy = fssnap_strategy_impl;
 282 
 283         mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
 284 
 285         /*
 286          * Initialize the fssnap highwater kstat
 287          */
 288         ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
 289             KSTAT_TYPE_NAMED, 1, 0);
 290         if (ksp != NULL) {
 291                 ksdata = (kstat_named_t *)ksp->ks_data;
 292                 kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
 293                     KSTAT_DATA_UINT32);
 294                 ksdata->value.ui32 = 0;
 295                 kstat_install(ksp);
 296         } else {
 297                 cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
 298         }
 299         fssnap_highwater_kstat = ksp;
 300 
 301         return (0);
 302 }
 303 
 304 int
 305 _info(struct modinfo *modinfop)
 306 {
 307         return (mod_info(&ml, modinfop));
 308 }
 309 
 310 int
 311 _fini(void)
 312 {
 313         int     error;
 314 
 315         error = mod_remove(&ml);
 316         if (error)
 317                 return (error);
 318         ddi_soft_state_fini(&statep);
 319 
 320         /*
 321          * delete the fssnap highwater kstat
 322          */
 323         kstat_delete(fssnap_highwater_kstat);
 324 
 325         mutex_destroy(&snapshot_mutex);
 326 
 327         /* Clear out the file system operations vector */
 328         snapops.fssnap_create = NULL;
 329         snapops.fssnap_set_candidate = NULL;
 330         snapops.fssnap_create_done = NULL;
 331         snapops.fssnap_delete = NULL;
 332         snapops.fssnap_strategy = NULL;
 333 
 334         return (0);
 335 }
 336 
 337 /* ************************************************************************ */
 338 
 339 /*
 340  * Snapshot Driver Routines
 341  *
 342  * This section implements the snapshot character and block drivers.  The
 343  * device will appear to be a consistent read-only file system to
 344  * applications that wish to back it up or mount it.  The snapshot driver
 345  * communicates with the file system through the translation table, which
 346  * tells the snapshot driver where to find the data necessary to piece
 347  * together the frozen file system.  The data may either be on the master
 348  * device (no translation exists), in memory (a translation exists but has
 349  * not been flushed to the backing store), or in the backing store file.
 350  * The read request may require the snapshot driver to retrieve data from
 351  * several different places and piece it together to look like a single
 352  * contiguous read.
 353  *
 354  * The device minor number corresponds to the snapshot number in the list of
 355  * snapshot identifiers.  The soft state for each minor number is simply a
 356  * pointer to the snapshot id, which holds all of the snapshot state.  One
 357  * minor number is designated as the control device.  All snapshot create
 358  * and delete requests go through the control device to ensure this module
 359  * is properly loaded and attached before the file system starts calling
 360  * routines defined here.
 361  */
 362 
 363 
 364 /*
 365  * snap_getinfo() - snapshot driver getinfo(9E) routine
 366  *
 367  */
 368 /*ARGSUSED*/
 369 static int
 370 snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 371 {
 372         switch (infocmd) {
 373         case DDI_INFO_DEVT2DEVINFO:
 374                 *result = fssnap_dip;
 375                 return (DDI_SUCCESS);
 376         case DDI_INFO_DEVT2INSTANCE:
 377                 *result = 0;    /* we only have one instance */
 378                 return (DDI_SUCCESS);
 379         }
 380         return (DDI_FAILURE);
 381 }
 382 
 383 /*
 384  * snap_attach() - snapshot driver attach(9E) routine
 385  *
 386  *    sets up snapshot control device and control state.  The control state
 387  *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
 388  */
 389 static int
 390 snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 391 {
 392         int                     error;
 393 
 394         switch (cmd) {
 395         case DDI_ATTACH:
 396                 /* create the control device */
 397                 error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
 398                     SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
 399                     PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
 400                 if (error == DDI_FAILURE) {
 401                         return (DDI_FAILURE);
 402                 }
 403 
 404                 rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
 405                 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
 406                 fssnap_dip = dip;
 407                 snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
 408                 /* the control sid is not linked into the snapshot list */
 409                 snap_ctl.sid_next = NULL;
 410                 snap_ctl.sid_cowinfo = NULL;
 411                 snap_ctl.sid_flags = 0;
 412                 rw_exit(&snap_ctl.sid_rwlock);
 413                 ddi_report_dev(dip);
 414 
 415                 return (DDI_SUCCESS);
 416         case DDI_PM_RESUME:
 417                 return (DDI_SUCCESS);
 418 
 419         case DDI_RESUME:
 420                 return (DDI_SUCCESS);
 421 
 422         default:
 423                 return (DDI_FAILURE);
 424         }
 425 }
 426 
 427 /*
 428  * snap_detach() - snapshot driver detach(9E) routine
 429  *
 430  *    destroys snapshot control device and control state.  If any snapshots
 431  *    are active (ie. num_snapshots != 0), the device will refuse to detach.
 432  */
 433 static int
 434 snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 435 {
 436         struct snapshot_id *sidp, *sidnextp;
 437 
 438         switch (cmd) {
 439         case DDI_DETACH:
 440                 /* do not detach if the device is active */
 441                 mutex_enter(&snapshot_mutex);
 442                 if ((num_snapshots != 0) ||
 443                     ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
 444                         mutex_exit(&snapshot_mutex);
 445                         return (DDI_FAILURE);
 446                 }
 447 
 448                 /* free up the snapshot list */
 449                 for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
 450                         ASSERT(SID_AVAILABLE(sidp) &&
 451                             !RW_LOCK_HELD(&sidp->sid_rwlock));
 452                         sidnextp = sidp->sid_next;
 453                         rw_destroy(&sidp->sid_rwlock);
 454                         kmem_free(sidp, sizeof (struct snapshot_id));
 455                 }
 456                 snapshot = NULL;
 457 
 458                 /* delete the control device */
 459                 ddi_remove_minor_node(dip, SNAP_CTL_NODE);
 460                 fssnap_dip = NULL;
 461 
 462                 ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
 463                 rw_destroy(&snap_ctl.sid_rwlock);
 464                 mutex_exit(&snapshot_mutex);
 465 
 466                 return (DDI_SUCCESS);
 467 
 468         default:
 469                 return (DDI_FAILURE);
 470         }
 471 }
 472 
 473 /*
 474  * snap_open() - snapshot driver open(9E) routine
 475  *
 476  *     marks the snapshot id as busy so it will not be recycled when deleted
 477  *     until the snapshot is closed.
 478  */
 479 /* ARGSUSED */
 480 static int
 481 snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
 482 {
 483         minor_t minor;
 484         struct snapshot_id **sidpp, *sidp;
 485 
 486         /* snapshots are read-only */
 487         if (flag & FWRITE)
 488                 return (EROFS);
 489 
 490         minor = getminor(*devp);
 491 
 492         if (minor == SNAP_CTL_MINOR) {
 493                 /* control device must be opened exclusively */
 494                 if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
 495                         return (EINVAL);
 496 
 497                 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
 498                 if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
 499                         rw_exit(&snap_ctl.sid_rwlock);
 500                         return (EBUSY);
 501                 }
 502 
 503                 snap_ctl.sid_flags |= SID_CHAR_BUSY;
 504                 rw_exit(&snap_ctl.sid_rwlock);
 505 
 506                 return (0);
 507         }
 508 
 509         sidpp = ddi_get_soft_state(statep, minor);
 510         if (sidpp == NULL || *sidpp == NULL)
 511                 return (ENXIO);
 512         sidp = *sidpp;
 513         rw_enter(&sidp->sid_rwlock, RW_WRITER);
 514 
 515         if ((flag & FEXCL) && SID_BUSY(sidp)) {
 516                 rw_exit(&sidp->sid_rwlock);
 517                 return (EAGAIN);
 518         }
 519 
 520         ASSERT(sidpp != NULL && sidp != NULL);
 521         /* check to see if this snapshot has been killed on us */
 522         if (SID_INACTIVE(sidp)) {
 523                 cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
 524                     minor);
 525                 rw_exit(&sidp->sid_rwlock);
 526                 return (ENXIO);
 527         }
 528 
 529         switch (otyp) {
 530         case OTYP_CHR:
 531                 sidp->sid_flags |= SID_CHAR_BUSY;
 532                 break;
 533         case OTYP_BLK:
 534                 sidp->sid_flags |= SID_BLOCK_BUSY;
 535                 break;
 536         default:
 537                 rw_exit(&sidp->sid_rwlock);
 538                 return (EINVAL);
 539         }
 540 
 541         rw_exit(&sidp->sid_rwlock);
 542 
 543         /*
 544          * at this point if a valid snapshot was found then it has
 545          * been marked busy and we can use it.
 546          */
 547         return (0);
 548 }
 549 
 550 /*
 551  * snap_close() - snapshot driver close(9E) routine
 552  *
 553  *    unsets the busy bits in the snapshot id.  If the snapshot has been
 554  *    deleted while the snapshot device was open, the close call will clean
 555  *    up the remaining state information.
 556  */
 557 /* ARGSUSED */
 558 static int
 559 snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
 560 {
 561         struct snapshot_id      **sidpp, *sidp;
 562         minor_t                 minor;
 563         char                    name[20];
 564 
 565         minor = getminor(dev);
 566 
 567         /* if this is the control device, close it and return */
 568         if (minor == SNAP_CTL_MINOR) {
 569                 rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
 570                 snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
 571                 rw_exit(&snap_ctl.sid_rwlock);
 572                 return (0);
 573         }
 574 
 575         sidpp = ddi_get_soft_state(statep, minor);
 576         if (sidpp == NULL || *sidpp == NULL) {
 577                 cmn_err(CE_WARN, "snap_close: could not find state for "
 578                     "snapshot %d.", minor);
 579                 return (ENXIO);
 580         }
 581         sidp = *sidpp;
 582         mutex_enter(&snapshot_mutex);
 583         rw_enter(&sidp->sid_rwlock, RW_WRITER);
 584 
 585         /* Mark the snapshot as not being busy anymore */
 586         switch (otyp) {
 587         case OTYP_CHR:
 588                 sidp->sid_flags &= ~(SID_CHAR_BUSY);
 589                 break;
 590         case OTYP_BLK:
 591                 sidp->sid_flags &= ~(SID_BLOCK_BUSY);
 592                 break;
 593         default:
 594                 mutex_exit(&snapshot_mutex);
 595                 rw_exit(&sidp->sid_rwlock);
 596                 return (EINVAL);
 597         }
 598 
 599         if (SID_AVAILABLE(sidp)) {
 600                 /*
 601                  * if this is the last close on a snapshot that has been
 602                  * deleted, then free up the soft state.  The snapdelete
 603                  * ioctl does not free this when the device is in use so
 604                  * we do it here after the last reference goes away.
 605                  */
 606 
 607                 /* remove the device nodes */
 608                 ASSERT(fssnap_dip != NULL);
 609                 (void) snprintf(name, sizeof (name), "%d",
 610                     sidp->sid_snapnumber);
 611                 ddi_remove_minor_node(fssnap_dip, name);
 612                 (void) snprintf(name, sizeof (name), "%d,raw",
 613                     sidp->sid_snapnumber);
 614                 ddi_remove_minor_node(fssnap_dip, name);
 615 
 616                 /* delete the state structure */
 617                 ddi_soft_state_free(statep, sidp->sid_snapnumber);
 618                 num_snapshots--;
 619         }
 620 
 621         mutex_exit(&snapshot_mutex);
 622         rw_exit(&sidp->sid_rwlock);
 623 
 624         return (0);
 625 }
 626 
 627 /*
 628  * snap_read() - snapshot driver read(9E) routine
 629  *
 630  *    reads data from the snapshot by calling snap_strategy() through physio()
 631  */
 632 /* ARGSUSED */
 633 static int
 634 snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
 635 {
 636         minor_t         minor;
 637         struct snapshot_id **sidpp;
 638 
 639         minor = getminor(dev);
 640         sidpp = ddi_get_soft_state(statep, minor);
 641         if (sidpp == NULL || *sidpp == NULL) {
 642                 cmn_err(CE_WARN,
 643                     "snap_read: could not find state for snapshot %d.", minor);
 644                 return (ENXIO);
 645         }
 646         return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
 647 }
 648 
 649 /*
 650  * snap_strategy() - snapshot driver strategy(9E) routine
 651  *
 652  *    cycles through each chunk in the requested buffer and calls
 653  *    snap_getchunk() on each chunk to retrieve it from the appropriate
 654  *    place.  Once all of the parts are put together the requested buffer
 655  *    is returned.  The snapshot driver is read-only, so a write is invalid.
 656  */
 657 static int
 658 snap_strategy(struct buf *bp)
 659 {
 660         struct snapshot_id **sidpp, *sidp;
 661         minor_t         minor;
 662         chunknumber_t   chunk;
 663         int             off, len;
 664         u_longlong_t    reqptr;
 665         int             error = 0;
 666         size_t          chunksz;
 667         caddr_t         buf;
 668 
 669         /* snapshot device is read-only */
 670         if (bp->b_flags & B_WRITE) {
 671                 bioerror(bp, EROFS);
 672                 bp->b_resid = bp->b_bcount;
 673                 biodone(bp);
 674                 return (0);
 675         }
 676 
 677         minor = getminor(bp->b_edev);
 678         sidpp = ddi_get_soft_state(statep, minor);
 679         if (sidpp == NULL || *sidpp == NULL) {
 680                 cmn_err(CE_WARN,
 681                     "snap_strategy: could not find state for snapshot %d.",
 682                     minor);
 683                 bioerror(bp, ENXIO);
 684                 bp->b_resid = bp->b_bcount;
 685                 biodone(bp);
 686                 return (0);
 687         }
 688         sidp = *sidpp;
 689         ASSERT(sidp);
 690         rw_enter(&sidp->sid_rwlock, RW_READER);
 691 
 692         if (SID_INACTIVE(sidp)) {
 693                 bioerror(bp, ENXIO);
 694                 bp->b_resid = bp->b_bcount;
 695                 biodone(bp);
 696                 rw_exit(&sidp->sid_rwlock);
 697                 return (0);
 698         }
 699 
 700         if (bp->b_flags & (B_PAGEIO|B_PHYS))
 701                 bp_mapin(bp);
 702 
 703         bp->b_resid = bp->b_bcount;
 704         ASSERT(bp->b_un.b_addr);
 705         buf = bp->b_un.b_addr;
 706 
 707         chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
 708 
 709         /* reqptr is the current DEV_BSIZE offset into the device */
 710         /* chunk is the chunk containing reqptr */
 711         /* len is the length of the request (in the current chunk) in bytes */
 712         /* off is the byte offset into the current chunk */
 713         reqptr = bp->b_lblkno;
 714         while (bp->b_resid > 0) {
 715                 chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
 716                 off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
 717                 len = min(chunksz - off, bp->b_resid);
 718                 ASSERT((off + len) <= chunksz);
 719 
 720                 if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
 721                         /*
 722                          * EINVAL means the user tried to go out of range.
 723                          * Anything else means it's likely that we're
 724                          * confused.
 725                          */
 726                         if (error != EINVAL) {
 727                                 cmn_err(CE_WARN, "snap_strategy: error "
 728                                     "calling snap_getchunk, chunk = %llu, "
 729                                     "offset = %d, len = %d, resid = %lu, "
 730                                     "error = %d.",
 731                                     chunk, off, len, bp->b_resid, error);
 732                         }
 733                         bioerror(bp, error);
 734                         biodone(bp);
 735                         rw_exit(&sidp->sid_rwlock);
 736                         return (0);
 737                 }
 738                 bp->b_resid -= len;
 739                 reqptr += (len >> DEV_BSHIFT);
 740                 buf += len;
 741         }
 742 
 743         ASSERT(bp->b_resid == 0);
 744         biodone(bp);
 745 
 746         rw_exit(&sidp->sid_rwlock);
 747         return (0);
 748 }
 749 
 750 /*
 751  * snap_getchunk() - helper function for snap_strategy()
 752  *
 753  *    gets the requested data from the appropriate place and fills in the
 754  *    buffer.  chunk is the chunk number of the request, offset is the
 755  *    offset into that chunk and must be less than the chunk size.  len is
 756  *    the length of the request starting at offset, and must not exceed a
 757  *    chunk boundary.  buffer is the address to copy the data to.  len
 758  *    bytes are copied into the buffer starting at the location specified.
 759  *
 760  *    A chunk is located according to the following algorithm:
 *        - If the chunk is not a candidate for translation, a zeroed
 *          buffer is returned.
 *        - If the chunk is a candidate but does not have a translation,
 *          it is read straight from the master device.
 763  *        - If the chunk does have a translation, then it is either on
 764  *          disk or in memory:
 765  *            o If it is in memory the requested data is simply copied out
 766  *              of the in-memory buffer.
 767  *            o If it is in the backing store, it is read from there.
 768  *
 769  *    This function does the real work of the snapshot driver.
 770  */
 771 static int
 772 snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
 773     int len, char *buffer)
 774 {
 775         cow_map_t       *cmap = &sidp->sid_cowinfo->cow_map;
 776         cow_map_node_t  *cmn;
 777         struct buf      *snapbuf;
 778         int             error = 0;
 779         char            *newbuffer;
 780         int             newlen = 0;
 781         int             partial = 0;
 782 
 783         ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
 784         ASSERT(offset + len <= cmap->cmap_chunksz);
 785 
 786         /*
 787          * Check if the chunk number is out of range and if so bail out
 788          */
 789         if (chunk >= (cmap->cmap_bmsize * NBBY)) {
 790                 return (EINVAL);
 791         }
 792 
 793         /*
 794          * If the chunk is not a candidate for translation, then the chunk
 795          * was not allocated when the snapshot was taken.  Since it does
 796          * not contain data associated with this snapshot, just return a
 797          * zero buffer instead.
 798          */
 799         if (isclr(cmap->cmap_candidate, chunk)) {
 800                 bzero(buffer, len);
 801                 return (0);
 802         }
 803 
 804         /*
 805          * if the chunk is a candidate for translation but a
 806          * translation does not exist, then read through to the
 807          * original file system.  The rwlock is held until the read
 808          * completes if it hasn't been translated to make sure the
 809          * file system does not translate the block before we
 810          * access it. If it has already been translated we don't
 811          * need the lock, because the translation will never go away.
 812          */
 813         rw_enter(&cmap->cmap_rwlock, RW_READER);
 814         if (isclr(cmap->cmap_hastrans, chunk)) {
 815                 snapbuf = getrbuf(KM_SLEEP);
 816                 /*
 817                  * Reading into the buffer saves having to do a copy,
 818                  * but gets tricky if the request size is not a
 819                  * multiple of DEV_BSIZE.  However, we are filling the
 820                  * buffer left to right, so future reads will write
 821                  * over any extra data we might have read.
 822                  */
 823 
 824                 partial = len % DEV_BSIZE;
 825 
 826                 snapbuf->b_bcount = len;
 827                 snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
 828                 snapbuf->b_un.b_addr = buffer;
 829 
 830                 snapbuf->b_iodone = NULL;
 831                 snapbuf->b_proc = NULL;              /* i.e. the kernel */
 832                 snapbuf->b_flags = B_READ | B_BUSY;
 833                 snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
 834 
 835                 if (partial) {
 836                         /*
 837                          * Partial block read in progress.
 838                          * This is bad as modules further down the line
 839                          * assume buf's are exact multiples of DEV_BSIZE
 840                          * and we end up with fewer, or zero, bytes read.
 841                          * To get round this we need to round up to the
 842                          * nearest full block read and then return only
 843                          * len bytes.
 844                          */
 845                         newlen = (len - partial) + DEV_BSIZE;
 846                         newbuffer = kmem_alloc(newlen, KM_SLEEP);
 847 
 848                         snapbuf->b_bcount = newlen;
 849                         snapbuf->b_un.b_addr = newbuffer;
 850                 }
 851 
 852                 (void) bdev_strategy(snapbuf);
 853                 (void) biowait(snapbuf);
 854 
 855                 error = geterror(snapbuf);
 856 
 857                 if (partial) {
 858                         /*
 859                          * Partial block read. Now we need to bcopy the
 860                          * correct number of bytes back into the
 861                          * supplied buffer, and tidy up our temp
 862                          * buffer.
 863                          */
 864                         bcopy(newbuffer, buffer, len);
 865                         kmem_free(newbuffer, newlen);
 866                 }
 867 
 868                 freerbuf(snapbuf);
 869                 rw_exit(&cmap->cmap_rwlock);
 870 
 871                 return (error);
 872         }
 873 
 874         /*
 875          * finally, if the chunk is a candidate for translation and it
 876          * has been translated, then we clone the chunk of the buffer
 877          * that was copied aside by the file system.
 878          * The cmap_rwlock does not need to be held after we know the
 879          * data has already been copied. Once a chunk has been copied
 880          * to the backing file, it is stable read only data.
 881          */
 882         cmn = transtbl_get(cmap, chunk);
 883 
 884         /* check whether the data is in memory or in the backing file */
 885         if (cmn != NULL) {
 886                 ASSERT(cmn->cmn_buf);
 887                 /* already in memory */
 888                 bcopy(cmn->cmn_buf + offset, buffer, len);
 889                 rw_exit(&cmap->cmap_rwlock);
 890         } else {
 891                 ssize_t resid = len;
 892                 int     bf_index;
 893                 /*
 894                  * can cause deadlock with writer if we don't drop the
 895                  * cmap_rwlock before trying to get the backing store file
 896                  * vnode rwlock.
 897                  */
 898                 rw_exit(&cmap->cmap_rwlock);
 899 
 900                 bf_index = chunk / cmap->cmap_chunksperbf;
 901 
 902                 /* read buffer from backing file */
 903                 error = vn_rdwr(UIO_READ,
 904                     (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
 905                     buffer, len, ((chunk % cmap->cmap_chunksperbf) *
 906                     cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
 907                     RLIM64_INFINITY, kcred, &resid);
 908         }
 909 
 910         return (error);
 911 }
 912 
 913 /*
 914  * snap_print() - snapshot driver print(9E) routine
 915  *
 916  *    prints the device identification string.
 917  */
 918 static int
 919 snap_print(dev_t dev, char *str)
 920 {
 921         struct snapshot_id **sidpp;
 922         minor_t         minor;
 923 
 924         minor = getminor(dev);
 925         sidpp = ddi_get_soft_state(statep, minor);
 926         if (sidpp == NULL || *sidpp == NULL) {
 927                 cmn_err(CE_WARN,
 928                     "snap_print: could not find state for snapshot %d.", minor);
 929                 return (ENXIO);
 930         }
 931 
 932         cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
 933 
 934         return (0);
 935 }
 936 
 937 /*
 938  * snap_prop_op() - snapshot driver prop_op(9E) routine
 939  *
 940  *    get 32-bit and 64-bit values for size (character driver) and nblocks
 941  *    (block driver).
 942  */
 943 static int
 944 snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
 945     int flags, char *name, caddr_t valuep, int *lengthp)
 946 {
 947         int             minor;
 948         struct snapshot_id **sidpp;
 949         dev_t           mdev;
 950         dev_info_t      *mdip;
 951         int             error;
 952 
 953         minor = getminor(dev);
 954 
 955         /*
 956          * If this is the control device just check for .conf properties,
 957          * if the wildcard DDI_DEV_T_ANY was passed in via the dev_t
 958          * just fall back to the defaults.
 959          */
 960         if ((minor == SNAP_CTL_MINOR) || (dev == DDI_DEV_T_ANY))
 961                 return (ddi_prop_op(dev, dip, prop_op, flags, name,
 962                     valuep, lengthp));
 963 
 964         /* check to see if there is a master device plumbed */
 965         sidpp = ddi_get_soft_state(statep, minor);
 966         if (sidpp == NULL || *sidpp == NULL) {
 967                 cmn_err(CE_WARN,
 968                     "snap_prop_op: could not find state for "
 969                     "snapshot %d.", minor);
 970                 return (DDI_PROP_NOT_FOUND);
 971         }
 972 
 973         if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
 974                 return (ddi_prop_op(dev, dip, prop_op, flags, name,
 975                     valuep, lengthp));
 976 
 977         /* hold master device and pass operation down */
 978         mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
 979         if (mdip = e_ddi_hold_devi_by_dev(mdev, 0)) {
 980 
 981                 /* get size information from the master device. */
 982                 error = cdev_prop_op(mdev, mdip,
 983                     prop_op, flags, name, valuep, lengthp);
 984                 ddi_release_devi(mdip);
 985                 if (error == DDI_PROP_SUCCESS)
 986                         return (error);
 987         }
 988 
 989         /* master device did not service the request, try framework */
 990         return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp));
 991 
 992 }
 993 
 994 /*
 995  * snap_ioctl() - snapshot driver ioctl(9E) routine
 996  *
 997  *    only applies to the control device.  The control device accepts two
 998  *    ioctl requests: create a snapshot or delete a snapshot.  In either
 999  *    case, the vnode for the requested file system is extracted, and the
1000  *    request is passed on to the file system via the same ioctl.  The file
1001  *    system is responsible for doing the things necessary for creating or
1002  *    destroying a snapshot, including any file system specific operations
1003  *    that must be performed as well as setting up and deleting the snapshot
1004  *    state through the fssnap interfaces.
1005  */
1006 static int
1007 snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1008 int *rvalp)
1009 {
1010         minor_t minor;
1011         int error = 0;
1012 
1013         minor = getminor(dev);
1014 
1015         if (minor != SNAP_CTL_MINOR) {
1016                 return (EINVAL);
1017         }
1018 
1019         switch (cmd) {
1020         case _FIOSNAPSHOTCREATE:
1021         {
1022                 struct fiosnapcreate    fc;
1023                 struct file             *fp;
1024                 struct vnode            *vp;
1025 
1026                 if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1027                         return (EFAULT);
1028 
1029                 /* get vnode for file system mount point */
1030                 if ((fp = getf(fc.rootfiledesc)) == NULL)
1031                         return (EBADF);
1032 
1033                 ASSERT(fp->f_vnode);
1034                 vp = fp->f_vnode;
1035                 VN_HOLD(vp);
1036                 releasef(fc.rootfiledesc);
1037 
1038                 /* pass ioctl request to file system */
1039                 error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1040                 VN_RELE(vp);
1041                 break;
1042         }
1043         case _FIOSNAPSHOTCREATE_MULTI:
1044         {
1045                 struct fiosnapcreate_multi      fc;
1046                 struct file             *fp;
1047                 struct vnode            *vp;
1048 
1049                 if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1050                         return (EFAULT);
1051 
1052                 /* get vnode for file system mount point */
1053                 if ((fp = getf(fc.rootfiledesc)) == NULL)
1054                         return (EBADF);
1055 
1056                 ASSERT(fp->f_vnode);
1057                 vp = fp->f_vnode;
1058                 VN_HOLD(vp);
1059                 releasef(fc.rootfiledesc);
1060 
1061                 /* pass ioctl request to file system */
1062                 error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1063                 VN_RELE(vp);
1064                 break;
1065         }
1066         case _FIOSNAPSHOTDELETE:
1067         {
1068                 major_t                 major;
1069                 struct fiosnapdelete    fc;
1070                 snapshot_id_t           *sidp = NULL;
1071                 snapshot_id_t           *sidnextp = NULL;
1072                 struct file             *fp = NULL;
1073                 struct vnode            *vp = NULL;
1074                 struct vfs              *vfsp = NULL;
1075                 vfsops_t                *vfsops = EIO_vfsops;
1076 
1077                 if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1078                         return (EFAULT);
1079 
1080                 /* get vnode for file system mount point */
1081                 if ((fp = getf(fc.rootfiledesc)) == NULL)
1082                         return (EBADF);
1083 
1084                 ASSERT(fp->f_vnode);
1085                 vp = fp->f_vnode;
1086                 VN_HOLD(vp);
1087                 releasef(fc.rootfiledesc);
1088                 /*
1089                  * Test for two formats of delete and set correct minor/vp:
1090                  * pseudo device:
1091                  * fssnap -d [/dev/fssnap/x]
1092                  * or
1093                  * mount point:
1094                  * fssnap -d [/mntpt]
1095                  * Note that minor is verified to be equal to SNAP_CTL_MINOR
1096                  * at this point which is an invalid minor number.
1097                  */
1098                 ASSERT(fssnap_dip != NULL);
1099                 major = ddi_driver_major(fssnap_dip);
1100                 mutex_enter(&snapshot_mutex);
1101                 for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
1102                         rw_enter(&sidp->sid_rwlock, RW_READER);
1103                         sidnextp = sidp->sid_next;
1104                         /* pseudo device: */
1105                         if (major == getmajor(vp->v_rdev)) {
1106                                 minor = getminor(vp->v_rdev);
1107                                 if (sidp->sid_snapnumber == (uint_t)minor &&
1108                                     sidp->sid_fvp) {
1109                                         VN_RELE(vp);
1110                                         vp = sidp->sid_fvp;
1111                                         VN_HOLD(vp);
1112                                         rw_exit(&sidp->sid_rwlock);
1113                                         break;
1114                                 }
1115                         /* Mount point: */
1116                         } else {
1117                                 if (sidp->sid_fvp == vp) {
1118                                         minor = sidp->sid_snapnumber;
1119                                         rw_exit(&sidp->sid_rwlock);
1120                                         break;
1121                                 }
1122                         }
1123                         rw_exit(&sidp->sid_rwlock);
1124                 }
1125                 mutex_exit(&snapshot_mutex);
1126                 /* Verify minor got set correctly above */
1127                 if (minor == SNAP_CTL_MINOR) {
1128                         VN_RELE(vp);
1129                         return (EINVAL);
1130                 }
1131                 dev = makedevice(major, minor);
1132                 /*
1133                  * Create dummy vfs entry
1134                  * to use as a locking semaphore across the IOCTL
1135                  * for mount in progress cases...
1136                  */
1137                 vfsp = vfs_alloc(KM_SLEEP);
1138                 VFS_INIT(vfsp, vfsops, NULL);
1139                 VFS_HOLD(vfsp);
1140                 vfs_addmip(dev, vfsp);
1141                 if ((vfs_devmounting(dev, vfsp)) ||
1142                     (vfs_devismounted(dev))) {
1143                         vfs_delmip(vfsp);
1144                         VFS_RELE(vfsp);
1145                         VN_RELE(vp);
1146                         return (EBUSY);
1147                 }
1148                 /*
1149                  * Nobody mounted but do not release mount in progress lock
1150                  * until IOCTL complete to prohibit a mount sneaking
1151                  * in
1152                  */
1153                 error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1154                 vfs_delmip(vfsp);
1155                 VFS_RELE(vfsp);
1156                 VN_RELE(vp);
1157                 break;
1158         }
1159         default:
1160                 cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
1161                     cmd, minor);
1162                 return (EINVAL);
1163         }
1164 
1165         return (error);
1166 }
1167 
1168 
1169 /* ************************************************************************ */
1170 
1171 /*
1172  * Translation Table Routines
1173  *
1174  *    These support routines implement a simple doubly linked list
1175  *    to keep track of chunks that are currently in memory.  The maximum
1176  *    size of the list is determined by the fssnap_max_mem_chunks variable.
1177  *    The cmap_rwlock is used to protect the linkage of the list.
1178  */
1179 
1180 /*
1181  * transtbl_add() - add a node to the translation table
1182  *
1183  *    allocates a new node and points it at the buffer passed in.  The node
1184  *    is added to the beginning of the doubly linked list and the head of
1185  *    the list is moved.  The cmap_rwlock must be held as a writer through
1186  *    this operation.
1187  */
1188 static cow_map_node_t *
1189 transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
1190 {
1191         cow_map_node_t  *cmnode;
1192 
1193         ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1194 
1195         cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
1196 
1197         /*
1198          * insert new translations at the beginning so cmn_table is always
1199          * the first node.
1200          */
1201         cmnode->cmn_chunk = chunk;
1202         cmnode->cmn_buf = buf;
1203         cmnode->cmn_prev = NULL;
1204         cmnode->cmn_next = cmap->cmap_table;
1205         if (cmnode->cmn_next)
1206                 cmnode->cmn_next->cmn_prev = cmnode;
1207         cmap->cmap_table = cmnode;
1208 
1209         return (cmnode);
1210 }
1211 
1212 /*
1213  * transtbl_get() - look up a node in the translation table
1214  *
1215  *    called by the snapshot driver to find data that has been translated.
1216  *    The lookup is done by the chunk number, and the node is returned.
1217  *    If the node was not found, NULL is returned.
1218  */
1219 static cow_map_node_t *
1220 transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
1221 {
1222         cow_map_node_t *cmn;
1223 
1224         ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
1225         ASSERT(cmap);
1226 
1227         /* search the translation table */
1228         for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
1229                 if (cmn->cmn_chunk == chunk)
1230                         return (cmn);
1231         }
1232 
1233         /* not found */
1234         return (NULL);
1235 }
1236 
1237 /*
1238  * transtbl_delete() - delete a node from the translation table
1239  *
1240  *    called when a node's data has been written out to disk.  The
1241  *    cmap_rwlock must be held as a writer for this operation.  If the node
1242  *    being deleted is the head of the list, then the head is moved to the
1243  *    next node.  Both the node's data and the node itself are freed.
1244  */
1245 static void
1246 transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
1247 {
1248         ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1249         ASSERT(cmn);
1250         ASSERT(cmap->cmap_table);
1251 
1252         /* if the head of the list is being deleted, then move the head up */
1253         if (cmap->cmap_table == cmn) {
1254                 ASSERT(cmn->cmn_prev == NULL);
1255                 cmap->cmap_table = cmn->cmn_next;
1256         }
1257 
1258 
1259         /* make previous node's next pointer skip over current node */
1260         if (cmn->cmn_prev != NULL) {
1261                 ASSERT(cmn->cmn_prev->cmn_next == cmn);
1262                 cmn->cmn_prev->cmn_next = cmn->cmn_next;
1263         }
1264 
1265         /* make next node's previous pointer skip over current node */
1266         if (cmn->cmn_next != NULL) {
1267                 ASSERT(cmn->cmn_next->cmn_prev == cmn);
1268                 cmn->cmn_next->cmn_prev = cmn->cmn_prev;
1269         }
1270 
1271         /* free the data and the node */
1272         ASSERT(cmn->cmn_buf);
1273         kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
1274         kmem_free(cmn, sizeof (cow_map_node_t));
1275 }
1276 
1277 /*
1278  * transtbl_free() - free the entire translation table
1279  *
1280  *    called when the snapshot is deleted.  This frees all of the nodes in
1281  *    the translation table (but not the bitmaps).
1282  */
1283 static void
1284 transtbl_free(cow_map_t *cmap)
1285 {
1286         cow_map_node_t  *curnode;
1287         cow_map_node_t  *tempnode;
1288 
1289         for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
1290                 tempnode = curnode->cmn_next;
1291 
1292                 kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
1293                 kmem_free(curnode, sizeof (cow_map_node_t));
1294         }
1295 }
1296 
1297 
1298 /* ************************************************************************ */
1299 
1300 /*
1301  * Interface Implementation Routines
1302  *
1303  * The following functions implement snapshot interface routines that are
1304  * called by the file system to create, delete, and use a snapshot.  The
1305  * interfaces are defined in fssnap_if.c and are filled in by this driver
1306  * when it is loaded.  This technique allows the file system to depend on
1307  * the interface module without having to load the full implementation and
1308  * snapshot device drivers.
1309  */
1310 
1311 /*
1312  * fssnap_strategy_impl() - strategy routine called by the file system
1313  *
1314  *    called by the file system to handle copy-on-write when necessary.  All
1315  *    reads and writes that the file system performs should go through this
1316  *    function.  If the file system calls the underlying device's strategy
1317  *    routine without going through fssnap_strategy() (eg. by calling
1318  *    bdev_strategy()), the snapshot may not be consistent.
1319  *
1320  *    This function starts by doing significant sanity checking to insure
1321  *    the snapshot was not deleted out from under it or deleted and then
1322  *    recreated.  To do this, it checks the actual pointer passed into it
1323  *    (ie. the handle held by the file system).  NOTE that the parameter is
1324  *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
1325  *    locked, it knows things are ok and that this snapshot is really for
1326  *    this file system.
1327  *
1328  *    If the request is a write, fssnap_translate() is called to determine
1329  *    whether a copy-on-write is required.  If it is a read, the read is
1330  *    simply passed on to the underlying device.
1331  */
1332 static void
1333 fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
1334 {
1335         struct snapshot_id **sidpp;
1336         struct snapshot_id *sidp;
1337         int error;
1338 
1339         /* read requests are always passed through */
1340         if (bp->b_flags & B_READ) {
1341                 (void) bdev_strategy(bp);
1342                 return;
1343         }
1344 
1345         /*
1346          * Because we were not able to take the snapshot read lock BEFORE
1347          * checking for a snapshot back in the file system, things may have
1348          * drastically changed out from under us.  For instance, the snapshot
1349          * may have been deleted, deleted and recreated, or worse yet, deleted
1350          * for this file system but now the snapshot number is in use by another
1351          * file system.
1352          *
1353          * Having a pointer to the file system's snapshot id pointer allows us
1354          * to sanity check most of this, though it assumes the file system is
1355          * keeping track of a pointer to the snapshot_id somewhere.
1356          */
1357         sidpp = (struct snapshot_id **)snapshot_id;
1358         sidp = *sidpp;
1359 
1360         /*
1361          * if this file system's snapshot was disabled, just pass the
1362          * request through.
1363          */
1364         if (sidp == NULL) {
1365                 (void) bdev_strategy(bp);
1366                 return;
1367         }
1368 
1369         /*
1370          * Once we have the reader lock the snapshot will not magically go
1371          * away.  But things may have changed on us before this so double check.
1372          */
1373         rw_enter(&sidp->sid_rwlock, RW_READER);
1374 
1375         /*
1376          * if an error was founds somewhere the DELETE flag will be
1377          * set to indicate the snapshot should be deleted and no new
1378          * translations should occur.
1379          */
1380         if (sidp->sid_flags & SID_DELETE) {
1381                 rw_exit(&sidp->sid_rwlock);
1382                 (void) fssnap_delete_impl(sidpp);
1383                 (void) bdev_strategy(bp);
1384                 return;
1385         }
1386 
1387         /*
1388          * If the file system is no longer pointing to the snapshot we were
1389          * called with, then it should not attempt to translate this buffer as
1390          * it may be going to a snapshot for a different file system.
1391          * Even if the file system snapshot pointer is still the same, the
1392          * snapshot may have been disabled before we got the reader lock.
1393          */
1394         if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1395                 rw_exit(&sidp->sid_rwlock);
1396                 (void) bdev_strategy(bp);
1397                 return;
1398         }
1399 
1400         /*
1401          * At this point we're sure the snapshot will not go away while the
1402          * reader lock is held, and we are reasonably certain that we are
1403          * writing to the correct snapshot.
1404          */
1405         if ((error = fssnap_translate(sidpp, bp)) != 0) {
1406                 /*
1407                  * fssnap_translate can release the reader lock if it
1408                  * has to wait for a semaphore.  In this case it is possible
1409                  * for the snapshot to be deleted in this time frame.  If this
1410                  * happens just sent the buf thru to the filesystems device.
1411                  */
1412                 if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1413                         rw_exit(&sidp->sid_rwlock);
1414                         (void) bdev_strategy(bp);
1415                         return;
1416                 }
1417                 bioerror(bp, error);
1418                 biodone(bp);
1419         }
1420         rw_exit(&sidp->sid_rwlock);
1421 }
1422 
1423 /*
1424  * fssnap_translate() - helper function for fssnap_strategy()
1425  *
1426  *    performs the actual copy-on-write for write requests, if required.
1427  *    This function does the real work of the file system side of things.
1428  *
1429  *    It first checks the candidate bitmap to quickly determine whether any
1430  *    action is necessary.  If the candidate bitmap indicates the chunk was
1431  *    allocated when the snapshot was created, then it checks to see whether
1432  *    a translation already exists.  If a translation already exists then no
1433  *    action is required.  If the chunk is a candidate for copy-on-write,
1434  *    and a translation does not already exist, then the chunk is read in
1435  *    and a node is added to the translation table.
1436  *
1437  *    Once all of the chunks in the request range have been copied (if they
1438  *    needed to be), then the original request can be satisfied and the old
1439  *    data can be overwritten.
1440  */
1441 static int
1442 fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
1443 {
1444         snapshot_id_t   *sidp = *sidpp;
1445         struct buf      *oldbp; /* buffer to store old data in */
1446         struct cow_info *cowp = sidp->sid_cowinfo;
1447         cow_map_t       *cmap = &cowp->cow_map;
1448         cow_map_node_t  *cmn;
1449         chunknumber_t   cowchunk, startchunk, endchunk;
1450         int             error;
1451         int     throttle_write = 0;
1452 
1453         /* make sure the snapshot is active */
1454         ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
1455 
1456         startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
1457         endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
1458             ((wbp->b_bcount-1) >> DEV_BSHIFT));
1459 
1460         /*
1461          * Do not throttle the writes of the fssnap taskq thread and
1462          * the log roll (trans_roll) thread. Furthermore the writes to
1463          * the on-disk log are also not subject to throttling.
1464          * The fssnap_write_taskq thread's write can block on the throttling
1465          * semaphore which leads to self-deadlock as this same thread
1466          * releases the throttling semaphore after completing the IO.
1467          * If the trans_roll thread's write is throttled then we can deadlock
1468          * because the fssnap_taskq_thread which releases the throttling
1469          * semaphore can block waiting for log space which can only be
1470          * released by the trans_roll thread.
1471          */
1472 
1473         throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
1474             tsd_get(bypass_snapshot_throttle_key));
1475 
1476         /*
1477          * Iterate through all chunks covered by this write and perform the
1478          * copy-aside if necessary.  Once all chunks have been safely
1479          * stowed away, the new data may be written in a single sweep.
1480          *
1481          * For each chunk in the range, the following sequence is performed:
1482          *      - Is the chunk a candidate for translation?
1483          *              o If not, then no translation is necessary, continue
1484          *      - If it is a candidate, then does it already have a translation?
1485          *              o If so, then no translation is necessary, continue
1486          *      - If it is a candidate, but does not yet have a translation,
1487          *        then read the old data and schedule an asynchronous taskq
1488          *        to write the old data to the backing file.
1489          *
1490          * Once this has been performed over the entire range of chunks, then
1491          * it is safe to overwrite the data that is there.
1492          *
1493          * Note that no lock is required to check the candidate bitmap because
1494          * it never changes once the snapshot is created.  The reader lock is
1495          * taken to check the hastrans bitmap since it may change.  If it
1496          * turns out a copy is required, then the lock is upgraded to a
1497          * writer, and the bitmap is re-checked as it may have changed while
1498          * the lock was released.  Finally, the write lock is held while
1499          * reading the old data to make sure it is not translated out from
1500          * under us.
1501          *
1502          * This locking mechanism should be sufficient to handle multiple
1503          * threads writing to overlapping chunks simultaneously.
1504          */
1505         for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
1506                 /*
1507                  * If the cowchunk is outside of the range of our
1508                  * candidate maps, then simply break out of the
1509                  * loop and pass the I/O through to bdev_strategy.
1510                  * This would occur if the file system has grown
1511                  * larger since the snapshot was taken.
1512                  */
1513                 if (cowchunk >= (cmap->cmap_bmsize * NBBY))
1514                         break;
1515 
1516                 /*
1517                  * If no disk blocks were allocated in this chunk when the
1518                  * snapshot was created then no copy-on-write will be
1519                  * required.  Since this bitmap is read-only no locks are
1520                  * necessary.
1521                  */
1522                 if (isclr(cmap->cmap_candidate, cowchunk)) {
1523                         continue;
1524                 }
1525 
1526                 /*
1527                  * If a translation already exists, the data can be written
1528                  * through since the old data has already been saved off.
1529                  */
1530                 if (isset(cmap->cmap_hastrans, cowchunk)) {
1531                         continue;
1532                 }
1533 
1534 
1535                 /*
1536                  * Throttle translations if there are too many outstanding
1537                  * chunks in memory.  The semaphore is sema_v'd by the taskq.
1538                  *
1539                  * You can't keep the sid_rwlock if you would go to sleep.
1540                  * This will result in deadlock when someone tries to delete
1541                  * the snapshot (wants the sid_rwlock as a writer, but can't
1542                  * get it).
1543                  */
1544                 if (throttle_write) {
1545                         if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
1546                                 rw_exit(&sidp->sid_rwlock);
1547                                 atomic_inc_32(&cmap->cmap_waiters);
1548                                 sema_p(&cmap->cmap_throttle_sem);
1549                                 atomic_dec_32(&cmap->cmap_waiters);
1550                                 rw_enter(&sidp->sid_rwlock, RW_READER);
1551 
                        /*
                         * Now that we have released the sid_rwlock, the state
                         * may have transitioned underneath us, so check it
                         * again.
                         */
1556                                 if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1557                                         sema_v(&cmap->cmap_throttle_sem);
1558                                         return (ENXIO);
1559                                 }
1560                         }
1561                 }
1562 
1563                 /*
1564                  * Acquire the lock as a writer and check to see if a
1565                  * translation has been added in the meantime.
1566                  */
1567                 rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1568                 if (isset(cmap->cmap_hastrans, cowchunk)) {
1569                         if (throttle_write)
1570                                 sema_v(&cmap->cmap_throttle_sem);
1571                         rw_exit(&cmap->cmap_rwlock);
1572                         continue; /* go to the next chunk */
1573                 }
1574 
1575                 /*
1576                  * read a full chunk of data from the requested offset rounded
1577                  * down to the nearest chunk size.
1578                  */
1579                 oldbp = getrbuf(KM_SLEEP);
1580                 oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
1581                 oldbp->b_edev = wbp->b_edev;
1582                 oldbp->b_bcount = cmap->cmap_chunksz;
1583                 oldbp->b_bufsize = cmap->cmap_chunksz;
1584                 oldbp->b_iodone = NULL;
1585                 oldbp->b_proc = NULL;
1586                 oldbp->b_flags = B_READ;
1587                 oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);
1588 
1589                 (void) bdev_strategy(oldbp);
1590                 (void) biowait(oldbp);
1591 
1592                 /*
1593                  * It's ok to bail in the middle of translating the range
1594                  * because the extra copy-asides will not hurt anything
1595                  * (except by using extra space in the backing store).
1596                  */
1597                 if ((error = geterror(oldbp)) != 0) {
1598                         cmn_err(CE_WARN, "fssnap_translate: error reading "
1599                             "old data for snapshot %d, chunk %llu, disk block "
1600                             "%lld, size %lu, error %d.", sidp->sid_snapnumber,
1601                             cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
1602                         kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
1603                         freerbuf(oldbp);
1604                         rw_exit(&cmap->cmap_rwlock);
1605                         if (throttle_write)
1606                                 sema_v(&cmap->cmap_throttle_sem);
1607                         return (error);
1608                 }
1609 
1610                 /*
1611                  * add the node to the translation table and save a reference
1612                  * to pass to the taskq for writing out to the backing file
1613                  */
1614                 cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
1615                 freerbuf(oldbp);
1616 
1617                 /*
1618                  * Add a reference to the snapshot id so the lower level
1619                  * processing (ie. the taskq) can get back to the state
1620                  * information.
1621                  */
1622                 cmn->cmn_sid = sidp;
1623                 cmn->release_sem = throttle_write;
1624                 setbit(cmap->cmap_hastrans, cowchunk);
1625 
1626                 rw_exit(&cmap->cmap_rwlock);
1627 
1628                 /*
1629                  * schedule the asynchronous write to the backing file
1630                  */
1631                 if (cowp->cow_backfile_array != NULL)
1632                         (void) taskq_dispatch(cowp->cow_taskq,
1633                             fssnap_write_taskq, cmn, TQ_SLEEP);
1634         }
1635 
1636         /*
1637          * Write new data in place of the old data.  At this point all of the
1638          * chunks touched by this write have been copied aside and so the new
1639          * data can be written out all at once.
1640          */
1641         (void) bdev_strategy(wbp);
1642 
1643         return (0);
1644 }
1645 
1646 /*
1647  * fssnap_write_taskq() - write in-memory translations to the backing file
1648  *
1649  *    writes in-memory translations to the backing file asynchronously.  A
1650  *    task is dispatched each time a new translation is created.  The task
1651  *    writes the data to the backing file and removes it from the memory
1652  *    list. The throttling semaphore is released only if the particular
1653  *    translation was throttled in fssnap_translate.
1654  */
1655 static void
1656 fssnap_write_taskq(void *arg)
1657 {
1658         cow_map_node_t  *cmn = (cow_map_node_t *)arg;
1659         snapshot_id_t   *sidp = cmn->cmn_sid;
1660         cow_info_t      *cowp = sidp->sid_cowinfo;
1661         cow_map_t       *cmap = &cowp->cow_map;
1662         int             error;
1663         int             bf_index;
1664         int             release_sem = cmn->release_sem;
1665 
1666         /*
1667          * The sid_rwlock does not need to be held here because the taskqs
1668          * are destroyed explicitly by fssnap_delete (with the sid_rwlock
1669          * held as a writer).  taskq_destroy() will flush all of the tasks
1670          * out before fssnap_delete frees up all of the structures.
1671          */
1672 
1673         /* if the snapshot was disabled from under us, drop the request. */
1674         rw_enter(&sidp->sid_rwlock, RW_READER);
1675         if (SID_INACTIVE(sidp)) {
1676                 rw_exit(&sidp->sid_rwlock);
1677                 if (release_sem)
1678                         sema_v(&cmap->cmap_throttle_sem);
1679                 return;
1680         }
1681         rw_exit(&sidp->sid_rwlock);
1682 
1683         atomic_inc_64((uint64_t *)&cmap->cmap_nchunks);
1684 
1685         if ((cmap->cmap_maxsize != 0) &&
1686             ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
1687                 cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
1688                     "reached the maximum backing file size specified (%llu "
1689                     "bytes) and will be deleted.", sidp->sid_snapnumber,
1690                     (char *)cowp->cow_kstat_mntpt->ks_data,
1691                     cmap->cmap_maxsize);
1692                 if (release_sem)
1693                         sema_v(&cmap->cmap_throttle_sem);
1694                 atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1695                 return;
1696         }
1697 
1698         /* perform the write */
1699         bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
1700 
1701         if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
1702             cmn->cmn_buf, cmap->cmap_chunksz,
1703             (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
1704             UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
1705                 cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
1706                     "backing file.  DELETING SNAPSHOT %d, backing file path "
1707                     "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
1708                     (char *)cowp->cow_kstat_bfname->ks_data,
1709                     cmn->cmn_chunk * cmap->cmap_chunksz, error);
1710                 if (release_sem)
1711                         sema_v(&cmap->cmap_throttle_sem);
1712                 atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1713                 return;
1714         }
1715 
1716         /*
1717          * now remove the node and buffer from memory
1718          */
1719         rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1720         transtbl_delete(cmap, cmn);
1721         rw_exit(&cmap->cmap_rwlock);
1722 
1723         /* Allow more translations */
1724         if (release_sem)
1725                 sema_v(&cmap->cmap_throttle_sem);
1726 
1727 }
1728 
1729 /*
1730  * fssnap_create_impl() - called from the file system to create a new snapshot
1731  *
1732  *    allocates and initializes the structures needed for a new snapshot.
1733  *    This is called by the file system when it receives an ioctl request to
1734  *    create a new snapshot.  An unused snapshot identifier is either found
1735  *    or created, and eventually returned as the opaque handle the file
1736  *    system will use to identify this snapshot.  The snapshot number
1737  *    associated with the snapshot identifier is the same as the minor
1738  *    number for the snapshot device that is used to access that snapshot.
1739  *
1740  *    The snapshot can not be used until the candidate bitmap is populated
1741  *    by the file system (see fssnap_set_candidate_impl()), and the file
1742  *    system finishes the setup process by calling fssnap_create_done().
1743  *    Nearly all of the snapshot locks are held for the duration of the
1744  *    create, and are not released until fssnap_create_done is called().
1745  */
1746 static void *
1747 fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
1748     struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
1749     u_offset_t max_backfile_size)
1750 {
1751         refstr_t *mountpoint;
1752         char taskqname[50];
1753         struct cow_info *cowp;
1754         struct cow_map  *cmap;
1755         struct snapshot_id *sidp;
1756         int lastsnap;
1757 
1758         /*
1759          * Sanity check the parameters we care about
1760          * (we don't care about the informational parameters)
1761          */
1762         if ((nchunks == 0) ||
1763             ((chunksz % DEV_BSIZE) != 0) ||
1764             (bfvpp == NULL)) {
1765                 return (NULL);
1766         }
1767 
1768         /*
1769          * Look for unused snapshot identifiers.  Snapshot ids are never
1770          * freed, but deleted snapshot ids will be recycled as needed.
1771          */
1772         mutex_enter(&snapshot_mutex);
1773 
1774 findagain:
1775         lastsnap = 0;
1776         for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
1777                 if (sidp->sid_snapnumber > lastsnap)
1778                         lastsnap = sidp->sid_snapnumber;
1779 
1780                 /*
1781                  * The sid_rwlock is taken as a reader initially so that
1782                  * activity on each snapshot is not stalled while searching
1783                  * for a free snapshot id.
1784                  */
1785                 rw_enter(&sidp->sid_rwlock, RW_READER);
1786 
1787                 /*
1788                  * If the snapshot has been deleted and nobody is using the
1789                  * snapshot device than we can reuse this snapshot_id.  If
1790                  * the snapshot is marked to be deleted (SID_DELETE), then
1791                  * it hasn't been deleted yet so don't reuse it.
1792                  */
1793                 if (SID_AVAILABLE(sidp))
1794                         break; /* This spot is unused, so take it */
1795                 rw_exit(&sidp->sid_rwlock);
1796         }
1797 
1798         /*
1799          * add a new snapshot identifier if there are no deleted
1800          * entries.  Since it doesn't matter what order the entries
1801          * are in we can just add it to the beginning of the list.
1802          */
1803         if (sidp) {
1804                 if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
1805                         /* someone else grabbed it as a writer, try again */
1806                         rw_exit(&sidp->sid_rwlock);
1807                         goto findagain;
1808                 }
1809         } else {
1810                 /* Create a new node if we didn't find an unused one */
1811                 sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
1812                 rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
1813                 rw_enter(&sidp->sid_rwlock, RW_WRITER);
1814                 sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
1815                 sidp->sid_cowinfo = NULL;
1816                 sidp->sid_flags = 0;
1817                 sidp->sid_next = snapshot;
1818                 snapshot = sidp;
1819         }
1820 
1821         ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1822         ASSERT(sidp->sid_cowinfo == NULL);
1823         ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
1824 
1825         sidp->sid_flags |= SID_CREATING;
1826         /* The root vnode is held until snap_delete_impl() is called */
1827         VN_HOLD(fsvp);
1828         sidp->sid_fvp = fsvp;
1829         num_snapshots++;
1830 
1831         /* allocate and initialize structures */
1832 
1833         cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
1834 
1835         cowp->cow_backfile_array = bfvpp;
1836         cowp->cow_backcount = backfilecount;
1837         cowp->cow_backfile_sz = max_backfile_size;
1838 
1839         /*
1840          * Initialize task queues for this snapshot.  Only a small number
1841          * of threads are required because they will be serialized on the
1842          * backing file's reader/writer lock anyway.
1843          */
1844         (void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
1845             sidp->sid_snapnumber);
1846         cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
1847             minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
1848 
1849         /* don't allow tasks to start until after everything is ready */
1850         taskq_suspend(cowp->cow_taskq);
1851 
1852         /* initialize translation table */
1853         cmap = &cowp->cow_map;
1854         rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
1855         rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1856 
1857         sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
1858             SEMA_DEFAULT, NULL);
1859 
1860         cmap->cmap_chunksz = chunksz;
1861         cmap->cmap_maxsize = maxsize;
1862         cmap->cmap_chunksperbf = max_backfile_size / chunksz;
1863 
1864         /*
1865          * allocate one bit per chunk for the bitmaps, round up
1866          */
1867         cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
1868         cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1869         cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1870 
1871         sidp->sid_cowinfo = cowp;
1872 
1873         /* initialize kstats for this snapshot */
1874         mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
1875         fssnap_create_kstats(sidp, sidp->sid_snapnumber,
1876             refstr_value(mountpoint), backpath);
1877         refstr_rele(mountpoint);
1878 
1879         mutex_exit(&snapshot_mutex);
1880 
1881         /*
1882          * return with snapshot id rwlock held as a writer until
1883          * fssnap_create_done is called
1884          */
1885         return (sidp);
1886 }
1887 
1888 /*
1889  * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
1890  *
1891  *    sets a bit in the candidate bitmap that indicates that a chunk is a
1892  *    candidate for copy-on-write.  Typically, chunks that are allocated on
1893  *    the file system at the time the snapshot is taken are candidates,
1894  *    while chunks that have no allocated data do not need to be copied.
1895  *    Chunks containing metadata must be marked as candidates as well.
1896  */
1897 static void
1898 fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
1899 {
1900         struct snapshot_id      *sid = snapshot_id;
1901         struct cow_info *cowp = sid->sid_cowinfo;
1902         struct cow_map  *cmap = &cowp->cow_map;
1903 
1904         /* simple bitmap operation for now */
1905         ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1906         setbit(cmap->cmap_candidate, chunknumber);
1907 }
1908 
1909 /*
1910  * fssnap_is_candidate_impl() - check whether a chunk is a candidate
1911  *
1912  *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
1913  *    candidate.  This can be used by the file system to change behavior for
1914  *    chunks that might induce a copy-on-write.  The offset is specified in
1915  *    bytes since the chunk size may not be known by the file system.
1916  */
1917 static int
1918 fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
1919 {
1920         struct snapshot_id      *sid = snapshot_id;
1921         struct cow_info *cowp = sid->sid_cowinfo;
1922         struct cow_map  *cmap = &cowp->cow_map;
1923         ulong_t chunknumber = off / cmap->cmap_chunksz;
1924 
1925         /* simple bitmap operation for now */
1926         ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1927         return (isset(cmap->cmap_candidate, chunknumber));
1928 }
1929 
1930 /*
1931  * fssnap_create_done_impl() - complete the snapshot setup process
1932  *
1933  *    called when the file system is done populating the candidate bitmap
1934  *    and it is ready to start using the snapshot.  This routine releases
1935  *    the snapshot locks, allows taskq tasks to start processing, and
1936  *    creates the device minor nodes associated with the snapshot.
1937  */
1938 static int
1939 fssnap_create_done_impl(void *snapshot_id)
1940 {
1941         struct snapshot_id      **sidpp, *sidp = snapshot_id;
1942         struct cow_info         *cowp;
1943         struct cow_map          *cmap;
1944         int                     snapnumber = -1;
1945         char                    name[20];
1946 
1947         /* sid rwlock and cmap rwlock should be taken from fssnap_create */
1948         ASSERT(sidp);
1949         ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1950         ASSERT(sidp->sid_cowinfo);
1951 
1952         cowp = sidp->sid_cowinfo;
1953         cmap = &cowp->cow_map;
1954 
1955         ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1956 
1957         sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
1958         snapnumber = sidp->sid_snapnumber;
1959 
1960         /* allocate state structure and find new snapshot id */
1961         if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
1962                 cmn_err(CE_WARN,
1963                     "snap_ioctl: create: could not allocate "
1964                     "state for snapshot %d.", snapnumber);
1965                 snapnumber = -1;
1966                 goto out;
1967         }
1968 
1969         sidpp = ddi_get_soft_state(statep, snapnumber);
1970         *sidpp = sidp;
1971 
1972         /* create minor node based on snapshot number */
1973         ASSERT(fssnap_dip != NULL);
1974         (void) snprintf(name, sizeof (name), "%d", snapnumber);
1975         if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
1976             snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1977                 cmn_err(CE_WARN, "snap_ioctl: could not create "
1978                     "block minor node for snapshot %d.", snapnumber);
1979                 snapnumber = -1;
1980                 goto out;
1981         }
1982 
1983         (void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
1984         if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
1985             snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1986                 cmn_err(CE_WARN, "snap_ioctl: could not create "
1987                     "character minor node for snapshot %d.", snapnumber);
1988                 snapnumber = -1;
1989         }
1990 
1991 out:
1992         rw_exit(&sidp->sid_rwlock);
1993         rw_exit(&cmap->cmap_rwlock);
1994 
1995         /* let the taskq threads start processing */
1996         taskq_resume(cowp->cow_taskq);
1997 
1998         return (snapnumber);
1999 }
2000 
2001 /*
2002  * fssnap_delete_impl() - delete a snapshot
2003  *
2004  *    used when a snapshot is no longer needed.  This is called by the file
2005  *    system when it receives an ioctl request to delete a snapshot.  It is
2006  *    also called internally when error conditions such as disk full, errors
2007  *    writing to the backing file, or backing file maxsize exceeded occur.
2008  *    If the snapshot device is busy when the delete request is received,
2009  *    all state will be deleted except for the soft state and device files
2010  *    associated with the snapshot; they will be deleted when the snapshot
2011  *    device is closed.
2012  *
2013  *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
2014  *    and expects to be able to set the handle held by the file system to
2015  *    NULL.  This depends on the file system checking that variable for NULL
2016  *    before calling fssnap_strategy().
2017  */
2018 static int
2019 fssnap_delete_impl(void *snapshot_id)
2020 {
2021         struct snapshot_id      **sidpp = (struct snapshot_id **)snapshot_id;
2022         struct snapshot_id      *sidp;
2023         struct snapshot_id      **statesidpp;
2024         struct cow_info         *cowp;
2025         struct cow_map          *cmap;
2026         char                    name[20];
2027         int                     snapnumber = -1;
2028         vnode_t                 **vpp;
2029 
2030         /*
2031          * sidp is guaranteed to be valid if sidpp is valid because
2032          * the snapshot list is append-only.
2033          */
2034         if (sidpp == NULL) {
2035                 return (-1);
2036         }
2037 
2038         sidp = *sidpp;
2039         rw_enter(&sidp->sid_rwlock, RW_WRITER);
2040 
2041         ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
2042 
2043         /*
2044          * double check that the snapshot is still valid for THIS file system
2045          */
2046         if (*sidpp == NULL) {
2047                 rw_exit(&sidp->sid_rwlock);
2048                 return (-1);
2049         }
2050 
2051         /*
2052          * Now we know the snapshot is still valid and will not go away
2053          * because we have the write lock.  Once the state is transitioned
2054          * to "disabling", the sid_rwlock can be released.  Any pending I/O
2055          * waiting for the lock as a reader will check for this state and
2056          * abort without touching data that may be getting freed.
2057          */
2058         sidp->sid_flags |= SID_DISABLING;
2059         if (sidp->sid_flags & SID_DELETE) {
2060                 cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
2061                     sidp->sid_snapnumber);
2062                 sidp->sid_flags &= ~(SID_DELETE);
2063         }
2064 
2065 
2066         /*
2067          * This is pointing into file system specific data!  The assumption is
2068          * that fssnap_strategy() gets called from the file system based on
2069          * whether this reference to the snapshot_id is NULL or not.  So
2070          * setting this to NULL should disable snapshots for the file system.
2071          */
2072         *sidpp = NULL;
2073 
2074         /* remove cowinfo */
2075         cowp = sidp->sid_cowinfo;
2076         if (cowp == NULL) {
2077                 rw_exit(&sidp->sid_rwlock);
2078                 return (-1);
2079         }
2080         rw_exit(&sidp->sid_rwlock);
2081 
2082         /* destroy task queues first so they don't reference freed data. */
2083         if (cowp->cow_taskq) {
2084                 taskq_destroy(cowp->cow_taskq);
2085                 cowp->cow_taskq = NULL;
2086         }
2087 
2088         if (cowp->cow_backfile_array != NULL) {
2089                 for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
2090                         VN_RELE(*vpp);
2091                 kmem_free(cowp->cow_backfile_array,
2092                     (cowp->cow_backcount + 1) * sizeof (vnode_t *));
2093                 cowp->cow_backfile_array = NULL;
2094         }
2095 
2096         sidp->sid_cowinfo = NULL;
2097 
2098         /* remove cmap */
2099         cmap = &cowp->cow_map;
2100         ASSERT(cmap);
2101 
2102         if (cmap->cmap_candidate)
2103                 kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
2104 
2105         if (cmap->cmap_hastrans)
2106                 kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
2107 
2108         if (cmap->cmap_table)
2109                 transtbl_free(&cowp->cow_map);
2110 
2111         rw_destroy(&cmap->cmap_rwlock);
2112 
2113         while (cmap->cmap_waiters) {
2114                 sema_p(&cmap->cmap_throttle_sem);
2115                 sema_v(&cmap->cmap_throttle_sem);
2116         }
2117         sema_destroy(&cmap->cmap_throttle_sem);
2118 
2119         /* remove kstats */
2120         fssnap_delete_kstats(cowp);
2121 
2122         kmem_free(cowp, sizeof (struct cow_info));
2123 
2124         statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
2125         if (statesidpp == NULL || *statesidpp == NULL) {
2126                 cmn_err(CE_WARN,
2127                     "fssnap_delete_impl: could not find state for snapshot %d.",
2128                     sidp->sid_snapnumber);
2129         }
2130         ASSERT(*statesidpp == sidp);
2131 
2132         /*
2133          * Leave the node in the list marked DISABLED so it can be reused
2134          * and avoid many race conditions.  Return the snapshot number
2135          * that was deleted.
2136          */
2137         mutex_enter(&snapshot_mutex);
2138         rw_enter(&sidp->sid_rwlock, RW_WRITER);
2139         sidp->sid_flags &= ~(SID_DISABLING);
2140         sidp->sid_flags |= SID_DISABLED;
2141         VN_RELE(sidp->sid_fvp);
2142         sidp->sid_fvp = NULL;
2143         snapnumber = sidp->sid_snapnumber;
2144 
2145         /*
2146          * If the snapshot is not busy, free the device info now.  Otherwise
2147          * the device nodes are freed in snap_close() when the device is
2148          * closed.  The sid will not be reused until the device is not busy.
2149          */
2150         if (SID_AVAILABLE(sidp)) {
2151                 /* remove the device nodes */
2152                 ASSERT(fssnap_dip != NULL);
2153                 (void) snprintf(name, sizeof (name), "%d",
2154                     sidp->sid_snapnumber);
2155                 ddi_remove_minor_node(fssnap_dip, name);
2156                 (void) snprintf(name, sizeof (name), "%d,raw",
2157                     sidp->sid_snapnumber);
2158                 ddi_remove_minor_node(fssnap_dip, name);
2159 
2160                 /* delete the state structure */
2161                 ddi_soft_state_free(statep, sidp->sid_snapnumber);
2162                 num_snapshots--;
2163         }
2164 
2165         mutex_exit(&snapshot_mutex);
2166         rw_exit(&sidp->sid_rwlock);
2167 
2168         return (snapnumber);
2169 }
2170 
2171 /*
2172  * fssnap_create_kstats() - allocate and initialize snapshot kstats
2173  *
2174  */
2175 static void
2176 fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
2177     const char *mountpoint, const char *backfilename)
2178 {
2179         kstat_t *num, *mntpoint, *bfname;
2180         kstat_named_t *hw;
2181         struct cow_info *cowp = sidp->sid_cowinfo;
2182         struct cow_kstat_num *stats;
2183 
2184         /* update the high water mark */
2185         if (fssnap_highwater_kstat == NULL) {
2186                 cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
2187                     "high water mark kstat.");
2188                 return;
2189         }
2190 
2191         hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
2192         if (hw->value.ui32 < snapnum)
2193                 hw->value.ui32 = snapnum;
2194 
2195         /* initialize the mount point kstat */
2196         kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
2197 
2198         if (mountpoint != NULL) {
2199                 mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
2200                     "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
2201                 if (mntpoint == NULL) {
2202                         cowp->cow_kstat_mntpt = NULL;
2203                         cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2204                             "create mount point kstat");
2205                 } else {
2206                         (void) strncpy(mntpoint->ks_data, mountpoint,
2207                             strlen(mountpoint));
2208                         cowp->cow_kstat_mntpt = mntpoint;
2209                         kstat_install(mntpoint);
2210                 }
2211         } else {
2212                 cowp->cow_kstat_mntpt = NULL;
2213                 cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
2214                     "specified.");
2215         }
2216 
2217         /* initialize the backing file kstat */
2218         kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
2219 
2220         if (backfilename == NULL) {
2221                 cowp->cow_kstat_bfname = NULL;
2222         } else {
2223                 bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
2224                     "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
2225                 if (bfname != NULL) {
2226                         (void) strncpy(bfname->ks_data, backfilename,
2227                             strlen(backfilename));
2228                         cowp->cow_kstat_bfname = bfname;
2229                         kstat_install(bfname);
2230                 } else {
2231                         cowp->cow_kstat_bfname = NULL;
2232                         cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2233                             "create backing file name kstat");
2234                 }
2235         }
2236 
2237         /* initialize numeric kstats */
2238         kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
2239 
2240         num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
2241             "misc", KSTAT_TYPE_NAMED,
2242             sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
2243             0);
2244         if (num == NULL) {
2245                 cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
2246                     "numeric kstats");
2247                 cowp->cow_kstat_num = NULL;
2248                 return;
2249         }
2250 
2251         cowp->cow_kstat_num = num;
2252         stats = num->ks_data;
2253         num->ks_update = fssnap_update_kstat_num;
2254         num->ks_private = sidp;
2255 
2256         kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
2257             KSTAT_DATA_INT32);
2258         kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
2259             KSTAT_DATA_UINT64);
2260         kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
2261             KSTAT_DATA_UINT64);
2262         kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
2263             KSTAT_DATA_LONG);
2264         kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
2265             KSTAT_DATA_UINT32);
2266 
2267         /* initialize the static kstats */
2268         stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
2269         stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
2270         stats->ckn_createtime.value.l = gethrestime_sec();
2271 
2272         kstat_install(num);
2273 }
2274 
2275 /*
2276  * fssnap_update_kstat_num() - update a numerical snapshot kstat value
2277  *
2278  */
2279 int
2280 fssnap_update_kstat_num(kstat_t *ksp, int rw)
2281 {
2282         snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
2283         struct cow_info *cowp = sidp->sid_cowinfo;
2284         struct cow_kstat_num *stats = ksp->ks_data;
2285 
2286         if (rw == KSTAT_WRITE)
2287                 return (EACCES);
2288 
2289         /* state */
2290         if (sidp->sid_flags & SID_CREATING)
2291                 stats->ckn_state.value.i32 = COWSTATE_CREATING;
2292         else if (SID_INACTIVE(sidp))
2293                 stats->ckn_state.value.i32 = COWSTATE_DISABLED;
2294         else if (SID_BUSY(sidp))
2295                 stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
2296         else
2297                 stats->ckn_state.value.i32 = COWSTATE_IDLE;
2298 
2299         /* bfsize */
2300         stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
2301             cowp->cow_map.cmap_chunksz;
2302 
2303         return (0);
2304 }
2305 
2306 /*
2307  * fssnap_delete_kstats() - deallocate snapshot kstats
2308  *
2309  */
2310 void
2311 fssnap_delete_kstats(struct cow_info *cowp)
2312 {
2313         if (cowp->cow_kstat_num != NULL) {
2314                 kstat_delete(cowp->cow_kstat_num);
2315                 cowp->cow_kstat_num = NULL;
2316         }
2317         if (cowp->cow_kstat_mntpt != NULL) {
2318                 kstat_delete(cowp->cow_kstat_mntpt);
2319                 cowp->cow_kstat_mntpt = NULL;
2320         }
2321         if (cowp->cow_kstat_bfname != NULL) {
2322                 kstat_delete(cowp->cow_kstat_bfname);
2323                 cowp->cow_kstat_bfname = NULL;
2324         }
2325 }