1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include <sys/file.h>
  26 #include <sys/stat.h>
  27 #include <sys/atomic.h>
  28 #include <sys/mntio.h>
  29 #include <sys/mnttab.h>
  30 #include <sys/mount.h>
  31 #include <sys/sunddi.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/systm.h>
  34 #include <sys/vfs.h>
  35 #include <sys/vfs_opreg.h>
  36 #include <sys/fs/mntdata.h>
  37 #include <fs/fs_subr.h>
  38 #include <sys/vmsystm.h>
  39 #include <vm/seg_vn.h>
  40 #include <sys/time.h>
  41 #include <sys/ksynch.h>
  42 #include <sys/sdt.h>
  43 
/*
 * NOTE(review): presumably the inode number reported for the root of an mntfs
 * mount; its users are outside this part of the file -- confirm.
 */
#define MNTROOTINO      2

/*
 * Forward declaration. The function is static, so its definition lives
 * elsewhere in this translation unit.
 */
static mntnode_t *mntgetnode(vnode_t *);

/*
 * NOTE(review): presumably the vnode operations vector for mntfs; it is not
 * initialised in the code visible here -- confirm where it is set up.
 */
vnodeops_t *mntvnodeops;
/* Defined outside this file. */
extern void vfs_mnttab_readop(void);
  50 
  51 /*
  52  * Design of kernel mnttab accounting.
  53  *
  54  * mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of
  55  * the mounted resources: the read-only file /etc/mnttab, and a collection of
  56  * ioctl() commands. Most of these interfaces are public and are described in
  57  * mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT,
  58  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C)
  59  * family of functions, allowing them to support white space in mount names.
  60  *
  61  * A significant feature of mntfs is that it provides a file descriptor with a
  62  * snapshot once it begins to consume mnttab data. Thus, as the process
  63  * continues to consume data, its view of the in-kernel mnttab does not change
  64  * even if resources are mounted or unmounted. The intent is to ensure that
  65  * processes are guaranteed to read self-consistent data even as the system
  66  * changes.
  67  *
  68  * The snapshot is implemented by a "database", unique to each zone, that
  69  * comprises a linked list of mntelem_ts. The database is identified by
  70  * zone_mntfs_db and is protected by zone_mntfs_db_lock. Each element contains
  71  * the text entry in /etc/mnttab for a mounted resource, i.e. a vfs_t, and is
  72  * marked with its time of "birth", i.e. creation. An element is "killed", and
  73  * marked with its time of death, when it is found to be out of date, e.g. when
  74  * the corresponding resource has been unmounted.
  75  *
  76  * When a process performs the first read() or ioctl() for a file descriptor for
  77  * /etc/mnttab, the database is updated by a call to mntfs_snapshot() to ensure
  78  * that an element exists for each currently mounted resource. Following this,
  79  * the current time is written into a snapshot structure, a mntsnap_t, embedded
  80  * in the descriptor's mntnode_t.
  81  *
  82  * mntfs is able to enumerate the /etc/mnttab entries corresponding to a
  83  * particular file descriptor by searching the database for entries that were
  84  * born before the appropriate snapshot and that either are still alive or died
  85  * after the snapshot was created. Consumers use the iterator function
  86  * mntfs_get_next_elem() to identify the next suitable element in the database.
  87  *
  88  * Each snapshot has a hold on its corresponding database elements, effected by
  89  * a per-element reference count. At last close(), a snapshot is destroyed in
  90  * mntfs_freesnap() by releasing all of its holds; an element is destroyed if
  91  * its reference count becomes zero. Therefore the database never exists unless
  92  * there is at least one active consumer of /etc/mnttab.
  93  *
  94  * getmntent(3C) et al. "do not open, close or rewind the file." This implies
  95  * that getmntent() and read() must be able to operate without interaction on
  96  * the same file descriptor; this is accomplished by the use of separate
  97  * mntsnap_ts for both read() and ioctl().
  98  *
  99  * mntfs observes the following lock-ordering:
 100  *
 101  *      mnp->mnt_contents -> vfslist -> zonep->zone_mntfs_db_lock
 102  *
 103  * NOTE: The following variable enables the generation of the "dev=xxx"
 104  * in the option string for a mounted file system.  Really this should
 105  * be gotten rid of altogether, but for the sake of backwards compatibility
 106  * we had to leave it in.  It is defined as a 32-bit device number.  This
 107  * means that when 64-bit device numbers are in use, if either the major or
 108  * minor part of the device number will not fit in a 16 bit quantity, the
 109  * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
 110  * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
 111  * device number handles this check and assigns the proper value.
 112  */
int mntfs_enabledev = 1;        /* enable old "dev=xxx" option */

/*
 * Defined outside this file. mntfs uses it to stamp the birth and death times
 * of database elements.
 */
extern void vfs_mono_time(timespec_t *);
/*
 * Results of mntfs_newest(): the first argument is newer, the second argument
 * is newer, or the two are identical.
 */
enum { MNTFS_FIRST, MNTFS_SECOND, MNTFS_NEITHER };

/*
 * Determine whether a field within a line from /etc/mnttab contains actual
 * content or simply the marker string "-". This never applies to the time,
 * therefore the delimiter must be a tab.
 *
 * The macro is non-zero unless the text at x is exactly '-' immediately
 * followed by a tab. Note that x is evaluated twice.
 */
#define MNTFS_REAL_FIELD(x)     (*(x) != '-' || *((x) + 1) != '\t')
 124 
 125 static int
 126 mntfs_devsize(struct vfs *vfsp)
 127 {
 128         dev32_t odev;
 129 
 130         (void) cmpldev(&odev, vfsp->vfs_dev);
 131         return (snprintf(NULL, 0, "dev=%x", odev));
 132 }
 133 
 134 static int
 135 mntfs_devprint(struct vfs *vfsp, char *buf)
 136 {
 137         dev32_t odev;
 138 
 139         (void) cmpldev(&odev, vfsp->vfs_dev);
 140         return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
 141 }
 142 
 143 /* Identify which, if either, of two supplied timespec structs is newer. */
 144 static int
 145 mntfs_newest(timespec_t *a, timespec_t *b)
 146 {
 147         if (a->tv_sec == b->tv_sec &&
 148             a->tv_nsec == b->tv_nsec) {
 149                 return (MNTFS_NEITHER);
 150         } else if (b->tv_sec > a->tv_sec ||
 151             (b->tv_sec == a->tv_sec &&
 152             b->tv_nsec > a->tv_nsec)) {
 153                 return (MNTFS_SECOND);
 154         } else {
 155                 return (MNTFS_FIRST);
 156         }
 157 }
 158 
 159 static int
 160 mntfs_optsize(struct vfs *vfsp)
 161 {
 162         int i, size = 0;
 163         mntopt_t *mop;
 164 
 165         for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
 166                 mop = &vfsp->vfs_mntopts.mo_list[i];
 167                 if (mop->mo_flags & MO_NODISPLAY)
 168                         continue;
 169                 if (mop->mo_flags & MO_SET) {
 170                         if (size)
 171                                 size++; /* space for comma */
 172                         size += strlen(mop->mo_name);
 173                         /*
 174                          * count option value if there is one
 175                          */
 176                         if (mop->mo_arg != NULL) {
 177                                 size += strlen(mop->mo_arg) + 1;
 178                         }
 179                 }
 180         }
 181         if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
 182                 /*
 183                  * Add space for "zone=<zone_name>" if required.
 184                  */
 185                 if (size)
 186                         size++; /* space for comma */
 187                 size += sizeof ("zone=") - 1;
 188                 size += strlen(vfsp->vfs_zone->zone_name);
 189         }
 190         if (mntfs_enabledev) {
 191                 if (size != 0)
 192                         size++; /* space for comma */
 193                 size += mntfs_devsize(vfsp);
 194         }
 195         if (size == 0)
 196                 size = strlen("-");
 197         return (size);
 198 }
 199 
 200 static int
 201 mntfs_optprint(struct vfs *vfsp, char *buf)
 202 {
 203         int i, optinbuf = 0;
 204         mntopt_t *mop;
 205         char *origbuf = buf;
 206 
 207         for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
 208                 mop = &vfsp->vfs_mntopts.mo_list[i];
 209                 if (mop->mo_flags & MO_NODISPLAY)
 210                         continue;
 211                 if (mop->mo_flags & MO_SET) {
 212                         if (optinbuf)
 213                                 *buf++ = ',';
 214                         else
 215                                 optinbuf = 1;
 216                         buf += snprintf(buf, MAX_MNTOPT_STR,
 217                             "%s", mop->mo_name);
 218                         /*
 219                          * print option value if there is one
 220                          */
 221                         if (mop->mo_arg != NULL) {
 222                                 buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
 223                                     mop->mo_arg);
 224                         }
 225                 }
 226         }
 227         if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
 228                 if (optinbuf)
 229                         *buf++ = ',';
 230                 else
 231                         optinbuf = 1;
 232                 buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
 233                     vfsp->vfs_zone->zone_name);
 234         }
 235         if (mntfs_enabledev) {
 236                 if (optinbuf++)
 237                         *buf++ = ',';
 238                 buf += mntfs_devprint(vfsp, buf);
 239         }
 240         if (!optinbuf) {
 241                 buf += snprintf(buf, MAX_MNTOPT_STR, "-");
 242         }
 243         return (buf - origbuf);
 244 }
 245 
 246 void
 247 mntfs_populate_text(vfs_t *vfsp, zone_t *zonep, mntelem_t *elemp)
 248 {
 249         struct extmnttab *tabp = &elemp->mnte_tab;
 250         const char *resource, *mntpt;
 251         char *cp = elemp->mnte_text;
 252         mntpt = refstr_value(vfsp->vfs_mntpt);
 253         resource = refstr_value(vfsp->vfs_resource);
 254 
 255         tabp->mnt_special = 0;
 256         if (resource != NULL && resource[0] != '\0') {
 257                 if (resource[0] != '/') {
 258                         cp += snprintf(cp, MAXPATHLEN, "%s\t", resource);
 259                 } else if (!ZONE_PATH_VISIBLE(resource, zonep)) {
 260                         /*
 261                          * Use the mount point as the resource.
 262                          */
 263                         cp += snprintf(cp, MAXPATHLEN, "%s\t",
 264                             ZONE_PATH_TRANSLATE(mntpt, zonep));
 265                 } else {
 266                         cp += snprintf(cp, MAXPATHLEN, "%s\t",
 267                             ZONE_PATH_TRANSLATE(resource, zonep));
 268                 }
 269         } else {
 270                 cp += snprintf(cp, MAXPATHLEN, "-\t");
 271         }
 272 
 273         tabp->mnt_mountp = (char *)(cp - elemp->mnte_text);
 274         if (mntpt != NULL && mntpt[0] != '\0') {
 275                 /*
 276                  * We know the mount point is visible from within the zone,
 277                  * otherwise it wouldn't be on the zone's vfs list.
 278                  */
 279                 cp += snprintf(cp, MAXPATHLEN, "%s\t",
 280                     ZONE_PATH_TRANSLATE(mntpt, zonep));
 281         } else {
 282                 cp += snprintf(cp, MAXPATHLEN, "-\t");
 283         }
 284 
 285         tabp->mnt_fstype = (char *)(cp - elemp->mnte_text);
 286         cp += snprintf(cp, MAXPATHLEN, "%s\t",
 287             vfssw[vfsp->vfs_fstype].vsw_name);
 288 
 289         tabp->mnt_mntopts = (char *)(cp - elemp->mnte_text);
 290         cp += mntfs_optprint(vfsp, cp);
 291         *cp++ = '\t';
 292 
 293         tabp->mnt_time = (char *)(cp - elemp->mnte_text);
 294         cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
 295         *cp++ = '\n'; /* over-write snprintf's trailing null-byte */
 296 
 297         tabp->mnt_major = getmajor(vfsp->vfs_dev);
 298         tabp->mnt_minor = getminor(vfsp->vfs_dev);
 299 
 300         elemp->mnte_text_size = cp - elemp->mnte_text;
 301         elemp->mnte_vfs_ctime = vfsp->vfs_hrctime;
 302         elemp->mnte_hidden = vfsp->vfs_flag & VFS_NOMNTTAB;
 303 }
 304 
 305 /* Determine the length of the /etc/mnttab entry for this vfs_t. */
 306 static size_t
 307 mntfs_text_len(vfs_t *vfsp, zone_t *zone)
 308 {
 309         size_t size = 0;
 310         const char *resource, *mntpt;
 311         size_t mntsize;
 312 
 313         mntpt = refstr_value(vfsp->vfs_mntpt);
 314         if (mntpt != NULL && mntpt[0] != '\0') {
 315                 mntsize = strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
 316         } else {
 317                 mntsize = 2;    /* "-\t" */
 318         }
 319         size += mntsize;
 320 
 321         resource = refstr_value(vfsp->vfs_resource);
 322         if (resource != NULL && resource[0] != '\0') {
 323                 if (resource[0] != '/') {
 324                         size += strlen(resource) + 1;
 325                 } else if (!ZONE_PATH_VISIBLE(resource, zone)) {
 326                         /*
 327                          * Same as the zone's view of the mount point.
 328                          */
 329                         size += mntsize;
 330                 } else {
 331                         size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
 332                 }
 333         } else {
 334                 size += 2;      /* "-\t" */
 335         }
 336         size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
 337         size += mntfs_optsize(vfsp);
 338         size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
 339         return (size);
 340 }
 341 
 342 /* Destroy the resources associated with a snapshot element. */
 343 static void
 344 mntfs_destroy_elem(mntelem_t *elemp)
 345 {
 346         kmem_free(elemp->mnte_text, elemp->mnte_text_size);
 347         kmem_free(elemp, sizeof (mntelem_t));
 348 }
 349 
 350 /*
 351  * Return 1 if the given snapshot is in the range of the given element; return
 352  * 0 otherwise.
 353  */
 354 static int
 355 mntfs_elem_in_range(mntsnap_t *snapp, mntelem_t *elemp)
 356 {
 357         timespec_t      *stimep = &snapp->mnts_time;
 358         timespec_t      *btimep = &elemp->mnte_birth;
 359         timespec_t      *dtimep = &elemp->mnte_death;
 360 
 361         /*
 362          * If a snapshot is in range of an element then the snapshot must have
 363          * been created after the birth of the element, and either the element
 364          * is still alive or it died after the snapshot was created.
 365          */
 366         if (mntfs_newest(btimep, stimep) == MNTFS_SECOND &&
 367             (MNTFS_ELEM_IS_ALIVE(elemp) ||
 368             mntfs_newest(stimep, dtimep) == MNTFS_SECOND))
 369                 return (1);
 370         else
 371                 return (0);
 372 }
 373 
 374 /*
 375  * Return the next valid database element, after the one provided, for a given
 376  * snapshot; return NULL if none exists. The caller must hold the zone's
 377  * database lock as a reader before calling this function.
 378  */
 379 static mntelem_t *
 380 mntfs_get_next_elem(mntsnap_t *snapp, mntelem_t *elemp)
 381 {
 382         int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
 383 
 384         do {
 385                 elemp = elemp->mnte_next;
 386         } while (elemp &&
 387             (!mntfs_elem_in_range(snapp, elemp) ||
 388             (!show_hidden && elemp->mnte_hidden)));
 389         return (elemp);
 390 }
 391 
 392 /*
 393  * This function frees the resources associated with a mntsnap_t. It walks
 394  * through the database, decrementing the reference count of any element that
 395  * satisfies the snapshot. If the reference count of an element becomes zero
 396  * then it is removed from the database.
 397  */
 398 static void
 399 mntfs_freesnap(mntnode_t *mnp, mntsnap_t *snapp)
 400 {
 401         zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
 402         krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
 403         mntelem_t **elempp = &zonep->zone_mntfs_db;
 404         mntelem_t *elemp;
 405         int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
 406         size_t number_decremented = 0;
 407 
 408         ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
 409 
 410         /* Ignore an uninitialised snapshot. */
 411         if (snapp->mnts_nmnts == 0)
 412                 return;
 413 
 414         /* Drop the holds on any matching database elements. */
 415         rw_enter(dblockp, RW_WRITER);
 416         while ((elemp = *elempp) != NULL) {
 417                 if (mntfs_elem_in_range(snapp, elemp) &&
 418                     (!elemp->mnte_hidden || show_hidden) &&
 419                     ++number_decremented && --elemp->mnte_refcnt == 0) {
 420                         if ((*elempp = elemp->mnte_next) != NULL)
 421                                 (*elempp)->mnte_prev = elemp->mnte_prev;
 422                         mntfs_destroy_elem(elemp);
 423                 } else {
 424                         elempp = &elemp->mnte_next;
 425                 }
 426         }
 427         rw_exit(dblockp);
 428         ASSERT(number_decremented == snapp->mnts_nmnts);
 429 
 430         /* Clear the snapshot data. */
 431         bzero(snapp, sizeof (mntsnap_t));
 432 }
 433 
 434 /* Insert the new database element newp after the existing element prevp. */
 435 static void
 436 mntfs_insert_after(mntelem_t *newp, mntelem_t *prevp)
 437 {
 438         newp->mnte_prev = prevp;
 439         newp->mnte_next = prevp->mnte_next;
 440         prevp->mnte_next = newp;
 441         if (newp->mnte_next != NULL)
 442                 newp->mnte_next->mnte_prev = newp;
 443 }
 444 
 445 /* Create and return a copy of a given database element. */
 446 static mntelem_t *
 447 mntfs_copy(mntelem_t *origp)
 448 {
 449         mntelem_t *copyp;
 450 
 451         copyp = kmem_zalloc(sizeof (mntelem_t), KM_SLEEP);
 452         copyp->mnte_vfs_ctime = origp->mnte_vfs_ctime;
 453         copyp->mnte_text_size = origp->mnte_text_size;
 454         copyp->mnte_text = kmem_alloc(copyp->mnte_text_size, KM_SLEEP);
 455         bcopy(origp->mnte_text, copyp->mnte_text, copyp->mnte_text_size);
 456         copyp->mnte_tab = origp->mnte_tab;
 457         copyp->mnte_hidden = origp->mnte_hidden;
 458 
 459         return (copyp);
 460 }
 461 
 462 /*
 463  * Compare two database elements and determine whether or not the vfs_t payload
 464  * data of each are the same. Return 1 if so and 0 otherwise.
 465  */
 466 static int
 467 mntfs_is_same_element(mntelem_t *a, mntelem_t *b)
 468 {
 469         if (a->mnte_hidden == b->mnte_hidden &&
 470             a->mnte_text_size == b->mnte_text_size &&
 471             bcmp(a->mnte_text, b->mnte_text, a->mnte_text_size) == 0 &&
 472             bcmp(&a->mnte_tab, &b->mnte_tab, sizeof (struct extmnttab)) == 0)
 473                 return (1);
 474         else
 475                 return (0);
 476 }
 477 
 478 /*
 479  * mntfs_snapshot() updates the database, creating it if necessary, so that it
 480  * accurately reflects the state of the in-kernel mnttab. It also increments
 481  * the reference count on all database elements that correspond to currently-
 482  * mounted resources. Finally, it initialises the appropriate snapshot
 483  * structure.
 484  *
 485  * Each vfs_t is given a high-resolution time stamp, for the benefit of mntfs,
 486  * when it is inserted into the in-kernel mnttab. This time stamp is copied into
 487  * the corresponding database element when it is created, allowing the element
 488  * and the vfs_t to be identified as a pair. It is possible that some file
 489  * systems may make unadvertised changes to, for example, a resource's mount
 490  * options. Therefore, in order to determine whether a database element is an
 491  * up-to-date representation of a given vfs_t, it is compared with a temporary
 492  * element generated for this purpose. Although less efficient, this is safer
 493  * than implementing an mtime for a vfs_t.
 494  *
 495  * Some mounted resources are marked as "hidden" with a VFS_NOMNTTAB flag. These
 496  * are considered invisible unless the user has already set the MNT_SHOWHIDDEN
 497  * flag in the vnode using the MNTIOC_SHOWHIDDEN ioctl.
 498  */
 499 static void
 500 mntfs_snapshot(mntnode_t *mnp, mntsnap_t *snapp)
 501 {
 502         mntdata_t       *mnd = MTOD(mnp);
 503         zone_t          *zonep = mnd->mnt_zone_ref.zref_zone;
 504         int             is_global_zone = (zonep == global_zone);
 505         int             show_hidden = mnp->mnt_flags & MNT_SHOWHIDDEN;
 506         vfs_t           *vfsp, *firstvfsp, *lastvfsp;
 507         vfs_t           dummyvfs;
 508         vfs_t           *dummyvfsp = NULL;
 509         krwlock_t       *dblockp = &zonep->zone_mntfs_db_lock;
 510         mntelem_t       **headpp = &zonep->zone_mntfs_db;
 511         mntelem_t       *elemp;
 512         mntelem_t       *prevp = NULL;
 513         int             order;
 514         mntelem_t       *tempelemp;
 515         mntelem_t       *newp;
 516         mntelem_t       *firstp = NULL;
 517         size_t          nmnts = 0;
 518         size_t          total_text_size = 0;
 519         size_t          normal_text_size = 0;
 520         int             insert_before;
 521         timespec_t      last_mtime;
 522         size_t          entry_length, new_entry_length;
 523 
 524 
 525         ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
 526         vfs_list_read_lock();
 527         vfs_mnttab_modtime(&last_mtime);
 528 
 529         /*
 530          * If this snapshot already exists then we must have been asked to
 531          * rewind the file, i.e. discard the snapshot and create a new one in
 532          * its place. In this case we first see if the in-kernel mnttab has
 533          * advertised a change; if not then we simply reinitialise the metadata.
 534          */
 535         if (snapp->mnts_nmnts) {
 536                 if (mntfs_newest(&last_mtime, &snapp->mnts_last_mtime) ==
 537                     MNTFS_NEITHER) {
 538                         /*
 539                          * An unchanged mtime is no guarantee that the
 540                          * in-kernel mnttab is unchanged; for example, a
 541                          * concurrent remount may be between calls to
 542                          * vfs_setmntopt_nolock() and vfs_mnttab_modtimeupd().
 543                          * It follows that the database may have changed, and
 544                          * in particular that some elements in this snapshot
 545                          * may have been killed by another call to
 546                          * mntfs_snapshot(). It is therefore not merely
 547                          * unnecessary to update the snapshot's time but in
 548                          * fact dangerous; it needs to be left alone.
 549                          */
 550                         snapp->mnts_next = snapp->mnts_first;
 551                         snapp->mnts_flags &= ~MNTS_REWIND;
 552                         snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
 553                         vfs_list_unlock();
 554                         return;
 555                 } else {
 556                         mntfs_freesnap(mnp, snapp);
 557                 }
 558         }
 559 
 560         /*
 561          * Create a temporary database element. For each vfs_t, the temporary
 562          * element will be populated with the corresponding text. If the vfs_t
 563          * does not have a corresponding element within the database, or if
 564          * there is such an element but it is stale, a copy of the temporary
 565          * element is inserted into the database at the appropriate location.
 566          */
 567         tempelemp = kmem_alloc(sizeof (mntelem_t), KM_SLEEP);
 568         entry_length = MNT_LINE_MAX;
 569         tempelemp->mnte_text = kmem_alloc(entry_length, KM_SLEEP);
 570 
 571         /* Find the first and last vfs_t for the given zone. */
 572         if (is_global_zone) {
 573                 firstvfsp = rootvfs;
 574                 lastvfsp = firstvfsp->vfs_prev;
 575         } else {
 576                 firstvfsp = zonep->zone_vfslist;
 577                 /*
 578                  * If there isn't already a vfs_t for root then we create a
 579                  * dummy which will be used as the head of the list (which will
 580                  * therefore no longer be circular).
 581                  */
 582                 if (firstvfsp == NULL ||
 583                     strcmp(refstr_value(firstvfsp->vfs_mntpt),
 584                     zonep->zone_rootpath) != 0) {
 585                         /*
 586                          * The zone's vfs_ts will have mount points relative to
 587                          * the zone's root path. The vfs_t for the zone's
 588                          * root file system would therefore have a mount point
 589                          * equal to the zone's root path. Since the zone's root
 590                          * path isn't a mount point, we copy the vfs_t of the
 591                          * zone's root vnode, and provide it with a fake mount
 592                          * and resource. However, if the zone's root is a
 593                          * zfs dataset, use the dataset name as the resource.
 594                          *
 595                          * Note that by cloning another vfs_t we also acquire
 596                          * its high-resolution ctime. This might appear to
 597                          * violate the requirement that the ctimes in the list
 598                          * of vfs_ts are unique and monotonically increasing;
 599                          * this is not the case. The dummy vfs_t appears in only
 600                          * a non-global zone's vfs_t list, where the cloned
 601                          * vfs_t would not ordinarily be visible; the ctimes are
 602                          * therefore unique. The zone's root path must be
 603                          * available before the zone boots, and so its root
 604                          * vnode's vfs_t's ctime must be lower than those of any
 605                          * resources subsequently mounted by the zone. The
 606                          * ctimes are therefore monotonically increasing.
 607                          */
 608                         dummyvfs = *zonep->zone_rootvp->v_vfsp;
 609                         dummyvfs.vfs_mntpt = refstr_alloc(zonep->zone_rootpath);
 610                         if (strcmp(vfssw[dummyvfs.vfs_fstype].vsw_name, "zfs")
 611                             != 0)
 612                                 dummyvfs.vfs_resource = dummyvfs.vfs_mntpt;
 613                         dummyvfsp = &dummyvfs;
 614                         if (firstvfsp == NULL) {
 615                                 lastvfsp = dummyvfsp;
 616                         } else {
 617                                 lastvfsp = firstvfsp->vfs_zone_prev;
 618                                 dummyvfsp->vfs_zone_next = firstvfsp;
 619                         }
 620                         firstvfsp = dummyvfsp;
 621                 } else {
 622                         lastvfsp = firstvfsp->vfs_zone_prev;
 623                 }
 624         }
 625 
 626         /*
 627          * Now walk through all the vfs_ts for this zone. For each one, find the
 628          * corresponding database element, creating it first if necessary, and
 629          * increment its reference count.
 630          */
 631         rw_enter(dblockp, RW_WRITER);
 632         elemp = zonep->zone_mntfs_db;
 633         /* CSTYLED */
 634         for (vfsp = firstvfsp;;
 635             vfsp = is_global_zone ? vfsp->vfs_next : vfsp->vfs_zone_next) {
 636                 DTRACE_PROBE1(new__vfs, vfs_t *, vfsp);
 637                 /* Consider only visible entries. */
 638                 if ((vfsp->vfs_flag & VFS_NOMNTTAB) == 0 || show_hidden) {
 639                         /*
 640                          * Walk through the existing database looking for either
 641                          * an element that matches the current vfs_t, or for the
 642                          * correct place in which to insert a new element.
 643                          */
 644                         insert_before = 0;
 645                         for (; elemp; prevp = elemp, elemp = elemp->mnte_next) {
 646                                 DTRACE_PROBE1(considering__elem, mntelem_t *,
 647                                     elemp);
 648 
 649                                 /* Compare the vfs_t with the element. */
 650                                 order = mntfs_newest(&elemp->mnte_vfs_ctime,
 651                                     &vfsp->vfs_hrctime);
 652 
 653                                 /*
 654                                  * If we encounter a database element newer than
 655                                  * this vfs_t then we've stepped over a gap
 656                                  * where the element for this vfs_t must be
 657                                  * inserted.
 658                                  */
 659                                 if (order == MNTFS_FIRST) {
 660                                         insert_before = 1;
 661                                         break;
 662                                 }
 663 
 664                                 /* Dead elements no longer interest us. */
 665                                 if (MNTFS_ELEM_IS_DEAD(elemp))
 666                                         continue;
 667 
 668                                 /*
 669                                  * If the time stamps are the same then the
 670                                  * element is potential match for the vfs_t,
 671                                  * although it may later prove to be stale.
 672                                  */
 673                                 if (order == MNTFS_NEITHER)
 674                                         break;
 675 
 676                                 /*
 677                                  * This element must be older than the vfs_t.
 678                                  * It must, therefore, correspond to a vfs_t
 679                                  * that has been unmounted. Since the element is
 680                                  * still alive, we kill it if it is visible.
 681                                  */
 682                                 if (!elemp->mnte_hidden || show_hidden)
 683                                         vfs_mono_time(&elemp->mnte_death);
 684                         }
 685                         DTRACE_PROBE2(possible__match, vfs_t *, vfsp,
 686                             mntelem_t *, elemp);
 687 
 688                         /* Create a new database element if required. */
 689                         new_entry_length = mntfs_text_len(vfsp, zonep);
 690                         if (new_entry_length > entry_length) {
 691                                 kmem_free(tempelemp->mnte_text, entry_length);
 692                                 tempelemp->mnte_text =
 693                                     kmem_alloc(new_entry_length, KM_SLEEP);
 694                                 entry_length = new_entry_length;
 695                         }
 696                         mntfs_populate_text(vfsp, zonep, tempelemp);
 697                         ASSERT(tempelemp->mnte_text_size == new_entry_length);
 698                         if (elemp == NULL) {
 699                                 /*
 700                                  * We ran off the end of the database. Insert a
 701                                  * new element at the end.
 702                                  */
 703                                 newp = mntfs_copy(tempelemp);
 704                                 vfs_mono_time(&newp->mnte_birth);
 705                                 if (prevp) {
 706                                         mntfs_insert_after(newp, prevp);
 707                                 } else {
 708                                         newp->mnte_next = NULL;
 709                                         newp->mnte_prev = NULL;
 710                                         ASSERT(*headpp == NULL);
 711                                         *headpp = newp;
 712                                 }
 713                                 elemp = newp;
 714                         } else if (insert_before) {
 715                                 /*
 716                                  * Insert a new element before the current one.
 717                                  */
 718                                 newp = mntfs_copy(tempelemp);
 719                                 vfs_mono_time(&newp->mnte_birth);
 720                                 if (prevp) {
 721                                         mntfs_insert_after(newp, prevp);
 722                                 } else {
 723                                         newp->mnte_next = elemp;
 724                                         newp->mnte_prev = NULL;
 725                                         elemp->mnte_prev = newp;
 726                                         ASSERT(*headpp == elemp);
 727                                         *headpp = newp;
 728                                 }
 729                                 elemp = newp;
 730                         } else if (!mntfs_is_same_element(elemp, tempelemp)) {
 731                                 /*
 732                                  * The element corresponds to the vfs_t, but the
 733                                  * vfs_t has changed; it must have been
 734                                  * remounted. Kill the old element and insert a
 735                                  * new one after it.
 736                                  */
 737                                 vfs_mono_time(&elemp->mnte_death);
 738                                 newp = mntfs_copy(tempelemp);
 739                                 vfs_mono_time(&newp->mnte_birth);
 740                                 mntfs_insert_after(newp, elemp);
 741                                 elemp = newp;
 742                         }
 743 
 744                         /* We've found the corresponding element. Hold it. */
 745                         DTRACE_PROBE1(incrementing, mntelem_t *, elemp);
 746                         elemp->mnte_refcnt++;
 747 
 748                         /*
 749                          * Update the parameters used to initialise the
 750                          * snapshot.
 751                          */
 752                         nmnts++;
 753                         total_text_size += elemp->mnte_text_size;
 754                         if (!elemp->mnte_hidden)
 755                                 normal_text_size += elemp->mnte_text_size;
 756                         if (!firstp)
 757                                 firstp = elemp;
 758 
 759                         prevp = elemp;
 760                         elemp = elemp->mnte_next;
 761                 }
 762 
 763                 if (vfsp == lastvfsp)
 764                         break;
 765         }
 766 
 767         /*
 768          * Any remaining visible database elements that are still alive must be
 769          * killed now, because their corresponding vfs_ts must have been
 770          * unmounted.
 771          */
 772         for (; elemp; elemp = elemp->mnte_next) {
 773                 if (MNTFS_ELEM_IS_ALIVE(elemp) &&
 774                     (!elemp->mnte_hidden || show_hidden))
 775                         vfs_mono_time(&elemp->mnte_death);
 776         }
 777 
 778         /* Initialise the snapshot. */
 779         vfs_mono_time(&snapp->mnts_time);
 780         snapp->mnts_last_mtime = last_mtime;
 781         snapp->mnts_first = snapp->mnts_next = firstp;
 782         snapp->mnts_flags = show_hidden ? MNTS_SHOWHIDDEN : 0;
 783         snapp->mnts_nmnts = nmnts;
 784         snapp->mnts_text_size = total_text_size;
 785         snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
 786 
 787         /*
 788          * Record /etc/mnttab's current size and mtime for possible future use
 789          * by mntgetattr().
 790          */
 791         mnd->mnt_size = normal_text_size;
 792         mnd->mnt_mtime = last_mtime;
 793         if (show_hidden) {
 794                 mnd->mnt_hidden_size = total_text_size;
 795                 mnd->mnt_hidden_mtime = last_mtime;
 796         }
 797 
 798         /* Clean up. */
 799         rw_exit(dblockp);
 800         vfs_list_unlock();
 801         if (dummyvfsp != NULL)
 802                 refstr_rele(dummyvfsp->vfs_mntpt);
 803         kmem_free(tempelemp->mnte_text, entry_length);
 804         kmem_free(tempelemp, sizeof (mntelem_t));
 805 }
 806 
 807 /*
 808  * Public function to convert vfs_mntopts into a string.
 809  * A buffer of sufficient size is allocated, which is returned via bufp,
 810  * and whose length is returned via lenp.
 811  */
 812 void
 813 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
 814 {
 815         size_t len;
 816         char *buf;
 817 
 818         vfs_list_read_lock();
 819 
 820         len = mntfs_optsize(vfsp) + 1;
 821         buf = kmem_alloc(len, KM_NOSLEEP);
 822         if (buf == NULL) {
 823                 *bufp = NULL;
 824                 vfs_list_unlock();
 825                 return;
 826         }
 827         buf[len - 1] = '\0';
 828         (void) mntfs_optprint(vfsp, buf);
 829         ASSERT(buf[len - 1] == '\0');
 830 
 831         vfs_list_unlock();
 832         *bufp = buf;
 833         *lenp = len;
 834 }
 835 
 836 /* ARGSUSED */
 837 static int
 838 mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 839 {
 840         vnode_t *vp = *vpp;
 841         mntnode_t *nmnp;
 842 
 843         /*
 844          * Not allowed to open for writing, return error.
 845          */
 846         if (flag & FWRITE)
 847                 return (EPERM);
 848         /*
 849          * Create a new mnt/vnode for each open, this will give us a handle to
 850          * hang the snapshot on.
 851          */
 852         nmnp = mntgetnode(vp);
 853 
 854         *vpp = MTOV(nmnp);
 855         atomic_inc_32(&MTOD(nmnp)->mnt_nopen);
 856         VN_RELE(vp);
 857         return (0);
 858 }
 859 
 860 /* ARGSUSED */
 861 static int
 862 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 863         caller_context_t *ct)
 864 {
 865         mntnode_t *mnp = VTOM(vp);
 866 
 867         /* Clean up any locks or shares held by the current process */
 868         cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 869         cleanshares(vp, ttoproc(curthread)->p_pid);
 870 
 871         if (count > 1)
 872                 return (0);
 873         if (vp->v_count == 1) {
 874                 rw_enter(&mnp->mnt_contents, RW_WRITER);
 875                 mntfs_freesnap(mnp, &mnp->mnt_read);
 876                 mntfs_freesnap(mnp, &mnp->mnt_ioctl);
 877                 rw_exit(&mnp->mnt_contents);
 878                 atomic_dec_32(&MTOD(mnp)->mnt_nopen);
 879         }
 880         return (0);
 881 }
 882 
 883 /* ARGSUSED */
 884 static int
 885 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
 886 {
 887         mntnode_t *mnp = VTOM(vp);
 888         zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
 889         mntsnap_t *snapp = &mnp->mnt_read;
 890         off_t off = uio->uio_offset;
 891         size_t len = uio->uio_resid;
 892         char *bufferp;
 893         size_t available, copylen;
 894         size_t written = 0;
 895         mntelem_t *elemp;
 896         krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
 897         int error = 0;
 898         off_t   ieoffset;
 899 
 900         rw_enter(&mnp->mnt_contents, RW_WRITER);
 901         if (snapp->mnts_nmnts == 0 || (off == (off_t)0))
 902                 mntfs_snapshot(mnp, snapp);
 903 
 904         if ((size_t)(off + len) > snapp->mnts_text_size)
 905                 len = snapp->mnts_text_size - off;
 906 
 907         if (off < 0 || len > snapp->mnts_text_size) {
 908                 rw_exit(&mnp->mnt_contents);
 909                 return (EFAULT);
 910         }
 911 
 912         if (len == 0) {
 913                 rw_exit(&mnp->mnt_contents);
 914                 return (0);
 915         }
 916 
 917         /*
 918          * For the file offset provided, locate the corresponding database
 919          * element and calculate the corresponding offset within its text. If
 920          * the file offset is the same as that reached during the last read(2)
 921          * then use the saved element and intra-element offset.
 922          */
 923         rw_enter(dblockp, RW_READER);
 924         if (off == 0 || (off == snapp->mnts_foffset)) {
 925                 elemp = snapp->mnts_next;
 926                 ieoffset = snapp->mnts_ieoffset;
 927         } else {
 928                 off_t total_off;
 929                 /*
 930                  * Find the element corresponding to the requested file offset
 931                  * by walking through the database and summing the text sizes
 932                  * of the individual elements. If the requested file offset is
 933                  * greater than that reached on the last visit then we can start
 934                  * at the last seen element; otherwise, we have to start at the
 935                  * beginning.
 936                  */
 937                 if (off > snapp->mnts_foffset) {
 938                         elemp = snapp->mnts_next;
 939                         total_off = snapp->mnts_foffset - snapp->mnts_ieoffset;
 940                 } else {
 941                         elemp = snapp->mnts_first;
 942                         total_off = 0;
 943                 }
 944                 while (off > total_off + elemp->mnte_text_size) {
 945                         total_off += elemp->mnte_text_size;
 946                         elemp = mntfs_get_next_elem(snapp, elemp);
 947                         ASSERT(elemp != NULL);
 948                 }
 949                 /* Calculate the intra-element offset. */
 950                 if (off > total_off)
 951                         ieoffset = off - total_off;
 952                 else
 953                         ieoffset = 0;
 954         }
 955 
 956         /*
 957          * Create a buffer and populate it with the text from successive
 958          * database elements until it is full.
 959          */
 960         bufferp = kmem_alloc(len, KM_SLEEP);
 961         while (written < len) {
 962                 available = elemp->mnte_text_size - ieoffset;
 963                 copylen = MIN(len - written, available);
 964                 bcopy(elemp->mnte_text + ieoffset, bufferp + written, copylen);
 965                 written += copylen;
 966                 if (copylen == available) {
 967                         elemp = mntfs_get_next_elem(snapp, elemp);
 968                         ASSERT(elemp != NULL || written == len);
 969                         ieoffset = 0;
 970                 } else {
 971                         ieoffset += copylen;
 972                 }
 973         }
 974         rw_exit(dblockp);
 975 
 976         /*
 977          * Write the populated buffer, update the snapshot's state if
 978          * successful and then advertise our read.
 979          */
 980         error = uiomove(bufferp, len, UIO_READ, uio);
 981         if (error == 0) {
 982                 snapp->mnts_next = elemp;
 983                 snapp->mnts_foffset = off + len;
 984                 snapp->mnts_ieoffset = ieoffset;
 985         }
 986         vfs_mnttab_readop();
 987         rw_exit(&mnp->mnt_contents);
 988 
 989         /* Clean up. */
 990         kmem_free(bufferp, len);
 991         return (error);
 992 }
 993 
 994 static int
 995 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 996         caller_context_t *ct)
 997 {
 998         int mask = vap->va_mask;
 999         int error;
1000         mntnode_t *mnp = VTOM(vp);
1001         timespec_t mtime, old_mtime;
1002         size_t size, old_size;
1003         mntdata_t *mntdata = MTOD(VTOM(vp));
1004         mntsnap_t *rsnapp, *isnapp;
1005         extern timespec_t vfs_mnttab_ctime;
1006 
1007 
1008         /* AT_MODE, AT_UID and AT_GID are derived from the underlying file. */
1009         if (mask & AT_MODE|AT_UID|AT_GID) {
1010                 if (error = VOP_GETATTR(mnp->mnt_mountvp, vap, flags, cr, ct))
1011                         return (error);
1012         }
1013 
1014         /*
1015          * There are some minor subtleties in the determination of
1016          * /etc/mnttab's size and mtime. We wish to avoid any condition in
1017          * which, in the vicinity of a change to the in-kernel mnttab, we
1018          * return an old value for one but a new value for the other. We cannot
1019          * simply hold vfslist for the entire calculation because we might need
1020          * to call mntfs_snapshot(), which calls vfs_list_read_lock().
1021          */
1022         if (mask & AT_SIZE|AT_NBLOCKS) {
1023                 rw_enter(&mnp->mnt_contents, RW_WRITER);
1024 
1025                 vfs_list_read_lock();
1026                 vfs_mnttab_modtime(&mtime);
1027                 if (mnp->mnt_flags & MNT_SHOWHIDDEN) {
1028                         old_mtime = mntdata->mnt_hidden_mtime;
1029                         old_size = mntdata->mnt_hidden_size;
1030                 } else {
1031                         old_mtime = mntdata->mnt_mtime;
1032                         old_size = mntdata->mnt_size;
1033                 }
1034                 vfs_list_unlock();
1035 
1036                 rsnapp = &mnp->mnt_read;
1037                 isnapp = &mnp->mnt_ioctl;
1038                 if (rsnapp->mnts_nmnts || isnapp->mnts_nmnts) {
1039                         /*
1040                          * The mntnode already has at least one snapshot from
1041                          * which to take the size; the user will understand from
1042                          * mnttab(4) that the current size of the in-kernel
1043                          * mnttab is irrelevant.
1044                          */
1045                         size = rsnapp->mnts_nmnts ? rsnapp->mnts_text_size :
1046                             isnapp->mnts_text_size;
1047                 } else if (mntfs_newest(&mtime, &old_mtime) == MNTFS_NEITHER) {
1048                         /*
1049                          * There is no existing valid snapshot but the in-kernel
1050                          * mnttab has not changed since the time that the last
1051                          * one was generated. Use the old file size; note that
1052                          * it is guaranteed to be consistent with mtime, which
1053                          * may be returned to the user later.
1054                          */
1055                         size = old_size;
1056                 } else {
1057                         /*
1058                          * There is no snapshot and the in-kernel mnttab has
1059                          * changed since the last one was created. We generate a
1060                          * new snapshot which we use for not only the size but
1061                          * also the mtime, thereby ensuring that the two are
1062                          * consistent.
1063                          */
1064                         mntfs_snapshot(mnp, rsnapp);
1065                         size = rsnapp->mnts_text_size;
1066                         mtime = rsnapp->mnts_last_mtime;
1067                         mntfs_freesnap(mnp, rsnapp);
1068                 }
1069 
1070                 rw_exit(&mnp->mnt_contents);
1071         } else if (mask & AT_ATIME|AT_MTIME) {
1072                 vfs_list_read_lock();
1073                 vfs_mnttab_modtime(&mtime);
1074                 vfs_list_unlock();
1075         }
1076 
1077         /* Always look like a regular file. */
1078         if (mask & AT_TYPE)
1079                 vap->va_type = VREG;
1080         /* Mode should basically be read only. */
1081         if (mask & AT_MODE)
1082                 vap->va_mode &= 07444;
1083         if (mask & AT_FSID)
1084                 vap->va_fsid = vp->v_vfsp->vfs_dev;
1085         /* Nodeid is always ROOTINO. */
1086         if (mask & AT_NODEID)
1087                 vap->va_nodeid = (ino64_t)MNTROOTINO;
1088         /*
1089          * Set nlink to the number of open vnodes for mnttab info
1090          * plus one for existing.
1091          */
1092         if (mask & AT_NLINK)
1093                 vap->va_nlink = mntdata->mnt_nopen + 1;
1094         if (mask & AT_SIZE)
1095                 vap->va_size = size;
1096         if (mask & AT_ATIME)
1097                 vap->va_atime = mtime;
1098         if (mask & AT_MTIME)
1099                 vap->va_mtime = mtime;
1100         if (mask & AT_CTIME)
1101                 vap->va_ctime = vfs_mnttab_ctime;
1102         if (mask & AT_RDEV)
1103                 vap->va_rdev = 0;
1104         if (mask & AT_BLKSIZE)
1105                 vap->va_blksize = DEV_BSIZE;
1106         if (mask & AT_NBLOCKS)
1107                 vap->va_nblocks = btod(size);
1108         if (mask & AT_SEQ)
1109                 vap->va_seq = 0;
1110 
1111         return (0);
1112 }
1113 
1114 static int
1115 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr,
1116         caller_context_t *ct)
1117 {
1118         mntnode_t *mnp = VTOM(vp);
1119 
1120         if (mode & (VWRITE|VEXEC))
1121                 return (EROFS);
1122 
1123         /*
1124          * Do access check on the underlying directory vnode.
1125          */
1126         return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr, ct));
1127 }
1128 
1129 
1130 /*
1131  * New /mntfs vnode required; allocate it and fill in most of the fields.
1132  */
1133 static mntnode_t *
1134 mntgetnode(vnode_t *dp)
1135 {
1136         mntnode_t *mnp;
1137         vnode_t *vp;
1138 
1139         mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
1140         mnp->mnt_vnode = vn_alloc(KM_SLEEP);
1141         mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
1142         rw_init(&mnp->mnt_contents, NULL, RW_DEFAULT, NULL);
1143         vp = MTOV(mnp);
1144         vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
1145         vn_setops(vp, mntvnodeops);
1146         vp->v_vfsp = dp->v_vfsp;
1147         vp->v_type = VREG;
1148         vp->v_data = (caddr_t)mnp;
1149 
1150         return (mnp);
1151 }
1152 
1153 /*
1154  * Free the storage obtained from mntgetnode().
1155  */
1156 static void
1157 mntfreenode(mntnode_t *mnp)
1158 {
1159         vnode_t *vp = MTOV(mnp);
1160 
1161         rw_destroy(&mnp->mnt_contents);
1162         vn_invalid(vp);
1163         vn_free(vp);
1164         kmem_free(mnp, sizeof (*mnp));
1165 }
1166 
1167 
1168 /* ARGSUSED */
1169 static int
1170 mntfsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1171 {
1172         return (0);
1173 }
1174 
1175 /* ARGSUSED */
1176 static void
1177 mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1178 {
1179         mntnode_t *mnp = VTOM(vp);
1180 
1181         mntfreenode(mnp);
1182 }
1183 
1184 /*
1185  * lseek(2) is supported only to rewind the file by resetmnttab(3C). Rewinding
1186  * has a special meaning for /etc/mnttab: it forces mntfs to refresh the
1187  * snapshot at the next ioctl().
1188  *
1189  * mnttab(4) explains that "the snapshot...is taken any time a read(2) is
1190  * performed at offset 0". We therefore ignore the read snapshot here.
1191  */
1192 /* ARGSUSED */
1193 static int
1194 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1195 {
1196         mntnode_t *mnp = VTOM(vp);
1197 
1198         if (*noffp == 0) {
1199                 rw_enter(&mnp->mnt_contents, RW_WRITER);
1200                 mnp->mnt_ioctl.mnts_flags |= MNTS_REWIND;
1201                 rw_exit(&mnp->mnt_contents);
1202         }
1203 
1204         return (0);
1205 }
1206 
1207 /*
1208  * Return the answer requested to poll().
1209  * POLLRDBAND will return when the mtime of the mnttab
1210  * information is newer than the latest one read for this open.
1211  */
1212 /* ARGSUSED */
1213 static int
1214 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp,
1215         caller_context_t *ct)
1216 {
1217         mntnode_t *mnp = VTOM(vp);
1218         mntsnap_t *snapp;
1219 
1220         rw_enter(&mnp->mnt_contents, RW_READER);
1221         if (mntfs_newest(&mnp->mnt_ioctl.mnts_last_mtime,
1222             &mnp->mnt_read.mnts_last_mtime) == MNTFS_FIRST)
1223                 snapp = &mnp->mnt_ioctl;
1224         else
1225                 snapp = &mnp->mnt_read;
1226 
1227         *revp = 0;
1228         *phpp = (pollhead_t *)NULL;
1229         if (ev & POLLIN)
1230                 *revp |= POLLIN;
1231 
1232         if (ev & POLLRDNORM)
1233                 *revp |= POLLRDNORM;
1234 
1235         if (ev & POLLRDBAND) {
1236                 vfs_mnttab_poll(&snapp->mnts_last_mtime, phpp);
1237                 if (*phpp == (pollhead_t *)NULL)
1238                         *revp |= POLLRDBAND;
1239         }
1240         rw_exit(&mnp->mnt_contents);
1241 
1242         if (*revp || *phpp != NULL || any) {
1243                 return (0);
1244         }
1245         /*
1246          * If someone is polling an unsupported poll events (e.g.
1247          * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
1248          * That way we will ensure that we don't return a 0
1249          * revents with a NULL pollhead pointer.
1250          */
1251         *revp = POLLERR;
1252         return (0);
1253 }
1254 
/*
 * mntfs_same_word() decides whether two words are equal for the purposes of
 * MNTIOC_GETMNTANY. It returns 1 if they are and 0 if they are not.
 *
 * worda points somewhere inside the buffer bufa, whose size is sizea. It
 * must not be NULL, because getmntany(3C) uses NULL to say that a field is
 * not to be matched. The text at worda comes from the user; if no null
 * terminator is found within the buffer then the word cannot match.
 *
 * bufb, of size sizeb, holds a line from /etc/mnttab whose fields are
 * delimited by tab or new-line characters. offb is the offset within bufb
 * of the word to compare against.
 *
 * The words match when worda's null terminator lines up with a tab or
 * new-line in bufb and every preceding byte is identical.
 */
int
mntfs_same_word(char *worda, char *bufa, size_t sizea, off_t offb, char *bufb,
    size_t sizeb)
{
        const char *pa = worda;
        const char *pb = bufb + offb;
        int left;

        ASSERT(worda != NULL);

        /* Never look beyond the end of either buffer. */
        left = MIN(((bufa + sizea) - pa), ((bufb + sizeb) - pb));

        /* Skip the common prefix. */
        for (; left != 0 && *pa == *pb; left--) {
                pa++;
                pb++;
        }

        /* Ran out of buffer before the words diverged: no match. */
        if (left == 0)
                return (0);

        /* The user's word must end exactly here ... */
        if (*pa != '\0')
                return (0);

        /* ... and so must the mnttab field. */
        return (*pb == '\t' || *pb == '\n');
}
1292 
1293 /*
1294  * mntfs_special_info_string() returns which, if either, of VBLK or VCHR
1295  * corresponds to a supplied path. If the path is a special device then the
1296  * function optionally sets the major and minor numbers.
1297  */
1298 vtype_t
1299 mntfs_special_info_string(char *path, uint_t *major, uint_t *minor, cred_t *cr)
1300 {
1301         vattr_t vattr;
1302         vnode_t *vp;
1303         vtype_t type;
1304         int error;
1305 
1306         if (path == NULL || *path != '/' ||
1307             lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, rootdir))
1308                 return (0);
1309 
1310         vattr.va_mask = AT_TYPE | AT_RDEV;
1311         error = VOP_GETATTR(vp, &vattr, ATTR_REAL, cr, NULL);
1312         VN_RELE(vp);
1313 
1314         if (error == 0 && ((type = vattr.va_type) == VBLK || type == VCHR)) {
1315                 if (major && minor) {
1316                         *major = getmajor(vattr.va_rdev);
1317                         *minor = getminor(vattr.va_rdev);
1318                 }
1319                 return (type);
1320         } else {
1321                 return (0);
1322         }
1323 }
1324 
1325 /*
1326  * mntfs_special_info_element() extracts the name of the mounted resource
1327  * for a given element and copies it into a null-terminated string, which it
1328  * then passes to mntfs_special_info_string().
1329  */
1330 vtype_t
1331 mntfs_special_info_element(mntelem_t *elemp, cred_t *cr)
1332 {
1333         char *newpath;
1334         vtype_t type;
1335 
1336         newpath = kmem_alloc(elemp->mnte_text_size, KM_SLEEP);
1337         bcopy(elemp->mnte_text, newpath, (off_t)(elemp->mnte_tab.mnt_mountp));
1338         *(newpath + (off_t)elemp->mnte_tab.mnt_mountp - 1) = '\0';
1339         type = mntfs_special_info_string(newpath, NULL, NULL, cr);
1340         kmem_free(newpath, elemp->mnte_text_size);
1341 
1342         return (type);
1343 }
1344 
/*
 * Translate an address that points at a byte inside a user buffer into the
 * address of the same offset inside a kernel buffer of the same size.
 *
 * uaddr is the address to translate, ubufp the start of the user buffer,
 * kbufp the start of the kernel buffer and bufsize the size of both. An
 * address that does not lie within [ubufp, ubufp + bufsize), which includes
 * a NULL uaddr, is returned as NULL.
 */
char *
mntfs_import_addr(char *uaddr, char *ubufp, char *kbufp, size_t bufsize)
{
        if (uaddr >= ubufp && uaddr < ubufp + bufsize)
                return (kbufp + (uaddr - ubufp));

        return (NULL);
}
1359 
/*
 * These 32-bit versions are to support STRUCT_DECL(9F) etc. in
 * mntfs_copyout_elem() and mntioctl().
 *
 * They are only compiled when _SYSCALL32_IMPL is defined. Each field that
 * mntfs_copyout_elem() fills with a user-buffer address is held here as a
 * uint32_t rather than as a pointer.
 */
#ifdef _SYSCALL32_IMPL
/* 32-bit layout of struct extmnttab, as used for MNTIOC_GETEXTMNTENT. */
typedef struct extmnttab32 {
        uint32_t        mnt_special;
        uint32_t        mnt_mountp;
        uint32_t        mnt_fstype;
        uint32_t        mnt_mntopts;
        uint32_t        mnt_time;
        uint_t          mnt_major;
        uint_t          mnt_minor;
} extmnttab32_t;

/* 32-bit layout of struct mnttab: as above but without major/minor. */
typedef struct mnttab32 {
        uint32_t        mnt_special;
        uint32_t        mnt_mountp;
        uint32_t        mnt_fstype;
        uint32_t        mnt_mntopts;
        uint32_t        mnt_time;
} mnttab32_t;

/*
 * 32-bit layout of struct mntentbuf.
 * NOTE(review): mbuf_emp and mbuf_buf are presumably 32-bit user addresses
 * and mbuf_bufsize the size of the buffer at mbuf_buf -- confirm against
 * mntioctl().
 */
struct mntentbuf32 {
        uint32_t        mbuf_emp;
        uint_t          mbuf_bufsize;
        uint32_t        mbuf_buf;
};
#endif
1389 
1390 /*
1391  * mntfs_copyout_element() is common code for the MNTIOC_GETMNTENT,
1392  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY ioctls. Having identifed the
1393  * database element desired by the user, this function copies out the text and
1394  * the pointers to the relevant userland addresses. It returns 0 on success
1395  * and non-zero otherwise.
1396  */
1397 int
1398 mntfs_copyout_elem(mntelem_t *elemp, struct extmnttab *uemp,
1399     char *ubufp, int cmd, int datamodel)
1400 {
1401                 STRUCT_DECL(extmnttab, ktab);
1402                 char *dbbufp = elemp->mnte_text;
1403                 size_t dbbufsize = elemp->mnte_text_size;
1404                 struct extmnttab *dbtabp = &elemp->mnte_tab;
1405                 size_t ssize;
1406                 char *kbufp;
1407                 int error = 0;
1408 
1409 
1410                 /*
1411                  * We create a struct extmnttab within the kernel of the size
1412                  * determined by the user's data model. We then populate its
1413                  * fields by combining the start address of the text buffer
1414                  * supplied by the user, ubufp, with the offsets stored for
1415                  * this database element within dbtabp, a pointer to a struct
1416                  * extmnttab.
1417                  *
1418                  * Note that if the corresponding field is "-" this signifies
1419                  * no real content, and we set the address to NULL. This does
1420                  * not apply to mnt_time.
1421                  */
1422                 STRUCT_INIT(ktab, datamodel);
1423                 STRUCT_FSETP(ktab, mnt_special,
1424                     MNTFS_REAL_FIELD(dbbufp) ? ubufp : NULL);
1425                 STRUCT_FSETP(ktab, mnt_mountp,
1426                     MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mountp) ?
1427                     ubufp + (off_t)dbtabp->mnt_mountp : NULL);
1428                 STRUCT_FSETP(ktab, mnt_fstype,
1429                     MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_fstype) ?
1430                     ubufp + (off_t)dbtabp->mnt_fstype : NULL);
1431                 STRUCT_FSETP(ktab, mnt_mntopts,
1432                     MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mntopts) ?
1433                     ubufp + (off_t)dbtabp->mnt_mntopts : NULL);
1434                 STRUCT_FSETP(ktab, mnt_time,
1435                     ubufp + (off_t)dbtabp->mnt_time);
1436                 if (cmd == MNTIOC_GETEXTMNTENT) {
1437                         STRUCT_FSETP(ktab, mnt_major, dbtabp->mnt_major);
1438                         STRUCT_FSETP(ktab, mnt_minor, dbtabp->mnt_minor);
1439                         ssize = SIZEOF_STRUCT(extmnttab, datamodel);
1440                 } else {
1441                         ssize = SIZEOF_STRUCT(mnttab, datamodel);
1442                 }
1443                 if (copyout(STRUCT_BUF(ktab), uemp, ssize))
1444                         return (EFAULT);
1445 
1446                 /*
1447                  * We create a text buffer in the kernel into which we copy the
1448                  * /etc/mnttab entry for this element. We change the tab and
1449                  * new-line delimiters to null bytes before copying out the
1450                  * buffer.
1451                  */
1452                 kbufp = kmem_alloc(dbbufsize, KM_SLEEP);
1453                 bcopy(elemp->mnte_text, kbufp, dbbufsize);
1454                 *(kbufp + (off_t)dbtabp->mnt_mountp - 1) =
1455                     *(kbufp + (off_t)dbtabp->mnt_fstype - 1) =
1456                     *(kbufp + (off_t)dbtabp->mnt_mntopts - 1) =
1457                     *(kbufp + (off_t)dbtabp->mnt_time - 1) =
1458                     *(kbufp + dbbufsize - 1) = '\0';
1459                 if (copyout(kbufp, ubufp, dbbufsize))
1460                         error = EFAULT;
1461 
1462                 kmem_free(kbufp, dbbufsize);
1463                 return (error);
1464 }
1465 
1466 /* ARGSUSED */
     /*
      * mntioctl() services the ioctl(2) commands accepted by a mntfs vnode; any
      * other command fails with EINVAL.
      *
      *   MNTIOC_NMNTS            store the number of entries in the snapshot
      *                           to the 32-bit word at arg
      *   MNTIOC_GETDEVLIST       copy out one (major, minor) pair of uint_t
      *                           per snapshot entry to arg
      *   MNTIOC_SETTAG,
      *   MNTIOC_CLRTAG           copy in the struct mnttagdesc at arg and hand
      *                           its contents to vfs_settag()/vfs_clrtag()
      *   MNTIOC_SHOWHIDDEN       set MNT_SHOWHIDDEN on this mntnode
      *   MNTIOC_GETMNTANY        return the next snapshot entry that matches
      *                           the template supplied via the mntentbuf at arg
      *   MNTIOC_GETMNTENT,
      *   MNTIOC_GETEXTMNTENT     return the next snapshot entry via the
      *                           mntentbuf at arg
      *
      * The snapshot-based commands operate on mnp->mnt_ioctl, which is (re)taken
      * with mntfs_snapshot() when it holds no entries or is flagged MNTS_REWIND.
      * mnt_contents is held as writer around mntfs_snapshot() and around updates
      * of mnts_next; the zone's zone_mntfs_db_lock is held as reader while
      * database elements are walked.
      *
      * Returns 0 or an errno value. The three entry-returning commands report
      * "no more entries" and "entry too large for the buffer" by returning 0
      * and storing MNTFS_EOF or MNTFS_TOOLONG in *rvalp.
      */
1467 static int
1468 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
1469     int *rvalp, caller_context_t *ct)
1470 {
             /* arg viewed as a user address; used by NMNTS and GETDEVLIST only */
1471         uint_t *up = (uint_t *)arg;
1472         mntnode_t *mnp = VTOM(vp);
             /* the snapshot dedicated to ioctl() consumers of this mntnode */
1473         mntsnap_t *snapp = &mnp->mnt_ioctl;
1474         int error = 0;
1475         zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
1476         krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
             /* caller's data model; selects the layout of user-visible structs */
1477         model_t datamodel = flag & DATAMODEL_MASK;
1478
1479         switch (cmd) {
1480
1481         case MNTIOC_NMNTS:              /* get no. of mounted resources */
1482         {
                     /*
                      * Start as a reader and only become a writer if a snapshot
                      * has to be taken. A failed rw_tryupgrade() means the lock
                      * is dropped and re-acquired, so the condition is tested
                      * again once the writer lock is held.
                      */
1483                 rw_enter(&mnp->mnt_contents, RW_READER);
1484                 if (snapp->mnts_nmnts == 0 ||
1485                     (snapp->mnts_flags & MNTS_REWIND)) {
1486                         if (!rw_tryupgrade(&mnp->mnt_contents)) {
1487                                 rw_exit(&mnp->mnt_contents);
1488                                 rw_enter(&mnp->mnt_contents, RW_WRITER);
1489                         }
1490                         if (snapp->mnts_nmnts == 0 ||
1491                             (snapp->mnts_flags & MNTS_REWIND))
1492                                 mntfs_snapshot(mnp, snapp);
1493                 }
1494                 rw_exit(&mnp->mnt_contents);
1495
                     /*
                      * NOTE(review): mnts_nmnts is read here after mnt_contents
                      * has been released -- confirm that this is intentional.
                      */
1496                 if (suword32(up, snapp->mnts_nmnts) != 0)
1497                         error = EFAULT;
1498                 break;
1499         }
1500
1501         case MNTIOC_GETDEVLIST:         /* get mounted device major/minor nos */
1502         {
1503                 size_t len;
1504                 uint_t *devlist;
1505                 mntelem_t *elemp;
1506                 int i = 0;
1507
                     /*
                      * Same reader-then-writer sequence as MNTIOC_NMNTS, except
                      * that the lock is downgraded back to reader afterwards and
                      * stays held until the element walk below has finished.
                      */
1508                 rw_enter(&mnp->mnt_contents, RW_READER);
1509                 if (snapp->mnts_nmnts == 0 ||
1510                     (snapp->mnts_flags & MNTS_REWIND)) {
1511                         if (!rw_tryupgrade(&mnp->mnt_contents)) {
1512                                 rw_exit(&mnp->mnt_contents);
1513                                 rw_enter(&mnp->mnt_contents, RW_WRITER);
1514                         }
1515                         if (snapp->mnts_nmnts == 0 ||
1516                             (snapp->mnts_flags & MNTS_REWIND))
1517                                 mntfs_snapshot(mnp, snapp);
1518                         rw_downgrade(&mnp->mnt_contents);
1519                 }
1520
1521                 /* Create a local buffer to hold the device numbers. */
                     /* Two uint_t slots per entry: [2i] major, [2i + 1] minor. */
1522                 len = 2 * snapp->mnts_nmnts * sizeof (uint_t);
1523                 devlist = kmem_alloc(len, KM_SLEEP);
1524
1525                 /*
1526                  * Walk the database elements for this snapshot and add their
1527                  * major and minor numbers.
1528                  */
1529                 rw_enter(dblockp, RW_READER);
1530                 for (elemp = snapp->mnts_first; elemp;
1531                     elemp = mntfs_get_next_elem(snapp, elemp)) {
1532                                 devlist[2 * i] = elemp->mnte_tab.mnt_major;
1533                                 devlist[2 * i + 1] = elemp->mnte_tab.mnt_minor;
1534                                 i++;
1535                 }
1536                 rw_exit(dblockp);
                     /* The walk must have visited exactly mnts_nmnts elements. */
1537                 ASSERT(i == snapp->mnts_nmnts);
1538                 rw_exit(&mnp->mnt_contents);
1539
                     /* xcopyout()'s return value becomes the ioctl's error. */
1540                 error = xcopyout(devlist, up, len);
1541                 kmem_free(devlist, len);
1542                 break;
1543         }
1544
1545         case MNTIOC_SETTAG:             /* set tag on mounted file system */
1546         case MNTIOC_CLRTAG:             /* clear tag on mounted file system */
1547         {
1548                 struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1549                 STRUCT_DECL(mnttagdesc, tagdesc);
1550                 char *cptr;
1551                 uint32_t major, minor;
1552                 char tagbuf[MAX_MNTOPT_TAG];
1553                 char *pbuf;
1554                 size_t len;
1555                 uint_t start = 0;
1556                 mntdata_t *mntdata = MTOD(mnp);
1557                 zone_t *zone = mntdata->mnt_zone_ref.zref_zone;
1558
1559                 STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1560                 if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1561                         error = EFAULT;
1562                         break;
1563                 }
                     /*
                      * pbuf receives the mount point. In a non-global zone it is
                      * first seeded with the zone's root path and `start' is set
                      * to the index of that path's final '/', so the string
                      * copied in from the user lands on top of that slash.
                      * From here on every exit path must free pbuf.
                      */
1564                 pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1565                 if (zone != global_zone) {
1566                         (void) strcpy(pbuf, zone->zone_rootpath);
1567                         /* truncate "/" and nul */
1568                         start = zone->zone_rootpathlen - 2;
1569                         ASSERT(pbuf[start] == '/');
1570                 }
1571                 cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1572                 error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1573                 if (error) {
1574                         kmem_free(pbuf, MAXPATHLEN);
1575                         break;
1576                 }
                     /*
                      * When a zone root was prepended, the user's mount point has
                      * replaced its trailing '/'; reject the request unless the
                      * user's string itself begins with '/'.
                      */
1577                 if (start != 0 && pbuf[start] != '/') {
1578                         kmem_free(pbuf, MAXPATHLEN);
1579                         error = EINVAL;
1580                         break;
1581                 }
1582                 cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1583                 if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1584                         kmem_free(pbuf, MAXPATHLEN);
1585                         break;
1586                 }
1587                 major = STRUCT_FGET(tagdesc, mtd_major);
1588                 minor = STRUCT_FGET(tagdesc, mtd_minor);
1589                 if (cmd == MNTIOC_SETTAG)
1590                         error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1591                 else
1592                         error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1593                 kmem_free(pbuf, MAXPATHLEN);
1594                 break;
1595         }
1596
1597         case MNTIOC_SHOWHIDDEN:
1598         {
                     /* Only sets the flag; nothing in this case clears it. */
1599                 rw_enter(&mnp->mnt_contents, RW_WRITER);
1600                 mnp->mnt_flags |= MNT_SHOWHIDDEN;
1601                 rw_exit(&mnp->mnt_contents);
1602                 break;
1603         }
1604
1605         case MNTIOC_GETMNTANY:
1606         {
1607                 STRUCT_DECL(mntentbuf, embuf);  /* Our copy of user's embuf */
1608                 STRUCT_DECL(extmnttab, ktab);   /* Our copy of user's emp */
1609                 struct extmnttab *uemp;         /* uaddr of user's emp */
1610                 char *ubufp;                    /* uaddr of user's text buf */
1611                 size_t ubufsize;                /* size of the above */
1612                 struct extmnttab preftab;       /* our version of user's emp */
1613                 char *prefbuf;                  /* our copy of user's text */
1614                 mntelem_t *elemp;               /* a database element */
1615                 struct extmnttab *dbtabp;       /* element's extmnttab */
1616                 char *dbbufp;                   /* element's text buf */
1617                 size_t dbbufsize;               /* size of the above */
1618                 vtype_t type;                   /* type, if any, of special */
1619
1620
1621                 /*
1622                  * embuf is a struct mntentbuf within the kernel. We copy into
1623                  * it the struct mntentbuf supplied by the user.
1624                  */
1625                 STRUCT_INIT(embuf, datamodel);
1626                 if (copyin((void *) arg, STRUCT_BUF(embuf),
1627                     STRUCT_SIZE(embuf))) {
1628                         error = EFAULT;
1629                         break;
1630                 }
1631                 uemp = STRUCT_FGETP(embuf, mbuf_emp);
1632                 ubufp = STRUCT_FGETP(embuf, mbuf_buf);
1633                 ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
1634
1635                 /*
1636                  * Check that the text buffer offered by the user is the
1637                  * agreed size.
1638                  */
1639                 if (ubufsize != MNT_LINE_MAX) {
1640                         error = EINVAL;
1641                         break;
1642                 }
1643
1644                 /* Copy the user-supplied entry into a local buffer. */
                     /* prefbuf must be freed on every exit path from here on. */
1645                 prefbuf = kmem_alloc(MNT_LINE_MAX, KM_SLEEP);
1646                 if (copyin(ubufp, prefbuf, MNT_LINE_MAX)) {
1647                         kmem_free(prefbuf, MNT_LINE_MAX);
1648                         error = EFAULT;
1649                         break;
1650                 }
1651
1652                 /* Ensure that any string within it is null-terminated. */
1653                 *(prefbuf + MNT_LINE_MAX - 1) = 0;
1654
1655                 /* Copy in the user-supplied mpref */
                     /*
                      * Only the leading struct mnttab portion is read, although
                      * ktab is declared as an extmnttab.
                      */
1656                 STRUCT_INIT(ktab, datamodel);
1657                 if (copyin(uemp, STRUCT_BUF(ktab),
1658                     SIZEOF_STRUCT(mnttab, datamodel))) {
1659                         kmem_free(prefbuf, MNT_LINE_MAX);
1660                         error = EFAULT;
1661                         break;
1662                 }
1663
1664                 /*
1665                  * Copy the members of the user's pref struct into a local
1666                  * struct. The pointers need to be offset and verified to
1667                  * ensure that they lie within the bounds of the buffer.
1668                  */
1669                 preftab.mnt_special = mntfs_import_addr(STRUCT_FGETP(ktab,
1670                     mnt_special), ubufp, prefbuf, MNT_LINE_MAX);
1671                 preftab.mnt_mountp = mntfs_import_addr(STRUCT_FGETP(ktab,
1672                     mnt_mountp), ubufp, prefbuf, MNT_LINE_MAX);
1673                 preftab.mnt_fstype = mntfs_import_addr(STRUCT_FGETP(ktab,
1674                     mnt_fstype), ubufp, prefbuf, MNT_LINE_MAX);
1675                 preftab.mnt_mntopts = mntfs_import_addr(STRUCT_FGETP(ktab,
1676                     mnt_mntopts), ubufp, prefbuf, MNT_LINE_MAX);
1677                 preftab.mnt_time = mntfs_import_addr(STRUCT_FGETP(ktab,
1678                     mnt_time), ubufp, prefbuf, MNT_LINE_MAX);
1679
1680                 /*
1681                  * If the user specifies a mounted resource that is a special
1682                  * device then we capture its mode and major and minor numbers;
1683                  * cf. the block comment below.
1684                  */
1685                 type = mntfs_special_info_string(preftab.mnt_special,
1686                     &preftab.mnt_major, &preftab.mnt_minor, cr);
1687
                     /*
                      * Writer from the outset: mnts_next is updated below if an
                      * entry is returned.
                      */
1688                 rw_enter(&mnp->mnt_contents, RW_WRITER);
1689                 if (snapp->mnts_nmnts == 0 ||
1690                     (snapp->mnts_flags & MNTS_REWIND))
1691                         mntfs_snapshot(mnp, snapp);
1692
1693                 /*
1694                  * This is the core functionality that implements getmntany().
1695                  * We walk through the mntfs database until we find an element
1696                  * matching the user's preferences that are contained in
1697                  * preftab. Typically, this means checking that the text
1698                  * matches. However, the mounted resource is special: if the
1699                  * user is looking for a special device then we must find a
1700                  * database element with the same major and minor numbers and
1701                  * the same type, i.e. VBLK or VCHR. The type is not recorded
1702                  * in the element because it cannot be inferred from the vfs_t.
1703                  * We therefore check the type of suitable candidates via
1704                  * mntfs_special_info_element(); since this calls into the
1705                  * underlying file system we make sure to drop the database lock
1706                  * first.
1707                  */
                     /* The search resumes where the previous call left off. */
1708                 elemp = snapp->mnts_next;
1709                 rw_enter(dblockp, RW_READER);
1710                 for (;;) {
                             /*
                              * Inner loop: stop at the first candidate. A NULL
                              * pointer in preftab matches any value of that field.
                              */
1711                         for (; elemp; elemp = mntfs_get_next_elem(snapp,
1712                             elemp)) {
1713                                 dbtabp = &elemp->mnte_tab;
1714                                 dbbufp = elemp->mnte_text;
1715                                 dbbufsize = elemp->mnte_text_size;
1716
1717                                 if (((type &&
1718                                     dbtabp->mnt_major == preftab.mnt_major &&
1719                                     dbtabp->mnt_minor == preftab.mnt_minor &&
1720                                     MNTFS_REAL_FIELD(dbbufp)) ||
1721                                     (!type && (!preftab.mnt_special ||
1722                                     mntfs_same_word(preftab.mnt_special,
1723                                     prefbuf, MNT_LINE_MAX, (off_t)0, dbbufp,
1724                                     dbbufsize)))) &&
1725
1726                                     (!preftab.mnt_mountp || mntfs_same_word(
1727                                     preftab.mnt_mountp, prefbuf, MNT_LINE_MAX,
1728                                     (off_t)dbtabp->mnt_mountp, dbbufp,
1729                                     dbbufsize)) &&
1730
1731                                     (!preftab.mnt_fstype || mntfs_same_word(
1732                                     preftab.mnt_fstype, prefbuf, MNT_LINE_MAX,
1733                                     (off_t)dbtabp->mnt_fstype, dbbufp,
1734                                     dbbufsize)) &&
1735
1736                                     (!preftab.mnt_mntopts || mntfs_same_word(
1737                                     preftab.mnt_mntopts, prefbuf, MNT_LINE_MAX,
1738                                     (off_t)dbtabp->mnt_mntopts, dbbufp,
1739                                     dbbufsize)) &&
1740
1741                                     (!preftab.mnt_time || mntfs_same_word(
1742                                     preftab.mnt_time, prefbuf, MNT_LINE_MAX,
1743                                     (off_t)dbtabp->mnt_time, dbbufp,
1744                                     dbbufsize)))
1745                                         break;
1746                         }
                             /* Database lock is dropped before the type check. */
1747                         rw_exit(dblockp);
1748
                             /*
                              * Finished if the list is exhausted, if no device
                              * type has to be verified, or if the candidate's
                              * type agrees with the user's.
                              */
1749                         if (elemp == NULL || type == 0 ||
1750                             type == mntfs_special_info_element(elemp, cr))
1751                                 break;
1752
                             /* Wrong type: resume the scan after this candidate. */
1753                         rw_enter(dblockp, RW_READER);
1754                         elemp = mntfs_get_next_elem(snapp, elemp);
1755                 }
1756
                     /* The template is no longer needed once the search is over. */
1757                 kmem_free(prefbuf, MNT_LINE_MAX);
1758
1759                 /* If we failed to find a match then return EOF. */
1760                 if (elemp == NULL) {
1761                         rw_exit(&mnp->mnt_contents);
1762                         *rvalp = MNTFS_EOF;
1763                         break;
1764                 }
1765
1766                 /*
1767                  * Check that the text buffer offered by the user will be large
1768                  * enough to accommodate the text for this entry.
1769                  */
                     /* ubufsize was verified above to equal MNT_LINE_MAX. */
1770                 if (elemp->mnte_text_size > MNT_LINE_MAX) {
1771                         rw_exit(&mnp->mnt_contents);
1772                         *rvalp = MNTFS_TOOLONG;
1773                         break;
1774                 }
1775
1776                 /*
1777                  * Populate the user's struct mnttab and text buffer using the
1778                  * element's contents.
1779                  */
                     /*
                      * mnts_next only advances past the element when the copyout
                      * succeeded; on failure it is left unchanged.
                      */
1780                 if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
1781                         error = EFAULT;
1782                 } else {
1783                         rw_enter(dblockp, RW_READER);
1784                         elemp = mntfs_get_next_elem(snapp, elemp);
1785                         rw_exit(dblockp);
1786                         snapp->mnts_next = elemp;
1787                 }
1788                 rw_exit(&mnp->mnt_contents);
1789                 break;
1790         }
1791
1792         case MNTIOC_GETMNTENT:
1793         case MNTIOC_GETEXTMNTENT:
1794         {
1795                 STRUCT_DECL(mntentbuf, embuf);  /* Our copy of user's embuf */
1796                 struct extmnttab *uemp;         /* uaddr of user's emp */
1797                 char *ubufp;                    /* uaddr of user's text buf */
1798                 size_t ubufsize;                /* size of the above */
1799                 mntelem_t *elemp;               /* a database element */
1800
1801
                     /*
                      * Writer from the outset because mnts_next may be updated.
                      * End-of-table is reported before the user's mntentbuf is
                      * read.
                      */
1802                 rw_enter(&mnp->mnt_contents, RW_WRITER);
1803                 if (snapp->mnts_nmnts == 0 ||
1804                     (snapp->mnts_flags & MNTS_REWIND))
1805                         mntfs_snapshot(mnp, snapp);
1806                 if ((elemp = snapp->mnts_next) == NULL) {
1807                         rw_exit(&mnp->mnt_contents);
1808                         *rvalp = MNTFS_EOF;
1809                         break;
1810                 }
1811
1812                 /*
1813                  * embuf is a struct mntentbuf within the kernel. We copy into
1814                  * it the struct mntentbuf supplied by the user.
1815                  */
1816                 STRUCT_INIT(embuf, datamodel);
1817                 if (copyin((void *) arg, STRUCT_BUF(embuf),
1818                     STRUCT_SIZE(embuf))) {
1819                         rw_exit(&mnp->mnt_contents);
1820                         error = EFAULT;
1821                         break;
1822                 }
1823                 uemp = STRUCT_FGETP(embuf, mbuf_emp);
1824                 ubufp = STRUCT_FGETP(embuf, mbuf_buf);
1825                 ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
1826
1827                 /*
1828                  * Check that the text buffer offered by the user will be large
1829                  * enough to accommodate the text for this entry.
1830                  */
1831                 if (elemp->mnte_text_size > ubufsize) {
1832                         rw_exit(&mnp->mnt_contents);
1833                         *rvalp = MNTFS_TOOLONG;
1834                         break;
1835                 }
1836
1837                 /*
1838                  * Populate the user's struct mnttab and text buffer using the
1839                  * element's contents.
1840                  */
                     /*
                      * cmd is passed on so that mntfs_copyout_elem() can tell
                      * MNTIOC_GETEXTMNTENT apart. mnts_next advances only after a
                      * successful copyout.
                      */
1841                 if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
1842                         error = EFAULT;
1843                 } else {
1844                         rw_enter(dblockp, RW_READER);
1845                         elemp = mntfs_get_next_elem(snapp, elemp);
1846                         rw_exit(dblockp);
1847                         snapp->mnts_next = elemp;
1848                 }
1849                 rw_exit(&mnp->mnt_contents);
1850                 break;
1851         }
1852
1853         default:
1854                 error = EINVAL;
1855                 break;
1856         }
1857
1858         return (error);
1859 }
1860 
1861 /*
1862  * Because every open(2) of mntfs yields a distinct vnode, two vnodes are the
1863  * same instance of /etc/mnttab when they share the same (zone-specific) vfs.
      * Returns 1 in that case and 0 otherwise; a NULL vnode never compares equal.
1864  */
1865 /* ARGSUSED */
1866 int
1867 mntcmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
1868 {
1869         return ((vp1 == NULL || vp2 == NULL) ? 0 : vp1->v_vfsp == vp2->v_vfsp);
1870 }
1871 
1872 /*
1873  * mntfs vnode operations template.
      *
      * Each entry pairs a vnode operation name (VOPNAME_*) with the routine that
      * handles it for mntfs vnodes. VOPNAME_DISPOSE and VOPNAME_SHRLOCK are
      * pointed at fs_error instead of a mntfs routine. The table ends with a
      * NULL, NULL entry.
      *
      * NOTE(review): presumably this template is what mntvnodeops is built from
      * at file system initialisation; that code is not in this part of the
      * file -- confirm.
1874  */
1875 const fs_operation_def_t mnt_vnodeops_template[] = {
1876         VOPNAME_OPEN,           { .vop_open = mntopen },
1877         VOPNAME_CLOSE,          { .vop_close = mntclose },
1878         VOPNAME_READ,           { .vop_read = mntread },
1879         VOPNAME_IOCTL,          { .vop_ioctl = mntioctl },
1880         VOPNAME_GETATTR,        { .vop_getattr = mntgetattr },
1881         VOPNAME_ACCESS,         { .vop_access = mntaccess },
1882         VOPNAME_FSYNC,          { .vop_fsync = mntfsync },
1883         VOPNAME_INACTIVE,       { .vop_inactive = mntinactive },
1884         VOPNAME_SEEK,           { .vop_seek = mntseek },
1885         VOPNAME_POLL,           { .vop_poll = mntpoll },
1886         VOPNAME_CMP,            { .vop_cmp = mntcmp },
1887         VOPNAME_DISPOSE,        { .error = fs_error },
1888         VOPNAME_SHRLOCK,        { .error = fs_error },
1889         NULL,                   NULL
1890 };