patch-lower-case-segops Wdiff usr/src/uts/common/os/zone.c

Print this page

patch lower-case-segops

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2013, Joyent Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * Zones
  29   29   *
  30   30   *   A zone is a named collection of processes, namespace constraints,
  31   31   *   and other system resources which comprise a secure and manageable
  32   32   *   application containment facility.
  33   33   *
  34   34   *   Zones (represented by the reference counted zone_t) are tracked in
  35   35   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  36   36   *   (zoneid_t) are used to track zone association.  Zone IDs are
  37   37   *   dynamically generated when the zone is created; if a persistent
  38   38   *   identifier is needed (core files, accounting logs, audit trail,
  39   39   *   etc.), the zone name should be used.
  40   40   *
  41   41   *
  42   42   *   Global Zone:
  43   43   *
  44   44   *   The global zone (zoneid 0) is automatically associated with all
  45   45   *   system resources that have not been bound to a user-created zone.
  46   46   *   This means that even systems where zones are not in active use
  47   47   *   have a global zone, and all processes, mounts, etc. are
  48   48   *   associated with that zone.  The global zone is generally
  49   49   *   unconstrained in terms of privileges and access, though the usual
  50   50   *   credential and privilege based restrictions apply.
  51   51   *
  52   52   *
  53   53   *   Zone States:
  54   54   *
  55   55   *   The states a zone may be in, and the transitions between them, are
  56   56   *   as follows:
  57   57   *
  58   58   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  59   59   *   initialized zone is added to the list of active zones on the system but
  60   60   *   isn't accessible.
  61   61   *
  62   62   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  63   63   *   not yet completed. Not possible to enter the zone, but attributes can
  64   64   *   be retrieved.
  65   65   *
  66   66   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  67   67   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  68   68   *   executed.  A zone remains in this state until it transitions into
  69   69   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  70   70   *
  71   71   *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  72   72   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  73   73   *   state.
  74   74   *
  75   75   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  76   76   *   successfully started init.   A zone remains in this state until
  77   77   *   zone_shutdown() is called.
  78   78   *
  79   79   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  80   80   *   killing all processes running in the zone. The zone remains
  81   81   *   in this state until there are no more user processes running in the zone.
  82   82   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  83   83   *   Since zone_shutdown() is restartable, it may be called successfully
  84   84   *   multiple times for the same zone_t.  Setting of the zone's state to
  85   85   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  86   86   *   the zone's status without worrying about it being a moving target.
  87   87   *
  88   88   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  89   89   *   are no more user processes in the zone.  The zone remains in this
  90   90   *   state until there are no more kernel threads associated with the
  91   91   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  92   92   *   fail.
  93   93   *
  94   94   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  95   95   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  96   96   *   join the zone or create kernel threads therein.
  97   97   *
  98   98   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
  99   99   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 100  100   *   return NULL from now on.
 101  101   *
 102  102   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 103  103   *   processes or threads doing work on behalf of the zone.  The zone is
 104  104   *   removed from the list of active zones.  zone_destroy() returns, and
 105  105   *   the zone can be recreated.
 106  106   *
 107  107   *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 108  108   *   callbacks are executed, and all memory associated with the zone is
 109  109   *   freed.
 110  110   *
 111  111   *   Threads can wait for the zone to enter a requested state by using
 112  112   *   zone_status_wait() or zone_status_timedwait() with the desired
 113  113   *   state passed in as an argument.  Zone state transitions are
 114  114   *   uni-directional; it is not possible to move back to an earlier state.
 115  115   *
 116  116   *
 117  117   *   Zone-Specific Data:
 118  118   *
 119  119   *   Subsystems needing to maintain zone-specific data can store that
 120  120   *   data using the ZSD mechanism.  This provides a zone-specific data
 121  121   *   store, similar to thread-specific data (see pthread_getspecific(3C)
 122  122   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 123  123   *   to register callbacks to be invoked when a zone is created, shut
 124  124   *   down, or destroyed.  This can be used to initialize zone-specific
 125  125   *   data for new zones and to clean up when zones go away.
 126  126   *
 127  127   *
 128  128   *   Data Structures:
 129  129   *
 130  130   *   The per-zone structure (zone_t) is reference counted, and freed
 131  131   *   when all references are released.  zone_hold and zone_rele can be
 132  132   *   used to adjust the reference count.  In addition, reference counts
 133  133   *   associated with the cred_t structure are tracked separately using
 134  134   *   zone_cred_hold and zone_cred_rele.
 135  135   *
 136  136   *   Pointers to active zone_t's are stored in two hash tables; one
 137  137   *   for searching by id, the other for searching by name.  Lookups
 138  138   *   can be performed on either basis, using zone_find_by_id and
 139  139   *   zone_find_by_name.  Both return zone_t pointers with the zone
 140  140   *   held, so zone_rele should be called when the pointer is no longer
 141  141   *   needed.  Zones can also be searched by path; zone_find_by_path
 142  142   *   returns the zone with which a path name is associated (global
 143  143   *   zone if the path is not within some other zone's file system
 144  144   *   hierarchy).  This currently requires iterating through each zone,
 145  145   *   so it is slower than an id or name search via a hash table.
 146  146   *
 147  147   *
 148  148   *   Locking:
 149  149   *
 150  150   *   zonehash_lock: This is a top-level global lock used to protect the
 151  151   *       zone hash tables and lists.  Zones cannot be created or destroyed
 152  152   *       while this lock is held.
 153  153   *   zone_status_lock: This is a global lock protecting zone state.
 154  154   *       Zones cannot change state while this lock is held.  It also
 155  155   *       protects the list of kernel threads associated with a zone.
 156  156   *   zone_lock: This is a per-zone lock used to protect several fields of
 157  157   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 158  158   *       this lock means that the zone cannot go away.
 159  159   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 160  160   *       related to the zone.max-lwps rctl.
 161  161   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 162  162   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 163  163   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 164  164   *       currently just max_lofi
 165  165   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 166  166   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 167  167   *       list (a list of zones in the ZONE_IS_DEAD state).
 168  168   *
 169  169   *   Ordering requirements:
 170  170   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 171  171   *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 172  172   *
 173  173   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 174  174   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 175  175   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 176  176   *
 177  177   *   Blocking memory allocations are permitted while holding any of the
 178  178   *   zone locks.
 179  179   *
 180  180   *
 181  181   *   System Call Interface:
 182  182   *
 183  183   *   The zone subsystem can be managed and queried from user level with
 184  184   *   the following system calls (all subcodes of the primary "zone"
 185  185   *   system call):
 186  186   *   - zone_create: creates a zone with selected attributes (name,
 187  187   *     root path, privileges, resource controls, ZFS datasets)
 188  188   *   - zone_enter: allows the current process to enter a zone
 189  189   *   - zone_getattr: reports attributes of a zone
 190  190   *   - zone_setattr: set attributes of a zone
 191  191   *   - zone_boot: set 'init' running for the zone
 192  192   *   - zone_list: lists all zones active in the system
 193  193   *   - zone_lookup: looks up zone id based on name
 194  194   *   - zone_shutdown: initiates shutdown process (see states above)
 195  195   *   - zone_destroy: completes shutdown process (see states above)
 196  196   *
 197  197   */
 198  198  
 199  199  #include <sys/priv_impl.h>
 200  200  #include <sys/cred.h>
 201  201  #include <c2/audit.h>
 202  202  #include <sys/debug.h>
 203  203  #include <sys/file.h>
 204  204  #include <sys/kmem.h>
 205  205  #include <sys/kstat.h>
 206  206  #include <sys/mutex.h>
 207  207  #include <sys/note.h>
 208  208  #include <sys/pathname.h>
 209  209  #include <sys/proc.h>
 210  210  #include <sys/project.h>
 211  211  #include <sys/sysevent.h>
 212  212  #include <sys/task.h>
 213  213  #include <sys/systm.h>
 214  214  #include <sys/types.h>
 215  215  #include <sys/utsname.h>
 216  216  #include <sys/vnode.h>
 217  217  #include <sys/vfs.h>
 218  218  #include <sys/systeminfo.h>
 219  219  #include <sys/policy.h>
 220  220  #include <sys/cred_impl.h>
 221  221  #include <sys/contract_impl.h>
 222  222  #include <sys/contract/process_impl.h>
 223  223  #include <sys/class.h>
 224  224  #include <sys/pool.h>
 225  225  #include <sys/pool_pset.h>
 226  226  #include <sys/pset.h>
 227  227  #include <sys/strlog.h>
 228  228  #include <sys/sysmacros.h>
 229  229  #include <sys/callb.h>
 230  230  #include <sys/vmparam.h>
 231  231  #include <sys/corectl.h>
 232  232  #include <sys/ipc_impl.h>
 233  233  #include <sys/klpd.h>
 234  234  
 235  235  #include <sys/door.h>
 236  236  #include <sys/cpuvar.h>
 237  237  #include <sys/sdt.h>
 238  238  
 239  239  #include <sys/uadmin.h>
 240  240  #include <sys/session.h>
 241  241  #include <sys/cmn_err.h>
 242  242  #include <sys/modhash.h>
 243  243  #include <sys/sunddi.h>
 244  244  #include <sys/nvpair.h>
 245  245  #include <sys/rctl.h>
 246  246  #include <sys/fss.h>
 247  247  #include <sys/brand.h>
 248  248  #include <sys/zone.h>
 249  249  #include <net/if.h>
 250  250  #include <sys/cpucaps.h>
 251  251  #include <vm/seg.h>
 252  252  #include <sys/mac.h>
 253  253  
 254  254  /*
 255  255   * This constant specifies the number of seconds that threads waiting for
 256  256   * subsystems to release a zone's general-purpose references will wait before
 257  257   * they log the zone's reference counts.  The constant's value shouldn't
 258  258   * be so small that reference counts are unnecessarily reported for zones
 259  259   * whose references are slowly released.  On the other hand, it shouldn't be so
 260  260   * large that users reboot their systems out of frustration over hung zones
 261  261   * before the system logs the zones' reference counts.
 262  262   */
 263  263  #define ZONE_DESTROY_TIMEOUT_SECS       60
 264  264  
 265  265  /* One entry in the list of data link IDs which are accessible from the zone */
 266  266  typedef struct zone_dl {
 267  267          datalink_id_t   zdl_id;         /* data link ID of this entry */
 268  268          nvlist_t        *zdl_net;       /* presumably per-link network data -- TODO confirm */
 269  269          list_node_t     zdl_linkage;    /* list node linking this entry into its list */
 270  270  } zone_dl_t;
 271  271  
 272  272  /*
 273  273   * cv used to signal that all references to the zone have been released.  This
 274  274   * needs to be global since there may be multiple waiters, and the first to
 275  275   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 276  276   */
 277  277  static kcondvar_t zone_destroy_cv;
 278  278  /*
 279  279   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 280  280   * but then we'd need another lock for zone_destroy_cv, and why bother?
 281  281   */
 282  282  static kmutex_t zone_status_lock;
 283  283  
 284  284  /*
 285  285   * ZSD-related global variables.
 286  286   */
 287  287  static kmutex_t zsd_key_lock;   /* protects the following two */
 288  288  /*
 289  289   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 290  290   */
 291  291  static zone_key_t zsd_keyval = 0;
 292  292  /*
 293  293   * Global list of registered keys.  We use this when a new zone is created.
 294  294   */
 295  295  static list_t zsd_registered_keys;
 296  296  
 297  297  int zone_hash_size = 256;
 298  298  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 299  299  static kmutex_t zonehash_lock;
 300  300  static uint_t zonecount;
 301  301  static id_space_t *zoneid_space;
 302  302  
 303  303  /*
 304  304   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 305  305   * kernel proper runs, and which manages all other zones.
 306  306   *
 307  307   * Although not declared as static, the variable "zone0" should not be used
 308  308   * except for by code that needs to reference the global zone early on in boot,
 309  309   * before it is fully initialized.  All other consumers should use
 310  310   * 'global_zone'.
 311  311   */
 312  312  zone_t zone0;
 313  313  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 314  314  
 315  315  /*
 316  316   * List of active zones, protected by zonehash_lock.
 317  317   */
 318  318  static list_t zone_active;
 319  319  
 320  320  /*
 321  321   * List of destroyed zones that still have outstanding cred references.
 322  322   * Used for debugging.  Uses a separate lock to avoid lock ordering
 323  323   * problems in zone_free.
 324  324   */
 325  325  static list_t zone_deathrow;
 326  326  static kmutex_t zone_deathrow_lock;
 327  327  
 328  328  /* number of zones is limited by virtual interface limit in IP */
 329  329  uint_t maxzones = 8192;
 330  330  
 331  331  /* Event channel used to send zone state change notifications */
 332  332  evchan_t *zone_event_chan;
 333  333  
 334  334  /*
 335  335   * Maps each kernel zone state to the state string made visible through
 336  336   * the state notification API, one entry per state in the order given by
 337  337   * the trailing comments.  Only "obvious" states are exposed; states that
 338  338   * are implementation details share a string (e.g. booting reports ready).
 339  339   */
 340  340  const char  *zone_status_table[] = {
 341  341          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 342  342          ZONE_EVENT_INITIALIZED,         /* initialized */
 343  343          ZONE_EVENT_READY,               /* ready */
 344  344          ZONE_EVENT_READY,               /* booting */
 345  345          ZONE_EVENT_RUNNING,             /* running */
 346  346          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 347  347          ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 348  348          ZONE_EVENT_SHUTTING_DOWN,       /* down */
 349  349          ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 350  350          ZONE_EVENT_UNINITIALIZED,       /* dead */
 351  351  };
 352  352  
 353  353  /*
 354  354   * Printable names of the subsystems listed in zone_ref_subsys_t (see
 355  355   * sys/zone.h).  Each entry's trailing comment names the matching constant.
 356  356   */
 357  357  static char *zone_ref_subsys_names[] = {
 358  358          "NFS",          /* ZONE_REF_NFS */
 359  359          "NFSv4",        /* ZONE_REF_NFSV4 */
 360  360          "SMBFS",        /* ZONE_REF_SMBFS */
 361  361          "MNTFS",        /* ZONE_REF_MNTFS */
 362  362          "LOFI",         /* ZONE_REF_LOFI */
 363  363          "VFS",          /* ZONE_REF_VFS */
 364  364          "IPC"           /* ZONE_REF_IPC */
 365  365  };
 366  366  
 367  367  /*
 368  368   * This isn't static so lint doesn't complain.
 369  369   */
 370  370  rctl_hndl_t rc_zone_cpu_shares;
 371  371  rctl_hndl_t rc_zone_locked_mem;
 372  372  rctl_hndl_t rc_zone_max_swap;
 373  373  rctl_hndl_t rc_zone_max_lofi;
 374  374  rctl_hndl_t rc_zone_cpu_cap;
 375  375  rctl_hndl_t rc_zone_nlwps;
 376  376  rctl_hndl_t rc_zone_nprocs;
 377  377  rctl_hndl_t rc_zone_shmmax;
 378  378  rctl_hndl_t rc_zone_shmmni;
 379  379  rctl_hndl_t rc_zone_semmni;
 380  380  rctl_hndl_t rc_zone_msgmni;
 381  381  
 382  382  const char * const zone_default_initname = "/sbin/init";
 383  383  static char * const zone_prefix = "/zone/";
 384  384  static int zone_shutdown(zoneid_t zoneid);
 385  385  static int zone_add_datalink(zoneid_t, datalink_id_t);
 386  386  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 387  387  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 388  388  static int zone_set_network(zoneid_t, zone_net_data_t *);
 389  389  static int zone_get_network(zoneid_t, zone_net_data_t *);
 390  390  
 391  391  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 392  392  
 393  393  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 394  394  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 395  395  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 396  396  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 397  397      zone_key_t);
 398  398  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 399  399  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 400  400      kmutex_t *);
 401  401  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 402  402      kmutex_t *);
 403  403  
 404  404  /*
 405  405   * Bump this number when you alter the zone syscall interfaces; this is
 406  406   * because we need to have support for previous API versions in libc
 407  407   * to support patching; libc calls into the kernel to determine this number.
 408  408   *
 409  409   * Version 1 of the API is the version originally shipped with Solaris 10
 410  410   * Version 2 alters the zone_create system call in order to support more
 411  411   *     arguments by moving the args into a structure; and to do better
 412  412   *     error reporting when zone_create() fails.
 413  413   * Version 3 alters the zone_create system call in order to support the
 414  414   *     import of ZFS datasets to zones.
 415  415   * Version 4 alters the zone_create system call in order to support
 416  416   *     Trusted Extensions.
 417  417   * Version 5 alters the zone_boot system call, and converts its old
 418  418   *     bootargs parameter to be set by the zone_setattr API instead.
 419  419   * Version 6 adds the flag argument to zone_create.
 420  420   */
 421  421  static const int ZONE_SYSCALL_API_VERSION = 6;
 422  422  
 423  423  /*
 424  424   * Certain filesystems (such as NFS and autofs) need to know which zone
 425  425   * the mount is being placed in.  Because of this, we need to be able to
 426  426   * ensure that a zone isn't in the process of being created/destroyed such
 427  427   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 428  428   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 429  429   * mount list. Since a zone can't reside on an NFS file system, we don't
 430  430   * have to worry about the zonepath itself.
 431  431   *
 432  432   * The following functions: block_mounts()/resume_mounts() and
 433  433   * mount_in_progress()/mount_completed() are used by zones and the VFS
 434  434   * layer (respectively) to synchronize zone state transitions and new
 435  435   * mounts within a zone. This synchronization is on a per-zone basis, so
 436  436   * activity for one zone will not interfere with activity for another zone.
 437  437   *
 438  438   * The semantics are like a reader-reader lock such that there may
 439  439   * either be multiple mounts (or zone state transitions, if that weren't
 440  440   * serialized by zonehash_lock) in progress at the same time, but not
 441  441   * both.
 442  442   *
 443  443   * We use cv's so the user can ctrl-C out of the operation if it's
 444  444   * taking too long.
 445  445   *
 446  446   * The semantics are such that there is unfair bias towards the
 447  447   * "current" operation.  This means that zone halt may starve if
 448  448   * there is a rapid succession of new mounts coming in to the zone.
 449  449   */
 450  450  /*
 451  451   * Prevent new mounts from progressing to the point of calling
 452  452   * VFS_MOUNT().  If there are already mounts in this "region", wait for
 453  453   * them to complete.
 454  454   */
 455  455  static int
 456  456  block_mounts(zone_t *zp)
 457  457  {
 458  458          int retval = 0;
 459  459  
 460  460          /*
 461  461           * Since it may block for a long time, block_mounts() shouldn't be
 462  462           * called with zonehash_lock held.
 463  463           */
 464  464          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 465  465          mutex_enter(&zp->zone_mount_lock);
 466  466          while (zp->zone_mounts_in_progress > 0) {
 467  467                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 468  468                          goto signaled;
 469  469          }
 470  470          /*
 471  471           * A negative value of mounts_in_progress indicates that mounts
 472  472           * have been blocked by (-mounts_in_progress) different callers
 473  473           * (remotely possible if two threads enter zone_shutdown at the same
 474  474           * time).
 475  475           */
 476  476          zp->zone_mounts_in_progress--;
 477  477          retval = 1;
 478  478  signaled:
 479  479          mutex_exit(&zp->zone_mount_lock);
 480  480          return (retval);
 481  481  }
 482  482  
 483  483  /*
 484  484   * The VFS layer may progress with new mounts as far as we're concerned.
 485  485   * Allow them to progress if we were the last obstacle.
 486  486   */
 487  487  static void
 488  488  resume_mounts(zone_t *zp)
 489  489  {
 490  490          mutex_enter(&zp->zone_mount_lock);
 491  491          if (++zp->zone_mounts_in_progress == 0)
 492  492                  cv_broadcast(&zp->zone_mount_cv);
 493  493          mutex_exit(&zp->zone_mount_lock);
 494  494  }
 495  495  
 496  496  /*
 497  497   * The VFS layer is busy with a mount; this zone should wait until all
 498  498   * of its mounts are completed to progress.
 499  499   */
 500  500  void
 501  501  mount_in_progress(zone_t *zp)
 502  502  {
 503  503          mutex_enter(&zp->zone_mount_lock);
 504  504          while (zp->zone_mounts_in_progress < 0)
 505  505                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 506  506          zp->zone_mounts_in_progress++;
 507  507          mutex_exit(&zp->zone_mount_lock);
 508  508  }
 509  509  
 510  510  /*
 511  511   * VFS is done with one mount; wake up any waiting block_mounts()
 512  512   * callers if this is the last mount.
 513  513   */
 514  514  void
 515  515  mount_completed(zone_t *zp)
 516  516  {
 517  517          mutex_enter(&zp->zone_mount_lock);
 518  518          if (--zp->zone_mounts_in_progress == 0)
 519  519                  cv_broadcast(&zp->zone_mount_cv);
 520  520          mutex_exit(&zp->zone_mount_lock);
 521  521  }
 522  522  
 523  523  /*
 524  524   * ZSD routines.
 525  525   *
 526  526   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 527  527   * defined by the pthread_key_create() and related interfaces.
 528  528   *
 529  529   * Kernel subsystems may register one or more data items and/or
 530  530   * callbacks to be executed when a zone is created, shutdown, or
 531  531   * destroyed.
 532  532   *
 533  533   * Unlike the thread counterpart, destructor callbacks will be executed
 534  534   * even if the data pointer is NULL and/or there are no constructor
 535  535   * callbacks, so it is the responsibility of such callbacks to check for
 536  536   * NULL data values if necessary.
 537  537   *
 538  538   * The locking strategy and overall picture is as follows:
 539  539   *
 540  540   * When someone calls zone_key_create(), a template ZSD entry is added to the
 541  541   * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 542  542   * holding that lock all the existing zones are marked as
 543  543   * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 544  544   * zone_zsd list (protected by zone_lock). The global list is updated first
 545  545   * (under zsd_key_lock) to make sure that newly created zones use the
 546  546   * most recent list of keys. Then under zonehash_lock we walk the zones
 547  547   * and mark them.  Similar locking is used in zone_key_delete().
 548  548   *
 549  549   * The actual create, shutdown, and destroy callbacks are done without
 550  550   * holding any lock. And zsd_flags are used to ensure that the operations
 551  551   * completed so that when zone_key_create (and zone_create) is done, as well as
 552  552   * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 553  553   * are completed.
 554  554   *
 555  555   * When new zones are created constructor callbacks for all registered ZSD
 556  556   * entries will be called. That also uses the above two phases of marking
 557  557   * what needs to be done, and then running the callbacks without holding
 558  558   * any locks.
 559  559   *
 560  560   * The framework does not provide any locking around zone_getspecific() and
 561  561   * zone_setspecific() apart from that needed for internal consistency, so
 562  562   * callers interested in atomic "test-and-set" semantics will need to provide
 563  563   * their own locking.
 564  564   */
 565  565  
 566  566  /*
 567  567   * Helper function to find the zsd_entry associated with the key in the
 568  568   * given list.
 569  569   */
 570  570  static struct zsd_entry *
 571  571  zsd_find(list_t *l, zone_key_t key)
 572  572  {
 573  573          struct zsd_entry *zsd;
 574  574  
 575  575          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 576  576                  if (zsd->zsd_key == key) {
 577  577                          return (zsd);
 578  578                  }
 579  579          }
 580  580          return (NULL);
 581  581  }
 582  582  
 583  583  /*
 584  584   * Helper function to find the zsd_entry associated with the key in the
 585  585   * given list. Move it to the front of the list.
 586  586   */
 587  587  static struct zsd_entry *
 588  588  zsd_find_mru(list_t *l, zone_key_t key)
 589  589  {
 590  590          struct zsd_entry *zsd;
 591  591  
 592  592          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 593  593                  if (zsd->zsd_key == key) {
 594  594                          /*
 595  595                           * Move to head of list to keep list in MRU order.
 596  596                           */
 597  597                          if (zsd != list_head(l)) {
 598  598                                  list_remove(l, zsd);
 599  599                                  list_insert_head(l, zsd);
 600  600                          }
 601  601                          return (zsd);
 602  602                  }
 603  603          }
 604  604          return (NULL);
 605  605  }
 606  606  
 607  607  void
 608  608  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 609  609      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 610  610  {
 611  611          struct zsd_entry *zsdp;
 612  612          struct zsd_entry *t;
 613  613          struct zone *zone;
 614  614          zone_key_t  key;
 615  615  
 616  616          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 617  617          zsdp->zsd_data = NULL;
 618  618          zsdp->zsd_create = create;
 619  619          zsdp->zsd_shutdown = shutdown;
 620  620          zsdp->zsd_destroy = destroy;
 621  621  
 622  622          /*
 623  623           * Insert in global list of callbacks. Makes future zone creations
 624  624           * see it.
 625  625           */
 626  626          mutex_enter(&zsd_key_lock);
 627  627          key = zsdp->zsd_key = ++zsd_keyval;
 628  628          ASSERT(zsd_keyval != 0);
 629  629          list_insert_tail(&zsd_registered_keys, zsdp);
 630  630          mutex_exit(&zsd_key_lock);
 631  631  
 632  632          /*
 633  633           * Insert for all existing zones and mark them as needing
 634  634           * a create callback.
 635  635           */
 636  636          mutex_enter(&zonehash_lock);    /* stop the world */
 637  637          for (zone = list_head(&zone_active); zone != NULL;
 638  638              zone = list_next(&zone_active, zone)) {
 639  639                  zone_status_t status;
 640  640  
 641  641                  mutex_enter(&zone->zone_lock);
 642  642  
 643  643                  /* Skip zones that are on the way down or not yet up */
 644  644                  status = zone_status_get(zone);
 645  645                  if (status >= ZONE_IS_DOWN ||
 646  646                      status == ZONE_IS_UNINITIALIZED) {
 647  647                          mutex_exit(&zone->zone_lock);
 648  648                          continue;
 649  649                  }
 650  650  
 651  651                  t = zsd_find_mru(&zone->zone_zsd, key);
 652  652                  if (t != NULL) {
 653  653                          /*
 654  654                           * A zsd_configure already inserted it after
 655  655                           * we dropped zsd_key_lock above.
 656  656                           */
 657  657                          mutex_exit(&zone->zone_lock);
 658  658                          continue;
 659  659                  }
 660  660                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 661  661                  t->zsd_key = key;
 662  662                  t->zsd_create = create;
 663  663                  t->zsd_shutdown = shutdown;
 664  664                  t->zsd_destroy = destroy;
 665  665                  if (create != NULL) {
 666  666                          t->zsd_flags = ZSD_CREATE_NEEDED;
 667  667                          DTRACE_PROBE2(zsd__create__needed,
 668  668                              zone_t *, zone, zone_key_t, key);
 669  669                  }
 670  670                  list_insert_tail(&zone->zone_zsd, t);
 671  671                  mutex_exit(&zone->zone_lock);
 672  672          }
 673  673          mutex_exit(&zonehash_lock);
 674  674  
 675  675          if (create != NULL) {
 676  676                  /* Now call the create callback for this key */
 677  677                  zsd_apply_all_zones(zsd_apply_create, key);
 678  678          }
 679  679          /*
 680  680           * It is safe for consumers to use the key now, make it
 681  681           * globally visible. Specifically zone_getspecific() will
 682  682           * always successfully return the zone specific data associated
 683  683           * with the key.
 684  684           */
 685  685          *keyp = key;
 686  686  
 687  687  }
 688  688  
 689  689  /*
 690  690   * Function called when a module is being unloaded, or otherwise wishes
 691  691   * to unregister its ZSD key and callbacks.
 692  692   *
 693  693   * Remove from the global list and determine the functions that need to
 694  694   * be called under a global lock. Then call the functions without
 695  695   * holding any locks. Finally free up the zone_zsd entries. (The apply
 696  696   * functions need to access the zone_zsd entries to find zsd_data etc.)
 697  697   */
 698  698  int
 699  699  zone_key_delete(zone_key_t key)
 700  700  {
 701  701          struct zsd_entry *zsdp = NULL;
 702  702          zone_t *zone;
 703  703  
 704  704          mutex_enter(&zsd_key_lock);
 705  705          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 706  706          if (zsdp == NULL) {
 707  707                  mutex_exit(&zsd_key_lock);
 708  708                  return (-1);
 709  709          }
 710  710          list_remove(&zsd_registered_keys, zsdp);
 711  711          mutex_exit(&zsd_key_lock);
 712  712  
 713  713          mutex_enter(&zonehash_lock);
 714  714          for (zone = list_head(&zone_active); zone != NULL;
 715  715              zone = list_next(&zone_active, zone)) {
 716  716                  struct zsd_entry *del;
 717  717  
 718  718                  mutex_enter(&zone->zone_lock);
 719  719                  del = zsd_find_mru(&zone->zone_zsd, key);
 720  720                  if (del == NULL) {
 721  721                          /*
 722  722                           * Somebody else got here first e.g the zone going
 723  723                           * away.
 724  724                           */
 725  725                          mutex_exit(&zone->zone_lock);
 726  726                          continue;
 727  727                  }
 728  728                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 729  729                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 730  730                  if (del->zsd_shutdown != NULL &&
 731  731                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 732  732                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 733  733                          DTRACE_PROBE2(zsd__shutdown__needed,
 734  734                              zone_t *, zone, zone_key_t, key);
 735  735                  }
 736  736                  if (del->zsd_destroy != NULL &&
 737  737                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 738  738                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 739  739                          DTRACE_PROBE2(zsd__destroy__needed,
 740  740                              zone_t *, zone, zone_key_t, key);
 741  741                  }
 742  742                  mutex_exit(&zone->zone_lock);
 743  743          }
 744  744          mutex_exit(&zonehash_lock);
 745  745          kmem_free(zsdp, sizeof (*zsdp));
 746  746  
 747  747          /* Now call the shutdown and destroy callback for this key */
 748  748          zsd_apply_all_zones(zsd_apply_shutdown, key);
 749  749          zsd_apply_all_zones(zsd_apply_destroy, key);
 750  750  
 751  751          /* Now we can free up the zsdp structures in each zone */
 752  752          mutex_enter(&zonehash_lock);
 753  753          for (zone = list_head(&zone_active); zone != NULL;
 754  754              zone = list_next(&zone_active, zone)) {
 755  755                  struct zsd_entry *del;
 756  756  
 757  757                  mutex_enter(&zone->zone_lock);
 758  758                  del = zsd_find(&zone->zone_zsd, key);
 759  759                  if (del != NULL) {
 760  760                          list_remove(&zone->zone_zsd, del);
 761  761                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 762  762                          kmem_free(del, sizeof (*del));
 763  763                  }
 764  764                  mutex_exit(&zone->zone_lock);
 765  765          }
 766  766          mutex_exit(&zonehash_lock);
 767  767  
 768  768          return (0);
 769  769  }
 770  770  
 771  771  /*
 772  772   * ZSD counterpart of pthread_setspecific().
 773  773   *
 774  774   * Since all zsd callbacks, including those with no create function,
 775  775   * have an entry in zone_zsd, if the key is registered it is part of
 776  776   * the zone_zsd list.
 777  777   * Return an error if the key wasn't registerd.
 778  778   */
 779  779  int
 780  780  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 781  781  {
 782  782          struct zsd_entry *t;
 783  783  
 784  784          mutex_enter(&zone->zone_lock);
 785  785          t = zsd_find_mru(&zone->zone_zsd, key);
 786  786          if (t != NULL) {
 787  787                  /*
 788  788                   * Replace old value with new
 789  789                   */
 790  790                  t->zsd_data = (void *)data;
 791  791                  mutex_exit(&zone->zone_lock);
 792  792                  return (0);
 793  793          }
 794  794          mutex_exit(&zone->zone_lock);
 795  795          return (-1);
 796  796  }
 797  797  
 798  798  /*
 799  799   * ZSD counterpart of pthread_getspecific().
 800  800   */
 801  801  void *
 802  802  zone_getspecific(zone_key_t key, zone_t *zone)
 803  803  {
 804  804          struct zsd_entry *t;
 805  805          void *data;
 806  806  
 807  807          mutex_enter(&zone->zone_lock);
 808  808          t = zsd_find_mru(&zone->zone_zsd, key);
 809  809          data = (t == NULL ? NULL : t->zsd_data);
 810  810          mutex_exit(&zone->zone_lock);
 811  811          return (data);
 812  812  }
 813  813  
 814  814  /*
 815  815   * Function used to initialize a zone's list of ZSD callbacks and data
 816  816   * when the zone is being created.  The callbacks are initialized from
 817  817   * the template list (zsd_registered_keys). The constructor callback is
 818  818   * executed later (once the zone exists and with locks dropped).
 819  819   */
 820  820  static void
 821  821  zone_zsd_configure(zone_t *zone)
 822  822  {
 823  823          struct zsd_entry *zsdp;
 824  824          struct zsd_entry *t;
 825  825  
 826  826          ASSERT(MUTEX_HELD(&zonehash_lock));
 827  827          ASSERT(list_head(&zone->zone_zsd) == NULL);
 828  828          mutex_enter(&zone->zone_lock);
 829  829          mutex_enter(&zsd_key_lock);
 830  830          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 831  831              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 832  832                  /*
 833  833                   * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 834  834                   * should not have added anything to it.
 835  835                   */
 836  836                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 837  837  
 838  838                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 839  839                  t->zsd_key = zsdp->zsd_key;
 840  840                  t->zsd_create = zsdp->zsd_create;
 841  841                  t->zsd_shutdown = zsdp->zsd_shutdown;
 842  842                  t->zsd_destroy = zsdp->zsd_destroy;
 843  843                  if (zsdp->zsd_create != NULL) {
 844  844                          t->zsd_flags = ZSD_CREATE_NEEDED;
 845  845                          DTRACE_PROBE2(zsd__create__needed,
 846  846                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 847  847                  }
 848  848                  list_insert_tail(&zone->zone_zsd, t);
 849  849          }
 850  850          mutex_exit(&zsd_key_lock);
 851  851          mutex_exit(&zone->zone_lock);
 852  852  }
 853  853  
 854  854  enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 855  855  
 856  856  /*
 857  857   * Helper function to execute shutdown or destructor callbacks.
 858  858   */
 859  859  static void
 860  860  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 861  861  {
 862  862          struct zsd_entry *t;
 863  863  
 864  864          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 865  865          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 866  866          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 867  867  
 868  868          /*
 869  869           * Run the callback solely based on what is registered for the zone
 870  870           * in zone_zsd. The global list can change independently of this
 871  871           * as keys are registered and unregistered and we don't register new
 872  872           * callbacks for a zone that is in the process of going away.
 873  873           */
 874  874          mutex_enter(&zone->zone_lock);
 875  875          for (t = list_head(&zone->zone_zsd); t != NULL;
 876  876              t = list_next(&zone->zone_zsd, t)) {
 877  877                  zone_key_t key = t->zsd_key;
 878  878  
 879  879                  /* Skip if no callbacks registered */
 880  880  
 881  881                  if (ct == ZSD_SHUTDOWN) {
 882  882                          if (t->zsd_shutdown != NULL &&
 883  883                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 884  884                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 885  885                                  DTRACE_PROBE2(zsd__shutdown__needed,
 886  886                                      zone_t *, zone, zone_key_t, key);
 887  887                          }
 888  888                  } else {
 889  889                          if (t->zsd_destroy != NULL &&
 890  890                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 891  891                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 892  892                                  DTRACE_PROBE2(zsd__destroy__needed,
 893  893                                      zone_t *, zone, zone_key_t, key);
 894  894                          }
 895  895                  }
 896  896          }
 897  897          mutex_exit(&zone->zone_lock);
 898  898  
 899  899          /* Now call the shutdown and destroy callback for this key */
 900  900          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 901  901          zsd_apply_all_keys(zsd_apply_destroy, zone);
 902  902  
 903  903  }
 904  904  
 905  905  /*
 906  906   * Called when the zone is going away; free ZSD-related memory, and
 907  907   * destroy the zone_zsd list.
 908  908   */
 909  909  static void
 910  910  zone_free_zsd(zone_t *zone)
 911  911  {
 912  912          struct zsd_entry *t, *next;
 913  913  
 914  914          /*
 915  915           * Free all the zsd_entry's we had on this zone.
 916  916           */
 917  917          mutex_enter(&zone->zone_lock);
 918  918          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 919  919                  next = list_next(&zone->zone_zsd, t);
 920  920                  list_remove(&zone->zone_zsd, t);
 921  921                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 922  922                  kmem_free(t, sizeof (*t));
 923  923          }
 924  924          list_destroy(&zone->zone_zsd);
 925  925          mutex_exit(&zone->zone_lock);
 926  926  
 927  927  }
 928  928  
 929  929  /*
 930  930   * Apply a function to all zones for particular key value.
 931  931   *
 932  932   * The applyfn has to drop zonehash_lock if it does some work, and
 933  933   * then reacquire it before it returns.
 934  934   * When the lock is dropped we don't follow list_next even
 935  935   * if it is possible to do so without any hazards. This is
 936  936   * because we want the design to allow for the list of zones
 937  937   * to change in any arbitrary way during the time the
 938  938   * lock was dropped.
 939  939   *
 940  940   * It is safe to restart the loop at list_head since the applyfn
 941  941   * changes the zsd_flags as it does work, so a subsequent
 942  942   * pass through will have no effect in applyfn, hence the loop will terminate
 943  943   * in at worst O(N^2).
 944  944   */
 945  945  static void
 946  946  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 947  947  {
 948  948          zone_t *zone;
 949  949  
 950  950          mutex_enter(&zonehash_lock);
 951  951          zone = list_head(&zone_active);
 952  952          while (zone != NULL) {
 953  953                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 954  954                          /* Lock dropped - restart at head */
 955  955                          zone = list_head(&zone_active);
 956  956                  } else {
 957  957                          zone = list_next(&zone_active, zone);
 958  958                  }
 959  959          }
 960  960          mutex_exit(&zonehash_lock);
 961  961  }
 962  962  
 963  963  /*
 964  964   * Apply a function to all keys for a particular zone.
 965  965   *
 966  966   * The applyfn has to drop zonehash_lock if it does some work, and
 967  967   * then reacquire it before it returns.
 968  968   * When the lock is dropped we don't follow list_next even
 969  969   * if it is possible to do so without any hazards. This is
 970  970   * because we want the design to allow for the list of zsd callbacks
 971  971   * to change in any arbitrary way during the time the
 972  972   * lock was dropped.
 973  973   *
 974  974   * It is safe to restart the loop at list_head since the applyfn
 975  975   * changes the zsd_flags as it does work, so a subsequent
 976  976   * pass through will have no effect in applyfn, hence the loop will terminate
 977  977   * in at worst O(N^2).
 978  978   */
 979  979  static void
 980  980  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 981  981  {
 982  982          struct zsd_entry *t;
 983  983  
 984  984          mutex_enter(&zone->zone_lock);
 985  985          t = list_head(&zone->zone_zsd);
 986  986          while (t != NULL) {
 987  987                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 988  988                          /* Lock dropped - restart at head */
 989  989                          t = list_head(&zone->zone_zsd);
 990  990                  } else {
 991  991                          t = list_next(&zone->zone_zsd, t);
 992  992                  }
 993  993          }
 994  994          mutex_exit(&zone->zone_lock);
 995  995  }
 996  996  
 997  997  /*
 998  998   * Call the create function for the zone and key if CREATE_NEEDED
 999  999   * is set.
1000 1000   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1001 1001   * we wait for that thread to complete so that we can ensure that
1002 1002   * all the callbacks are done when we've looped over all zones/keys.
1003 1003   *
1004 1004   * When we call the create function, we drop the global held by the
1005 1005   * caller, and return true to tell the caller it needs to re-evalute the
1006 1006   * state.
1007 1007   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1008 1008   * remains held on exit.
1009 1009   */
1010 1010  static boolean_t
1011 1011  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1012 1012      zone_t *zone, zone_key_t key)
1013 1013  {
1014 1014          void *result;
1015 1015          struct zsd_entry *t;
1016 1016          boolean_t dropped;
1017 1017  
1018 1018          if (lockp != NULL) {
1019 1019                  ASSERT(MUTEX_HELD(lockp));
1020 1020          }
1021 1021          if (zone_lock_held) {
1022 1022                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1023 1023          } else {
1024 1024                  mutex_enter(&zone->zone_lock);
1025 1025          }
1026 1026  
1027 1027          t = zsd_find(&zone->zone_zsd, key);
1028 1028          if (t == NULL) {
1029 1029                  /*
1030 1030                   * Somebody else got here first e.g the zone going
1031 1031                   * away.
1032 1032                   */
1033 1033                  if (!zone_lock_held)
1034 1034                          mutex_exit(&zone->zone_lock);
1035 1035                  return (B_FALSE);
1036 1036          }
1037 1037          dropped = B_FALSE;
1038 1038          if (zsd_wait_for_inprogress(zone, t, lockp))
1039 1039                  dropped = B_TRUE;
1040 1040  
1041 1041          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1042 1042                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1043 1043                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1044 1044                  DTRACE_PROBE2(zsd__create__inprogress,
1045 1045                      zone_t *, zone, zone_key_t, key);
1046 1046                  mutex_exit(&zone->zone_lock);
1047 1047                  if (lockp != NULL)
1048 1048                          mutex_exit(lockp);
1049 1049  
1050 1050                  dropped = B_TRUE;
1051 1051                  ASSERT(t->zsd_create != NULL);
1052 1052                  DTRACE_PROBE2(zsd__create__start,
1053 1053                      zone_t *, zone, zone_key_t, key);
1054 1054  
1055 1055                  result = (*t->zsd_create)(zone->zone_id);
1056 1056  
1057 1057                  DTRACE_PROBE2(zsd__create__end,
1058 1058                      zone_t *, zone, voidn *, result);
1059 1059  
1060 1060                  ASSERT(result != NULL);
1061 1061                  if (lockp != NULL)
1062 1062                          mutex_enter(lockp);
1063 1063                  mutex_enter(&zone->zone_lock);
1064 1064                  t->zsd_data = result;
1065 1065                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1066 1066                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
1067 1067                  cv_broadcast(&t->zsd_cv);
1068 1068                  DTRACE_PROBE2(zsd__create__completed,
1069 1069                      zone_t *, zone, zone_key_t, key);
1070 1070          }
1071 1071          if (!zone_lock_held)
1072 1072                  mutex_exit(&zone->zone_lock);
1073 1073          return (dropped);
1074 1074  }
1075 1075  
1076 1076  /*
1077 1077   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1078 1078   * is set.
1079 1079   * If some other thread gets here first and sets *_INPROGRESS, then
1080 1080   * we wait for that thread to complete so that we can ensure that
1081 1081   * all the callbacks are done when we've looped over all zones/keys.
1082 1082   *
1083 1083   * When we call the shutdown function, we drop the global held by the
1084 1084   * caller, and return true to tell the caller it needs to re-evalute the
1085 1085   * state.
1086 1086   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1087 1087   * remains held on exit.
1088 1088   */
1089 1089  static boolean_t
1090 1090  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1091 1091      zone_t *zone, zone_key_t key)
1092 1092  {
1093 1093          struct zsd_entry *t;
1094 1094          void *data;
1095 1095          boolean_t dropped;
1096 1096  
1097 1097          if (lockp != NULL) {
1098 1098                  ASSERT(MUTEX_HELD(lockp));
1099 1099          }
1100 1100          if (zone_lock_held) {
1101 1101                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1102 1102          } else {
1103 1103                  mutex_enter(&zone->zone_lock);
1104 1104          }
1105 1105  
1106 1106          t = zsd_find(&zone->zone_zsd, key);
1107 1107          if (t == NULL) {
1108 1108                  /*
1109 1109                   * Somebody else got here first e.g the zone going
1110 1110                   * away.
1111 1111                   */
1112 1112                  if (!zone_lock_held)
1113 1113                          mutex_exit(&zone->zone_lock);
1114 1114                  return (B_FALSE);
1115 1115          }
1116 1116          dropped = B_FALSE;
1117 1117          if (zsd_wait_for_creator(zone, t, lockp))
1118 1118                  dropped = B_TRUE;
1119 1119  
1120 1120          if (zsd_wait_for_inprogress(zone, t, lockp))
1121 1121                  dropped = B_TRUE;
1122 1122  
1123 1123          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1124 1124                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1125 1125                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1126 1126                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1127 1127                      zone_t *, zone, zone_key_t, key);
1128 1128                  mutex_exit(&zone->zone_lock);
1129 1129                  if (lockp != NULL)
1130 1130                          mutex_exit(lockp);
1131 1131                  dropped = B_TRUE;
1132 1132  
1133 1133                  ASSERT(t->zsd_shutdown != NULL);
1134 1134                  data = t->zsd_data;
1135 1135  
1136 1136                  DTRACE_PROBE2(zsd__shutdown__start,
1137 1137                      zone_t *, zone, zone_key_t, key);
1138 1138  
1139 1139                  (t->zsd_shutdown)(zone->zone_id, data);
1140 1140                  DTRACE_PROBE2(zsd__shutdown__end,
1141 1141                      zone_t *, zone, zone_key_t, key);
1142 1142  
1143 1143                  if (lockp != NULL)
1144 1144                          mutex_enter(lockp);
1145 1145                  mutex_enter(&zone->zone_lock);
1146 1146                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1147 1147                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1148 1148                  cv_broadcast(&t->zsd_cv);
1149 1149                  DTRACE_PROBE2(zsd__shutdown__completed,
1150 1150                      zone_t *, zone, zone_key_t, key);
1151 1151          }
1152 1152          if (!zone_lock_held)
1153 1153                  mutex_exit(&zone->zone_lock);
1154 1154          return (dropped);
1155 1155  }
1156 1156  
1157 1157  /*
1158 1158   * Call the destroy function for the zone and key if DESTROY_NEEDED
1159 1159   * is set.
1160 1160   * If some other thread gets here first and sets *_INPROGRESS, then
1161 1161   * we wait for that thread to complete so that we can ensure that
1162 1162   * all the callbacks are done when we've looped over all zones/keys.
1163 1163   *
1164 1164   * When we call the destroy function, we drop the global held by the
1165 1165   * caller, and return true to tell the caller it needs to re-evalute the
1166 1166   * state.
1167 1167   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1168 1168   * remains held on exit.
1169 1169   */
1170 1170  static boolean_t
1171 1171  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1172 1172      zone_t *zone, zone_key_t key)
1173 1173  {
1174 1174          struct zsd_entry *t;
1175 1175          void *data;
1176 1176          boolean_t dropped;
1177 1177  
1178 1178          if (lockp != NULL) {
1179 1179                  ASSERT(MUTEX_HELD(lockp));
1180 1180          }
1181 1181          if (zone_lock_held) {
1182 1182                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1183 1183          } else {
1184 1184                  mutex_enter(&zone->zone_lock);
1185 1185          }
1186 1186  
1187 1187          t = zsd_find(&zone->zone_zsd, key);
1188 1188          if (t == NULL) {
1189 1189                  /*
1190 1190                   * Somebody else got here first e.g the zone going
1191 1191                   * away.
1192 1192                   */
1193 1193                  if (!zone_lock_held)
1194 1194                          mutex_exit(&zone->zone_lock);
1195 1195                  return (B_FALSE);
1196 1196          }
1197 1197          dropped = B_FALSE;
1198 1198          if (zsd_wait_for_creator(zone, t, lockp))
1199 1199                  dropped = B_TRUE;
1200 1200  
1201 1201          if (zsd_wait_for_inprogress(zone, t, lockp))
1202 1202                  dropped = B_TRUE;
1203 1203  
1204 1204          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1205 1205                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1206 1206                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1207 1207                  DTRACE_PROBE2(zsd__destroy__inprogress,
1208 1208                      zone_t *, zone, zone_key_t, key);
1209 1209                  mutex_exit(&zone->zone_lock);
1210 1210                  if (lockp != NULL)
1211 1211                          mutex_exit(lockp);
1212 1212                  dropped = B_TRUE;
1213 1213  
1214 1214                  ASSERT(t->zsd_destroy != NULL);
1215 1215                  data = t->zsd_data;
1216 1216                  DTRACE_PROBE2(zsd__destroy__start,
1217 1217                      zone_t *, zone, zone_key_t, key);
1218 1218  
1219 1219                  (t->zsd_destroy)(zone->zone_id, data);
1220 1220                  DTRACE_PROBE2(zsd__destroy__end,
1221 1221                      zone_t *, zone, zone_key_t, key);
1222 1222  
1223 1223                  if (lockp != NULL)
1224 1224                          mutex_enter(lockp);
1225 1225                  mutex_enter(&zone->zone_lock);
1226 1226                  t->zsd_data = NULL;
1227 1227                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1228 1228                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1229 1229                  cv_broadcast(&t->zsd_cv);
1230 1230                  DTRACE_PROBE2(zsd__destroy__completed,
1231 1231                      zone_t *, zone, zone_key_t, key);
1232 1232          }
1233 1233          if (!zone_lock_held)
1234 1234                  mutex_exit(&zone->zone_lock);
1235 1235          return (dropped);
1236 1236  }
1237 1237  
1238 1238  /*
1239 1239   * Wait for any CREATE_NEEDED flag to be cleared.
1240 1240   * Returns true if lockp was temporarily dropped while waiting.
1241 1241   */
1242 1242  static boolean_t
1243 1243  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1244 1244  {
1245 1245          boolean_t dropped = B_FALSE;
1246 1246  
1247 1247          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1248 1248                  DTRACE_PROBE2(zsd__wait__for__creator,
1249 1249                      zone_t *, zone, struct zsd_entry *, t);
1250 1250                  if (lockp != NULL) {
1251 1251                          dropped = B_TRUE;
1252 1252                          mutex_exit(lockp);
1253 1253                  }
1254 1254                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1255 1255                  if (lockp != NULL) {
1256 1256                          /* First drop zone_lock to preserve order */
1257 1257                          mutex_exit(&zone->zone_lock);
1258 1258                          mutex_enter(lockp);
1259 1259                          mutex_enter(&zone->zone_lock);
1260 1260                  }
1261 1261          }
1262 1262          return (dropped);
1263 1263  }
1264 1264  
1265 1265  /*
1266 1266   * Wait for any INPROGRESS flag to be cleared.
1267 1267   * Returns true if lockp was temporarily dropped while waiting.
1268 1268   */
1269 1269  static boolean_t
1270 1270  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1271 1271  {
1272 1272          boolean_t dropped = B_FALSE;
1273 1273  
1274 1274          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1275 1275                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1276 1276                      zone_t *, zone, struct zsd_entry *, t);
1277 1277                  if (lockp != NULL) {
1278 1278                          dropped = B_TRUE;
1279 1279                          mutex_exit(lockp);
1280 1280                  }
1281 1281                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1282 1282                  if (lockp != NULL) {
1283 1283                          /* First drop zone_lock to preserve order */
1284 1284                          mutex_exit(&zone->zone_lock);
1285 1285                          mutex_enter(lockp);
1286 1286                          mutex_enter(&zone->zone_lock);
1287 1287                  }
1288 1288          }
1289 1289          return (dropped);
1290 1290  }
1291 1291  
1292 1292  /*
1293 1293   * Frees memory associated with the zone dataset list.
1294 1294   */
1295 1295  static void
1296 1296  zone_free_datasets(zone_t *zone)
1297 1297  {
1298 1298          zone_dataset_t *t, *next;
1299 1299  
1300 1300          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1301 1301                  next = list_next(&zone->zone_datasets, t);
1302 1302                  list_remove(&zone->zone_datasets, t);
1303 1303                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1304 1304                  kmem_free(t, sizeof (*t));
1305 1305          }
1306 1306          list_destroy(&zone->zone_datasets);
1307 1307  }
1308 1308  
1309 1309  /*
1310 1310   * zone.cpu-shares resource control support.
1311 1311   */
1312 1312  /*ARGSUSED*/
1313 1313  static rctl_qty_t
1314 1314  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1315 1315  {
1316 1316          ASSERT(MUTEX_HELD(&p->p_lock));
1317 1317          return (p->p_zone->zone_shares);
1318 1318  }
1319 1319  
1320 1320  /*ARGSUSED*/
1321 1321  static int
1322 1322  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1323 1323      rctl_qty_t nv)
1324 1324  {
1325 1325          ASSERT(MUTEX_HELD(&p->p_lock));
1326 1326          ASSERT(e->rcep_t == RCENTITY_ZONE);
1327 1327          if (e->rcep_p.zone == NULL)
1328 1328                  return (0);
1329 1329  
1330 1330          e->rcep_p.zone->zone_shares = nv;
1331 1331          return (0);
1332 1332  }
1333 1333  
/*
 * Operations vector for the zone.cpu-shares resource control.  The fields
 * are filled positionally; the slot roles noted below are presumed from the
 * names of the functions placed in them -- confirm against the rctl_ops_t
 * definition.
 */
static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,                 /* presumably the action slot */
        zone_cpu_shares_usage,          /* presumably the usage slot */
        zone_cpu_shares_set,            /* presumably the set slot */
        rcop_no_test                    /* presumably the test slot */
};
1340 1340  
1341 1341  /*
1342 1342   * zone.cpu-cap resource control support.
1343 1343   */
1344 1344  /*ARGSUSED*/
1345 1345  static rctl_qty_t
1346 1346  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1347 1347  {
1348 1348          ASSERT(MUTEX_HELD(&p->p_lock));
1349 1349          return (cpucaps_zone_get(p->p_zone));
1350 1350  }
1351 1351  
1352 1352  /*ARGSUSED*/
1353 1353  static int
1354 1354  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1355 1355      rctl_qty_t nv)
1356 1356  {
1357 1357          zone_t *zone = e->rcep_p.zone;
1358 1358  
1359 1359          ASSERT(MUTEX_HELD(&p->p_lock));
1360 1360          ASSERT(e->rcep_t == RCENTITY_ZONE);
1361 1361  
1362 1362          if (zone == NULL)
1363 1363                  return (0);
1364 1364  
1365 1365          /*
1366 1366           * set cap to the new value.
1367 1367           */
1368 1368          return (cpucaps_zone_set(zone, nv));
1369 1369  }
1370 1370  
1371 1371  static rctl_ops_t zone_cpu_cap_ops = {	/* zone.cpu-cap callbacks */
1372 1372          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1373 1373          zone_cpu_cap_get,	/* usage slot: queries cpucaps_zone_get() */
1374 1374          zone_cpu_cap_set,	/* set slot: forwards to cpucaps_zone_set() */
1375 1375          rcop_no_test	/* test slot: rcop_no_test placeholder */
1376 1376  };
1377 1377  
1378 1378  /*ARGSUSED*/
1379 1379  static rctl_qty_t
1380 1380  zone_lwps_usage(rctl_t *r, proc_t *p)
1381 1381  {
1382 1382          rctl_qty_t nlwps;
1383 1383          zone_t *zone = p->p_zone;
1384 1384  
1385 1385          ASSERT(MUTEX_HELD(&p->p_lock));
1386 1386  
1387 1387          mutex_enter(&zone->zone_nlwps_lock);
1388 1388          nlwps = zone->zone_nlwps;
1389 1389          mutex_exit(&zone->zone_nlwps_lock);
1390 1390  
1391 1391          return (nlwps);
1392 1392  }
1393 1393  
1394 1394  /*ARGSUSED*/
1395 1395  static int
1396 1396  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1397 1397      rctl_qty_t incr, uint_t flags)
1398 1398  {
1399 1399          rctl_qty_t nlwps;
1400 1400  
1401 1401          ASSERT(MUTEX_HELD(&p->p_lock));
1402 1402          ASSERT(e->rcep_t == RCENTITY_ZONE);
1403 1403          if (e->rcep_p.zone == NULL)
1404 1404                  return (0);
1405 1405          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1406 1406          nlwps = e->rcep_p.zone->zone_nlwps;
1407 1407  
1408 1408          if (nlwps + incr > rcntl->rcv_value)
1409 1409                  return (1);
1410 1410  
1411 1411          return (0);
1412 1412  }
1413 1413  
1414 1414  /*ARGSUSED*/
1415 1415  static int
1416 1416  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1417 1417  {
1418 1418          ASSERT(MUTEX_HELD(&p->p_lock));
1419 1419          ASSERT(e->rcep_t == RCENTITY_ZONE);
1420 1420          if (e->rcep_p.zone == NULL)
1421 1421                  return (0);
1422 1422          e->rcep_p.zone->zone_nlwps_ctl = nv;
1423 1423          return (0);
1424 1424  }
1425 1425  
1426 1426  static rctl_ops_t zone_lwps_ops = {	/* zone LWP-count callbacks */
1427 1427          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1428 1428          zone_lwps_usage,	/* usage slot: reports zone_nlwps */
1429 1429          zone_lwps_set,	/* set slot: stores zone_nlwps_ctl */
1430 1430          zone_lwps_test,	/* test slot: checks zone_nlwps + incr */
1431 1431  };
1432 1432  
1433 1433  /*ARGSUSED*/
1434 1434  static rctl_qty_t
1435 1435  zone_procs_usage(rctl_t *r, proc_t *p)
1436 1436  {
1437 1437          rctl_qty_t nprocs;
1438 1438          zone_t *zone = p->p_zone;
1439 1439  
1440 1440          ASSERT(MUTEX_HELD(&p->p_lock));
1441 1441  
1442 1442          mutex_enter(&zone->zone_nlwps_lock);
1443 1443          nprocs = zone->zone_nprocs;
1444 1444          mutex_exit(&zone->zone_nlwps_lock);
1445 1445  
1446 1446          return (nprocs);
1447 1447  }
1448 1448  
1449 1449  /*ARGSUSED*/
1450 1450  static int
1451 1451  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1452 1452      rctl_qty_t incr, uint_t flags)
1453 1453  {
1454 1454          rctl_qty_t nprocs;
1455 1455  
1456 1456          ASSERT(MUTEX_HELD(&p->p_lock));
1457 1457          ASSERT(e->rcep_t == RCENTITY_ZONE);
1458 1458          if (e->rcep_p.zone == NULL)
1459 1459                  return (0);
1460 1460          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1461 1461          nprocs = e->rcep_p.zone->zone_nprocs;
1462 1462  
1463 1463          if (nprocs + incr > rcntl->rcv_value)
1464 1464                  return (1);
1465 1465  
1466 1466          return (0);
1467 1467  }
1468 1468  
1469 1469  /*ARGSUSED*/
1470 1470  static int
1471 1471  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1472 1472  {
1473 1473          ASSERT(MUTEX_HELD(&p->p_lock));
1474 1474          ASSERT(e->rcep_t == RCENTITY_ZONE);
1475 1475          if (e->rcep_p.zone == NULL)
1476 1476                  return (0);
1477 1477          e->rcep_p.zone->zone_nprocs_ctl = nv;
1478 1478          return (0);
1479 1479  }
1480 1480  
1481 1481  static rctl_ops_t zone_procs_ops = {	/* zone process-count callbacks */
1482 1482          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1483 1483          zone_procs_usage,	/* usage slot: reports zone_nprocs */
1484 1484          zone_procs_set,	/* set slot: stores zone_nprocs_ctl */
1485 1485          zone_procs_test,	/* test slot: checks zone_nprocs + incr */
1486 1486  };
1487 1487  
1488 1488  /*ARGSUSED*/
1489 1489  static int
1490 1490  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1491 1491      rctl_qty_t incr, uint_t flags)
1492 1492  {
1493 1493          rctl_qty_t v;
1494 1494          ASSERT(MUTEX_HELD(&p->p_lock));
1495 1495          ASSERT(e->rcep_t == RCENTITY_ZONE);
1496 1496          v = e->rcep_p.zone->zone_shmmax + incr;
1497 1497          if (v > rval->rcv_value)
1498 1498                  return (1);
1499 1499          return (0);
1500 1500  }
1501 1501  
1502 1502  static rctl_ops_t zone_shmmax_ops = {	/* zone shared-memory size callbacks */
1503 1503          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1504 1504          rcop_no_usage,	/* usage slot: rcop_no_usage placeholder */
1505 1505          rcop_no_set,	/* set slot: rcop_no_set placeholder */
1506 1506          zone_shmmax_test	/* test slot: checks zone_shmmax + incr */
1507 1507  };
1508 1508  
1509 1509  /*ARGSUSED*/
1510 1510  static int
1511 1511  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1512 1512      rctl_qty_t incr, uint_t flags)
1513 1513  {
1514 1514          rctl_qty_t v;
1515 1515          ASSERT(MUTEX_HELD(&p->p_lock));
1516 1516          ASSERT(e->rcep_t == RCENTITY_ZONE);
1517 1517          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1518 1518          if (v > rval->rcv_value)
1519 1519                  return (1);
1520 1520          return (0);
1521 1521  }
1522 1522  
1523 1523  static rctl_ops_t zone_shmmni_ops = {	/* zone shared-memory ID callbacks */
1524 1524          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1525 1525          rcop_no_usage,	/* usage slot: rcop_no_usage placeholder */
1526 1526          rcop_no_set,	/* set slot: rcop_no_set placeholder */
1527 1527          zone_shmmni_test	/* test slot: checks ipcq_shmmni + incr */
1528 1528  };
1529 1529  
1530 1530  /*ARGSUSED*/
1531 1531  static int
1532 1532  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1533 1533      rctl_qty_t incr, uint_t flags)
1534 1534  {
1535 1535          rctl_qty_t v;
1536 1536          ASSERT(MUTEX_HELD(&p->p_lock));
1537 1537          ASSERT(e->rcep_t == RCENTITY_ZONE);
1538 1538          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1539 1539          if (v > rval->rcv_value)
1540 1540                  return (1);
1541 1541          return (0);
1542 1542  }
1543 1543  
1544 1544  static rctl_ops_t zone_semmni_ops = {	/* zone semaphore ID callbacks */
1545 1545          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1546 1546          rcop_no_usage,	/* usage slot: rcop_no_usage placeholder */
1547 1547          rcop_no_set,	/* set slot: rcop_no_set placeholder */
1548 1548          zone_semmni_test	/* test slot: checks ipcq_semmni + incr */
1549 1549  };
1550 1550  
1551 1551  /*ARGSUSED*/
1552 1552  static int
1553 1553  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1554 1554      rctl_qty_t incr, uint_t flags)
1555 1555  {
1556 1556          rctl_qty_t v;
1557 1557          ASSERT(MUTEX_HELD(&p->p_lock));
1558 1558          ASSERT(e->rcep_t == RCENTITY_ZONE);
1559 1559          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1560 1560          if (v > rval->rcv_value)
1561 1561                  return (1);
1562 1562          return (0);
1563 1563  }
1564 1564  
1565 1565  static rctl_ops_t zone_msgmni_ops = {	/* zone message-queue ID callbacks */
1566 1566          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1567 1567          rcop_no_usage,	/* usage slot: rcop_no_usage placeholder */
1568 1568          rcop_no_set,	/* set slot: rcop_no_set placeholder */
1569 1569          zone_msgmni_test	/* test slot: checks ipcq_msgmni + incr */
1570 1570  };
1571 1571  
1572 1572  /*ARGSUSED*/
1573 1573  static rctl_qty_t
1574 1574  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1575 1575  {
1576 1576          rctl_qty_t q;
1577 1577          ASSERT(MUTEX_HELD(&p->p_lock));
1578 1578          mutex_enter(&p->p_zone->zone_mem_lock);
1579 1579          q = p->p_zone->zone_locked_mem;
1580 1580          mutex_exit(&p->p_zone->zone_mem_lock);
1581 1581          return (q);
1582 1582  }
1583 1583  
1584 1584  /*ARGSUSED*/
1585 1585  static int
1586 1586  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1587 1587      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1588 1588  {
1589 1589          rctl_qty_t q;
1590 1590          zone_t *z;
1591 1591  
1592 1592          z = e->rcep_p.zone;
1593 1593          ASSERT(MUTEX_HELD(&p->p_lock));
1594 1594          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1595 1595          q = z->zone_locked_mem;
1596 1596          if (q + incr > rcntl->rcv_value)
1597 1597                  return (1);
1598 1598          return (0);
1599 1599  }
1600 1600  
1601 1601  /*ARGSUSED*/
1602 1602  static int
1603 1603  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1604 1604      rctl_qty_t nv)
1605 1605  {
1606 1606          ASSERT(MUTEX_HELD(&p->p_lock));
1607 1607          ASSERT(e->rcep_t == RCENTITY_ZONE);
1608 1608          if (e->rcep_p.zone == NULL)
1609 1609                  return (0);
1610 1610          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1611 1611          return (0);
1612 1612  }
1613 1613  
1614 1614  static rctl_ops_t zone_locked_mem_ops = {	/* zone locked-memory callbacks */
1615 1615          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1616 1616          zone_locked_mem_usage,	/* usage slot: reports zone_locked_mem */
1617 1617          zone_locked_mem_set,	/* set slot: stores zone_locked_mem_ctl */
1618 1618          zone_locked_mem_test	/* test slot: checks zone_locked_mem + incr */
1619 1619  };
1620 1620  
1621 1621  /*ARGSUSED*/
1622 1622  static rctl_qty_t
1623 1623  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1624 1624  {
1625 1625          rctl_qty_t q;
1626 1626          zone_t *z = p->p_zone;
1627 1627  
1628 1628          ASSERT(MUTEX_HELD(&p->p_lock));
1629 1629          mutex_enter(&z->zone_mem_lock);
1630 1630          q = z->zone_max_swap;
1631 1631          mutex_exit(&z->zone_mem_lock);
1632 1632          return (q);
1633 1633  }
1634 1634  
1635 1635  /*ARGSUSED*/
1636 1636  static int
1637 1637  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1638 1638      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1639 1639  {
1640 1640          rctl_qty_t q;
1641 1641          zone_t *z;
1642 1642  
1643 1643          z = e->rcep_p.zone;
1644 1644          ASSERT(MUTEX_HELD(&p->p_lock));
1645 1645          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1646 1646          q = z->zone_max_swap;
1647 1647          if (q + incr > rcntl->rcv_value)
1648 1648                  return (1);
1649 1649          return (0);
1650 1650  }
1651 1651  
1652 1652  /*ARGSUSED*/
1653 1653  static int
1654 1654  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1655 1655      rctl_qty_t nv)
1656 1656  {
1657 1657          ASSERT(MUTEX_HELD(&p->p_lock));
1658 1658          ASSERT(e->rcep_t == RCENTITY_ZONE);
1659 1659          if (e->rcep_p.zone == NULL)
1660 1660                  return (0);
1661 1661          e->rcep_p.zone->zone_max_swap_ctl = nv;
1662 1662          return (0);
1663 1663  }
1664 1664  
1665 1665  static rctl_ops_t zone_max_swap_ops = {	/* zone swap callbacks */
1666 1666          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1667 1667          zone_max_swap_usage,	/* usage slot: reports zone_max_swap */
1668 1668          zone_max_swap_set,	/* set slot: stores zone_max_swap_ctl */
1669 1669          zone_max_swap_test	/* test slot: checks zone_max_swap + incr */
1670 1670  };
1671 1671  
1672 1672  /*ARGSUSED*/
1673 1673  static rctl_qty_t
1674 1674  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1675 1675  {
1676 1676          rctl_qty_t q;
1677 1677          zone_t *z = p->p_zone;
1678 1678  
1679 1679          ASSERT(MUTEX_HELD(&p->p_lock));
1680 1680          mutex_enter(&z->zone_rctl_lock);
1681 1681          q = z->zone_max_lofi;
1682 1682          mutex_exit(&z->zone_rctl_lock);
1683 1683          return (q);
1684 1684  }
1685 1685  
1686 1686  /*ARGSUSED*/
1687 1687  static int
1688 1688  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1689 1689      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1690 1690  {
1691 1691          rctl_qty_t q;
1692 1692          zone_t *z;
1693 1693  
1694 1694          z = e->rcep_p.zone;
1695 1695          ASSERT(MUTEX_HELD(&p->p_lock));
1696 1696          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1697 1697          q = z->zone_max_lofi;
1698 1698          if (q + incr > rcntl->rcv_value)
1699 1699                  return (1);
1700 1700          return (0);
1701 1701  }
1702 1702  
1703 1703  /*ARGSUSED*/
1704 1704  static int
1705 1705  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1706 1706      rctl_qty_t nv)
1707 1707  {
1708 1708          ASSERT(MUTEX_HELD(&p->p_lock));
1709 1709          ASSERT(e->rcep_t == RCENTITY_ZONE);
1710 1710          if (e->rcep_p.zone == NULL)
1711 1711                  return (0);
1712 1712          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1713 1713          return (0);
1714 1714  }
1715 1715  
1716 1716  static rctl_ops_t zone_max_lofi_ops = {	/* zone lofi callbacks */
1717 1717          rcop_no_action,	/* action slot: rcop_no_action placeholder */
1718 1718          zone_max_lofi_usage,	/* usage slot: reports zone_max_lofi */
1719 1719          zone_max_lofi_set,	/* set slot: stores zone_max_lofi_ctl */
1720 1720          zone_max_lofi_test	/* test slot: checks zone_max_lofi + incr */
1721 1721  };
1722 1722  
1723 1723  /*
1724 1724   * Helper function to brand the zone with a unique ID.
1725 1725   */
1726 1726  static void
1727 1727  zone_uniqid(zone_t *zone)
1728 1728  {
1729 1729          static uint64_t uniqid = 0;
1730 1730  
1731 1731          ASSERT(MUTEX_HELD(&zonehash_lock));
1732 1732          zone->zone_uniqid = uniqid++;
1733 1733  }
1734 1734  
1735 1735  /*
1736 1736   * Returns a held pointer to the "kcred" for the specified zone.
1737 1737   */
1738 1738  struct cred *
1739 1739  zone_get_kcred(zoneid_t zoneid)
1740 1740  {
1741 1741          zone_t *zone;
1742 1742          cred_t *cr;
1743 1743  
1744 1744          if ((zone = zone_find_by_id(zoneid)) == NULL)
1745 1745                  return (NULL);
1746 1746          cr = zone->zone_kcred;
1747 1747          crhold(cr);
1748 1748          zone_rele(zone);
1749 1749          return (cr);
1750 1750  }
1751 1751  
1752 1752  static int
1753 1753  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1754 1754  {
1755 1755          zone_t *zone = ksp->ks_private;
1756 1756          zone_kstat_t *zk = ksp->ks_data;
1757 1757  
1758 1758          if (rw == KSTAT_WRITE)
1759 1759                  return (EACCES);
1760 1760  
1761 1761          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1762 1762          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1763 1763          return (0);
1764 1764  }
1765 1765  
1766 1766  static int
1767 1767  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1768 1768  {
1769 1769          zone_t *zone = ksp->ks_private;
1770 1770          zone_kstat_t *zk = ksp->ks_data;
1771 1771  
1772 1772          if (rw == KSTAT_WRITE)
1773 1773                  return (EACCES);
1774 1774  
1775 1775          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1776 1776          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1777 1777          return (0);
1778 1778  }
1779 1779  
1780 1780  static int
1781 1781  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1782 1782  {
1783 1783          zone_t *zone = ksp->ks_private;
1784 1784          zone_kstat_t *zk = ksp->ks_data;
1785 1785  
1786 1786          if (rw == KSTAT_WRITE)
1787 1787                  return (EACCES);
1788 1788  
1789 1789          zk->zk_usage.value.ui64 = zone->zone_max_swap;
1790 1790          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1791 1791          return (0);
1792 1792  }
1793 1793  
1794 1794  static kstat_t *
1795 1795  zone_kstat_create_common(zone_t *zone, char *name,
1796 1796      int (*updatefunc) (kstat_t *, int))
1797 1797  {
1798 1798          kstat_t *ksp;
1799 1799          zone_kstat_t *zk;
1800 1800  
1801 1801          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1802 1802              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1803 1803              KSTAT_FLAG_VIRTUAL);
1804 1804  
1805 1805          if (ksp == NULL)
1806 1806                  return (NULL);
1807 1807  
1808 1808          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1809 1809          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1810 1810          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1811 1811          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1812 1812          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1813 1813          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1814 1814          ksp->ks_update = updatefunc;
1815 1815          ksp->ks_private = zone;
1816 1816          kstat_install(ksp);
1817 1817          return (ksp);
1818 1818  }
1819 1819  
1820 1820  static int
1821 1821  zone_misc_kstat_update(kstat_t *ksp, int rw)
1822 1822  {
1823 1823          zone_t *zone = ksp->ks_private;
1824 1824          zone_misc_kstat_t *zmp = ksp->ks_data;
1825 1825          hrtime_t tmp;
1826 1826  
1827 1827          if (rw == KSTAT_WRITE)
1828 1828                  return (EACCES);
1829 1829  
1830 1830          tmp = zone->zone_utime;
1831 1831          scalehrtime(&tmp);
1832 1832          zmp->zm_utime.value.ui64 = tmp;
1833 1833          tmp = zone->zone_stime;
1834 1834          scalehrtime(&tmp);
1835 1835          zmp->zm_stime.value.ui64 = tmp;
1836 1836          tmp = zone->zone_wtime;
1837 1837          scalehrtime(&tmp);
1838 1838          zmp->zm_wtime.value.ui64 = tmp;
1839 1839  
1840 1840          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1841 1841          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1842 1842          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1843 1843  
1844 1844          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1845 1845          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1846 1846          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1847 1847          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1848 1848  
1849 1849          return (0);
1850 1850  }
1851 1851  
1852 1852  static kstat_t *
1853 1853  zone_misc_kstat_create(zone_t *zone)
1854 1854  {
1855 1855          kstat_t *ksp;
1856 1856          zone_misc_kstat_t *zmp;
1857 1857  
1858 1858          if ((ksp = kstat_create_zone("zones", zone->zone_id,
1859 1859              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1860 1860              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1861 1861              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1862 1862                  return (NULL);
1863 1863  
1864 1864          if (zone->zone_id != GLOBAL_ZONEID)
1865 1865                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1866 1866  
1867 1867          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1868 1868          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1869 1869          ksp->ks_lock = &zone->zone_misc_lock;
1870 1870          zone->zone_misc_stats = zmp;
1871 1871  
1872 1872          /* The kstat "name" field is not large enough for a full zonename */
1873 1873          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1874 1874          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1875 1875          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1876 1876          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1877 1877          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1878 1878          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1879 1879          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1880 1880          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1881 1881              KSTAT_DATA_UINT32);
1882 1882          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1883 1883          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1884 1884              KSTAT_DATA_UINT32);
1885 1885          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1886 1886          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1887 1887  
1888 1888  
1889 1889          ksp->ks_update = zone_misc_kstat_update;
1890 1890          ksp->ks_private = zone;
1891 1891  
1892 1892          kstat_install(ksp);
1893 1893          return (ksp);
1894 1894  }
1895 1895  
1896 1896  static void
1897 1897  zone_kstat_create(zone_t *zone)
1898 1898  {
1899 1899          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1900 1900              "lockedmem", zone_lockedmem_kstat_update);
1901 1901          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1902 1902              "swapresv", zone_swapresv_kstat_update);
1903 1903          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
1904 1904              "nprocs", zone_nprocs_kstat_update);
1905 1905  
1906 1906          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
1907 1907                  zone->zone_misc_stats = kmem_zalloc(
1908 1908                      sizeof (zone_misc_kstat_t), KM_SLEEP);
1909 1909          }
1910 1910  }
1911 1911  
1912 1912  static void
1913 1913  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
1914 1914  {
1915 1915          void *data;
1916 1916  
1917 1917          if (*pkstat != NULL) {
1918 1918                  data = (*pkstat)->ks_data;
1919 1919                  kstat_delete(*pkstat);
1920 1920                  kmem_free(data, datasz);
1921 1921                  *pkstat = NULL;
1922 1922          }
1923 1923  }
1924 1924  
1925 1925  static void
1926 1926  zone_kstat_delete(zone_t *zone)
1927 1927  {
1928 1928          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
1929 1929              sizeof (zone_kstat_t));
1930 1930          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
1931 1931              sizeof (zone_kstat_t));
1932 1932          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
1933 1933              sizeof (zone_kstat_t));
1934 1934          zone_kstat_delete_common(&zone->zone_misc_ksp,
1935 1935              sizeof (zone_misc_kstat_t));
1936 1936  }
1937 1937  
1938 1938  /*
1939 1939   * Called very early on in boot to initialize the ZSD list so that
1940 1940   * zone_key_create() can be called before zone_init().  It also initializes
1941 1941   * portions of zone0 which may be used before zone_init() is called.  The
1942 1942   * variable "global_zone" will be set when zone0 is fully initialized by
1943 1943   * zone_init().
1944 1944   */
1945 1945  void
1946 1946  zone_zsd_init(void)
1947 1947  {
1948 1948          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1949 1949          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1950 1950          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1951 1951              offsetof(struct zsd_entry, zsd_linkage));
1952 1952          list_create(&zone_active, sizeof (zone_t),
1953 1953              offsetof(zone_t, zone_linkage));
1954 1954          list_create(&zone_deathrow, sizeof (zone_t),
1955 1955              offsetof(zone_t, zone_linkage));
1956 1956  
1957 1957          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1958 1958          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1959 1959          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1960 1960          zone0.zone_shares = 1;
1961 1961          zone0.zone_nlwps = 0;
1962 1962          zone0.zone_nlwps_ctl = INT_MAX;
1963 1963          zone0.zone_nprocs = 0;
1964 1964          zone0.zone_nprocs_ctl = INT_MAX;
1965 1965          zone0.zone_locked_mem = 0;
1966 1966          zone0.zone_locked_mem_ctl = UINT64_MAX;
1967 1967          ASSERT(zone0.zone_max_swap == 0);
1968 1968          zone0.zone_max_swap_ctl = UINT64_MAX;
1969 1969          zone0.zone_max_lofi = 0;
1970 1970          zone0.zone_max_lofi_ctl = UINT64_MAX;
1971 1971          zone0.zone_shmmax = 0;
1972 1972          zone0.zone_ipc.ipcq_shmmni = 0;
1973 1973          zone0.zone_ipc.ipcq_semmni = 0;
1974 1974          zone0.zone_ipc.ipcq_msgmni = 0;
1975 1975          zone0.zone_name = GLOBAL_ZONENAME;
1976 1976          zone0.zone_nodename = utsname.nodename;
1977 1977          zone0.zone_domain = srpc_domain;
1978 1978          zone0.zone_hostid = HW_INVALID_HOSTID;
1979 1979          zone0.zone_fs_allowed = NULL;
1980 1980          zone0.zone_ref = 1;
1981 1981          zone0.zone_id = GLOBAL_ZONEID;
1982 1982          zone0.zone_status = ZONE_IS_RUNNING;
1983 1983          zone0.zone_rootpath = "/";
1984 1984          zone0.zone_rootpathlen = 2;
1985 1985          zone0.zone_psetid = ZONE_PS_INVAL;
1986 1986          zone0.zone_ncpus = 0;
1987 1987          zone0.zone_ncpus_online = 0;
1988 1988          zone0.zone_proc_initpid = 1;
1989 1989          zone0.zone_initname = initname;
1990 1990          zone0.zone_lockedmem_kstat = NULL;
1991 1991          zone0.zone_swapresv_kstat = NULL;
1992 1992          zone0.zone_nprocs_kstat = NULL;
1993 1993  
1994 1994          zone0.zone_stime = 0;
1995 1995          zone0.zone_utime = 0;
1996 1996          zone0.zone_wtime = 0;
1997 1997  
1998 1998          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
1999 1999              offsetof(zone_ref_t, zref_linkage));
2000 2000          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2001 2001              offsetof(struct zsd_entry, zsd_linkage));
2002 2002          list_insert_head(&zone_active, &zone0);
2003 2003  
2004 2004          /*
2005 2005           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2006 2006           * to anything meaningful.  It is assigned to be 'rootdir' in
2007 2007           * vfs_mountroot().
2008 2008           */
2009 2009          zone0.zone_rootvp = NULL;
2010 2010          zone0.zone_vfslist = NULL;
2011 2011          zone0.zone_bootargs = initargs;
2012 2012          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2013 2013          /*
2014 2014           * The global zone has all privileges
2015 2015           */
2016 2016          priv_fillset(zone0.zone_privset);
2017 2017          /*
2018 2018           * Add p0 to the global zone
2019 2019           */
2020 2020          zone0.zone_zsched = &p0;
2021 2021          p0.p_zone = &zone0;
2022 2022  }
2023 2023  
2024 2024  /*
2025 2025   * Compute a hash value based on the contents of the label and the DOI.  The
2026 2026   * hash algorithm is somewhat arbitrary, but is based on the observation that
2027 2027   * humans will likely pick labels that differ by amounts that work out to be
2028 2028   * multiples of the number of hash chains, and thus stirring in some primes
2029 2029   * should help.
2030 2030   */
2031 2031  static uint_t
2032 2032  hash_bylabel(void *hdata, mod_hash_key_t key)
2033 2033  {
2034 2034          const ts_label_t *lab = (ts_label_t *)key;
2035 2035          const uint32_t *up, *ue;
2036 2036          uint_t hash;
2037 2037          int i;
2038 2038  
2039 2039          _NOTE(ARGUNUSED(hdata));
2040 2040  
2041 2041          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2042 2042          /* we depend on alignment of label, but not representation */
2043 2043          up = (const uint32_t *)&lab->tsl_label;
2044 2044          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2045 2045          i = 1;
2046 2046          while (up < ue) {
2047 2047                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2048 2048                  hash += *up + (*up << ((i % 16) + 1));
2049 2049                  up++;
2050 2050                  i++;
2051 2051          }
2052 2052          return (hash);
2053 2053  }
2054 2054  
2055 2055  /*
2056 2056   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2057 2057   * equal).  This may need to be changed if less than / greater than is ever
2058 2058   * needed.
2059 2059   */
2060 2060  static int
2061 2061  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2062 2062  {
2063 2063          ts_label_t *lab1 = (ts_label_t *)key1;
2064 2064          ts_label_t *lab2 = (ts_label_t *)key2;
2065 2065  
2066 2066          return (label_equal(lab1, lab2) ? 0 : 1);
2067 2067  }
2068 2068  
2069 2069  /*
2070 2070   * Called by main() to initialize the zones framework.
2071 2071   */
2072 2072  void
2073 2073  zone_init(void)
2074 2074  {
2075 2075          rctl_dict_entry_t *rde;
2076 2076          rctl_val_t *dval;
2077 2077          rctl_set_t *set;
2078 2078          rctl_alloc_gp_t *gp;
2079 2079          rctl_entity_p_t e;
2080 2080          int res;
2081 2081  
2082 2082          ASSERT(curproc == &p0);
2083 2083  
2084 2084          /*
2085 2085           * Create ID space for zone IDs.  ID 0 is reserved for the
2086 2086           * global zone.
2087 2087           */
2088 2088          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2089 2089  
2090 2090          /*
2091 2091           * Initialize generic zone resource controls, if any.
2092 2092           */
2093 2093          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2094 2094              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2095 2095              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2096 2096              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2097 2097  
2098 2098          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2099 2099              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2100 2100              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2101 2101              RCTL_GLOBAL_INFINITE,
2102 2102              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2103 2103  
2104 2104          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2105 2105              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2106 2106              INT_MAX, INT_MAX, &zone_lwps_ops);
2107 2107  
2108 2108          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2109 2109              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2110 2110              INT_MAX, INT_MAX, &zone_procs_ops);
2111 2111  
2112 2112          /*
2113 2113           * System V IPC resource controls
2114 2114           */
2115 2115          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2116 2116              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2117 2117              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2118 2118  
2119 2119          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2120 2120              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2121 2121              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2122 2122  
2123 2123          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2124 2124              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2125 2125              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2126 2126  
2127 2127          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2128 2128              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2129 2129              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2130 2130  
2131 2131          /*
2132 2132           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2133 2133           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2134 2134           */
2135 2135          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2136 2136          bzero(dval, sizeof (rctl_val_t));
2137 2137          dval->rcv_value = 1;
2138 2138          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2139 2139          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2140 2140          dval->rcv_action_recip_pid = -1;
2141 2141  
2142 2142          rde = rctl_dict_lookup("zone.cpu-shares");
2143 2143          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2144 2144  
2145 2145          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2146 2146              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2147 2147              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2148 2148              &zone_locked_mem_ops);
2149 2149  
2150 2150          rc_zone_max_swap = rctl_register("zone.max-swap",
2151 2151              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2152 2152              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2153 2153              &zone_max_swap_ops);
2154 2154  
2155 2155          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2156 2156              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2157 2157              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2158 2158              &zone_max_lofi_ops);
2159 2159  
2160 2160          /*
2161 2161           * Initialize the ``global zone''.
2162 2162           */
2163 2163          set = rctl_set_create();
2164 2164          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2165 2165          mutex_enter(&p0.p_lock);
2166 2166          e.rcep_p.zone = &zone0;
2167 2167          e.rcep_t = RCENTITY_ZONE;
2168 2168          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2169 2169              gp);
2170 2170  
2171 2171          zone0.zone_nlwps = p0.p_lwpcnt;
2172 2172          zone0.zone_nprocs = 1;
2173 2173          zone0.zone_ntasks = 1;
2174 2174          mutex_exit(&p0.p_lock);
2175 2175          zone0.zone_restart_init = B_TRUE;
2176 2176          zone0.zone_brand = &native_brand;
2177 2177          rctl_prealloc_destroy(gp);
2178 2178          /*
2179 2179           * pool_default hasn't been initialized yet, so we let pool_init()
2180 2180           * take care of making sure the global zone is in the default pool.
2181 2181           */
2182 2182  
2183 2183          /*
2184 2184           * Initialize global zone kstats
2185 2185           */
2186 2186          zone_kstat_create(&zone0);
2187 2187  
2188 2188          /*
2189 2189           * Initialize zone label.
2190 2190           * mlp are initialized when tnzonecfg is loaded.
2191 2191           */
2192 2192          zone0.zone_slabel = l_admin_low;
2193 2193          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2194 2194          label_hold(l_admin_low);
2195 2195  
2196 2196          /*
2197 2197           * Initialise the lock for the database structure used by mntfs.
2198 2198           */
2199 2199          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2200 2200  
2201 2201          mutex_enter(&zonehash_lock);
2202 2202          zone_uniqid(&zone0);
2203 2203          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2204 2204  
2205 2205          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2206 2206              mod_hash_null_valdtor);
2207 2207          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2208 2208              zone_hash_size, mod_hash_null_valdtor);
2209 2209          /*
2210 2210           * maintain zonehashbylabel only for labeled systems
2211 2211           */
2212 2212          if (is_system_labeled())
2213 2213                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2214 2214                      zone_hash_size, mod_hash_null_keydtor,
2215 2215                      mod_hash_null_valdtor, hash_bylabel, NULL,
2216 2216                      hash_labelkey_cmp, KM_SLEEP);
2217 2217          zonecount = 1;
2218 2218  
2219 2219          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2220 2220              (mod_hash_val_t)&zone0);
2221 2221          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2222 2222              (mod_hash_val_t)&zone0);
2223 2223          if (is_system_labeled()) {
2224 2224                  zone0.zone_flags |= ZF_HASHED_LABEL;
2225 2225                  (void) mod_hash_insert(zonehashbylabel,
2226 2226                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2227 2227          }
2228 2228          mutex_exit(&zonehash_lock);
2229 2229  
2230 2230          /*
2231 2231           * We avoid setting zone_kcred until now, since kcred is initialized
2232 2232           * sometime after zone_zsd_init() and before zone_init().
2233 2233           */
2234 2234          zone0.zone_kcred = kcred;
2235 2235          /*
2236 2236           * The global zone is fully initialized (except for zone_rootvp which
2237 2237           * will be set when the root filesystem is mounted).
2238 2238           */
2239 2239          global_zone = &zone0;
2240 2240  
2241 2241          /*
2242 2242           * Setup an event channel to send zone status change notifications on
2243 2243           */
2244 2244          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2245 2245              EVCH_CREAT);
2246 2246  
2247 2247          if (res)
2248 2248                  panic("Sysevent_evc_bind failed during zone setup.\n");
2249 2249  
2250 2250  }
2251 2251  
2252 2252  static void
2253 2253  zone_free(zone_t *zone)
2254 2254  {
                   /*
                    * Release everything still attached to a non-global zone and
                    * then free the zone_t itself.  The ASSERTs below spell out the
                    * state expected on entry: no tasks, LWPs, processes or cred
                    * references remain, zone_kcred is NULL, the tracked reference
                    * list is empty, and the zone is either DEAD or was never
                    * initialized.
                    */
2255 2255          ASSERT(zone != global_zone);
2256 2256          ASSERT(zone->zone_ntasks == 0);
2257 2257          ASSERT(zone->zone_nlwps == 0);
2258 2258          ASSERT(zone->zone_nprocs == 0);
2259 2259          ASSERT(zone->zone_cred_ref == 0);
2260 2260          ASSERT(zone->zone_kcred == NULL);
2261 2261          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2262 2262              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2263 2263          ASSERT(list_is_empty(&zone->zone_ref_list));
2264 2264  
2265 2265          /*
2266 2266           * Remove any zone caps.
2267 2267           */
2268 2268          cpucaps_zone_remove(zone);
2269 2269  
2270 2270          ASSERT(zone->zone_cpucap == NULL);
2271 2271  
2272 2272          /* remove from deathrow list */
2273 2273          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2274 2274                  ASSERT(zone->zone_ref == 0);
2275 2275                  mutex_enter(&zone_deathrow_lock);
2276 2276                  list_remove(&zone_deathrow, zone);
2277 2277                  mutex_exit(&zone_deathrow_lock);
2278 2278          }
2279 2279  
2280 2280          list_destroy(&zone->zone_ref_list);
2281 2281          zone_free_zsd(zone);
2282 2282          zone_free_datasets(zone);
2283 2283          list_destroy(&zone->zone_dl_list);
2284 2284  
                   /*
                    * Optional members: each one is released only if it was set.
                    */
2285 2285          if (zone->zone_rootvp != NULL)
2286 2286                  VN_RELE(zone->zone_rootvp);
2287 2287          if (zone->zone_rootpath)
2288 2288                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2289 2289          if (zone->zone_name != NULL)
2290 2290                  kmem_free(zone->zone_name, ZONENAME_MAX);
2291 2291          if (zone->zone_slabel != NULL)
2292 2292                  label_rele(zone->zone_slabel);
2293 2293          if (zone->zone_nodename != NULL)
2294 2294                  kmem_free(zone->zone_nodename, _SYS_NMLN);
2295 2295          if (zone->zone_domain != NULL)
2296 2296                  kmem_free(zone->zone_domain, _SYS_NMLN);
2297 2297          if (zone->zone_privset != NULL)
2298 2298                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2299 2299          if (zone->zone_rctls != NULL)
2300 2300                  rctl_set_free(zone->zone_rctls);
2301 2301          if (zone->zone_bootargs != NULL)
2302 2302                  strfree(zone->zone_bootargs);
2303 2303          if (zone->zone_initname != NULL)
2304 2304                  strfree(zone->zone_initname);
2305 2305          if (zone->zone_fs_allowed != NULL)
2306 2306                  strfree(zone->zone_fs_allowed);
2307 2307          if (zone->zone_pfexecd != NULL)
2308 2308                  klpd_freelist(&zone->zone_pfexecd);
                   /*
                    * Return the zone ID to zoneid_space, destroy the zone's
                    * synchronization objects, and free the structure last.
                    */
2309 2309          id_free(zoneid_space, zone->zone_id);
2310 2310          mutex_destroy(&zone->zone_lock);
2311 2311          cv_destroy(&zone->zone_cv);
2312 2312          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2313 2313          rw_destroy(&zone->zone_mntfs_db_lock);
2314 2314          kmem_free(zone, sizeof (zone_t));
2315 2315  }
2316 2316  
2317 2317  /*
2318 2318   * See block comment at the top of this file for information about zone
2319 2319   * status values.
2320 2320   */
2321 2321  /*
2322 2322   * Convenience function for setting zone status.
2323 2323   */
2324 2324  static void
2325 2325  zone_status_set(zone_t *zone, zone_status_t status)
2326 2326  {
2327 2327  
2328 2328          nvlist_t *nvl = NULL;
                   /*
                    * The caller must hold zone_status_lock, and the new status
                    * must be a valid state that does not precede the zone's
                    * current one.
                    */
2329 2329          ASSERT(MUTEX_HELD(&zone_status_lock));
2330 2330          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2331 2331              status >= zone_status_get(zone));
2332 2332  
                   /*
                    * Build an nvlist describing the transition (zone name, new
                    * and old state names, zone ID, timestamp) and publish it on
                    * zone_event_chan.  The || chain stops at the first call that
                    * returns non-zero.  A failure is only reported on DEBUG
                    * kernels and does not prevent the status change below.
                    */
2333 2333          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2334 2334              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2335 2335              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2336 2336              zone_status_table[status]) ||
2337 2337              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2338 2338              zone_status_table[zone->zone_status]) ||
2339 2339              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2340 2340              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2341 2341              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2342 2342              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2343 2343  #ifdef DEBUG
2344 2344                  (void) printf(
2345 2345                      "Failed to allocate and send zone state change event.\n");
2346 2346  #endif
2347 2347          }
2348 2348          nvlist_free(nvl);
2349 2349  
2350 2350          zone->zone_status = status;
2351 2351  
                   /*
                    * Wake every thread sleeping on zone_cv, e.g. callers of the
                    * zone_status_wait*() routines.
                    */
2352 2352          cv_broadcast(&zone->zone_cv);
2353 2353  }
2354 2354  
2355 2355  /*
2356 2356   * Public function to retrieve the zone status.  The zone status may
2357 2357   * change after it is retrieved.
2358 2358   */
2359 2359  zone_status_t
2360 2360  zone_status_get(zone_t *zone)
2361 2361  {
2362 2362          return (zone->zone_status);
2363 2363  }
2364 2364  
2365 2365  static int
2366 2366  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2367 2367  {
2368 2368          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2369 2369          int err = 0;
2370 2370  
2371 2371          ASSERT(zone != global_zone);
2372 2372          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2373 2373                  goto done;      /* EFAULT or ENAMETOOLONG */
2374 2374  
2375 2375          if (zone->zone_bootargs != NULL)
2376 2376                  strfree(zone->zone_bootargs);
2377 2377  
2378 2378          zone->zone_bootargs = strdup(buf);
2379 2379  
2380 2380  done:
2381 2381          kmem_free(buf, BOOTARGS_MAX);
2382 2382          return (err);
2383 2383  }
2384 2384  
2385 2385  static int
2386 2386  zone_set_brand(zone_t *zone, const char *brand)
2387 2387  {
2388 2388          struct brand_attr *attrp;
2389 2389          brand_t *bp;
2390 2390  
2391 2391          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2392 2392          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2393 2393                  kmem_free(attrp, sizeof (struct brand_attr));
2394 2394                  return (EFAULT);
2395 2395          }
2396 2396  
2397 2397          bp = brand_register_zone(attrp);
2398 2398          kmem_free(attrp, sizeof (struct brand_attr));
2399 2399          if (bp == NULL)
2400 2400                  return (EINVAL);
2401 2401  
2402 2402          /*
2403 2403           * This is the only place where a zone can change it's brand.
2404 2404           * We already need to hold zone_status_lock to check the zone
2405 2405           * status, so we'll just use that lock to serialize zone
2406 2406           * branding requests as well.
2407 2407           */
2408 2408          mutex_enter(&zone_status_lock);
2409 2409  
2410 2410          /* Re-Branding is not allowed and the zone can't be booted yet */
2411 2411          if ((ZONE_IS_BRANDED(zone)) ||
2412 2412              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2413 2413                  mutex_exit(&zone_status_lock);
2414 2414                  brand_unregister_zone(bp);
2415 2415                  return (EINVAL);
2416 2416          }
2417 2417  
2418 2418          /* set up the brand specific data */
2419 2419          zone->zone_brand = bp;
2420 2420          ZBROP(zone)->b_init_brand_data(zone);
2421 2421  
2422 2422          mutex_exit(&zone_status_lock);
2423 2423          return (0);
2424 2424  }
2425 2425  
2426 2426  static int
2427 2427  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2428 2428  {
2429 2429          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2430 2430          int err = 0;
2431 2431  
2432 2432          ASSERT(zone != global_zone);
2433 2433          if ((err = copyinstr(zone_fs_allowed, buf,
2434 2434              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2435 2435                  goto done;
2436 2436  
2437 2437          if (zone->zone_fs_allowed != NULL)
2438 2438                  strfree(zone->zone_fs_allowed);
2439 2439  
2440 2440          zone->zone_fs_allowed = strdup(buf);
2441 2441  
2442 2442  done:
2443 2443          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2444 2444          return (err);
2445 2445  }
2446 2446  
2447 2447  static int
2448 2448  zone_set_initname(zone_t *zone, const char *zone_initname)
2449 2449  {
2450 2450          char initname[INITNAME_SZ];
2451 2451          size_t len;
2452 2452          int err = 0;
2453 2453  
2454 2454          ASSERT(zone != global_zone);
2455 2455          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2456 2456                  return (err);   /* EFAULT or ENAMETOOLONG */
2457 2457  
2458 2458          if (zone->zone_initname != NULL)
2459 2459                  strfree(zone->zone_initname);
2460 2460  
2461 2461          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2462 2462          (void) strcpy(zone->zone_initname, initname);
2463 2463          return (0);
2464 2464  }
2465 2465  
2466 2466  static int
2467 2467  zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2468 2468  {
2469 2469          uint64_t mcap;
2470 2470          int err = 0;
2471 2471  
2472 2472          if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2473 2473                  zone->zone_phys_mcap = mcap;
2474 2474  
2475 2475          return (err);
2476 2476  }
2477 2477  
2478 2478  static int
2479 2479  zone_set_sched_class(zone_t *zone, const char *new_class)
2480 2480  {
2481 2481          char sched_class[PC_CLNMSZ];
2482 2482          id_t classid;
2483 2483          int err;
2484 2484  
2485 2485          ASSERT(zone != global_zone);
2486 2486          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2487 2487                  return (err);   /* EFAULT or ENAMETOOLONG */
2488 2488  
2489 2489          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2490 2490                  return (set_errno(EINVAL));
2491 2491          zone->zone_defaultcid = classid;
2492 2492          ASSERT(zone->zone_defaultcid > 0 &&
2493 2493              zone->zone_defaultcid < loaded_classes);
2494 2494  
2495 2495          return (0);
2496 2496  }
2497 2497  
2498 2498  /*
2499 2499   * Block indefinitely waiting for (zone_status >= status)
2500 2500   */
2501 2501  void
2502 2502  zone_status_wait(zone_t *zone, zone_status_t status)
2503 2503  {
2504 2504          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2505 2505  
2506 2506          mutex_enter(&zone_status_lock);
2507 2507          while (zone->zone_status < status) {
2508 2508                  cv_wait(&zone->zone_cv, &zone_status_lock);
2509 2509          }
2510 2510          mutex_exit(&zone_status_lock);
2511 2511  }
2512 2512  
2513 2513  /*
2514 2514   * Private CPR-safe version of zone_status_wait().
2515 2515   */
2516 2516  static void
2517 2517  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2518 2518  {
2519 2519          callb_cpr_t cprinfo;
2520 2520  
2521 2521          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2522 2522  
2523 2523          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2524 2524              str);
2525 2525          mutex_enter(&zone_status_lock);
2526 2526          while (zone->zone_status < status) {
2527 2527                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2528 2528                  cv_wait(&zone->zone_cv, &zone_status_lock);
2529 2529                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2530 2530          }
2531 2531          /*
2532 2532           * zone_status_lock is implicitly released by the following.
2533 2533           */
2534 2534          CALLB_CPR_EXIT(&cprinfo);
2535 2535  }
2536 2536  
2537 2537  /*
2538 2538   * Block until zone enters requested state or signal is received.  Return (0)
2539 2539   * if signaled, non-zero otherwise.
2540 2540   */
2541 2541  int
2542 2542  zone_status_wait_sig(zone_t *zone, zone_status_t status)
2543 2543  {
2544 2544          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2545 2545  
2546 2546          mutex_enter(&zone_status_lock);
2547 2547          while (zone->zone_status < status) {
2548 2548                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2549 2549                          mutex_exit(&zone_status_lock);
2550 2550                          return (0);
2551 2551                  }
2552 2552          }
2553 2553          mutex_exit(&zone_status_lock);
2554 2554          return (1);
2555 2555  }
2556 2556  
2557 2557  /*
2558 2558   * Block until the zone enters the requested state or the timeout expires,
2559 2559   * whichever happens first.  Return (-1) if operation timed out, time remaining
2560 2560   * otherwise.
2561 2561   */
2562 2562  clock_t
2563 2563  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2564 2564  {
2565 2565          clock_t timeleft = 0;
2566 2566  
2567 2567          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2568 2568  
2569 2569          mutex_enter(&zone_status_lock);
2570 2570          while (zone->zone_status < status && timeleft != -1) {
2571 2571                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2572 2572          }
2573 2573          mutex_exit(&zone_status_lock);
2574 2574          return (timeleft);
2575 2575  }
2576 2576  
2577 2577  /*
2578 2578   * Block until the zone enters the requested state, the current process is
2579 2579   * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2580 2580   * operation timed out, 0 if signaled, time remaining otherwise.
2581 2581   */
2582 2582  clock_t
2583 2583  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2584 2584  {
2585 2585          clock_t timeleft = tim - ddi_get_lbolt();
2586 2586  
2587 2587          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2588 2588  
2589 2589          mutex_enter(&zone_status_lock);
2590 2590          while (zone->zone_status < status) {
2591 2591                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2592 2592                      tim);
2593 2593                  if (timeleft <= 0)
2594 2594                          break;
2595 2595          }
2596 2596          mutex_exit(&zone_status_lock);
2597 2597          return (timeleft);
2598 2598  }
2599 2599  
2600 2600  /*
2601 2601   * Zones have two reference counts: one for references from credential
2602 2602   * structures (zone_cred_ref), and one (zone_ref) for everything else.
2603 2603   * This is so we can allow a zone to be rebooted while there are still
2604 2604   * outstanding cred references, since certain drivers cache dblks (which
2605 2605   * implicitly results in cached creds).  We wait for zone_ref to drop to
2606 2606   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2607 2607   * later freed when the zone_cred_ref drops to 0, though nothing other
2608 2608   * than the zone id and privilege set should be accessed once the zone
2609 2609   * is "dead".
2610 2610   *
2611 2611   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2612 2612   * to force halt/reboot to block waiting for the zone_cred_ref to drop
2613 2613   * to 0.  This can be useful to flush out other sources of cached creds
2614 2614   * that may be less innocuous than the driver case.
2615 2615   *
2616 2616   * Zones also provide a tracked reference counting mechanism in which zone
2617 2617   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2618 2618   * debuggers determine the sources of leaked zone references.  See
2619 2619   * zone_hold_ref() and zone_rele_ref() below for more information.
2620 2620   */
2621 2621  
2622 2622  int zone_wait_for_cred = 0;     /* debug flag: non-zero makes halt/reboot wait for zone_cred_ref to reach 0 */
2623 2623  
2624 2624  static void
2625 2625  zone_hold_locked(zone_t *z)
2626 2626  {
2627 2627          ASSERT(MUTEX_HELD(&z->zone_lock));
2628 2628          z->zone_ref++;
2629 2629          ASSERT(z->zone_ref != 0);
2630 2630  }
2631 2631  
2632 2632  /*
2633 2633   * Increment the specified zone's reference count.  The zone's zone_t structure
2634 2634   * will not be freed as long as the zone's reference count is nonzero.
2635 2635   * Decrement the zone's reference count via zone_rele().
2636 2636   *
2637 2637   * NOTE: This function should only be used to hold zones for short periods of
2638 2638   * time.  Use zone_hold_ref() if the zone must be held for a long time.
2639 2639   */
2640 2640  void
2641 2641  zone_hold(zone_t *z)
2642 2642  {
2643 2643          mutex_enter(&z->zone_lock);
2644 2644          zone_hold_locked(z);
2645 2645          mutex_exit(&z->zone_lock);
2646 2646  }
2647 2647  
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * dropped to 1 and, if zone_wait_for_cred is set, its cred reference count
 * has dropped to 0 as well.
 */
#define ZONE_IS_UNREF(zone) \
        ((zone)->zone_ref == 1 && \
            ((zone)->zone_cred_ref == 0 || !zone_wait_for_cred))
2655 2655  
2656 2656  /*
2657 2657   * Common zone reference release function invoked by zone_rele() and
2658 2658   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2659 2659   * zone's subsystem-specific reference counters are not affected by the
2660 2660   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2661 2661   * removed from the specified zone's reference list.  ref must be non-NULL iff
2662 2662   * subsys is not ZONE_REF_NUM_SUBSYS.
2663 2663   */
2664 2664  static void
2665 2665  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2666 2666  {
2667 2667          boolean_t wakeup;
2668 2668  
2669 2669          mutex_enter(&z->zone_lock);
2670 2670          ASSERT(z->zone_ref != 0);
2671 2671          z->zone_ref--;
2672 2672          if (subsys != ZONE_REF_NUM_SUBSYS) {
2673 2673                  ASSERT(z->zone_subsys_ref[subsys] != 0);
2674 2674                  z->zone_subsys_ref[subsys]--;
2675 2675                  list_remove(&z->zone_ref_list, ref);
2676 2676          }
2677 2677          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2678 2678                  /* no more refs, free the structure */
2679 2679                  mutex_exit(&z->zone_lock);
2680 2680                  zone_free(z);
2681 2681                  return;
2682 2682          }
2683 2683          /* signal zone_destroy so the zone can finish halting */
2684 2684          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2685 2685          mutex_exit(&z->zone_lock);
2686 2686  
2687 2687          if (wakeup) {
2688 2688                  /*
2689 2689                   * Grabbing zonehash_lock here effectively synchronizes with
2690 2690                   * zone_destroy() to avoid missed signals.
2691 2691                   */
2692 2692                  mutex_enter(&zonehash_lock);
2693 2693                  cv_broadcast(&zone_destroy_cv);
2694 2694                  mutex_exit(&zonehash_lock);
2695 2695          }
2696 2696  }
2697 2697  
2698 2698  /*
2699 2699   * Decrement the specified zone's reference count.  The specified zone will
2700 2700   * cease to exist after this function returns if the reference count drops to
2701 2701   * zero.  This function should be paired with zone_hold().
2702 2702   */
2703 2703  void
2704 2704  zone_rele(zone_t *z)
2705 2705  {
2706 2706          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2707 2707  }
2708 2708  
2709 2709  /*
2710 2710   * Initialize a zone reference structure.  This function must be invoked for
2711 2711   * a reference structure before the structure is passed to zone_hold_ref().
2712 2712   */
2713 2713  void
2714 2714  zone_init_ref(zone_ref_t *ref)
2715 2715  {
2716 2716          ref->zref_zone = NULL;
2717 2717          list_link_init(&ref->zref_linkage);
2718 2718  }
2719 2719  
2720 2720  /*
2721 2721   * Acquire a reference to zone z.  The caller must specify the
2722 2722   * zone_ref_subsys_t constant associated with its subsystem.  The specified
2723 2723   * zone_ref_t structure will represent a reference to the specified zone.  Use
2724 2724   * zone_rele_ref() to release the reference.
2725 2725   *
2726 2726   * The referenced zone_t structure will not be freed as long as the zone_t's
2727 2727   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2728 2728   * references.
2729 2729   *
2730 2730   * NOTE: The zone_ref_t structure must be initialized before it is used.
2731 2731   * See zone_init_ref() above.
2732 2732   */
2733 2733  void
2734 2734  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2735 2735  {
2736 2736          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2737 2737  
2738 2738          /*
2739 2739           * Prevent consumers from reusing a reference structure before
2740 2740           * releasing it.
2741 2741           */
2742 2742          VERIFY(ref->zref_zone == NULL);
2743 2743  
2744 2744          ref->zref_zone = z;
2745 2745          mutex_enter(&z->zone_lock);
2746 2746          zone_hold_locked(z);
2747 2747          z->zone_subsys_ref[subsys]++;
2748 2748          ASSERT(z->zone_subsys_ref[subsys] != 0);
2749 2749          list_insert_head(&z->zone_ref_list, ref);
2750 2750          mutex_exit(&z->zone_lock);
2751 2751  }
2752 2752  
2753 2753  /*
2754 2754   * Release the zone reference represented by the specified zone_ref_t.
2755 2755   * The reference is invalid after it's released; however, the zone_ref_t
2756 2756   * structure can be reused without having to invoke zone_init_ref().
2757 2757   * subsys should be the same value that was passed to zone_hold_ref()
2758 2758   * when the reference was acquired.
2759 2759   */
2760 2760  void
2761 2761  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2762 2762  {
2763 2763          zone_rele_common(ref->zref_zone, ref, subsys);
2764 2764  
2765 2765          /*
2766 2766           * Set the zone_ref_t's zref_zone field to NULL to generate panics
2767 2767           * when consumers dereference the reference.  This helps us catch
2768 2768           * consumers who use released references.  Furthermore, this lets
2769 2769           * consumers reuse the zone_ref_t structure without having to
2770 2770           * invoke zone_init_ref().
2771 2771           */
2772 2772          ref->zref_zone = NULL;
2773 2773  }
2774 2774  
2775 2775  void
2776 2776  zone_cred_hold(zone_t *z)
2777 2777  {
2778 2778          mutex_enter(&z->zone_lock);
2779 2779          z->zone_cred_ref++;
2780 2780          ASSERT(z->zone_cred_ref != 0);
2781 2781          mutex_exit(&z->zone_lock);
2782 2782  }
2783 2783  
2784 2784  void
2785 2785  zone_cred_rele(zone_t *z)
2786 2786  {
2787 2787          boolean_t wakeup;
2788 2788  
2789 2789          mutex_enter(&z->zone_lock);
2790 2790          ASSERT(z->zone_cred_ref != 0);
2791 2791          z->zone_cred_ref--;
2792 2792          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2793 2793                  /* no more refs, free the structure */
2794 2794                  mutex_exit(&z->zone_lock);
2795 2795                  zone_free(z);
2796 2796                  return;
2797 2797          }
2798 2798          /*
2799 2799           * If zone_destroy is waiting for the cred references to drain
2800 2800           * out, and they have, signal it.
2801 2801           */
2802 2802          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2803 2803              zone_status_get(z) >= ZONE_IS_DEAD);
2804 2804          mutex_exit(&z->zone_lock);
2805 2805  
2806 2806          if (wakeup) {
2807 2807                  /*
2808 2808                   * Grabbing zonehash_lock here effectively synchronizes with
2809 2809                   * zone_destroy() to avoid missed signals.
2810 2810                   */
2811 2811                  mutex_enter(&zonehash_lock);
2812 2812                  cv_broadcast(&zone_destroy_cv);
2813 2813                  mutex_exit(&zonehash_lock);
2814 2814          }
2815 2815  }
2816 2816  
2817 2817  void
2818 2818  zone_task_hold(zone_t *z)
2819 2819  {
2820 2820          mutex_enter(&z->zone_lock);
2821 2821          z->zone_ntasks++;
2822 2822          ASSERT(z->zone_ntasks != 0);
2823 2823          mutex_exit(&z->zone_lock);
2824 2824  }
2825 2825  
2826 2826  void
2827 2827  zone_task_rele(zone_t *zone)
2828 2828  {
                   /*
                    * Drop one task from the zone's task count.  When the count
                    * reaches 1 while the zone is SHUTTING_DOWN the zone is moved
                    * to ZONE_IS_EMPTY; when it reaches 0 the zone is moved to
                    * ZONE_IS_DEAD.
                    */
2829 2829          uint_t refcnt;
2830 2830  
2831 2831          mutex_enter(&zone->zone_lock);
2832 2832          ASSERT(zone->zone_ntasks != 0);
2833 2833          refcnt = --zone->zone_ntasks;
2834 2834          if (refcnt > 1) {       /* Common case */
2835 2835                  mutex_exit(&zone->zone_lock);
2836 2836                  return;
2837 2837          }
2838 2838          zone_hold_locked(zone); /* so we can use the zone_t later */
2839 2839          mutex_exit(&zone->zone_lock);
2840 2840          if (refcnt == 1) {
2841 2841                  /*
2842 2842                   * See if the zone is shutting down.
2843 2843                   */
2844 2844                  mutex_enter(&zone_status_lock);
2845 2845                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2846 2846                          goto out;
2847 2847                  }
2848 2848  
2849 2849                  /*
2850 2850                   * Make sure the ntasks didn't change since we
2851 2851                   * dropped zone_lock.
2852 2852                   */
2853 2853                  mutex_enter(&zone->zone_lock);
2854 2854                  if (refcnt != zone->zone_ntasks) {
2855 2855                          mutex_exit(&zone->zone_lock);
2856 2856                          goto out;
2857 2857                  }
2858 2858                  mutex_exit(&zone->zone_lock);
2859 2859  
2860 2860                  /*
2861 2861                   * No more user processes in the zone.  The zone is empty.
2862 2862                   */
2863 2863                  zone_status_set(zone, ZONE_IS_EMPTY);
2864 2864                  goto out;
2865 2865          }
2866 2866  
2867 2867          ASSERT(refcnt == 0);
2868 2868          /*
2869 2869           * zsched has exited; the zone is dead.
2870 2870           */
2871 2871          zone->zone_zsched = NULL;               /* paranoia */
2872 2872          mutex_enter(&zone_status_lock);
2873 2873          zone_status_set(zone, ZONE_IS_DEAD);
2874 2874  out:
                   /*
                    * Every path that reaches this label holds zone_status_lock.
                    * Release it, then drop the temporary hold taken above.
                    */
2875 2875          mutex_exit(&zone_status_lock);
2876 2876          zone_rele(zone);
2877 2877  }
2878 2878  
2879 2879  zoneid_t
2880 2880  getzoneid(void)
2881 2881  {
2882 2882          return (curproc->p_zone->zone_id);
2883 2883  }
2884 2884  
2885 2885  /*
2886 2886   * Internal versions of zone_find_by_*().  These don't zone_hold() or
2887 2887   * check the validity of a zone's state.
2888 2888   */
2889 2889  static zone_t *
2890 2890  zone_find_all_by_id(zoneid_t zoneid)
2891 2891  {
2892 2892          mod_hash_val_t hv;
2893 2893          zone_t *zone = NULL;
2894 2894  
2895 2895          ASSERT(MUTEX_HELD(&zonehash_lock));
2896 2896  
2897 2897          if (mod_hash_find(zonehashbyid,
2898 2898              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2899 2899                  zone = (zone_t *)hv;
2900 2900          return (zone);
2901 2901  }
2902 2902  
2903 2903  static zone_t *
2904 2904  zone_find_all_by_label(const ts_label_t *label)
2905 2905  {
2906 2906          mod_hash_val_t hv;
2907 2907          zone_t *zone = NULL;
2908 2908  
2909 2909          ASSERT(MUTEX_HELD(&zonehash_lock));
2910 2910  
2911 2911          /*
2912 2912           * zonehashbylabel is not maintained for unlabeled systems
2913 2913           */
2914 2914          if (!is_system_labeled())
2915 2915                  return (NULL);
2916 2916          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2917 2917                  zone = (zone_t *)hv;
2918 2918          return (zone);
2919 2919  }
2920 2920  
2921 2921  static zone_t *
2922 2922  zone_find_all_by_name(char *name)
2923 2923  {
2924 2924          mod_hash_val_t hv;
2925 2925          zone_t *zone = NULL;
2926 2926  
2927 2927          ASSERT(MUTEX_HELD(&zonehash_lock));
2928 2928  
2929 2929          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2930 2930                  zone = (zone_t *)hv;
2931 2931          return (zone);
2932 2932  }
2933 2933  
2934 2934  /*
2935 2935   * Public interface for looking up a zone by zoneid.  Only returns the zone if
2936 2936   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2937 2937   * Caller must call zone_rele() once it is done with the zone.
2938 2938   *
2939 2939   * The zone may begin the zone_destroy() sequence immediately after this
2940 2940   * function returns, but may be safely used until zone_rele() is called.
2941 2941   */
2942 2942  zone_t *
2943 2943  zone_find_by_id(zoneid_t zoneid)
2944 2944  {
2945 2945          zone_t *zone;
2946 2946          zone_status_t status;
2947 2947  
2948 2948          mutex_enter(&zonehash_lock);
2949 2949          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2950 2950                  mutex_exit(&zonehash_lock);
2951 2951                  return (NULL);
2952 2952          }
2953 2953          status = zone_status_get(zone);
2954 2954          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2955 2955                  /*
2956 2956                   * For all practical purposes the zone doesn't exist.
2957 2957                   */
2958 2958                  mutex_exit(&zonehash_lock);
2959 2959                  return (NULL);
2960 2960          }
2961 2961          zone_hold(zone);
2962 2962          mutex_exit(&zonehash_lock);
2963 2963          return (zone);
2964 2964  }
2965 2965  
2966 2966  /*
2967 2967   * Similar to zone_find_by_id, but using zone label as the key.
2968 2968   */
2969 2969  zone_t *
2970 2970  zone_find_by_label(const ts_label_t *label)
2971 2971  {
2972 2972          zone_t *zone;
2973 2973          zone_status_t status;
2974 2974  
2975 2975          mutex_enter(&zonehash_lock);
2976 2976          if ((zone = zone_find_all_by_label(label)) == NULL) {
2977 2977                  mutex_exit(&zonehash_lock);
2978 2978                  return (NULL);
2979 2979          }
2980 2980  
2981 2981          status = zone_status_get(zone);
2982 2982          if (status > ZONE_IS_DOWN) {
2983 2983                  /*
2984 2984                   * For all practical purposes the zone doesn't exist.
2985 2985                   */
2986 2986                  mutex_exit(&zonehash_lock);
2987 2987                  return (NULL);
2988 2988          }
2989 2989          zone_hold(zone);
2990 2990          mutex_exit(&zonehash_lock);
2991 2991          return (zone);
2992 2992  }
2993 2993  
2994 2994  /*
2995 2995   * Similar to zone_find_by_id, but using zone name as the key.
2996 2996   */
2997 2997  zone_t *
2998 2998  zone_find_by_name(char *name)
2999 2999  {
3000 3000          zone_t *zone;
3001 3001          zone_status_t status;
3002 3002  
3003 3003          mutex_enter(&zonehash_lock);
3004 3004          if ((zone = zone_find_all_by_name(name)) == NULL) {
3005 3005                  mutex_exit(&zonehash_lock);
3006 3006                  return (NULL);
3007 3007          }
3008 3008          status = zone_status_get(zone);
3009 3009          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3010 3010                  /*
3011 3011                   * For all practical purposes the zone doesn't exist.
3012 3012                   */
3013 3013                  mutex_exit(&zonehash_lock);
3014 3014                  return (NULL);
3015 3015          }
3016 3016          zone_hold(zone);
3017 3017          mutex_exit(&zonehash_lock);
3018 3018          return (zone);
3019 3019  }
3020 3020  
3021 3021  /*
3022 3022   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3023 3023   * if there is a zone "foo" rooted at /foo/root, and the path argument
3024 3024   * is "/foo/root/proc", it will return the held zone_t corresponding to
3025 3025   * zone "foo".
3026 3026   *
3027 3027   * zone_find_by_path() always returns a non-NULL value, since at the
3028 3028   * very least every path will be contained in the global zone.
3029 3029   *
3030 3030   * As with the other zone_find_by_*() functions, the caller is
3031 3031   * responsible for zone_rele()ing the return value of this function.
3032 3032   */
3033 3033  zone_t *
3034 3034  zone_find_by_path(const char *path)
3035 3035  {
3036 3036          zone_t *zone;
3037 3037          zone_t *zret = NULL;
3038 3038          zone_status_t status;
3039 3039  
3040 3040          if (path == NULL) {
3041 3041                  /*
3042 3042                   * Call from rootconf().
3043 3043                   */
3044 3044                  zone_hold(global_zone);
3045 3045                  return (global_zone);
3046 3046          }
3047 3047          ASSERT(*path == '/');
3048 3048          mutex_enter(&zonehash_lock);
3049 3049          for (zone = list_head(&zone_active); zone != NULL;
3050 3050              zone = list_next(&zone_active, zone)) {
3051 3051                  if (ZONE_PATH_VISIBLE(path, zone))
3052 3052                          zret = zone;
3053 3053          }
3054 3054          ASSERT(zret != NULL);
3055 3055          status = zone_status_get(zret);
3056 3056          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3057 3057                  /*
3058 3058                   * Zone practically doesn't exist.
3059 3059                   */
3060 3060                  zret = global_zone;
3061 3061          }
3062 3062          zone_hold(zret);
3063 3063          mutex_exit(&zonehash_lock);
3064 3064          return (zret);
3065 3065  }
3066 3066  
3067 3067  /*
3068 3068   * Public interface for updating per-zone load averages.  Called once per
3069 3069   * second.
3070 3070   *
3071 3071   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3072 3072   */
3073 3073  void
3074 3074  zone_loadavg_update()
3075 3075  {
3076 3076          zone_t *zp;
3077 3077          zone_status_t status;
3078 3078          struct loadavg_s *lavg;
3079 3079          hrtime_t zone_total;
3080 3080          int i;
3081 3081          hrtime_t hr_avg;
3082 3082          int nrun;
3083 3083          static int64_t f[3] = { 135, 27, 9 };
3084 3084          int64_t q, r;
3085 3085  
3086 3086          mutex_enter(&zonehash_lock);
3087 3087          for (zp = list_head(&zone_active); zp != NULL;
3088 3088              zp = list_next(&zone_active, zp)) {
3089 3089                  mutex_enter(&zp->zone_lock);
3090 3090  
3091 3091                  /* Skip zones that are on the way down or not yet up */
3092 3092                  status = zone_status_get(zp);
3093 3093                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3094 3094                          /* For all practical purposes the zone doesn't exist. */
3095 3095                          mutex_exit(&zp->zone_lock);
3096 3096                          continue;
3097 3097                  }
3098 3098  
3099 3099                  /*
3100 3100                   * Update the 10 second moving average data in zone_loadavg.
3101 3101                   */
3102 3102                  lavg = &zp->zone_loadavg;
3103 3103  
3104 3104                  zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3105 3105                  scalehrtime(&zone_total);
3106 3106  
3107 3107                  /* The zone_total should always be increasing. */
3108 3108                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3109 3109                      zone_total - lavg->lg_total : 0;
3110 3110                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3111 3111                  /* lg_total holds the prev. 1 sec. total */
3112 3112                  lavg->lg_total = zone_total;
3113 3113  
3114 3114                  /*
3115 3115                   * To simplify the calculation, we don't calculate the load avg.
3116 3116                   * until the zone has been up for at least 10 seconds and our
3117 3117                   * moving average is thus full.
3118 3118                   */
3119 3119                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3120 3120                          lavg->lg_len++;
3121 3121                          mutex_exit(&zp->zone_lock);
3122 3122                          continue;
3123 3123                  }
3124 3124  
3125 3125                  /* Now calculate the 1min, 5min, 15 min load avg. */
3126 3126                  hr_avg = 0;
3127 3127                  for (i = 0; i < S_LOADAVG_SZ; i++)
3128 3128                          hr_avg += lavg->lg_loads[i];
3129 3129                  hr_avg = hr_avg / S_LOADAVG_SZ;
3130 3130                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3131 3131  
3132 3132                  /* Compute load avg. See comment in calcloadavg() */
3133 3133                  for (i = 0; i < 3; i++) {
3134 3134                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3135 3135                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3136 3136                          zp->zone_hp_avenrun[i] +=
3137 3137                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3138 3138  
3139 3139                          /* avenrun[] can only hold 31 bits of load avg. */
3140 3140                          if (zp->zone_hp_avenrun[i] <
3141 3141                              ((uint64_t)1<<(31+16-FSHIFT)))
3142 3142                                  zp->zone_avenrun[i] = (int32_t)
3143 3143                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3144 3144                          else
3145 3145                                  zp->zone_avenrun[i] = 0x7fffffff;
3146 3146                  }
3147 3147  
3148 3148                  mutex_exit(&zp->zone_lock);
3149 3149          }
3150 3150          mutex_exit(&zonehash_lock);
3151 3151  }
3152 3152  
3153 3153  /*
3154 3154   * Get the number of cpus visible to this zone.  The system-wide global
3155 3155   * 'ncpus' is returned if pools are disabled, the caller is in the
3156 3156   * global zone, or a NULL zone argument is passed in.
3157 3157   */
3158 3158  int
3159 3159  zone_ncpus_get(zone_t *zone)
3160 3160  {
3161 3161          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3162 3162  
3163 3163          return (myncpus != 0 ? myncpus : ncpus);
3164 3164  }
3165 3165  
3166 3166  /*
3167 3167   * Get the number of online cpus visible to this zone.  The system-wide
3168 3168   * global 'ncpus_online' is returned if pools are disabled, the caller
3169 3169   * is in the global zone, or a NULL zone argument is passed in.
3170 3170   */
3171 3171  int
3172 3172  zone_ncpus_online_get(zone_t *zone)
3173 3173  {
3174 3174          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3175 3175  
3176 3176          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3177 3177  }
3178 3178  
3179 3179  /*
3180 3180   * Return the pool to which the zone is currently bound.
3181 3181   */
3182 3182  pool_t *
3183 3183  zone_pool_get(zone_t *zone)
3184 3184  {
3185 3185          ASSERT(pool_lock_held());
3186 3186  
3187 3187          return (zone->zone_pool);
3188 3188  }
3189 3189  
3190 3190  /*
3191 3191   * Set the zone's pool pointer and update the zone's visibility to match
3192 3192   * the resources in the new pool.
3193 3193   */
3194 3194  void
3195 3195  zone_pool_set(zone_t *zone, pool_t *pool)
3196 3196  {
3197 3197          ASSERT(pool_lock_held());
3198 3198          ASSERT(MUTEX_HELD(&cpu_lock));
3199 3199  
3200 3200          zone->zone_pool = pool;
3201 3201          zone_pset_set(zone, pool->pool_pset->pset_id);
3202 3202  }
3203 3203  
3204 3204  /*
3205 3205   * Return the cached value of the id of the processor set to which the
3206 3206   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3207 3207   * facility is disabled.
3208 3208   */
3209 3209  psetid_t
3210 3210  zone_pset_get(zone_t *zone)
3211 3211  {
3212 3212          ASSERT(MUTEX_HELD(&cpu_lock));
3213 3213  
3214 3214          return (zone->zone_psetid);
3215 3215  }
3216 3216  
3217 3217  /*
3218 3218   * Set the cached value of the id of the processor set to which the zone
3219 3219   * is currently bound.  Also update the zone's visibility to match the
3220 3220   * resources in the new processor set.
3221 3221   */
3222 3222  void
3223 3223  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3224 3224  {
3225 3225          psetid_t oldpsetid;
3226 3226  
3227 3227          ASSERT(MUTEX_HELD(&cpu_lock));
3228 3228          oldpsetid = zone_pset_get(zone);
3229 3229  
3230 3230          if (oldpsetid == newpsetid)
3231 3231                  return;
3232 3232          /*
3233 3233           * Global zone sees all.
3234 3234           */
3235 3235          if (zone != global_zone) {
3236 3236                  zone->zone_psetid = newpsetid;
3237 3237                  if (newpsetid != ZONE_PS_INVAL)
3238 3238                          pool_pset_visibility_add(newpsetid, zone);
3239 3239                  if (oldpsetid != ZONE_PS_INVAL)
3240 3240                          pool_pset_visibility_remove(oldpsetid, zone);
3241 3241          }
3242 3242          /*
3243 3243           * Disabling pools, so we should start using the global values
3244 3244           * for ncpus and ncpus_online.
3245 3245           */
3246 3246          if (newpsetid == ZONE_PS_INVAL) {
3247 3247                  zone->zone_ncpus = 0;
3248 3248                  zone->zone_ncpus_online = 0;
3249 3249          }
3250 3250  }
3251 3251  
3252 3252  /*
3253 3253   * Walk the list of active zones and issue the provided callback for
3254 3254   * each of them.
3255 3255   *
3256 3256   * Caller must not be holding any locks that may be acquired under
3257 3257   * zonehash_lock.  See comment at the beginning of the file for a list of
3258 3258   * common locks and their interactions with zones.
3259 3259   */
3260 3260  int
3261 3261  zone_walk(int (*cb)(zone_t *, void *), void *data)
3262 3262  {
3263 3263          zone_t *zone;
3264 3264          int ret = 0;
3265 3265          zone_status_t status;
3266 3266  
3267 3267          mutex_enter(&zonehash_lock);
3268 3268          for (zone = list_head(&zone_active); zone != NULL;
3269 3269              zone = list_next(&zone_active, zone)) {
3270 3270                  /*
3271 3271                   * Skip zones that shouldn't be externally visible.
3272 3272                   */
3273 3273                  status = zone_status_get(zone);
3274 3274                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3275 3275                          continue;
3276 3276                  /*
3277 3277                   * Bail immediately if any callback invocation returns a
3278 3278                   * non-zero value.
3279 3279                   */
3280 3280                  ret = (*cb)(zone, data);
3281 3281                  if (ret != 0)
3282 3282                          break;
3283 3283          }
3284 3284          mutex_exit(&zonehash_lock);
3285 3285          return (ret);
3286 3286  }
3287 3287  
/*
 * Resolve the user-supplied path 'upath' and record it as the root of
 * 'zone'.  On success the zone holds a reference on the root vnode,
 * zone_rootpath is a newly allocated copy of the resolved path with a
 * trailing '/', zone_rootpathlen is the size of that allocation, and 0 is
 * returned.  On failure an errno value is returned and the zone is left
 * untouched.
 */
static int
zone_set_root(zone_t *zone, const char *upath)
{
        vnode_t *vp;
        int trycount;
        int error = 0;
        char *path;
        struct pathname upn, pn;
        size_t pathlen;

        /* Copy the path in from user space. */
        if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
                return (error);

        pn_alloc(&pn);

        /* prevent infinite loop */
        trycount = 10;
        for (;;) {
                /* Give up with ESTALE once the retry budget is used up. */
                if (--trycount <= 0) {
                        error = ESTALE;
                        goto out;
                }

                if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
                        /*
                         * VOP_ACCESS() may cover 'vp' with a new
                         * filesystem, if 'vp' is an autoFS vnode.
                         * Get the new 'vp' if so.
                         */
                        if ((error =
                            VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
                            (!vn_ismntpt(vp) ||
                            (error = traverse(&vp)) == 0)) {
                                /*
                                 * Two extra bytes: one for the trailing
                                 * '/' and one for the terminating NUL.
                                 */
                                pathlen = pn.pn_pathlen + 2;
                                path = kmem_alloc(pathlen, KM_SLEEP);
                                (void) strncpy(path, pn.pn_path,
                                    pn.pn_pathlen + 1);
                                path[pathlen - 2] = '/';
                                path[pathlen - 1] = '\0';
                                pn_free(&pn);
                                pn_free(&upn);

                                /* Success! */
                                break;
                        }
                        /* Access check or traverse failed: drop the vnode. */
                        VN_RELE(vp);
                }
                /* Only ESTALE is worth retrying. */
                if (error != ESTALE)
                        goto out;
        }

        ASSERT(error == 0);
        zone->zone_rootvp = vp;         /* we hold a reference to vp */
        zone->zone_rootpath = path;
        zone->zone_rootpathlen = pathlen;
        /* A root path ending in "/lu/" marks the zone as a scratch zone. */
        if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
                zone->zone_flags |= ZF_IS_SCRATCH;
        return (0);

out:
        /* Failure path: both pathname buffers are still allocated. */
        pn_free(&pn);
        pn_free(&upn);
        return (error);
}
3352 3352  
/*
 * ASCII-only alphanumeric test: true for '0'-'9', 'a'-'z' and 'A'-'Z'.
 * The argument is evaluated more than once, so it must be free of side
 * effects.
 */
#define isalnum(c) \
        (((c) >= '0' && (c) <= '9') || \
        ((c) >= 'a' && (c) <= 'z') || \
        ((c) >= 'A' && (c) <= 'Z'))
3356 3356  
3357 3357  static int
3358 3358  zone_set_name(zone_t *zone, const char *uname)
3359 3359  {
3360 3360          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3361 3361          size_t len;
3362 3362          int i, err;
3363 3363  
3364 3364          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3365 3365                  kmem_free(kname, ZONENAME_MAX);
3366 3366                  return (err);   /* EFAULT or ENAMETOOLONG */
3367 3367          }
3368 3368  
3369 3369          /* must be less than ZONENAME_MAX */
3370 3370          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3371 3371                  kmem_free(kname, ZONENAME_MAX);
3372 3372                  return (EINVAL);
3373 3373          }
3374 3374  
3375 3375          /*
3376 3376           * Name must start with an alphanumeric and must contain only
3377 3377           * alphanumerics, '-', '_' and '.'.
3378 3378           */
3379 3379          if (!isalnum(kname[0])) {
3380 3380                  kmem_free(kname, ZONENAME_MAX);
3381 3381                  return (EINVAL);
3382 3382          }
3383 3383          for (i = 1; i < len - 1; i++) {
3384 3384                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3385 3385                      kname[i] != '.') {
3386 3386                          kmem_free(kname, ZONENAME_MAX);
3387 3387                          return (EINVAL);
3388 3388                  }
3389 3389          }
3390 3390  
3391 3391          zone->zone_name = kname;
3392 3392          return (0);
3393 3393  }
3394 3394  
3395 3395  /*
3396 3396   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3397 3397   * is NULL or it points to a zone with no hostid emulation, then the machine's
3398 3398   * hostid (i.e., the global zone's hostid) is returned.  This function returns
3399 3399   * zero if neither the zone nor the host machine (global zone) have hostids.  It
3400 3400   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3401 3401   * hostid and the machine's hostid is invalid.
3402 3402   */
3403 3403  uint32_t
3404 3404  zone_get_hostid(zone_t *zonep)
3405 3405  {
3406 3406          unsigned long machine_hostid;
3407 3407  
3408 3408          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3409 3409                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3410 3410                          return (HW_INVALID_HOSTID);
3411 3411                  return ((uint32_t)machine_hostid);
3412 3412          }
3413 3413          return (zonep->zone_hostid);
3414 3414  }
3415 3415  
3416 3416  /*
3417 3417   * Similar to thread_create(), but makes sure the thread is in the appropriate
3418 3418   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3419 3419   */
3420 3420  /*ARGSUSED*/
3421 3421  kthread_t *
3422 3422  zthread_create(
3423 3423      caddr_t stk,
3424 3424      size_t stksize,
3425 3425      void (*proc)(),
3426 3426      void *arg,
3427 3427      size_t len,
3428 3428      pri_t pri)
3429 3429  {
3430 3430          kthread_t *t;
3431 3431          zone_t *zone = curproc->p_zone;
3432 3432          proc_t *pp = zone->zone_zsched;
3433 3433  
3434 3434          zone_hold(zone);        /* Reference to be dropped when thread exits */
3435 3435  
3436 3436          /*
3437 3437           * No-one should be trying to create threads if the zone is shutting
3438 3438           * down and there aren't any kernel threads around.  See comment
3439 3439           * in zthread_exit().
3440 3440           */
3441 3441          ASSERT(!(zone->zone_kthreads == NULL &&
3442 3442              zone_status_get(zone) >= ZONE_IS_EMPTY));
3443 3443          /*
3444 3444           * Create a thread, but don't let it run until we've finished setting
3445 3445           * things up.
3446 3446           */
3447 3447          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3448 3448          ASSERT(t->t_forw == NULL);
3449 3449          mutex_enter(&zone_status_lock);
3450 3450          if (zone->zone_kthreads == NULL) {
3451 3451                  t->t_forw = t->t_back = t;
3452 3452          } else {
3453 3453                  kthread_t *tx = zone->zone_kthreads;
3454 3454  
3455 3455                  t->t_forw = tx;
3456 3456                  t->t_back = tx->t_back;
3457 3457                  tx->t_back->t_forw = t;
3458 3458                  tx->t_back = t;
3459 3459          }
3460 3460          zone->zone_kthreads = t;
3461 3461          mutex_exit(&zone_status_lock);
3462 3462  
3463 3463          mutex_enter(&pp->p_lock);
3464 3464          t->t_proc_flag |= TP_ZTHREAD;
3465 3465          project_rele(t->t_proj);
3466 3466          t->t_proj = project_hold(pp->p_task->tk_proj);
3467 3467  
3468 3468          /*
3469 3469           * Setup complete, let it run.
3470 3470           */
3471 3471          thread_lock(t);
3472 3472          t->t_schedflag |= TS_ALLSTART;
3473 3473          setrun_locked(t);
3474 3474          thread_unlock(t);
3475 3475  
3476 3476          mutex_exit(&pp->p_lock);
3477 3477  
3478 3478          return (t);
3479 3479  }
3480 3480  
/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 */
void
zthread_exit(void)
{
        kthread_t *t = curthread;
        proc_t *pp = curproc;
        zone_t *zone = pp->p_zone;

        /*
         * zone_status_lock covers both the zone's kernel thread list and
         * the status transition below.
         */
        mutex_enter(&zone_status_lock);

        /*
         * Reparent to p0
         */
        kpreempt_disable();
        mutex_enter(&pp->p_lock);
        t->t_proc_flag &= ~TP_ZTHREAD;
        t->t_procp = &p0;
        hat_thread_exit(t);
        mutex_exit(&pp->p_lock);
        kpreempt_enable();

        /* A thread linked to itself is the last one on the list. */
        if (t->t_back == t) {
                ASSERT(t->t_forw == t);
                /*
                 * If the zone is empty, once the thread count
                 * goes to zero no further kernel threads can be
                 * created.  This is because if the creator is a process
                 * in the zone, then it must have exited before the zone
                 * state could be set to ZONE_IS_EMPTY.
                 * Otherwise, if the creator is a kernel thread in the
                 * zone, the thread count is non-zero.
                 *
                 * This really means that non-zone kernel threads should
                 * not create zone kernel threads.
                 */
                zone->zone_kthreads = NULL;
                if (zone_status_get(zone) == ZONE_IS_EMPTY) {
                        zone_status_set(zone, ZONE_IS_DOWN);
                        /*
                         * Remove any CPU caps on this zone.
                         */
                        cpucaps_zone_remove(zone);
                }
        } else {
                /* Unlink this thread, moving the list head if needed. */
                t->t_forw->t_back = t->t_back;
                t->t_back->t_forw = t->t_forw;
                if (zone->zone_kthreads == t)
                        zone->zone_kthreads = t->t_forw;
        }
        mutex_exit(&zone_status_lock);
        /* Drop the hold taken in zthread_create(). */
        zone_rele(zone);
        thread_exit();
        /* NOTREACHED */
}
3538 3538  
3539 3539  static void
3540 3540  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3541 3541  {
3542 3542          vnode_t *oldvp;
3543 3543  
3544 3544          /* we're going to hold a reference here to the directory */
3545 3545          VN_HOLD(vp);
3546 3546  
3547 3547          /* update abs cwd/root path see c2/audit.c */
3548 3548          if (AU_AUDITING())
3549 3549                  audit_chdirec(vp, vpp);
3550 3550  
3551 3551          mutex_enter(&pp->p_lock);
3552 3552          oldvp = *vpp;
3553 3553          *vpp = vp;
3554 3554          mutex_exit(&pp->p_lock);
3555 3555          if (oldvp != NULL)
3556 3556                  VN_RELE(oldvp);
3557 3557  }
3558 3558  
3559 3559  /*
3560 3560   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3561 3561   */
3562 3562  static int
3563 3563  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3564 3564  {
3565 3565          nvpair_t *nvp = NULL;
3566 3566          boolean_t priv_set = B_FALSE;
3567 3567          boolean_t limit_set = B_FALSE;
3568 3568          boolean_t action_set = B_FALSE;
3569 3569  
3570 3570          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3571 3571                  const char *name;
3572 3572                  uint64_t ui64;
3573 3573  
3574 3574                  name = nvpair_name(nvp);
3575 3575                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3576 3576                          return (EINVAL);
3577 3577                  (void) nvpair_value_uint64(nvp, &ui64);
3578 3578                  if (strcmp(name, "privilege") == 0) {
3579 3579                          /*
3580 3580                           * Currently only privileged values are allowed, but
3581 3581                           * this may change in the future.
3582 3582                           */
3583 3583                          if (ui64 != RCPRIV_PRIVILEGED)
3584 3584                                  return (EINVAL);
3585 3585                          rv->rcv_privilege = ui64;
3586 3586                          priv_set = B_TRUE;
3587 3587                  } else if (strcmp(name, "limit") == 0) {
3588 3588                          rv->rcv_value = ui64;
3589 3589                          limit_set = B_TRUE;
3590 3590                  } else if (strcmp(name, "action") == 0) {
3591 3591                          if (ui64 != RCTL_LOCAL_NOACTION &&
3592 3592                              ui64 != RCTL_LOCAL_DENY)
3593 3593                                  return (EINVAL);
3594 3594                          rv->rcv_flagaction = ui64;
3595 3595                          action_set = B_TRUE;
3596 3596                  } else {
3597 3597                          return (EINVAL);
3598 3598                  }
3599 3599          }
3600 3600  
3601 3601          if (!(priv_set && limit_set && action_set))
3602 3602                  return (EINVAL);
3603 3603          rv->rcv_action_signal = 0;
3604 3604          rv->rcv_action_recipient = NULL;
3605 3605          rv->rcv_action_recip_pid = -1;
3606 3606          rv->rcv_firing_time = 0;
3607 3607  
3608 3608          return (0);
3609 3609  }
3610 3610  
3611 3611  /*
3612 3612   * Non-global zone version of start_init.
3613 3613   */
void
zone_start_init(void)
{
        proc_t *p = ttoproc(curthread);
        zone_t *z = p->p_zone;

        /* This path is only for non-global zones. */
        ASSERT(!INGLOBALZONE(curproc));

        /*
         * For all purposes (ZONE_ATTR_INITPID and restart_init),
         * storing just the pid of init is sufficient.
         */
        z->zone_proc_initpid = p->p_pid;

        /*
         * We maintain zone_boot_err so that we can return the cause of the
         * failure back to the caller of the zone_boot syscall.
         */
        p->p_zone->zone_boot_err = start_init_common();

        /*
         * We will prevent booting zones from becoming running zones if the
         * global zone is shutting down.
         */
        mutex_enter(&zone_status_lock);
        if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
            ZONE_IS_SHUTTING_DOWN) {
                /*
                 * Make sure we are still in the booting state-- we could have
                 * raced and already be shutting down, or even further along.
                 */
                if (zone_status_get(z) == ZONE_IS_BOOTING) {
                        zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
                }
                /* Drop the status lock before disposing of the process. */
                mutex_exit(&zone_status_lock);
                /* It's gone bad, dispose of the process */
                if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
                        /*
                         * proc_exit() reported failure; exit only this
                         * lwp, with p_lock held across the call.
                         */
                        mutex_enter(&p->p_lock);
                        ASSERT(p->p_flag & SEXITLWPS);
                        lwp_exit();
                }
        } else {
                /* Only promote to RUNNING if the zone is still BOOTING. */
                if (zone_status_get(z) == ZONE_IS_BOOTING)
                        zone_status_set(z, ZONE_IS_RUNNING);
                mutex_exit(&zone_status_lock);
                /* cause the process to return to userland. */
                lwp_rtt();
        }
}
3663 3663  
3664 3664  struct zsched_arg {
3665 3665          zone_t *zone;
3666 3666          nvlist_t *nvlist;
3667 3667  };
3668 3668  
3669 3669  /*
3670 3670   * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3671 3671   * anything to do with scheduling, but rather with the fact that
3672 3672   * per-zone kernel threads are parented to zsched, just like regular
3673 3673   * kernel threads are parented to sched (p0).
3674 3674   *
3675 3675   * zsched is also responsible for launching init for the zone.
3676 3676   */
3677 3677  static void
3678 3678  zsched(void *arg)
3679 3679  {
3680 3680          struct zsched_arg *za = arg;
3681 3681          proc_t *pp = curproc;
3682 3682          proc_t *initp = proc_init;
3683 3683          zone_t *zone = za->zone;
3684 3684          cred_t *cr, *oldcred;
3685 3685          rctl_set_t *set;
3686 3686          rctl_alloc_gp_t *gp;
3687 3687          contract_t *ct = NULL;
3688 3688          task_t *tk, *oldtk;
3689 3689          rctl_entity_p_t e;
3690 3690          kproject_t *pj;
3691 3691  
3692 3692          nvlist_t *nvl = za->nvlist;
3693 3693          nvpair_t *nvp = NULL;
3694 3694  
3695 3695          bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3696 3696          bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3697 3697          PTOU(pp)->u_argc = 0;
3698 3698          PTOU(pp)->u_argv = NULL;
3699 3699          PTOU(pp)->u_envp = NULL;
3700 3700          closeall(P_FINFO(pp));
3701 3701  
3702 3702          /*
3703 3703           * We are this zone's "zsched" process.  As the zone isn't generally
3704 3704           * visible yet we don't need to grab any locks before initializing its
3705 3705           * zone_proc pointer.
3706 3706           */
3707 3707          zone_hold(zone);  /* this hold is released by zone_destroy() */
3708 3708          zone->zone_zsched = pp;
3709 3709          mutex_enter(&pp->p_lock);
3710 3710          pp->p_zone = zone;
3711 3711          mutex_exit(&pp->p_lock);
3712 3712  
3713 3713          /*
3714 3714           * Disassociate process from its 'parent'; parent ourselves to init
3715 3715           * (pid 1) and change other values as needed.
3716 3716           */
3717 3717          sess_create();
3718 3718  
3719 3719          mutex_enter(&pidlock);
3720 3720          proc_detach(pp);
3721 3721          pp->p_ppid = 1;
3722 3722          pp->p_flag |= SZONETOP;
3723 3723          pp->p_ancpid = 1;
3724 3724          pp->p_parent = initp;
3725 3725          pp->p_psibling = NULL;
3726 3726          if (initp->p_child)
3727 3727                  initp->p_child->p_psibling = pp;
3728 3728          pp->p_sibling = initp->p_child;
3729 3729          initp->p_child = pp;
3730 3730  
3731 3731          /* Decrement what newproc() incremented. */
3732 3732          upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3733 3733          /*
3734 3734           * Our credentials are about to become kcred-like, so we don't care
3735 3735           * about the caller's ruid.
3736 3736           */
3737 3737          upcount_inc(crgetruid(kcred), zone->zone_id);
3738 3738          mutex_exit(&pidlock);
3739 3739  
3740 3740          /*
3741 3741           * getting out of global zone, so decrement lwp and process counts
3742 3742           */
3743 3743          pj = pp->p_task->tk_proj;
3744 3744          mutex_enter(&global_zone->zone_nlwps_lock);
3745 3745          pj->kpj_nlwps -= pp->p_lwpcnt;
3746 3746          global_zone->zone_nlwps -= pp->p_lwpcnt;
3747 3747          pj->kpj_nprocs--;
3748 3748          global_zone->zone_nprocs--;
3749 3749          mutex_exit(&global_zone->zone_nlwps_lock);
3750 3750  
3751 3751          /*
3752 3752           * Decrement locked memory counts on old zone and project.
3753 3753           */
3754 3754          mutex_enter(&global_zone->zone_mem_lock);
3755 3755          global_zone->zone_locked_mem -= pp->p_locked_mem;
3756 3756          pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3757 3757          mutex_exit(&global_zone->zone_mem_lock);
3758 3758  
3759 3759          /*
3760 3760           * Create and join a new task in project '0' of this zone.
3761 3761           *
3762 3762           * We don't need to call holdlwps() since we know we're the only lwp in
3763 3763           * this process.
3764 3764           *
3765 3765           * task_join() returns with p_lock held.
3766 3766           */
3767 3767          tk = task_create(0, zone);
3768 3768          mutex_enter(&cpu_lock);
3769 3769          oldtk = task_join(tk, 0);
3770 3770  
3771 3771          pj = pp->p_task->tk_proj;
3772 3772  
3773 3773          mutex_enter(&zone->zone_mem_lock);
3774 3774          zone->zone_locked_mem += pp->p_locked_mem;
3775 3775          pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3776 3776          mutex_exit(&zone->zone_mem_lock);
3777 3777  
3778 3778          /*
3779 3779           * add lwp and process counts to zsched's zone, and increment
3780 3780           * project's task and process count due to the task created in
3781 3781           * the above task_create.
3782 3782           */
3783 3783          mutex_enter(&zone->zone_nlwps_lock);
3784 3784          pj->kpj_nlwps += pp->p_lwpcnt;
3785 3785          pj->kpj_ntasks += 1;
3786 3786          zone->zone_nlwps += pp->p_lwpcnt;
3787 3787          pj->kpj_nprocs++;
3788 3788          zone->zone_nprocs++;
3789 3789          mutex_exit(&zone->zone_nlwps_lock);
3790 3790  
3791 3791          mutex_exit(&curproc->p_lock);
3792 3792          mutex_exit(&cpu_lock);
3793 3793          task_rele(oldtk);
3794 3794  
3795 3795          /*
3796 3796           * The process was created by a process in the global zone, hence the
3797 3797           * credentials are wrong.  We might as well have kcred-ish credentials.
3798 3798           */
3799 3799          cr = zone->zone_kcred;
3800 3800          crhold(cr);
3801 3801          mutex_enter(&pp->p_crlock);
3802 3802          oldcred = pp->p_cred;
3803 3803          pp->p_cred = cr;
3804 3804          mutex_exit(&pp->p_crlock);
3805 3805          crfree(oldcred);
3806 3806  
3807 3807          /*
3808 3808           * Hold credentials again (for thread)
3809 3809           */
3810 3810          crhold(cr);
3811 3811  
3812 3812          /*
3813 3813           * p_lwpcnt can't change since this is a kernel process.
3814 3814           */
3815 3815          crset(pp, cr);
3816 3816  
3817 3817          /*
3818 3818           * Chroot
3819 3819           */
3820 3820          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3821 3821          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3822 3822  
3823 3823          /*
3824 3824           * Initialize zone's rctl set.
3825 3825           */
3826 3826          set = rctl_set_create();
3827 3827          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3828 3828          mutex_enter(&pp->p_lock);
3829 3829          e.rcep_p.zone = zone;
3830 3830          e.rcep_t = RCENTITY_ZONE;
3831 3831          zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3832 3832          mutex_exit(&pp->p_lock);
3833 3833          rctl_prealloc_destroy(gp);
3834 3834  
3835 3835          /*
3836 3836           * Apply the rctls passed in to zone_create().  This is basically a list
3837 3837           * assignment: all of the old values are removed and the new ones
3838 3838           * inserted.  That is, if an empty list is passed in, all values are
3839 3839           * removed.
3840 3840           */
3841 3841          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3842 3842                  rctl_dict_entry_t *rde;
3843 3843                  rctl_hndl_t hndl;
3844 3844                  char *name;
3845 3845                  nvlist_t **nvlarray;
3846 3846                  uint_t i, nelem;
3847 3847                  int error;      /* For ASSERT()s */
3848 3848  
3849 3849                  name = nvpair_name(nvp);
3850 3850                  hndl = rctl_hndl_lookup(name);
3851 3851                  ASSERT(hndl != -1);
3852 3852                  rde = rctl_dict_lookup_hndl(hndl);
3853 3853                  ASSERT(rde != NULL);
3854 3854  
3855 3855                  for (; /* ever */; ) {
3856 3856                          rctl_val_t oval;
3857 3857  
3858 3858                          mutex_enter(&pp->p_lock);
3859 3859                          error = rctl_local_get(hndl, NULL, &oval, pp);
3860 3860                          mutex_exit(&pp->p_lock);
3861 3861                          ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
3862 3862                          ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
3863 3863                          if (oval.rcv_privilege == RCPRIV_SYSTEM)
3864 3864                                  break;
3865 3865                          mutex_enter(&pp->p_lock);
3866 3866                          error = rctl_local_delete(hndl, &oval, pp);
3867 3867                          mutex_exit(&pp->p_lock);
3868 3868                          ASSERT(error == 0);
3869 3869                  }
3870 3870                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3871 3871                  ASSERT(error == 0);
3872 3872                  for (i = 0; i < nelem; i++) {
3873 3873                          rctl_val_t *nvalp;
3874 3874  
3875 3875                          nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
3876 3876                          error = nvlist2rctlval(nvlarray[i], nvalp);
3877 3877                          ASSERT(error == 0);
3878 3878                          /*
3879 3879                           * rctl_local_insert can fail if the value being
3880 3880                           * inserted is a duplicate; this is OK.
3881 3881                           */
3882 3882                          mutex_enter(&pp->p_lock);
3883 3883                          if (rctl_local_insert(hndl, nvalp, pp) != 0)
3884 3884                                  kmem_cache_free(rctl_val_cache, nvalp);
3885 3885                          mutex_exit(&pp->p_lock);
3886 3886                  }
3887 3887          }
3888 3888          /*
3889 3889           * Tell the world that we're done setting up.
3890 3890           *
3891 3891           * At this point we want to set the zone status to ZONE_IS_INITIALIZED
3892 3892           * and atomically set the zone's processor set visibility.  Once
3893 3893           * we drop pool_lock() this zone will automatically get updated
3894 3894           * to reflect any future changes to the pools configuration.
3895 3895           *
3896 3896           * Note that after we drop the locks below (zonehash_lock in
3897 3897           * particular) other operations such as a zone_getattr call can
3898 3898           * now proceed and observe the zone. That is the reason for doing a
3899 3899           * state transition to the INITIALIZED state.
3900 3900           */
3901 3901          pool_lock();
3902 3902          mutex_enter(&cpu_lock);
3903 3903          mutex_enter(&zonehash_lock);
3904 3904          zone_uniqid(zone);
3905 3905          zone_zsd_configure(zone);
3906 3906          if (pool_state == POOL_ENABLED)
3907 3907                  zone_pset_set(zone, pool_default->pool_pset->pset_id);
3908 3908          mutex_enter(&zone_status_lock);
3909 3909          ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
3910 3910          zone_status_set(zone, ZONE_IS_INITIALIZED);
3911 3911          mutex_exit(&zone_status_lock);
3912 3912          mutex_exit(&zonehash_lock);
3913 3913          mutex_exit(&cpu_lock);
3914 3914          pool_unlock();
3915 3915  
3916 3916          /* Now call the create callback for this key */
3917 3917          zsd_apply_all_keys(zsd_apply_create, zone);
3918 3918  
3919 3919          /* The callbacks are complete. Mark ZONE_IS_READY */
3920 3920          mutex_enter(&zone_status_lock);
3921 3921          ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
3922 3922          zone_status_set(zone, ZONE_IS_READY);
3923 3923          mutex_exit(&zone_status_lock);
3924 3924  
3925 3925          /*
3926 3926           * Once we see the zone transition to the ZONE_IS_BOOTING state,
3927 3927           * we launch init, and set the state to running.
3928 3928           */
3929 3929          zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
3930 3930  
3931 3931          if (zone_status_get(zone) == ZONE_IS_BOOTING) {
3932 3932                  id_t cid;
3933 3933  
3934 3934                  /*
3935 3935                   * Ok, this is a little complicated.  We need to grab the
3936 3936                   * zone's pool's scheduling class ID; note that by now, we
3937 3937                   * are already bound to a pool if we need to be (zoneadmd
3938 3938                   * will have done that to us while we're in the READY
3939 3939                   * state).  *But* the scheduling class for the zone's 'init'
3940 3940                   * must be explicitly passed to newproc, which doesn't
3941 3941                   * respect pool bindings.
3942 3942                   *
3943 3943                   * We hold the pool_lock across the call to newproc() to
3944 3944                   * close the obvious race: the pool's scheduling class
3945 3945                   * could change before we manage to create the LWP with
3946 3946                   * classid 'cid'.
3947 3947                   */
3948 3948                  pool_lock();
3949 3949                  if (zone->zone_defaultcid > 0)
3950 3950                          cid = zone->zone_defaultcid;
3951 3951                  else
3952 3952                          cid = pool_get_class(zone->zone_pool);
3953 3953                  if (cid == -1)
3954 3954                          cid = defaultcid;
3955 3955  
3956 3956                  /*
3957 3957                   * If this fails, zone_boot will ultimately fail.  The
3958 3958                   * state of the zone will be set to SHUTTING_DOWN-- userland
3959 3959                   * will have to tear down the zone, and fail, or try again.
3960 3960                   */
3961 3961                  if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
3962 3962                      minclsyspri - 1, &ct, 0)) != 0) {
3963 3963                          mutex_enter(&zone_status_lock);
3964 3964                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3965 3965                          mutex_exit(&zone_status_lock);
3966 3966                  } else {
3967 3967                          zone->zone_boot_time = gethrestime_sec();
3968 3968                  }
3969 3969  
3970 3970                  pool_unlock();
3971 3971          }
3972 3972  
3973 3973          /*
3974 3974           * Wait for zone_destroy() to be called.  This is what we spend
3975 3975           * most of our life doing.
3976 3976           */
3977 3977          zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
3978 3978  
3979 3979          if (ct)
3980 3980                  /*
3981 3981                   * At this point the process contract should be empty.
3982 3982                   * (Though if it isn't, it's not the end of the world.)
3983 3983                   */
3984 3984                  VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
3985 3985  
3986 3986          /*
3987 3987           * Allow kcred to be freed when all referring processes
3988 3988           * (including this one) go away.  We can't just do this in
3989 3989           * zone_free because we need to wait for the zone_cred_ref to
3990 3990           * drop to 0 before calling zone_free, and the existence of
3991 3991           * zone_kcred will prevent that.  Thus, we call crfree here to
3992 3992           * balance the crdup in zone_create.  The crhold calls earlier
3993 3993           * in zsched will be dropped when the thread and process exit.
3994 3994           */
3995 3995          crfree(zone->zone_kcred);
3996 3996          zone->zone_kcred = NULL;
3997 3997  
3998 3998          exit(CLD_EXITED, 0);
3999 3999  }
4000 4000  
4001 4001  /*
4002 4002   * Helper function to determine if there are any submounts of the
4003 4003   * provided path.  Used to make sure the zone doesn't "inherit" any
4004 4004   * mounts from before it is created.
4005 4005   */
4006 4006  static uint_t
4007 4007  zone_mount_count(const char *rootpath)
4008 4008  {
4009 4009          vfs_t *vfsp;
4010 4010          uint_t count = 0;
4011 4011          size_t rootpathlen = strlen(rootpath);
4012 4012  
4013 4013          /*
4014 4014           * Holding zonehash_lock prevents race conditions with
4015 4015           * vfs_list_add()/vfs_list_remove() since we serialize with
4016 4016           * zone_find_by_path().
4017 4017           */
4018 4018          ASSERT(MUTEX_HELD(&zonehash_lock));
4019 4019          /*
4020 4020           * The rootpath must end with a '/'
4021 4021           */
4022 4022          ASSERT(rootpath[rootpathlen - 1] == '/');
4023 4023  
4024 4024          /*
4025 4025           * This intentionally does not count the rootpath itself if that
4026 4026           * happens to be a mount point.
4027 4027           */
4028 4028          vfs_list_read_lock();
4029 4029          vfsp = rootvfs;
4030 4030          do {
4031 4031                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4032 4032                      rootpathlen) == 0)
4033 4033                          count++;
4034 4034                  vfsp = vfsp->vfs_next;
4035 4035          } while (vfsp != rootvfs);
4036 4036          vfs_list_unlock();
4037 4037          return (count);
4038 4038  }
4039 4039  
4040 4040  /*
4041 4041   * Helper function to make sure that a zone created on 'rootpath'
4042 4042   * wouldn't end up containing other zones' rootpaths.
4043 4043   */
4044 4044  static boolean_t
4045 4045  zone_is_nested(const char *rootpath)
4046 4046  {
4047 4047          zone_t *zone;
4048 4048          size_t rootpathlen = strlen(rootpath);
4049 4049          size_t len;
4050 4050  
4051 4051          ASSERT(MUTEX_HELD(&zonehash_lock));
4052 4052  
4053 4053          /*
4054 4054           * zone_set_root() appended '/' and '\0' at the end of rootpath
4055 4055           */
4056 4056          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4057 4057              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4058 4058                  return (B_TRUE);
4059 4059  
4060 4060          for (zone = list_head(&zone_active); zone != NULL;
4061 4061              zone = list_next(&zone_active, zone)) {
4062 4062                  if (zone == global_zone)
4063 4063                          continue;
4064 4064                  len = strlen(zone->zone_rootpath);
4065 4065                  if (strncmp(rootpath, zone->zone_rootpath,
4066 4066                      MIN(rootpathlen, len)) == 0)
4067 4067                          return (B_TRUE);
4068 4068          }
4069 4069          return (B_FALSE);
4070 4070  }
4071 4071  
4072 4072  static int
4073 4073  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4074 4074      size_t zone_privssz)
4075 4075  {
4076 4076          priv_set_t *privs;
4077 4077  
4078 4078          if (zone_privssz < sizeof (priv_set_t))
4079 4079                  return (ENOMEM);
4080 4080  
4081 4081          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4082 4082  
4083 4083          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4084 4084                  kmem_free(privs, sizeof (priv_set_t));
4085 4085                  return (EFAULT);
4086 4086          }
4087 4087  
4088 4088          zone->zone_privset = privs;
4089 4089          return (0);
4090 4090  }
4091 4091  
4092 4092  /*
4093 4093   * We make creative use of nvlists to pass in rctls from userland.  The list is
4094 4094   * a list of the following structures:
4095 4095   *
4096 4096   * (name = rctl_name, value = nvpair_list_array)
4097 4097   *
4098 4098   * Where each element of the nvpair_list_array is of the form:
4099 4099   *
4100 4100   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4101 4101   *      (name = "limit", value = uint64_t),
4102 4102   *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4103 4103   */
4104 4104  static int
4105 4105  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4106 4106  {
4107 4107          nvpair_t *nvp = NULL;
4108 4108          nvlist_t *nvl = NULL;
4109 4109          char *kbuf;
4110 4110          int error;
4111 4111          rctl_val_t rv;
4112 4112  
4113 4113          *nvlp = NULL;
4114 4114  
4115 4115          if (buflen == 0)
4116 4116                  return (0);
4117 4117  
4118 4118          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4119 4119                  return (ENOMEM);
4120 4120          if (copyin(ubuf, kbuf, buflen)) {
4121 4121                  error = EFAULT;
4122 4122                  goto out;
4123 4123          }
4124 4124          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4125 4125                  /*
4126 4126                   * nvl may have been allocated/free'd, but the value set to
4127 4127                   * non-NULL, so we reset it here.
4128 4128                   */
4129 4129                  nvl = NULL;
4130 4130                  error = EINVAL;
4131 4131                  goto out;
4132 4132          }
4133 4133          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4134 4134                  rctl_dict_entry_t *rde;
4135 4135                  rctl_hndl_t hndl;
4136 4136                  nvlist_t **nvlarray;
4137 4137                  uint_t i, nelem;
4138 4138                  char *name;
4139 4139  
4140 4140                  error = EINVAL;
4141 4141                  name = nvpair_name(nvp);
4142 4142                  if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4143 4143                      != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4144 4144                          goto out;
4145 4145                  }
4146 4146                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4147 4147                          goto out;
4148 4148                  }
4149 4149                  rde = rctl_dict_lookup_hndl(hndl);
4150 4150                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4151 4151                  ASSERT(error == 0);
4152 4152                  for (i = 0; i < nelem; i++) {
4153 4153                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4154 4154                                  goto out;
4155 4155                  }
4156 4156                  if (rctl_invalid_value(rde, &rv)) {
4157 4157                          error = EINVAL;
4158 4158                          goto out;
4159 4159                  }
4160 4160          }
4161 4161          error = 0;
4162 4162          *nvlp = nvl;
4163 4163  out:
4164 4164          kmem_free(kbuf, buflen);
4165 4165          if (error && nvl != NULL)
4166 4166                  nvlist_free(nvl);
4167 4167          return (error);
4168 4168  }
4169 4169  
4170 4170  int
4171 4171  zone_create_error(int er_error, int er_ext, int *er_out) {
4172 4172          if (er_out != NULL) {
4173 4173                  if (copyout(&er_ext, er_out, sizeof (int))) {
4174 4174                          return (set_errno(EFAULT));
4175 4175                  }
4176 4176          }
4177 4177          return (set_errno(er_error));
4178 4178  }
4179 4179  
4180 4180  static int
4181 4181  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4182 4182  {
4183 4183          ts_label_t *tsl;
4184 4184          bslabel_t blab;
4185 4185  
4186 4186          /* Get label from user */
4187 4187          if (copyin(lab, &blab, sizeof (blab)) != 0)
4188 4188                  return (EFAULT);
4189 4189          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4190 4190          if (tsl == NULL)
4191 4191                  return (ENOMEM);
4192 4192  
4193 4193          zone->zone_slabel = tsl;
4194 4194          return (0);
4195 4195  }
4196 4196  
4197 4197  /*
4198 4198   * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4199 4199   */
4200 4200  static int
4201 4201  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4202 4202  {
4203 4203          char *kbuf;
4204 4204          char *dataset, *next;
4205 4205          zone_dataset_t *zd;
4206 4206          size_t len;
4207 4207  
4208 4208          if (ubuf == NULL || buflen == 0)
4209 4209                  return (0);
4210 4210  
4211 4211          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4212 4212                  return (ENOMEM);
4213 4213  
4214 4214          if (copyin(ubuf, kbuf, buflen) != 0) {
4215 4215                  kmem_free(kbuf, buflen);
4216 4216                  return (EFAULT);
4217 4217          }
4218 4218  
4219 4219          dataset = next = kbuf;
4220 4220          for (;;) {
4221 4221                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4222 4222  
4223 4223                  next = strchr(dataset, ',');
4224 4224  
4225 4225                  if (next == NULL)
4226 4226                          len = strlen(dataset);
4227 4227                  else
4228 4228                          len = next - dataset;
4229 4229  
4230 4230                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4231 4231                  bcopy(dataset, zd->zd_dataset, len);
4232 4232                  zd->zd_dataset[len] = '\0';
4233 4233  
4234 4234                  list_insert_head(&zone->zone_datasets, zd);
4235 4235  
4236 4236                  if (next == NULL)
4237 4237                          break;
4238 4238  
4239 4239                  dataset = next + 1;
4240 4240          }
4241 4241  
4242 4242          kmem_free(kbuf, buflen);
4243 4243          return (0);
4244 4244  }
4245 4245  
4246 4246  /*
4247 4247   * System call to create/initialize a new zone named 'zone_name', rooted
4248 4248   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4249 4249   * and initialized with the zone-wide rctls described in 'rctlbuf', and
4250 4250   * with labeling set by 'match', 'doi', and 'label'.
4251 4251   *
4252 4252   * If extended error is non-null, we may use it to return more detailed
4253 4253   * error information.
4254 4254   */
4255 4255  static zoneid_t
4256 4256  zone_create(const char *zone_name, const char *zone_root,
4257 4257      const priv_set_t *zone_privs, size_t zone_privssz,
4258 4258      caddr_t rctlbuf, size_t rctlbufsz,
4259 4259      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4260 4260      int match, uint32_t doi, const bslabel_t *label,
4261 4261      int flags)
4262 4262  {
4263 4263          struct zsched_arg zarg;
4264 4264          nvlist_t *rctls = NULL;
4265 4265          proc_t *pp = curproc;
4266 4266          zone_t *zone, *ztmp;
4267 4267          zoneid_t zoneid;
4268 4268          int error;
4269 4269          int error2 = 0;
4270 4270          char *str;
4271 4271          cred_t *zkcr;
4272 4272          boolean_t insert_label_hash;
4273 4273  
4274 4274          if (secpolicy_zone_config(CRED()) != 0)
4275 4275                  return (set_errno(EPERM));
4276 4276  
4277 4277          /* can't boot zone from within chroot environment */
4278 4278          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4279 4279                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4280 4280                      extended_error));
4281 4281  
4282 4282          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4283 4283          zoneid = zone->zone_id = id_alloc(zoneid_space);
4284 4284          zone->zone_status = ZONE_IS_UNINITIALIZED;
4285 4285          zone->zone_pool = pool_default;
4286 4286          zone->zone_pool_mod = gethrtime();
4287 4287          zone->zone_psetid = ZONE_PS_INVAL;
4288 4288          zone->zone_ncpus = 0;
4289 4289          zone->zone_ncpus_online = 0;
4290 4290          zone->zone_restart_init = B_TRUE;
4291 4291          zone->zone_brand = &native_brand;
4292 4292          zone->zone_initname = NULL;
4293 4293          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4294 4294          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4295 4295          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4296 4296          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4297 4297          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4298 4298              offsetof(zone_ref_t, zref_linkage));
4299 4299          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4300 4300              offsetof(struct zsd_entry, zsd_linkage));
4301 4301          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4302 4302              offsetof(zone_dataset_t, zd_linkage));
4303 4303          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4304 4304              offsetof(zone_dl_t, zdl_linkage));
4305 4305          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4306 4306          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4307 4307  
4308 4308          if (flags & ZCF_NET_EXCL) {
4309 4309                  zone->zone_flags |= ZF_NET_EXCL;
4310 4310          }
4311 4311  
4312 4312          if ((error = zone_set_name(zone, zone_name)) != 0) {
4313 4313                  zone_free(zone);
4314 4314                  return (zone_create_error(error, 0, extended_error));
4315 4315          }
4316 4316  
4317 4317          if ((error = zone_set_root(zone, zone_root)) != 0) {
4318 4318                  zone_free(zone);
4319 4319                  return (zone_create_error(error, 0, extended_error));
4320 4320          }
4321 4321          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4322 4322                  zone_free(zone);
4323 4323                  return (zone_create_error(error, 0, extended_error));
4324 4324          }
4325 4325  
4326 4326          /* initialize node name to be the same as zone name */
4327 4327          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4328 4328          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4329 4329          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4330 4330  
4331 4331          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4332 4332          zone->zone_domain[0] = '\0';
4333 4333          zone->zone_hostid = HW_INVALID_HOSTID;
4334 4334          zone->zone_shares = 1;
4335 4335          zone->zone_shmmax = 0;
4336 4336          zone->zone_ipc.ipcq_shmmni = 0;
4337 4337          zone->zone_ipc.ipcq_semmni = 0;
4338 4338          zone->zone_ipc.ipcq_msgmni = 0;
4339 4339          zone->zone_bootargs = NULL;
4340 4340          zone->zone_fs_allowed = NULL;
4341 4341          zone->zone_initname =
4342 4342              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4343 4343          (void) strcpy(zone->zone_initname, zone_default_initname);
4344 4344          zone->zone_nlwps = 0;
4345 4345          zone->zone_nlwps_ctl = INT_MAX;
4346 4346          zone->zone_nprocs = 0;
4347 4347          zone->zone_nprocs_ctl = INT_MAX;
4348 4348          zone->zone_locked_mem = 0;
4349 4349          zone->zone_locked_mem_ctl = UINT64_MAX;
4350 4350          zone->zone_max_swap = 0;
4351 4351          zone->zone_max_swap_ctl = UINT64_MAX;
4352 4352          zone->zone_max_lofi = 0;
4353 4353          zone->zone_max_lofi_ctl = UINT64_MAX;
4354 4354          zone0.zone_lockedmem_kstat = NULL;
4355 4355          zone0.zone_swapresv_kstat = NULL;
4356 4356  
4357 4357          /*
4358 4358           * Zsched initializes the rctls.
4359 4359           */
4360 4360          zone->zone_rctls = NULL;
4361 4361  
4362 4362          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4363 4363                  zone_free(zone);
4364 4364                  return (zone_create_error(error, 0, extended_error));
4365 4365          }
4366 4366  
4367 4367          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4368 4368                  zone_free(zone);
4369 4369                  return (set_errno(error));
4370 4370          }
4371 4371  
4372 4372          /*
4373 4373           * Read in the trusted system parameters:
4374 4374           * match flag and sensitivity label.
4375 4375           */
4376 4376          zone->zone_match = match;
4377 4377          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4378 4378                  /* Fail if requested to set doi to anything but system's doi */
4379 4379                  if (doi != 0 && doi != default_doi) {
4380 4380                          zone_free(zone);
4381 4381                          return (set_errno(EINVAL));
4382 4382                  }
4383 4383                  /* Always apply system's doi to the zone */
4384 4384                  error = zone_set_label(zone, label, default_doi);
4385 4385                  if (error != 0) {
4386 4386                          zone_free(zone);
4387 4387                          return (set_errno(error));
4388 4388                  }
4389 4389                  insert_label_hash = B_TRUE;
4390 4390          } else {
4391 4391                  /* all zones get an admin_low label if system is not labeled */
4392 4392                  zone->zone_slabel = l_admin_low;
4393 4393                  label_hold(l_admin_low);
4394 4394                  insert_label_hash = B_FALSE;
4395 4395          }
4396 4396  
4397 4397          /*
4398 4398           * Stop all lwps since that's what normally happens as part of fork().
4399 4399           * This needs to happen before we grab any locks to avoid deadlock
4400 4400           * (another lwp in the process could be waiting for the held lock).
4401 4401           */
4402 4402          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4403 4403                  zone_free(zone);
4404 4404                  if (rctls)
4405 4405                          nvlist_free(rctls);
4406 4406                  return (zone_create_error(error, 0, extended_error));
4407 4407          }
4408 4408  
4409 4409          if (block_mounts(zone) == 0) {
4410 4410                  mutex_enter(&pp->p_lock);
4411 4411                  if (curthread != pp->p_agenttp)
4412 4412                          continuelwps(pp);
4413 4413                  mutex_exit(&pp->p_lock);
4414 4414                  zone_free(zone);
4415 4415                  if (rctls)
4416 4416                          nvlist_free(rctls);
4417 4417                  return (zone_create_error(error, 0, extended_error));
4418 4418          }
4419 4419  
4420 4420          /*
4421 4421           * Set up credential for kernel access.  After this, any errors
4422 4422           * should go through the dance in errout rather than calling
4423 4423           * zone_free directly.
4424 4424           */
4425 4425          zone->zone_kcred = crdup(kcred);
4426 4426          crsetzone(zone->zone_kcred, zone);
4427 4427          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4428 4428          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4429 4429          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4430 4430          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4431 4431  
4432 4432          mutex_enter(&zonehash_lock);
4433 4433          /*
4434 4434           * Make sure zone doesn't already exist.
4435 4435           *
4436 4436           * If the system and zone are labeled,
4437 4437           * make sure no other zone exists that has the same label.
4438 4438           */
4439 4439          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4440 4440              (insert_label_hash &&
4441 4441              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4442 4442                  zone_status_t status;
4443 4443  
4444 4444                  status = zone_status_get(ztmp);
4445 4445                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4446 4446                          error = EEXIST;
4447 4447                  else
4448 4448                          error = EBUSY;
4449 4449  
4450 4450                  if (insert_label_hash)
4451 4451                          error2 = ZE_LABELINUSE;
4452 4452  
4453 4453                  goto errout;
4454 4454          }
4455 4455  
4456 4456          /*
4457 4457           * Don't allow zone creations which would cause one zone's rootpath to
4458 4458           * be accessible from that of another (non-global) zone.
4459 4459           */
4460 4460          if (zone_is_nested(zone->zone_rootpath)) {
4461 4461                  error = EBUSY;
4462 4462                  goto errout;
4463 4463          }
4464 4464  
4465 4465          ASSERT(zonecount != 0);         /* check for leaks */
4466 4466          if (zonecount + 1 > maxzones) {
4467 4467                  error = ENOMEM;
4468 4468                  goto errout;
4469 4469          }
4470 4470  
4471 4471          if (zone_mount_count(zone->zone_rootpath) != 0) {
4472 4472                  error = EBUSY;
4473 4473                  error2 = ZE_AREMOUNTS;
4474 4474                  goto errout;
4475 4475          }
4476 4476  
4477 4477          /*
4478 4478           * Zone is still incomplete, but we need to drop all locks while
4479 4479           * zsched() initializes this zone's kernel process.  We
4480 4480           * optimistically add the zone to the hashtable and associated
4481 4481           * lists so a parallel zone_create() doesn't try to create the
4482 4482           * same zone.
4483 4483           */
4484 4484          zonecount++;
4485 4485          (void) mod_hash_insert(zonehashbyid,
4486 4486              (mod_hash_key_t)(uintptr_t)zone->zone_id,
4487 4487              (mod_hash_val_t)(uintptr_t)zone);
4488 4488          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4489 4489          (void) strcpy(str, zone->zone_name);
4490 4490          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4491 4491              (mod_hash_val_t)(uintptr_t)zone);
4492 4492          if (insert_label_hash) {
4493 4493                  (void) mod_hash_insert(zonehashbylabel,
4494 4494                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4495 4495                  zone->zone_flags |= ZF_HASHED_LABEL;
4496 4496          }
4497 4497  
4498 4498          /*
4499 4499           * Insert into active list.  At this point there are no 'hold's
4500 4500           * on the zone, but everyone else knows not to use it, so we can
4501 4501           * continue to use it.  zsched() will do a zone_hold() if the
4502 4502           * newproc() is successful.
4503 4503           */
4504 4504          list_insert_tail(&zone_active, zone);
4505 4505          mutex_exit(&zonehash_lock);
4506 4506  
4507 4507          zarg.zone = zone;
4508 4508          zarg.nvlist = rctls;
4509 4509          /*
4510 4510           * The process, task, and project rctls are probably wrong;
4511 4511           * we need an interface to get the default values of all rctls,
4512 4512           * and initialize zsched appropriately.  I'm not sure that that
4513 4513           * makes much of a difference, though.
4514 4514           */
4515 4515          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4516 4516          if (error != 0) {
4517 4517                  /*
4518 4518                   * We need to undo all globally visible state.
4519 4519                   */
4520 4520                  mutex_enter(&zonehash_lock);
4521 4521                  list_remove(&zone_active, zone);
4522 4522                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4523 4523                          ASSERT(zone->zone_slabel != NULL);
4524 4524                          (void) mod_hash_destroy(zonehashbylabel,
4525 4525                              (mod_hash_key_t)zone->zone_slabel);
4526 4526                  }
4527 4527                  (void) mod_hash_destroy(zonehashbyname,
4528 4528                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
4529 4529                  (void) mod_hash_destroy(zonehashbyid,
4530 4530                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
4531 4531                  ASSERT(zonecount > 1);
4532 4532                  zonecount--;
4533 4533                  goto errout;
4534 4534          }
4535 4535  
4536 4536          /*
4537 4537           * Zone creation can't fail from now on.
4538 4538           */
4539 4539  
4540 4540          /*
4541 4541           * Create zone kstats
4542 4542           */
4543 4543          zone_kstat_create(zone);
4544 4544  
4545 4545          /*
4546 4546           * Let the other lwps continue.
4547 4547           */
4548 4548          mutex_enter(&pp->p_lock);
4549 4549          if (curthread != pp->p_agenttp)
4550 4550                  continuelwps(pp);
4551 4551          mutex_exit(&pp->p_lock);
4552 4552  
4553 4553          /*
4554 4554           * Wait for zsched to finish initializing the zone.
4555 4555           */
4556 4556          zone_status_wait(zone, ZONE_IS_READY);
4557 4557          /*
4558 4558           * The zone is fully visible, so we can let mounts progress.
4559 4559           */
4560 4560          resume_mounts(zone);
4561 4561          if (rctls)
4562 4562                  nvlist_free(rctls);
4563 4563  
4564 4564          return (zoneid);
4565 4565  
4566 4566  errout:
4567 4567          mutex_exit(&zonehash_lock);
4568 4568          /*
4569 4569           * Let the other lwps continue.
4570 4570           */
4571 4571          mutex_enter(&pp->p_lock);
4572 4572          if (curthread != pp->p_agenttp)
4573 4573                  continuelwps(pp);
4574 4574          mutex_exit(&pp->p_lock);
4575 4575  
4576 4576          resume_mounts(zone);
4577 4577          if (rctls)
4578 4578                  nvlist_free(rctls);
4579 4579          /*
4580 4580           * There is currently one reference to the zone, a cred_ref from
4581 4581           * zone_kcred.  To free the zone, we call crfree, which will call
4582 4582           * zone_cred_rele, which will call zone_free.
4583 4583           */
4584 4584          ASSERT(zone->zone_cred_ref == 1);
4585 4585          ASSERT(zone->zone_kcred->cr_ref == 1);
4586 4586          ASSERT(zone->zone_ref == 0);
4587 4587          zkcr = zone->zone_kcred;
4588 4588          zone->zone_kcred = NULL;
4589 4589          crfree(zkcr);                           /* triggers call to zone_free */
4590 4590          return (zone_create_error(error, error2, extended_error));
4591 4591  }
4592 4592  
4593 4593  /*
4594 4594   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4595 4595   * the heavy lifting.  initname is the path to the program to launch
4596 4596   * at the "top" of the zone; if this is NULL, we use the system default,
4597 4597   * which is stored at zone_default_initname.
4598 4598   */
4599 4599  static int
4600 4600  zone_boot(zoneid_t zoneid)
4601 4601  {
4602 4602          int err;
4603 4603          zone_t *zone;
4604 4604  
4605 4605          if (secpolicy_zone_config(CRED()) != 0)
4606 4606                  return (set_errno(EPERM));
4607 4607          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4608 4608                  return (set_errno(EINVAL));
4609 4609  
4610 4610          mutex_enter(&zonehash_lock);
4611 4611          /*
4612 4612           * Look for zone under hash lock to prevent races with calls to
4613 4613           * zone_shutdown, zone_destroy, etc.
4614 4614           */
4615 4615          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4616 4616                  mutex_exit(&zonehash_lock);
4617 4617                  return (set_errno(EINVAL));
4618 4618          }
4619 4619  
4620 4620          mutex_enter(&zone_status_lock);
4621 4621          if (zone_status_get(zone) != ZONE_IS_READY) {
4622 4622                  mutex_exit(&zone_status_lock);
4623 4623                  mutex_exit(&zonehash_lock);
4624 4624                  return (set_errno(EINVAL));
4625 4625          }
4626 4626          zone_status_set(zone, ZONE_IS_BOOTING);
4627 4627          mutex_exit(&zone_status_lock);
4628 4628  
4629 4629          zone_hold(zone);        /* so we can use the zone_t later */
4630 4630          mutex_exit(&zonehash_lock);
4631 4631  
4632 4632          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4633 4633                  zone_rele(zone);
4634 4634                  return (set_errno(EINTR));
4635 4635          }
4636 4636  
4637 4637          /*
4638 4638           * Boot (starting init) might have failed, in which case the zone
4639 4639           * will go to the SHUTTING_DOWN state; an appropriate errno will
4640 4640           * be placed in zone->zone_boot_err, and so we return that.
4641 4641           */
4642 4642          err = zone->zone_boot_err;
4643 4643          zone_rele(zone);
4644 4644          return (err ? set_errno(err) : 0);
4645 4645  }
4646 4646  
4647 4647  /*
4648 4648   * Kills all user processes in the zone, waiting for them all to exit
4649 4649   * before returning.
4650 4650   */
4651 4651  static int
4652 4652  zone_empty(zone_t *zone)
4653 4653  {
4654 4654          int waitstatus;
4655 4655  
4656 4656          /*
4657 4657           * We need to drop zonehash_lock before killing all
4658 4658           * processes, otherwise we'll deadlock with zone_find_*
4659 4659           * which can be called from the exit path.
4660 4660           */
4661 4661          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4662 4662          while ((waitstatus = zone_status_timedwait_sig(zone,
4663 4663              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4664 4664                  killall(zone->zone_id);
4665 4665          }
4666 4666          /*
4667 4667           * return EINTR if we were signaled
4668 4668           */
4669 4669          if (waitstatus == 0)
4670 4670                  return (EINTR);
4671 4671          return (0);
4672 4672  }
4673 4673  
4674 4674  /*
4675 4675   * This function implements the policy for zone visibility.
4676 4676   *
4677 4677   * In standard Solaris, a non-global zone can only see itself.
4678 4678   *
4679 4679   * In Trusted Extensions, a labeled zone can lookup any zone whose label
4680 4680   * it dominates. For this test, the label of the global zone is treated as
4681 4681   * admin_high so it is special-cased instead of being checked for dominance.
4682 4682   *
4683 4683   * Returns true if zone attributes are viewable, false otherwise.
4684 4684   */
4685 4685  static boolean_t
4686 4686  zone_list_access(zone_t *zone)
4687 4687  {
4688 4688  
4689 4689          if (curproc->p_zone == global_zone ||
4690 4690              curproc->p_zone == zone) {
4691 4691                  return (B_TRUE);
4692 4692          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4693 4693                  bslabel_t *curproc_label;
4694 4694                  bslabel_t *zone_label;
4695 4695  
4696 4696                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4697 4697                  zone_label = label2bslabel(zone->zone_slabel);
4698 4698  
4699 4699                  if (zone->zone_id != GLOBAL_ZONEID &&
4700 4700                      bldominates(curproc_label, zone_label)) {
4701 4701                          return (B_TRUE);
4702 4702                  } else {
4703 4703                          return (B_FALSE);
4704 4704                  }
4705 4705          } else {
4706 4706                  return (B_FALSE);
4707 4707          }
4708 4708  }
4709 4709  
4710 4710  /*
4711 4711   * Systemcall to start the zone's halt sequence.  By the time this
4712 4712   * function successfully returns, all user processes and kernel threads
4713 4713   * executing in it will have exited, ZSD shutdown callbacks executed,
4714 4714   * and the zone status set to ZONE_IS_DOWN.
4715 4715   *
4716 4716   * It is possible that the call will interrupt itself if the caller is the
4717 4717   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4718 4718   */
4719 4719  static int
4720 4720  zone_shutdown(zoneid_t zoneid)
4721 4721  {
4722 4722          int error;
4723 4723          zone_t *zone;
4724 4724          zone_status_t status;
4725 4725  
4726 4726          if (secpolicy_zone_config(CRED()) != 0)
4727 4727                  return (set_errno(EPERM));
4728 4728          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4729 4729                  return (set_errno(EINVAL));
4730 4730  
4731 4731          mutex_enter(&zonehash_lock);
4732 4732          /*
4733 4733           * Look for zone under hash lock to prevent races with other
4734 4734           * calls to zone_shutdown and zone_destroy.
4735 4735           */
4736 4736          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4737 4737                  mutex_exit(&zonehash_lock);
4738 4738                  return (set_errno(EINVAL));
4739 4739          }
4740 4740  
4741 4741          /*
4742 4742           * We have to drop zonehash_lock before calling block_mounts.
4743 4743           * Hold the zone so we can continue to use the zone_t.
4744 4744           */
4745 4745          zone_hold(zone);
4746 4746          mutex_exit(&zonehash_lock);
4747 4747  
4748 4748          /*
4749 4749           * Block mounts so that VFS_MOUNT() can get an accurate view of
4750 4750           * the zone's status with regards to ZONE_IS_SHUTTING down.
4751 4751           *
4752 4752           * e.g. NFS can fail the mount if it determines that the zone
4753 4753           * has already begun the shutdown sequence.
4754 4754           *
4755 4755           */
4756 4756          if (block_mounts(zone) == 0) {
4757 4757                  zone_rele(zone);
4758 4758                  return (set_errno(EINTR));
4759 4759          }
4760 4760  
4761 4761          mutex_enter(&zonehash_lock);
4762 4762          mutex_enter(&zone_status_lock);
4763 4763          status = zone_status_get(zone);
4764 4764          /*
4765 4765           * Fail if the zone isn't fully initialized yet.
4766 4766           */
4767 4767          if (status < ZONE_IS_READY) {
4768 4768                  mutex_exit(&zone_status_lock);
4769 4769                  mutex_exit(&zonehash_lock);
4770 4770                  resume_mounts(zone);
4771 4771                  zone_rele(zone);
4772 4772                  return (set_errno(EINVAL));
4773 4773          }
4774 4774          /*
4775 4775           * If conditions required for zone_shutdown() to return have been met,
4776 4776           * return success.
4777 4777           */
4778 4778          if (status >= ZONE_IS_DOWN) {
4779 4779                  mutex_exit(&zone_status_lock);
4780 4780                  mutex_exit(&zonehash_lock);
4781 4781                  resume_mounts(zone);
4782 4782                  zone_rele(zone);
4783 4783                  return (0);
4784 4784          }
4785 4785          /*
4786 4786           * If zone_shutdown() hasn't been called before, go through the motions.
4787 4787           * If it has, there's nothing to do but wait for the kernel threads to
4788 4788           * drain.
4789 4789           */
4790 4790          if (status < ZONE_IS_EMPTY) {
4791 4791                  uint_t ntasks;
4792 4792  
4793 4793                  mutex_enter(&zone->zone_lock);
4794 4794                  if ((ntasks = zone->zone_ntasks) != 1) {
4795 4795                          /*
4796 4796                           * There's still stuff running.
4797 4797                           */
4798 4798                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4799 4799                  }
4800 4800                  mutex_exit(&zone->zone_lock);
4801 4801                  if (ntasks == 1) {
4802 4802                          /*
4803 4803                           * The only way to create another task is through
4804 4804                           * zone_enter(), which will block until we drop
4805 4805                           * zonehash_lock.  The zone is empty.
4806 4806                           */
4807 4807                          if (zone->zone_kthreads == NULL) {
4808 4808                                  /*
4809 4809                                   * Skip ahead to ZONE_IS_DOWN
4810 4810                                   */
4811 4811                                  zone_status_set(zone, ZONE_IS_DOWN);
4812 4812                          } else {
4813 4813                                  zone_status_set(zone, ZONE_IS_EMPTY);
4814 4814                          }
4815 4815                  }
4816 4816          }
4817 4817          mutex_exit(&zone_status_lock);
4818 4818          mutex_exit(&zonehash_lock);
4819 4819          resume_mounts(zone);
4820 4820  
4821 4821          if (error = zone_empty(zone)) {
4822 4822                  zone_rele(zone);
4823 4823                  return (set_errno(error));
4824 4824          }
4825 4825          /*
4826 4826           * After the zone status goes to ZONE_IS_DOWN this zone will no
4827 4827           * longer be notified of changes to the pools configuration, so
4828 4828           * in order to not end up with a stale pool pointer, we point
4829 4829           * ourselves at the default pool and remove all resource
4830 4830           * visibility.  This is especially important as the zone_t may
4831 4831           * languish on the deathrow for a very long time waiting for
4832 4832           * cred's to drain out.
4833 4833           *
4834 4834           * This rebinding of the zone can happen multiple times
4835 4835           * (presumably due to interrupted or parallel systemcalls)
4836 4836           * without any adverse effects.
4837 4837           */
4838 4838          if (pool_lock_intr() != 0) {
4839 4839                  zone_rele(zone);
4840 4840                  return (set_errno(EINTR));
4841 4841          }
4842 4842          if (pool_state == POOL_ENABLED) {
4843 4843                  mutex_enter(&cpu_lock);
4844 4844                  zone_pool_set(zone, pool_default);
4845 4845                  /*
4846 4846                   * The zone no longer needs to be able to see any cpus.
4847 4847                   */
4848 4848                  zone_pset_set(zone, ZONE_PS_INVAL);
4849 4849                  mutex_exit(&cpu_lock);
4850 4850          }
4851 4851          pool_unlock();
4852 4852  
4853 4853          /*
4854 4854           * ZSD shutdown callbacks can be executed multiple times, hence
4855 4855           * it is safe to not be holding any locks across this call.
4856 4856           */
4857 4857          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4858 4858  
4859 4859          mutex_enter(&zone_status_lock);
4860 4860          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4861 4861                  zone_status_set(zone, ZONE_IS_DOWN);
4862 4862          mutex_exit(&zone_status_lock);
4863 4863  
4864 4864          /*
4865 4865           * Wait for kernel threads to drain.
4866 4866           */
4867 4867          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4868 4868                  zone_rele(zone);
4869 4869                  return (set_errno(EINTR));
4870 4870          }
4871 4871  
4872 4872          /*
4873 4873           * Zone can be become down/destroyable even if the above wait
4874 4874           * returns EINTR, so any code added here may never execute.
4875 4875           * (i.e. don't add code here)
4876 4876           */
4877 4877  
4878 4878          zone_rele(zone);
4879 4879          return (0);
4880 4880  }
4881 4881  
4882 4882  /*
4883 4883   * Log the specified zone's reference counts.  The caller should not be
4884 4884   * holding the zone's zone_lock.
4885 4885   */
4886 4886  static void
4887 4887  zone_log_refcounts(zone_t *zone)
4888 4888  {
4889 4889          char *buffer;
4890 4890          char *buffer_position;
4891 4891          uint32_t buffer_size;
4892 4892          uint32_t index;
4893 4893          uint_t ref;
4894 4894          uint_t cred_ref;
4895 4895  
4896 4896          /*
4897 4897           * Construct a string representing the subsystem-specific reference
4898 4898           * counts.  The counts are printed in ascending order by index into the
4899 4899           * zone_t::zone_subsys_ref array.  The list will be surrounded by
4900 4900           * square brackets [] and will only contain nonzero reference counts.
4901 4901           *
4902 4902           * The buffer will hold two square bracket characters plus ten digits,
4903 4903           * one colon, one space, one comma, and some characters for a
4904 4904           * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4905 4905           * bit integers have at most ten decimal digits.)  The last
4906 4906           * reference count's comma is replaced by the closing square
4907 4907           * bracket and a NULL character to terminate the string.
4908 4908           *
4909 4909           * NOTE: We have to grab the zone's zone_lock to create a consistent
4910 4910           * snapshot of the zone's reference counters.
4911 4911           *
4912 4912           * First, figure out how much space the string buffer will need.
4913 4913           * The buffer's size is stored in buffer_size.
4914 4914           */
4915 4915          buffer_size = 2;                        /* for the square brackets */
4916 4916          mutex_enter(&zone->zone_lock);
4917 4917          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4918 4918          ref = zone->zone_ref;
4919 4919          cred_ref = zone->zone_cred_ref;
4920 4920          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4921 4921                  if (zone->zone_subsys_ref[index] != 0)
4922 4922                          buffer_size += strlen(zone_ref_subsys_names[index]) +
4923 4923                              13;
4924 4924          if (buffer_size == 2) {
4925 4925                  /*
4926 4926                   * No subsystems had nonzero reference counts.  Don't bother
4927 4927                   * with allocating a buffer; just log the general-purpose and
4928 4928                   * credential reference counts.
4929 4929                   */
4930 4930                  mutex_exit(&zone->zone_lock);
4931 4931                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4932 4932                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
4933 4933                      "references and %u credential references are still extant",
4934 4934                      zone->zone_name, zone->zone_id, ref, cred_ref);
4935 4935                  return;
4936 4936          }
4937 4937  
4938 4938          /*
4939 4939           * buffer_size contains the exact number of characters that the
4940 4940           * buffer will need.  Allocate the buffer and fill it with nonzero
4941 4941           * subsystem-specific reference counts.  Surround the results with
4942 4942           * square brackets afterwards.
4943 4943           */
4944 4944          buffer = kmem_alloc(buffer_size, KM_SLEEP);
4945 4945          buffer_position = &buffer[1];
4946 4946          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
4947 4947                  /*
4948 4948                   * NOTE: The DDI's version of sprintf() returns a pointer to
4949 4949                   * the modified buffer rather than the number of bytes written
4950 4950                   * (as in snprintf(3C)).  This is unfortunate and annoying.
4951 4951                   * Therefore, we'll use snprintf() with INT_MAX to get the
4952 4952                   * number of bytes written.  Using INT_MAX is safe because
4953 4953                   * the buffer is perfectly sized for the data: we'll never
4954 4954                   * overrun the buffer.
4955 4955                   */
4956 4956                  if (zone->zone_subsys_ref[index] != 0)
4957 4957                          buffer_position += snprintf(buffer_position, INT_MAX,
4958 4958                              "%s: %u,", zone_ref_subsys_names[index],
4959 4959                              zone->zone_subsys_ref[index]);
4960 4960          }
4961 4961          mutex_exit(&zone->zone_lock);
4962 4962          buffer[0] = '[';
4963 4963          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
4964 4964          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
4965 4965          buffer_position[-1] = ']';
4966 4966  
4967 4967          /*
4968 4968           * Log the reference counts and free the message buffer.
4969 4969           */
4970 4970          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4971 4971              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4972 4972              "%u credential references are still extant %s", zone->zone_name,
4973 4973              zone->zone_id, ref, cred_ref, buffer);
4974 4974          kmem_free(buffer, buffer_size);
4975 4975  }
4976 4976  
4977 4977  /*
4978 4978   * Systemcall entry point to finalize the zone halt process.  The caller
4979 4979   * must have already successfully called zone_shutdown().
4980 4980   *
4981 4981   * Upon successful completion, the zone will have been fully destroyed:
4982 4982   * zsched will have exited, destructor callbacks executed, and the zone
4983 4983   * removed from the list of active zones.
            *
            * Returns 0 on success, including the case where the zone is found to
            * have gone away while this thread was waiting.  Otherwise returns the
            * result of set_errno() with one of:
            *
            *   EPERM   secpolicy_zone_config() rejected the caller's credentials.
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID] or no zone
            *           with that id was found.
            *   EBUSY   zone_mount_count() is non-zero for the zone's root path, or
            *           the zone's status is still below ZONE_IS_DOWN.
            *   EINTR   a signal interrupted the wait on zone_destroy_cv.
4984 4984   */
4985 4985  static int
4986 4986  zone_destroy(zoneid_t zoneid)
4987 4987  {
4988 4988          uint64_t uniqid;
4989 4989          zone_t *zone;
4990 4990          zone_status_t status;
4991 4991          clock_t wait_time;
4992 4992          boolean_t log_refcounts;
4993 4993  
4994 4994          if (secpolicy_zone_config(CRED()) != 0)
4995 4995                  return (set_errno(EPERM));
4996 4996          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4997 4997                  return (set_errno(EINVAL));
4998 4998  
4999 4999          mutex_enter(&zonehash_lock);
5000 5000          /*
5001 5001           * Look for zone under hash lock to prevent races with other
5002 5002           * calls to zone_destroy.
5003 5003           */
5004 5004          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5005 5005                  mutex_exit(&zonehash_lock);
5006 5006                  return (set_errno(EINVAL));
5007 5007          }
5008 5008  
5009 5009          if (zone_mount_count(zone->zone_rootpath) != 0) {
5010 5010                  mutex_exit(&zonehash_lock);
5011 5011                  return (set_errno(EBUSY));
5012 5012          }
5013 5013          mutex_enter(&zone_status_lock);
5014 5014          status = zone_status_get(zone);
5015 5015          if (status < ZONE_IS_DOWN) {
5016 5016                  mutex_exit(&zone_status_lock);
5017 5017                  mutex_exit(&zonehash_lock);
5018 5018                  return (set_errno(EBUSY));
5019 5019          } else if (status == ZONE_IS_DOWN) {
5020 5020                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5021 5021          }
5022 5022          mutex_exit(&zone_status_lock);
                   /*
                    * Take a hold before dropping zonehash_lock so that the zone_t
                    * stays referenced while we wait for zsched and run the ZSD
                    * destructors below; the hold is dropped by zone_rele() once
                    * zone_uniqid has been saved.
                    */
5023 5023          zone_hold(zone);
5024 5024          mutex_exit(&zonehash_lock);
5025 5025  
5026 5026          /*
5027 5027           * wait for zsched to exit
5028 5028           */
5029 5029          zone_status_wait(zone, ZONE_IS_DEAD);
5030 5030          zone_zsd_callbacks(zone, ZSD_DESTROY);
5031 5031          zone->zone_netstack = NULL;
                   /*
                    * Save the unique id: the loop below looks the zone up again by
                    * zoneid and compares zone_uniqid to detect that the lookup no
                    * longer yields this same zone.
                    */
5032 5032          uniqid = zone->zone_uniqid;
5033 5033          zone_rele(zone);
5034 5034          zone = NULL;    /* potentially free'd */
5035 5035  
5036 5036          log_refcounts = B_FALSE;
5037 5037          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5038 5038          mutex_enter(&zonehash_lock);
5039 5039          for (; /* ever */; ) {
5040 5040                  boolean_t unref;
5041 5041                  boolean_t refs_have_been_logged;
5042 5042  
5043 5043                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5044 5044                      zone->zone_uniqid != uniqid) {
5045 5045                          /*
5046 5046                           * The zone has gone away.  Necessary conditions
5047 5047                           * are met, so we return success.
5048 5048                           */
5049 5049                          mutex_exit(&zonehash_lock);
5050 5050                          return (0);
5051 5051                  }
5052 5052                  mutex_enter(&zone->zone_lock);
5053 5053                  unref = ZONE_IS_UNREF(zone);
5054 5054                  refs_have_been_logged = (zone->zone_flags &
5055 5055                      ZF_REFCOUNTS_LOGGED);
5056 5056                  mutex_exit(&zone->zone_lock);
5057 5057                  if (unref) {
5058 5058                          /*
5059 5059                           * There is only one reference to the zone -- that
5060 5060                           * added when the zone was added to the hashtables --
5061 5061                           * and things will remain this way until we drop
5062 5062                           * zonehash_lock... we can go ahead and cleanup the
5063 5063                           * zone.
5064 5064                           */
5065 5065                          break;
5066 5066                  }
5067 5067  
5068 5068                  /*
5069 5069                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5070 5070                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5071 5071                   * some zone's general-purpose reference count reaches one.
5072 5072                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5073 5073                   * on zone_destroy_cv, then log the zone's reference counts and
5074 5074                   * continue to wait for zone_rele() and zone_cred_rele().
5075 5075                   */
5076 5076                  if (!refs_have_been_logged) {
5077 5077                          if (!log_refcounts) {
5078 5078                                  /*
5079 5079                                   * This thread hasn't timed out waiting on
5080 5080                                   * zone_destroy_cv yet.  Wait wait_time clock
5081 5081                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5082 5082                                   * seconds) for the zone's references to clear.
5083 5083                                   */
5084 5084                                  ASSERT(wait_time > 0);
5085 5085                                  wait_time = cv_reltimedwait_sig(
5086 5086                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5087 5087                                      TR_SEC);
5088 5088                                  if (wait_time > 0) {
5089 5089                                          /*
5090 5090                                           * A thread in zone_rele() or
5091 5091                                           * zone_cred_rele() signaled
5092 5092                                           * zone_destroy_cv before this thread's
5093 5093                                           * wait timed out.  The zone might have
5094 5094                                           * only one reference left; find out!
5095 5095                                           */
5096 5096                                          continue;
5097 5097                                  } else if (wait_time == 0) {
5098 5098                                          /* The thread's process was signaled. */
5099 5099                                          mutex_exit(&zonehash_lock);
5100 5100                                          return (set_errno(EINTR));
5101 5101                                  }
5102 5102  
5103 5103                                  /*
5104 5104                                   * The thread timed out while waiting on
5105 5105                                   * zone_destroy_cv.  Even though the thread
5106 5106                                   * timed out, it has to check whether another
5107 5107                                   * thread woke up from zone_destroy_cv and
5108 5108                                   * destroyed the zone.
5109 5109                                   *
5110 5110                                   * If the zone still exists and has more than
5111 5111                                   * one unreleased general-purpose reference,
5112 5112                                   * then log the zone's reference counts.
5113 5113                                   */
5114 5114                                  log_refcounts = B_TRUE;
5115 5115                                  continue;
5116 5116                          }
5117 5117  
5118 5118                          /*
5119 5119                           * The thread already timed out on zone_destroy_cv while
5120 5120                           * waiting for subsystems to release the zone's last
5121 5121                           * general-purpose references.  Log the zone's reference
5122 5122                           * counts and wait indefinitely on zone_destroy_cv.
5123 5123                           */
5124 5124                          zone_log_refcounts(zone);
5125 5125                  }
5126 5126                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5127 5127                          /* The thread's process was signaled. */
5128 5128                          mutex_exit(&zonehash_lock);
5129 5129                          return (set_errno(EINTR));
5130 5130                  }
5131 5131          }
5132 5132  
                   /*
                    * The loop above is left only through the "unref" break, so
                    * zonehash_lock is still held here.  It is dropped further down,
                    * after the zone has been removed from the active list and the
                    * hash tables.
                    */
5133 5133          /*
5134 5134           * Remove CPU cap for this zone now since we're not going to
5135 5135           * fail below this point.
5136 5136           */
5137 5137          cpucaps_zone_remove(zone);
5138 5138  
5139 5139          /* Get rid of the zone's kstats */
5140 5140          zone_kstat_delete(zone);
5141 5141  
5142 5142          /* remove the pfexecd doors */
5143 5143          if (zone->zone_pfexecd != NULL) {
5144 5144                  klpd_freelist(&zone->zone_pfexecd);
5145 5145                  zone->zone_pfexecd = NULL;
5146 5146          }
5147 5147  
5148 5148          /* free brand specific data */
5149 5149          if (ZONE_IS_BRANDED(zone))
5150 5150                  ZBROP(zone)->b_free_brand_data(zone);
5151 5151  
5152 5152          /* Say goodbye to brand framework. */
5153 5153          brand_unregister_zone(zone->zone_brand);
5154 5154  
5155 5155          /*
5156 5156           * It is now safe to let the zone be recreated; remove it from the
5157 5157           * lists.  The memory will not be freed until the last cred
5158 5158           * reference goes away.
5159 5159           */
5160 5160          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5161 5161          zonecount--;
5162 5162          /* remove from active list and hash tables */
5163 5163          list_remove(&zone_active, zone);
5164 5164          (void) mod_hash_destroy(zonehashbyname,
5165 5165              (mod_hash_key_t)zone->zone_name);
5166 5166          (void) mod_hash_destroy(zonehashbyid,
5167 5167              (mod_hash_key_t)(uintptr_t)zone->zone_id);
                   /* The label hash entry exists only if ZF_HASHED_LABEL is set. */
5168 5168          if (zone->zone_flags & ZF_HASHED_LABEL)
5169 5169                  (void) mod_hash_destroy(zonehashbylabel,
5170 5170                      (mod_hash_key_t)zone->zone_slabel);
5171 5171          mutex_exit(&zonehash_lock);
5172 5172  
5173 5173          /*
5174 5174           * Release the root vnode; we're not using it anymore.  Nor should any
5175 5175           * other thread that might access it exist.
5176 5176           */
5177 5177          if (zone->zone_rootvp != NULL) {
5178 5178                  VN_RELE(zone->zone_rootvp);
5179 5179                  zone->zone_rootvp = NULL;
5180 5180          }
5181 5181  
5182 5182          /* add to deathrow list */
5183 5183          mutex_enter(&zone_deathrow_lock);
5184 5184          list_insert_tail(&zone_deathrow, zone);
5185 5185          mutex_exit(&zone_deathrow_lock);
5186 5186  
5187 5187          /*
5188 5188           * Drop last reference (which was added by zsched()), this will
5189 5189           * free the zone unless there are outstanding cred references.
5190 5190           */
5191 5191          zone_rele(zone);
5192 5192          return (0);
5193 5193  }
5194 5194  
5195 5195  /*
5196 5196   * Systemcall entry point for zone_getattr(2).
5197 5197   */
5198 5198  static ssize_t
5199 5199  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5200 5200  {
5201 5201          size_t size;
5202 5202          int error = 0, err;
5203 5203          zone_t *zone;
5204 5204          char *zonepath;
5205 5205          char *outstr;
5206 5206          zone_status_t zone_status;
5207 5207          pid_t initpid;
5208 5208          boolean_t global = (curzone == global_zone);
5209 5209          boolean_t inzone = (curzone->zone_id == zoneid);
5210 5210          ushort_t flags;
5211 5211          zone_net_data_t *zbuf;
5212 5212  
5213 5213          mutex_enter(&zonehash_lock);
5214 5214          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5215 5215                  mutex_exit(&zonehash_lock);
5216 5216                  return (set_errno(EINVAL));
5217 5217          }
5218 5218          zone_status = zone_status_get(zone);
5219 5219          if (zone_status < ZONE_IS_INITIALIZED) {
5220 5220                  mutex_exit(&zonehash_lock);
5221 5221                  return (set_errno(EINVAL));
5222 5222          }
5223 5223          zone_hold(zone);
5224 5224          mutex_exit(&zonehash_lock);
5225 5225  
5226 5226          /*
5227 5227           * If not in the global zone, don't show information about other zones,
5228 5228           * unless the system is labeled and the local zone's label dominates
5229 5229           * the other zone.
5230 5230           */
5231 5231          if (!zone_list_access(zone)) {
5232 5232                  zone_rele(zone);
5233 5233                  return (set_errno(EINVAL));
5234 5234          }
5235 5235  
5236 5236          switch (attr) {
5237 5237          case ZONE_ATTR_ROOT:
5238 5238                  if (global) {
5239 5239                          /*
5240 5240                           * Copy the path to trim the trailing "/" (except for
5241 5241                           * the global zone).
5242 5242                           */
5243 5243                          if (zone != global_zone)
5244 5244                                  size = zone->zone_rootpathlen - 1;
5245 5245                          else
5246 5246                                  size = zone->zone_rootpathlen;
5247 5247                          zonepath = kmem_alloc(size, KM_SLEEP);
5248 5248                          bcopy(zone->zone_rootpath, zonepath, size);
5249 5249                          zonepath[size - 1] = '\0';
5250 5250                  } else {
5251 5251                          if (inzone || !is_system_labeled()) {
5252 5252                                  /*
5253 5253                                   * Caller is not in the global zone.
5254 5254                                   * if the query is on the current zone
5255 5255                                   * or the system is not labeled,
5256 5256                                   * just return faked-up path for current zone.
5257 5257                                   */
5258 5258                                  zonepath = "/";
5259 5259                                  size = 2;
5260 5260                          } else {
5261 5261                                  /*
5262 5262                                   * Return related path for current zone.
5263 5263                                   */
5264 5264                                  int prefix_len = strlen(zone_prefix);
5265 5265                                  int zname_len = strlen(zone->zone_name);
5266 5266  
5267 5267                                  size = prefix_len + zname_len + 1;
5268 5268                                  zonepath = kmem_alloc(size, KM_SLEEP);
5269 5269                                  bcopy(zone_prefix, zonepath, prefix_len);
5270 5270                                  bcopy(zone->zone_name, zonepath +
5271 5271                                      prefix_len, zname_len);
5272 5272                                  zonepath[size - 1] = '\0';
5273 5273                          }
5274 5274                  }
5275 5275                  if (bufsize > size)
5276 5276                          bufsize = size;
5277 5277                  if (buf != NULL) {
5278 5278                          err = copyoutstr(zonepath, buf, bufsize, NULL);
5279 5279                          if (err != 0 && err != ENAMETOOLONG)
5280 5280                                  error = EFAULT;
5281 5281                  }
5282 5282                  if (global || (is_system_labeled() && !inzone))
5283 5283                          kmem_free(zonepath, size);
5284 5284                  break;
5285 5285  
5286 5286          case ZONE_ATTR_NAME:
5287 5287                  size = strlen(zone->zone_name) + 1;
5288 5288                  if (bufsize > size)
5289 5289                          bufsize = size;
5290 5290                  if (buf != NULL) {
5291 5291                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5292 5292                          if (err != 0 && err != ENAMETOOLONG)
5293 5293                                  error = EFAULT;
5294 5294                  }
5295 5295                  break;
5296 5296  
5297 5297          case ZONE_ATTR_STATUS:
5298 5298                  /*
5299 5299                   * Since we're not holding zonehash_lock, the zone status
5300 5300                   * may be anything; leave it up to userland to sort it out.
5301 5301                   */
5302 5302                  size = sizeof (zone_status);
5303 5303                  if (bufsize > size)
5304 5304                          bufsize = size;
5305 5305                  zone_status = zone_status_get(zone);
5306 5306                  if (buf != NULL &&
5307 5307                      copyout(&zone_status, buf, bufsize) != 0)
5308 5308                          error = EFAULT;
5309 5309                  break;
5310 5310          case ZONE_ATTR_FLAGS:
5311 5311                  size = sizeof (zone->zone_flags);
5312 5312                  if (bufsize > size)
5313 5313                          bufsize = size;
5314 5314                  flags = zone->zone_flags;
5315 5315                  if (buf != NULL &&
5316 5316                      copyout(&flags, buf, bufsize) != 0)
5317 5317                          error = EFAULT;
5318 5318                  break;
5319 5319          case ZONE_ATTR_PRIVSET:
5320 5320                  size = sizeof (priv_set_t);
5321 5321                  if (bufsize > size)
5322 5322                          bufsize = size;
5323 5323                  if (buf != NULL &&
5324 5324                      copyout(zone->zone_privset, buf, bufsize) != 0)
5325 5325                          error = EFAULT;
5326 5326                  break;
5327 5327          case ZONE_ATTR_UNIQID:
5328 5328                  size = sizeof (zone->zone_uniqid);
5329 5329                  if (bufsize > size)
5330 5330                          bufsize = size;
5331 5331                  if (buf != NULL &&
5332 5332                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5333 5333                          error = EFAULT;
5334 5334                  break;
5335 5335          case ZONE_ATTR_POOLID:
5336 5336                  {
5337 5337                          pool_t *pool;
5338 5338                          poolid_t poolid;
5339 5339  
5340 5340                          if (pool_lock_intr() != 0) {
5341 5341                                  error = EINTR;
5342 5342                                  break;
5343 5343                          }
5344 5344                          pool = zone_pool_get(zone);
5345 5345                          poolid = pool->pool_id;
5346 5346                          pool_unlock();
5347 5347                          size = sizeof (poolid);
5348 5348                          if (bufsize > size)
5349 5349                                  bufsize = size;
5350 5350                          if (buf != NULL && copyout(&poolid, buf, size) != 0)
5351 5351                                  error = EFAULT;
5352 5352                  }
5353 5353                  break;
5354 5354          case ZONE_ATTR_SLBL:
5355 5355                  size = sizeof (bslabel_t);
5356 5356                  if (bufsize > size)
5357 5357                          bufsize = size;
5358 5358                  if (zone->zone_slabel == NULL)
5359 5359                          error = EINVAL;
5360 5360                  else if (buf != NULL &&
5361 5361                      copyout(label2bslabel(zone->zone_slabel), buf,
5362 5362                      bufsize) != 0)
5363 5363                          error = EFAULT;
5364 5364                  break;
5365 5365          case ZONE_ATTR_INITPID:
5366 5366                  size = sizeof (initpid);
5367 5367                  if (bufsize > size)
5368 5368                          bufsize = size;
5369 5369                  initpid = zone->zone_proc_initpid;
5370 5370                  if (initpid == -1) {
5371 5371                          error = ESRCH;
5372 5372                          break;
5373 5373                  }
5374 5374                  if (buf != NULL &&
5375 5375                      copyout(&initpid, buf, bufsize) != 0)
5376 5376                          error = EFAULT;
5377 5377                  break;
5378 5378          case ZONE_ATTR_BRAND:
5379 5379                  size = strlen(zone->zone_brand->b_name) + 1;
5380 5380  
5381 5381                  if (bufsize > size)
5382 5382                          bufsize = size;
5383 5383                  if (buf != NULL) {
5384 5384                          err = copyoutstr(zone->zone_brand->b_name, buf,
5385 5385                              bufsize, NULL);
5386 5386                          if (err != 0 && err != ENAMETOOLONG)
5387 5387                                  error = EFAULT;
5388 5388                  }
5389 5389                  break;
5390 5390          case ZONE_ATTR_INITNAME:
5391 5391                  size = strlen(zone->zone_initname) + 1;
5392 5392                  if (bufsize > size)
5393 5393                          bufsize = size;
5394 5394                  if (buf != NULL) {
5395 5395                          err = copyoutstr(zone->zone_initname, buf, bufsize,
5396 5396                              NULL);
5397 5397                          if (err != 0 && err != ENAMETOOLONG)
5398 5398                                  error = EFAULT;
5399 5399                  }
5400 5400                  break;
5401 5401          case ZONE_ATTR_BOOTARGS:
5402 5402                  if (zone->zone_bootargs == NULL)
5403 5403                          outstr = "";
5404 5404                  else
5405 5405                          outstr = zone->zone_bootargs;
5406 5406                  size = strlen(outstr) + 1;
5407 5407                  if (bufsize > size)
5408 5408                          bufsize = size;
5409 5409                  if (buf != NULL) {
5410 5410                          err = copyoutstr(outstr, buf, bufsize, NULL);
5411 5411                          if (err != 0 && err != ENAMETOOLONG)
5412 5412                                  error = EFAULT;
5413 5413                  }
5414 5414                  break;
5415 5415          case ZONE_ATTR_PHYS_MCAP:
5416 5416                  size = sizeof (zone->zone_phys_mcap);
5417 5417                  if (bufsize > size)
5418 5418                          bufsize = size;
5419 5419                  if (buf != NULL &&
5420 5420                      copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5421 5421                          error = EFAULT;
5422 5422                  break;
5423 5423          case ZONE_ATTR_SCHED_CLASS:
5424 5424                  mutex_enter(&class_lock);
5425 5425  
5426 5426                  if (zone->zone_defaultcid >= loaded_classes)
5427 5427                          outstr = "";
5428 5428                  else
5429 5429                          outstr = sclass[zone->zone_defaultcid].cl_name;
5430 5430                  size = strlen(outstr) + 1;
5431 5431                  if (bufsize > size)
5432 5432                          bufsize = size;
5433 5433                  if (buf != NULL) {
5434 5434                          err = copyoutstr(outstr, buf, bufsize, NULL);
5435 5435                          if (err != 0 && err != ENAMETOOLONG)
5436 5436                                  error = EFAULT;
5437 5437                  }
5438 5438  
5439 5439                  mutex_exit(&class_lock);
5440 5440                  break;
5441 5441          case ZONE_ATTR_HOSTID:
5442 5442                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
5443 5443                      bufsize == sizeof (zone->zone_hostid)) {
5444 5444                          size = sizeof (zone->zone_hostid);
5445 5445                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
5446 5446                              bufsize) != 0)
5447 5447                                  error = EFAULT;
5448 5448                  } else {
5449 5449                          error = EINVAL;
5450 5450                  }
5451 5451                  break;
5452 5452          case ZONE_ATTR_FS_ALLOWED:
5453 5453                  if (zone->zone_fs_allowed == NULL)
5454 5454                          outstr = "";
5455 5455                  else
5456 5456                          outstr = zone->zone_fs_allowed;
5457 5457                  size = strlen(outstr) + 1;
5458 5458                  if (bufsize > size)
5459 5459                          bufsize = size;
5460 5460                  if (buf != NULL) {
5461 5461                          err = copyoutstr(outstr, buf, bufsize, NULL);
5462 5462                          if (err != 0 && err != ENAMETOOLONG)
5463 5463                                  error = EFAULT;
5464 5464                  }
5465 5465                  break;
5466 5466          case ZONE_ATTR_NETWORK:
5467 5467                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5468 5468                  if (copyin(buf, zbuf, bufsize) != 0) {
5469 5469                          error = EFAULT;
5470 5470                  } else {
5471 5471                          error = zone_get_network(zoneid, zbuf);
5472 5472                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5473 5473                                  error = EFAULT;
5474 5474                  }
5475 5475                  kmem_free(zbuf, bufsize);
5476 5476                  break;
5477 5477          default:
5478 5478                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5479 5479                          size = bufsize;
5480 5480                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5481 5481                  } else {
5482 5482                          error = EINVAL;
5483 5483                  }
5484 5484          }
5485 5485          zone_rele(zone);
5486 5486  
5487 5487          if (error)
5488 5488                  return (set_errno(error));
5489 5489          return ((ssize_t)size);
5490 5490  }
5491 5491  
5492 5492  /*
5493 5493   * Systemcall entry point for zone_setattr(2).
5494 5494   */
5495 5495  /*ARGSUSED*/
5496 5496  static int
5497 5497  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5498 5498  {
5499 5499          zone_t *zone;
5500 5500          zone_status_t zone_status;
5501 5501          int err = -1;
5502 5502          zone_net_data_t *zbuf;
5503 5503  
5504 5504          if (secpolicy_zone_config(CRED()) != 0)
5505 5505                  return (set_errno(EPERM));
5506 5506  
5507 5507          /*
5508 5508           * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5509 5509           * global zone.
5510 5510           */
5511 5511          if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5512 5512                  return (set_errno(EINVAL));
5513 5513          }
5514 5514  
5515 5515          mutex_enter(&zonehash_lock);
5516 5516          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5517 5517                  mutex_exit(&zonehash_lock);
5518 5518                  return (set_errno(EINVAL));
5519 5519          }
5520 5520          zone_hold(zone);
5521 5521          mutex_exit(&zonehash_lock);
5522 5522  
5523 5523          /*
5524 5524           * At present most attributes can only be set on non-running,
5525 5525           * non-global zones.
5526 5526           */
5527 5527          zone_status = zone_status_get(zone);
5528 5528          if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5529 5529                  err = EINVAL;
5530 5530                  goto done;
5531 5531          }
5532 5532  
5533 5533          switch (attr) {
5534 5534          case ZONE_ATTR_INITNAME:
5535 5535                  err = zone_set_initname(zone, (const char *)buf);
5536 5536                  break;
5537 5537          case ZONE_ATTR_INITNORESTART:
5538 5538                  zone->zone_restart_init = B_FALSE;
5539 5539                  err = 0;
5540 5540                  break;
5541 5541          case ZONE_ATTR_BOOTARGS:
5542 5542                  err = zone_set_bootargs(zone, (const char *)buf);
5543 5543                  break;
5544 5544          case ZONE_ATTR_BRAND:
5545 5545                  err = zone_set_brand(zone, (const char *)buf);
5546 5546                  break;
5547 5547          case ZONE_ATTR_FS_ALLOWED:
5548 5548                  err = zone_set_fs_allowed(zone, (const char *)buf);
5549 5549                  break;
5550 5550          case ZONE_ATTR_PHYS_MCAP:
5551 5551                  err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5552 5552                  break;
5553 5553          case ZONE_ATTR_SCHED_CLASS:
5554 5554                  err = zone_set_sched_class(zone, (const char *)buf);
5555 5555                  break;
5556 5556          case ZONE_ATTR_HOSTID:
5557 5557                  if (bufsize == sizeof (zone->zone_hostid)) {
5558 5558                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5559 5559                                  err = 0;
5560 5560                          else
5561 5561                                  err = EFAULT;
5562 5562                  } else {
5563 5563                          err = EINVAL;
5564 5564                  }
5565 5565                  break;
5566 5566          case ZONE_ATTR_NETWORK:
5567 5567                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5568 5568                          err = EINVAL;
5569 5569                          break;
5570 5570                  }
5571 5571                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5572 5572                  if (copyin(buf, zbuf, bufsize) != 0) {
5573 5573                          kmem_free(zbuf, bufsize);
5574 5574                          err = EFAULT;
5575 5575                          break;
5576 5576                  }
5577 5577                  err = zone_set_network(zoneid, zbuf);
5578 5578                  kmem_free(zbuf, bufsize);
5579 5579                  break;
5580 5580          default:
5581 5581                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5582 5582                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5583 5583                  else
5584 5584                          err = EINVAL;
5585 5585          }
5586 5586  
5587 5587  done:
5588 5588          zone_rele(zone);
5589 5589          ASSERT(err != -1);
5590 5590          return (err != 0 ? set_errno(err) : 0);
5591 5591  }
5592 5592  
5593 5593  /*
5594 5594   * Return zero if the process has at least one vnode mapped in to its
5595 5595   * address space which shouldn't be allowed to change zones.
5596 5596   *
5597 5597   * Also return zero if the process has any shared mappings which reserve
5598 5598   * swap.  This is because the counting for zone.max-swap does not allow swap
5599 5599   * reservation to be shared between zones.  zone swap reservation is counted
5600 5600   * on zone->zone_max_swap.
            *
            * Return 1 if no such mapping is found.  Every segment of curproc's
            * address space is examined with the address space lock held as
            * reader; the walk stops at the first disqualifying segment.
5601 5601   */
5602 5602  static int
5603 5603  as_can_change_zones(void)
5604 5604  {
5605 5605          proc_t *pp = curproc;
5606 5606          struct seg *seg;
5607 5607          struct as *as = pp->p_as;
5608 5608          vnode_t *vp;
5609 5609          int allow = 1;
5610 5610  
5611 5611          ASSERT(pp->p_as != &kas);
5612 5612          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
5613 5613          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5614 5614  
5615 5615                  /*
5616 5616                   * Cannot enter zone with shared anon memory which
5617 5617                   * reserves swap.  See comment above.
5618 5618                   */
5619 5619                  if (seg_can_change_zones(seg) == B_FALSE) {
5620 5620                          allow = 0;
5621 5621                          break;
5622 5622                  }
5623 5623                  /*
5624 5624                   * if we can't get a backing vnode for this segment then skip
5625 5625                   * it.
5626 5626                   */
5627 5627                  vp = NULL;
5628      -                if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
     5628 +                if (segop_getvp(seg, seg->s_base, &vp) != 0 || vp == NULL)
5629 5629                          continue;
5630 5630                  if (!vn_can_change_zones(vp)) { /* bail on first match */
5631 5631                          allow = 0;
5632 5632                          break;
5633 5633                  }
5634 5634          }
5635 5635          AS_LOCK_EXIT(as, &as->a_lock);
5636 5636          return (allow);
5637 5637  }
5638 5638

5639 5639  /*
5640 5640   * Count swap reserved by curproc's address space
5641 5641   */
5642 5642  static size_t
5643 5643  as_swresv(void)
5644 5644  {
5645 5645          proc_t *pp = curproc;
5646 5646          struct seg *seg;
5647 5647          struct as *as = pp->p_as;
5648 5648          size_t swap = 0;
5649 5649  
5650 5650          ASSERT(pp->p_as != &kas);
5651 5651          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
5652 5652          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5653 5653                  swap += seg_swresv(seg);
5654 5654  
5655 5655          return (swap);
5656 5656  }
5657 5657  
5658 5658  /*
5659 5659   * Systemcall entry point for zone_enter().
5660 5660   *
5661 5661   * The current process is injected into said zone.  In the process
5662 5662   * it will change its project membership, privileges, rootdir/cwd,
5663 5663   * zone-wide rctls, and pool association to match those of the zone.
5664 5664   *
5665 5665   * The first zone_enter() called while the zone is in the ZONE_IS_READY
5666 5666   * state will transition it to ZONE_IS_RUNNING.  Processes may only
5667 5667   * enter a zone that is "ready" or "running".
5668 5668   */
5669 5669  static int
5670 5670  zone_enter(zoneid_t zoneid)
5671 5671  {
5672 5672          zone_t *zone;
5673 5673          vnode_t *vp;
5674 5674          proc_t *pp = curproc;
5675 5675          contract_t *ct;
5676 5676          cont_process_t *ctp;
5677 5677          task_t *tk, *oldtk;
5678 5678          kproject_t *zone_proj0;
5679 5679          cred_t *cr, *newcr;
5680 5680          pool_t *oldpool, *newpool;
5681 5681          sess_t *sp;
5682 5682          uid_t uid;
5683 5683          zone_status_t status;
5684 5684          int err = 0;
5685 5685          rctl_entity_p_t e;
5686 5686          size_t swap;
5687 5687          kthread_id_t t;
5688 5688  
5689 5689          if (secpolicy_zone_config(CRED()) != 0)
5690 5690                  return (set_errno(EPERM));
5691 5691          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5692 5692                  return (set_errno(EINVAL));
5693 5693  
5694 5694          /*
5695 5695           * Stop all lwps so we don't need to hold a lock to look at
5696 5696           * curproc->p_zone.  This needs to happen before we grab any
5697 5697           * locks to avoid deadlock (another lwp in the process could
5698 5698           * be waiting for the held lock).
5699 5699           */
5700 5700          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5701 5701                  return (set_errno(EINTR));
5702 5702  
5703 5703          /*
5704 5704           * Make sure we're not changing zones with files open or mapped in
5705 5705           * to our address space which shouldn't be changing zones.
5706 5706           */
5707 5707          if (!files_can_change_zones()) {
5708 5708                  err = EBADF;
5709 5709                  goto out;
5710 5710          }
5711 5711          if (!as_can_change_zones()) {
5712 5712                  err = EFAULT;
5713 5713                  goto out;
5714 5714          }
5715 5715  
5716 5716          mutex_enter(&zonehash_lock);
5717 5717          if (pp->p_zone != global_zone) {
5718 5718                  mutex_exit(&zonehash_lock);
5719 5719                  err = EINVAL;
5720 5720                  goto out;
5721 5721          }
5722 5722  
5723 5723          zone = zone_find_all_by_id(zoneid);
5724 5724          if (zone == NULL) {
5725 5725                  mutex_exit(&zonehash_lock);
5726 5726                  err = EINVAL;
5727 5727                  goto out;
5728 5728          }
5729 5729  
5730 5730          /*
5731 5731           * To prevent processes in a zone from holding contracts on
5732 5732           * extrazonal resources, and to avoid process contract
5733 5733           * memberships which span zones, contract holders and processes
5734 5734           * which aren't the sole members of their encapsulating process
5735 5735           * contracts are not allowed to zone_enter.
5736 5736           */
5737 5737          ctp = pp->p_ct_process;
5738 5738          ct = &ctp->conp_contract;
5739 5739          mutex_enter(&ct->ct_lock);
5740 5740          mutex_enter(&pp->p_lock);
5741 5741          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5742 5742                  mutex_exit(&pp->p_lock);
5743 5743                  mutex_exit(&ct->ct_lock);
5744 5744                  mutex_exit(&zonehash_lock);
5745 5745                  err = EINVAL;
5746 5746                  goto out;
5747 5747          }
5748 5748  
5749 5749          /*
5750 5750           * Moreover, we don't allow processes whose encapsulating
5751 5751           * process contracts have inherited extrazonal contracts.
5752 5752           * While it would be easier to eliminate all process contracts
5753 5753           * with inherited contracts, we need to be able to give a
5754 5754           * restarted init (or other zone-penetrating process) its
5755 5755           * predecessor's contracts.
5756 5756           */
5757 5757          if (ctp->conp_ninherited != 0) {
5758 5758                  contract_t *next;
5759 5759                  for (next = list_head(&ctp->conp_inherited); next;
5760 5760                      next = list_next(&ctp->conp_inherited, next)) {
5761 5761                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
5762 5762                                  mutex_exit(&pp->p_lock);
5763 5763                                  mutex_exit(&ct->ct_lock);
5764 5764                                  mutex_exit(&zonehash_lock);
5765 5765                                  err = EINVAL;
5766 5766                                  goto out;
5767 5767                          }
5768 5768                  }
5769 5769          }
5770 5770  
5771 5771          mutex_exit(&pp->p_lock);
5772 5772          mutex_exit(&ct->ct_lock);
5773 5773  
5774 5774          status = zone_status_get(zone);
5775 5775          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5776 5776                  /*
5777 5777                   * Can't join
5778 5778                   */
5779 5779                  mutex_exit(&zonehash_lock);
5780 5780                  err = EINVAL;
5781 5781                  goto out;
5782 5782          }
5783 5783  
5784 5784          /*
5785 5785           * Make sure new priv set is within the permitted set for caller
5786 5786           */
5787 5787          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5788 5788                  mutex_exit(&zonehash_lock);
5789 5789                  err = EPERM;
5790 5790                  goto out;
5791 5791          }
5792 5792          /*
5793 5793           * We want to momentarily drop zonehash_lock while we optimistically
5794 5794           * bind curproc to the pool it should be running in.  This is safe
5795 5795           * since the zone can't disappear (we have a hold on it).
5796 5796           */
5797 5797          zone_hold(zone);
5798 5798          mutex_exit(&zonehash_lock);
5799 5799  
5800 5800          /*
5801 5801           * Grab pool_lock to keep the pools configuration from changing
5802 5802           * and to stop ourselves from getting rebound to another pool
5803 5803           * until we join the zone.
5804 5804           */
5805 5805          if (pool_lock_intr() != 0) {
5806 5806                  zone_rele(zone);
5807 5807                  err = EINTR;
5808 5808                  goto out;
5809 5809          }
5810 5810          ASSERT(secpolicy_pool(CRED()) == 0);
5811 5811          /*
5812 5812           * Bind ourselves to the pool currently associated with the zone.
5813 5813           */
5814 5814          oldpool = curproc->p_pool;
5815 5815          newpool = zone_pool_get(zone);
5816 5816          if (pool_state == POOL_ENABLED && newpool != oldpool &&
5817 5817              (err = pool_do_bind(newpool, P_PID, P_MYID,
5818 5818              POOL_BIND_ALL)) != 0) {
5819 5819                  pool_unlock();
5820 5820                  zone_rele(zone);
5821 5821                  goto out;
5822 5822          }
5823 5823  
5824 5824          /*
5825 5825           * Grab cpu_lock now; we'll need it later when we call
5826 5826           * task_join().
5827 5827           */
5828 5828          mutex_enter(&cpu_lock);
5829 5829          mutex_enter(&zonehash_lock);
5830 5830          /*
5831 5831           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5832 5832           */
5833 5833          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5834 5834                  /*
5835 5835                   * Can't join anymore.
5836 5836                   */
5837 5837                  mutex_exit(&zonehash_lock);
5838 5838                  mutex_exit(&cpu_lock);
5839 5839                  if (pool_state == POOL_ENABLED &&
5840 5840                      newpool != oldpool)
5841 5841                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
5842 5842                              POOL_BIND_ALL);
5843 5843                  pool_unlock();
5844 5844                  zone_rele(zone);
5845 5845                  err = EINVAL;
5846 5846                  goto out;
5847 5847          }
5848 5848  
5849 5849          /*
5850 5850           * a_lock must be held while transfering locked memory and swap
5851 5851           * reservation from the global zone to the non global zone because
5852 5852           * asynchronous faults on the processes' address space can lock
5853 5853           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5854 5854           * segments respectively.
5855 5855           */
5856 5856          AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
5857 5857          swap = as_swresv();
5858 5858          mutex_enter(&pp->p_lock);
5859 5859          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5860 5860          /* verify that we do not exceed and task or lwp limits */
5861 5861          mutex_enter(&zone->zone_nlwps_lock);
5862 5862          /* add new lwps to zone and zone's proj0 */
5863 5863          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5864 5864          zone->zone_nlwps += pp->p_lwpcnt;
5865 5865          /* add 1 task to zone's proj0 */
5866 5866          zone_proj0->kpj_ntasks += 1;
5867 5867  
5868 5868          zone_proj0->kpj_nprocs++;
5869 5869          zone->zone_nprocs++;
5870 5870          mutex_exit(&zone->zone_nlwps_lock);
5871 5871  
5872 5872          mutex_enter(&zone->zone_mem_lock);
5873 5873          zone->zone_locked_mem += pp->p_locked_mem;
5874 5874          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5875 5875          zone->zone_max_swap += swap;
5876 5876          mutex_exit(&zone->zone_mem_lock);
5877 5877  
5878 5878          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5879 5879          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5880 5880          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5881 5881  
5882 5882          /* remove lwps and process from proc's old zone and old project */
5883 5883          mutex_enter(&pp->p_zone->zone_nlwps_lock);
5884 5884          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5885 5885          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5886 5886          pp->p_task->tk_proj->kpj_nprocs--;
5887 5887          pp->p_zone->zone_nprocs--;
5888 5888          mutex_exit(&pp->p_zone->zone_nlwps_lock);
5889 5889  
5890 5890          mutex_enter(&pp->p_zone->zone_mem_lock);
5891 5891          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5892 5892          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5893 5893          pp->p_zone->zone_max_swap -= swap;
5894 5894          mutex_exit(&pp->p_zone->zone_mem_lock);
5895 5895  
5896 5896          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5897 5897          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5898 5898          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5899 5899  
5900 5900          pp->p_flag |= SZONETOP;
5901 5901          pp->p_zone = zone;
5902 5902          mutex_exit(&pp->p_lock);
5903 5903          AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
5904 5904  
5905 5905          /*
5906 5906           * Joining the zone cannot fail from now on.
5907 5907           *
5908 5908           * This means that a lot of the following code can be commonized and
5909 5909           * shared with zsched().
5910 5910           */
5911 5911  
5912 5912          /*
5913 5913           * If the process contract fmri was inherited, we need to
5914 5914           * flag this so that any contract status will not leak
5915 5915           * extra zone information, svc_fmri in this case
5916 5916           */
5917 5917          if (ctp->conp_svc_ctid != ct->ct_id) {
5918 5918                  mutex_enter(&ct->ct_lock);
5919 5919                  ctp->conp_svc_zone_enter = ct->ct_id;
5920 5920                  mutex_exit(&ct->ct_lock);
5921 5921          }
5922 5922  
5923 5923          /*
5924 5924           * Reset the encapsulating process contract's zone.
5925 5925           */
5926 5926          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5927 5927          contract_setzuniqid(ct, zone->zone_uniqid);
5928 5928  
5929 5929          /*
5930 5930           * Create a new task and associate the process with the project keyed
5931 5931           * by (projid,zoneid).
5932 5932           *
5933 5933           * We might as well be in project 0; the global zone's projid doesn't
5934 5934           * make much sense in a zone anyhow.
5935 5935           *
5936 5936           * This also increments zone_ntasks, and returns with p_lock held.
5937 5937           */
5938 5938          tk = task_create(0, zone);
5939 5939          oldtk = task_join(tk, 0);
5940 5940          mutex_exit(&cpu_lock);
5941 5941  
5942 5942          /*
5943 5943           * call RCTLOP_SET functions on this proc
5944 5944           */
5945 5945          e.rcep_p.zone = zone;
5946 5946          e.rcep_t = RCENTITY_ZONE;
5947 5947          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5948 5948              RCD_CALLBACK);
5949 5949          mutex_exit(&pp->p_lock);
5950 5950  
5951 5951          /*
5952 5952           * We don't need to hold any of zsched's locks here; not only do we know
5953 5953           * the process and zone aren't going away, we know its session isn't
5954 5954           * changing either.
5955 5955           *
5956 5956           * By joining zsched's session here, we mimic the behavior in the
5957 5957           * global zone of init's sid being the pid of sched.  We extend this
5958 5958           * to all zlogin-like zone_enter()'ing processes as well.
5959 5959           */
5960 5960          mutex_enter(&pidlock);
5961 5961          sp = zone->zone_zsched->p_sessp;
5962 5962          sess_hold(zone->zone_zsched);
5963 5963          mutex_enter(&pp->p_lock);
5964 5964          pgexit(pp);
5965 5965          sess_rele(pp->p_sessp, B_TRUE);
5966 5966          pp->p_sessp = sp;
5967 5967          pgjoin(pp, zone->zone_zsched->p_pidp);
5968 5968  
5969 5969          /*
5970 5970           * If any threads are scheduled to be placed on zone wait queue they
5971 5971           * should abandon the idea since the wait queue is changing.
5972 5972           * We need to be holding pidlock & p_lock to do this.
5973 5973           */
5974 5974          if ((t = pp->p_tlist) != NULL) {
5975 5975                  do {
5976 5976                          thread_lock(t);
5977 5977                          /*
5978 5978                           * Kick this thread so that he doesn't sit
5979 5979                           * on a wrong wait queue.
5980 5980                           */
5981 5981                          if (ISWAITING(t))
5982 5982                                  setrun_locked(t);
5983 5983  
5984 5984                          if (t->t_schedflag & TS_ANYWAITQ)
5985 5985                                  t->t_schedflag &= ~ TS_ANYWAITQ;
5986 5986  
5987 5987                          thread_unlock(t);
5988 5988                  } while ((t = t->t_forw) != pp->p_tlist);
5989 5989          }
5990 5990  
5991 5991          /*
5992 5992           * If there is a default scheduling class for the zone and it is not
5993 5993           * the class we are currently in, change all of the threads in the
5994 5994           * process to the new class.  We need to be holding pidlock & p_lock
5995 5995           * when we call parmsset so this is a good place to do it.
5996 5996           */
5997 5997          if (zone->zone_defaultcid > 0 &&
5998 5998              zone->zone_defaultcid != curthread->t_cid) {
5999 5999                  pcparms_t pcparms;
6000 6000  
6001 6001                  pcparms.pc_cid = zone->zone_defaultcid;
6002 6002                  pcparms.pc_clparms[0] = 0;
6003 6003  
6004 6004                  /*
6005 6005                   * If setting the class fails, we still want to enter the zone.
6006 6006                   */
6007 6007                  if ((t = pp->p_tlist) != NULL) {
6008 6008                          do {
6009 6009                                  (void) parmsset(&pcparms, t);
6010 6010                          } while ((t = t->t_forw) != pp->p_tlist);
6011 6011                  }
6012 6012          }
6013 6013  
6014 6014          mutex_exit(&pp->p_lock);
6015 6015          mutex_exit(&pidlock);
6016 6016  
6017 6017          mutex_exit(&zonehash_lock);
6018 6018          /*
6019 6019           * We're firmly in the zone; let pools progress.
6020 6020           */
6021 6021          pool_unlock();
6022 6022          task_rele(oldtk);
6023 6023          /*
6024 6024           * We don't need to retain a hold on the zone since we already
6025 6025           * incremented zone_ntasks, so the zone isn't going anywhere.
6026 6026           */
6027 6027          zone_rele(zone);
6028 6028  
6029 6029          /*
6030 6030           * Chroot
6031 6031           */
6032 6032          vp = zone->zone_rootvp;
6033 6033          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6034 6034          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6035 6035  
6036 6036          /*
6037 6037           * Change process credentials
6038 6038           */
6039 6039          newcr = cralloc();
6040 6040          mutex_enter(&pp->p_crlock);
6041 6041          cr = pp->p_cred;
6042 6042          crcopy_to(cr, newcr);
6043 6043          crsetzone(newcr, zone);
6044 6044          pp->p_cred = newcr;
6045 6045  
6046 6046          /*
6047 6047           * Restrict all process privilege sets to zone limit
6048 6048           */
6049 6049          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6050 6050          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6051 6051          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6052 6052          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6053 6053          mutex_exit(&pp->p_crlock);
6054 6054          crset(pp, newcr);
6055 6055  
6056 6056          /*
6057 6057           * Adjust upcount to reflect zone entry.
6058 6058           */
6059 6059          uid = crgetruid(newcr);
6060 6060          mutex_enter(&pidlock);
6061 6061          upcount_dec(uid, GLOBAL_ZONEID);
6062 6062          upcount_inc(uid, zoneid);
6063 6063          mutex_exit(&pidlock);
6064 6064  
6065 6065          /*
6066 6066           * Set up core file path and content.
6067 6067           */
6068 6068          set_core_defaults();
6069 6069  
6070 6070  out:
6071 6071          /*
6072 6072           * Let the other lwps continue.
6073 6073           */
6074 6074          mutex_enter(&pp->p_lock);
6075 6075          if (curthread != pp->p_agenttp)
6076 6076                  continuelwps(pp);
6077 6077          mutex_exit(&pp->p_lock);
6078 6078  
6079 6079          return (err != 0 ? set_errno(err) : 0);
6080 6080  }
6081 6081  
6082 6082  /*
6083 6083   * Systemcall entry point for zone_list(2).
6084 6084   *
6085 6085   * Processes running in a (non-global) zone only see themselves.
6086 6086   * On labeled systems, they see all zones whose label they dominate.
6087 6087   */
6088 6088  static int
6089 6089  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6090 6090  {
6091 6091          zoneid_t *zoneids;
6092 6092          zone_t *zone, *myzone;
6093 6093          uint_t user_nzones, real_nzones;
6094 6094          uint_t domi_nzones;
6095 6095          int error;
6096 6096  
6097 6097          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6098 6098                  return (set_errno(EFAULT));
6099 6099  
6100 6100          myzone = curproc->p_zone;
6101 6101          if (myzone != global_zone) {
6102 6102                  bslabel_t *mybslab;
6103 6103  
6104 6104                  if (!is_system_labeled()) {
6105 6105                          /* just return current zone */
6106 6106                          real_nzones = domi_nzones = 1;
6107 6107                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6108 6108                          zoneids[0] = myzone->zone_id;
6109 6109                  } else {
6110 6110                          /* return all zones that are dominated */
6111 6111                          mutex_enter(&zonehash_lock);
6112 6112                          real_nzones = zonecount;
6113 6113                          domi_nzones = 0;
6114 6114                          if (real_nzones > 0) {
6115 6115                                  zoneids = kmem_alloc(real_nzones *
6116 6116                                      sizeof (zoneid_t), KM_SLEEP);
6117 6117                                  mybslab = label2bslabel(myzone->zone_slabel);
6118 6118                                  for (zone = list_head(&zone_active);
6119 6119                                      zone != NULL;
6120 6120                                      zone = list_next(&zone_active, zone)) {
6121 6121                                          if (zone->zone_id == GLOBAL_ZONEID)
6122 6122                                                  continue;
6123 6123                                          if (zone != myzone &&
6124 6124                                              (zone->zone_flags & ZF_IS_SCRATCH))
6125 6125                                                  continue;
6126 6126                                          /*
6127 6127                                           * Note that a label always dominates
6128 6128                                           * itself, so myzone is always included
6129 6129                                           * in the list.
6130 6130                                           */
6131 6131                                          if (bldominates(mybslab,
6132 6132                                              label2bslabel(zone->zone_slabel))) {
6133 6133                                                  zoneids[domi_nzones++] =
6134 6134                                                      zone->zone_id;
6135 6135                                          }
6136 6136                                  }
6137 6137                          }
6138 6138                          mutex_exit(&zonehash_lock);
6139 6139                  }
6140 6140          } else {
6141 6141                  mutex_enter(&zonehash_lock);
6142 6142                  real_nzones = zonecount;
6143 6143                  domi_nzones = 0;
6144 6144                  if (real_nzones > 0) {
6145 6145                          zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6146 6146                              KM_SLEEP);
6147 6147                          for (zone = list_head(&zone_active); zone != NULL;
6148 6148                              zone = list_next(&zone_active, zone))
6149 6149                                  zoneids[domi_nzones++] = zone->zone_id;
6150 6150                          ASSERT(domi_nzones == real_nzones);
6151 6151                  }
6152 6152                  mutex_exit(&zonehash_lock);
6153 6153          }
6154 6154  
6155 6155          /*
6156 6156           * If user has allocated space for fewer entries than we found, then
6157 6157           * return only up to his limit.  Either way, tell him exactly how many
6158 6158           * we found.
6159 6159           */
6160 6160          if (domi_nzones < user_nzones)
6161 6161                  user_nzones = domi_nzones;
6162 6162          error = 0;
6163 6163          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6164 6164                  error = EFAULT;
6165 6165          } else if (zoneidlist != NULL && user_nzones != 0) {
6166 6166                  if (copyout(zoneids, zoneidlist,
6167 6167                      user_nzones * sizeof (zoneid_t)) != 0)
6168 6168                          error = EFAULT;
6169 6169          }
6170 6170  
6171 6171          if (real_nzones > 0)
6172 6172                  kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6173 6173  
6174 6174          if (error != 0)
6175 6175                  return (set_errno(error));
6176 6176          else
6177 6177                  return (0);
6178 6178  }
6179 6179  
6180 6180  /*
6181 6181   * Systemcall entry point for zone_lookup(2).
6182 6182   *
6183 6183   * Non-global zones are only able to see themselves and (on labeled systems)
6184 6184   * the zones they dominate.
6185 6185   */
6186 6186  static zoneid_t
6187 6187  zone_lookup(const char *zone_name)
6188 6188  {
6189 6189          char *kname;
6190 6190          zone_t *zone;
6191 6191          zoneid_t zoneid;
6192 6192          int err;
6193 6193  
6194 6194          if (zone_name == NULL) {
6195 6195                  /* return caller's zone id */
6196 6196                  return (getzoneid());
6197 6197          }
6198 6198  
6199 6199          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6200 6200          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6201 6201                  kmem_free(kname, ZONENAME_MAX);
6202 6202                  return (set_errno(err));
6203 6203          }
6204 6204  
6205 6205          mutex_enter(&zonehash_lock);
6206 6206          zone = zone_find_all_by_name(kname);
6207 6207          kmem_free(kname, ZONENAME_MAX);
6208 6208          /*
6209 6209           * In a non-global zone, can only lookup global and own name.
6210 6210           * In Trusted Extensions zone label dominance rules apply.
6211 6211           */
6212 6212          if (zone == NULL ||
6213 6213              zone_status_get(zone) < ZONE_IS_READY ||
6214 6214              !zone_list_access(zone)) {
6215 6215                  mutex_exit(&zonehash_lock);
6216 6216                  return (set_errno(EINVAL));
6217 6217          } else {
6218 6218                  zoneid = zone->zone_id;
6219 6219                  mutex_exit(&zonehash_lock);
6220 6220                  return (zoneid);
6221 6221          }
6222 6222  }
6223 6223  
6224 6224  static int
6225 6225  zone_version(int *version_arg)
6226 6226  {
6227 6227          int version = ZONE_SYSCALL_API_VERSION;
6228 6228  
6229 6229          if (copyout(&version, version_arg, sizeof (int)) != 0)
6230 6230                  return (set_errno(EFAULT));
6231 6231          return (0);
6232 6232  }
6233 6233  
6234 6234  /* ARGSUSED */
6235 6235  long
6236 6236  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6237 6237  {
6238 6238          zone_def zs;
6239 6239          int err;
6240 6240  
6241 6241          switch (cmd) {
6242 6242          case ZONE_CREATE:
6243 6243                  if (get_udatamodel() == DATAMODEL_NATIVE) {
6244 6244                          if (copyin(arg1, &zs, sizeof (zone_def))) {
6245 6245                                  return (set_errno(EFAULT));
6246 6246                          }
6247 6247                  } else {
6248 6248  #ifdef _SYSCALL32_IMPL
6249 6249                          zone_def32 zs32;
6250 6250  
6251 6251                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6252 6252                                  return (set_errno(EFAULT));
6253 6253                          }
6254 6254                          zs.zone_name =
6255 6255                              (const char *)(unsigned long)zs32.zone_name;
6256 6256                          zs.zone_root =
6257 6257                              (const char *)(unsigned long)zs32.zone_root;
6258 6258                          zs.zone_privs =
6259 6259                              (const struct priv_set *)
6260 6260                              (unsigned long)zs32.zone_privs;
6261 6261                          zs.zone_privssz = zs32.zone_privssz;
6262 6262                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6263 6263                          zs.rctlbufsz = zs32.rctlbufsz;
6264 6264                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6265 6265                          zs.zfsbufsz = zs32.zfsbufsz;
6266 6266                          zs.extended_error =
6267 6267                              (int *)(unsigned long)zs32.extended_error;
6268 6268                          zs.match = zs32.match;
6269 6269                          zs.doi = zs32.doi;
6270 6270                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6271 6271                          zs.flags = zs32.flags;
6272 6272  #else
6273 6273                          panic("get_udatamodel() returned bogus result\n");
6274 6274  #endif
6275 6275                  }
6276 6276  
6277 6277                  return (zone_create(zs.zone_name, zs.zone_root,
6278 6278                      zs.zone_privs, zs.zone_privssz,
6279 6279                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6280 6280                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6281 6281                      zs.extended_error, zs.match, zs.doi,
6282 6282                      zs.label, zs.flags));
6283 6283          case ZONE_BOOT:
6284 6284                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
6285 6285          case ZONE_DESTROY:
6286 6286                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6287 6287          case ZONE_GETATTR:
6288 6288                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6289 6289                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6290 6290          case ZONE_SETATTR:
6291 6291                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6292 6292                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6293 6293          case ZONE_ENTER:
6294 6294                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
6295 6295          case ZONE_LIST:
6296 6296                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6297 6297          case ZONE_SHUTDOWN:
6298 6298                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6299 6299          case ZONE_LOOKUP:
6300 6300                  return (zone_lookup((const char *)arg1));
6301 6301          case ZONE_VERSION:
6302 6302                  return (zone_version((int *)arg1));
6303 6303          case ZONE_ADD_DATALINK:
6304 6304                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6305 6305                      (datalink_id_t)(uintptr_t)arg2));
6306 6306          case ZONE_DEL_DATALINK:
6307 6307                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6308 6308                      (datalink_id_t)(uintptr_t)arg2));
6309 6309          case ZONE_CHECK_DATALINK: {
6310 6310                  zoneid_t        zoneid;
6311 6311                  boolean_t       need_copyout;
6312 6312  
6313 6313                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6314 6314                          return (EFAULT);
6315 6315                  need_copyout = (zoneid == ALL_ZONES);
6316 6316                  err = zone_check_datalink(&zoneid,
6317 6317                      (datalink_id_t)(uintptr_t)arg2);
6318 6318                  if (err == 0 && need_copyout) {
6319 6319                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6320 6320                                  err = EFAULT;
6321 6321                  }
6322 6322                  return (err == 0 ? 0 : set_errno(err));
6323 6323          }
6324 6324          case ZONE_LIST_DATALINK:
6325 6325                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6326 6326                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6327 6327          default:
6328 6328                  return (set_errno(EINVAL));
6329 6329          }
6330 6330  }
6331 6331  
6332 6332  struct zarg {
6333 6333          zone_t *zone;
6334 6334          zone_cmd_arg_t arg;
6335 6335  };
6336 6336  
6337 6337  static int
6338 6338  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6339 6339  {
6340 6340          char *buf;
6341 6341          size_t buflen;
6342 6342          int error;
6343 6343  
6344 6344          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6345 6345          buf = kmem_alloc(buflen, KM_SLEEP);
6346 6346          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6347 6347          error = door_ki_open(buf, doorp);
6348 6348          kmem_free(buf, buflen);
6349 6349          return (error);
6350 6350  }
6351 6351  
6352 6352  static void
6353 6353  zone_release_door(door_handle_t *doorp)
6354 6354  {
6355 6355          door_ki_rele(*doorp);
6356 6356          *doorp = NULL;
6357 6357  }
6358 6358  
6359 6359  static void
6360 6360  zone_ki_call_zoneadmd(struct zarg *zargp)
6361 6361  {
6362 6362          door_handle_t door = NULL;
6363 6363          door_arg_t darg, save_arg;
6364 6364          char *zone_name;
6365 6365          size_t zone_namelen;
6366 6366          zoneid_t zoneid;
6367 6367          zone_t *zone;
6368 6368          zone_cmd_arg_t arg;
6369 6369          uint64_t uniqid;
6370 6370          size_t size;
6371 6371          int error;
6372 6372          int retry;
6373 6373  
6374 6374          zone = zargp->zone;
6375 6375          arg = zargp->arg;
6376 6376          kmem_free(zargp, sizeof (*zargp));
6377 6377  
6378 6378          zone_namelen = strlen(zone->zone_name) + 1;
6379 6379          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6380 6380          bcopy(zone->zone_name, zone_name, zone_namelen);
6381 6381          zoneid = zone->zone_id;
6382 6382          uniqid = zone->zone_uniqid;
6383 6383          /*
6384 6384           * zoneadmd may be down, but at least we can empty out the zone.
6385 6385           * We can ignore the return value of zone_empty() since we're called
6386 6386           * from a kernel thread and know we won't be delivered any signals.
6387 6387           */
6388 6388          ASSERT(curproc == &p0);
6389 6389          (void) zone_empty(zone);
6390 6390          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6391 6391          zone_rele(zone);
6392 6392  
6393 6393          size = sizeof (arg);
6394 6394          darg.rbuf = (char *)&arg;
6395 6395          darg.data_ptr = (char *)&arg;
6396 6396          darg.rsize = size;
6397 6397          darg.data_size = size;
6398 6398          darg.desc_ptr = NULL;
6399 6399          darg.desc_num = 0;
6400 6400  
6401 6401          save_arg = darg;
6402 6402          /*
6403 6403           * Since we're not holding a reference to the zone, any number of
6404 6404           * things can go wrong, including the zone disappearing before we get a
6405 6405           * chance to talk to zoneadmd.
6406 6406           */
6407 6407          for (retry = 0; /* forever */; retry++) {
6408 6408                  if (door == NULL &&
6409 6409                      (error = zone_lookup_door(zone_name, &door)) != 0) {
6410 6410                          goto next;
6411 6411                  }
6412 6412                  ASSERT(door != NULL);
6413 6413  
6414 6414                  if ((error = door_ki_upcall_limited(door, &darg, NULL,
6415 6415                      SIZE_MAX, 0)) == 0) {
6416 6416                          break;
6417 6417                  }
6418 6418                  switch (error) {
6419 6419                  case EINTR:
6420 6420                          /* FALLTHROUGH */
6421 6421                  case EAGAIN:    /* process may be forking */
6422 6422                          /*
6423 6423                           * Back off for a bit
6424 6424                           */
6425 6425                          break;
6426 6426                  case EBADF:
6427 6427                          zone_release_door(&door);
6428 6428                          if (zone_lookup_door(zone_name, &door) != 0) {
6429 6429                                  /*
6430 6430                                   * zoneadmd may be dead, but it may come back to
6431 6431                                   * life later.
6432 6432                                   */
6433 6433                                  break;
6434 6434                          }
6435 6435                          break;
6436 6436                  default:
6437 6437                          cmn_err(CE_WARN,
6438 6438                              "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6439 6439                              error);
6440 6440                          goto out;
6441 6441                  }
6442 6442  next:
6443 6443                  /*
6444 6444                   * If this isn't the same zone_t that we originally had in mind,
6445 6445                   * then this is the same as if two kadmin requests come in at
6446 6446                   * the same time: the first one wins.  This means we lose, so we
6447 6447                   * bail.
6448 6448                   */
6449 6449                  if ((zone = zone_find_by_id(zoneid)) == NULL) {
6450 6450                          /*
6451 6451                           * Problem is solved.
6452 6452                           */
6453 6453                          break;
6454 6454                  }
6455 6455                  if (zone->zone_uniqid != uniqid) {
6456 6456                          /*
6457 6457                           * zoneid recycled
6458 6458                           */
6459 6459                          zone_rele(zone);
6460 6460                          break;
6461 6461                  }
6462 6462                  /*
6463 6463                   * We could zone_status_timedwait(), but there doesn't seem to
6464 6464                   * be much point in doing that (plus, it would mean that
6465 6465                   * zone_free() isn't called until this thread exits).
6466 6466                   */
6467 6467                  zone_rele(zone);
6468 6468                  delay(hz);
6469 6469                  darg = save_arg;
6470 6470          }
6471 6471  out:
6472 6472          if (door != NULL) {
6473 6473                  zone_release_door(&door);
6474 6474          }
6475 6475          kmem_free(zone_name, zone_namelen);
6476 6476          thread_exit();
6477 6477  }
6478 6478  
6479 6479  /*
6480 6480   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6481 6481   * kadmin().  The caller is a process in the zone.
6482 6482   *
6483 6483   * In order to shutdown the zone, we will hand off control to zoneadmd
6484 6484   * (running in the global zone) via a door.  We do a half-hearted job at
6485 6485   * killing all processes in the zone, create a kernel thread to contact
6486 6486   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6487 6487   * a form of generation number used to let zoneadmd (as well as
6488 6488   * zone_destroy()) know exactly which zone they're re talking about.
6489 6489   */
6490 6490  int
6491 6491  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6492 6492  {
6493 6493          struct zarg *zargp;
6494 6494          zone_cmd_t zcmd;
6495 6495          zone_t *zone;
6496 6496  
6497 6497          zone = curproc->p_zone;
6498 6498          ASSERT(getzoneid() != GLOBAL_ZONEID);
6499 6499  
6500 6500          switch (cmd) {
6501 6501          case A_SHUTDOWN:
6502 6502                  switch (fcn) {
6503 6503                  case AD_HALT:
6504 6504                  case AD_POWEROFF:
6505 6505                          zcmd = Z_HALT;
6506 6506                          break;
6507 6507                  case AD_BOOT:
6508 6508                          zcmd = Z_REBOOT;
6509 6509                          break;
6510 6510                  case AD_IBOOT:
6511 6511                  case AD_SBOOT:
6512 6512                  case AD_SIBOOT:
6513 6513                  case AD_NOSYNC:
6514 6514                          return (ENOTSUP);
6515 6515                  default:
6516 6516                          return (EINVAL);
6517 6517                  }
6518 6518                  break;
6519 6519          case A_REBOOT:
6520 6520                  zcmd = Z_REBOOT;
6521 6521                  break;
6522 6522          case A_FTRACE:
6523 6523          case A_REMOUNT:
6524 6524          case A_FREEZE:
6525 6525          case A_DUMP:
6526 6526          case A_CONFIG:
6527 6527                  return (ENOTSUP);
6528 6528          default:
6529 6529                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6530 6530                  return (EINVAL);
6531 6531          }
6532 6532  
6533 6533          if (secpolicy_zone_admin(credp, B_FALSE))
6534 6534                  return (EPERM);
6535 6535          mutex_enter(&zone_status_lock);
6536 6536  
6537 6537          /*
6538 6538           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6539 6539           * is in the zone.
6540 6540           */
6541 6541          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6542 6542          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6543 6543                  /*
6544 6544                   * This zone is already on its way down.
6545 6545                   */
6546 6546                  mutex_exit(&zone_status_lock);
6547 6547                  return (0);
6548 6548          }
6549 6549          /*
6550 6550           * Prevent future zone_enter()s
6551 6551           */
6552 6552          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6553 6553          mutex_exit(&zone_status_lock);
6554 6554  
6555 6555          /*
6556 6556           * Kill everyone now and call zoneadmd later.
6557 6557           * zone_ki_call_zoneadmd() will do a more thorough job of this
6558 6558           * later.
6559 6559           */
6560 6560          killall(zone->zone_id);
6561 6561          /*
6562 6562           * Now, create the thread to contact zoneadmd and do the rest of the
6563 6563           * work.  This thread can't be created in our zone otherwise
6564 6564           * zone_destroy() would deadlock.
6565 6565           */
6566 6566          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6567 6567          zargp->arg.cmd = zcmd;
6568 6568          zargp->arg.uniqid = zone->zone_uniqid;
6569 6569          zargp->zone = zone;
6570 6570          (void) strcpy(zargp->arg.locale, "C");
6571 6571          /* mdep was already copied in for us by uadmin */
6572 6572          if (mdep != NULL)
6573 6573                  (void) strlcpy(zargp->arg.bootbuf, mdep,
6574 6574                      sizeof (zargp->arg.bootbuf));
6575 6575          zone_hold(zone);
6576 6576  
6577 6577          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6578 6578              TS_RUN, minclsyspri);
6579 6579          exit(CLD_EXITED, 0);
6580 6580  
6581 6581          return (EINVAL);
6582 6582  }
6583 6583  
6584 6584  /*
6585 6585   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6586 6586   * status to ZONE_IS_SHUTTING_DOWN.
6587 6587   *
6588 6588   * This function also shuts down all running zones to ensure that they won't
6589 6589   * fork new processes.
6590 6590   */
6591 6591  void
6592 6592  zone_shutdown_global(void)
6593 6593  {
6594 6594          zone_t *current_zonep;
6595 6595  
6596 6596          ASSERT(INGLOBALZONE(curproc));
6597 6597          mutex_enter(&zonehash_lock);
6598 6598          mutex_enter(&zone_status_lock);
6599 6599  
6600 6600          /* Modify the global zone's status first. */
6601 6601          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6602 6602          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6603 6603  
6604 6604          /*
6605 6605           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6606 6606           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6607 6607           * could cause assertions to fail (e.g., assertions about a zone's
6608 6608           * state during initialization, readying, or booting) or produce races.
6609 6609           * We'll let threads continue to initialize and ready new zones: they'll
6610 6610           * fail to boot the new zones when they see that the global zone is
6611 6611           * shutting down.
6612 6612           */
6613 6613          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6614 6614              current_zonep = list_next(&zone_active, current_zonep)) {
6615 6615                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6616 6616                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6617 6617          }
6618 6618          mutex_exit(&zone_status_lock);
6619 6619          mutex_exit(&zonehash_lock);
6620 6620  }
6621 6621  
6622 6622  /*
6623 6623   * Returns true if the named dataset is visible in the current zone.
6624 6624   * The 'write' parameter is set to 1 if the dataset is also writable.
6625 6625   */
6626 6626  int
6627 6627  zone_dataset_visible(const char *dataset, int *write)
6628 6628  {
6629 6629          static int zfstype = -1;
6630 6630          zone_dataset_t *zd;
6631 6631          size_t len;
6632 6632          zone_t *zone = curproc->p_zone;
6633 6633          const char *name = NULL;
6634 6634          vfs_t *vfsp = NULL;
6635 6635  
6636 6636          if (dataset[0] == '\0')
6637 6637                  return (0);
6638 6638  
6639 6639          /*
6640 6640           * Walk the list once, looking for datasets which match exactly, or
6641 6641           * specify a dataset underneath an exported dataset.  If found, return
6642 6642           * true and note that it is writable.
6643 6643           */
6644 6644          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6645 6645              zd = list_next(&zone->zone_datasets, zd)) {
6646 6646  
6647 6647                  len = strlen(zd->zd_dataset);
6648 6648                  if (strlen(dataset) >= len &&
6649 6649                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6650 6650                      (dataset[len] == '\0' || dataset[len] == '/' ||
6651 6651                      dataset[len] == '@')) {
6652 6652                          if (write)
6653 6653                                  *write = 1;
6654 6654                          return (1);
6655 6655                  }
6656 6656          }
6657 6657  
6658 6658          /*
6659 6659           * Walk the list a second time, searching for datasets which are parents
6660 6660           * of exported datasets.  These should be visible, but read-only.
6661 6661           *
6662 6662           * Note that we also have to support forms such as 'pool/dataset/', with
6663 6663           * a trailing slash.
6664 6664           */
6665 6665          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6666 6666              zd = list_next(&zone->zone_datasets, zd)) {
6667 6667  
6668 6668                  len = strlen(dataset);
6669 6669                  if (dataset[len - 1] == '/')
6670 6670                          len--;  /* Ignore trailing slash */
6671 6671                  if (len < strlen(zd->zd_dataset) &&
6672 6672                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6673 6673                      zd->zd_dataset[len] == '/') {
6674 6674                          if (write)
6675 6675                                  *write = 0;
6676 6676                          return (1);
6677 6677                  }
6678 6678          }
6679 6679  
6680 6680          /*
6681 6681           * We reach here if the given dataset is not found in the zone_dataset
6682 6682           * list. Check if this dataset was added as a filesystem (ie. "add fs")
6683 6683           * instead of delegation. For this we search for the dataset in the
6684 6684           * zone_vfslist of this zone. If found, return true and note that it is
6685 6685           * not writable.
6686 6686           */
6687 6687  
6688 6688          /*
6689 6689           * Initialize zfstype if it is not initialized yet.
6690 6690           */
6691 6691          if (zfstype == -1) {
6692 6692                  struct vfssw *vswp = vfs_getvfssw("zfs");
6693 6693                  zfstype = vswp - vfssw;
6694 6694                  vfs_unrefvfssw(vswp);
6695 6695          }
6696 6696  
6697 6697          vfs_list_read_lock();
6698 6698          vfsp = zone->zone_vfslist;
6699 6699          do {
6700 6700                  ASSERT(vfsp);
6701 6701                  if (vfsp->vfs_fstype == zfstype) {
6702 6702                          name = refstr_value(vfsp->vfs_resource);
6703 6703  
6704 6704                          /*
6705 6705                           * Check if we have an exact match.
6706 6706                           */
6707 6707                          if (strcmp(dataset, name) == 0) {
6708 6708                                  vfs_list_unlock();
6709 6709                                  if (write)
6710 6710                                          *write = 0;
6711 6711                                  return (1);
6712 6712                          }
6713 6713                          /*
6714 6714                           * We need to check if we are looking for parents of
6715 6715                           * a dataset. These should be visible, but read-only.
6716 6716                           */
6717 6717                          len = strlen(dataset);
6718 6718                          if (dataset[len - 1] == '/')
6719 6719                                  len--;
6720 6720  
6721 6721                          if (len < strlen(name) &&
6722 6722                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
6723 6723                                  vfs_list_unlock();
6724 6724                                  if (write)
6725 6725                                          *write = 0;
6726 6726                                  return (1);
6727 6727                          }
6728 6728                  }
6729 6729                  vfsp = vfsp->vfs_zone_next;
6730 6730          } while (vfsp != zone->zone_vfslist);
6731 6731  
6732 6732          vfs_list_unlock();
6733 6733          return (0);
6734 6734  }
6735 6735  
6736 6736  /*
6737 6737   * zone_find_by_any_path() -
6738 6738   *
6739 6739   * kernel-private routine similar to zone_find_by_path(), but which
6740 6740   * effectively compares against zone paths rather than zonerootpath
6741 6741   * (i.e., the last component of zonerootpaths, which should be "root/",
6742 6742   * are not compared.)  This is done in order to accurately identify all
6743 6743   * paths, whether zone-visible or not, including those which are parallel
6744 6744   * to /root/, such as /dev/, /home/, etc...
6745 6745   *
6746 6746   * If the specified path does not fall under any zone path then global
6747 6747   * zone is returned.
6748 6748   *
6749 6749   * The treat_abs parameter indicates whether the path should be treated as
6750 6750   * an absolute path although it does not begin with "/".  (This supports
6751 6751   * nfs mount syntax such as host:any/path.)
6752 6752   *
6753 6753   * The caller is responsible for zone_rele of the returned zone.
6754 6754   */
6755 6755  zone_t *
6756 6756  zone_find_by_any_path(const char *path, boolean_t treat_abs)
6757 6757  {
6758 6758          zone_t *zone;
6759 6759          int path_offset = 0;
6760 6760  
6761 6761          if (path == NULL) {
6762 6762                  zone_hold(global_zone);
6763 6763                  return (global_zone);
6764 6764          }
6765 6765  
6766 6766          if (*path != '/') {
6767 6767                  ASSERT(treat_abs);
6768 6768                  path_offset = 1;
6769 6769          }
6770 6770  
6771 6771          mutex_enter(&zonehash_lock);
6772 6772          for (zone = list_head(&zone_active); zone != NULL;
6773 6773              zone = list_next(&zone_active, zone)) {
6774 6774                  char    *c;
6775 6775                  size_t  pathlen;
6776 6776                  char *rootpath_start;
6777 6777  
6778 6778                  if (zone == global_zone)        /* skip global zone */
6779 6779                          continue;
6780 6780  
6781 6781                  /* scan backwards to find start of last component */
6782 6782                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6783 6783                  do {
6784 6784                          c--;
6785 6785                  } while (*c != '/');
6786 6786  
6787 6787                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
6788 6788                  rootpath_start = (zone->zone_rootpath + path_offset);
6789 6789                  if (strncmp(path, rootpath_start, pathlen) == 0)
6790 6790                          break;
6791 6791          }
6792 6792          if (zone == NULL)
6793 6793                  zone = global_zone;
6794 6794          zone_hold(zone);
6795 6795          mutex_exit(&zonehash_lock);
6796 6796          return (zone);
6797 6797  }
6798 6798  
6799 6799  /*
6800 6800   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6801 6801   * zone_dl_t pointer if found, and NULL otherwise.
6802 6802   */
6803 6803  static zone_dl_t *
6804 6804  zone_find_dl(zone_t *zone, datalink_id_t linkid)
6805 6805  {
6806 6806          zone_dl_t *zdl;
6807 6807  
6808 6808          ASSERT(mutex_owned(&zone->zone_lock));
6809 6809          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6810 6810              zdl = list_next(&zone->zone_dl_list, zdl)) {
6811 6811                  if (zdl->zdl_id == linkid)
6812 6812                          break;
6813 6813          }
6814 6814          return (zdl);
6815 6815  }
6816 6816  
6817 6817  static boolean_t
6818 6818  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6819 6819  {
6820 6820          boolean_t exists;
6821 6821  
6822 6822          mutex_enter(&zone->zone_lock);
6823 6823          exists = (zone_find_dl(zone, linkid) != NULL);
6824 6824          mutex_exit(&zone->zone_lock);
6825 6825          return (exists);
6826 6826  }
6827 6827  
6828 6828  /*
6829 6829   * Add an data link name for the zone.
6830 6830   */
6831 6831  static int
6832 6832  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6833 6833  {
6834 6834          zone_dl_t *zdl;
6835 6835          zone_t *zone;
6836 6836          zone_t *thiszone;
6837 6837  
6838 6838          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6839 6839                  return (set_errno(ENXIO));
6840 6840  
6841 6841          /* Verify that the datalink ID doesn't already belong to a zone. */
6842 6842          mutex_enter(&zonehash_lock);
6843 6843          for (zone = list_head(&zone_active); zone != NULL;
6844 6844              zone = list_next(&zone_active, zone)) {
6845 6845                  if (zone_dl_exists(zone, linkid)) {
6846 6846                          mutex_exit(&zonehash_lock);
6847 6847                          zone_rele(thiszone);
6848 6848                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6849 6849                  }
6850 6850          }
6851 6851  
6852 6852          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6853 6853          zdl->zdl_id = linkid;
6854 6854          zdl->zdl_net = NULL;
6855 6855          mutex_enter(&thiszone->zone_lock);
6856 6856          list_insert_head(&thiszone->zone_dl_list, zdl);
6857 6857          mutex_exit(&thiszone->zone_lock);
6858 6858          mutex_exit(&zonehash_lock);
6859 6859          zone_rele(thiszone);
6860 6860          return (0);
6861 6861  }
6862 6862  
6863 6863  static int
6864 6864  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6865 6865  {
6866 6866          zone_dl_t *zdl;
6867 6867          zone_t *zone;
6868 6868          int err = 0;
6869 6869  
6870 6870          if ((zone = zone_find_by_id(zoneid)) == NULL)
6871 6871                  return (set_errno(EINVAL));
6872 6872  
6873 6873          mutex_enter(&zone->zone_lock);
6874 6874          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6875 6875                  err = ENXIO;
6876 6876          } else {
6877 6877                  list_remove(&zone->zone_dl_list, zdl);
6878 6878                  if (zdl->zdl_net != NULL)
6879 6879                          nvlist_free(zdl->zdl_net);
6880 6880                  kmem_free(zdl, sizeof (zone_dl_t));
6881 6881          }
6882 6882          mutex_exit(&zone->zone_lock);
6883 6883          zone_rele(zone);
6884 6884          return (err == 0 ? 0 : set_errno(err));
6885 6885  }
6886 6886  
6887 6887  /*
6888 6888   * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6889 6889   * the linkid.  Otherwise we just check if the specified zoneidp has been
6890 6890   * assigned the supplied linkid.
6891 6891   */
6892 6892  int
6893 6893  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6894 6894  {
6895 6895          zone_t *zone;
6896 6896          int err = ENXIO;
6897 6897  
6898 6898          if (*zoneidp != ALL_ZONES) {
6899 6899                  if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6900 6900                          if (zone_dl_exists(zone, linkid))
6901 6901                                  err = 0;
6902 6902                          zone_rele(zone);
6903 6903                  }
6904 6904                  return (err);
6905 6905          }
6906 6906  
6907 6907          mutex_enter(&zonehash_lock);
6908 6908          for (zone = list_head(&zone_active); zone != NULL;
6909 6909              zone = list_next(&zone_active, zone)) {
6910 6910                  if (zone_dl_exists(zone, linkid)) {
6911 6911                          *zoneidp = zone->zone_id;
6912 6912                          err = 0;
6913 6913                          break;
6914 6914                  }
6915 6915          }
6916 6916          mutex_exit(&zonehash_lock);
6917 6917          return (err);
6918 6918  }
6919 6919  
6920 6920  /*
6921 6921   * Get the list of datalink IDs assigned to a zone.
6922 6922   *
6923 6923   * On input, *nump is the number of datalink IDs that can fit in the supplied
6924 6924   * idarray.  Upon return, *nump is either set to the number of datalink IDs
6925 6925   * that were placed in the array if the array was large enough, or to the
6926 6926   * number of datalink IDs that the function needs to place in the array if the
6927 6927   * array is too small.
6928 6928   */
6929 6929  static int
6930 6930  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6931 6931  {
6932 6932          uint_t num, dlcount;
6933 6933          zone_t *zone;
6934 6934          zone_dl_t *zdl;
6935 6935          datalink_id_t *idptr = idarray;
6936 6936  
6937 6937          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6938 6938                  return (set_errno(EFAULT));
6939 6939          if ((zone = zone_find_by_id(zoneid)) == NULL)
6940 6940                  return (set_errno(ENXIO));
6941 6941  
6942 6942          num = 0;
6943 6943          mutex_enter(&zone->zone_lock);
6944 6944          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6945 6945              zdl = list_next(&zone->zone_dl_list, zdl)) {
6946 6946                  /*
6947 6947                   * If the list is bigger than what the caller supplied, just
6948 6948                   * count, don't do copyout.
6949 6949                   */
6950 6950                  if (++num > dlcount)
6951 6951                          continue;
6952 6952                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6953 6953                          mutex_exit(&zone->zone_lock);
6954 6954                          zone_rele(zone);
6955 6955                          return (set_errno(EFAULT));
6956 6956                  }
6957 6957                  idptr++;
6958 6958          }
6959 6959          mutex_exit(&zone->zone_lock);
6960 6960          zone_rele(zone);
6961 6961  
6962 6962          /* Increased or decreased, caller should be notified. */
6963 6963          if (num != dlcount) {
6964 6964                  if (copyout(&num, nump, sizeof (num)) != 0)
6965 6965                          return (set_errno(EFAULT));
6966 6966          }
6967 6967          return (0);
6968 6968  }
6969 6969  
6970 6970  /*
6971 6971   * Public interface for looking up a zone by zoneid. It's a customized version
6972 6972   * for netstack_zone_create(). It can only be called from the zsd create
6973 6973   * callbacks, since it doesn't have reference on the zone structure hence if
6974 6974   * it is called elsewhere the zone could disappear after the zonehash_lock
6975 6975   * is dropped.
6976 6976   *
6977 6977   * Furthermore it
6978 6978   * 1. Doesn't check the status of the zone.
6979 6979   * 2. It will be called even before zone_init is called, in that case the
6980 6980   *    address of zone0 is returned directly, and netstack_zone_create()
6981 6981   *    will only assign a value to zone0.zone_netstack, won't break anything.
6982 6982   * 3. Returns without the zone being held.
6983 6983   */
6984 6984  zone_t *
6985 6985  zone_find_by_id_nolock(zoneid_t zoneid)
6986 6986  {
6987 6987          zone_t *zone;
6988 6988  
6989 6989          mutex_enter(&zonehash_lock);
6990 6990          if (zonehashbyid == NULL)
6991 6991                  zone = &zone0;
6992 6992          else
6993 6993                  zone = zone_find_all_by_id(zoneid);
6994 6994          mutex_exit(&zonehash_lock);
6995 6995          return (zone);
6996 6996  }
6997 6997  
6998 6998  /*
6999 6999   * Walk the datalinks for a given zone
7000 7000   */
7001 7001  int
7002 7002  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7003 7003      void *data)
7004 7004  {
7005 7005          zone_t          *zone;
7006 7006          zone_dl_t       *zdl;
7007 7007          datalink_id_t   *idarray;
7008 7008          uint_t          idcount = 0;
7009 7009          int             i, ret = 0;
7010 7010  
7011 7011          if ((zone = zone_find_by_id(zoneid)) == NULL)
7012 7012                  return (ENOENT);
7013 7013  
7014 7014          /*
7015 7015           * We first build an array of linkid's so that we can walk these and
7016 7016           * execute the callback with the zone_lock dropped.
7017 7017           */
7018 7018          mutex_enter(&zone->zone_lock);
7019 7019          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7020 7020              zdl = list_next(&zone->zone_dl_list, zdl)) {
7021 7021                  idcount++;
7022 7022          }
7023 7023  
7024 7024          if (idcount == 0) {
7025 7025                  mutex_exit(&zone->zone_lock);
7026 7026                  zone_rele(zone);
7027 7027                  return (0);
7028 7028          }
7029 7029  
7030 7030          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7031 7031          if (idarray == NULL) {
7032 7032                  mutex_exit(&zone->zone_lock);
7033 7033                  zone_rele(zone);
7034 7034                  return (ENOMEM);
7035 7035          }
7036 7036  
7037 7037          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7038 7038              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7039 7039                  idarray[i] = zdl->zdl_id;
7040 7040          }
7041 7041  
7042 7042          mutex_exit(&zone->zone_lock);
7043 7043  
7044 7044          for (i = 0; i < idcount && ret == 0; i++) {
7045 7045                  if ((ret = (*cb)(idarray[i], data)) != 0)
7046 7046                          break;
7047 7047          }
7048 7048  
7049 7049          zone_rele(zone);
7050 7050          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7051 7051          return (ret);
7052 7052  }
7053 7053  
7054 7054  static char *
7055 7055  zone_net_type2name(int type)
7056 7056  {
7057 7057          switch (type) {
7058 7058          case ZONE_NETWORK_ADDRESS:
7059 7059                  return (ZONE_NET_ADDRNAME);
7060 7060          case ZONE_NETWORK_DEFROUTER:
7061 7061                  return (ZONE_NET_RTRNAME);
7062 7062          default:
7063 7063                  return (NULL);
7064 7064          }
7065 7065  }
7066 7066  
7067 7067  static int
7068 7068  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7069 7069  {
7070 7070          zone_t *zone;
7071 7071          zone_dl_t *zdl;
7072 7072          nvlist_t *nvl;
7073 7073          int err = 0;
7074 7074          uint8_t *new = NULL;
7075 7075          char *nvname;
7076 7076          int bufsize;
7077 7077          datalink_id_t linkid = znbuf->zn_linkid;
7078 7078  
7079 7079          if (secpolicy_zone_config(CRED()) != 0)
7080 7080                  return (set_errno(EPERM));
7081 7081  
7082 7082          if (zoneid == GLOBAL_ZONEID)
7083 7083                  return (set_errno(EINVAL));
7084 7084  
7085 7085          nvname = zone_net_type2name(znbuf->zn_type);
7086 7086          bufsize = znbuf->zn_len;
7087 7087          new = znbuf->zn_val;
7088 7088          if (nvname == NULL)
7089 7089                  return (set_errno(EINVAL));
7090 7090  
7091 7091          if ((zone = zone_find_by_id(zoneid)) == NULL) {
7092 7092                  return (set_errno(EINVAL));
7093 7093          }
7094 7094  
7095 7095          mutex_enter(&zone->zone_lock);
7096 7096          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7097 7097                  err = ENXIO;
7098 7098                  goto done;
7099 7099          }
7100 7100          if ((nvl = zdl->zdl_net) == NULL) {
7101 7101                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7102 7102                          err = ENOMEM;
7103 7103                          goto done;
7104 7104                  } else {
7105 7105                          zdl->zdl_net = nvl;
7106 7106                  }
7107 7107          }
7108 7108          if (nvlist_exists(nvl, nvname)) {
7109 7109                  err = EINVAL;
7110 7110                  goto done;
7111 7111          }
7112 7112          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7113 7113          ASSERT(err == 0);
7114 7114  done:
7115 7115          mutex_exit(&zone->zone_lock);
7116 7116          zone_rele(zone);
7117 7117          if (err != 0)
7118 7118                  return (set_errno(err));
7119 7119          else
7120 7120                  return (0);
7121 7121  }
7122 7122  
7123 7123  static int
7124 7124  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7125 7125  {
7126 7126          zone_t *zone;
7127 7127          zone_dl_t *zdl;
7128 7128          nvlist_t *nvl;
7129 7129          uint8_t *ptr;
7130 7130          uint_t psize;
7131 7131          int err = 0;
7132 7132          char *nvname;
7133 7133          int bufsize;
7134 7134          void *buf;
7135 7135          datalink_id_t linkid = znbuf->zn_linkid;
7136 7136  
7137 7137          if (zoneid == GLOBAL_ZONEID)
7138 7138                  return (set_errno(EINVAL));
7139 7139  
7140 7140          nvname = zone_net_type2name(znbuf->zn_type);
7141 7141          bufsize = znbuf->zn_len;
7142 7142          buf = znbuf->zn_val;
7143 7143  
7144 7144          if (nvname == NULL)
7145 7145                  return (set_errno(EINVAL));
7146 7146          if ((zone = zone_find_by_id(zoneid)) == NULL)
7147 7147                  return (set_errno(EINVAL));
7148 7148  
7149 7149          mutex_enter(&zone->zone_lock);
7150 7150          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7151 7151                  err = ENXIO;
7152 7152                  goto done;
7153 7153          }
7154 7154          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7155 7155                  err = ENOENT;
7156 7156                  goto done;
7157 7157          }
7158 7158          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7159 7159          ASSERT(err == 0);
7160 7160  
7161 7161          if (psize > bufsize) {
7162 7162                  err = ENOBUFS;
7163 7163                  goto done;
7164 7164          }
7165 7165          znbuf->zn_len = psize;
7166 7166          bcopy(ptr, buf, psize);
7167 7167  done:
7168 7168          mutex_exit(&zone->zone_lock);
7169 7169          zone_rele(zone);
7170 7170          if (err != 0)
7171 7171                  return (set_errno(err));
7172 7172          else
7173 7173                  return (0);
7174 7174  }

↓ open down ↓

1536 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX