1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 #include <sys/types.h>
  30 #include <sys/param.h>
  31 #include <sys/sysmacros.h>
  32 #include <sys/signal.h>
  33 #include <sys/cred.h>
  34 #include <sys/policy.h>
  35 #include <sys/user.h>
  36 #include <sys/systm.h>
  37 #include <sys/cpuvar.h>
  38 #include <sys/vfs.h>
  39 #include <sys/vnode.h>
  40 #include <sys/file.h>
  41 #include <sys/errno.h>
  42 #include <sys/time.h>
  43 #include <sys/proc.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/acct.h>
  46 #include <sys/tuneable.h>
  47 #include <sys/class.h>
  48 #include <sys/kmem.h>
  49 #include <sys/session.h>
  50 #include <sys/ucontext.h>
  51 #include <sys/stack.h>
  52 #include <sys/procfs.h>
  53 #include <sys/prsystm.h>
  54 #include <sys/vmsystm.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/debug.h>
  57 #include <sys/shm_impl.h>
  58 #include <sys/door_data.h>
  59 #include <vm/as.h>
  60 #include <vm/rm.h>
  61 #include <c2/audit.h>
  62 #include <sys/var.h>
  63 #include <sys/schedctl.h>
  64 #include <sys/utrap.h>
  65 #include <sys/task.h>
  66 #include <sys/resource.h>
  67 #include <sys/cyclic.h>
  68 #include <sys/lgrp.h>
  69 #include <sys/rctl.h>
  70 #include <sys/contract_impl.h>
  71 #include <sys/contract/process_impl.h>
  72 #include <sys/list.h>
  73 #include <sys/dtrace.h>
  74 #include <sys/pool.h>
  75 #include <sys/zone.h>
  76 #include <sys/sdt.h>
  77 #include <sys/class.h>
  78 #include <sys/corectl.h>
  79 #include <sys/brand.h>
  80 #include <sys/fork.h>
  81 
  82 static int64_t cfork(int, int, int);    /* common code behind vfork()/forksys() */
     /*
      * getproc() creates the new proc structure.  Its third argument is one
      * of the GETPROC_* values below: cfork() and the non-kernel branch of
      * newproc() pass GETPROC_USER, while newproc() passes GETPROC_KERNEL
      * when CLASS_KERNEL(cid) is true.
      */
  83 static int getproc(proc_t **, pid_t, uint_t);
  84 #define GETPROC_USER    0x0
  85 #define GETPROC_KERNEL  0x1
  86 
     /* Back-out helpers used by cfork() when a fork cannot be completed. */
  87 static void fork_fail(proc_t *);
  88 static void forklwp_fail(proc_t *);
  89 
     /*
      * NOTE(review): not referenced in the part of the file visible here;
      * confirm its consumer before changing or removing it.
      */
  90 int fork_fail_pending;
  91 
     /* Defined elsewhere; NOTE(review): not used in the visible part of the file. */
  92 extern struct kmem_cache *process_cache;
  93 
  94 /*
  95  * The vfork() system call trap is no longer invoked by libc.
  96  * It is retained only for the benefit of applications running
  97  * within a solaris10 branded zone.  It should be eliminated
  98  * when we no longer support solaris10 branded zones.
      *
      * Equivalent to forksys(2, 0): t_post_sys is set on the calling
      * thread and cfork() is invoked with isvfork = 1, isfork1 = 1 and
      * no flags.  The value returned by cfork() is passed straight back.
  99  */
 100 int64_t
 101 vfork(void)
 102 {
 103         curthread->t_post_sys = 1;   /* so vfwait() will be called */
 104         return (cfork(1, 1, 0));
 105 }
 106 
 107 /*
 108  * forksys system call - forkx, forkallx, vforkx.  This is the
 109  * interface invoked by libc for fork1(), forkall(), and vfork()
      *
      * subcode selects the variant:
      *   0 - forkx(flags):    cfork(isvfork = 0, isfork1 = 1, flags)
      *   1 - forkallx(flags): cfork(isvfork = 0, isfork1 = 0, flags)
      *   2 - vforkx(flags):   cfork(isvfork = 1, isfork1 = 1, flags), with
      *                        t_post_sys set on the calling thread first
      * Any other subcode fails with EINVAL.  flags is handed to cfork()
      * unmodified; cfork() is the one that validates it.
 110  */
 111 int64_t
 112 forksys(int subcode, int flags)
 113 {
 114         switch (subcode) {
 115         case 0:
 116                 return (cfork(0, 1, flags));    /* forkx(flags) */
 117         case 1:
 118                 return (cfork(0, 0, flags));    /* forkallx(flags) */
 119         case 2:
 120                 curthread->t_post_sys = 1;   /* so vfwait() will be called */
 121                 return (cfork(1, 1, flags));    /* vforkx(flags) */
 122         default:
 123                 return ((int64_t)set_errno(EINVAL));
 124         }
 125 }
 126 
 127 /* ARGSUSED */
     /*
      * Common implementation behind vfork() and forksys().
      *
      * isvfork: nonzero for vfork semantics.  The child is given the
      *          parent's address space (cp->p_as = p->p_as) and marked
      *          SVFORK instead of receiving a duplicate from as_dup(), and
      *          the parent is flagged SVFWAIT / T_VFPARENT before returning.
      * isfork1: nonzero to clone only the calling lwp; zero (forkall) to
      *          clone every lwp found in the parent's lwp directory and to
      *          replicate its zombie lwp entries.
      * flags:   may contain only FORK_NOSIGCHLD and/or FORK_WAITPID, which
      *          are recorded in the child's p_pidflag as CLDNOSIGCHLD and
      *          CLDWAITPID respectively.
      *
      * On success the parent's return value carries the child's pid in
      * r_val1 and 0 in r_val2, while the child's clone lwp is handed the
      * parent's pid and 1 through lwp_setrval() (or the brand's
      * b_lwp_setrval() when the parent is branded).
      *
      * On failure the result of set_errno() is returned, with:
      *   EINVAL  - flags contains a bit other than the two permitted ones
      *   ENOTSUP - the caller is the /proc agent lwp
      *   EINTR   - holdlwps() returned zero
      *   EAGAIN  - getproc() failed, forklwp() or contract_process_fork()
      *             returned NULL, or as_dup() failed with anything other
      *             than ENOMEM
      *   ENOMEM  - as_dup() failed with ENOMEM
      * or the non-zero value returned by secpolicy_basic_fork().
      */
 128 static int64_t
 129 cfork(int isvfork, int isfork1, int flags)
 130 {
 131         proc_t *p = ttoproc(curthread);
 132         struct as *as;
 133         proc_t *cp, **orphpp;
 134         klwp_t *clone;
 135         kthread_t *t;
 136         task_t *tk;
 137         rval_t  r;
 138         int error;
 139         int i;
 140         rctl_set_t *dup_set;
 141         rctl_alloc_gp_t *dup_gp;
 142         rctl_entity_p_t e;
 143         lwpdir_t *ldp;
 144         lwpent_t *lep;
 145         lwpent_t *clep;
 146 
 147         /*
 148          * Allow only these two flags.
 149          */
 150         if ((flags & ~(FORK_NOSIGCHLD | FORK_WAITPID)) != 0) {
 151                 error = EINVAL;
 152                 goto forkerr;
 153         }
 154 
 155         /*
 156          * fork is not supported for the /proc agent lwp.
 157          */
 158         if (curthread == p->p_agenttp) {
 159                 error = ENOTSUP;
 160                 goto forkerr;
 161         }
 162 
 163         if ((error = secpolicy_basic_fork(CRED())) != 0)
 164                 goto forkerr;
 165 
 166         /*
 167          * If the calling lwp is doing a fork1() then the
 168          * other lwps in this process are not duplicated and
 169          * don't need to be held where their kernel stacks can be
 170          * cloned.  If doing forkall(), the process is held with
 171          * SHOLDFORK, so that the lwps are at a point where their
 172          * stacks can be copied which is on entry or exit from
 173          * the kernel.
 174          */
 175         if (!holdlwps(isfork1 ? SHOLDFORK1 : SHOLDFORK)) {
 176                 aston(curthread);
 177                 error = EINTR;
 178                 goto forkerr;
 179         }
 180 
 181 #if defined(__sparc)
 182         /*
 183          * Ensure that the user stack is fully constructed
 184          * before creating the child process structure.
 185          */
 186         (void) flush_user_windows_to_stack(NULL);
 187 #endif
 188 
 189         mutex_enter(&p->p_lock);
 190         /*
 191          * If this is vfork(), cancel any suspend request we might
 192          * have gotten from some other thread via lwp_suspend().
 193          * Otherwise we could end up with a deadlock on return
 194          * from the vfork() in both the parent and the child.
 195          */
 196         if (isvfork)
 197                 curthread->t_proc_flag &= ~TP_HOLDLWP;
 198         /*
 199          * Prevent our resource set associations from being changed during fork.
 200          */
 201         pool_barrier_enter();
 202         mutex_exit(&p->p_lock);
 203 
 204         /*
 205          * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
 206          */
 207         if (getproc(&cp, 0, GETPROC_USER) < 0) {
                     /*
                      * No child exists yet, so only the pool barrier and
                      * the lwp hold taken above need to be undone.
                      */
 208                 mutex_enter(&p->p_lock);
 209                 pool_barrier_exit();
 210                 continuelwps(p);
 211                 mutex_exit(&p->p_lock);
 212                 error = EAGAIN;
 213                 goto forkerr;
 214         }
 215 
 216         TRACE_2(TR_FAC_PROC, TR_PROC_FORK, "proc_fork:cp %p p %p", cp, p);
 217 
 218         /*
 219          * Assign an address space to child
 220          */
 221         if (isvfork) {
 222                 /*
 223                  * Clear any watched areas and remember the
 224                  * watched pages for restoring in vfwait().
 225                  */
 226                 as = p->p_as;
 227                 if (avl_numnodes(&as->a_wpage) != 0) {
 228                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 229                         as_clearwatch(as);
 230                         p->p_wpage = as->a_wpage;
 231                         avl_create(&as->a_wpage, wp_compare,
 232                             sizeof (struct watched_page),
 233                             offsetof(struct watched_page, wp_link));
 234                         AS_LOCK_EXIT(as, &as->a_lock);
 235                 }
 236                 cp->p_as = as;
 237                 cp->p_flag |= SVFORK;
 238 
 239                 /*
 240                  * Use the parent's shm segment list information for
 241                  * the child as it uses its address space till it execs.
 242                  */
 243                 cp->p_segacct = p->p_segacct;
 244         } else {
 245                 /*
 246                  * We need to hold P_PR_LOCK until the address space has
 247                  * been duplicated and we've had a chance to remove from the
 248                  * child any DTrace probes that were in the parent. Holding
 249                  * P_PR_LOCK prevents any new probes from being added and any
 250                  * extant probes from being removed.
 251                  */
 252                 mutex_enter(&p->p_lock);
 253                 sprlock_proc(p);
 254                 p->p_flag |= SFORKING;
 255                 mutex_exit(&p->p_lock);
 256 
 257                 error = as_dup(p->p_as, cp);
 258                 if (error != 0) {
                             /*
                              * Back out the child by hand: undo the
                              * sprlock_proc() on the parent, release what
                              * getproc() set up via fork_fail(), then, under
                              * pidlock, unlink the child from the parent's
                              * orphan and child/sibling lists, detach it
                              * from its task, drop its pool reference and
                              * call pid_exit().  No lwps, rctls or lwp
                              * directory exist for the child at this point.
                              */
 259                         mutex_enter(&p->p_lock);
 260                         sprunlock(p);
 261                         fork_fail(cp);
 262                         mutex_enter(&pidlock);
 263                         orphpp = &p->p_orphan;
 264                         while (*orphpp != cp)
 265                                 orphpp = &(*orphpp)->p_nextorph;
 266                         *orphpp = cp->p_nextorph;
 267                         if (p->p_child == cp)
 268                                 p->p_child = cp->p_sibling;
 269                         if (cp->p_sibling)
 270                                 cp->p_sibling->p_psibling = cp->p_psibling;
 271                         if (cp->p_psibling)
 272                                 cp->p_psibling->p_sibling = cp->p_sibling;
 273                         mutex_enter(&cp->p_lock);
 274                         tk = cp->p_task;
 275                         task_detach(cp);
 276                         ASSERT(cp->p_pool->pool_ref > 0);
 277                         atomic_dec_32(&cp->p_pool->pool_ref);
 278                         mutex_exit(&cp->p_lock);
 279                         pid_exit(cp, tk);
 280                         mutex_exit(&pidlock);
 281                         task_rele(tk);
 282 
 283                         mutex_enter(&p->p_lock);
 284                         p->p_flag &= ~SFORKING;
 285                         pool_barrier_exit();
 286                         continuelwps(p);
 287                         mutex_exit(&p->p_lock);
 288                         /*
 289                          * Preserve ENOMEM error condition but
 290                          * map all others to EAGAIN.
 291                          */
 292                         error = (error == ENOMEM) ? ENOMEM : EAGAIN;
 293                         goto forkerr;
 294                 }
 295 
 296                 /*
 297                  * Remove all DTrace tracepoints from the child process. We
 298                  * need to do this _before_ duplicating USDT providers since
 299                  * any associated probes may be immediately enabled.
 300                  */
 301                 if (p->p_dtrace_count > 0)
 302                         dtrace_fasttrap_fork(p, cp);
 303 
 304                 mutex_enter(&p->p_lock);
 305                 sprunlock(p);
 306 
 307                 /* Duplicate parent's shared memory */
 308                 if (p->p_segacct)
 309                         shmfork(p, cp);
 310 
 311                 /*
 312                  * Duplicate any helper actions and providers. The SFORKING
 313                  * flag we set above tells the code that enables USDT probes
 314                  * that sprlock() may fail because the child is being forked.
 315                  */
 316                 if (p->p_dtrace_helpers != NULL) {
 317                         ASSERT(dtrace_helpers_fork != NULL);
 318                         (*dtrace_helpers_fork)(p, cp);
 319                 }
 320 
 321                 mutex_enter(&p->p_lock);
 322                 p->p_flag &= ~SFORKING;
 323                 mutex_exit(&p->p_lock);
 324         }
 325 
 326         /*
 327          * Duplicate parent's resource controls.
              *
              * The preallocation is made without rcs_lock held and then
              * checked by rctl_set_dup_ready() with the lock held; if the
              * check fails the lock is dropped, the preallocation is
              * destroyed and the attempt is repeated.
 328          */
 329         dup_set = rctl_set_create();
 330         for (;;) {
 331                 dup_gp = rctl_set_dup_prealloc(p->p_rctls);
 332                 mutex_enter(&p->p_rctls->rcs_lock);
 333                 if (rctl_set_dup_ready(p->p_rctls, dup_gp))
 334                         break;
 335                 mutex_exit(&p->p_rctls->rcs_lock);
 336                 rctl_prealloc_destroy(dup_gp);
 337         }
 338         e.rcep_p.proc = cp;
 339         e.rcep_t = RCENTITY_PROCESS;
 340         cp->p_rctls = rctl_set_dup(p->p_rctls, p, cp, &e, dup_set, dup_gp,
 341             RCD_DUP | RCD_CALLBACK);
 342         mutex_exit(&p->p_rctls->rcs_lock);
 343 
 344         rctl_prealloc_destroy(dup_gp);
 345 
 346         /*
 347          * Allocate the child's lwp directory and lwpid hash table.
              *
              * fork1() gets a two-slot directory; forkall() gets one the
              * same size as the parent's.  The loop links each slot to its
              * successor starting at p_lwpfree; the final slot's ld_next is
              * left as zeroed by kmem_zalloc().
 348          */
 349         if (isfork1)
 350                 cp->p_lwpdir_sz = 2;
 351         else
 352                 cp->p_lwpdir_sz = p->p_lwpdir_sz;
 353         cp->p_lwpdir = cp->p_lwpfree = ldp =
 354             kmem_zalloc(cp->p_lwpdir_sz * sizeof (lwpdir_t), KM_SLEEP);
 355         for (i = 1; i < cp->p_lwpdir_sz; i++, ldp++)
 356                 ldp->ld_next = ldp + 1;
 357         cp->p_tidhash_sz = (cp->p_lwpdir_sz + 2) / 2;
 358         cp->p_tidhash =
 359             kmem_zalloc(cp->p_tidhash_sz * sizeof (tidhash_t), KM_SLEEP);
 360 
 361         /*
 362          * Duplicate parent's lwps.
 363          * Mutual exclusion is not needed because the process is
 364          * in the hold state and only the current lwp is running.
 365          */
 366         klgrpset_clear(cp->p_lgrpset);
 367         if (isfork1) {
 368                 clone = forklwp(ttolwp(curthread), cp, curthread->t_tid);
 369                 if (clone == NULL)
 370                         goto forklwperr;
 371                 /*
 372                  * Inherit only the lwp_wait()able flag,
 373                  * Daemon threads should not call fork1(), but oh well...
 374                  */
 375                 lwptot(clone)->t_proc_flag |=
 376                     (curthread->t_proc_flag & TP_TWAIT);
 377         } else {
 378                 /* this is forkall(), no one can be in lwp_wait() */
 379                 ASSERT(p->p_lwpwait == 0 && p->p_lwpdwait == 0);
 380                 /* for each entry in the parent's lwp directory... */
 381                 for (i = 0, ldp = p->p_lwpdir; i < p->p_lwpdir_sz; i++, ldp++) {
 382                         klwp_t *clwp;
 383                         kthread_t *ct;
 384 
 385                         if ((lep = ldp->ld_entry) == NULL)
 386                                 continue;
 387 
 388                         if ((t = lep->le_thread) != NULL) {
 389                                 clwp = forklwp(ttolwp(t), cp, t->t_tid);
 390                                 if (clwp == NULL)
 391                                         goto forklwperr;
 392                                 ct = lwptot(clwp);
 393                                 /*
 394                                  * Inherit lwp_wait()able and daemon flags.
 395                                  */
 396                                 ct->t_proc_flag |=
 397                                     (t->t_proc_flag & (TP_TWAIT|TP_DAEMON));
 398                                 /*
 399                                  * Keep track of the clone of curthread to
 400                                  * post return values through lwp_setrval().
 401                                  * Mark other threads for special treatment
 402                                  * by lwp_rtt() / post_syscall().
 403                                  */
 404                                 if (t == curthread)
 405                                         clone = clwp;
 406                                 else
 407                                         ct->t_flag |= T_FORKALL;
 408                         } else {
 409                                 /*
 410                                  * Replicate zombie lwps in the child.
 411                                  */
 412                                 clep = kmem_zalloc(sizeof (*clep), KM_SLEEP);
 413                                 clep->le_lwpid = lep->le_lwpid;
 414                                 clep->le_start = lep->le_start;
 415                                 lwp_hash_in(cp, clep,
 416                                     cp->p_tidhash, cp->p_tidhash_sz, 0);
 417                         }
 418                 }
 419         }
 420 
 421         /*
 422          * Put new process in the parent's process contract, or put it
 423          * in a new one if there is an active process template.  Send a
 424          * fork event (if requested) to whatever contract the child is
 425          * a member of.  Fails if the parent has been SIGKILLed.
 426          */
 427         if (contract_process_fork(NULL, cp, p, B_TRUE) == NULL)
 428                 goto forklwperr;
 429 
 430         /*
 431          * No fork failures occur beyond this point.
 432          */
 433 
 434         cp->p_lwpid = p->p_lwpid;
 435         if (!isfork1) {
 436                 cp->p_lwpdaemon = p->p_lwpdaemon;
 437                 cp->p_zombcnt = p->p_zombcnt;
 438                 /*
 439                  * If the parent's lwp ids have wrapped around, so have the
 440                  * child's.
 441                  */
 442                 cp->p_flag |= p->p_flag & SLWPWRAP;
 443         }
 444 
             /* The child shares the parent's core file path and content settings. */
 445         mutex_enter(&p->p_lock);
 446         corectl_path_hold(cp->p_corefile = p->p_corefile);
 447         corectl_content_hold(cp->p_content = p->p_content);
 448         mutex_exit(&p->p_lock);
 449 
 450         /*
 451          * Duplicate process context ops, if any.
 452          */
 453         if (p->p_pctx)
 454                 forkpctx(p, cp);
 455 
 456 #ifdef __sparc
 457         utrap_dup(p, cp);
 458 #endif
 459         /*
 460          * If the child process has been marked to stop on exit
 461          * from this fork, arrange for all other lwps to stop in
 462          * sympathy with the active lwp.
 463          */
 464         if (PTOU(cp)->u_systrap &&
 465             prismember(&PTOU(cp)->u_exitmask, curthread->t_sysnum)) {
 466                 mutex_enter(&cp->p_lock);
 467                 t = cp->p_tlist;
 468                 do {
 469                         t->t_proc_flag |= TP_PRSTOP;
 470                         aston(t);       /* so TP_PRSTOP will be seen */
 471                 } while ((t = t->t_forw) != cp->p_tlist);
 472                 mutex_exit(&cp->p_lock);
 473         }
 474         /*
 475          * If the parent process has been marked to stop on exit
 476          * from this fork, and its asynchronous-stop flag has not
 477          * been set, arrange for all other lwps to stop before
 478          * they return back to user level.
 479          */
 480         if (!(p->p_proc_flag & P_PR_ASYNC) && PTOU(p)->u_systrap &&
 481             prismember(&PTOU(p)->u_exitmask, curthread->t_sysnum)) {
 482                 mutex_enter(&p->p_lock);
 483                 t = p->p_tlist;
 484                 do {
 485                         t->t_proc_flag |= TP_PRSTOP;
 486                         aston(t);       /* so TP_PRSTOP will be seen */
 487                 } while ((t = t->t_forw) != p->p_tlist);
 488                 mutex_exit(&p->p_lock);
 489         }
 490 
             /*
              * Post the child's return values on the clone of the calling
              * lwp: the parent's pid and 1.
              */
 491         if (PROC_IS_BRANDED(p))
 492                 BROP(p)->b_lwp_setrval(clone, p->p_pid, 1);
 493         else
 494                 lwp_setrval(clone, p->p_pid, 1);
 495 
 496         /* set return values for parent */
 497         r.r_val1 = (int)cp->p_pid;
 498         r.r_val2 = 0;
 499 
 500         /*
 501          * pool_barrier_exit() can now be called because the child process has:
 502          * - all identifying features cloned or set (p_pid, p_task, p_pool)
 503          * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
 504          * - any other fields set which are used in resource set binding.
 505          */
 506         mutex_enter(&p->p_lock);
 507         pool_barrier_exit();
 508         mutex_exit(&p->p_lock);
 509 
             /*
              * pidlock is taken here and is still held when the vfork /
              * non-vfork split below is reached; each branch is responsible
              * for dropping it.
              */
 510         mutex_enter(&pidlock);
 511         mutex_enter(&cp->p_lock);
 512 
 513         /*
 514          * Set flags telling the child what (not) to do on exit.
 515          */
 516         if (flags & FORK_NOSIGCHLD)
 517                 cp->p_pidflag |= CLDNOSIGCHLD;
 518         if (flags & FORK_WAITPID)
 519                 cp->p_pidflag |= CLDWAITPID;
 520 
 521         /*
 522          * Now that there are lwps and threads attached, add the new
 523          * process to the process group.
 524          */
 525         pgjoin(cp, p->p_pgidp);
 526         cp->p_stat = SRUN;
 527         /*
 528          * We are now done with all the lwps in the child process.
 529          */
 530         t = cp->p_tlist;
 531         do {
 532                 /*
 533                  * Set the lwp_suspend()ed lwps running.
 534                  * They will suspend properly at syscall exit.
 535                  */
 536                 if (t->t_proc_flag & TP_HOLDLWP)
 537                         lwp_create_done(t);
 538                 else {
 539                         /* set TS_CREATE to allow continuelwps() to work */
 540                         thread_lock(t);
 541                         ASSERT(t->t_state == TS_STOPPED &&
 542                             !(t->t_schedflag & (TS_CREATE|TS_CSTART)));
 543                         t->t_schedflag |= TS_CREATE;
 544                         thread_unlock(t);
 545                 }
 546         } while ((t = t->t_forw) != cp->p_tlist);
 547         mutex_exit(&cp->p_lock);
 548 
 549         if (isvfork) {
 550                 CPU_STATS_ADDQ(CPU, sys, sysvfork, 1);
 551                 mutex_enter(&p->p_lock);
 552                 p->p_flag |= SVFWAIT;
 553                 curthread->t_flag |= T_VFPARENT;
 554                 DTRACE_PROC1(create, proc_t *, cp);
 555                 cv_broadcast(&pr_pid_cv[p->p_slot]);     /* inform /proc */
 556                 mutex_exit(&p->p_lock);
 557                 /*
 558                  * Grab child's p_lock before dropping pidlock to ensure
 559                  * the process will not disappear before we set it running.
 560                  */
 561                 mutex_enter(&cp->p_lock);
 562                 mutex_exit(&pidlock);
 563                 sigdefault(cp);
 564                 continuelwps(cp);
 565                 mutex_exit(&cp->p_lock);
 566         } else {
 567                 CPU_STATS_ADDQ(CPU, sys, sysfork, 1);
 568                 DTRACE_PROC1(create, proc_t *, cp);
 569                 /*
 570                  * It is CL_FORKRET's job to drop pidlock.
 571                  * If we do it here, the process could be set running
 572                  * and disappear before CL_FORKRET() is called.
 573                  */
 574                 CL_FORKRET(curthread, cp->p_tlist);
 575                 schedctl_set_cidpri(curthread);
 576                 ASSERT(MUTEX_NOT_HELD(&pidlock));
 577         }
 578 
 579         return (r.r_vals);
 580 
     /*
      * Reached when forklwp() or contract_process_fork() returns NULL, i.e.
      * after the child's address space, resource controls and lwp tables
      * have been set up.  Everything built for the child is torn down, the
      * parent's lwps are released, and the fork fails with EAGAIN.
      */
 581 forklwperr:
 582         if (isvfork) {
 583                 if (avl_numnodes(&p->p_wpage) != 0) {
 584                         /* restore watchpoints to parent */
 585                         as = p->p_as;
 586                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 587                         as->a_wpage = p->p_wpage;
 588                         avl_create(&p->p_wpage, wp_compare,
 589                             sizeof (struct watched_page),
 590                             offsetof(struct watched_page, wp_link));
 591                         as_setwatch(as);
 592                         AS_LOCK_EXIT(as, &as->a_lock);
 593                 }
 594         } else {
                     /*
                      * The child owns a duplicated address space: point
                      * p_as at kas before freeing the duplicate.
                      */
 595                 if (cp->p_segacct)
 596                         shmexit(cp);
 597                 as = cp->p_as;
 598                 cp->p_as = &kas;
 599                 as_free(as);
 600         }
 601 
             /* Free every lwp directory entry, then the directory itself. */
 602         if (cp->p_lwpdir) {
 603                 for (i = 0, ldp = cp->p_lwpdir; i < cp->p_lwpdir_sz; i++, ldp++)
 604                         if ((lep = ldp->ld_entry) != NULL)
 605                                 kmem_free(lep, sizeof (*lep));
 606                 kmem_free(cp->p_lwpdir,
 607                     cp->p_lwpdir_sz * sizeof (*cp->p_lwpdir));
 608         }
 609         cp->p_lwpdir = NULL;
 610         cp->p_lwpfree = NULL;
 611         cp->p_lwpdir_sz = 0;
 612 
 613         if (cp->p_tidhash)
 614                 kmem_free(cp->p_tidhash,
 615                     cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
 616         cp->p_tidhash = NULL;
 617         cp->p_tidhash_sz = 0;
 618 
             /*
              * Destroy any lwps already cloned, then release what getproc()
              * set up and the duplicated resource controls.
              */
 619         forklwp_fail(cp);
 620         fork_fail(cp);
 621         rctl_set_free(cp->p_rctls);
 622         mutex_enter(&pidlock);
 623 
 624         /*
 625          * Detach failed child from task.
 626          */
 627         mutex_enter(&cp->p_lock);
 628         tk = cp->p_task;
 629         task_detach(cp);
 630         ASSERT(cp->p_pool->pool_ref > 0);
 631         atomic_dec_32(&cp->p_pool->pool_ref);
 632         mutex_exit(&cp->p_lock);
 633 
             /*
              * Unlink the child from the parent's orphan list and from the
              * child/sibling list.
              */
 634         orphpp = &p->p_orphan;
 635         while (*orphpp != cp)
 636                 orphpp = &(*orphpp)->p_nextorph;
 637         *orphpp = cp->p_nextorph;
 638         if (p->p_child == cp)
 639                 p->p_child = cp->p_sibling;
 640         if (cp->p_sibling)
 641                 cp->p_sibling->p_psibling = cp->p_psibling;
 642         if (cp->p_psibling)
 643                 cp->p_psibling->p_sibling = cp->p_sibling;
 644         pid_exit(cp, tk);
 645         mutex_exit(&pidlock);
 646 
 647         task_rele(tk);
 648 
 649         mutex_enter(&p->p_lock);
 650         pool_barrier_exit();
 651         continuelwps(p);
 652         mutex_exit(&p->p_lock);
 653         error = EAGAIN;
     /* Common failure exit: 'error' holds the errno value to report. */
 654 forkerr:
 655         return ((int64_t)set_errno(error));
 656 }
 657 
 658 /*
 659  * Free allocated resources from getproc() if a fork failed.
      *
      * cp is the failed child.  cfork() calls this both when as_dup()
      * fails and from its forklwperr path (after forklwp_fail()).
      *
      * In order: the child's file-info count is dropped via fcnt_add(),
      * its queued signals are discarded with sigdelq(), the per-user
      * process count is decremented under pidlock, the credential
      * reference is released, the fi_list array is freed, the vnode and
      * cwd-string holds are released, and any brand is cleared.
      *
      * Note that the u_cdir, u_rdir and u_cwd holds are released through
      * curproc rather than through cp.
      * NOTE(review): presumably getproc() took these holds on the same
      * objects the parent's user area points at -- confirm in getproc().
 660  */
 661 static void
 662 fork_fail(proc_t *cp)
 663 {
 664         uf_info_t *fip = P_FINFO(cp);
 665 
 666         fcnt_add(fip, -1);
 667         sigdelq(cp, NULL, 0);
 668 
 669         mutex_enter(&pidlock);
 670         upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
 671         mutex_exit(&pidlock);
 672 
 673         /*
 674          * single threaded, so no locking needed here
 675          */
 676         crfree(cp->p_cred);
 677 
 678         kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
 679 
 680         VN_RELE(PTOU(curproc)->u_cdir);
 681         if (PTOU(curproc)->u_rdir)
 682                 VN_RELE(PTOU(curproc)->u_rdir);
 683         if (cp->p_exec)
 684                 VN_RELE(cp->p_exec);
 685         if (cp->p_execdir)
 686                 VN_RELE(cp->p_execdir);
 687         if (PTOU(curproc)->u_cwd)
 688                 refstr_rele(PTOU(curproc)->u_cwd);
 689         if (PROC_IS_BRANDED(cp)) {
 690                 brand_clearbrand(cp, B_TRUE);
 691         }
 692 }
 693 
 694 /*
 695  * Clean up the lwps already created for this child process.
 696  * The fork failed while duplicating all the lwps of the parent
 697  * and those lwps already created must be freed.
 698  * This process is invisible to the rest of the system,
 699  * so we don't need to hold p->p_lock to protect the list.
      *
      * Here p is the failed child (cfork() passes cp), not the parent.
      * The loop repeatedly takes the thread at the head of p->p_tlist,
      * unlinks it, undoes its accounting and frees it, until the list is
      * empty; it is a no-op when no lwp was ever created.
 700  */
 701 static void
 702 forklwp_fail(proc_t *p)
 703 {
 704         kthread_t *t;
 705         task_t *tk;
 706         int branded = 0;
 707 
 708         if (PROC_IS_BRANDED(p))
 709                 branded = 1;
 710 
 711         while ((t = p->p_tlist) != NULL) {
 712                 /*
 713                  * First remove the lwp from the process's p_tlist.
                      * When t is the only element (t == t->t_forw) the list
                      * becomes empty; the two pointer updates that follow
                      * are then self-assignments.
 714                  */
 715                 if (t != t->t_forw)
 716                         p->p_tlist = t->t_forw;
 717                 else
 718                         p->p_tlist = NULL;
 719                 p->p_lwpcnt--;
 720                 t->t_forw->t_back = t->t_back;
 721                 t->t_back->t_forw = t->t_forw;
 722 
                     /*
                      * Take this lwp back out of the task, project and zone
                      * lwp counts, all under the zone's zone_nlwps_lock.
                      */
 723                 tk = p->p_task;
 724                 mutex_enter(&p->p_zone->zone_nlwps_lock);
 725                 tk->tk_nlwps--;
 726                 tk->tk_proj->kpj_nlwps--;
 727                 p->p_zone->zone_nlwps--;
 728                 mutex_exit(&p->p_zone->zone_nlwps_lock);
 729 
 730                 ASSERT(t->t_schedctl == NULL);
 731 
                     /* Let the brand release any per-lwp state it attached. */
 732                 if (branded)
 733                         BROP(p)->b_freelwp(ttolwp(t));
 734 
 735                 if (t->t_door != NULL) {
 736                         kmem_free(t->t_door, sizeof (door_data_t));
 737                         t->t_door = NULL;
 738                 }
 739                 lwp_ctmpl_clear(ttolwp(t));
 740 
 741                 /*
 742                  * Remove the thread from the all threads list.
 743                  * We need to hold pidlock for this.
 744                  */
 745                 mutex_enter(&pidlock);
 746                 t->t_next->t_prev = t->t_prev;
 747                 t->t_prev->t_next = t->t_next;
 748                 CL_EXIT(t);     /* tell the scheduler that we're exiting */
 749                 cv_broadcast(&t->t_joincv);      /* tell anyone in thread_join */
 750                 mutex_exit(&pidlock);
 751 
 752                 /*
 753                  * Let the lgroup load averages know that this thread isn't
 754                  * going to show up (i.e. un-do what was done on behalf of
 755                  * this thread by the earlier lgrp_move_thread()).
 756                  */
 757                 kpreempt_disable();
 758                 lgrp_move_thread(t, NULL, 1);
 759                 kpreempt_enable();
 760 
 761                 /*
 762                  * The thread was created TS_STOPPED.
 763                  * We change it to TS_FREE to avoid an
 764                  * ASSERT() panic in thread_free().
 765                  */
 766                 t->t_state = TS_FREE;
 767                 thread_rele(t);
 768                 thread_free(t);
 769         }
 770 }
 771 
 772 extern struct as kas;   /* cfork()'s forklwperr path points a failed child's p_as here */
 773 
 774 /*
 775  * fork a kernel process.
 776  */
 777 int
 778 newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
 779     pid_t pid)
 780 {
 781         proc_t *p;
 782         struct user *up;
 783         kthread_t *t;
 784         cont_process_t *ctp = NULL;
 785         rctl_entity_p_t e;
 786 
 787         ASSERT(cid != sysdccid);
 788         ASSERT(cid != syscid || ct == NULL);
 789         if (CLASS_KERNEL(cid)) {
 790                 rctl_alloc_gp_t *init_gp;
 791                 rctl_set_t *init_set;
 792 
 793                 ASSERT(pid != 1);
 794 
 795                 if (getproc(&p, pid, GETPROC_KERNEL) < 0)
 796                         return (EAGAIN);
 797 
 798                 /*
 799                  * Release the hold on the p_exec and p_execdir, these
 800                  * were acquired in getproc()
 801                  */
 802                 if (p->p_execdir != NULL)
 803                         VN_RELE(p->p_execdir);
 804                 if (p->p_exec != NULL)
 805                         VN_RELE(p->p_exec);
 806                 p->p_flag |= SNOWAIT;
 807                 p->p_exec = NULL;
 808                 p->p_execdir = NULL;
 809 
 810                 init_set = rctl_set_create();
 811                 init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 812 
 813                 /*
 814                  * kernel processes do not inherit /proc tracing flags.
 815                  */
 816                 sigemptyset(&p->p_sigmask);
 817                 premptyset(&p->p_fltmask);
 818                 up = PTOU(p);
 819                 up->u_systrap = 0;
 820                 premptyset(&(up->u_entrymask));
 821                 premptyset(&(up->u_exitmask));
 822                 mutex_enter(&p->p_lock);
 823                 e.rcep_p.proc = p;
 824                 e.rcep_t = RCENTITY_PROCESS;
 825                 p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 826                     init_gp);
 827                 mutex_exit(&p->p_lock);
 828 
 829                 rctl_prealloc_destroy(init_gp);
 830 
 831                 t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
 832         } else {
 833                 rctl_alloc_gp_t *init_gp, *default_gp;
 834                 rctl_set_t *init_set;
 835                 task_t *tk, *tk_old;
 836                 klwp_t *lwp;
 837 
 838                 if (getproc(&p, pid, GETPROC_USER) < 0)
 839                         return (EAGAIN);
 840                 /*
 841                  * init creates a new task, distinct from the task
 842                  * containing kernel "processes".
 843                  */
 844                 tk = task_create(0, p->p_zone);
 845                 mutex_enter(&tk->tk_zone->zone_nlwps_lock);
 846                 tk->tk_proj->kpj_ntasks++;
 847                 tk->tk_nprocs++;
 848                 mutex_exit(&tk->tk_zone->zone_nlwps_lock);
 849 
 850                 default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
 851                 init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
 852                 init_set = rctl_set_create();
 853 
 854                 mutex_enter(&pidlock);
 855                 mutex_enter(&p->p_lock);
 856                 tk_old = p->p_task;  /* switch to new task */
 857 
 858                 task_detach(p);
 859                 task_begin(tk, p);
 860                 mutex_exit(&pidlock);
 861 
 862                 mutex_enter(&tk_old->tk_zone->zone_nlwps_lock);
 863                 tk_old->tk_nprocs--;
 864                 mutex_exit(&tk_old->tk_zone->zone_nlwps_lock);
 865 
 866                 e.rcep_p.proc = p;
 867                 e.rcep_t = RCENTITY_PROCESS;
 868                 p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
 869                     init_gp);
 870                 rctlproc_default_init(p, default_gp);
 871                 mutex_exit(&p->p_lock);
 872 
 873                 task_rele(tk_old);
 874                 rctl_prealloc_destroy(default_gp);
 875                 rctl_prealloc_destroy(init_gp);
 876 
 877                 if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
 878                     &curthread->t_hold, cid, 1)) == NULL) {
 879                         task_t *tk;
 880                         fork_fail(p);
 881                         mutex_enter(&pidlock);
 882                         mutex_enter(&p->p_lock);
 883                         tk = p->p_task;
 884                         task_detach(p);
 885                         ASSERT(p->p_pool->pool_ref > 0);
 886                         atomic_add_32(&p->p_pool->pool_ref, -1);
 887                         mutex_exit(&p->p_lock);
 888                         pid_exit(p, tk);
 889                         mutex_exit(&pidlock);
 890                         task_rele(tk);
 891 
 892                         return (EAGAIN);
 893                 }
 894                 t = lwptot(lwp);
 895 
 896                 ctp = contract_process_fork(sys_process_tmpl, p, curproc,
 897                     B_FALSE);
 898                 ASSERT(ctp != NULL);
 899                 if (ct != NULL)
 900                         *ct = &ctp->conp_contract;
 901         }
 902 
 903         ASSERT3U(t->t_tid, ==, 1);
 904         p->p_lwpid = 1;
 905         mutex_enter(&pidlock);
 906         pgjoin(p, p->p_parent->p_pgidp);
 907         p->p_stat = SRUN;
 908         mutex_enter(&p->p_lock);
 909         t->t_proc_flag &= ~TP_HOLDLWP;
 910         lwp_create_done(t);
 911         mutex_exit(&p->p_lock);
 912         mutex_exit(&pidlock);
 913         return (0);
 914 }
 915 
 916 /*
 917  * create a child proc struct.
 918  */
 919 static int
 920 getproc(proc_t **cpp, pid_t pid, uint_t flags)
 921 {
 922         proc_t          *pp, *cp;
 923         pid_t           newpid;
 924         struct user     *uarea;
 925         extern uint_t   nproc;
 926         struct cred     *cr;
 927         uid_t           ruid;
 928         zoneid_t        zoneid;
 929         task_t          *task;
 930         kproject_t      *proj;
 931         zone_t          *zone;
 932         int             rctlfail = 0;
 933 
 934         if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
 935                 return (-1);    /* no point in starting new processes */
 936 
 937         pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
 938         task = pp->p_task;
 939         proj = task->tk_proj;
 940         zone = pp->p_zone;
 941 
 942         mutex_enter(&pp->p_lock);
 943         mutex_enter(&zone->zone_nlwps_lock);
 944         if (proj != proj0p) {
 945                 if (task->tk_nprocs >= task->tk_nprocs_ctl)
 946                         if (rctl_test(rc_task_nprocs, task->tk_rctls,
 947                             pp, 1, 0) & RCT_DENY)
 948                                 rctlfail = 1;
 949 
 950                 if (proj->kpj_nprocs >= proj->kpj_nprocs_ctl)
 951                         if (rctl_test(rc_project_nprocs, proj->kpj_rctls,
 952                             pp, 1, 0) & RCT_DENY)
 953                                 rctlfail = 1;
 954 
 955                 if (zone->zone_nprocs >= zone->zone_nprocs_ctl)
 956                         if (rctl_test(rc_zone_nprocs, zone->zone_rctls,
 957                             pp, 1, 0) & RCT_DENY)
 958                                 rctlfail = 1;
 959 
 960                 if (rctlfail) {
 961                         mutex_exit(&zone->zone_nlwps_lock);
 962                         mutex_exit(&pp->p_lock);
 963                         goto punish;
 964                 }
 965         }
 966         task->tk_nprocs++;
 967         proj->kpj_nprocs++;
 968         zone->zone_nprocs++;
 969         mutex_exit(&zone->zone_nlwps_lock);
 970         mutex_exit(&pp->p_lock);
 971 
 972         cp = kmem_cache_alloc(process_cache, KM_SLEEP);
 973         bzero(cp, sizeof (proc_t));
 974 
 975         /*
 976          * Make proc entry for child process
 977          */
 978         mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL);
 979         mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
 980         mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
 981 #if defined(__x86)
 982         mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
 983 #endif
 984         mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
 985         cp->p_stat = SIDL;
 986         cp->p_mstart = gethrtime();
 987         cp->p_as = &kas;
 988         /*
 989          * p_zone must be set before we call pid_allocate since the process
 990          * will be visible after that and code such as prfind_zone will
 991          * look at the p_zone field.
 992          */
 993         cp->p_zone = pp->p_zone;
 994         cp->p_t1_lgrpid = LGRP_NONE;
 995         cp->p_tr_lgrpid = LGRP_NONE;
 996 
 997         if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
 998                 if (nproc == v.v_proc) {
 999                         CPU_STATS_ADDQ(CPU, sys, procovf, 1);
1000                         cmn_err(CE_WARN, "out of processes");
1001                 }
1002                 goto bad;
1003         }
1004 
1005         mutex_enter(&pp->p_lock);
1006         cp->p_exec = pp->p_exec;
1007         cp->p_execdir = pp->p_execdir;
1008         mutex_exit(&pp->p_lock);
1009 
1010         if (cp->p_exec) {
1011                 VN_HOLD(cp->p_exec);
1012                 /*
1013                  * Each VOP_OPEN() must be paired with a corresponding
1014                  * VOP_CLOSE(). In this case, the executable will be
1015                  * closed for the child in either proc_exit() or gexec().
1016                  */
1017                 if (VOP_OPEN(&cp->p_exec, FREAD, CRED(), NULL) != 0) {
1018                         VN_RELE(cp->p_exec);
1019                         cp->p_exec = NULLVP;
1020                         cp->p_execdir = NULLVP;
1021                         goto bad;
1022                 }
1023         }
1024         if (cp->p_execdir)
1025                 VN_HOLD(cp->p_execdir);
1026 
1027         /*
1028          * If not privileged make sure that this user hasn't exceeded
1029          * v.v_maxup processes, and that users collectively haven't
1030          * exceeded v.v_maxupttl processes.
1031          */
1032         mutex_enter(&pidlock);
1033         ASSERT(nproc < v.v_proc);    /* otherwise how'd we get our pid? */
1034         cr = CRED();
1035         ruid = crgetruid(cr);
1036         zoneid = crgetzoneid(cr);
1037         if (nproc >= v.v_maxup &&    /* short-circuit; usually false */
1038             (nproc >= v.v_maxupttl ||
1039             upcount_get(ruid, zoneid) >= v.v_maxup) &&
1040             secpolicy_newproc(cr) != 0) {
1041                 mutex_exit(&pidlock);
1042                 zcmn_err(zoneid, CE_NOTE,
1043                     "out of per-user processes for uid %d", ruid);
1044                 goto bad;
1045         }
1046 
1047         /*
1048          * Everything is cool, put the new proc on the active process list.
1049          * It is already on the pid list and in /proc.
1050          * Increment the per uid process count (upcount).
1051          */
1052         nproc++;
1053         upcount_inc(ruid, zoneid);
1054 
1055         cp->p_next = practive;
1056         practive->p_prev = cp;
1057         practive = cp;
1058 
1059         cp->p_ignore = pp->p_ignore;
1060         cp->p_siginfo = pp->p_siginfo;
1061         cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
1062         cp->p_sessp = pp->p_sessp;
1063         sess_hold(pp);
1064         cp->p_brand = pp->p_brand;
1065         if (PROC_IS_BRANDED(pp))
1066                 BROP(pp)->b_copy_procdata(cp, pp);
1067         cp->p_bssbase = pp->p_bssbase;
1068         cp->p_brkbase = pp->p_brkbase;
1069         cp->p_brksize = pp->p_brksize;
1070         cp->p_brkpageszc = pp->p_brkpageszc;
1071         cp->p_stksize = pp->p_stksize;
1072         cp->p_stkpageszc = pp->p_stkpageszc;
1073         cp->p_stkprot = pp->p_stkprot;
1074         cp->p_datprot = pp->p_datprot;
1075         cp->p_usrstack = pp->p_usrstack;
1076         cp->p_model = pp->p_model;
1077         cp->p_ppid = pp->p_pid;
1078         cp->p_ancpid = pp->p_pid;
1079         cp->p_portcnt = pp->p_portcnt;
1080 
1081         /*
1082          * Initialize watchpoint structures
1083          */
1084         avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
1085             offsetof(struct watched_area, wa_link));
1086 
1087         /*
1088          * Initialize immediate resource control values.
1089          */
1090         cp->p_stk_ctl = pp->p_stk_ctl;
1091         cp->p_fsz_ctl = pp->p_fsz_ctl;
1092         cp->p_vmem_ctl = pp->p_vmem_ctl;
1093         cp->p_fno_ctl = pp->p_fno_ctl;
1094 
1095         /*
1096          * Link up to parent-child-sibling chain.  No need to lock
1097          * in general since only a call to freeproc() (done by the
1098          * same parent as newproc()) diddles with the child chain.
1099          */
1100         cp->p_sibling = pp->p_child;
1101         if (pp->p_child)
1102                 pp->p_child->p_psibling = cp;
1103 
1104         cp->p_parent = pp;
1105         pp->p_child = cp;
1106 
1107         cp->p_child_ns = NULL;
1108         cp->p_sibling_ns = NULL;
1109 
1110         cp->p_nextorph = pp->p_orphan;
1111         cp->p_nextofkin = pp;
1112         pp->p_orphan = cp;
1113 
1114         /*
1115          * Inherit profiling state; do not inherit REALPROF profiling state.
1116          */
1117         cp->p_prof = pp->p_prof;
1118         cp->p_rprof_cyclic = CYCLIC_NONE;
1119 
1120         /*
1121          * Inherit pool pointer from the parent.  Kernel processes are
1122          * always bound to the default pool.
1123          */
1124         mutex_enter(&pp->p_lock);
1125         if (flags & GETPROC_KERNEL) {
1126                 cp->p_pool = pool_default;
1127                 cp->p_flag |= SSYS;
1128         } else {
1129                 cp->p_pool = pp->p_pool;
1130         }
1131         atomic_inc_32(&cp->p_pool->pool_ref);
1132         mutex_exit(&pp->p_lock);
1133 
1134         /*
1135          * Add the child process to the current task.  Kernel processes
1136          * are always attached to task0.
1137          */
1138         mutex_enter(&cp->p_lock);
1139         if (flags & GETPROC_KERNEL)
1140                 task_attach(task0p, cp);
1141         else
1142                 task_attach(pp->p_task, cp);
1143         mutex_exit(&cp->p_lock);
1144         mutex_exit(&pidlock);
1145 
1146         avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
1147             offsetof(contract_t, ct_ctlist));
1148 
1149         /*
1150          * Duplicate any audit information kept in the process table
1151          */
1152         if (audit_active)       /* copy audit data to cp */
1153                 audit_newproc(cp);
1154 
1155         crhold(cp->p_cred = cr);
1156 
1157         /*
1158          * Bump up the counts on the file structures pointed at by the
1159          * parent's file table since the child will point at them too.
1160          */
1161         fcnt_add(P_FINFO(pp), 1);
1162 
1163         if (PTOU(pp)->u_cdir) {
1164                 VN_HOLD(PTOU(pp)->u_cdir);
1165         } else {
1166                 ASSERT(pp == &p0);
1167                 /*
1168                  * We must be at or before vfs_mountroot(); it will take care of
1169                  * assigning our current directory.
1170                  */
1171         }
1172         if (PTOU(pp)->u_rdir)
1173                 VN_HOLD(PTOU(pp)->u_rdir);
1174         if (PTOU(pp)->u_cwd)
1175                 refstr_hold(PTOU(pp)->u_cwd);
1176 
1177         /*
1178          * copy the parent's uarea.
1179          */
1180         uarea = PTOU(cp);
1181         bcopy(PTOU(pp), uarea, sizeof (*uarea));
1182         flist_fork(P_FINFO(pp), P_FINFO(cp));
1183 
1184         gethrestime(&uarea->u_start);
1185         uarea->u_ticks = ddi_get_lbolt();
1186         uarea->u_mem = rm_asrss(pp->p_as);
1187         uarea->u_acflag = AFORK;
1188 
1189         /*
1190          * If inherit-on-fork, copy /proc tracing flags to child.
1191          */
1192         if ((pp->p_proc_flag & P_PR_FORK) != 0) {
1193                 cp->p_proc_flag |= pp->p_proc_flag & (P_PR_TRACE|P_PR_FORK);
1194                 cp->p_sigmask = pp->p_sigmask;
1195                 cp->p_fltmask = pp->p_fltmask;
1196         } else {
1197                 sigemptyset(&cp->p_sigmask);
1198                 premptyset(&cp->p_fltmask);
1199                 uarea->u_systrap = 0;
1200                 premptyset(&uarea->u_entrymask);
1201                 premptyset(&uarea->u_exitmask);
1202         }
1203         /*
1204          * If microstate accounting is being inherited, mark child
1205          */
1206         if ((pp->p_flag & SMSFORK) != 0)
1207                 cp->p_flag |= pp->p_flag & (SMSFORK|SMSACCT);
1208 
1209         /*
1210          * Inherit fixalignment flag from the parent
1211          */
1212         cp->p_fixalignment = pp->p_fixalignment;
1213 
1214         *cpp = cp;
1215         return (0);
1216 
1217 bad:
1218         ASSERT(MUTEX_NOT_HELD(&pidlock));
1219 
1220         mutex_destroy(&cp->p_crlock);
1221         mutex_destroy(&cp->p_pflock);
1222 #if defined(__x86)
1223         mutex_destroy(&cp->p_ldtlock);
1224 #endif
1225         if (newpid != -1) {
1226                 proc_entry_free(cp->p_pidp);
1227                 (void) pid_rele(cp->p_pidp);
1228         }
1229         kmem_cache_free(process_cache, cp);
1230 
1231         mutex_enter(&zone->zone_nlwps_lock);
1232         task->tk_nprocs--;
1233         proj->kpj_nprocs--;
1234         zone->zone_nprocs--;
1235         mutex_exit(&zone->zone_nlwps_lock);
1236 
1237 punish:
1238         /*
1239          * We most likely got into this situation because some process is
1240          * forking out of control.  As punishment, put it to sleep for a
1241          * bit so it can't eat the machine alive.  Sleep interval is chosen
1242          * to allow no more than one fork failure per cpu per clock tick
1243          * on average (yes, I just made this up).  This has two desirable
1244          * properties: (1) it sets a constant limit on the fork failure
1245          * rate, and (2) the busier the system is, the harsher the penalty
1246          * for abusing it becomes.
1247          */
1248         INCR_COUNT(&fork_fail_pending, &pidlock);
1249         delay(fork_fail_pending / ncpus + 1);
1250         DECR_COUNT(&fork_fail_pending, &pidlock);
1251 
1252         return (-1); /* out of memory or proc slots */
1253 }
1254 
1255 /*
1256  * Release virtual memory.
1257  * In the case of vfork(), the child was given exclusive access to its
1258  * parent's address space.  The parent is waiting in vfwait() for the
1259  * child to release its exclusive claim via relvm().
1260  */
1261 void
1262 relvm()
1263 {
1264         proc_t *p = curproc;
1265 
1266         ASSERT((unsigned)p->p_lwpcnt <= 1);
1267 
1268         prrelvm();      /* inform /proc */
1269 
1270         if (p->p_flag & SVFORK) {
1271                 proc_t *pp = p->p_parent;
1272                 /*
1273                  * The child process is either exec'ing or exit'ing.
1274                  * The child is now separated from the parent's address
1275                  * space.  The parent process is made dispatchable.
1276                  *
1277                  * This is a delicate locking maneuver, involving
1278                  * both the parent's p_lock and the child's p_lock.
1279                  * As soon as the SVFORK flag is turned off, the
1280                  * parent is free to run, but it must not run until
1281                  * we wake it up using its p_cv because it might
1282                  * exit and we would be referencing invalid memory.
1283                  * Therefore, we hold the parent with its p_lock
1284                  * while protecting our p_flags with our own p_lock.
1285                  */
1286 try_again:
1287                 mutex_enter(&p->p_lock); /* grab child's lock first */
1288                 prbarrier(p);           /* make sure /proc is blocked out */
1289                 mutex_enter(&pp->p_lock);
1290 
1291                 /*
1292                  * Check if parent is locked by /proc.
1293                  */
1294                 if (pp->p_proc_flag & P_PR_LOCK) {
1295                         /*
1296                          * Delay until /proc is done with the parent.
1297                          * We must drop our (the child's) p->p_lock, wait
1298                          * via prbarrier() on the parent, then start over.
1299                          */
1300                         mutex_exit(&p->p_lock);
1301                         prbarrier(pp);
1302                         mutex_exit(&pp->p_lock);
1303                         goto try_again;
1304                 }
1305                 p->p_flag &= ~SVFORK;
1306                 kpreempt_disable();
1307                 p->p_as = &kas;
1308 
1309                 /*
1310                  * notify hat of change in thread's address space
1311                  */
1312                 hat_thread_exit(curthread);
1313                 kpreempt_enable();
1314 
1315                 /*
1316                  * child sizes are copied back to parent because
1317                  * child may have grown.
1318                  */
1319                 pp->p_brkbase = p->p_brkbase;
1320                 pp->p_brksize = p->p_brksize;
1321                 pp->p_stksize = p->p_stksize;
1322 
1323                 /*
1324                  * Copy back the shm accounting information
1325                  * to the parent process.
1326                  */
1327                 pp->p_segacct = p->p_segacct;
1328                 p->p_segacct = NULL;
1329 
1330                 /*
1331                  * The parent is no longer waiting for the vfork()d child.
1332                  * Restore the parent's watched pages, if any.  This is
1333                  * safe because we know the parent is not locked by /proc
1334                  */
1335                 pp->p_flag &= ~SVFWAIT;
1336                 if (avl_numnodes(&pp->p_wpage) != 0) {
1337                         pp->p_as->a_wpage = pp->p_wpage;
1338                         avl_create(&pp->p_wpage, wp_compare,
1339                             sizeof (struct watched_page),
1340                             offsetof(struct watched_page, wp_link));
1341                 }
1342                 cv_signal(&pp->p_cv);
1343                 mutex_exit(&pp->p_lock);
1344                 mutex_exit(&p->p_lock);
1345         } else {
1346                 if (p->p_as != &kas) {
1347                         struct as *as;
1348 
1349                         if (p->p_segacct)
1350                                 shmexit(p);
1351 
1352                         /*
1353                          * We grab p_lock for the benefit of /proc
1354                          */
1355                         kpreempt_disable();
1356                         mutex_enter(&p->p_lock);
1357                         prbarrier(p);   /* make sure /proc is blocked out */
1358                         as = p->p_as;
1359                         p->p_as = &kas;
1360                         mutex_exit(&p->p_lock);
1361 
1362                         /*
1363                          * notify hat of change in thread's address space
1364                          */
1365                         hat_thread_exit(curthread);
1366                         kpreempt_enable();
1367 
1368                         as_free(as);
1369                         p->p_tr_lgrpid = LGRP_NONE;
1370                 }
1371         }
1372 }
1373 
1374 /*
1375  * Wait for child to exec or exit.
1376  * Called by parent of vfork'ed process.
1377  * See important comments in relvm(), above.
1378  */
1379 void
1380 vfwait(pid_t pid)
1381 {
1382         int signalled = 0;
1383         proc_t *pp = ttoproc(curthread);
1384         proc_t *cp;
1385 
1386         /*
1387          * Wait for child to exec or exit.
1388          */
1389         for (;;) {
1390                 mutex_enter(&pidlock);
1391                 cp = prfind(pid);
1392                 if (cp == NULL || cp->p_parent != pp) {
1393                         /*
1394                          * Child has exit()ed.
1395                          */
1396                         mutex_exit(&pidlock);
1397                         break;
1398                 }
1399                 /*
1400                  * Grab the child's p_lock before releasing pidlock.
1401                  * Otherwise, the child could exit and we would be
1402                  * referencing invalid memory.
1403                  */
1404                 mutex_enter(&cp->p_lock);
1405                 mutex_exit(&pidlock);
1406                 if (!(cp->p_flag & SVFORK)) {
1407                         /*
1408                          * Child has exec()ed or is exit()ing.
1409                          */
1410                         mutex_exit(&cp->p_lock);
1411                         break;
1412                 }
1413                 mutex_enter(&pp->p_lock);
1414                 mutex_exit(&cp->p_lock);
1415                 /*
1416                  * We might be waked up spuriously from the cv_wait().
1417                  * We have to do the whole operation over again to be
1418                  * sure the child's SVFORK flag really is turned off.
1419                  * We cannot make reference to the child because it can
1420                  * exit before we return and we would be referencing
1421                  * invalid memory.
1422                  *
1423                  * Because this is potentially a very long-term wait,
1424                  * we call cv_wait_sig() (for its jobcontrol and /proc
1425                  * side-effects) unless there is a current signal, in
1426                  * which case we use cv_wait() because we cannot return
1427                  * from this function until the child has released the
1428                  * address space.  Calling cv_wait_sig() with a current
1429                  * signal would lead to an indefinite loop here because
1430                  * cv_wait_sig() returns immediately in this case.
1431                  */
1432                 if (signalled)
1433                         cv_wait(&pp->p_cv, &pp->p_lock);
1434                 else
1435                         signalled = !cv_wait_sig(&pp->p_cv, &pp->p_lock);
1436                 mutex_exit(&pp->p_lock);
1437         }
1438 
1439         /* restore watchpoints to parent */
1440         if (pr_watch_active(pp)) {
1441                 struct as *as = pp->p_as;
1442                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1443                 as_setwatch(as);
1444                 AS_LOCK_EXIT(as, &as->a_lock);
1445         }
1446 
1447         mutex_enter(&pp->p_lock);
1448         prbarrier(pp);  /* barrier against /proc locking */
1449         continuelwps(pp);
1450         mutex_exit(&pp->p_lock);
1451 }