1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/systm.h>
  32 #include <sys/cred.h>
  33 #include <sys/user.h>
  34 #include <sys/errno.h>
  35 #include <sys/proc.h>
  36 #include <sys/ucontext.h>
  37 #include <sys/procfs.h>
  38 #include <sys/vnode.h>
  39 #include <sys/acct.h>
  40 #include <sys/var.h>
  41 #include <sys/cmn_err.h>
  42 #include <sys/debug.h>
  43 #include <sys/wait.h>
  44 #include <sys/siginfo.h>
  45 #include <sys/procset.h>
  46 #include <sys/class.h>
  47 #include <sys/file.h>
  48 #include <sys/session.h>
  49 #include <sys/kmem.h>
  50 #include <sys/vtrace.h>
  51 #include <sys/prsystm.h>
  52 #include <sys/ipc.h>
  53 #include <sys/sem_impl.h>
  54 #include <c2/audit.h>
  55 #include <sys/aio_impl.h>
  56 #include <vm/as.h>
  57 #include <sys/poll.h>
  58 #include <sys/door.h>
  59 #include <sys/lwpchan_impl.h>
  60 #include <sys/utrap.h>
  61 #include <sys/task.h>
  62 #include <sys/exacct.h>
  63 #include <sys/cyclic.h>
  64 #include <sys/schedctl.h>
  65 #include <sys/rctl.h>
  66 #include <sys/contract_impl.h>
  67 #include <sys/contract/process_impl.h>
  68 #include <sys/list.h>
  69 #include <sys/dtrace.h>
  70 #include <sys/pool.h>
  71 #include <sys/sdt.h>
  72 #include <sys/corectl.h>
  73 #include <sys/brand.h>
  74 #include <sys/libc_kernel.h>
  75 
  76 /*
  77  * convert code/data pair into old style wait status
  78  */
  79 int
  80 wstat(int code, int data)
  81 {
  82         int stat = (data & 0377);
  83 
  84         switch (code) {
  85         case CLD_EXITED:
  86                 stat <<= 8;
  87                 break;
  88         case CLD_DUMPED:
  89                 stat |= WCOREFLG;
  90                 break;
  91         case CLD_KILLED:
  92                 break;
  93         case CLD_TRAPPED:
  94         case CLD_STOPPED:
  95                 stat <<= 8;
  96                 stat |= WSTOPFLG;
  97                 break;
  98         case CLD_CONTINUED:
  99                 stat = WCONTFLG;
 100                 break;
 101         default:
 102                 cmn_err(CE_PANIC, "wstat: bad code");
 103                 /* NOTREACHED */
 104         }
 105         return (stat);
 106 }
 107 
 108 static char *
 109 exit_reason(char *buf, size_t bufsz, int what, int why)
 110 {
 111         switch (why) {
 112         case CLD_EXITED:
 113                 (void) snprintf(buf, bufsz, "exited with status %d", what);
 114                 break;
 115         case CLD_KILLED:
 116                 (void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
 117                 break;
 118         case CLD_DUMPED:
 119                 (void) snprintf(buf, bufsz, "core dumped on signal %d", what);
 120                 break;
 121         default:
 122                 (void) snprintf(buf, bufsz, "encountered unknown error "
 123                     "(%d, %d)", why, what);
 124                 break;
 125         }
 126 
 127         return (buf);
 128 }
 129 
 130 /*
 131  * exit system call: pass back caller's arg.
 132  */
 133 void
 134 rexit(int rval)
 135 {
 136         exit(CLD_EXITED, rval);
 137 }
 138 
 139 /*
 140  * Called by proc_exit() when a zone's init exits, presumably because
 141  * it failed.  As long as the given zone is still in the "running"
 142  * state, we will re-exec() init, but first we need to reset things
 143  * which are usually inherited across exec() but will break init's
 144  * assumption that it is being exec()'d from a virgin process.  Most
 145  * importantly this includes closing all file descriptors (exec only
 146  * closes those marked close-on-exec) and resetting signals (exec only
 147  * resets handled signals, and we need to clear any signals which
 148  * killed init).  Anything else that exec(2) says would be inherited,
 149  * but would affect the execution of init, needs to be reset.
 150  */
 151 static int
 152 restart_init(int what, int why)
 153 {
 154         kthread_t *t = curthread;
 155         klwp_t *lwp = ttolwp(t);
 156         proc_t *p = ttoproc(t);
 157         user_t *up = PTOU(p);
 158 
 159         vnode_t *oldcd, *oldrd;
 160         int i, err;
 161         char reason_buf[64];
 162 
 163         /*
 164          * Let zone admin (and global zone admin if this is for a non-global
 165          * zone) know that init has failed and will be restarted.
 166          */
 167         zcmn_err(p->p_zone->zone_id, CE_WARN,
 168             "init(1M) %s: restarting automatically",
 169             exit_reason(reason_buf, sizeof (reason_buf), what, why));
 170 
 171         if (!INGLOBALZONE(p)) {
 172                 cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
 173                     "restarting automatically",
 174                     p->p_zone->zone_name, p->p_pid, reason_buf);
 175         }
 176 
 177         /*
 178          * Remove any fpollinfo_t's for this (last) thread from our file
 179          * descriptors so closeall() can ASSERT() that they're all gone.
 180          * Then close all open file descriptors in the process.
 181          */
 182         pollcleanup();
 183         closeall(P_FINFO(p));
 184 
 185         /*
 186          * Grab p_lock and begin clearing miscellaneous global process
 187          * state that needs to be reset before we exec the new init(1M).
 188          */
 189 
 190         mutex_enter(&p->p_lock);
 191         prbarrier(p);
 192 
 193         p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
 194         up->u_cmask = CMASK;
 195 
 196         sigemptyset(&t->t_hold);
 197         sigemptyset(&t->t_sig);
 198         sigemptyset(&t->t_extsig);
 199 
 200         sigemptyset(&p->p_sig);
 201         sigemptyset(&p->p_extsig);
 202 
 203         sigdelq(p, t, 0);
 204         sigdelq(p, NULL, 0);
 205 
 206         if (p->p_killsqp) {
 207                 siginfofree(p->p_killsqp);
 208                 p->p_killsqp = NULL;
 209         }
 210 
 211         /*
 212          * Reset any signals that are ignored back to the default disposition.
 213          * Other u_signal members will be cleared when exec calls sigdefault().
 214          */
 215         for (i = 1; i < NSIG; i++) {
 216                 if (up->u_signal[i - 1] == SIG_IGN) {
 217                         up->u_signal[i - 1] = SIG_DFL;
 218                         sigemptyset(&up->u_sigmask[i - 1]);
 219                 }
 220         }
 221 
 222         /*
 223          * Clear the current signal, any signal info associated with it, and
 224          * any signal information from contracts and/or contract templates.
 225          */
 226         lwp->lwp_cursig = 0;
 227         lwp->lwp_extsig = 0;
 228         if (lwp->lwp_curinfo != NULL) {
 229                 siginfofree(lwp->lwp_curinfo);
 230                 lwp->lwp_curinfo = NULL;
 231         }
 232         lwp_ctmpl_clear(lwp);
 233 
 234         /*
 235          * Reset both the process root directory and the current working
 236          * directory to the root of the zone just as we do during boot.
 237          */
 238         VN_HOLD(p->p_zone->zone_rootvp);
 239         oldrd = up->u_rdir;
 240         up->u_rdir = p->p_zone->zone_rootvp;
 241 
 242         VN_HOLD(p->p_zone->zone_rootvp);
 243         oldcd = up->u_cdir;
 244         up->u_cdir = p->p_zone->zone_rootvp;
 245 
 246         if (up->u_cwd != NULL) {
 247                 refstr_rele(up->u_cwd);
 248                 up->u_cwd = NULL;
 249         }
 250 
 251         mutex_exit(&p->p_lock);
 252 
 253         if (oldrd != NULL)
 254                 VN_RELE(oldrd);
 255         if (oldcd != NULL)
 256                 VN_RELE(oldcd);
 257 
 258         /* Free the controlling tty.  (freectty() always assumes curproc.) */
 259         ASSERT(p == curproc);
 260         (void) freectty(B_TRUE);
 261 
 262         /*
 263          * Now exec() the new init(1M) on top of the current process.  If we
 264          * succeed, the caller will treat this like a successful system call.
 265          * If we fail, we issue messages and the caller will proceed with exit.
 266          */
 267         err = exec_init(p->p_zone->zone_initname, NULL);
 268 
 269         if (err == 0)
 270                 return (0);
 271 
 272         zcmn_err(p->p_zone->zone_id, CE_WARN,
 273             "failed to restart init(1M) (err=%d): system reboot required", err);
 274 
 275         if (!INGLOBALZONE(p)) {
 276                 cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
 277                     "(pid %d, err=%d): zoneadm(1M) boot required",
 278                     p->p_zone->zone_name, p->p_pid, err);
 279         }
 280 
 281         return (-1);
 282 }
 283 
 284 /*
 285  * Release resources.
 286  * Enter zombie state.
 287  * Wake up parent and init processes,
 288  * and dispose of children.
 289  */
 290 void
 291 exit(int why, int what)
 292 {
 293         /*
 294          * If proc_exit() fails, then some other lwp in the process
 295          * got there first.  We just have to call lwp_exit() to allow
 296          * the other lwp to finish exiting the process.  Otherwise we're
 297          * restarting init, and should return.
 298          */
 299         if (proc_exit(why, what) != 0) {
 300                 mutex_enter(&curproc->p_lock);
 301                 ASSERT(curproc->p_flag & SEXITLWPS);
 302                 lwp_exit();
 303                 /* NOTREACHED */
 304         }
 305 }
 306 
 307 /*
 308  * Set the SEXITING flag on the process, after making sure /proc does
 309  * not have it locked.  This is done in more places than proc_exit(),
 310  * so it is a separate function.
 311  */
 312 void
 313 proc_is_exiting(proc_t *p)
 314 {
 315         mutex_enter(&p->p_lock);
 316         prbarrier(p);
 317         p->p_flag |= SEXITING;
 318         mutex_exit(&p->p_lock);
 319 }
 320 
 321 /*
      * Perform exit processing for the calling process, on behalf of the
      * lwp that is doing the exit.
      *
      * why is a CLD_* code and what is the value that goes with it (an exit
      * status or a signal number).  The pair is converted with wstat() for
      * accounting and is stored in p_wcode/p_wdata of the zombie.
      *
 322  * Return value:
 323  *   1 - exitlwps() failed, call (or continue) lwp_exit()
 324  *   0 - restarting init.  Return through system call path
      * In every other case this function does not return; it ends in
      * thread_exit().
 325  */
 326 int
 327 proc_exit(int why, int what)
 328 {
 329         kthread_t *t = curthread;
 330         klwp_t *lwp = ttolwp(t);
 331         proc_t *p = ttoproc(t);
 332         zone_t *z = p->p_zone;
 333         timeout_id_t tmp_id;
 334         int rv;
 335         proc_t *q;
 336         task_t *tk;
 337         vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
 338         sigqueue_t *sqp;
 339         lwpdir_t *lwpdir;
 340         uint_t lwpdir_sz;
 341         tidhash_t *tidhash;
 342         uint_t tidhash_sz;
 343         ret_tidhash_t *ret_tidhash;
 344         refstr_t *cwd;
 345         hrtime_t hrutime, hrstime;
 346         int evaporate;
 347
 348         /*
 349          * Stop and discard the process's lwps except for the current one,
 350          * unless some other lwp beat us to it.  If exitlwps() fails then
 351          * return and the calling lwp will call (or continue in) lwp_exit().
 352          */
 353         proc_is_exiting(p);
 354         if (exitlwps(0) != 0)
 355                 return (1);
 356
 357         mutex_enter(&p->p_lock);
 358         if (p->p_ttime > 0) {
 359                 /*
 360                  * Account any remaining ticks charged to this process
 361                  * on its way out.
 362                  */
 363                 (void) task_cpu_time_incr(p->p_task, p->p_ttime);
 364                 p->p_ttime = 0;
 365         }
 366         mutex_exit(&p->p_lock);
 367
 368         DTRACE_PROC(lwp__exit);
 369         DTRACE_PROC1(exit, int, why);
 370
 371         /*
 372          * Will perform any brand specific proc exit processing, since this
 373          * is always the last lwp, will also perform lwp_exit and free brand
 374          * data
 375          */
 376         if (PROC_IS_BRANDED(p)) {
 377                 lwp_detach_brand_hdlrs(lwp);
 378                 brand_clearbrand(p, B_FALSE);
 379         }
 380
 381         /*
 382          * Don't let init exit unless zone_start_init() failed its exec, or
 383          * we are shutting down the zone or the machine.
 384          *
 385          * Since we are single threaded, we don't need to lock the
 386          * following accesses to zone_proc_initpid.
 387          */
 388         if (p->p_pid == z->zone_proc_initpid) {
 389                 if (z->zone_boot_err == 0 &&
 390                     zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
 391                     zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN &&
 392                     z->zone_restart_init == B_TRUE &&
 393                     restart_init(what, why) == 0)
 394                         return (0);
 395                 /*
 396                  * Since we didn't or couldn't restart init, we clear
 397                  * the zone's init state and proceed with exit
 398                  * processing.
 399                  */
 400                 z->zone_proc_initpid = -1;
 401         }
 402
 403         lwp_pcb_exit();
 404
 405         /*
 406          * Allocate a sigqueue now, before we grab locks.
 407          * It will be given to sigcld(), below.
 408          * Special case:  If we will be making the process disappear
 409          * without a trace because it is either:
 410          *      * an exiting SSYS process, or
 411          *      * a posix_spawn() vfork child who requests it,
 412          * we don't bother to allocate a useless sigqueue.
              *
              * Note that sqp is left unset when evaporate is true; it is
              * only read in the !evaporate branch near the end of this
              * function.
 413          */
 414         evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) &&
 415             why == CLD_EXITED && what == _EVAPORATE);
 416         if (!evaporate)
 417                 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 418
 419         /*
 420          * revoke any doors created by the process.
 421          */
 422         if (p->p_door_list)
 423                 door_exit();
 424
 425         /*
 426          * Release schedctl data structures.
 427          */
 428         if (p->p_pagep)
 429                 schedctl_proc_cleanup();
 430
 431         /*
 432          * make sure all pending kaio has completed.
 433          */
 434         if (p->p_aio)
 435                 aio_cleanup_exit();
 436
 437         /*
 438          * discard the lwpchan cache.
 439          */
 440         if (p->p_lcp != NULL)
 441                 lwpchan_destroy_cache(0);
 442
 443         /*
 444          * Clean up any DTrace helper actions or probes for the process.
 445          */
 446         if (p->p_dtrace_helpers != NULL) {
 447                 ASSERT(dtrace_helpers_cleanup != NULL);
 448                 (*dtrace_helpers_cleanup)();
 449         }
 450
 451         /* untimeout the realtime timers */
 452         if (p->p_itimer != NULL)
 453                 timer_exit();
 454
             /* Cancel the timeout recorded in p_alarmid, if there is one. */
 455         if ((tmp_id = p->p_alarmid) != 0) {
 456                 p->p_alarmid = 0;
 457                 (void) untimeout(tmp_id);
 458         }
 459
 460         /*
 461          * Remove any fpollinfo_t's for this (last) thread from our file
 462          * descriptors so closeall() can ASSERT() that they're all gone.
 463          */
 464         pollcleanup();
 465
 466         if (p->p_rprof_cyclic != CYCLIC_NONE) {
 467                 mutex_enter(&cpu_lock);
 468                 cyclic_remove(p->p_rprof_cyclic);
 469                 mutex_exit(&cpu_lock);
 470         }
 471
 472         mutex_enter(&p->p_lock);
 473
 474         /*
 475          * Clean up any DTrace probes associated with this process.
 476          */
 477         if (p->p_dtrace_probes) {
 478                 ASSERT(dtrace_fasttrap_exit_ptr != NULL);
 479                 dtrace_fasttrap_exit_ptr(p);
 480         }
 481
             /*
              * Cancel the timeout recorded in p_itimerid.  p_lock is dropped
              * around untimeout() and p_itimerid is read again once the lock
              * is back, so the loop repeats if a new id showed up while the
              * lock was released.
              */
 482         while ((tmp_id = p->p_itimerid) != 0) {
 483                 p->p_itimerid = 0;
 484                 mutex_exit(&p->p_lock);
 485                 (void) untimeout(tmp_id);
 486                 mutex_enter(&p->p_lock);
 487         }
 488
 489         lwp_cleanup();
 490
 491         /*
 492          * We are about to exit; prevent our resource associations from
 493          * being changed.
 494          */
 495         pool_barrier_enter();
 496
 497         /*
 498          * Block the process against /proc now that we have really
 499          * acquired p->p_lock (to manipulate p_tlist at least).
 500          */
 501         prbarrier(p);
 502
 503         sigfillset(&p->p_ignore);
 504         sigemptyset(&p->p_siginfo);
 505         sigemptyset(&p->p_sig);
 506         sigemptyset(&p->p_extsig);
 507         sigemptyset(&t->t_sig);
 508         sigemptyset(&t->t_extsig);
 509         sigemptyset(&p->p_sigmask);
 510         sigdelq(p, t, 0);
 511         lwp->lwp_cursig = 0;
 512         lwp->lwp_extsig = 0;
 513         p->p_flag &= ~(SKILLED | SEXTKILLED);
 514         if (lwp->lwp_curinfo) {
 515                 siginfofree(lwp->lwp_curinfo);
 516                 lwp->lwp_curinfo = NULL;
 517         }
 518
 519         t->t_proc_flag |= TP_LWPEXIT;
 520         ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
 521         prlwpexit(t);           /* notify /proc */
 522         lwp_hash_out(p, t->t_tid);
 523         prexit(p);
 524
 525         p->p_lwpcnt = 0;
 526         p->p_tlist = NULL;
 527         sigqfree(p);
 528         term_mstate(t);
 529         p->p_mterm = gethrtime();
 530
             /*
              * Detach the executable and its directory vnode from the process
              * while p_lock is held; they are closed and released further
              * down, after relvm().
              */
 531         exec_vp = p->p_exec;
 532         execdir_vp = p->p_execdir;
 533         p->p_exec = NULLVP;
 534         p->p_execdir = NULLVP;
 535         mutex_exit(&p->p_lock);
 536
 537         pr_free_watched_pages(p);
 538
 539         closeall(P_FINFO(p));
 540
 541         /* Free the controlling tty.  (freectty() always assumes curproc.) */
 542         ASSERT(p == curproc);
 543         (void) freectty(B_TRUE);
 544
 545 #if defined(__sparc)
 546         if (p->p_utraps != NULL)
 547                 utrap_free(p);
 548 #endif
 549         if (p->p_semacct)                    /* IPC semaphore exit */
 550                 semexit(p);
             /*
              * rv is the old-style wait status for (why, what).  Its low byte
              * is passed to acct(); the whole value goes to
              * exacct_commit_proc() and, below, to contract_process_exit().
              */
 551         rv = wstat(why, what);
 552
 553         acct(rv & 0xff);
 554         exacct_commit_proc(p, rv);
 555
 556         /*
 557          * Release any resources associated with C2 auditing
 558          */
 559         if (AU_AUDITING()) {
 560                 /*
 561                  * audit exit system call
 562                  */
 563                 audit_exit(why, what);
 564         }
 565
 566         /*
 567          * Free address space.
 568          */
 569         relvm();
 570
 571         if (exec_vp) {
 572                 /*
 573                  * Close this executable which has been opened when the process
 574                  * was created by getproc().
 575                  */
 576                 (void) VOP_CLOSE(exec_vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 577                 VN_RELE(exec_vp);
 578         }
 579         if (execdir_vp)
 580                 VN_RELE(execdir_vp);
 581
 582         /*
 583          * Release held contracts.
 584          */
 585         contract_exit(p);
 586
 587         /*
 588          * Depart our encapsulating process contract.
 589          */
 590         if ((p->p_flag & SSYS) == 0) {
 591                 ASSERT(p->p_ct_process);
 592                 contract_process_exit(p->p_ct_process, p, rv);
 593         }
 594
 595         /*
 596          * Remove pool association, and block if requested by pool_do_bind.
 597          */
 598         mutex_enter(&p->p_lock);
 599         ASSERT(p->p_pool->pool_ref > 0);
 600         atomic_dec_32(&p->p_pool->pool_ref);
 601         p->p_pool = pool_default;
 602         /*
 603          * Now that our address space has been freed and all other threads
 604          * in this process have exited, set the PEXITED pool flag.  This
 605          * tells the pools subsystems to ignore this process if it was
 606          * requested to rebind this process to a new pool.
 607          */
 608         p->p_poolflag |= PEXITED;
 609         pool_barrier_exit();
 610         mutex_exit(&p->p_lock);
 611
             /*
              * pidlock is held from here until after the sigcld()/freeproc()
              * call near the end of this function.
              */
 612         mutex_enter(&pidlock);
 613
 614         /*
 615          * Delete this process from the newstate list of its parent. We
 616          * will put it in the right place in the sigcld in the end.
 617          */
 618         delete_ns(p->p_parent, p);
 619
 620         /*
 621          * Reassign the orphans to the next of kin.
 622          * Don't rearrange init's orphanage.
 623          */
 624         if ((q = p->p_orphan) != NULL && p != proc_init) {
 625
 626                 proc_t *nokp = p->p_nextofkin;
 627
                     /*
                      * Walk our orphan list, pointing every entry at our own
                      * next of kin; q is left at the last entry so that the
                      * whole list can be spliced in front of nokp's orphans.
                      */
 628                 for (;;) {
 629                         q->p_nextofkin = nokp;
 630                         if (q->p_nextorph == NULL)
 631                                 break;
 632                         q = q->p_nextorph;
 633                 }
 634                 q->p_nextorph = nokp->p_orphan;
 635                 nokp->p_orphan = p->p_orphan;
 636                 p->p_orphan = NULL;
 637         }
 638
 639         /*
 640          * Reassign the children to init.
 641          * Don't try to assign init's children to init.
 642          */
 643         if ((q = p->p_child) != NULL && p != proc_init) {
 644                 struct proc     *np;
 645                 struct proc     *initp = proc_init;
 646                 boolean_t       setzonetop = B_FALSE;
 647
 648                 if (!INGLOBALZONE(curproc))
 649                         setzonetop = B_TRUE;
 650
 651                 pgdetach(p);
 652
 653                 do {
 654                         np = q->p_sibling;
 655                         /*
 656                          * Delete it from its current parent new state
 657                          * list and add it to init new state list
 658                          */
 659                         delete_ns(q->p_parent, q);
 660
 661                         q->p_ppid = 1;
 662                         q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
 663                         if (setzonetop) {
 664                                 mutex_enter(&q->p_lock);
 665                                 q->p_flag |= SZONETOP;
 666                                 mutex_exit(&q->p_lock);
 667                         }
 668                         q->p_parent = initp;
 669
 670                         /*
 671                          * Since q will be the first child,
 672                          * it will not have a previous sibling.
 673                          */
 674                         q->p_psibling = NULL;
 675                         if (initp->p_child) {
 676                                 initp->p_child->p_psibling = q;
 677                         }
 678                         q->p_sibling = initp->p_child;
 679                         initp->p_child = q;
                             /* A child that is being ptrace'd is sent SIGKILL. */
 680                         if (q->p_proc_flag & P_PR_PTRACE) {
 681                                 mutex_enter(&q->p_lock);
 682                                 sigtoproc(q, NULL, SIGKILL);
 683                                 mutex_exit(&q->p_lock);
 684                         }
 685                         /*
 686                          * sigcld() will add the child to parents
 687                          * newstate list.
 688                          */
 689                         if (q->p_stat == SZOMB)
 690                                 sigcld(q, NULL);
 691                 } while ((q = np) != NULL);
 692
 693                 p->p_child = NULL;
 694                 ASSERT(p->p_child_ns == NULL);
 695         }
 696
 697         TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);
 698
             /*
              * p_lock is taken here (with pidlock still held) and is not
              * dropped until t_procp has been redirected below.
              */
 699         mutex_enter(&p->p_lock);
 700         CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */
 701
 702         /*
 703          * Have our task accumulate our resource usage data before they
 704          * become contaminated by p_cacct etc., and before we renounce
 705          * membership of the task.
 706          *
 707          * We do this regardless of whether or not task accounting is active.
 708          * This is to avoid having nonsense data reported for this task if
 709          * task accounting is subsequently enabled. The overhead is minimal;
 710          * by this point, this process has accounted for the usage of all its
 711          * LWPs. We nonetheless do the work here, and under the protection of
 712          * pidlock, so that the movement of the process's usage to the task
 713          * happens at the same time as the removal of the process from the
 714          * task, from the point of view of exacct_snapshot_task_usage().
 715          */
 716         exacct_update_task_mstate(p);
 717
             /*
              * From here on the p_c* child totals are folded into the
              * process's own times, microstate accounting and rusage counters.
              */
 718         hrutime = mstate_aggr_state(p, LMS_USER);
 719         hrstime = mstate_aggr_state(p, LMS_SYSTEM);
 720         p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
 721         p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;
 722
 723         p->p_acct[LMS_USER]  += p->p_cacct[LMS_USER];
 724         p->p_acct[LMS_SYSTEM]        += p->p_cacct[LMS_SYSTEM];
 725         p->p_acct[LMS_TRAP]  += p->p_cacct[LMS_TRAP];
 726         p->p_acct[LMS_TFAULT]        += p->p_cacct[LMS_TFAULT];
 727         p->p_acct[LMS_DFAULT]        += p->p_cacct[LMS_DFAULT];
 728         p->p_acct[LMS_KFAULT]        += p->p_cacct[LMS_KFAULT];
 729         p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
 730         p->p_acct[LMS_SLEEP] += p->p_cacct[LMS_SLEEP];
 731         p->p_acct[LMS_WAIT_CPU]      += p->p_cacct[LMS_WAIT_CPU];
 732         p->p_acct[LMS_STOPPED]       += p->p_cacct[LMS_STOPPED];
 733
 734         p->p_ru.minflt       += p->p_cru.minflt;
 735         p->p_ru.majflt       += p->p_cru.majflt;
 736         p->p_ru.nswap        += p->p_cru.nswap;
 737         p->p_ru.inblock      += p->p_cru.inblock;
 738         p->p_ru.oublock      += p->p_cru.oublock;
 739         p->p_ru.msgsnd       += p->p_cru.msgsnd;
 740         p->p_ru.msgrcv       += p->p_cru.msgrcv;
 741         p->p_ru.nsignals += p->p_cru.nsignals;
 742         p->p_ru.nvcsw        += p->p_cru.nvcsw;
 743         p->p_ru.nivcsw       += p->p_cru.nivcsw;
 744         p->p_ru.sysc += p->p_cru.sysc;
 745         p->p_ru.ioch += p->p_cru.ioch;
 746
             /* The process becomes a zombie; record why and what for waiters. */
 747         p->p_stat = SZOMB;
 748         p->p_proc_flag &= ~P_PR_PTRACE;
 749         p->p_wdata = what;
 750         p->p_wcode = (char)why;
 751
             /*
              * Remember the directory vnodes and the cwd string; they are
              * released near the end of this function, after pidlock has
              * been dropped.
              */
 752         cdir = PTOU(p)->u_cdir;
 753         rdir = PTOU(p)->u_rdir;
 754         cwd = PTOU(p)->u_cwd;
 755
 756         ASSERT(cdir != NULL || p->p_parent == &p0);
 757
 758         /*
 759          * Release resource controls, as they are no longer enforceable.
 760          */
 761         rctl_set_free(p->p_rctls);
 762
 763         /*
 764          * Decrement tk_nlwps counter for our task.max-lwps resource control.
 765          * An extended accounting record, if that facility is active, is
 766          * scheduled to be written.  We cannot give up task and project
 767          * membership at this point because that would allow zombies to escape
 768          * from the max-processes resource controls.  Zombies stay in their
 769          * current task and project until the process table slot is released
 770          * in freeproc().
 771          */
 772         tk = p->p_task;
 773
 774         mutex_enter(&p->p_zone->zone_nlwps_lock);
 775         tk->tk_nlwps--;
 776         tk->tk_proj->kpj_nlwps--;
 777         p->p_zone->zone_nlwps--;
 778         mutex_exit(&p->p_zone->zone_nlwps_lock);
 779
 780         /*
 781          * Clear the lwp directory and the lwpid hash table
 782          * now that /proc can't bother us any more.
 783          * We free the memory below, after dropping p->p_lock.
 784          */
 785         lwpdir = p->p_lwpdir;
 786         lwpdir_sz = p->p_lwpdir_sz;
 787         tidhash = p->p_tidhash;
 788         tidhash_sz = p->p_tidhash_sz;
 789         ret_tidhash = p->p_ret_tidhash;
 790         p->p_lwpdir = NULL;
 791         p->p_lwpfree = NULL;
 792         p->p_lwpdir_sz = 0;
 793         p->p_tidhash = NULL;
 794         p->p_tidhash_sz = 0;
 795         p->p_ret_tidhash = NULL;
 796
 797         /*
 798          * If the process has context ops installed, call the exit routine
 799          * on behalf of this last remaining thread. Normally exitpctx() is
 800          * called during thread_exit() or lwp_exit(), but because this is the
 801          * last thread in the process, we must call it here. By the time
 802          * thread_exit() is called (below), the association with the relevant
 803          * process has been lost.
 804          *
 805          * We also free the context here.
 806          */
 807         if (p->p_pctx) {
 808                 kpreempt_disable();
 809                 exitpctx(p);
 810                 kpreempt_enable();
 811
 812                 freepctx(p, 0);
 813         }
 814
 815         /*
 816          * curthread's proc pointer is changed to point to the 'sched'
 817          * process for the corresponding zone, except in the case when
 818          * the exiting process is in fact a zsched instance, in which
 819          * case the proc pointer is set to p0.  We do so, so that the
 820          * process still points at the right zone when we call the VN_RELE()
 821          * below.
 822          *
 823          * This is because curthread's original proc pointer can be freed as
 824          * soon as the child sends a SIGCLD to its parent.  We use zsched so
 825          * that for user processes, even in the final moments of death, the
 826          * process is still associated with its zone.
 827          */
 828         if (p != t->t_procp->p_zone->zone_zsched)
 829                 t->t_procp = t->t_procp->p_zone->zone_zsched;
 830         else
 831                 t->t_procp = &p0;
 832
 833         mutex_exit(&p->p_lock);
 834         if (!evaporate) {
 835                 p->p_pidflag &= ~CLDPEND;
 836                 sigcld(p, sqp);
 837         } else {
 838                 /*
 839                  * Do what sigcld() would do if the disposition
 840                  * of the SIGCHLD signal were set to be ignored.
 841                  */
 842                 cv_broadcast(&p->p_srwchan_cv);
 843                 freeproc(p);
 844         }
 845         mutex_exit(&pidlock);
 846
 847         /*
 848          * We don't release u_cdir and u_rdir until SZOMB is set.
 849          * This protects us against dofusers().
 850          */
 851         if (cdir)
 852                 VN_RELE(cdir);
 853         if (rdir)
 854                 VN_RELE(rdir);
 855         if (cwd)
 856                 refstr_rele(cwd);
 857
 858         /*
 859          * task_rele() may ultimately cause the zone to go away (or
 860          * may cause the last user process in a zone to go away, which
 861          * signals zsched to go away).  So prior to this call, we must
 862          * no longer point at zsched.
 863          */
 864         t->t_procp = &p0;
 865
             /*
              * Free the lwp directory and tid hash memory that was detached
              * from the process under p_lock above, including every table on
              * the ret_tidhash list.
              */
 866         kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
 867         kmem_free(tidhash, tidhash_sz * sizeof (tidhash_t));
 868         while (ret_tidhash != NULL) {
 869                 ret_tidhash_t *next = ret_tidhash->rth_next;
 870                 kmem_free(ret_tidhash->rth_tidhash,
 871                     ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
 872                 kmem_free(ret_tidhash, sizeof (*ret_tidhash));
 873                 ret_tidhash = next;
 874         }
 875
 876         thread_exit();
 877         /* NOTREACHED */
 878 }
 879 
 880 /*
 881  * Format siginfo structure for wait system calls.
 882  */
 883 void
 884 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
 885 {
 886         ASSERT(MUTEX_HELD(&pidlock));
 887 
 888         bzero(ip, sizeof (k_siginfo_t));
 889         ip->si_signo = SIGCLD;
 890         ip->si_code = pp->p_wcode;
 891         ip->si_pid = pp->p_pid;
 892         ip->si_ctid = PRCTID(pp);
 893         ip->si_zoneid = pp->p_zone->zone_id;
 894         ip->si_status = pp->p_wdata;
 895         ip->si_stime = pp->p_stime;
 896         ip->si_utime = pp->p_utime;
 897 
 898         if (waitflag) {
 899                 pp->p_wcode = 0;
 900                 pp->p_wdata = 0;
 901                 pp->p_pidflag &= ~CLDPEND;
 902         }
 903 }
 904 
 905 /*
 906  * Wait system call.
 907  * Search for a terminated (zombie) child,
 908  * finally lay it to rest, and collect its status.
 909  * Look also for stopped children,
 910  * and pass back status from them.
 911  */
 912 int
 913 waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
 914 {
 915         int found;
 916         proc_t *cp, *pp;
 917         int proc_gone;
 918         int waitflag = !(options & WNOWAIT);
 919 
 920         /*
 921          * Obsolete flag, defined here only for binary compatibility
 922          * with old statically linked executables.  Delete this when
 923          * we no longer care about these old and broken applications.
 924          */
 925 #define _WNOCHLD        0400
 926         options &= ~_WNOCHLD;
 927 
 928         if (options == 0 || (options & ~WOPTMASK))
 929                 return (EINVAL);
 930 
 931         switch (idtype) {
 932         case P_PID:
 933         case P_PGID:
 934                 if (id < 0 || id >= maxpid)
 935                         return (EINVAL);
 936                 /* FALLTHROUGH */
 937         case P_ALL:
 938                 break;
 939         default:
 940                 return (EINVAL);
 941         }
 942 
 943         pp = ttoproc(curthread);
 944 
 945         /*
 946          * lock parent mutex so that sibling chain can be searched.
 947          */
 948         mutex_enter(&pidlock);
 949 
 950         /*
 951          * if we are only looking for exited processes and child_ns list
 952          * is empty no reason to look at all children.
 953          */
 954         if (idtype == P_ALL &&
 955             (options & ~WNOWAIT) == (WNOHANG | WEXITED) &&
 956             pp->p_child_ns == NULL) {
 957                 if (pp->p_child) {
 958                         mutex_exit(&pidlock);
 959                         bzero(ip, sizeof (k_siginfo_t));
 960                         return (0);
 961                 }
 962                 mutex_exit(&pidlock);
 963                 return (ECHILD);
 964         }
 965 
 966         while (pp->p_child != NULL) {
 967 
 968                 proc_gone = 0;
 969 
 970                 for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
 971                         if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
 972                                 continue;
 973                         if (idtype == P_PID && id != cp->p_pid)
 974                                 continue;
 975                         if (idtype == P_PGID && id != cp->p_pgrp)
 976                                 continue;
 977 
 978                         switch (cp->p_wcode) {
 979 
 980                         case CLD_TRAPPED:
 981                         case CLD_STOPPED:
 982                         case CLD_CONTINUED:
 983                                 cmn_err(CE_PANIC,
 984                                     "waitid: wrong state %d on the p_newstate"
 985                                     " list", cp->p_wcode);
 986                                 break;
 987 
 988                         case CLD_EXITED:
 989                         case CLD_DUMPED:
 990                         case CLD_KILLED:
 991                                 if (!(options & WEXITED)) {
 992                                         /*
 993                                          * Count how many are already gone
 994                                          * for good.
 995                                          */
 996                                         proc_gone++;
 997                                         break;
 998                                 }
 999                                 if (!waitflag) {
1000                                         winfo(cp, ip, 0);
1001                                 } else {
1002                                         winfo(cp, ip, 1);
1003                                         freeproc(cp);
1004                                 }
1005                                 mutex_exit(&pidlock);
1006                                 if (waitflag) {         /* accept SIGCLD */
1007                                         sigcld_delete(ip);
1008                                         sigcld_repost();
1009                                 }
1010                                 return (0);
1011                         }
1012 
1013                         if (idtype == P_PID)
1014                                 break;
1015                 }
1016 
1017                 /*
1018                  * Wow! None of the threads on the p_sibling_ns list were
1019                  * interesting threads. Check all the kids!
1020                  */
1021                 found = 0;
1022                 for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
1023                         if (idtype == P_PID && id != cp->p_pid)
1024                                 continue;
1025                         if (idtype == P_PGID && id != cp->p_pgrp)
1026                                 continue;
1027 
1028                         switch (cp->p_wcode) {
1029                         case CLD_TRAPPED:
1030                                 if (!(options & WTRAPPED))
1031                                         break;
1032                                 winfo(cp, ip, waitflag);
1033                                 mutex_exit(&pidlock);
1034                                 if (waitflag) {         /* accept SIGCLD */
1035                                         sigcld_delete(ip);
1036                                         sigcld_repost();
1037                                 }
1038                                 return (0);
1039 
1040                         case CLD_STOPPED:
1041                                 if (!(options & WSTOPPED))
1042                                         break;
1043                                 /* Is it still stopped? */
1044                                 mutex_enter(&cp->p_lock);
1045                                 if (!jobstopped(cp)) {
1046                                         mutex_exit(&cp->p_lock);
1047                                         break;
1048                                 }
1049                                 mutex_exit(&cp->p_lock);
1050                                 winfo(cp, ip, waitflag);
1051                                 mutex_exit(&pidlock);
1052                                 if (waitflag) {         /* accept SIGCLD */
1053                                         sigcld_delete(ip);
1054                                         sigcld_repost();
1055                                 }
1056                                 return (0);
1057 
1058                         case CLD_CONTINUED:
1059                                 if (!(options & WCONTINUED))
1060                                         break;
1061                                 winfo(cp, ip, waitflag);
1062                                 mutex_exit(&pidlock);
1063                                 if (waitflag) {         /* accept SIGCLD */
1064                                         sigcld_delete(ip);
1065                                         sigcld_repost();
1066                                 }
1067                                 return (0);
1068 
1069                         case CLD_EXITED:
1070                         case CLD_DUMPED:
1071                         case CLD_KILLED:
1072                                 if (idtype != P_PID &&
1073                                     (cp->p_pidflag & CLDWAITPID))
1074                                         continue;
1075                                 /*
1076                                  * Don't complain if a process was found in
1077                                  * the first loop but we broke out of the loop
1078                                  * because of the arguments passed to us.
1079                                  */
1080                                 if (proc_gone == 0) {
1081                                         cmn_err(CE_PANIC,
1082                                             "waitid: wrong state on the"
1083                                             " p_child list");
1084                                 } else {
1085                                         break;
1086                                 }
1087                         }
1088 
1089                         found++;
1090 
1091                         if (idtype == P_PID)
1092                                 break;
1093                 }
1094 
1095                 /*
1096                  * If we found no interesting processes at all,
1097                  * break out and return ECHILD.
1098                  */
1099                 if (found + proc_gone == 0)
1100                         break;
1101 
1102                 if (options & WNOHANG) {
1103                         mutex_exit(&pidlock);
1104                         bzero(ip, sizeof (k_siginfo_t));
1105                         /*
1106                          * We should set ip->si_signo = SIGCLD,
1107                          * but there is an SVVS test that expects
1108                          * ip->si_signo to be zero in this case.
1109                          */
1110                         return (0);
1111                 }
1112 
1113                 /*
1114                  * If we found no processes of interest that could
1115                  * change state while we wait, we don't wait at all.
1116                  * Get out with ECHILD according to SVID.
1117                  */
1118                 if (found == proc_gone)
1119                         break;
1120 
1121                 if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
1122                         mutex_exit(&pidlock);
1123                         return (EINTR);
1124                 }
1125         }
1126         mutex_exit(&pidlock);
1127         return (ECHILD);
1128 }
1129 
1130 int
1131 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1132 {
1133         int error;
1134         k_siginfo_t info;
1135 
1136         if (error = waitid(idtype, id, &info, options))
1137                 return (set_errno(error));
1138         if (copyout(&info, infop, sizeof (k_siginfo_t)))
1139                 return (set_errno(EFAULT));
1140         return (0);
1141 }
1142 
1143 #ifdef _SYSCALL32_IMPL
1144 
1145 int
1146 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1147 {
1148         int error;
1149         k_siginfo_t info;
1150         siginfo32_t info32;
1151 
1152         if (error = waitid(idtype, id, &info, options))
1153                 return (set_errno(error));
1154         siginfo_kto32(&info, &info32);
1155         if (copyout(&info32, infop, sizeof (info32)))
1156                 return (set_errno(EFAULT));
1157         return (0);
1158 }
1159 
1160 #endif  /* _SYSCALL32_IMPL */
1161 
1162 void
1163 proc_detach(proc_t *p)
1164 {
1165         proc_t *q;
1166 
1167         ASSERT(MUTEX_HELD(&pidlock));
1168 
1169         q = p->p_parent;
1170         ASSERT(q != NULL);
1171 
1172         /*
1173          * Take it off the newstate list of its parent
1174          */
1175         delete_ns(q, p);
1176 
1177         if (q->p_child == p) {
1178                 q->p_child = p->p_sibling;
1179                 /*
1180                  * If the parent has no children, it better not
1181                  * have any with new states either!
1182                  */
1183                 ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1184         }
1185 
1186         if (p->p_sibling) {
1187                 p->p_sibling->p_psibling = p->p_psibling;
1188         }
1189 
1190         if (p->p_psibling) {
1191                 p->p_psibling->p_sibling = p->p_sibling;
1192         }
1193 }
1194 
1195 /*
1196  * Remove zombie children from the process table.
1197  */
1198 void
1199 freeproc(proc_t *p)
1200 {
1201         proc_t *q;
1202         task_t *tk;
1203 
1204         ASSERT(p->p_stat == SZOMB);
1205         ASSERT(p->p_tlist == NULL);
1206         ASSERT(MUTEX_HELD(&pidlock));
1207 
1208         sigdelq(p, NULL, 0);
1209         if (p->p_killsqp) {
1210                 siginfofree(p->p_killsqp);
1211                 p->p_killsqp = NULL;
1212         }
1213 
1214         prfree(p);      /* inform /proc */
1215 
1216         /*
1217          * Don't free the init processes.
1218          * Other dying processes will access it.
1219          */
1220         if (p == proc_init)
1221                 return;
1222 
1223 
1224         /*
1225          * We wait until now to free the cred structure because a
1226          * zombie process's credentials may be examined by /proc.
1227          * No cred locking needed because there are no threads at this point.
1228          */
1229         upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
1230         crfree(p->p_cred);
1231         if (p->p_corefile != NULL) {
1232                 corectl_path_rele(p->p_corefile);
1233                 p->p_corefile = NULL;
1234         }
1235         if (p->p_content != NULL) {
1236                 corectl_content_rele(p->p_content);
1237                 p->p_content = NULL;
1238         }
1239 
1240         if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
1241             (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
1242                 /*
1243                  * This should still do the right thing since p_utime/stime
1244                  * get set to the correct value on process exit, so it
1245                  * should get properly updated
1246                  */
1247                 p->p_nextofkin->p_cutime += p->p_utime;
1248                 p->p_nextofkin->p_cstime += p->p_stime;
1249 
1250                 p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
1251                 p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
1252                 p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
1253                 p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
1254                 p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
1255                 p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
1256                 p->p_nextofkin->p_cacct[LMS_USER_LOCK]
1257                     += p->p_acct[LMS_USER_LOCK];
1258                 p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
1259                 p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
1260                     += p->p_acct[LMS_WAIT_CPU];
1261                 p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];
1262 
1263                 p->p_nextofkin->p_cru.minflt      += p->p_ru.minflt;
1264                 p->p_nextofkin->p_cru.majflt      += p->p_ru.majflt;
1265                 p->p_nextofkin->p_cru.nswap       += p->p_ru.nswap;
1266                 p->p_nextofkin->p_cru.inblock     += p->p_ru.inblock;
1267                 p->p_nextofkin->p_cru.oublock     += p->p_ru.oublock;
1268                 p->p_nextofkin->p_cru.msgsnd      += p->p_ru.msgsnd;
1269                 p->p_nextofkin->p_cru.msgrcv      += p->p_ru.msgrcv;
1270                 p->p_nextofkin->p_cru.nsignals    += p->p_ru.nsignals;
1271                 p->p_nextofkin->p_cru.nvcsw       += p->p_ru.nvcsw;
1272                 p->p_nextofkin->p_cru.nivcsw      += p->p_ru.nivcsw;
1273                 p->p_nextofkin->p_cru.sysc        += p->p_ru.sysc;
1274                 p->p_nextofkin->p_cru.ioch        += p->p_ru.ioch;
1275 
1276         }
1277 
1278         q = p->p_nextofkin;
1279         if (q && q->p_orphan == p)
1280                 q->p_orphan = p->p_nextorph;
1281         else if (q) {
1282                 for (q = q->p_orphan; q; q = q->p_nextorph)
1283                         if (q->p_nextorph == p)
1284                                 break;
1285                 ASSERT(q && q->p_nextorph == p);
1286                 q->p_nextorph = p->p_nextorph;
1287         }
1288 
1289         /*
1290          * The process table slot is being freed, so it is now safe to give up
1291          * task and project membership.
1292          */
1293         mutex_enter(&p->p_lock);
1294         tk = p->p_task;
1295         task_detach(p);
1296         mutex_exit(&p->p_lock);
1297 
1298         proc_detach(p);
1299         pid_exit(p, tk);        /* frees pid and proc structure */
1300 
1301         task_rele(tk);
1302 }
1303 
1304 /*
1305  * Delete process "child" from the newstate list of process "parent"
1306  */
1307 void
1308 delete_ns(proc_t *parent, proc_t *child)
1309 {
1310         proc_t **ns;
1311 
1312         ASSERT(MUTEX_HELD(&pidlock));
1313         ASSERT(child->p_parent == parent);
1314         for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1315                 if (*ns == child) {
1316 
1317                         ASSERT((*ns)->p_parent == parent);
1318 
1319                         *ns = child->p_sibling_ns;
1320                         child->p_sibling_ns = NULL;
1321                         return;
1322                 }
1323         }
1324 }
1325 
1326 /*
1327  * Add process "child" to the new state list of process "parent"
1328  */
1329 void
1330 add_ns(proc_t *parent, proc_t *child)
1331 {
1332         ASSERT(child->p_sibling_ns == NULL);
1333         child->p_sibling_ns = parent->p_child_ns;
1334         parent->p_child_ns = child;
1335 }