1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/param.h>
  27 #include <sys/vmparam.h>
  28 #include <sys/types.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/systm.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/signal.h>
  33 #include <sys/stack.h>
  34 #include <sys/cred.h>
  35 #include <sys/user.h>
  36 #include <sys/debug.h>
  37 #include <sys/errno.h>
  38 #include <sys/proc.h>
  39 #include <sys/var.h>
  40 #include <sys/inline.h>
  41 #include <sys/syscall.h>
  42 #include <sys/ucontext.h>
  43 #include <sys/cpuvar.h>
  44 #include <sys/siginfo.h>
  45 #include <sys/trap.h>
  46 #include <sys/machtrap.h>
  47 #include <sys/sysinfo.h>
  48 #include <sys/procfs.h>
  49 #include <sys/prsystm.h>
  50 #include <sys/fpu/fpusystm.h>
  51 #include <sys/modctl.h>
  52 #include <sys/aio_impl.h>
  53 #include <c2/audit.h>
  54 #include <sys/tnf.h>
  55 #include <sys/tnf_probe.h>
  56 #include <sys/machpcb.h>
  57 #include <sys/privregs.h>
  58 #include <sys/copyops.h>
  59 #include <sys/timer.h>
  60 #include <sys/priv.h>
  61 #include <sys/msacct.h>
  62 
/*
 * Run-time switch for syscall tracing output.  In the code visible in this
 * file it is only tested inside SYSCALLTRACE-conditional sections, so it has
 * no effect unless the kernel is built with SYSCALLTRACE defined.
 */
int syscalltrace = 0;
#ifdef SYSCALLTRACE
/*
 * Held around each group of tracing printf() calls so the output of one
 * traced syscall is not interleaved with another's.
 */
static kmutex_t systrace_lock;          /* syscall tracing lock */
#endif /* SYSCALLTRACE */

/*
 * Forward declaration; the definition is not in this part of the file.
 * save_syscall_args() releases the returned lock with rw_exit().
 */
static krwlock_t *lock_syscall(struct sysent *, uint_t);
  69 
  70 #ifdef _SYSCALL32_IMPL
  71 static struct sysent *
  72 lwp_getsysent(klwp_t *lwp)
  73 {
  74         if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE)
  75                 return (sysent);
  76         return (sysent32);
  77 }
  78 #define LWP_GETSYSENT(lwp)      (lwp_getsysent(lwp))
  79 #else
  80 #define LWP_GETSYSENT(lwp)      (sysent)
  81 #endif
  82 
  83 /*
  84  * Called to restore the lwp's register window just before
  85  * returning to user level (only if the registers have been
  86  * fetched or modified through /proc).
  87  */
  88 /*ARGSUSED1*/
  89 void
  90 xregrestore(klwp_t *lwp, int shared)
  91 {
  92         /*
  93          * If locals+ins were modified by /proc copy them out.
  94          * Also copy to the shared window, if necessary.
  95          */
  96         if (lwp->lwp_pcb.pcb_xregstat == XREGMODIFIED) {
  97                 struct machpcb *mpcb = lwptompcb(lwp);
  98                 caddr_t sp = (caddr_t)lwptoregs(lwp)->r_sp;
  99 
 100                 size_t rwinsize;
 101                 caddr_t rwp;
 102                 int is64;
 103 
 104                 if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) {
 105                         rwinsize = sizeof (struct rwindow);
 106                         rwp = sp + STACK_BIAS;
 107                         is64 = 1;
 108                 } else {
 109                         rwinsize = sizeof (struct rwindow32);
 110                         sp = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)sp;
 111                         rwp = sp;
 112                         is64 = 0;
 113                 }
 114 
 115                 if (is64)
 116                         (void) copyout_nowatch(&lwp->lwp_pcb.pcb_xregs,
 117                             rwp, rwinsize);
 118                 else {
 119                         struct rwindow32 rwindow32;
 120                         int watched;
 121 
 122                         watched = watch_disable_addr(rwp, rwinsize, S_WRITE);
 123                         rwindow_nto32(&lwp->lwp_pcb.pcb_xregs, &rwindow32);
 124                         (void) copyout(&rwindow32, rwp, rwinsize);
 125                         if (watched)
 126                                 watch_enable_addr(rwp, rwinsize, S_WRITE);
 127                 }
 128 
 129                 /* also copy to the user return window */
 130                 mpcb->mpcb_rsp[0] = sp;
 131                 mpcb->mpcb_rsp[1] = NULL;
 132                 bcopy(&lwp->lwp_pcb.pcb_xregs, &mpcb->mpcb_rwin[0],
 133                     sizeof (lwp->lwp_pcb.pcb_xregs));
 134         }
 135         lwp->lwp_pcb.pcb_xregstat = XREGNONE;
 136 }
 137 
 138 
 139 /*
 140  * Get the arguments to the current system call.
 141  *      lwp->lwp_ap normally points to the out regs in the reg structure.
 142  *      If the user is going to change the out registers and might want to
 143  *      get the args (for /proc tracing), it must copy the args elsewhere
 144  *      via save_syscall_args().
 145  */
 146 uint_t
 147 get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
 148 {
 149         kthread_t       *t = lwptot(lwp);
 150         uint_t  code = t->t_sysnum;
 151         long    mask;
 152         long    *ap;
 153         int     nargs;
 154 
 155         if (lwptoproc(lwp)->p_model == DATAMODEL_ILP32)
 156                 mask = (uint32_t)0xffffffffU;
 157         else
 158                 mask = 0xffffffffffffffff;
 159 
 160         if (code != 0 && code < NSYSCALL) {
 161 
 162                 nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
 163 
 164                 ASSERT(nargs <= MAXSYSARGS);
 165 
 166                 *nargsp = nargs;
 167                 ap = lwp->lwp_ap;
 168                 while (nargs-- > 0)
 169                         *argp++ = *ap++ & mask;
 170         } else {
 171                 *nargsp = 0;
 172         }
 173         return (code);
 174 }
 175 
 176 #ifdef _SYSCALL32_IMPL
 177 /*
 178  * Get the arguments to the current 32-bit system call.
 179  */
 180 uint_t
 181 get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
 182 {
 183         long args[MAXSYSARGS];
 184         uint_t i, code;
 185 
 186         code = get_syscall_args(lwp, args, nargsp);
 187         for (i = 0; i != *nargsp; i++)
 188                 *argp++ = (int)args[i];
 189         return (code);
 190 }
 191 #endif
 192 
 193 /*
 194  *      Save the system call arguments in a safe place.
 195  *      lwp->lwp_ap normally points to the out regs in the reg structure.
 196  *      If the user is going to change the out registers, g1, or the stack,
 197  *      and might want to get the args (for /proc tracing), it must copy
 198  *      the args elsewhere via save_syscall_args().
 199  *
 200  *      This may be called from stop() even when we're not in a system call.
 201  *      Since there's no easy way to tell, this must be safe (not panic).
 202  *      If the copyins get data faults, return non-zero.
 203  */
 204 int
 205 save_syscall_args()
 206 {
 207         kthread_t       *t = curthread;
 208         klwp_t          *lwp = ttolwp(t);
 209         struct regs     *rp = lwptoregs(lwp);
 210         uint_t          code = t->t_sysnum;
 211         uint_t          nargs;
 212         int             i;
 213         caddr_t         ua;
 214         model_t         datamodel;
 215 
 216         if (lwp->lwp_argsaved || code == 0)
 217                 return (0);             /* args already saved or not needed */
 218 
 219         if (code >= NSYSCALL) {
 220                 nargs = 0;              /* illegal syscall */
 221         } else {
 222                 struct sysent *se = LWP_GETSYSENT(lwp);
 223                 struct sysent *callp = se + code;
 224 
 225                 nargs = callp->sy_narg;
 226                 if (LOADABLE_SYSCALL(callp) && nargs == 0) {
 227                         krwlock_t       *module_lock;
 228 
 229                         /*
 230                          * Find out how many arguments the system
 231                          * call uses.
 232                          *
 233                          * We have the property that loaded syscalls
 234                          * never change the number of arguments they
 235                          * use after they've been loaded once.  This
 236                          * allows us to stop for /proc tracing without
 237                          * holding the module lock.
 238                          * /proc is assured that sy_narg is valid.
 239                          */
 240                         module_lock = lock_syscall(se, code);
 241                         nargs = callp->sy_narg;
 242                         rw_exit(module_lock);
 243                 }
 244         }
 245 
 246         /*
 247          * Fetch the system call arguments.
 248          */
 249         if (nargs == 0)
 250                 goto out;
 251 
 252 
 253         ASSERT(nargs <= MAXSYSARGS);
 254 
 255         if ((datamodel = lwp_getdatamodel(lwp)) == DATAMODEL_ILP32) {
 256 
 257                 if (rp->r_g1 == 0) { /* indirect syscall */
 258 
 259                         lwp->lwp_arg[0] = (uint32_t)rp->r_o1;
 260                         lwp->lwp_arg[1] = (uint32_t)rp->r_o2;
 261                         lwp->lwp_arg[2] = (uint32_t)rp->r_o3;
 262                         lwp->lwp_arg[3] = (uint32_t)rp->r_o4;
 263                         lwp->lwp_arg[4] = (uint32_t)rp->r_o5;
 264                         if (nargs > 5) {
 265                                 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
 266                                     (rp->r_sp + MINFRAME32);
 267                                 for (i = 5; i < nargs; i++) {
 268                                         uint32_t a;
 269                                         if (fuword32(ua, &a) != 0)
 270                                                 return (-1);
 271                                         lwp->lwp_arg[i] = a;
 272                                         ua += sizeof (a);
 273                                 }
 274                         }
 275                 } else {
 276                         lwp->lwp_arg[0] = (uint32_t)rp->r_o0;
 277                         lwp->lwp_arg[1] = (uint32_t)rp->r_o1;
 278                         lwp->lwp_arg[2] = (uint32_t)rp->r_o2;
 279                         lwp->lwp_arg[3] = (uint32_t)rp->r_o3;
 280                         lwp->lwp_arg[4] = (uint32_t)rp->r_o4;
 281                         lwp->lwp_arg[5] = (uint32_t)rp->r_o5;
 282                         if (nargs > 6) {
 283                                 ua = (caddr_t)(uintptr_t)(caddr32_t)(uintptr_t)
 284                                     (rp->r_sp + MINFRAME32);
 285                                 for (i = 6; i < nargs; i++) {
 286                                         uint32_t a;
 287                                         if (fuword32(ua, &a) != 0)
 288                                                 return (-1);
 289                                         lwp->lwp_arg[i] = a;
 290                                         ua += sizeof (a);
 291                                 }
 292                         }
 293                 }
 294         } else {
 295                 ASSERT(datamodel == DATAMODEL_LP64);
 296                 lwp->lwp_arg[0] = rp->r_o0;
 297                 lwp->lwp_arg[1] = rp->r_o1;
 298                 lwp->lwp_arg[2] = rp->r_o2;
 299                 lwp->lwp_arg[3] = rp->r_o3;
 300                 lwp->lwp_arg[4] = rp->r_o4;
 301                 lwp->lwp_arg[5] = rp->r_o5;
 302                 if (nargs > 6) {
 303                         ua = (caddr_t)rp->r_sp + MINFRAME + STACK_BIAS;
 304                         for (i = 6; i < nargs; i++) {
 305                                 unsigned long a;
 306                                 if (fulword(ua, &a) != 0)
 307                                         return (-1);
 308                                 lwp->lwp_arg[i] = a;
 309                                 ua += sizeof (a);
 310                         }
 311                 }
 312         }
 313 
 314 out:
 315         lwp->lwp_ap = lwp->lwp_arg;
 316         lwp->lwp_argsaved = 1;
 317         t->t_post_sys = 1;   /* so lwp_ap will be reset */
 318         return (0);
 319 }
 320 
 321 void
 322 reset_syscall_args(void)
 323 {
 324         klwp_t *lwp = ttolwp(curthread);
 325 
 326         lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_o0;
 327         lwp->lwp_argsaved = 0;
 328 }
 329 
 330 /*
 331  * nonexistent system call-- signal lwp (may want to handle it)
 332  * flag error if lwp won't see signal immediately
 333  * This works for old or new calling sequence.
 334  */
 335 int64_t
 336 nosys()
 337 {
 338         tsignal(curthread, SIGSYS);
 339         return ((int64_t)set_errno(ENOSYS));
 340 }
 341 
 342 /*
 343  * Perform pre-system-call processing, including stopping for tracing,
 344  * auditing, microstate-accounting, etc.
 345  *
 346  * This routine is called only if the t_pre_sys flag is set.  Any condition
 347  * requiring pre-syscall handling must set the t_pre_sys flag.  If the
 348  * condition is persistent, this routine will repost t_pre_sys.
 349  */
 350 int
 351 pre_syscall(int arg0)
 352 {
 353         unsigned int code;
 354         kthread_t *t = curthread;
 355         proc_t *p = ttoproc(t);
 356         klwp_t *lwp = ttolwp(t);
 357         struct regs *rp = lwptoregs(lwp);
 358         int     repost;
 359 
 360         t->t_pre_sys = repost = 0;   /* clear pre-syscall processing flag */
 361 
 362         syscall_mstate(LMS_USER, LMS_SYSTEM);
 363 
 364         /*
 365          * The syscall arguments in the out registers should be pointed to
 366          * by lwp_ap.  If the args need to be copied so that the outs can
 367          * be changed without losing the ability to get the args for /proc,
 368          * they can be saved by save_syscall_args(), and lwp_ap will be
 369          * restored by post_syscall().
 370          */
 371         ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);
 372 
 373         /*
 374          * Make sure the thread is holding the latest credentials for the
 375          * process.  The credentials in the process right now apply to this
 376          * thread for the entire system call.
 377          */
 378         if (t->t_cred != p->p_cred) {
 379                 cred_t *oldcred = t->t_cred;
 380                 /*
 381                  * DTrace accesses t_cred in probe context.  t_cred must
 382                  * always be either NULL, or point to a valid, allocated cred
 383                  * structure.
 384                  */
 385                 t->t_cred = crgetcred();
 386                 crfree(oldcred);
 387         }
 388 
 389         /*
 390          * Undo special arrangements to single-step the lwp
 391          * so that a debugger will see valid register contents.
 392          * Also so that the pc is valid for syncfpu().
 393          * Also so that a syscall like exec() can be stepped.
 394          */
 395         if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
 396                 (void) prundostep();
 397                 repost = 1;
 398         }
 399 
 400         /*
 401          * Check for indirect system call in case we stop for tracing.
 402          * Don't allow multiple indirection.
 403          */
 404         code = t->t_sysnum;
 405         if (code == 0 && arg0 != 0) {           /* indirect syscall */
 406                 code = arg0;
 407                 t->t_sysnum = arg0;
 408         }
 409 
 410         /*
 411          * From the proc(4) manual page:
 412          * When entry to a system call is being traced, the traced process
 413          * stops after having begun the call to the system but before the
 414          * system call arguments have been fetched from the process.
 415          * If proc changes the args we must refetch them after starting.
 416          */
 417         if (PTOU(p)->u_systrap) {
 418                 if (prismember(&PTOU(p)->u_entrymask, code)) {
 419                         /*
 420                          * Recheck stop condition, now that lock is held.
 421                          */
 422                         mutex_enter(&p->p_lock);
 423                         if (PTOU(p)->u_systrap &&
 424                             prismember(&PTOU(p)->u_entrymask, code)) {
 425                                 stop(PR_SYSENTRY, code);
 426                                 /*
 427                                  * Must refetch args since they were
 428                                  * possibly modified by /proc.  Indicate
 429                                  * that the valid copy is in the
 430                                  * registers.
 431                                  */
 432                                 lwp->lwp_argsaved = 0;
 433                                 lwp->lwp_ap = (long *)&rp->r_o0;
 434                         }
 435                         mutex_exit(&p->p_lock);
 436                 }
 437                 repost = 1;
 438         }
 439 
 440         if (lwp->lwp_sysabort) {
 441                 /*
 442                  * lwp_sysabort may have been set via /proc while the process
 443                  * was stopped on PR_SYSENTRY.  If so, abort the system call.
 444                  * Override any error from the copyin() of the arguments.
 445                  */
 446                 lwp->lwp_sysabort = 0;
 447                 (void) set_errno(EINTR); /* sets post-sys processing */
 448                 t->t_pre_sys = 1;    /* repost anyway */
 449                 return (1);             /* don't do system call, return EINTR */
 450         }
 451 
 452         /* begin auditing for this syscall */
 453         if (audit_active == C2AUDIT_LOADED) {
 454                 uint32_t auditing = au_zone_getstate(NULL);
 455 
 456                 if (auditing & AU_AUDIT_MASK) {
 457                         int error;
 458                         if (error = audit_start(T_SYSCALL, code, auditing, \
 459                             0, lwp)) {
 460                                 t->t_pre_sys = 1;    /* repost anyway */
 461                                 lwp->lwp_error = 0;  /* for old drivers */
 462                                 return (error);
 463                         }
 464                         repost = 1;
 465                 }
 466         }
 467 
 468 #ifndef NPROBE
 469         /* Kernel probe */
 470         if (tnf_tracing_active) {
 471                 TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
 472                         tnf_sysnum,     sysnum,         t->t_sysnum);
 473                 t->t_post_sys = 1;   /* make sure post_syscall runs */
 474                 repost = 1;
 475         }
 476 #endif /* NPROBE */
 477 
 478 #ifdef SYSCALLTRACE
 479         if (syscalltrace) {
 480                 int i;
 481                 long *ap;
 482                 char *cp;
 483                 char *sysname;
 484                 struct sysent *callp;
 485 
 486                 if (code >= NSYSCALL)
 487                         callp = &nosys_ent; /* nosys has no args */
 488                 else
 489                         callp = LWP_GETSYSENT(lwp) + code;
 490                 (void) save_syscall_args();
 491                 mutex_enter(&systrace_lock);
 492                 printf("%d: ", p->p_pid);
 493                 if (code >= NSYSCALL)
 494                         printf("0x%x", code);
 495                 else {
 496                         sysname = mod_getsysname(code);
 497                         printf("%s[0x%x]", sysname == NULL ? "NULL" :
 498                             sysname, code);
 499                 }
 500                 cp = "(";
 501                 for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
 502                         printf("%s%lx", cp, *ap);
 503                         cp = ", ";
 504                 }
 505                 if (i)
 506                         printf(")");
 507                 printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
 508                 mutex_exit(&systrace_lock);
 509         }
 510 #endif /* SYSCALLTRACE */
 511 
 512         /*
 513          * If there was a continuing reason for pre-syscall processing,
 514          * set the t_pre_sys flag for the next system call.
 515          */
 516         if (repost)
 517                 t->t_pre_sys = 1;
 518         lwp->lwp_error = 0;  /* for old drivers */
 519         lwp->lwp_badpriv = PRIV_NONE;        /* for privilege tracing */
 520         return (0);
 521 }
 522 
 523 /*
 524  * Post-syscall processing.  Perform abnormal system call completion
 525  * actions such as /proc tracing, profiling, signals, preemption, etc.
 526  *
 527  * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 529  * If the condition is persistent, this routine will repost t_post_sys.
 530  */
 531 void
 532 post_syscall(long rval1, long rval2)
 533 {
 534         kthread_t       *t = curthread;
 535         proc_t  *p = curproc;
 536         klwp_t  *lwp = ttolwp(t);
 537         struct regs *rp = lwptoregs(lwp);
 538         uint_t  error;
 539         int     code = t->t_sysnum;
 540         int     repost = 0;
 541         int     proc_stop = 0;          /* non-zero if stopping for /proc */
 542         int     sigprof = 0;            /* non-zero if sending SIGPROF */
 543 
 544         t->t_post_sys = 0;
 545 
 546         error = lwp->lwp_errno;
 547 
 548         /*
 549          * Code can be zero if this is a new LWP returning after a forkall(),
 550          * other than the one which matches the one in the parent which called
 551          * forkall().  In these LWPs, skip most of post-syscall activity.
 552          */
 553         if (code == 0)
 554                 goto sig_check;
 555 
 556         /* put out audit record for this syscall */
 557         if (AU_AUDITING()) {
 558                 rval_t  rval;   /* fix audit_finish() someday */
 559 
 560                 /* XX64 -- truncation of 64-bit return values? */
 561                 rval.r_val1 = (int)rval1;
 562                 rval.r_val2 = (int)rval2;
 563                 audit_finish(T_SYSCALL, code, error, &rval);
 564                 repost = 1;
 565         }
 566 
 567         if (curthread->t_pdmsg != NULL) {
 568                 char *m = curthread->t_pdmsg;
 569 
 570                 uprintf("%s", m);
 571                 kmem_free(m, strlen(m) + 1);
 572                 curthread->t_pdmsg = NULL;
 573         }
 574 
 575         /*
 576          * If we're going to stop for /proc tracing, set the flag and
 577          * save the arguments so that the return values don't smash them.
 578          */
 579         if (PTOU(p)->u_systrap) {
 580                 if (prismember(&PTOU(p)->u_exitmask, code)) {
 581                         proc_stop = 1;
 582                         (void) save_syscall_args();
 583                 }
 584                 repost = 1;
 585         }
 586 
 587         /*
 588          * Similarly check to see if SIGPROF might be sent.
 589          */
 590         if (curthread->t_rprof != NULL &&
 591             curthread->t_rprof->rp_anystate != 0) {
 592                 (void) save_syscall_args();
 593                 sigprof = 1;
 594         }
 595 
 596         if (lwp->lwp_eosys == NORMALRETURN) {
 597                 if (error == 0) {
 598 #ifdef SYSCALLTRACE
 599                         if (syscalltrace) {
 600                                 mutex_enter(&systrace_lock);
 601                                 printf(
 602                                     "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
 603                                     p->p_pid, rval1, rval2, curthread);
 604                                 mutex_exit(&systrace_lock);
 605                         }
 606 #endif /* SYSCALLTRACE */
 607                         rp->r_tstate &= ~TSTATE_IC;
 608                         rp->r_o0 = rval1;
 609                         rp->r_o1 = rval2;
 610                 } else {
 611                         int sig;
 612 
 613 #ifdef SYSCALLTRACE
 614                         if (syscalltrace) {
 615                                 mutex_enter(&systrace_lock);
 616                                 printf("%d: error=%d, id 0x%p\n",
 617                                     p->p_pid, error, curthread);
 618                                 mutex_exit(&systrace_lock);
 619                         }
 620 #endif /* SYSCALLTRACE */
 621                         if (error == EINTR && t->t_activefd.a_stale)
 622                                 error = EBADF;
 623                         if (error == EINTR &&
 624                             (sig = lwp->lwp_cursig) != 0 &&
 625                             sigismember(&PTOU(p)->u_sigrestart, sig) &&
 626                             PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
 627                             PTOU(p)->u_signal[sig - 1] != SIG_IGN)
 628                                 error = ERESTART;
 629                         rp->r_o0 = error;
 630                         rp->r_tstate |= TSTATE_IC;
 631                 }
 632                 /*
 633                  * The default action is to redo the trap instruction.
 634                  * We increment the pc and npc past it for NORMALRETURN.
 635                  * JUSTRETURN has set up a new pc and npc already.
 636                  * If we are a cloned thread of forkall(), don't
 637                  * adjust here because we have already inherited
 638                  * the adjusted values from our clone.
 639                  */
 640                 if (!(t->t_flag & T_FORKALL)) {
 641                         rp->r_pc = rp->r_npc;
 642                         rp->r_npc += 4;
 643                 }
 644         }
 645 
 646         /*
 647          * From the proc(4) manual page:
 648          * When exit from a system call is being traced, the traced process
 649          * stops on completion of the system call just prior to checking for
 650          * signals and returning to user level.  At this point all return
 651          * values have been stored into the traced process's saved registers.
 652          */
 653         if (proc_stop) {
 654                 mutex_enter(&p->p_lock);
 655                 if (PTOU(p)->u_systrap &&
 656                     prismember(&PTOU(p)->u_exitmask, code))
 657                         stop(PR_SYSEXIT, code);
 658                 mutex_exit(&p->p_lock);
 659         }
 660 
 661         /*
 662          * If we are the parent returning from a successful
 663          * vfork, wait for the child to exec or exit.
 664          * This code must be here and not in the bowels of the system
 665          * so that /proc can intercept exit from vfork in a timely way.
 666          */
 667         if (t->t_flag & T_VFPARENT) {
 668                 ASSERT(code == SYS_vfork || code == SYS_forksys);
 669                 ASSERT(rp->r_o1 == 0 && error == 0);
 670                 vfwait((pid_t)rval1);
 671                 t->t_flag &= ~T_VFPARENT;
 672         }
 673 
 674         /*
 675          * If profiling is active, bill the current PC in user-land
 676          * and keep reposting until profiling is disabled.
 677          */
 678         if (p->p_prof.pr_scale) {
 679                 if (lwp->lwp_oweupc)
 680                         profil_tick(rp->r_pc);
 681                 repost = 1;
 682         }
 683 
 684 sig_check:
 685         /*
 686          * Reset flag for next time.
 687          * We must do this after stopping on PR_SYSEXIT
 688          * because /proc uses the information in lwp_eosys.
 689          */
 690         lwp->lwp_eosys = NORMALRETURN;
 691         clear_stale_fd();
 692         t->t_flag &= ~T_FORKALL;
 693 
 694         if (t->t_astflag | t->t_sig_check) {
 695                 /*
 696                  * Turn off the AST flag before checking all the conditions that
 697                  * may have caused an AST.  This flag is on whenever a signal or
 698                  * unusual condition should be handled after the next trap or
 699                  * syscall.
 700                  */
 701                 astoff(t);
 702                 t->t_sig_check = 0;
 703 
 704                 /*
 705                  * The following check is legal for the following reasons:
 706                  *      1) The thread we are checking, is ourselves, so there is
 707                  *         no way the proc can go away.
 708                  *      2) The only time we need to be protected by the
 709                  *         lock is if the binding is changed.
 710                  *
 711                  *      Note we will still take the lock and check the binding
 712                  *      if the condition was true without the lock held.  This
 713                  *      prevents lock contention among threads owned by the
 714                  *      same proc.
 715                  */
 716 
 717                 if (curthread->t_proc_flag & TP_CHANGEBIND) {
 718                         mutex_enter(&p->p_lock);
 719                         if (curthread->t_proc_flag & TP_CHANGEBIND) {
 720                                 timer_lwpbind();
 721                                 curthread->t_proc_flag &= ~TP_CHANGEBIND;
 722                         }
 723                         mutex_exit(&p->p_lock);
 724                 }
 725 
 726                 /*
 727                  * for kaio requests on the special kaio poll queue,
 728                  * copyout their results to user memory.
 729                  */
 730                 if (p->p_aio)
 731                         aio_cleanup(0);
 732 
 733                 /*
 734                  * If this LWP was asked to hold, call holdlwp(), which will
 735                  * stop.  holdlwps() sets this up and calls pokelwps() which
 736                  * sets the AST flag.
 737                  *
 738                  * Also check TP_EXITLWP, since this is used by fresh new LWPs
 739                  * through lwp_rtt().  That flag is set if the lwp_create(2)
 740                  * syscall failed after creating the LWP.
 741                  */
 742                 if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
 743                         holdlwp();
 744 
 745                 /*
 746                  * All code that sets signals and makes ISSIG_PENDING
 747                  * evaluate true must set t_sig_check afterwards.
 748                  */
 749                 if (ISSIG_PENDING(t, lwp, p)) {
 750                         if (issig(FORREAL))
 751                                 psig();
 752                         t->t_sig_check = 1;  /* recheck next time */
 753                 }
 754 
 755                 if (sigprof) {
 756                         int nargs = (code > 0 && code < NSYSCALL)?
 757                             LWP_GETSYSENT(lwp)[code].sy_narg : 0;
 758                         realsigprof(code, nargs, error);
 759                         t->t_sig_check = 1;  /* recheck next time */
 760                 }
 761 
 762                 /*
 763                  * If a performance counter overflow interrupt was
 764                  * delivered *during* the syscall, then re-enable the
 765                  * AST so that we take a trip through trap() to cause
 766                  * the SIGEMT to be delivered.
 767                  */
 768                 if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
 769                         aston(t);
 770 
 771                 /*
 772                  * If an asynchronous hardware error is pending, turn AST flag
 773                  * back on.  AST will be checked again before we return to user
 774                  * mode and we'll come back through trap() to handle the error.
 775                  */
 776                 if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR)
 777                         aston(t);
 778         }
 779 
 780         /*
 781          * Restore register window if a debugger modified it.
 782          * Set up to perform a single-step if a debugger requested it.
 783          */
 784         if (lwp->lwp_pcb.pcb_xregstat != XREGNONE)
 785                 xregrestore(lwp, 1);
 786 
 787         lwp->lwp_errno = 0;          /* clear error for next time */
 788 
 789 #ifndef NPROBE
 790         /* Kernel probe */
 791         if (tnf_tracing_active) {
 792                 TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
 793                     tnf_long,   rval1,          rval1,
 794                     tnf_long,   rval2,          rval2,
 795                     tnf_long,   errno,          (long)error);
 796                 repost = 1;
 797         }
 798 #endif /* NPROBE */
 799 
 800         /*
 801          * Set state to LWP_USER here so preempt won't give us a kernel
 802          * priority if it occurs after this point.  Call CL_TRAPRET() to
 803          * restore the user-level priority.
 804          *
 805          * It is important that no locks (other than spinlocks) be entered
 806          * after this point before returning to user mode (unless lwp_state
 807          * is set back to LWP_SYS).
 808          *
 809          * Sampled times past this point are charged to the user.
 810          */
 811         lwp->lwp_state = LWP_USER;
 812 
 813         if (t->t_trapret) {
 814                 t->t_trapret = 0;
 815                 thread_lock(t);
 816                 CL_TRAPRET(t);
 817                 thread_unlock(t);
 818         }
 819         if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
 820                 preempt();
 821         prunstop();
 822 
 823         /*
 824          * t_post_sys will be set if pcb_step is active.
 825          */
 826         if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
 827                 prdostep();
 828                 repost = 1;
 829         }
 830 
 831         t->t_sysnum = 0;     /* no longer in a system call */
 832 
 833         /*
 834          * In case the args were copied to the lwp, reset the
 835          * pointer so the next syscall will have the right lwp_ap pointer.
 836          */
 837         lwp->lwp_ap = (long *)&rp->r_o0;
 838         lwp->lwp_argsaved = 0;
 839 
 840         /*
 841          * If there was a continuing reason for post-syscall processing,
 842          * set the t_post_sys flag for the next system call.
 843          */
 844         if (repost)
 845                 t->t_post_sys = 1;
 846 
 847         /*
 848          * If there is a ustack registered for this lwp, and the stack rlimit
 849          * has been altered, read in the ustack. If the saved stack rlimit
 850          * matches the bounds of the ustack, update the ustack to reflect
 851          * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
 852          * stack checking by setting the size to 0.
 853          */
 854         if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
 855                 rlim64_t new_size;
 856                 model_t model;
 857                 caddr_t top;
 858                 struct rlimit64 rl;
 859 
 860                 mutex_enter(&p->p_lock);
 861                 new_size = p->p_stk_ctl;
 862                 model = p->p_model;
 863                 top = p->p_usrstack;
 864                 (void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
 865                 mutex_exit(&p->p_lock);
 866 
 867                 if (rl.rlim_cur == RLIM64_INFINITY)
 868                         new_size = 0;
 869 
 870                 if (model == DATAMODEL_NATIVE) {
 871                         stack_t stk;
 872 
 873                         if (copyin((stack_t *)lwp->lwp_ustack, &stk,
 874                             sizeof (stack_t)) == 0 &&
 875                             (stk.ss_size == lwp->lwp_old_stk_ctl ||
 876                             stk.ss_size == 0) &&
 877                             stk.ss_sp == top - stk.ss_size) {
 878                                 stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
 879                                     stk.ss_size - new_size);
 880                                 stk.ss_size = new_size;
 881 
 882                                 (void) copyout(&stk,
 883                                     (stack_t *)lwp->lwp_ustack,
 884                                     sizeof (stack_t));
 885                         }
 886                 } else {
 887                         stack32_t stk32;
 888 
 889                         if (copyin((stack32_t *)lwp->lwp_ustack, &stk32,
 890                             sizeof (stack32_t)) == 0 &&
 891                             (stk32.ss_size == lwp->lwp_old_stk_ctl ||
 892                             stk32.ss_size == 0) &&
 893                             stk32.ss_sp ==
 894                             (caddr32_t)(uintptr_t)(top - stk32.ss_size)) {
 895                                 stk32.ss_sp += stk32.ss_size - new_size;
 896                                 stk32.ss_size = new_size;
 897 
 898                                 (void) copyout(&stk32,
 899                                     (stack32_t *)lwp->lwp_ustack,
 900                                     sizeof (stack32_t));
 901                         }
 902                 }
 903 
 904                 lwp->lwp_old_stk_ctl = 0;
 905         }
 906 
 907         syscall_mstate(LMS_SYSTEM, LMS_USER);
 908 }
 909 
 910 /*
 911  * Call a system call which takes a pointer to the user args struct and
 912  * a pointer to the return values.  This is a bit slower than the standard
 913  * C arg-passing method in some cases.
 914  */
 915 int64_t
 916 syscall_ap()
 917 {
 918         uint_t  error;
 919         struct sysent *callp;
 920         rval_t  rval;
 921         klwp_t  *lwp = ttolwp(curthread);
 922         struct regs *rp = lwptoregs(lwp);
 923 
 924         callp = LWP_GETSYSENT(lwp) + curthread->t_sysnum;
 925 
 926         /*
 927          * If the arguments don't fit in registers %o0 - o5, make sure they
 928          * have been copied to the lwp_arg array.
 929          */
 930         if (callp->sy_narg > 6 && save_syscall_args())
 931                 return ((int64_t)set_errno(EFAULT));
 932 
 933         rval.r_val1 = 0;
 934         rval.r_val2 = (int)rp->r_o1;
 935         lwp->lwp_error = 0;  /* for old drivers */
 936         error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
 937         if (error)
 938                 return ((int64_t)set_errno(error));
 939         return (rval.r_vals);
 940 }
 941 
 942 /*
 943  * Load system call module.
 944  *      Returns with pointer to held read lock for module.
 945  */
 946 static krwlock_t *
 947 lock_syscall(struct sysent *table, uint_t code)
 948 {
 949         krwlock_t       *module_lock;
 950         struct modctl   *modp;
 951         int             id;
 952         struct sysent   *callp;
 953 
 954         module_lock = table[code].sy_lock;
 955         callp = &table[code];
 956 
 957         /*
 958          * Optimization to only call modload if we don't have a loaded
 959          * syscall.
 960          */
 961         rw_enter(module_lock, RW_READER);
 962         if (LOADED_SYSCALL(callp))
 963                 return (module_lock);
 964         rw_exit(module_lock);
 965 
 966         for (;;) {
 967                 if ((id = modload("sys", syscallnames[code])) == -1)
 968                         break;
 969 
 970                 /*
 971                  * If we loaded successfully at least once, the modctl
 972                  * will still be valid, so we try to grab it by filename.
 973                  * If this call fails, it's because the mod_filename
 974                  * was changed after the call to modload() (mod_hold_by_name()
 975                  * is the likely culprit).  We can safely just take
 976                  * another lap if this is the case;  the modload() will
 977                  * change the mod_filename back to one by which we can
 978                  * find the modctl.
 979                  */
 980                 modp = mod_find_by_filename("sys", syscallnames[code]);
 981 
 982                 if (modp == NULL)
 983                         continue;
 984 
 985                 mutex_enter(&mod_lock);
 986 
 987                 if (!modp->mod_installed) {
 988                         mutex_exit(&mod_lock);
 989                         continue;
 990                 }
 991                 break;
 992         }
 993 
 994         rw_enter(module_lock, RW_READER);
 995 
 996         if (id != -1)
 997                 mutex_exit(&mod_lock);
 998 
 999         return (module_lock);
1000 }
1001 
1002 /*
1003  * Loadable syscall support.
1004  *      If needed, load the module, then reserve it by holding a read
1005  *      lock for the duration of the call.
1006  *      Later, if the syscall is not unloadable, it could patch the vector.
1007  */
1008 /*ARGSUSED*/
1009 int64_t
1010 loadable_syscall(
1011     long a0, long a1, long a2, long a3,
1012     long a4, long a5, long a6, long a7)
1013 {
1014         int64_t         rval;
1015         struct sysent   *callp;
1016         struct sysent   *se = LWP_GETSYSENT(ttolwp(curthread));
1017         krwlock_t       *module_lock;
1018         int             code;
1019 
1020         code = curthread->t_sysnum;
1021         callp = se + code;
1022 
1023         /*
1024          * Try to autoload the system call if necessary.
1025          */
1026         module_lock = lock_syscall(se, code);
1027         THREAD_KPRI_RELEASE();  /* drop priority given by rw_enter */
1028 
1029         /*
1030          * we've locked either the loaded syscall or nosys
1031          */
1032         if (callp->sy_flags & SE_ARGC) {
1033                 int64_t (*sy_call)();
1034 
1035                 sy_call = (int64_t (*)())callp->sy_call;
1036                 rval = (*sy_call)(a0, a1, a2, a3, a4, a5);
1037         } else {
1038                 rval = syscall_ap();
1039         }
1040 
1041         THREAD_KPRI_REQUEST();  /* regain priority from read lock */
1042         rw_exit(module_lock);
1043         return (rval);
1044 }
1045 
1046 /*
1047  * Handle indirect system calls.
1048  *      This interface should be deprecated.  The library can handle
1049  *      this more efficiently, but keep this implementation for old binaries.
1050  *
1051  * XX64 Needs some work.
1052  */
1053 int64_t
1054 indir(int code, long a0, long a1, long a2, long a3, long a4)
1055 {
1056         klwp_t          *lwp = ttolwp(curthread);
1057         struct sysent   *callp;
1058 
1059         if (code <= 0 || code >= NSYSCALL)
1060                 return (nosys());
1061 
1062         ASSERT(lwp->lwp_ap != NULL);
1063 
1064         curthread->t_sysnum = code;
1065         callp = LWP_GETSYSENT(lwp) + code;
1066 
1067         /*
1068          * Handle argument setup, unless already done in pre_syscall().
1069          */
1070         if (callp->sy_narg > 5) {
1071                 if (save_syscall_args())        /* move args to LWP array */
1072                         return ((int64_t)set_errno(EFAULT));
1073         } else if (!lwp->lwp_argsaved) {
1074                 long *ap;
1075 
1076                 ap = lwp->lwp_ap;            /* args haven't been saved */
1077                 lwp->lwp_ap = ap + 1;                /* advance arg pointer */
1078                 curthread->t_post_sys = 1;   /* so lwp_ap will be reset */
1079         }
1080         return ((*callp->sy_callc)(a0, a1, a2, a3, a4, lwp->lwp_arg[5]));
1081 }
1082 
1083 /*
1084  * set_errno - set an error return from the current system call.
1085  *      This could be a macro.
1086  *      This returns the value it is passed, so that the caller can
1087  *      use tail-recursion-elimination and do return (set_errno(ERRNO));
1088  */
1089 uint_t
1090 set_errno(uint_t error)
1091 {
1092         ASSERT(error != 0);             /* must not be used to clear errno */
1093 
1094         curthread->t_post_sys = 1;   /* have post_syscall do error return */
1095         return (ttolwp(curthread)->lwp_errno = error);
1096 }
1097 
1098 /*
1099  * set_proc_pre_sys - Set pre-syscall processing for entire process.
1100  */
1101 void
1102 set_proc_pre_sys(proc_t *p)
1103 {
1104         kthread_t       *t;
1105         kthread_t       *first;
1106 
1107         ASSERT(MUTEX_HELD(&p->p_lock));
1108 
1109         t = first = p->p_tlist;
1110         do {
1111                 t->t_pre_sys = 1;
1112         } while ((t = t->t_forw) != first);
1113 }
1114 
1115 /*
1116  * set_proc_post_sys - Set post-syscall processing for entire process.
1117  */
1118 void
1119 set_proc_post_sys(proc_t *p)
1120 {
1121         kthread_t       *t;
1122         kthread_t       *first;
1123 
1124         ASSERT(MUTEX_HELD(&p->p_lock));
1125 
1126         t = first = p->p_tlist;
1127         do {
1128                 t->t_post_sys = 1;
1129         } while ((t = t->t_forw) != first);
1130 }
1131 
1132 /*
1133  * set_proc_sys - Set pre- and post-syscall processing for entire process.
1134  */
1135 void
1136 set_proc_sys(proc_t *p)
1137 {
1138         kthread_t       *t;
1139         kthread_t       *first;
1140 
1141         ASSERT(MUTEX_HELD(&p->p_lock));
1142 
1143         t = first = p->p_tlist;
1144         do {
1145                 t->t_pre_sys = 1;
1146                 t->t_post_sys = 1;
1147         } while ((t = t->t_forw) != first);
1148 }
1149 
1150 /*
1151  * set_all_proc_sys - set pre- and post-syscall processing flags for all
1152  * user processes.
1153  *
1154  * This is needed when auditing, tracing, or other facilities which affect
1155  * all processes are turned on.
1156  */
1157 void
1158 set_all_proc_sys()
1159 {
1160         kthread_t       *t;
1161         kthread_t       *first;
1162 
1163         mutex_enter(&pidlock);
1164         t = first = curthread;
1165         do {
1166                 t->t_pre_sys = 1;
1167                 t->t_post_sys = 1;
1168         } while ((t = t->t_next) != first);
1169         mutex_exit(&pidlock);
1170 }
1171 
1172 /*
1173  * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
1174  * all user processes running in the zone of the current process
1175  *
1176  * This is needed when auditing is turned on.
1177  */
1178 void
1179 set_all_zone_usr_proc_sys(zoneid_t zoneid)
1180 {
1181         proc_t      *p;
1182         kthread_t   *t;
1183 
1184         mutex_enter(&pidlock);
1185         for (p = practive; p != NULL; p = p->p_next) {
1186                 /* skip kernel processes */
1187                 if (p->p_exec == NULLVP || p->p_as == &kas ||
1188                     p->p_stat == SIDL || p->p_stat == SZOMB ||
1189                     (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
1190                         continue;
1191                 /*
1192                  * Only processes in the given zone (eventually in
1193                  * all zones) are taken into account
1194                  */
1195                 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
1196                         mutex_enter(&p->p_lock);
1197                         if ((t = p->p_tlist) == NULL) {
1198                                 mutex_exit(&p->p_lock);
1199                                 continue;
1200                         }
1201                         /*
1202                          * Set pre- and post-syscall processing flags
1203                          * for all threads of the process
1204                          */
1205                         do {
1206                                 t->t_pre_sys = 1;
1207                                 t->t_post_sys = 1;
1208                         } while (p->p_tlist != (t = t->t_forw));
1209                         mutex_exit(&p->p_lock);
1210                 }
1211         }
1212         mutex_exit(&pidlock);
1213 }
1214 
1215 /*
1216  * set_proc_ast - Set asynchronous service trap (AST) flag for all
1217  * threads in process.
1218  */
1219 void
1220 set_proc_ast(proc_t *p)
1221 {
1222         kthread_t       *t;
1223         kthread_t       *first;
1224 
1225         ASSERT(MUTEX_HELD(&p->p_lock));
1226 
1227         t = first = p->p_tlist;
1228         do {
1229                 aston(t);
1230         } while ((t = t->t_forw) != first);
1231 }