1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 #include <sys/param.h>
  31 #include <sys/types.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/systm.h>
  34 #include <sys/cred.h>
  35 #include <sys/user.h>
  36 #include <sys/errno.h>
  37 #include <sys/file.h>
  38 #include <sys/proc.h>
  39 #include <sys/prsystm.h>
  40 #include <sys/kmem.h>
  41 #include <sys/sobject.h>
  42 #include <sys/fault.h>
  43 #include <sys/procfs.h>
  44 #include <sys/watchpoint.h>
  45 #include <sys/time.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/machlock.h>
  48 #include <sys/debug.h>
  49 #include <sys/synch.h>
  50 #include <sys/synch32.h>
  51 #include <sys/mman.h>
  52 #include <sys/class.h>
  53 #include <sys/schedctl.h>
  54 #include <sys/sleepq.h>
  55 #include <sys/policy.h>
  56 #include <sys/tnf_probe.h>
  57 #include <sys/lwpchan_impl.h>
  58 #include <sys/turnstile.h>
  59 #include <sys/atomic.h>
  60 #include <sys/lwp_timer_impl.h>
  61 #include <sys/lwp_upimutex_impl.h>
  62 #include <vm/as.h>
  63 #include <sys/sdt.h>
  64 
  65 static kthread_t *lwpsobj_owner(caddr_t);
  66 static void lwp_unsleep(kthread_t *t);
  67 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
  68 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
  69 static void lwp_mutex_unregister(void *uaddr);
  70 static void set_owner_pid(lwp_mutex_t *, uintptr_t, pid_t);
  71 static int iswanted(kthread_t *, lwpchan_t *);
  72 
  73 extern int lwp_cond_signal(lwp_cond_t *cv);
  74 
  75 /*
  76  * Maximum number of user prio inheritance locks that can be held by a thread.
  77  * Used to limit kmem for each thread. This is a per-thread limit that
  78  * can be administered on a system wide basis (using /etc/system).
  79  *
  80  * Also, when a limit, say maxlwps is added for numbers of lwps within a
  81  * process, the per-thread limit automatically becomes a process-wide limit
  82  * of maximum number of held upi locks within a process:
  83  *      maxheldupimx = maxnestupimx * maxlwps;
  84  */
  85 static uint32_t maxnestupimx = 2000;
  86 
  87 /*
  88  * The sobj_ops vector exports a set of functions needed when a thread
  89  * is asleep on a synchronization object of this type.
  90  */
  91 static sobj_ops_t lwp_sobj_ops = {
  92         SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
  93 };
  94 
  95 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
  96 
  97 static sobj_ops_t lwp_sobj_pi_ops = {
  98         SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
  99         turnstile_change_pri
 100 };
 101 
 102 static sleepq_head_t    lwpsleepq[NSLEEPQ];
 103 upib_t                  upimutextab[UPIMUTEX_TABSIZE];
 104 
 105 #define LWPCHAN_LOCK_SHIFT      10      /* 1024 locks for each pool */
 106 #define LWPCHAN_LOCK_SIZE       (1 << LWPCHAN_LOCK_SHIFT)
 107 
 108 /*
 109  * We know that both lc_wchan and lc_wchan0 are addresses that most
 110  * likely are 8-byte aligned, so we shift off the low-order 3 bits.
 111  * 'pool' is either 0 or 1.
 112  */
 113 #define LWPCHAN_LOCK_HASH(X, pool) \
 114         (((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
 115         (LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))
 116 
 117 static kmutex_t         lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
 118 
 119 /*
 120  * Is this a POSIX threads user-level lock requiring priority inheritance?
 121  */
 122 #define UPIMUTEX(type)  ((type) & LOCK_PRIO_INHERIT)
 123 
 124 static sleepq_head_t *
 125 lwpsqhash(lwpchan_t *lwpchan)
 126 {
 127         uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
 128         return (&lwpsleepq[SQHASHINDEX(x)]);
 129 }
 130 
 131 /*
 132  * Lock an lwpchan.
 133  * Keep this in sync with lwpchan_unlock(), below.
 134  */
 135 static void
 136 lwpchan_lock(lwpchan_t *lwpchan, int pool)
 137 {
 138         uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
 139         mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
 140 }
 141 
 142 /*
 143  * Unlock an lwpchan.
 144  * Keep this in sync with lwpchan_lock(), above.
 145  */
 146 static void
 147 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
 148 {
 149         uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
 150         mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
 151 }
 152 
 153 /*
 154  * Delete mappings from the lwpchan cache for pages that are being
 155  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
 156  * all mappings within the range are deleted from the lwpchan cache.
 157  */
 158 void
 159 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
 160 {
 161         lwpchan_data_t *lcp;
 162         lwpchan_hashbucket_t *hashbucket;
 163         lwpchan_hashbucket_t *endbucket;
 164         lwpchan_entry_t *ent;
 165         lwpchan_entry_t **prev;
 166         caddr_t addr;
 167 
 168         mutex_enter(&p->p_lcp_lock);
 169         lcp = p->p_lcp;
 170         hashbucket = lcp->lwpchan_cache;
 171         endbucket = hashbucket + lcp->lwpchan_size;
 172         for (; hashbucket < endbucket; hashbucket++) {
 173                 if (hashbucket->lwpchan_chain == NULL)
 174                         continue;
 175                 mutex_enter(&hashbucket->lwpchan_lock);
 176                 prev = &hashbucket->lwpchan_chain;
 177                 /* check entire chain */
 178                 while ((ent = *prev) != NULL) {
 179                         addr = ent->lwpchan_addr;
 180                         if (start <= addr && addr < end) {
 181                                 *prev = ent->lwpchan_next;
 182                                 /*
 183                                  * We do this only for the obsolete type
 184                                  * USYNC_PROCESS_ROBUST.  Otherwise robust
 185                                  * locks do not draw ELOCKUNMAPPED or
 186                                  * EOWNERDEAD due to being unmapped.
 187                                  */
 188                                 if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
 189                                     (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
 190                                         lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
 191                                 /*
 192                                  * If there is a user-level robust lock
 193                                  * registration, mark it as invalid.
 194                                  */
 195                                 if ((addr = ent->lwpchan_uaddr) != NULL)
 196                                         lwp_mutex_unregister(addr);
 197                                 kmem_free(ent, sizeof (*ent));
 198                                 atomic_dec_32(&lcp->lwpchan_entries);
 199                         } else {
 200                                 prev = &ent->lwpchan_next;
 201                         }
 202                 }
 203                 mutex_exit(&hashbucket->lwpchan_lock);
 204         }
 205         mutex_exit(&p->p_lcp_lock);
 206 }
 207 
 208 /*
 209  * Given an lwpchan cache pointer and a process virtual address,
 210  * return a pointer to the corresponding lwpchan hash bucket.
 211  */
 212 static lwpchan_hashbucket_t *
 213 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
 214 {
 215         uint_t i;
 216 
 217         /*
 218          * All user-level sync object addresses are 8-byte aligned.
 219          * Ignore the lowest 3 bits of the address and use the
 220          * higher-order 2*lwpchan_bits bits for the hash index.
 221          */
 222         addr >>= 3;
 223         i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
 224         return (lcp->lwpchan_cache + i);
 225 }
 226 
 227 /*
 228  * (Re)allocate the per-process lwpchan cache.
 229  */
 230 static void
 231 lwpchan_alloc_cache(proc_t *p, uint_t bits)
 232 {
 233         lwpchan_data_t *lcp;
 234         lwpchan_data_t *old_lcp;
 235         lwpchan_hashbucket_t *hashbucket;
 236         lwpchan_hashbucket_t *endbucket;
 237         lwpchan_hashbucket_t *newbucket;
 238         lwpchan_entry_t *ent;
 239         lwpchan_entry_t *next;
 240         uint_t count;
 241 
 242         ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
 243 
 244         lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
 245         lcp->lwpchan_bits = bits;
 246         lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
 247         lcp->lwpchan_mask = lcp->lwpchan_size - 1;
 248         lcp->lwpchan_entries = 0;
 249         lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
 250             sizeof (lwpchan_hashbucket_t), KM_SLEEP);
 251         lcp->lwpchan_next_data = NULL;
 252 
 253         mutex_enter(&p->p_lcp_lock);
 254         if ((old_lcp = p->p_lcp) != NULL) {
 255                 if (old_lcp->lwpchan_bits >= bits) {
 256                         /* someone beat us to it */
 257                         mutex_exit(&p->p_lcp_lock);
 258                         kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
 259                             sizeof (lwpchan_hashbucket_t));
 260                         kmem_free(lcp, sizeof (lwpchan_data_t));
 261                         return;
 262                 }
 263                 /*
 264                  * Acquire all of the old hash table locks.
 265                  */
 266                 hashbucket = old_lcp->lwpchan_cache;
 267                 endbucket = hashbucket + old_lcp->lwpchan_size;
 268                 for (; hashbucket < endbucket; hashbucket++)
 269                         mutex_enter(&hashbucket->lwpchan_lock);
 270                 /*
 271                  * Move all of the old hash table entries to the
 272                  * new hash table.  The new hash table has not yet
 273                  * been installed so we don't need any of its locks.
 274                  */
 275                 count = 0;
 276                 hashbucket = old_lcp->lwpchan_cache;
 277                 for (; hashbucket < endbucket; hashbucket++) {
 278                         ent = hashbucket->lwpchan_chain;
 279                         while (ent != NULL) {
 280                                 next = ent->lwpchan_next;
 281                                 newbucket = lwpchan_bucket(lcp,
 282                                     (uintptr_t)ent->lwpchan_addr);
 283                                 ent->lwpchan_next = newbucket->lwpchan_chain;
 284                                 newbucket->lwpchan_chain = ent;
 285                                 ent = next;
 286                                 count++;
 287                         }
 288                         hashbucket->lwpchan_chain = NULL;
 289                 }
 290                 lcp->lwpchan_entries = count;
 291         }
 292 
 293         /*
 294          * Retire the old hash table.  We can't actually kmem_free() it
 295          * now because someone may still have a pointer to it.  Instead,
 296          * we link it onto the new hash table's list of retired hash tables.
 297          * The new hash table is double the size of the previous one, so
 298          * the total size of all retired hash tables is less than the size
 299          * of the new one.  exit() and exec() free the retired hash tables
 300          * (see lwpchan_destroy_cache(), below).
 301          */
 302         lcp->lwpchan_next_data = old_lcp;
 303 
 304         /*
 305          * As soon as we store the new lcp, future locking operations will
 306          * use it.  Therefore, we must ensure that all the state we've just
 307          * established reaches global visibility before the new lcp does.
 308          */
 309         membar_producer();
 310         p->p_lcp = lcp;
 311 
 312         if (old_lcp != NULL) {
 313                 /*
 314                  * Release all of the old hash table locks.
 315                  */
 316                 hashbucket = old_lcp->lwpchan_cache;
 317                 for (; hashbucket < endbucket; hashbucket++)
 318                         mutex_exit(&hashbucket->lwpchan_lock);
 319         }
 320         mutex_exit(&p->p_lcp_lock);
 321 }
 322 
 323 /*
 324  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
 325  * Called when the process exits or execs.  All lwps except one have
 326  * exited so we need no locks here.
 327  */
 328 void
 329 lwpchan_destroy_cache(int exec)
 330 {
 331         proc_t *p = curproc;
 332         lwpchan_hashbucket_t *hashbucket;
 333         lwpchan_hashbucket_t *endbucket;
 334         lwpchan_data_t *lcp;
 335         lwpchan_entry_t *ent;
 336         lwpchan_entry_t *next;
 337         uint16_t lockflg;
 338 
 339         lcp = p->p_lcp;
 340         p->p_lcp = NULL;
 341 
 342         lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
 343         hashbucket = lcp->lwpchan_cache;
 344         endbucket = hashbucket + lcp->lwpchan_size;
 345         for (; hashbucket < endbucket; hashbucket++) {
 346                 ent = hashbucket->lwpchan_chain;
 347                 hashbucket->lwpchan_chain = NULL;
 348                 while (ent != NULL) {
 349                         next = ent->lwpchan_next;
 350                         if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
 351                             (ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
 352                             == (USYNC_PROCESS | LOCK_ROBUST))
 353                                 lwp_mutex_cleanup(ent, lockflg);
 354                         kmem_free(ent, sizeof (*ent));
 355                         ent = next;
 356                 }
 357         }
 358 
 359         while (lcp != NULL) {
 360                 lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
 361                 kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
 362                     sizeof (lwpchan_hashbucket_t));
 363                 kmem_free(lcp, sizeof (lwpchan_data_t));
 364                 lcp = next_lcp;
 365         }
 366 }
 367 
 368 /*
 369  * Return zero when there is an entry in the lwpchan cache for the
 370  * given process virtual address and non-zero when there is not.
 371  * The returned non-zero value is the current length of the
 372  * hash chain plus one.  The caller holds the hash bucket lock.
 373  */
 374 static uint_t
 375 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
 376         lwpchan_hashbucket_t *hashbucket)
 377 {
 378         lwpchan_entry_t *ent;
 379         uint_t count = 1;
 380 
 381         for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
 382                 if (ent->lwpchan_addr == addr) {
 383                         if (ent->lwpchan_type != type ||
 384                             ent->lwpchan_pool != pool) {
 385                                 /*
 386                                  * This shouldn't happen, but might if the
 387                                  * process reuses its memory for different
 388                                  * types of sync objects.  We test first
 389                                  * to avoid grabbing the memory cache line.
 390                                  */
 391                                 ent->lwpchan_type = (uint16_t)type;
 392                                 ent->lwpchan_pool = (uint16_t)pool;
 393                         }
 394                         *lwpchan = ent->lwpchan_lwpchan;
 395                         return (0);
 396                 }
 397                 count++;
 398         }
 399         return (count);
 400 }
 401 
 402 /*
 403  * Return the cached lwpchan mapping if cached, otherwise insert
 404  * a virtual address to lwpchan mapping into the cache.
 405  */
 406 static int
 407 lwpchan_get_mapping(struct as *as, caddr_t addr, caddr_t uaddr,
 408         int type, lwpchan_t *lwpchan, int pool)
 409 {
 410         proc_t *p = curproc;
 411         lwpchan_data_t *lcp;
 412         lwpchan_hashbucket_t *hashbucket;
 413         lwpchan_entry_t *ent;
 414         memid_t memid;
 415         uint_t count;
 416         uint_t bits;
 417 
 418 top:
 419         /* initialize the lwpchan cache, if necesary */
 420         if ((lcp = p->p_lcp) == NULL) {
 421                 lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
 422                 goto top;
 423         }
 424         hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
 425         mutex_enter(&hashbucket->lwpchan_lock);
 426         if (lcp != p->p_lcp) {
 427                 /* someone resized the lwpchan cache; start over */
 428                 mutex_exit(&hashbucket->lwpchan_lock);
 429                 goto top;
 430         }
 431         if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
 432                 /* it's in the cache */
 433                 mutex_exit(&hashbucket->lwpchan_lock);
 434                 return (1);
 435         }
 436         mutex_exit(&hashbucket->lwpchan_lock);
 437         if (as_getmemid(as, addr, &memid) != 0)
 438                 return (0);
 439         lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
 440         lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
 441         ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
 442         mutex_enter(&hashbucket->lwpchan_lock);
 443         if (lcp != p->p_lcp) {
 444                 /* someone resized the lwpchan cache; start over */
 445                 mutex_exit(&hashbucket->lwpchan_lock);
 446                 kmem_free(ent, sizeof (*ent));
 447                 goto top;
 448         }
 449         count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
 450         if (count == 0) {
 451                 /* someone else added this entry to the cache */
 452                 mutex_exit(&hashbucket->lwpchan_lock);
 453                 kmem_free(ent, sizeof (*ent));
 454                 return (1);
 455         }
 456         if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
 457             (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
 458                 /* hash chain too long; reallocate the hash table */
 459                 mutex_exit(&hashbucket->lwpchan_lock);
 460                 kmem_free(ent, sizeof (*ent));
 461                 lwpchan_alloc_cache(p, bits + 1);
 462                 goto top;
 463         }
 464         ent->lwpchan_addr = addr;
 465         ent->lwpchan_uaddr = uaddr;
 466         ent->lwpchan_type = (uint16_t)type;
 467         ent->lwpchan_pool = (uint16_t)pool;
 468         ent->lwpchan_lwpchan = *lwpchan;
 469         ent->lwpchan_next = hashbucket->lwpchan_chain;
 470         hashbucket->lwpchan_chain = ent;
 471         atomic_inc_32(&lcp->lwpchan_entries);
 472         mutex_exit(&hashbucket->lwpchan_lock);
 473         return (1);
 474 }
 475 
 476 /*
 477  * Return a unique pair of identifiers that corresponds to a
 478  * synchronization object's virtual address.  Process-shared
 479  * sync objects usually get vnode/offset from as_getmemid().
 480  */
 481 static int
 482 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
 483 {
 484         /*
 485          * If the lwp synch object is defined to be process-private,
 486          * we just make the first field of the lwpchan be 'as' and
 487          * the second field be the synch object's virtual address.
 488          * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
 489          * The lwpchan cache is used only for process-shared objects.
 490          */
 491         if (!(type & USYNC_PROCESS)) {
 492                 lwpchan->lc_wchan0 = (caddr_t)as;
 493                 lwpchan->lc_wchan = addr;
 494                 return (1);
 495         }
 496 
 497         return (lwpchan_get_mapping(as, addr, NULL, type, lwpchan, pool));
 498 }
 499 
 500 static void
 501 lwp_block(lwpchan_t *lwpchan)
 502 {
 503         kthread_t *t = curthread;
 504         klwp_t *lwp = ttolwp(t);
 505         sleepq_head_t *sqh;
 506 
 507         thread_lock(t);
 508         t->t_flag |= T_WAKEABLE;
 509         t->t_lwpchan = *lwpchan;
 510         t->t_sobj_ops = &lwp_sobj_ops;
 511         t->t_release = 0;
 512         sqh = lwpsqhash(lwpchan);
 513         disp_lock_enter_high(&sqh->sq_lock);
 514         CL_SLEEP(t);
 515         DTRACE_SCHED(sleep);
 516         THREAD_SLEEP(t, &sqh->sq_lock);
 517         sleepq_insert(&sqh->sq_queue, t);
 518         thread_unlock(t);
 519         lwp->lwp_asleep = 1;
 520         lwp->lwp_sysabort = 0;
 521         lwp->lwp_ru.nvcsw++;
 522         (void) new_mstate(curthread, LMS_SLEEP);
 523 }
 524 
 525 static kthread_t *
 526 lwpsobj_pi_owner(upimutex_t *up)
 527 {
 528         return (up->upi_owner);
 529 }
 530 
 531 static struct upimutex *
 532 upi_get(upib_t *upibp, lwpchan_t *lcp)
 533 {
 534         struct upimutex *upip;
 535 
 536         for (upip = upibp->upib_first; upip != NULL;
 537             upip = upip->upi_nextchain) {
 538                 if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
 539                     upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
 540                         break;
 541         }
 542         return (upip);
 543 }
 544 
 545 static void
 546 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
 547 {
 548         ASSERT(MUTEX_HELD(&upibp->upib_lock));
 549 
 550         /*
 551          * Insert upimutex at front of list. Maybe a bit unfair
 552          * but assume that not many lwpchans hash to the same
 553          * upimutextab bucket, i.e. the list of upimutexes from
 554          * upib_first is not too long.
 555          */
 556         upimutex->upi_nextchain = upibp->upib_first;
 557         upibp->upib_first = upimutex;
 558 }
 559 
 560 static void
 561 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
 562 {
 563         struct upimutex **prev;
 564 
 565         ASSERT(MUTEX_HELD(&upibp->upib_lock));
 566 
 567         prev = &upibp->upib_first;
 568         while (*prev != upimutex) {
 569                 prev = &(*prev)->upi_nextchain;
 570         }
 571         *prev = upimutex->upi_nextchain;
 572         upimutex->upi_nextchain = NULL;
 573 }
 574 
 575 /*
 576  * Add upimutex to chain of upimutexes held by curthread.
 577  * Returns number of upimutexes held by curthread.
 578  */
 579 static uint32_t
 580 upi_mylist_add(struct upimutex *upimutex)
 581 {
 582         kthread_t *t = curthread;
 583 
 584         /*
 585          * Insert upimutex at front of list of upimutexes owned by t. This
 586          * would match typical LIFO order in which nested locks are acquired
 587          * and released.
 588          */
 589         upimutex->upi_nextowned = t->t_upimutex;
 590         t->t_upimutex = upimutex;
 591         t->t_nupinest++;
 592         ASSERT(t->t_nupinest > 0);
 593         return (t->t_nupinest);
 594 }
 595 
 596 /*
 597  * Delete upimutex from list of upimutexes owned by curthread.
 598  */
 599 static void
 600 upi_mylist_del(struct upimutex *upimutex)
 601 {
 602         kthread_t *t = curthread;
 603         struct upimutex **prev;
 604 
 605         /*
 606          * Since the order in which nested locks are acquired and released,
 607          * is typically LIFO, and typical nesting levels are not too deep, the
 608          * following should not be expensive in the general case.
 609          */
 610         prev = &t->t_upimutex;
 611         while (*prev != upimutex) {
 612                 prev = &(*prev)->upi_nextowned;
 613         }
 614         *prev = upimutex->upi_nextowned;
 615         upimutex->upi_nextowned = NULL;
 616         ASSERT(t->t_nupinest > 0);
 617         t->t_nupinest--;
 618 }
 619 
 620 /*
 621  * Returns true if upimutex is owned. Should be called only when upim points
 622  * to kmem which cannot disappear from underneath.
 623  */
 624 static int
 625 upi_owned(upimutex_t *upim)
 626 {
 627         return (upim->upi_owner == curthread);
 628 }
 629 
 630 /*
 631  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
 632  */
 633 static struct upimutex *
 634 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
 635 {
 636         lwpchan_t lwpchan;
 637         upib_t *upibp;
 638         struct upimutex *upimutex;
 639 
 640         if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
 641             &lwpchan, LWPCHAN_MPPOOL))
 642                 return (NULL);
 643 
 644         upibp = &UPI_CHAIN(lwpchan);
 645         mutex_enter(&upibp->upib_lock);
 646         upimutex = upi_get(upibp, &lwpchan);
 647         if (upimutex == NULL || upimutex->upi_owner != curthread) {
 648                 mutex_exit(&upibp->upib_lock);
 649                 return (NULL);
 650         }
 651         mutex_exit(&upibp->upib_lock);
 652         return (upimutex);
 653 }
 654 
 655 /*
 656  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
 657  * no lock hand-off occurrs.
 658  */
 659 static void
 660 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
 661 {
 662         turnstile_t *ts;
 663         upib_t *upibp;
 664         kthread_t *newowner;
 665 
 666         upi_mylist_del(upimutex);
 667         upibp = upimutex->upi_upibp;
 668         mutex_enter(&upibp->upib_lock);
 669         if (upimutex->upi_waiter != 0) { /* if waiters */
 670                 ts = turnstile_lookup(upimutex);
 671                 if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
 672                         /* hand-off lock to highest prio waiter */
 673                         newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
 674                         upimutex->upi_owner = newowner;
 675                         if (ts->ts_waiters == 1)
 676                                 upimutex->upi_waiter = 0;
 677                         turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
 678                         mutex_exit(&upibp->upib_lock);
 679                         return;
 680                 } else if (ts != NULL) {
 681                         /* LOCK_NOTRECOVERABLE: wakeup all */
 682                         turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
 683                 } else {
 684                         /*
 685                          * Misleading w bit. Waiters might have been
 686                          * interrupted. No need to clear the w bit (upimutex
 687                          * will soon be freed). Re-calculate PI from existing
 688                          * waiters.
 689                          */
 690                         turnstile_exit(upimutex);
 691                         turnstile_pi_recalc();
 692                 }
 693         }
 694         /*
 695          * no waiters, or LOCK_NOTRECOVERABLE.
 696          * remove from the bucket chain of upi mutexes.
 697          * de-allocate kernel memory (upimutex).
 698          */
 699         upi_chain_del(upimutex->upi_upibp, upimutex);
 700         mutex_exit(&upibp->upib_lock);
 701         kmem_free(upimutex, sizeof (upimutex_t));
 702 }
 703 
 704 static int
 705 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
 706 {
 707         label_t ljb;
 708         int error = 0;
 709         lwpchan_t lwpchan;
 710         uint16_t flag;
 711         upib_t *upibp;
 712         volatile struct upimutex *upimutex = NULL;
 713         turnstile_t *ts;
 714         uint32_t nupinest;
 715         volatile int upilocked = 0;
 716 
 717         if (on_fault(&ljb)) {
 718                 if (upilocked)
 719                         upimutex_unlock((upimutex_t *)upimutex, 0);
 720                 error = EFAULT;
 721                 goto out;
 722         }
 723         if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
 724             &lwpchan, LWPCHAN_MPPOOL)) {
 725                 error = EFAULT;
 726                 goto out;
 727         }
 728         upibp = &UPI_CHAIN(lwpchan);
 729 retry:
 730         mutex_enter(&upibp->upib_lock);
 731         upimutex = upi_get(upibp, &lwpchan);
 732         if (upimutex == NULL)  {
 733                 /* lock available since lwpchan has no upimutex */
 734                 upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
 735                 upi_chain_add(upibp, (upimutex_t *)upimutex);
 736                 upimutex->upi_owner = curthread; /* grab lock */
 737                 upimutex->upi_upibp = upibp;
 738                 upimutex->upi_vaddr = lp;
 739                 upimutex->upi_lwpchan = lwpchan;
 740                 mutex_exit(&upibp->upib_lock);
 741                 nupinest = upi_mylist_add((upimutex_t *)upimutex);
 742                 upilocked = 1;
 743                 fuword16_noerr(&lp->mutex_flag, &flag);
 744                 if (nupinest > maxnestupimx &&
 745                     secpolicy_resource(CRED()) != 0) {
 746                         upimutex_unlock((upimutex_t *)upimutex, flag);
 747                         error = ENOMEM;
 748                         goto out;
 749                 }
 750                 if (flag & LOCK_NOTRECOVERABLE) {
 751                         /*
 752                          * Since the setting of LOCK_NOTRECOVERABLE
 753                          * was done under the high-level upi mutex,
 754                          * in lwp_upimutex_unlock(), this flag needs to
 755                          * be checked while holding the upi mutex.
 756                          * If set, this thread should return without
 757                          * the lock held, and with the right error code.
 758                          */
 759                         upimutex_unlock((upimutex_t *)upimutex, flag);
 760                         upilocked = 0;
 761                         error = ENOTRECOVERABLE;
 762                 } else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
 763                         if (flag & LOCK_OWNERDEAD)
 764                                 error = EOWNERDEAD;
 765                         else if (type & USYNC_PROCESS_ROBUST)
 766                                 error = ELOCKUNMAPPED;
 767                         else
 768                                 error = EOWNERDEAD;
 769                 }
 770                 goto out;
 771         }
 772         /*
 773          * If a upimutex object exists, it must have an owner.
 774          * This is due to lock hand-off, and release of upimutex when no
 775          * waiters are present at unlock time,
 776          */
 777         ASSERT(upimutex->upi_owner != NULL);
 778         if (upimutex->upi_owner == curthread) {
 779                 /*
 780                  * The user wrapper can check if the mutex type is
 781                  * ERRORCHECK: if not, it should stall at user-level.
 782                  * If so, it should return the error code.
 783                  */
 784                 mutex_exit(&upibp->upib_lock);
 785                 error = EDEADLK;
 786                 goto out;
 787         }
 788         if (try == UPIMUTEX_TRY) {
 789                 mutex_exit(&upibp->upib_lock);
 790                 error = EBUSY;
 791                 goto out;
 792         }
 793         /*
 794          * Block for the lock.
 795          */
 796         if ((error = lwptp->lwpt_time_error) != 0) {
 797                 /*
 798                  * The SUSV3 Posix spec is very clear that we
 799                  * should get no error from validating the
 800                  * timer until we would actually sleep.
 801                  */
 802                 mutex_exit(&upibp->upib_lock);
 803                 goto out;
 804         }
 805         if (lwptp->lwpt_tsp != NULL) {
 806                 /*
 807                  * Unlike the protocol for other lwp timedwait operations,
 808                  * we must drop t_delay_lock before going to sleep in
 809                  * turnstile_block() for a upi mutex.
 810                  * See the comments below and in turnstile.c
 811                  */
 812                 mutex_enter(&curthread->t_delay_lock);
 813                 (void) lwp_timer_enqueue(lwptp);
 814                 mutex_exit(&curthread->t_delay_lock);
 815         }
 816         /*
 817          * Now, set the waiter bit and block for the lock in turnstile_block().
 818          * No need to preserve the previous wbit since a lock try is not
 819          * attempted after setting the wait bit. Wait bit is set under
 820          * the upib_lock, which is not released until the turnstile lock
 821          * is acquired. Say, the upimutex is L:
 822          *
 823          * 1. upib_lock is held so the waiter does not have to retry L after
 824          *    setting the wait bit: since the owner has to grab the upib_lock
 825          *    to unlock L, it will certainly see the wait bit set.
 826          * 2. upib_lock is not released until the turnstile lock is acquired.
 827          *    This is the key to preventing a missed wake-up. Otherwise, the
 828          *    owner could acquire the upib_lock, and the tc_lock, to call
 829          *    turnstile_wakeup(). All this, before the waiter gets tc_lock
 830          *    to sleep in turnstile_block(). turnstile_wakeup() will then not
 831          *    find this waiter, resulting in the missed wakeup.
 832          * 3. The upib_lock, being a kernel mutex, cannot be released while
 833          *    holding the tc_lock (since mutex_exit() could need to acquire
 834          *    the same tc_lock)...and so is held when calling turnstile_block().
 835          *    The address of upib_lock is passed to turnstile_block() which
 836          *    releases it after releasing all turnstile locks, and before going
 837          *    to sleep in swtch().
 838          * 4. The waiter value cannot be a count of waiters, because a waiter
 839          *    can be interrupted. The interrupt occurs under the tc_lock, at
 840          *    which point, the upib_lock cannot be locked, to decrement waiter
 841          *    count. So, just treat the waiter state as a bit, not a count.
 842          */
 843         ts = turnstile_lookup((upimutex_t *)upimutex);
 844         upimutex->upi_waiter = 1;
 845         error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
 846             &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
 847         /*
 848          * Hand-off implies that we wakeup holding the lock, except when:
 849          *      - deadlock is detected
 850          *      - lock is not recoverable
 851          *      - we got an interrupt or timeout
 852          * If we wake up due to an interrupt or timeout, we may
 853          * or may not be holding the lock due to mutex hand-off.
 854          * Use lwp_upimutex_owned() to check if we do hold the lock.
 855          */
 856         if (error != 0) {
 857                 if ((error == EINTR || error == ETIME) &&
 858                     (upimutex = lwp_upimutex_owned(lp, type))) {
 859                         /*
 860                          * Unlock and return - the re-startable syscall will
 861                          * try the lock again if we got EINTR.
 862                          */
 863                         (void) upi_mylist_add((upimutex_t *)upimutex);
 864                         upimutex_unlock((upimutex_t *)upimutex, 0);
 865                 }
 866                 /*
 867                  * The only other possible error is EDEADLK.  If so, upimutex
 868                  * is valid, since its owner is deadlocked with curthread.
 869                  */
 870                 ASSERT(error == EINTR || error == ETIME ||
 871                     (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
 872                 ASSERT(!lwp_upimutex_owned(lp, type));
 873                 goto out;
 874         }
 875         if (lwp_upimutex_owned(lp, type)) {
 876                 ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
 877                 nupinest = upi_mylist_add((upimutex_t *)upimutex);
 878                 upilocked = 1;
 879         }
 880         /*
 881          * Now, need to read the user-level lp->mutex_flag to do the following:
 882          *
 883          * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
 884          *   should be returned.
 885          * - if lock isn't held, check if ENOTRECOVERABLE should
 886          *   be returned.
 887          *
 888          * Now, either lp->mutex_flag is readable or it's not. If not
 889          * readable, the on_fault path will cause a return with EFAULT
 890          * as it should.  If it is readable, the state of the flag
 891          * encodes the robustness state of the lock:
 892          *
 893          * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
 894          * or LOCK_UNMAPPED setting will influence the return code
 895          * appropriately.  If the upimutex is not locked here, this
 896          * could be due to a spurious wake-up or a NOTRECOVERABLE
 897          * event.  The flag's setting can be used to distinguish
 898          * between these two events.
 899          */
 900         fuword16_noerr(&lp->mutex_flag, &flag);
 901         if (upilocked) {
 902                 /*
 903                  * If the thread wakes up from turnstile_block with the lock
 904                  * held, the flag could not be set to LOCK_NOTRECOVERABLE,
 905                  * since it would not have been handed-off the lock.
 906                  * So, no need to check for this case.
 907                  */
 908                 if (nupinest > maxnestupimx &&
 909                     secpolicy_resource(CRED()) != 0) {
 910                         upimutex_unlock((upimutex_t *)upimutex, flag);
 911                         upilocked = 0;
 912                         error = ENOMEM;
 913                 } else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
 914                         if (flag & LOCK_OWNERDEAD)
 915                                 error = EOWNERDEAD;
 916                         else if (type & USYNC_PROCESS_ROBUST)
 917                                 error = ELOCKUNMAPPED;
 918                         else
 919                                 error = EOWNERDEAD;
 920                 }
 921         } else {
 922                 /*
 923                  * Wake-up without the upimutex held. Either this is a
 924                  * spurious wake-up (due to signals, forkall(), whatever), or
 925                  * it is a LOCK_NOTRECOVERABLE robustness event. The setting
 926                  * of the mutex flag can be used to distinguish between the
 927                  * two events.
 928                  */
 929                 if (flag & LOCK_NOTRECOVERABLE) {
 930                         error = ENOTRECOVERABLE;
 931                 } else {
 932                         /*
 933                          * Here, the flag could be set to LOCK_OWNERDEAD or
 934                          * not. In both cases, this is a spurious wakeup,
 935                          * since the upi lock is not held, but the thread
 936                          * has returned from turnstile_block().
 937                          *
 938                          * The user flag could be LOCK_OWNERDEAD if, at the
 939                          * same time as curthread having been woken up
 940                          * spuriously, the owner (say Tdead) has died, marked
 941                          * the mutex flag accordingly, and handed off the lock
 942                          * to some other waiter (say Tnew). curthread just
 943                          * happened to read the flag while Tnew has yet to deal
 944                          * with the owner-dead event.
 945                          *
 946                          * In this event, curthread should retry the lock.
 947                          * If Tnew is able to cleanup the lock, curthread
 948                          * will eventually get the lock with a zero error code,
 949                          * If Tnew is unable to cleanup, its eventual call to
 950                          * unlock the lock will result in the mutex flag being
 951                          * set to LOCK_NOTRECOVERABLE, and the wake-up of
 952                          * all waiters, including curthread, which will then
 953                          * eventually return ENOTRECOVERABLE due to the above
 954                          * check.
 955                          *
 956                          * Of course, if the user-flag is not set with
 957                          * LOCK_OWNERDEAD, retrying is the thing to do, since
 958                          * this is definitely a spurious wakeup.
 959                          */
 960                         goto retry;
 961                 }
 962         }
 963 
 964 out:
 965         no_fault();
 966         return (error);
 967 }
 968 
 969 
 970 static int
 971 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
 972 {
 973         label_t ljb;
 974         int error = 0;
 975         lwpchan_t lwpchan;
 976         uint16_t flag;
 977         upib_t *upibp;
 978         volatile struct upimutex *upimutex = NULL;
 979         volatile int upilocked = 0;
 980 
 981         if (on_fault(&ljb)) {
 982                 if (upilocked)
 983                         upimutex_unlock((upimutex_t *)upimutex, 0);
 984                 error = EFAULT;
 985                 goto out;
 986         }
 987         if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
 988             &lwpchan, LWPCHAN_MPPOOL)) {
 989                 error = EFAULT;
 990                 goto out;
 991         }
 992         upibp = &UPI_CHAIN(lwpchan);
 993         mutex_enter(&upibp->upib_lock);
 994         upimutex = upi_get(upibp, &lwpchan);
 995         /*
 996          * If the lock is not held, or the owner is not curthread, return
 997          * error. The user-level wrapper can return this error or stall,
 998          * depending on whether mutex is of ERRORCHECK type or not.
 999          */
1000         if (upimutex == NULL || upimutex->upi_owner != curthread) {
1001                 mutex_exit(&upibp->upib_lock);
1002                 error = EPERM;
1003                 goto out;
1004         }
1005         mutex_exit(&upibp->upib_lock); /* release for user memory access */
1006         upilocked = 1;
1007         fuword16_noerr(&lp->mutex_flag, &flag);
1008         if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1009                 /*
1010                  * transition mutex to the LOCK_NOTRECOVERABLE state.
1011                  */
1012                 flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
1013                 flag |= LOCK_NOTRECOVERABLE;
1014                 suword16_noerr(&lp->mutex_flag, flag);
1015         }
1016         set_owner_pid(lp, 0, 0);
1017         upimutex_unlock((upimutex_t *)upimutex, flag);
1018         upilocked = 0;
1019 out:
1020         no_fault();
1021         return (error);
1022 }
1023 
1024 /*
1025  * Set the owner and ownerpid fields of a user-level mutex.
1026  */
1027 static void
1028 set_owner_pid(lwp_mutex_t *lp, uintptr_t owner, pid_t pid)
1029 {
1030         union {
1031                 uint64_t word64;
1032                 uint32_t word32[2];
1033         } un;
1034 
1035         un.word64 = (uint64_t)owner;
1036 
1037         suword32_noerr(&lp->mutex_ownerpid, pid);
1038 #if defined(_LP64)
1039         if (((uintptr_t)lp & (_LONG_LONG_ALIGNMENT - 1)) == 0) { /* aligned */
1040                 suword64_noerr(&lp->mutex_owner, un.word64);
1041                 return;
1042         }
1043 #endif
1044         /* mutex is unaligned or we are running on a 32-bit kernel */
1045         suword32_noerr((uint32_t *)&lp->mutex_owner, un.word32[0]);
1046         suword32_noerr((uint32_t *)&lp->mutex_owner + 1, un.word32[1]);
1047 }
1048 
1049 /*
1050  * Clear the contents of a user-level mutex; return the flags.
1051  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1052  */
1053 static uint16_t
1054 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1055 {
1056         uint16_t flag;
1057 
1058         fuword16_noerr(&lp->mutex_flag, &flag);
1059         if ((flag &
1060             (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
1061                 flag |= lockflg;
1062                 suword16_noerr(&lp->mutex_flag, flag);
1063         }
1064         set_owner_pid(lp, 0, 0);
1065         suword8_noerr(&lp->mutex_rcount, 0);
1066 
1067         return (flag);
1068 }
1069 
1070 /*
1071  * Mark user mutex state, corresponding to kernel upimutex,
1072  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1073  */
1074 static int
1075 upi_dead(upimutex_t *upip, uint16_t lockflg)
1076 {
1077         label_t ljb;
1078         int error = 0;
1079         lwp_mutex_t *lp;
1080 
1081         if (on_fault(&ljb)) {
1082                 error = EFAULT;
1083                 goto out;
1084         }
1085 
1086         lp = upip->upi_vaddr;
1087         (void) lwp_clear_mutex(lp, lockflg);
1088         suword8_noerr(&lp->mutex_lockw, 0);
1089 out:
1090         no_fault();
1091         return (error);
1092 }
1093 
1094 /*
1095  * Unlock all upimutexes held by curthread, since curthread is dying.
1096  * For each upimutex, attempt to mark its corresponding user mutex object as
1097  * dead.
1098  */
1099 void
1100 upimutex_cleanup()
1101 {
1102         kthread_t *t = curthread;
1103         uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1104             LOCK_UNMAPPED : LOCK_OWNERDEAD;
1105         struct upimutex *upip;
1106 
1107         while ((upip = t->t_upimutex) != NULL) {
1108                 if (upi_dead(upip, lockflg) != 0) {
1109                         /*
1110                          * If the user object associated with this upimutex is
1111                          * unmapped, unlock upimutex with the
1112                          * LOCK_NOTRECOVERABLE flag, so that all waiters are
1113                          * woken up. Since user object is unmapped, it could
1114                          * not be marked as dead or notrecoverable.
1115                          * The waiters will now all wake up and return
1116                          * ENOTRECOVERABLE, since they would find that the lock
1117                          * has not been handed-off to them.
1118                          * See lwp_upimutex_lock().
1119                          */
1120                         upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1121                 } else {
1122                         /*
1123                          * The user object has been updated as dead.
1124                          * Unlock the upimutex: if no waiters, upip kmem will
1125                          * be freed. If there is a waiter, the lock will be
1126                          * handed off. If exit() is in progress, each existing
1127                          * waiter will successively get the lock, as owners
1128                          * die, and each new owner will call this routine as
1129                          * it dies. The last owner will free kmem, since
1130                          * it will find the upimutex has no waiters. So,
1131                          * eventually, the kmem is guaranteed to be freed.
1132                          */
1133                         upimutex_unlock(upip, 0);
1134                 }
1135                 /*
1136                  * Note that the call to upimutex_unlock() above will delete
1137                  * upimutex from the t_upimutexes chain. And so the
1138                  * while loop will eventually terminate.
1139                  */
1140         }
1141 }
1142 
1143 int
1144 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp, uintptr_t owner)
1145 {
1146         kthread_t *t = curthread;
1147         klwp_t *lwp = ttolwp(t);
1148         proc_t *p = ttoproc(t);
1149         lwp_timer_t lwpt;
1150         caddr_t timedwait;
1151         int error = 0;
1152         int time_error;
1153         clock_t tim = -1;
1154         uchar_t waiters;
1155         volatile int locked = 0;
1156         volatile int watched = 0;
1157         label_t ljb;
1158         volatile uint8_t type = 0;
1159         lwpchan_t lwpchan;
1160         sleepq_head_t *sqh;
1161         uint16_t flag;
1162         int imm_timeout = 0;
1163 
1164         if ((caddr_t)lp >= p->p_as->a_userlimit)
1165                 return (set_errno(EFAULT));
1166 
1167         /*
1168          * Put the lwp in an orderly state for debugging,
1169          * in case we are stopped while sleeping, below.
1170          */
1171         prstop(PR_REQUESTED, 0);
1172 
1173         timedwait = (caddr_t)tsp;
1174         if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1175             lwpt.lwpt_imm_timeout) {
1176                 imm_timeout = 1;
1177                 timedwait = NULL;
1178         }
1179 
1180         /*
1181          * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1182          * this micro state is really a run state. If the thread indeed blocks,
1183          * this state becomes valid. If not, the state is converted back to
1184          * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1185          * when blocking.
1186          */
1187         (void) new_mstate(t, LMS_USER_LOCK);
1188         if (on_fault(&ljb)) {
1189                 if (locked)
1190                         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1191                 error = EFAULT;
1192                 goto out;
1193         }
1194         /*
1195          * Force Copy-on-write if necessary and ensure that the
1196          * synchronization object resides in read/write memory.
1197          * Cause an EFAULT return now if this is not so.
1198          */
1199         fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1200         suword8_noerr(&lp->mutex_type, type);
1201         if (UPIMUTEX(type)) {
1202                 no_fault();
1203                 error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1204                 if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED)
1205                         set_owner_pid(lp, owner,
1206                             (type & USYNC_PROCESS)? p->p_pid : 0);
1207                 if (tsp && !time_error) /* copyout the residual time left */
1208                         error = lwp_timer_copyout(&lwpt, error);
1209                 if (error)
1210                         return (set_errno(error));
1211                 return (0);
1212         }
1213         if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1214             &lwpchan, LWPCHAN_MPPOOL)) {
1215                 error = EFAULT;
1216                 goto out;
1217         }
1218         lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1219         locked = 1;
1220         if (type & LOCK_ROBUST) {
1221                 fuword16_noerr(&lp->mutex_flag, &flag);
1222                 if (flag & LOCK_NOTRECOVERABLE) {
1223                         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1224                         error = ENOTRECOVERABLE;
1225                         goto out;
1226                 }
1227         }
1228         fuword8_noerr(&lp->mutex_waiters, &waiters);
1229         suword8_noerr(&lp->mutex_waiters, 1);
1230 
1231         /*
1232          * If watchpoints are set, they need to be restored, since
1233          * atomic accesses of memory such as the call to ulock_try()
1234          * below cannot be watched.
1235          */
1236 
1237         watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1238 
1239         while (!ulock_try(&lp->mutex_lockw)) {
1240                 if (time_error) {
1241                         /*
1242                          * The SUSV3 Posix spec is very clear that we
1243                          * should get no error from validating the
1244                          * timer until we would actually sleep.
1245                          */
1246                         error = time_error;
1247                         break;
1248                 }
1249 
1250                 if (watched) {
1251                         watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1252                         watched = 0;
1253                 }
1254 
1255                 if (timedwait) {
1256                         /*
1257                          * If we successfully queue the timeout,
1258                          * then don't drop t_delay_lock until
1259                          * we are on the sleep queue (below).
1260                          */
1261                         mutex_enter(&t->t_delay_lock);
1262                         if (lwp_timer_enqueue(&lwpt) != 0) {
1263                                 mutex_exit(&t->t_delay_lock);
1264                                 imm_timeout = 1;
1265                                 timedwait = NULL;
1266                         }
1267                 }
1268                 lwp_block(&lwpchan);
1269                 /*
1270                  * Nothing should happen to cause the lwp to go to
1271                  * sleep again until after it returns from swtch().
1272                  */
1273                 if (timedwait)
1274                         mutex_exit(&t->t_delay_lock);
1275                 locked = 0;
1276                 lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1277                 if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1278                         setrun(t);
1279                 swtch();
1280                 t->t_flag &= ~T_WAKEABLE;
1281                 if (timedwait)
1282                         tim = lwp_timer_dequeue(&lwpt);
1283                 setallwatch();
1284                 if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1285                         error = EINTR;
1286                 else if (imm_timeout || (timedwait && tim == -1))
1287                         error = ETIME;
1288                 if (error) {
1289                         lwp->lwp_asleep = 0;
1290                         lwp->lwp_sysabort = 0;
1291                         watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1292                             S_WRITE);
1293 
1294                         /*
1295                          * Need to re-compute waiters bit. The waiters field in
1296                          * the lock is not reliable. Either of two things could
1297                          * have occurred: no lwp may have called lwp_release()
1298                          * for me but I have woken up due to a signal or
1299                          * timeout.  In this case, the waiter bit is incorrect
1300                          * since it is still set to 1, set above.
1301                          * OR an lwp_release() did occur for some other lwp on
1302                          * the same lwpchan. In this case, the waiter bit is
1303                          * correct.  But which event occurred, one can't tell.
1304                          * So, recompute.
1305                          */
1306                         lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1307                         locked = 1;
1308                         sqh = lwpsqhash(&lwpchan);
1309                         disp_lock_enter(&sqh->sq_lock);
1310                         waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1311                         disp_lock_exit(&sqh->sq_lock);
1312                         break;
1313                 }
1314                 lwp->lwp_asleep = 0;
1315                 watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1316                     S_WRITE);
1317                 lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1318                 locked = 1;
1319                 fuword8_noerr(&lp->mutex_waiters, &waiters);
1320                 suword8_noerr(&lp->mutex_waiters, 1);
1321                 if (type & LOCK_ROBUST) {
1322                         fuword16_noerr(&lp->mutex_flag, &flag);
1323                         if (flag & LOCK_NOTRECOVERABLE) {
1324                                 error = ENOTRECOVERABLE;
1325                                 break;
1326                         }
1327                 }
1328         }
1329 
1330         if (t->t_mstate == LMS_USER_LOCK)
1331                 (void) new_mstate(t, LMS_SYSTEM);
1332 
1333         if (error == 0) {
1334                 set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
1335                 if (type & LOCK_ROBUST) {
1336                         fuword16_noerr(&lp->mutex_flag, &flag);
1337                         if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1338                                 if (flag & LOCK_OWNERDEAD)
1339                                         error = EOWNERDEAD;
1340                                 else if (type & USYNC_PROCESS_ROBUST)
1341                                         error = ELOCKUNMAPPED;
1342                                 else
1343                                         error = EOWNERDEAD;
1344                         }
1345                 }
1346         }
1347         suword8_noerr(&lp->mutex_waiters, waiters);
1348         locked = 0;
1349         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1350 out:
1351         no_fault();
1352         if (watched)
1353                 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1354         if (tsp && !time_error)         /* copyout the residual time left */
1355                 error = lwp_timer_copyout(&lwpt, error);
1356         if (error)
1357                 return (set_errno(error));
1358         return (0);
1359 }
1360 
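     /*
      * iswanted() returns non-zero if any thread on the given sleep queue
      * chain is blocked on the specified lwpchan, i.e. if the lock's waiters
      * bit should remain set.
      */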
1361 static int
1362 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1363 {
1364         /*
1365          * The caller holds the dispatcher lock on the sleep queue.
1366          */
1367         while (t != NULL) {
1368                 if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1369                     t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1370                         return (1);
1371                 t = t->t_link;
1372         }
1373         return (0);
1374 }
1375 
1376 /*
1377  * Return the highest priority thread sleeping on this lwpchan.
1378  */
1379 static kthread_t *
1380 lwp_queue_waiter(lwpchan_t *lwpchan)
1381 {
1382         sleepq_head_t *sqh;
1383         kthread_t *tp;
1384 
1385         sqh = lwpsqhash(lwpchan);
1386         disp_lock_enter(&sqh->sq_lock);          /* lock the sleep queue */
1387         for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1388                 if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1389                     tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1390                         break;
1391         }
1392         disp_lock_exit(&sqh->sq_lock);
1393         return (tp);
1394 }
1395 
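     /*
      * Wake up the first thread found sleeping on the given lwpchan, provided
      * its sync type matches sync_type (T_WAITCVSEM for CVs and semaphores,
      * 0 for mutexes).  On success, *waiters is recomputed from the remaining
      * sleepers and 1 is returned.  If the first matching thread has the
      * wrong sync type, 0 is returned with *waiters untouched (see the
      * comment below); if nothing is sleeping on the lwpchan, *waiters is
      * cleared and 0 is returned.
      */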
1396 static int
1397 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1398 {
1399         sleepq_head_t *sqh;
1400         kthread_t *tp;
1401         kthread_t **tpp;
1402 
1403         sqh = lwpsqhash(lwpchan);
1404         disp_lock_enter(&sqh->sq_lock);          /* lock the sleep queue */
1405         tpp = &sqh->sq_queue.sq_first;
1406         while ((tp = *tpp) != NULL) {
1407                 if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1408                     tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1409                         /*
1410                          * The following is typically false. It could be true
1411                          * only if lwp_release() is called from
1412                          * lwp_mutex_wakeup() after reading the waiters field
1413                          * from memory in which the lwp lock used to be, but has
1414                          * since been re-used to hold an lwp cv or lwp semaphore.
1415                          * The thread "tp" found to match the lwp lock's wchan
1416                          * is actually sleeping on the cv or semaphore which
1417                          * now has the same wchan. In this case, lwp_release()
1418                          * should return failure.
1419                          */
1420                         if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1421                                 ASSERT(sync_type == 0);
1422                                 /*
1423                                  * The assertion above reflects that, for
1424                                  * correctly written user programs, this can
1425                                  * happen only for mutexes (sync_type == 0).
1426                                  */
1427                                 disp_lock_exit(&sqh->sq_lock);
1428                                 return (0);
1429                         }
1430                         *waiters = iswanted(tp->t_link, lwpchan);
1431                         sleepq_unlink(tpp, tp);
1432                         DTRACE_SCHED1(wakeup, kthread_t *, tp);
1433                         tp->t_wchan0 = NULL;
1434                         tp->t_wchan = NULL;
1435                         tp->t_sobj_ops = NULL;
1436                         tp->t_release = 1;
1437                         THREAD_TRANSITION(tp);  /* drops sleepq lock */
1438                         CL_WAKEUP(tp);
1439                         thread_unlock(tp);      /* drop run queue lock */
1440                         return (1);
1441                 }
1442                 tpp = &tp->t_link;
1443         }
1444         *waiters = 0;
1445         disp_lock_exit(&sqh->sq_lock);
1446         return (0);
1447 }
1448 
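     /*
      * Wake up every thread sleeping on the given lwpchan, regardless of
      * sync type.  Callers include lwp_mutex_wakeup() (when release_all is
      * set) and lwp_cond_broadcast().
      */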
1449 static void
1450 lwp_release_all(lwpchan_t *lwpchan)
1451 {
1452         sleepq_head_t   *sqh;
1453         kthread_t *tp;
1454         kthread_t **tpp;
1455 
1456         sqh = lwpsqhash(lwpchan);
1457         disp_lock_enter(&sqh->sq_lock);          /* lock the sleep queue */
1458         tpp = &sqh->sq_queue.sq_first;
1459         while ((tp = *tpp) != NULL) {
1460                 if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1461                     tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1462                         sleepq_unlink(tpp, tp);
1463                         DTRACE_SCHED1(wakeup, kthread_t *, tp);
1464                         tp->t_wchan0 = NULL;
1465                         tp->t_wchan = NULL;
1466                         tp->t_sobj_ops = NULL;
1467                         CL_WAKEUP(tp);
1468                         thread_unlock_high(tp); /* release run queue lock */
1469                 } else {
1470                         tpp = &tp->t_link;
1471                 }
1472         }
1473         disp_lock_exit(&sqh->sq_lock);           /* drop sleep q lock */
1474 }
1475 
1476 /*
1477  * Unblock an lwp blocked trying to acquire this mutex; it resumes and
1478  * retries the lock.  If release_all is set, all blocked lwps are woken.
1479  */
1480 int
1481 lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
1482 {
1483         proc_t *p = ttoproc(curthread);
1484         lwpchan_t lwpchan;
1485         uchar_t waiters;
1486         volatile int locked = 0;
1487         volatile int watched = 0;
1488         volatile uint8_t type = 0;
1489         label_t ljb;
1490         int error = 0;
1491 
1492         if ((caddr_t)lp >= p->p_as->a_userlimit)
1493                 return (set_errno(EFAULT));
1494 
1495         watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1496 
1497         if (on_fault(&ljb)) {
1498                 if (locked)
1499                         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1500                 error = EFAULT;
1501                 goto out;
1502         }
1503         /*
1504          * Force Copy-on-write if necessary and ensure that the
1505          * synchronization object resides in read/write memory.
1506          * Cause an EFAULT return now if this is not so.
1507          */
1508         fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1509         suword8_noerr(&lp->mutex_type, type);
1510         if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1511             &lwpchan, LWPCHAN_MPPOOL)) {
1512                 error = EFAULT;
1513                 goto out;
1514         }
1515         lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1516         locked = 1;
1517         /*
1518          * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1519          * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1520          * may fail.  If it fails, do not write into the waiter bit.
1521          * The call to lwp_release() might fail due to one of three reasons:
1522          *
1523          *      1. due to the thread which set the waiter bit not actually
1524          *         sleeping since it got the lock on the re-try. The waiter
1525          *         bit will then be correctly updated by that thread. This
1526          *         window may be closed by reading the wait bit again here
1527          *         and not calling lwp_release() at all if it is zero.
1528          *      2. the thread which set the waiter bit and went to sleep
1529          *         was woken up by a signal.  In this case the waiter
1530          *         recomputes the waiters bit on its EINTR return path.
1531          *      3. the waiter bit read by lwp_mutex_wakeup() was in
1532          *         memory that has been re-used after the lock was dropped.
1533          *         In this case, writing into the waiter bit would cause data
1534          *         corruption.
1535          */
1536         if (release_all)
1537                 lwp_release_all(&lwpchan);
1538         else if (lwp_release(&lwpchan, &waiters, 0))
1539                 suword8_noerr(&lp->mutex_waiters, waiters);
1540         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1541 out:
1542         no_fault();
1543         if (watched)
1544                 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1545         if (error)
1546                 return (set_errno(error));
1547         return (0);
1548 }
1549 
1550 /*
1551  * lwp_cond_wait() has four arguments, a pointer to a condition variable,
1552  * a pointer to a mutex, a pointer to a timespec for a timed wait and
1553  * a flag telling the kernel whether or not to honor the kernel/user
1554  * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
1555  * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
1556  * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
1557  * it is used as an in/out parameter.  On entry, it contains the relative
1558  * time until timeout.  On exit, we copyout the residual time left to it.
1559  */
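     /*
      * Return protocol, as seen by the user-level caller: 0 after a wakeup by
      * lwp_cond_signal()/lwp_cond_broadcast(), EINTR if interrupted by a
      * signal or forced to return, ETIME if the timeout expired.  When
      * check_park is set, the residual timeout is copied back to *tsp.  In
      * every case the mutex is re-acquired by the caller at user level.
      */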
1560 int
1561 lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
1562 {
1563         kthread_t *t = curthread;
1564         klwp_t *lwp = ttolwp(t);
1565         proc_t *p = ttoproc(t);
1566         lwp_timer_t lwpt;
1567         lwpchan_t cv_lwpchan;
1568         lwpchan_t m_lwpchan;
1569         caddr_t timedwait;
1570         volatile uint16_t type = 0;
1571         volatile uint8_t mtype = 0;
1572         uchar_t waiters;
1573         volatile int error;
1574         clock_t tim = -1;
1575         volatile int locked = 0;
1576         volatile int m_locked = 0;
1577         volatile int cvwatched = 0;
1578         volatile int mpwatched = 0;
1579         label_t ljb;
1580         volatile int no_lwpchan = 1;
1581         int imm_timeout = 0;
1582         int imm_unpark = 0;
1583 
1584         if ((caddr_t)cv >= p->p_as->a_userlimit ||
1585             (caddr_t)mp >= p->p_as->a_userlimit)
1586                 return (set_errno(EFAULT));
1587 
1588         /*
1589          * Put the lwp in an orderly state for debugging,
1590          * in case we are stopped while sleeping, below.
1591          */
1592         prstop(PR_REQUESTED, 0);
1593 
1594         timedwait = (caddr_t)tsp;
1595         if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
1596                 return (set_errno(error));
1597         if (lwpt.lwpt_imm_timeout) {
1598                 imm_timeout = 1;
1599                 timedwait = NULL;
1600         }
1601 
1602         (void) new_mstate(t, LMS_USER_LOCK);
1603 
1604         if (on_fault(&ljb)) {
1605                 if (no_lwpchan) {
1606                         error = EFAULT;
1607                         goto out;
1608                 }
1609                 if (m_locked) {
1610                         m_locked = 0;
1611                         lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1612                 }
1613                 if (locked) {
1614                         locked = 0;
1615                         lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1616                 }
1617                 /*
1618                  * set up another on_fault() for a possible fault
1619                  * on the user lock accessed at "efault"
1620                  */
1621                 if (on_fault(&ljb)) {
1622                         if (m_locked) {
1623                                 m_locked = 0;
1624                                 lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1625                         }
1626                         goto out;
1627                 }
1628                 error = EFAULT;
1629                 goto efault;
1630         }
1631 
1632         /*
1633          * Force Copy-on-write if necessary and ensure that the
1634          * synchronization object resides in read/write memory.
1635          * Cause an EFAULT return now if this is not so.
1636          */
1637         fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
1638         suword8_noerr(&mp->mutex_type, mtype);
1639         if (UPIMUTEX(mtype) == 0) {
1640                 /* convert user level mutex, "mp", to a unique lwpchan */
1641                 /* check if mtype is ok to use below, instead of type from cv */
1642                 if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
1643                     &m_lwpchan, LWPCHAN_MPPOOL)) {
1644                         error = EFAULT;
1645                         goto out;
1646                 }
1647         }
1648         fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1649         suword16_noerr(&cv->cond_type, type);
1650         /* convert user level condition variable, "cv", to a unique lwpchan */
1651         if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
1652             &cv_lwpchan, LWPCHAN_CVPOOL)) {
1653                 error = EFAULT;
1654                 goto out;
1655         }
1656         no_lwpchan = 0;
1657         cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1658         if (UPIMUTEX(mtype) == 0)
1659                 mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
1660                     S_WRITE);
1661 
1662         /*
1663          * lwpchan_lock ensures that the calling lwp is put to sleep atomically
1664          * with respect to a possible wakeup which is a result of either
1665          * an lwp_cond_signal() or an lwp_cond_broadcast().
1666          *
1667          * What's misleading is that the lwp is put to sleep after the
1668          * condition variable's mutex is released.  This is OK as long as
1669          * the release operation is also done while holding lwpchan_lock.
1670          * The lwp is then put to sleep when the possibility of pagefaulting
1671          * or sleeping is completely eliminated.
1672          */
1673         lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
1674         locked = 1;
1675         if (UPIMUTEX(mtype) == 0) {
1676                 lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1677                 m_locked = 1;
1678                 suword8_noerr(&cv->cond_waiters_kernel, 1);
1679                 /*
1680                  * unlock the condition variable's mutex. (pagefaults are
1681                  * possible here.)
1682                  */
1683                 set_owner_pid(mp, 0, 0);
1684                 ulock_clear(&mp->mutex_lockw);
1685                 fuword8_noerr(&mp->mutex_waiters, &waiters);
1686                 if (waiters != 0) {
1687                         /*
1688                          * Given the locking of lwpchan_lock around the release
1689                          * of the mutex and checking for waiters, the following
1690                          * call to lwp_release() can fail ONLY if the lock
1691                          * acquirer is interrupted after setting the waiter bit,
1692                          * calling lwp_block() and releasing lwpchan_lock.
1693                          * In this case, it could get pulled off the lwp sleep
1694                          * q (via setrun()) before the following call to
1695                          * lwp_release() occurs. In this case, the lock
1696                          * requestor will update the waiter bit correctly by
1697                          * re-evaluating it.
1698                          */
1699                         if (lwp_release(&m_lwpchan, &waiters, 0))
1700                                 suword8_noerr(&mp->mutex_waiters, waiters);
1701                 }
1702                 m_locked = 0;
1703                 lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1704         } else {
1705                 suword8_noerr(&cv->cond_waiters_kernel, 1);
1706                 error = lwp_upimutex_unlock(mp, mtype);
1707                 if (error) {    /* if the upimutex unlock failed */
1708                         locked = 0;
1709                         lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1710                         goto out;
1711                 }
1712         }
1713         no_fault();
1714 
1715         if (mpwatched) {
1716                 watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1717                 mpwatched = 0;
1718         }
1719         if (cvwatched) {
1720                 watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1721                 cvwatched = 0;
1722         }
1723 
1724         if (check_park && (!schedctl_is_park() || t->t_unpark)) {
1725                 /*
1726                  * We received a signal at user-level before calling here
1727                  * or another thread wants us to return immediately
1728                  * with EINTR.  See lwp_unpark().
1729                  */
1730                 imm_unpark = 1;
1731                 t->t_unpark = 0;
1732                 timedwait = NULL;
1733         } else if (timedwait) {
1734                 /*
1735                  * If we successfully queue the timeout,
1736                  * then don't drop t_delay_lock until
1737                  * we are on the sleep queue (below).
1738                  */
1739                 mutex_enter(&t->t_delay_lock);
1740                 if (lwp_timer_enqueue(&lwpt) != 0) {
1741                         mutex_exit(&t->t_delay_lock);
1742                         imm_timeout = 1;
1743                         timedwait = NULL;
1744                 }
1745         }
1746         t->t_flag |= T_WAITCVSEM;
1747         lwp_block(&cv_lwpchan);
1748         /*
1749          * Nothing should happen to cause the lwp to go to sleep
1750          * until after it returns from swtch().
1751          */
1752         if (timedwait)
1753                 mutex_exit(&t->t_delay_lock);
1754         locked = 0;
1755         lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1756         if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
1757             (imm_timeout | imm_unpark))
1758                 setrun(t);
1759         swtch();
1760         t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
1761         if (timedwait)
1762                 tim = lwp_timer_dequeue(&lwpt);
1763         if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
1764             MUSTRETURN(p, t) || imm_unpark)
1765                 error = EINTR;
1766         else if (imm_timeout || (timedwait && tim == -1))
1767                 error = ETIME;
1768         lwp->lwp_asleep = 0;
1769         lwp->lwp_sysabort = 0;
1770         setallwatch();
1771 
1772         if (t->t_mstate == LMS_USER_LOCK)
1773                 (void) new_mstate(t, LMS_SYSTEM);
1774 
1775         if (tsp && check_park)          /* copyout the residual time left */
1776                 error = lwp_timer_copyout(&lwpt, error);
1777 
1778         /* the mutex is reacquired by the caller on return to user level */
1779         if (error) {
1780                 /*
1781                  * If we were concurrently lwp_cond_signal()d and we
1782                  * received a UNIX signal or got a timeout, then perform
1783                  * another lwp_cond_signal() to avoid consuming the wakeup.
1784                  */
1785                 if (t->t_release)
1786                         (void) lwp_cond_signal(cv);
1787                 return (set_errno(error));
1788         }
1789         return (0);
1790 
1791 efault:
1792         /*
1793          * make sure that the user level lock is dropped before
1794          * returning to caller, since the caller always re-acquires it.
1795          */
1796         if (UPIMUTEX(mtype) == 0) {
1797                 lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1798                 m_locked = 1;
1799                 set_owner_pid(mp, 0, 0);
1800                 ulock_clear(&mp->mutex_lockw);
1801                 fuword8_noerr(&mp->mutex_waiters, &waiters);
1802                 if (waiters != 0) {
1803                         /*
1804                          * See comment above on lock clearing and lwp_release()
1805                          * success/failure.
1806                          */
1807                         if (lwp_release(&m_lwpchan, &waiters, 0))
1808                                 suword8_noerr(&mp->mutex_waiters, waiters);
1809                 }
1810                 m_locked = 0;
1811                 lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1812         } else {
1813                 (void) lwp_upimutex_unlock(mp, mtype);
1814         }
1815 out:
1816         no_fault();
1817         if (mpwatched)
1818                 watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1819         if (cvwatched)
1820                 watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1821         if (t->t_mstate == LMS_USER_LOCK)
1822                 (void) new_mstate(t, LMS_SYSTEM);
1823         return (set_errno(error));
1824 }
1825 
1826 /*
1827  * Wake up one lwp that's blocked on this condition variable.
1828  */
1829 int
1830 lwp_cond_signal(lwp_cond_t *cv)
1831 {
1832         proc_t *p = ttoproc(curthread);
1833         lwpchan_t lwpchan;
1834         uchar_t waiters;
1835         volatile uint16_t type = 0;
1836         volatile int locked = 0;
1837         volatile int watched = 0;
1838         label_t ljb;
1839         int error = 0;
1840 
1841         if ((caddr_t)cv >= p->p_as->a_userlimit)
1842                 return (set_errno(EFAULT));
1843 
1844         watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1845 
1846         if (on_fault(&ljb)) {
1847                 if (locked)
1848                         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1849                 error = EFAULT;
1850                 goto out;
1851         }
1852         /*
1853          * Force Copy-on-write if necessary and ensure that the
1854          * synchronization object resides in read/write memory.
1855          * Cause an EFAULT return now if this is not so.
1856          */
1857         fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1858         suword16_noerr(&cv->cond_type, type);
1859         if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1860             &lwpchan, LWPCHAN_CVPOOL)) {
1861                 error = EFAULT;
1862                 goto out;
1863         }
1864         lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1865         locked = 1;
1866         fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1867         if (waiters != 0) {
1868                 /*
1869                  * The following call to lwp_release() might fail but it is
1870                  * OK to write into the waiters bit below, since the memory
1871                  * could not have been re-used or unmapped (for correctly
1872                  * written user programs) as in the case of lwp_mutex_wakeup().
1873                  * For an incorrect program, we should not care about data
1874                  * corruption since this is just one instance of other places
1875                  * where corruption can occur for such a program. Of course
1876                  * if the memory is unmapped, normal fault recovery occurs.
1877                  */
1878                 (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1879                 suword8_noerr(&cv->cond_waiters_kernel, waiters);
1880         }
1881         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1882 out:
1883         no_fault();
1884         if (watched)
1885                 watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1886         if (error)
1887                 return (set_errno(error));
1888         return (0);
1889 }
1890 
1891 /*
1892  * Wake up every lwp that's blocked on this condition variable.
1893  */
1894 int
1895 lwp_cond_broadcast(lwp_cond_t *cv)
1896 {
1897         proc_t *p = ttoproc(curthread);
1898         lwpchan_t lwpchan;
1899         volatile uint16_t type = 0;
1900         volatile int locked = 0;
1901         volatile int watched = 0;
1902         label_t ljb;
1903         uchar_t waiters;
1904         int error = 0;
1905 
1906         if ((caddr_t)cv >= p->p_as->a_userlimit)
1907                 return (set_errno(EFAULT));
1908 
1909         watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1910 
1911         if (on_fault(&ljb)) {
1912                 if (locked)
1913                         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1914                 error = EFAULT;
1915                 goto out;
1916         }
1917         /*
1918          * Force Copy-on-write if necessary and ensure that the
1919          * synchronization object resides in read/write memory.
1920          * Cause an EFAULT return now if this is not so.
1921          */
1922         fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1923         suword16_noerr(&cv->cond_type, type);
1924         if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1925             &lwpchan, LWPCHAN_CVPOOL)) {
1926                 error = EFAULT;
1927                 goto out;
1928         }
1929         lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1930         locked = 1;
1931         fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1932         if (waiters != 0) {
1933                 lwp_release_all(&lwpchan);
1934                 suword8_noerr(&cv->cond_waiters_kernel, 0);
1935         }
1936         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1937 out:
1938         no_fault();
1939         if (watched)
1940                 watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1941         if (error)
1942                 return (set_errno(error));
1943         return (0);
1944 }
1945 
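     /*
      * Non-blocking semaphore "P" operation: fail with EBUSY if sema_count
      * is zero, otherwise decrement it (and wake another waiter if the count
      * is still non-zero afterwards).
      */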
1946 int
1947 lwp_sema_trywait(lwp_sema_t *sp)
1948 {
1949         kthread_t *t = curthread;
1950         proc_t *p = ttoproc(t);
1951         label_t ljb;
1952         volatile int locked = 0;
1953         volatile int watched = 0;
1954         volatile uint16_t type = 0;
1955         int count;
1956         lwpchan_t lwpchan;
1957         uchar_t waiters;
1958         int error = 0;
1959 
1960         if ((caddr_t)sp >= p->p_as->a_userlimit)
1961                 return (set_errno(EFAULT));
1962 
1963         watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1964 
1965         if (on_fault(&ljb)) {
1966                 if (locked)
1967                         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1968                 error = EFAULT;
1969                 goto out;
1970         }
1971         /*
1972          * Force Copy-on-write if necessary and ensure that the
1973          * synchronization object resides in read/write memory.
1974          * Cause an EFAULT return now if this is not so.
1975          */
1976         fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1977         suword16_noerr((void *)&sp->sema_type, type);
1978         if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1979             &lwpchan, LWPCHAN_CVPOOL)) {
1980                 error = EFAULT;
1981                 goto out;
1982         }
1983         lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1984         locked = 1;
1985         fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
1986         if (count == 0)
1987                 error = EBUSY;
1988         else
1989                 suword32_noerr((void *)&sp->sema_count, --count);
1990         if (count != 0) {
1991                 fuword8_noerr(&sp->sema_waiters, &waiters);
1992                 if (waiters != 0) {
1993                         (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1994                         suword8_noerr(&sp->sema_waiters, waiters);
1995                 }
1996         }
1997         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1998 out:
1999         no_fault();
2000         if (watched)
2001                 watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2002         if (error)
2003                 return (set_errno(error));
2004         return (0);
2005 }
2006 
2007 /*
2008  * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
2009  */
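     /*
      * Blocking (optionally timed) semaphore "P" operation: sleep until
      * sema_count is non-zero, then decrement it.  Returns EINTR, ETIME or
      * the deferred timer-validation error as appropriate.
      */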
2010 int
2011 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
2012 {
2013         kthread_t *t = curthread;
2014         klwp_t *lwp = ttolwp(t);
2015         proc_t *p = ttoproc(t);
2016         lwp_timer_t lwpt;
2017         caddr_t timedwait;
2018         clock_t tim = -1;
2019         label_t ljb;
2020         volatile int locked = 0;
2021         volatile int watched = 0;
2022         volatile uint16_t type = 0;
2023         int count;
2024         lwpchan_t lwpchan;
2025         uchar_t waiters;
2026         int error = 0;
2027         int time_error;
2028         int imm_timeout = 0;
2029         int imm_unpark = 0;
2030 
2031         if ((caddr_t)sp >= p->p_as->a_userlimit)
2032                 return (set_errno(EFAULT));
2033 
2034         /*
2035          * Put the lwp in an orderly state for debugging,
2036          * in case we are stopped while sleeping, below.
2037          */
2038         prstop(PR_REQUESTED, 0);
2039 
2040         timedwait = (caddr_t)tsp;
2041         if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2042             lwpt.lwpt_imm_timeout) {
2043                 imm_timeout = 1;
2044                 timedwait = NULL;
2045         }
2046 
2047         watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2048 
2049         if (on_fault(&ljb)) {
2050                 if (locked)
2051                         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2052                 error = EFAULT;
2053                 goto out;
2054         }
2055         /*
2056          * Force Copy-on-write if necessary and ensure that the
2057          * synchronization object resides in read/write memory.
2058          * Cause an EFAULT return now if this is not so.
2059          */
2060         fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2061         suword16_noerr((void *)&sp->sema_type, type);
2062         if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2063             &lwpchan, LWPCHAN_CVPOOL)) {
2064                 error = EFAULT;
2065                 goto out;
2066         }
2067         lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2068         locked = 1;
2069         fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2070         while (error == 0 && count == 0) {
2071                 if (time_error) {
2072                         /*
2073                          * The SUSV3 POSIX spec is very clear that we
2074                          * should get no error from validating the
2075                          * timer until we would actually sleep.
2076                          */
2077                         error = time_error;
2078                         break;
2079                 }
2080                 suword8_noerr(&sp->sema_waiters, 1);
2081                 if (watched)
2082                         watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2083                 if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2084                         /*
2085                          * We received a signal at user-level before calling
2086                          * here or another thread wants us to return
2087                          * immediately with EINTR.  See lwp_unpark().
2088                          */
2089                         imm_unpark = 1;
2090                         t->t_unpark = 0;
2091                         timedwait = NULL;
2092                 } else if (timedwait) {
2093                         /*
2094                          * If we successfully queue the timeout,
2095                          * then don't drop t_delay_lock until
2096                          * we are on the sleep queue (below).
2097                          */
2098                         mutex_enter(&t->t_delay_lock);
2099                         if (lwp_timer_enqueue(&lwpt) != 0) {
2100                                 mutex_exit(&t->t_delay_lock);
2101                                 imm_timeout = 1;
2102                                 timedwait = NULL;
2103                         }
2104                 }
2105                 t->t_flag |= T_WAITCVSEM;
2106                 lwp_block(&lwpchan);
2107                 /*
2108                  * Nothing should happen to cause the lwp to sleep
2109                  * again until after it returns from swtch().
2110                  */
2111                 if (timedwait)
2112                         mutex_exit(&t->t_delay_lock);
2113                 locked = 0;
2114                 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2115                 if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2116                     (imm_timeout | imm_unpark))
2117                         setrun(t);
2118                 swtch();
2119                 t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2120                 if (timedwait)
2121                         tim = lwp_timer_dequeue(&lwpt);
2122                 setallwatch();
2123                 if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2124                     MUSTRETURN(p, t) || imm_unpark)
2125                         error = EINTR;
2126                 else if (imm_timeout || (timedwait && tim == -1))
2127                         error = ETIME;
2128                 lwp->lwp_asleep = 0;
2129                 lwp->lwp_sysabort = 0;
2130                 watched = watch_disable_addr((caddr_t)sp,
2131                     sizeof (*sp), S_WRITE);
2132                 lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2133                 locked = 1;
2134                 fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2135         }
2136         if (error == 0)
2137                 suword32_noerr((void *)&sp->sema_count, --count);
2138         if (count != 0) {
2139                 (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2140                 suword8_noerr(&sp->sema_waiters, waiters);
2141         }
2142         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2143 out:
2144         no_fault();
2145         if (watched)
2146                 watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2147         if (tsp && check_park && !time_error)
2148                 error = lwp_timer_copyout(&lwpt, error);
2149         if (error)
2150                 return (set_errno(error));
2151         return (0);
2152 }
2153 
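     /*
      * Semaphore "V" operation: increment sema_count, failing with EOVERFLOW
      * at _SEM_VALUE_MAX, and wake one waiter when the count goes from zero
      * to one.
      */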
2154 int
2155 lwp_sema_post(lwp_sema_t *sp)
2156 {
2157         proc_t *p = ttoproc(curthread);
2158         label_t ljb;
2159         volatile int locked = 0;
2160         volatile int watched = 0;
2161         volatile uint16_t type = 0;
2162         int count;
2163         lwpchan_t lwpchan;
2164         uchar_t waiters;
2165         int error = 0;
2166 
2167         if ((caddr_t)sp >= p->p_as->a_userlimit)
2168                 return (set_errno(EFAULT));
2169 
2170         watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2171 
2172         if (on_fault(&ljb)) {
2173                 if (locked)
2174                         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2175                 error = EFAULT;
2176                 goto out;
2177         }
2178         /*
2179          * Force Copy-on-write if necessary and ensure that the
2180          * synchronization object resides in read/write memory.
2181          * Cause an EFAULT return now if this is not so.
2182          */
2183         fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2184         suword16_noerr(&sp->sema_type, type);
2185         if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2186             &lwpchan, LWPCHAN_CVPOOL)) {
2187                 error = EFAULT;
2188                 goto out;
2189         }
2190         lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2191         locked = 1;
2192         fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2193         if (count == _SEM_VALUE_MAX)
2194                 error = EOVERFLOW;
2195         else
2196                 suword32_noerr(&sp->sema_count, ++count);
2197         if (count == 1) {
2198                 fuword8_noerr(&sp->sema_waiters, &waiters);
2199                 if (waiters) {
2200                         (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2201                         suword8_noerr(&sp->sema_waiters, waiters);
2202                 }
2203         }
2204         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2205 out:
2206         no_fault();
2207         if (watched)
2208                 watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2209         if (error)
2210                 return (set_errno(error));
2211         return (0);
2212 }
2213 
2214 #define TRW_WANT_WRITE          0x1
2215 #define TRW_LOCK_GRANTED        0x2
2216 
2217 #define READ_LOCK               0
2218 #define WRITE_LOCK              1
2219 #define TRY_FLAG                0x10
2220 #define READ_LOCK_TRY           (READ_LOCK | TRY_FLAG)
2221 #define WRITE_LOCK_TRY          (WRITE_LOCK | TRY_FLAG)
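
     /*
      * The user-level rwlock state word (rwlock_readers) packs the count of
      * active readers into its low-order bits together with the
      * URW_WRITE_LOCKED and URW_HAS_WAITERS flag bits (the URW_* values are
      * defined in sys/synch32.h).
      */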
2222 
2223 /*
2224  * Release one writer or one or more readers. Compute the rwstate word to
2225  * reflect the new state of the queue. For a safe hand-off we copy the new
2226  * rwstate value back to userland before we wake any of the new lock holders.
2227  *
2228  * Note that sleepq_insert() implements a prioritized FIFO (with writers
2229  * being given precedence over readers of the same priority).
2230  *
2231  * If the first thread is a reader we scan the queue releasing all readers
2232  * until we hit a writer or the end of the queue. If the first thread is a
2233  * writer we still need to check for another writer.
2234  */
2235 void
2236 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2237 {
2238         sleepq_head_t *sqh;
2239         kthread_t *tp;
2240         kthread_t **tpp;
2241         kthread_t *tpnext;
2242         kthread_t *wakelist = NULL;
2243         uint32_t rwstate = 0;
2244         int wcount = 0;
2245         int rcount = 0;
2246 
2247         sqh = lwpsqhash(lwpchan);
2248         disp_lock_enter(&sqh->sq_lock);
2249         tpp = &sqh->sq_queue.sq_first;
2250         while ((tp = *tpp) != NULL) {
2251                 if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2252                     tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2253                         if (tp->t_writer & TRW_WANT_WRITE) {
2254                                 if ((wcount++ == 0) && (rcount == 0)) {
2255                                         rwstate |= URW_WRITE_LOCKED;
2256 
2257                                         /* Just one writer to wake. */
2258                                         sleepq_unlink(tpp, tp);
2259                                         wakelist = tp;
2260 
2261                                         /* tpp already set for next thread. */
2262                                         continue;
2263                                 } else {
2264                                         rwstate |= URW_HAS_WAITERS;
2265                                         /* We need look no further. */
2266                                         break;
2267                                 }
2268                         } else {
2269                                 rcount++;
2270                                 if (wcount == 0) {
2271                                         rwstate++;
2272 
2273                                         /* Add reader to wake list. */
2274                                         sleepq_unlink(tpp, tp);
2275                                         tp->t_link = wakelist;
2276                                         wakelist = tp;
2277 
2278                                         /* tpp already set for next thread. */
2279                                         continue;
2280                                 } else {
2281                                         rwstate |= URW_HAS_WAITERS;
2282                                         /* We need look no further. */
2283                                         break;
2284                                 }
2285                         }
2286                 }
2287                 tpp = &tp->t_link;
2288         }
2289 
2290         /* Copy the new rwstate back to userland. */
2291         suword32_noerr(&rw->rwlock_readers, rwstate);
2292 
2293         /* Wake the new lock holder(s) up. */
2294         tp = wakelist;
2295         while (tp != NULL) {
2296                 DTRACE_SCHED1(wakeup, kthread_t *, tp);
2297                 tp->t_wchan0 = NULL;
2298                 tp->t_wchan = NULL;
2299                 tp->t_sobj_ops = NULL;
2300                 tp->t_writer |= TRW_LOCK_GRANTED;
2301                 tpnext = tp->t_link;
2302                 tp->t_link = NULL;
2303                 CL_WAKEUP(tp);
2304                 thread_unlock_high(tp);
2305                 tp = tpnext;
2306         }
2307 
2308         disp_lock_exit(&sqh->sq_lock);
2309 }
2310 
2311 /*
2312  * We enter here holding the user-level mutex, which we must release before
2313  * returning or blocking. Based on lwp_cond_wait().
2314  */
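     /*
      * The rd_wr argument encodes the request: READ_LOCK or WRITE_LOCK,
      * optionally or-ed with TRY_FLAG for the non-blocking (trylock)
      * variants.
      */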
2315 static int
2316 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2317 {
2318         lwp_mutex_t *mp = NULL;
2319         kthread_t *t = curthread;
2320         kthread_t *tp;
2321         klwp_t *lwp = ttolwp(t);
2322         proc_t *p = ttoproc(t);
2323         lwp_timer_t lwpt;
2324         lwpchan_t lwpchan;
2325         lwpchan_t mlwpchan;
2326         caddr_t timedwait;
2327         volatile uint16_t type = 0;
2328         volatile uint8_t mtype = 0;
2329         uchar_t mwaiters;
2330         volatile int error = 0;
2331         int time_error;
2332         clock_t tim = -1;
2333         volatile int locked = 0;
2334         volatile int mlocked = 0;
2335         volatile int watched = 0;
2336         volatile int mwatched = 0;
2337         label_t ljb;
2338         volatile int no_lwpchan = 1;
2339         int imm_timeout = 0;
2340         int try_flag;
2341         uint32_t rwstate;
2342         int acquired = 0;
2343 
2344         /* We only check rw because the mutex is included in it. */
2345         if ((caddr_t)rw >= p->p_as->a_userlimit)
2346                 return (set_errno(EFAULT));
2347 
2348         /*
2349          * Put the lwp in an orderly state for debugging,
2350          * in case we are stopped while sleeping, below.
2351          */
2352         prstop(PR_REQUESTED, 0);
2353 
2354         /* We must only report this error if we are about to sleep (later). */
2355         timedwait = (caddr_t)tsp;
2356         if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2357             lwpt.lwpt_imm_timeout) {
2358                 imm_timeout = 1;
2359                 timedwait = NULL;
2360         }
2361 
2362         (void) new_mstate(t, LMS_USER_LOCK);
2363 
2364         if (on_fault(&ljb)) {
2365                 if (no_lwpchan) {
2366                         error = EFAULT;
2367                         goto out_nodrop;
2368                 }
2369                 if (mlocked) {
2370                         mlocked = 0;
2371                         lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2372                 }
2373                 if (locked) {
2374                         locked = 0;
2375                         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2376                 }
2377                 /*
2378                  * Set up another on_fault() for a possible fault
2379                  * on the user lock accessed at "out_drop".
2380                  */
2381                 if (on_fault(&ljb)) {
2382                         if (mlocked) {
2383                                 mlocked = 0;
2384                                 lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2385                         }
2386                         error = EFAULT;
2387                         goto out_nodrop;
2388                 }
2389                 error = EFAULT;
2390                 goto out_nodrop;
2391         }
2392 
2393         /* Process rd_wr (including sanity check). */
2394         try_flag = (rd_wr & TRY_FLAG);
2395         rd_wr &= ~TRY_FLAG;
2396         if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2397                 error = EINVAL;
2398                 goto out_nodrop;
2399         }
2400 
2401         /*
2402          * Force Copy-on-write if necessary and ensure that the
2403          * synchronization object resides in read/write memory.
2404          * Cause an EFAULT return now if this is not so.
2405          */
2406         mp = &rw->mutex;
2407         fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2408         fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2409         suword8_noerr(&mp->mutex_type, mtype);
2410         suword16_noerr(&rw->rwlock_type, type);
2411 
2412         /* We can only continue for simple USYNC_PROCESS locks. */
2413         if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2414                 error = EINVAL;
2415                 goto out_nodrop;
2416         }
2417 
2418         /* Convert user level mutex, "mp", to a unique lwpchan. */
2419         if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2420             &mlwpchan, LWPCHAN_MPPOOL)) {
2421                 error = EFAULT;
2422                 goto out_nodrop;
2423         }
2424 
2425         /* Convert user level rwlock, "rw", to a unique lwpchan. */
2426         if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2427             &lwpchan, LWPCHAN_CVPOOL)) {
2428                 error = EFAULT;
2429                 goto out_nodrop;
2430         }
2431 
2432         no_lwpchan = 0;
2433         watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2434         mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2435 
2436         /*
2437          * lwpchan_lock() ensures that the calling LWP is put to sleep
2438          * atomically with respect to a possible wakeup which is a result
2439          * of lwp_rwlock_unlock().
2440          *
2441          * What's misleading is that the LWP is put to sleep after the
2442          * rwlock's mutex is released. This is OK as long as the release
2443          * operation is also done while holding mlwpchan. The LWP is then
2444          * put to sleep when the possibility of pagefaulting or sleeping
2445          * has been completely eliminated.
2446          */
2447         lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2448         locked = 1;
2449         lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2450         mlocked = 1;
2451 
2452         /*
2453          * Fetch the current rwlock state.
2454          *
2455          * The possibility of spurious wake-ups or killed waiters means
2456          * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2457          * We only fix these if they are important to us.
2458          *
2459          * Although various error states can be observed here (e.g. the lock
2460          * is not held, but there are waiters) we assume these are application
2461          * errors and so we take no corrective action.
2462          */
2463         fuword32_noerr(&rw->rwlock_readers, &rwstate);
2464         /*
2465          * We cannot legitimately get here from user-level
2466          * without URW_HAS_WAITERS being set.
2467          * Set it now to guard against user-level error.
2468          */
2469         rwstate |= URW_HAS_WAITERS;
2470 
2471         /*
2472          * We can try only if the lock isn't held by a writer.
2473          */
2474         if (!(rwstate & URW_WRITE_LOCKED)) {
2475                 tp = lwp_queue_waiter(&lwpchan);
2476                 if (tp == NULL) {
2477                         /*
2478                          * Hmmm, rwstate indicates waiters but there are
2479                          * none queued. This could just be the result of a
2480                          * spurious wakeup, so let's ignore it.
2481                          *
2482                          * We now have a chance to acquire the lock
2483                          * uncontended, but this is the last chance for
2484                          * a writer to acquire the lock without blocking.
2485                          */
2486                         if (rd_wr == READ_LOCK) {
2487                                 rwstate++;
2488                                 acquired = 1;
2489                         } else if ((rwstate & URW_READERS_MASK) == 0) {
2490                                 rwstate |= URW_WRITE_LOCKED;
2491                                 acquired = 1;
2492                         }
2493                 } else if (rd_wr == READ_LOCK) {
2494                         /*
2495                          * This is the last chance for a reader to acquire
2496                          * the lock now, but it can only do so if there is
2497                          * no writer of equal or greater priority at the
2498                          * head of the queue.
2499                          *
2500                          * It is also just possible that there is a reader
2501                          * at the head of the queue. This may be the result
2502                          * of a spurious wakeup or an application failure.
2503                          * In this case we only acquire the lock if we have
2504                          * equal or greater priority. It is not our job to
2505                          * release spurious waiters.
2506                          */
2507                         pri_t our_pri = DISP_PRIO(t);
2508                         pri_t his_pri = DISP_PRIO(tp);
2509 
2510                         if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2511                             !(tp->t_writer & TRW_WANT_WRITE))) {
2512                                 rwstate++;
2513                                 acquired = 1;
2514                         }
2515                 }
2516         }
2517 
2518         if (acquired || try_flag || time_error) {
2519                 /*
2520                  * We're not going to block this time.
2521                  */
2522                 suword32_noerr(&rw->rwlock_readers, rwstate);
2523                 lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2524                 locked = 0;
2525 
2526                 if (acquired) {
2527                         /*
2528                          * Got the lock!
2529                          */
2530                         error = 0;
2531 
2532                 } else if (try_flag) {
2533                         /*
2534                          * We didn't get the lock and we're about to block.
2535                          * If we're doing a trylock, return EBUSY instead.
2536                          */
2537                         error = EBUSY;
2538 
2539                 } else if (time_error) {
2540                         /*
2541                          * The SUSV3 POSIX spec is very clear that we should
2542                          * get no error from validating the timer (above)
2543                          * until we would actually sleep.
2544                          */
2545                         error = time_error;
2546                 }
2547 
2548                 goto out_drop;
2549         }
2550 
2551         /*
2552          * We're about to block, so indicate what kind of waiter we are.
2553          */
2554         t->t_writer = 0;
2555         if (rd_wr == WRITE_LOCK)
2556                 t->t_writer = TRW_WANT_WRITE;
2557         suword32_noerr(&rw->rwlock_readers, rwstate);
2558 
2559         /*
2560          * Unlock the rwlock's mutex (pagefaults are possible here).
2561          */
2562         set_owner_pid(mp, 0, 0);
2563         ulock_clear(&mp->mutex_lockw);
2564         fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2565         if (mwaiters != 0) {
2566                 /*
2567                  * Given the locking of mlwpchan around the release of
2568                  * the mutex and checking for waiters, the following
2569                  * call to lwp_release() can fail ONLY if the lock
2570                  * acquirer is interrupted after setting the waiter bit,
2571                  * calling lwp_block() and releasing mlwpchan.
2572                  * In this case, it could get pulled off the LWP sleep
2573                  * queue (via setrun()) before the following call to
2574                  * lwp_release() occurs, and the lock requestor will
2575                  * update the waiter bit correctly by re-evaluating it.
2576                  */
2577                 if (lwp_release(&mlwpchan, &mwaiters, 0))
2578                         suword8_noerr(&mp->mutex_waiters, mwaiters);
2579         }
2580         lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2581         mlocked = 0;
2582         no_fault();
2583 
2584         if (mwatched) {
2585                 watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2586                 mwatched = 0;
2587         }
2588         if (watched) {
2589                 watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2590                 watched = 0;
2591         }
2592 
2593         if (timedwait) {
2594                 /*
2595                  * If we successfully queue the timeout,
2596                  * then don't drop t_delay_lock until
2597                  * we are on the sleep queue (below).
2598                  */
2599                 mutex_enter(&t->t_delay_lock);
2600                 if (lwp_timer_enqueue(&lwpt) != 0) {
2601                         mutex_exit(&t->t_delay_lock);
2602                         imm_timeout = 1;
2603                         timedwait = NULL;
2604                 }
2605         }
2606         t->t_flag |= T_WAITCVSEM;
2607         lwp_block(&lwpchan);
2608 
2609         /*
2610          * Nothing should happen to cause the LWP to go to sleep until after
2611          * it returns from swtch().
2612          */
2613         if (timedwait)
2614                 mutex_exit(&t->t_delay_lock);
2615         locked = 0;
2616         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2617         if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
2618                 setrun(t);
2619         swtch();
2620 
2621         /*
2622          * We're back, but we need to work out why.  Were we interrupted?
2623          * Did we time out?  Were we granted the lock?
2624          */
2625         error = EAGAIN;
2626         acquired = (t->t_writer & TRW_LOCK_GRANTED);
2627         t->t_writer = 0;
2628         t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2629         if (timedwait)
2630                 tim = lwp_timer_dequeue(&lwpt);
2631         if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2632                 error = EINTR;
2633         else if (imm_timeout || (timedwait && tim == -1))
2634                 error = ETIME;
2635         lwp->lwp_asleep = 0;
2636         lwp->lwp_sysabort = 0;
2637         setallwatch();
2638 
2639         /*
2640          * If we were granted the lock we don't care about EINTR or ETIME.
2641          */
2642         if (acquired)
2643                 error = 0;
2644 
2645         if (t->t_mstate == LMS_USER_LOCK)
2646                 (void) new_mstate(t, LMS_SYSTEM);
2647 
2648         if (error)
2649                 return (set_errno(error));
2650         return (0);
2651 
2652 out_drop:
2653         /*
2654          * Make sure that the user level lock is dropped before returning
2655          * to the caller.
2656          */
2657         if (!mlocked) {
2658                 lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2659                 mlocked = 1;
2660         }
2661         set_owner_pid(mp, 0, 0);
2662         ulock_clear(&mp->mutex_lockw);
2663         fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2664         if (mwaiters != 0) {
2665                 /*
2666                  * See comment above on lock clearing and lwp_release()
2667                  * success/failure.
2668                  */
2669                 if (lwp_release(&mlwpchan, &mwaiters, 0))
2670                         suword8_noerr(&mp->mutex_waiters, mwaiters);
2671         }
2672         lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2673         mlocked = 0;
2674 
2675 out_nodrop:
2676         no_fault();
2677         if (mwatched)
2678                 watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2679         if (watched)
2680                 watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2681         if (t->t_mstate == LMS_USER_LOCK)
2682                 (void) new_mstate(t, LMS_SYSTEM);
2683         if (error)
2684                 return (set_errno(error));
2685         return (0);
2686 }
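
     /*
      * A compact, illustrative restatement of the wakeup handling at the
      * end of lwp_rwlock_lock() above; no new behavior is implied:
      *
      *   granted (TRW_LOCK_GRANTED was set)     error = 0 (always wins)
      *   signal, sysabort or forced return      error = EINTR
      *   immediate or expired timeout           error = ETIME
      *   none of the above                      error = EAGAIN
      */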
2687 
2688 /*
2689  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2690  * we never drop the lock.
2691  */
2692 static int
2693 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2694 {
2695         kthread_t *t = curthread;
2696         proc_t *p = ttoproc(t);
2697         lwpchan_t lwpchan;
2698         volatile uint16_t type = 0;
2699         volatile int error = 0;
2700         volatile int locked = 0;
2701         volatile int watched = 0;
2702         label_t ljb;
2703         volatile int no_lwpchan = 1;
2704         uint32_t rwstate;
2705 
2706         /* We only check rw because the mutex is included in it. */
2707         if ((caddr_t)rw >= p->p_as->a_userlimit)
2708                 return (set_errno(EFAULT));
2709 
2710         if (on_fault(&ljb)) {
2711                 if (no_lwpchan) {
2712                         error = EFAULT;
2713                         goto out_nodrop;
2714                 }
2715                 if (locked) {
2716                         locked = 0;
2717                         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2718                 }
2719                 error = EFAULT;
2720                 goto out_nodrop;
2721         }
2722 
2723         /*
2724          * Force Copy-on-write if necessary and ensure that the
2725          * synchronization object resides in read/write memory.
2726          * Cause an EFAULT return now if this is not so.
2727          */
2728         fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2729         suword16_noerr(&rw->rwlock_type, type);
2730 
2731         /* We can only continue for simple USYNC_PROCESS locks. */
2732         if (type != USYNC_PROCESS) {
2733                 error = EINVAL;
2734                 goto out_nodrop;
2735         }
2736 
2737         /* Convert user level rwlock, "rw", to a unique lwpchan. */
2738         if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2739             &lwpchan, LWPCHAN_CVPOOL)) {
2740                 error = EFAULT;
2741                 goto out_nodrop;
2742         }
2743 
2744         no_lwpchan = 0;
2745         watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2746 
2747         lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2748         locked = 1;
2749 
2750         /*
2751          * We can resolve multiple readers (except the last reader) here.
2752          * For the last reader or a writer we need lwp_rwlock_release(),
2753          * to which we also delegate the task of copying the new rwstate
2754          * back to userland (see the comment there).
2755          */
2756         fuword32_noerr(&rw->rwlock_readers, &rwstate);
2757         if (rwstate & URW_WRITE_LOCKED)
2758                 lwp_rwlock_release(&lwpchan, rw);
2759         else if ((rwstate & URW_READERS_MASK) > 0) {
2760                 rwstate--;
2761                 if ((rwstate & URW_READERS_MASK) == 0)
2762                         lwp_rwlock_release(&lwpchan, rw);
2763                 else
2764                         suword32_noerr(&rw->rwlock_readers, rwstate);
2765         }
2766 
2767         lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2768         locked = 0;
2769         error = 0;
2770 
2771 out_nodrop:
2772         no_fault();
2773         if (watched)
2774                 watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2775         if (error)
2776                 return (set_errno(error));
2777         return (0);
2778 }
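
     /*
      * For reference, the unlock path above relies on the rwstate encoding
      * assumed here from sys/synch32.h: URW_READERS_MASK holds the count of
      * read-lock holders, URW_WRITE_LOCKED marks a held write lock, and the
      * waiter/writer-wanted state is presumed to live in the remaining high
      * bits.  The resulting decision, illustrative only:
      *
      *   write locked                   lwp_rwlock_release() rewrites rwstate
      *   more than one reader           decrement and store the count here
      *   exactly one (last) reader      lwp_rwlock_release() rewrites rwstate
      */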
2779 
2780 int
2781 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2782 {
2783         switch (subcode) {
2784         case 0:
2785                 return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2786         case 1:
2787                 return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2788         case 2:
2789                 return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2790         case 3:
2791                 return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2792         case 4:
2793                 return (lwp_rwlock_unlock(rwlp));
2794         }
2795         return (set_errno(EINVAL));
2796 }
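
     /*
      * A hedged sketch of the subcode mapping above as a hypothetical
      * userland wrapper might use it; the wrapper names are assumptions,
      * not taken from libc:
      *
      *   rd_lock(rw, tsp)     lwp_rwlock_sys(0, rw, tsp)    READ_LOCK
      *   wr_lock(rw, tsp)     lwp_rwlock_sys(1, rw, tsp)    WRITE_LOCK
      *   rd_trylock(rw)       lwp_rwlock_sys(2, rw, NULL)   READ_LOCK_TRY
      *   wr_trylock(rw)       lwp_rwlock_sys(3, rw, NULL)   WRITE_LOCK_TRY
      *   unlock(rw)           lwp_rwlock_sys(4, rw, NULL)
      *
      * Judging from lwp_rwlock_lock() above, a NULL tsp on the blocking
      * subcodes means an untimed (indefinite) wait.
      */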
2797 
2798 /*
2799  * Return the owner of the user-level s-object.
2800  * Since we can't really do this, return NULL.
2801  */
2802 /* ARGSUSED */
2803 static kthread_t *
2804 lwpsobj_owner(caddr_t sobj)
2805 {
2806         return ((kthread_t *)NULL);
2807 }
2808 
2809 /*
2810  * Wake up a thread asleep on a user-level synchronization
2811  * object.
2812  */
2813 static void
2814 lwp_unsleep(kthread_t *t)
2815 {
2816         ASSERT(THREAD_LOCK_HELD(t));
2817         if (t->t_wchan0 != NULL) {
2818                 sleepq_head_t *sqh;
2819                 sleepq_t *sqp = t->t_sleepq;
2820 
2821                 if (sqp != NULL) {
2822                         sqh = lwpsqhash(&t->t_lwpchan);
2823                         ASSERT(&sqh->sq_queue == sqp);
2824                         sleepq_unsleep(t);
2825                         disp_lock_exit_high(&sqh->sq_lock);
2826                         CL_SETRUN(t);
2827                         return;
2828                 }
2829         }
2830         panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2831 }
2832 
2833 /*
2834  * Change the priority of a thread asleep on a user-level
2835  * synchronization object. To maintain proper priority order,
2836  * we:
2837  *      o dequeue the thread.
2838  *      o change its priority.
2839  *      o re-enqueue the thread.
2840  * Assumption: the thread is locked on entry.
2841  */
2842 static void
2843 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2844 {
2845         ASSERT(THREAD_LOCK_HELD(t));
2846         if (t->t_wchan0 != NULL) {
2847                 sleepq_t   *sqp = t->t_sleepq;
2848 
2849                 sleepq_dequeue(t);
2850                 *t_prip = pri;
2851                 sleepq_insert(sqp, t);
2852         } else
2853                 panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2854 }
2855 
2856 /*
2857  * Clean up a left-over process-shared robust mutex
2858  */
2859 static void
2860 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2861 {
2862         uint16_t flag;
2863         uchar_t waiters;
2864         label_t ljb;
2865         pid_t owner_pid;
2866         lwp_mutex_t *lp;
2867         volatile int locked = 0;
2868         volatile int watched = 0;
2869         volatile struct upimutex *upimutex = NULL;
2870         volatile int upilocked = 0;
2871 
2872         if ((ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
2873             != (USYNC_PROCESS | LOCK_ROBUST))
2874                 return;
2875 
2876         lp = (lwp_mutex_t *)ent->lwpchan_addr;
2877         watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2878         if (on_fault(&ljb)) {
2879                 if (locked)
2880                         lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2881                 if (upilocked)
2882                         upimutex_unlock((upimutex_t *)upimutex, 0);
2883                 goto out;
2884         }
2885 
2886         fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2887 
2888         if (UPIMUTEX(ent->lwpchan_type)) {
2889                 lwpchan_t lwpchan = ent->lwpchan_lwpchan;
2890                 upib_t *upibp = &UPI_CHAIN(lwpchan);
2891 
2892                 if (owner_pid != curproc->p_pid)
2893                         goto out;
2894                 mutex_enter(&upibp->upib_lock);
2895                 upimutex = upi_get(upibp, &lwpchan);
2896                 if (upimutex == NULL || upimutex->upi_owner != curthread) {
2897                         mutex_exit(&upibp->upib_lock);
2898                         goto out;
2899                 }
2900                 mutex_exit(&upibp->upib_lock);
2901                 upilocked = 1;
2902                 flag = lwp_clear_mutex(lp, lockflg);
2903                 suword8_noerr(&lp->mutex_lockw, 0);
2904                 upimutex_unlock((upimutex_t *)upimutex, flag);
2905         } else {
2906                 lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2907                 locked = 1;
2908                 /*
2909                  * Clear the spinners count because one of our
2910                  * threads could have been spinning for this lock
2911                  * at user level when the process was suddenly killed.
2912                  * There is no harm in this since user-level libc code
2913                  * will adapt to the sudden change in the spinner count.
2914                  */
2915                 suword8_noerr(&lp->mutex_spinners, 0);
2916                 if (owner_pid != curproc->p_pid) {
2917                         /*
2918                          * We are not the owner.  There may or may not be one.
2919                          * If there are waiters, we wake up one or all of them.
2920                          * It doesn't hurt to wake them up in error since
2921                          * they will just retry the lock and go to sleep
2922                          * again if necessary.
2923                          */
2924                         fuword8_noerr(&lp->mutex_waiters, &waiters);
2925                         if (waiters != 0) {     /* there are waiters */
2926                                 fuword16_noerr(&lp->mutex_flag, &flag);
2927                                 if (flag & LOCK_NOTRECOVERABLE) {
2928                                         lwp_release_all(&ent->lwpchan_lwpchan);
2929                                         suword8_noerr(&lp->mutex_waiters, 0);
2930                                 } else if (lwp_release(&ent->lwpchan_lwpchan,
2931                                     &waiters, 0)) {
2932                                         suword8_noerr(&lp->mutex_waiters,
2933                                             waiters);
2934                                 }
2935                         }
2936                 } else {
2937                         /*
2938                          * We are the owner.  Release it.
2939                          */
2940                         (void) lwp_clear_mutex(lp, lockflg);
2941                         ulock_clear(&lp->mutex_lockw);
2942                         fuword8_noerr(&lp->mutex_waiters, &waiters);
2943                         if (waiters &&
2944                             lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2945                                 suword8_noerr(&lp->mutex_waiters, waiters);
2946                 }
2947                 lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2948         }
2949 out:
2950         no_fault();
2951         if (watched)
2952                 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2953 }
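
     /*
      * For context, a minimal userland sketch (standard POSIX robust-mutex
      * API, not code from this file) of what a surviving process sees once
      * this cleanup has flagged a dead owner's lock; repair_shared_state()
      * is a caller-defined placeholder:
      *
      *   int err = pthread_mutex_lock(&m);
      *   if (err == EOWNERDEAD) {
      *           repair_shared_state();
      *           (void) pthread_mutex_consistent(&m);
      *   } else if (err == ENOTRECOVERABLE) {
      *           reinitialize the mutex and its shared data;
      *   }
      *
      * If the new owner unlocks without calling pthread_mutex_consistent(),
      * the mutex is marked LOCK_NOTRECOVERABLE (see lwp_mutex_unlock()).
      */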
2954 
2955 /*
2956  * Register a process-shared robust mutex in the lwpchan cache.
2957  */
2958 int
2959 lwp_mutex_register(lwp_mutex_t *lp, caddr_t uaddr)
2960 {
2961         int error = 0;
2962         volatile int watched;
2963         label_t ljb;
2964         uint8_t type;
2965         lwpchan_t lwpchan;
2966 
2967         if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2968                 return (set_errno(EFAULT));
2969 
2970         watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2971 
2972         if (on_fault(&ljb)) {
2973                 error = EFAULT;
2974         } else {
2975                 /*
2976                  * Force Copy-on-write if necessary and ensure that the
2977                  * synchronization object resides in read/write memory.
2978                  * Cause an EFAULT return now if this is not so.
2979                  */
2980                 fuword8_noerr(&lp->mutex_type, &type);
2981                 suword8_noerr(&lp->mutex_type, type);
2982                 if ((type & (USYNC_PROCESS|LOCK_ROBUST))
2983                     != (USYNC_PROCESS|LOCK_ROBUST)) {
2984                         error = EINVAL;
2985                 } else if (!lwpchan_get_mapping(curproc->p_as, (caddr_t)lp,
2986                     uaddr, type, &lwpchan, LWPCHAN_MPPOOL)) {
2987                         error = EFAULT;
2988                 }
2989         }
2990         no_fault();
2991         if (watched)
2992                 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2993         if (error)
2994                 return (set_errno(error));
2995         return (0);
2996 }
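
     /*
      * For context, a hedged userland sketch of the kind of mutex this
      * registration serves; the POSIX calls below are standard, but exactly
      * when libc issues the registration is outside the scope of this file:
      *
      *   pthread_mutexattr_t a;
      *   (void) pthread_mutexattr_init(&a);
      *   (void) pthread_mutexattr_setpshared(&a, PTHREAD_PROCESS_SHARED);
      *   (void) pthread_mutexattr_setrobust(&a, PTHREAD_MUTEX_ROBUST);
      *   (void) pthread_mutex_init(&m, &a);     m resides in shared memory
      */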
2997 
2998 /*
2999  * There is a user-level robust lock registration in libc.
3000  * Mark it as invalid by storing -1 into the location of the pointer.
3001  */
3002 static void
3003 lwp_mutex_unregister(void *uaddr)
3004 {
3005         if (get_udatamodel() == DATAMODEL_NATIVE) {
3006                 (void) sulword(uaddr, (ulong_t)-1);
3007 #ifdef _SYSCALL32_IMPL
3008         } else {
3009                 (void) suword32(uaddr, (uint32_t)-1);
3010 #endif
3011         }
3012 }
3013 
3014 int
3015 lwp_mutex_trylock(lwp_mutex_t *lp, uintptr_t owner)
3016 {
3017         kthread_t *t = curthread;
3018         proc_t *p = ttoproc(t);
3019         int error = 0;
3020         volatile int locked = 0;
3021         volatile int watched = 0;
3022         label_t ljb;
3023         volatile uint8_t type = 0;
3024         uint16_t flag;
3025         lwpchan_t lwpchan;
3026 
3027         if ((caddr_t)lp >= p->p_as->a_userlimit)
3028                 return (set_errno(EFAULT));
3029 
3030         (void) new_mstate(t, LMS_USER_LOCK);
3031 
3032         if (on_fault(&ljb)) {
3033                 if (locked)
3034                         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3035                 error = EFAULT;
3036                 goto out;
3037         }
3038         /*
3039          * Force Copy-on-write if necessary and ensure that the
3040          * synchronization object resides in read/write memory.
3041          * Cause an EFAULT return now if this is not so.
3042          */
3043         fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3044         suword8_noerr(&lp->mutex_type, type);
3045         if (UPIMUTEX(type)) {
3046                 no_fault();
3047                 error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
3048                 if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED)
3049                         set_owner_pid(lp, owner,
3050                             (type & USYNC_PROCESS)? p->p_pid : 0);
3051                 if (error)
3052                         return (set_errno(error));
3053                 return (0);
3054         }
3055         if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3056             &lwpchan, LWPCHAN_MPPOOL)) {
3057                 error = EFAULT;
3058                 goto out;
3059         }
3060         lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3061         locked = 1;
3062         if (type & LOCK_ROBUST) {
3063                 fuword16_noerr(&lp->mutex_flag, &flag);
3064                 if (flag & LOCK_NOTRECOVERABLE) {
3065                         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3066                         error = ENOTRECOVERABLE;
3067                         goto out;
3068                 }
3069         }
3070 
3071         watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3072 
3073         if (!ulock_try(&lp->mutex_lockw))
3074                 error = EBUSY;
3075         else {
3076                 set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
3077                 if (type & LOCK_ROBUST) {
3078                         fuword16_noerr(&lp->mutex_flag, &flag);
3079                         if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3080                                 if (flag & LOCK_OWNERDEAD)
3081                                         error = EOWNERDEAD;
3082                                 else if (type & USYNC_PROCESS_ROBUST)
3083                                         error = ELOCKUNMAPPED;
3084                                 else
3085                                         error = EOWNERDEAD;
3086                         }
3087                 }
3088         }
3089         locked = 0;
3090         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3091 out:
3092 
3093         if (t->t_mstate == LMS_USER_LOCK)
3094                 (void) new_mstate(t, LMS_SYSTEM);
3095 
3096         no_fault();
3097         if (watched)
3098                 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3099         if (error)
3100                 return (set_errno(error));
3101         return (0);
3102 }
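
     /*
      * Summarizing the trylock outcomes above (derived from this function,
      * no new behavior): 0 on success; EBUSY if the lock word is already
      * held; ENOTRECOVERABLE if a robust mutex was marked unrecoverable;
      * EOWNERDEAD (or ELOCKUNMAPPED for USYNC_PROCESS_ROBUST) if the lock
      * was acquired but its previous owner died or its mapping went away;
      * EFAULT for inaccessible or out-of-range addresses.
      */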
3103 
3104 /*
3105  * Unlock the mutex and wake up any LWPs blocked trying to acquire it.
3106  * A woken LWP resumes and retries the lock acquisition.
3107  */
3108 int
3109 lwp_mutex_unlock(lwp_mutex_t *lp)
3110 {
3111         proc_t *p = ttoproc(curthread);
3112         lwpchan_t lwpchan;
3113         uchar_t waiters;
3114         volatile int locked = 0;
3115         volatile int watched = 0;
3116         volatile uint8_t type = 0;
3117         label_t ljb;
3118         uint16_t flag;
3119         int error = 0;
3120 
3121         if ((caddr_t)lp >= p->p_as->a_userlimit)
3122                 return (set_errno(EFAULT));
3123 
3124         if (on_fault(&ljb)) {
3125                 if (locked)
3126                         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3127                 error = EFAULT;
3128                 goto out;
3129         }
3130 
3131         /*
3132          * Force Copy-on-write if necessary and ensure that the
3133          * synchronization object resides in read/write memory.
3134          * Cause an EFAULT return now if this is not so.
3135          */
3136         fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3137         suword8_noerr(&lp->mutex_type, type);
3138 
3139         if (UPIMUTEX(type)) {
3140                 no_fault();
3141                 error = lwp_upimutex_unlock(lp, type);
3142                 if (error)
3143                         return (set_errno(error));
3144                 return (0);
3145         }
3146 
3147         watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3148 
3149         if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3150             &lwpchan, LWPCHAN_MPPOOL)) {
3151                 error = EFAULT;
3152                 goto out;
3153         }
3154         lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3155         locked = 1;
3156         if (type & LOCK_ROBUST) {
3157                 fuword16_noerr(&lp->mutex_flag, &flag);
3158                 if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3159                         flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3160                         flag |= LOCK_NOTRECOVERABLE;
3161                         suword16_noerr(&lp->mutex_flag, flag);
3162                 }
3163         }
3164         set_owner_pid(lp, 0, 0);
3165         ulock_clear(&lp->mutex_lockw);
3166         /*
3167          * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3168          * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3169          * may fail.  If it fails, do not write into the waiter bit.
3170          * The call to lwp_release() might fail due to one of three reasons:
3171          *
3172          *      1. due to the thread which set the waiter bit not actually
3173          *         sleeping since it got the lock on the re-try. The waiter
3174          *         bit will then be correctly updated by that thread. This
3175          *         window may be closed by reading the wait bit again here
3176          *         and not calling lwp_release() at all if it is zero.
3177          *      2. the thread which set the waiter bit and went to sleep
3178          *         was woken up by a signal. In that case the waiter
3179          *         re-evaluates the waiter bit on its EINTR return path.
3180          *      3. the waiter bit read by lwp_mutex_wakeup() was in
3181          *         memory that has been re-used after the lock was dropped.
3182          *         In this case, writing into the waiter bit would cause data
3183          *         corruption.
3184          */
3185         fuword8_noerr(&lp->mutex_waiters, &waiters);
3186         if (waiters) {
3187                 if ((type & LOCK_ROBUST) &&
3188                     (flag & LOCK_NOTRECOVERABLE)) {
3189                         lwp_release_all(&lwpchan);
3190                         suword8_noerr(&lp->mutex_waiters, 0);
3191                 } else if (lwp_release(&lwpchan, &waiters, 0)) {
3192                         suword8_noerr(&lp->mutex_waiters, waiters);
3193                 }
3194         }
3195 
3196         lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3197 out:
3198         no_fault();
3199         if (watched)
3200                 watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3201         if (error)
3202                 return (set_errno(error));
3203         return (0);
3204 }
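
     /*
      * For context, a hedged sketch of the user-level unlock protocol this
      * path complements; exactly when libc enters the kernel varies by
      * mutex type and is not part of this file:
      *
      *   clear mp->mutex_lockw with an atomic swap/store;
      *   if (mp->mutex_waiters != 0)
      *           enter the kernel (e.g. this path or lwp_mutex_wakeup())
      *           so a blocked LWP is made runnable and retries the lock;
      */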