1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Portions of this source code were derived from Berkeley 4.3 BSD
  31  * under license from the Regents of the University of California.
  32  */
  33 
  34 /*
  35  * VM - generic vnode mapping segment.
  36  *
  37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
  38  * mappings [lower routine overhead; more persistent cache] to random
  39  * vnode/offsets.  Note than the kernel may (and does) use seg_vn as well.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/buf.h>
  47 #include <sys/systm.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mman.h>
  50 #include <sys/errno.h>
  51 #include <sys/cred.h>
  52 #include <sys/kmem.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/debug.h>
  56 #include <sys/thread.h>
  57 #include <sys/dumphdr.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/lgrp.h>
  60 
  61 #include <vm/seg_kmem.h>
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_kpm.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/page.h>
  68 #include <vm/pvn.h>
  69 #include <vm/rm.h>
  70 
  71 /*
  72  * Private seg op routines.
  73  */
  74 static void     segmap_free(struct seg *seg);
  75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  76                         size_t len, enum fault_type type, enum seg_rw rw);
  77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
  78 static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
  79                         uint_t prot);
  80 static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
  81 static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
  82                         uint_t *protv);
  83 static u_offset_t       segmap_getoffset(struct seg *seg, caddr_t addr);
  84 static int      segmap_gettype(struct seg *seg, caddr_t addr);
  85 static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
  86 static void     segmap_dump(struct seg *seg);
  87 static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
  88                         struct page ***ppp, enum lock_type type,
  89                         enum seg_rw rw);
  90 static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  91 static int      segmap_capable(struct seg *seg, segcapability_t capability);
  92 
  93 /* segkpm support */
  94 static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
  95                         struct smap *, enum seg_rw);
  96 struct smap     *get_smap_kpm(caddr_t, page_t **);
  97 
  98 static struct seg_ops segmap_ops = {
  99         .free           = segmap_free,
 100         .fault          = segmap_fault,
 101         .faulta         = segmap_faulta,
 102         .checkprot      = segmap_checkprot,
 103         .kluster        = segmap_kluster,
 104         .getprot        = segmap_getprot,
 105         .getoffset      = segmap_getoffset,
 106         .gettype        = segmap_gettype,
 107         .getvp          = segmap_getvp,
 108         .dump           = segmap_dump,
 109         .pagelock       = segmap_pagelock,
 110         .getmemid       = segmap_getmemid,
 111         .capable        = segmap_capable,
 112 };
 113 
 114 /*
 115  * Private segmap routines.
 116  */
 117 static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
 118                         size_t len, enum seg_rw rw, struct smap *smp);
 119 static void     segmap_smapadd(struct smap *smp);
 120 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
 121                         u_offset_t off, int hashid);
 122 static void     segmap_hashout(struct smap *smp);
 123 
 124 
 125 /*
 126  * Statistics for segmap operations.
 127  *
 128  * No explicit locking to protect these stats.
 129  */
 130 struct segmapcnt segmapcnt = {
 131         { "fault",              KSTAT_DATA_ULONG },
 132         { "faulta",             KSTAT_DATA_ULONG },
 133         { "getmap",             KSTAT_DATA_ULONG },
 134         { "get_use",            KSTAT_DATA_ULONG },
 135         { "get_reclaim",        KSTAT_DATA_ULONG },
 136         { "get_reuse",          KSTAT_DATA_ULONG },
 137         { "get_unused",         KSTAT_DATA_ULONG },
 138         { "get_nofree",         KSTAT_DATA_ULONG },
 139         { "rel_async",          KSTAT_DATA_ULONG },
 140         { "rel_write",          KSTAT_DATA_ULONG },
 141         { "rel_free",           KSTAT_DATA_ULONG },
 142         { "rel_abort",          KSTAT_DATA_ULONG },
 143         { "rel_dontneed",       KSTAT_DATA_ULONG },
 144         { "release",            KSTAT_DATA_ULONG },
 145         { "pagecreate",         KSTAT_DATA_ULONG },
 146         { "free_notfree",       KSTAT_DATA_ULONG },
 147         { "free_dirty",         KSTAT_DATA_ULONG },
 148         { "free",               KSTAT_DATA_ULONG },
 149         { "stolen",             KSTAT_DATA_ULONG },
 150         { "get_nomtx",          KSTAT_DATA_ULONG }
 151 };
 152 
 153 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
 154 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
 155 
 156 /*
 157  * Return number of map pages in segment.
 158  */
 159 #define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)
 160 
 161 /*
 162  * Translate addr into smap number within segment.
 163  */
 164 #define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
 165 
 166 /*
 167  * Translate addr in seg into struct smap pointer.
 168  */
 169 #define GET_SMAP(seg, addr)     \
 170         &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
 171 
 172 /*
 173  * Bit in map (16 bit bitmap).
 174  */
 175 #define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
 176 
 177 static int smd_colormsk = 0;
 178 static int smd_ncolor = 0;
 179 static int smd_nfree = 0;
 180 static int smd_freemsk = 0;
 181 #ifdef DEBUG
 182 static int *colors_used;
 183 #endif
 184 static struct smap *smd_smap;
 185 static struct smaphash *smd_hash;
 186 #ifdef SEGMAP_HASHSTATS
 187 static unsigned int *smd_hash_len;
 188 #endif
 189 static struct smfree *smd_free;
 190 static ulong_t smd_hashmsk = 0;
 191 
 192 #define SEGMAP_MAXCOLOR         2
 193 #define SEGMAP_CACHE_PAD        64
 194 
 195 union segmap_cpu {
 196         struct {
 197                 uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
 198                 struct smap     *scpu_last_smap;
 199                 ulong_t         scpu_getmap;
 200                 ulong_t         scpu_release;
 201                 ulong_t         scpu_get_reclaim;
 202                 ulong_t         scpu_fault;
 203                 ulong_t         scpu_pagecreate;
 204                 ulong_t         scpu_get_reuse;
 205         } scpu;
 206         char    scpu_pad[SEGMAP_CACHE_PAD];
 207 };
 208 static union segmap_cpu *smd_cpu;
 209 
 210 /*
 211  * There are three locks in seg_map:
 212  *      - per freelist mutexes
 213  *      - per hashchain mutexes
 214  *      - per smap mutexes
 215  *
 216  * The lock ordering is to get the smap mutex to lock down the slot
 217  * first then the hash lock (for hash in/out (vp, off) list) or the
 218  * freelist lock to put the slot back on the free list.
 219  *
 220  * The hash search is done by only holding the hashchain lock, when a wanted
 221  * slot is found, we drop the hashchain lock then lock the slot so there
 222  * is no overlapping of hashchain and smap locks. After the slot is
 223  * locked, we verify again if the slot is still what we are looking
 224  * for.
 225  *
 226  * Allocation of a free slot is done by holding the freelist lock,
 227  * then locking the smap slot at the head of the freelist. This is
 228  * in reversed lock order so mutex_tryenter() is used.
 229  *
 230  * The smap lock protects all fields in smap structure except for
 231  * the link fields for hash/free lists which are protected by
 232  * hashchain and freelist locks.
 233  */
 234 
 235 #define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)
 236 
 237 #define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
 238 #define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)
 239 
 240 #define SMAPMTX(smp) (&smp->sm_mtx)
 241 
 242 #define SMAP_HASHFUNC(vp, off, hashid) \
 243         { \
 244         hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
 245                 ((off) >> MAXBSHIFT)) & smd_hashmsk); \
 246         }
 247 
 248 /*
 249  * The most frequently updated kstat counters are kept in the
 250  * per cpu array to avoid hot cache blocks. The update function
 251  * sums the cpu local counters to update the global counters.
 252  */
 253 
 254 /* ARGSUSED */
 255 int
 256 segmap_kstat_update(kstat_t *ksp, int rw)
 257 {
 258         int i;
 259         ulong_t getmap, release, get_reclaim;
 260         ulong_t fault, pagecreate, get_reuse;
 261 
 262         if (rw == KSTAT_WRITE)
 263                 return (EACCES);
 264         getmap = release = get_reclaim = (ulong_t)0;
 265         fault = pagecreate = get_reuse = (ulong_t)0;
 266         for (i = 0; i < max_ncpus; i++) {
 267                 getmap += smd_cpu[i].scpu.scpu_getmap;
 268                 release  += smd_cpu[i].scpu.scpu_release;
 269                 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
 270                 fault  += smd_cpu[i].scpu.scpu_fault;
 271                 pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
 272                 get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
 273         }
 274         segmapcnt.smp_getmap.value.ul = getmap;
 275         segmapcnt.smp_release.value.ul = release;
 276         segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
 277         segmapcnt.smp_fault.value.ul = fault;
 278         segmapcnt.smp_pagecreate.value.ul = pagecreate;
 279         segmapcnt.smp_get_reuse.value.ul = get_reuse;
 280         return (0);
 281 }
 282 
 283 int
 284 segmap_create(struct seg *seg, void *argsp)
 285 {
 286         struct segmap_data *smd;
 287         struct smap *smp;
 288         struct smfree *sm;
 289         struct segmap_crargs *a = (struct segmap_crargs *)argsp;
 290         struct smaphash *shashp;
 291         union segmap_cpu *scpu;
 292         long i, npages;
 293         size_t hashsz;
 294         uint_t nfreelist;
 295         extern void prefetch_smap_w(void *);
 296         extern int max_ncpus;
 297 
 298         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 299 
 300         if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
 301                 panic("segkmap not MAXBSIZE aligned");
 302                 /*NOTREACHED*/
 303         }
 304 
 305         smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
 306 
 307         seg->s_data = (void *)smd;
 308         seg->s_ops = &segmap_ops;
 309         smd->smd_prot = a->prot;
 310 
 311         /*
 312          * Scale the number of smap freelists to be
 313          * proportional to max_ncpus * number of virtual colors.
 314          * The caller can over-ride this scaling by providing
 315          * a non-zero a->nfreelist argument.
 316          */
 317         nfreelist = a->nfreelist;
 318         if (nfreelist == 0)
 319                 nfreelist = max_ncpus;
 320         else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
 321                 cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
 322                 "%d, using %d", nfreelist, max_ncpus);
 323                 nfreelist = max_ncpus;
 324         }
 325         if (!ISP2(nfreelist)) {
 326                 /* round up nfreelist to the next power of two. */
 327                 nfreelist = 1 << (highbit(nfreelist));
 328         }
 329 
 330         /*
 331          * Get the number of virtual colors - must be a power of 2.
 332          */
 333         if (a->shmsize)
 334                 smd_ncolor = a->shmsize >> MAXBSHIFT;
 335         else
 336                 smd_ncolor = 1;
 337         ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
 338         ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
 339         smd_colormsk = smd_ncolor - 1;
 340         smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
 341         smd_freemsk = smd_nfree - 1;
 342 
 343         /*
 344          * Allocate and initialize the freelist headers.
 345          * Note that sm_freeq[1] starts out as the release queue. This
 346          * is known when the smap structures are initialized below.
 347          */
 348         smd_free = smd->smd_free =
 349             kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
 350         for (i = 0; i < smd_nfree; i++) {
 351                 sm = &smd->smd_free[i];
 352                 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 353                 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 354                 sm->sm_allocq = &sm->sm_freeq[0];
 355                 sm->sm_releq = &sm->sm_freeq[1];
 356         }
 357 
 358         /*
 359          * Allocate and initialize the smap hash chain headers.
 360          * Compute hash size rounding down to the next power of two.
 361          */
 362         npages = MAP_PAGES(seg);
 363         smd->smd_npages = npages;
 364         hashsz = npages / SMAP_HASHAVELEN;
 365         hashsz = 1 << (highbit(hashsz)-1);
 366         smd_hashmsk = hashsz - 1;
 367         smd_hash = smd->smd_hash =
 368             kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
 369 #ifdef SEGMAP_HASHSTATS
 370         smd_hash_len =
 371             kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
 372 #endif
 373         for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
 374                 shashp->sh_hash_list = NULL;
 375                 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
 376         }
 377 
 378         /*
 379          * Allocate and initialize the smap structures.
 380          * Link all slots onto the appropriate freelist.
 381          * The smap array is large enough to affect boot time
 382          * on large systems, so use memory prefetching and only
 383          * go through the array 1 time. Inline a optimized version
 384          * of segmap_smapadd to add structures to freelists with
 385          * knowledge that no locks are needed here.
 386          */
 387         smd_smap = smd->smd_sm =
 388             kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
 389 
 390         for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
 391             smp >= smd->smd_sm; smp--) {
 392                 struct smap *smpfreelist;
 393                 struct sm_freeq *releq;
 394 
 395                 prefetch_smap_w((char *)smp);
 396 
 397                 smp->sm_vp = NULL;
 398                 smp->sm_hash = NULL;
 399                 smp->sm_off = 0;
 400                 smp->sm_bitmap = 0;
 401                 smp->sm_refcnt = 0;
 402                 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
 403                 smp->sm_free_ndx = SMP2SMF_NDX(smp);
 404 
 405                 sm = SMP2SMF(smp);
 406                 releq = sm->sm_releq;
 407 
 408                 smpfreelist = releq->smq_free;
 409                 if (smpfreelist == 0) {
 410                         releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 411                 } else {
 412                         smp->sm_next = smpfreelist;
 413                         smp->sm_prev = smpfreelist->sm_prev;
 414                         smpfreelist->sm_prev = smp;
 415                         smp->sm_prev->sm_next = smp;
 416                         releq->smq_free = smp->sm_next;
 417                 }
 418 
 419                 /*
 420                  * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
 421                  */
 422                 smp->sm_flags = 0;
 423 
 424 #ifdef  SEGKPM_SUPPORT
 425                 /*
 426                  * Due to the fragile prefetch loop no
 427                  * separate function is used here.
 428                  */
 429                 smp->sm_kpme_next = NULL;
 430                 smp->sm_kpme_prev = NULL;
 431                 smp->sm_kpme_page = NULL;
 432 #endif
 433         }
 434 
 435         /*
 436          * Allocate the per color indices that distribute allocation
 437          * requests over the free lists. Each cpu will have a private
 438          * rotor index to spread the allocations even across the available
 439          * smap freelists. Init the scpu_last_smap field to the first
 440          * smap element so there is no need to check for NULL.
 441          */
 442         smd_cpu =
 443             kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
 444         for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
 445                 int j;
 446                 for (j = 0; j < smd_ncolor; j++)
 447                         scpu->scpu.scpu_free_ndx[j] = j;
 448                 scpu->scpu.scpu_last_smap = smd_smap;
 449         }
 450 
 451         vpm_init();
 452 
 453 #ifdef DEBUG
 454         /*
 455          * Keep track of which colors are used more often.
 456          */
 457         colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
 458 #endif /* DEBUG */
 459 
 460         return (0);
 461 }
 462 
 463 static void
 464 segmap_free(seg)
 465         struct seg *seg;
 466 {
 467         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 468 }
 469 
 470 /*
 471  * Do a F_SOFTUNLOCK call over the range requested.
 472  * The range must have already been F_SOFTLOCK'ed.
 473  */
 474 static void
 475 segmap_unlock(
 476         struct hat *hat,
 477         struct seg *seg,
 478         caddr_t addr,
 479         size_t len,
 480         enum seg_rw rw,
 481         struct smap *smp)
 482 {
 483         page_t *pp;
 484         caddr_t adr;
 485         u_offset_t off;
 486         struct vnode *vp;
 487         kmutex_t *smtx;
 488 
 489         ASSERT(smp->sm_refcnt > 0);
 490 
 491 #ifdef lint
 492         seg = seg;
 493 #endif
 494 
 495         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 496 
 497                 /*
 498                  * We're called only from segmap_fault and this was a
 499                  * NOP in case of a kpm based smap, so dangerous things
 500                  * must have happened in the meantime. Pages are prefaulted
 501                  * and locked in segmap_getmapflt and they will not be
 502                  * unlocked until segmap_release.
 503                  */
 504                 panic("segmap_unlock: called with kpm addr %p", (void *)addr);
 505                 /*NOTREACHED*/
 506         }
 507 
 508         vp = smp->sm_vp;
 509         off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 510 
 511         hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
 512         for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
 513                 ushort_t bitmask;
 514 
 515                 /*
 516                  * Use page_find() instead of page_lookup() to
 517                  * find the page since we know that it has
 518                  * "shared" lock.
 519                  */
 520                 pp = page_find(vp, off);
 521                 if (pp == NULL) {
 522                         panic("segmap_unlock: page not found");
 523                         /*NOTREACHED*/
 524                 }
 525 
 526                 if (rw == S_WRITE) {
 527                         hat_setrefmod(pp);
 528                 } else if (rw != S_OTHER) {
 529                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 530                         "segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
 531                         hat_setref(pp);
 532                 }
 533 
 534                 /*
 535                  * Clear bitmap, if the bit corresponding to "off" is set,
 536                  * since the page and translation are being unlocked.
 537                  */
 538                 bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
 539 
 540                 /*
 541                  * Large Files: Following assertion is to verify
 542                  * the correctness of the cast to (int) above.
 543                  */
 544                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
 545                 smtx = SMAPMTX(smp);
 546                 mutex_enter(smtx);
 547                 if (smp->sm_bitmap & bitmask) {
 548                         smp->sm_bitmap &= ~bitmask;
 549                 }
 550                 mutex_exit(smtx);
 551 
 552                 page_unlock(pp);
 553         }
 554 }
 555 
 556 #define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */
 557 
 558 /*
 559  * This routine is called via a machine specific fault handling
 560  * routine.  It is also called by software routines wishing to
 561  * lock or unlock a range of addresses.
 562  *
 563  * Note that this routine expects a page-aligned "addr".
 564  */
 565 faultcode_t
 566 segmap_fault(
 567         struct hat *hat,
 568         struct seg *seg,
 569         caddr_t addr,
 570         size_t len,
 571         enum fault_type type,
 572         enum seg_rw rw)
 573 {
 574         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 575         struct smap *smp;
 576         page_t *pp, **ppp;
 577         struct vnode *vp;
 578         u_offset_t off;
 579         page_t *pl[MAXPPB + 1];
 580         uint_t prot;
 581         u_offset_t addroff;
 582         caddr_t adr;
 583         int err;
 584         u_offset_t sm_off;
 585         int hat_flag;
 586 
 587         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 588                 int newpage;
 589                 kmutex_t *smtx;
 590 
 591                 /*
 592                  * Pages are successfully prefaulted and locked in
 593                  * segmap_getmapflt and can't be unlocked until
 594                  * segmap_release. No hat mappings have to be locked
 595                  * and they also can't be unlocked as long as the
 596                  * caller owns an active kpm addr.
 597                  */
 598 #ifndef DEBUG
 599                 if (type != F_SOFTUNLOCK)
 600                         return (0);
 601 #endif
 602 
 603                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 604                         panic("segmap_fault: smap not found "
 605                             "for addr %p", (void *)addr);
 606                         /*NOTREACHED*/
 607                 }
 608 
 609                 smtx = SMAPMTX(smp);
 610 #ifdef  DEBUG
 611                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 612                 if (newpage) {
 613                         cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
 614                             (void *)smp);
 615                 }
 616 
 617                 if (type != F_SOFTUNLOCK) {
 618                         mutex_exit(smtx);
 619                         return (0);
 620                 }
 621 #endif
 622                 mutex_exit(smtx);
 623                 vp = smp->sm_vp;
 624                 sm_off = smp->sm_off;
 625 
 626                 if (vp == NULL)
 627                         return (FC_MAKE_ERR(EIO));
 628 
 629                 ASSERT(smp->sm_refcnt > 0);
 630 
 631                 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 632                 if (addroff + len > MAXBSIZE)
 633                         panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
 634                             (void *)(addr + len));
 635 
 636                 off = sm_off + addroff;
 637 
 638                 pp = page_find(vp, off);
 639 
 640                 if (pp == NULL)
 641                         panic("segmap_fault: softunlock page not found");
 642 
 643                 /*
 644                  * Set ref bit also here in case of S_OTHER to avoid the
 645                  * overhead of supporting other cases than F_SOFTUNLOCK
 646                  * with segkpm. We can do this because the underlying
 647                  * pages are locked anyway.
 648                  */
 649                 if (rw == S_WRITE) {
 650                         hat_setrefmod(pp);
 651                 } else {
 652                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 653                             "segmap_fault:pp %p vp %p offset %llx",
 654                             pp, vp, off);
 655                         hat_setref(pp);
 656                 }
 657 
 658                 return (0);
 659         }
 660 
 661         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 662         smp = GET_SMAP(seg, addr);
 663         vp = smp->sm_vp;
 664         sm_off = smp->sm_off;
 665 
 666         if (vp == NULL)
 667                 return (FC_MAKE_ERR(EIO));
 668 
 669         ASSERT(smp->sm_refcnt > 0);
 670 
 671         addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 672         if (addroff + len > MAXBSIZE) {
 673                 panic("segmap_fault: endaddr %p "
 674                     "exceeds MAXBSIZE chunk", (void *)(addr + len));
 675                 /*NOTREACHED*/
 676         }
 677         off = sm_off + addroff;
 678 
 679         /*
 680          * First handle the easy stuff
 681          */
 682         if (type == F_SOFTUNLOCK) {
 683                 segmap_unlock(hat, seg, addr, len, rw, smp);
 684                 return (0);
 685         }
 686 
 687         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 688             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 689         err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
 690             seg, addr, rw, CRED(), NULL);
 691 
 692         if (err)
 693                 return (FC_MAKE_ERR(err));
 694 
 695         prot &= smd->smd_prot;
 696 
 697         /*
 698          * Handle all pages returned in the pl[] array.
 699          * This loop is coded on the assumption that if
 700          * there was no error from the VOP_GETPAGE routine,
 701          * that the page list returned will contain all the
 702          * needed pages for the vp from [off..off + len].
 703          */
 704         ppp = pl;
 705         while ((pp = *ppp++) != NULL) {
 706                 u_offset_t poff;
 707                 ASSERT(pp->p_vnode == vp);
 708                 hat_flag = HAT_LOAD;
 709 
 710                 /*
 711                  * Verify that the pages returned are within the range
 712                  * of this segmap region.  Note that it is theoretically
 713                  * possible for pages outside this range to be returned,
 714                  * but it is not very likely.  If we cannot use the
 715                  * page here, just release it and go on to the next one.
 716                  */
 717                 if (pp->p_offset < sm_off ||
 718                     pp->p_offset >= sm_off + MAXBSIZE) {
 719                         (void) page_release(pp, 1);
 720                         continue;
 721                 }
 722 
 723                 ASSERT(hat == kas.a_hat);
 724                 poff = pp->p_offset;
 725                 adr = addr + (poff - off);
 726                 if (adr >= addr && adr < addr + len) {
 727                         hat_setref(pp);
 728                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 729                             "segmap_fault:pp %p vp %p offset %llx",
 730                             pp, vp, poff);
 731                         if (type == F_SOFTLOCK)
 732                                 hat_flag = HAT_LOAD_LOCK;
 733                 }
 734 
 735                 /*
 736                  * Deal with VMODSORT pages here. If we know this is a write
 737                  * do the setmod now and allow write protection.
 738                  * As long as it's modified or not S_OTHER, remove write
 739                  * protection. With S_OTHER it's up to the FS to deal with this.
 740                  */
 741                 if (IS_VMODSORT(vp)) {
 742                         if (rw == S_WRITE)
 743                                 hat_setmod(pp);
 744                         else if (rw != S_OTHER && !hat_ismod(pp))
 745                                 prot &= ~PROT_WRITE;
 746                 }
 747 
 748                 hat_memload(hat, adr, pp, prot, hat_flag);
 749                 if (hat_flag != HAT_LOAD_LOCK)
 750                         page_unlock(pp);
 751         }
 752         return (0);
 753 }
 754 
 755 /*
 756  * This routine is used to start I/O on pages asynchronously.
 757  */
 758 static faultcode_t
 759 segmap_faulta(struct seg *seg, caddr_t addr)
 760 {
 761         struct smap *smp;
 762         struct vnode *vp;
 763         u_offset_t off;
 764         int err;
 765 
 766         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 767                 int     newpage;
 768                 kmutex_t *smtx;
 769 
 770                 /*
 771                  * Pages are successfully prefaulted and locked in
 772                  * segmap_getmapflt and can't be unlocked until
 773                  * segmap_release. No hat mappings have to be locked
 774                  * and they also can't be unlocked as long as the
 775                  * caller owns an active kpm addr.
 776                  */
 777 #ifdef  DEBUG
 778                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 779                         panic("segmap_faulta: smap not found "
 780                             "for addr %p", (void *)addr);
 781                         /*NOTREACHED*/
 782                 }
 783 
 784                 smtx = SMAPMTX(smp);
 785                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 786                 mutex_exit(smtx);
 787                 if (newpage)
 788                         cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
 789                             (void *)smp);
 790 #endif
 791                 return (0);
 792         }
 793 
 794         segmapcnt.smp_faulta.value.ul++;
 795         smp = GET_SMAP(seg, addr);
 796 
 797         ASSERT(smp->sm_refcnt > 0);
 798 
 799         vp = smp->sm_vp;
 800         off = smp->sm_off;
 801 
 802         if (vp == NULL) {
 803                 cmn_err(CE_WARN, "segmap_faulta - no vp");
 804                 return (FC_MAKE_ERR(EIO));
 805         }
 806 
 807         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 808             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 809 
 810         err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
 811             & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
 812             seg, addr, S_READ, CRED(), NULL);
 813 
 814         if (err)
 815                 return (FC_MAKE_ERR(err));
 816         return (0);
 817 }
 818 
 819 /*ARGSUSED*/
 820 static int
 821 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
 822 {
 823         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 824 
 825         ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
 826 
 827         /*
 828          * Need not acquire the segment lock since
 829          * "smd_prot" is a read-only field.
 830          */
 831         return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
 832 }
 833 
 834 static int
 835 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 836 {
 837         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 838         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
 839 
 840         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 841 
 842         if (pgno != 0) {
 843                 do {
 844                         protv[--pgno] = smd->smd_prot;
 845                 } while (pgno != 0);
 846         }
 847         return (0);
 848 }
 849 
 850 static u_offset_t
 851 segmap_getoffset(struct seg *seg, caddr_t addr)
 852 {
 853         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 854 
 855         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 856 
 857         return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
 858 }
 859 
 860 /*ARGSUSED*/
 861 static int
 862 segmap_gettype(struct seg *seg, caddr_t addr)
 863 {
 864         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 865 
 866         return (MAP_SHARED);
 867 }
 868 
 869 /*ARGSUSED*/
 870 static int
 871 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 872 {
 873         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 874 
 875         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 876 
 877         /* XXX - This doesn't make any sense */
 878         *vpp = smd->smd_sm->sm_vp;
 879         return (0);
 880 }
 881 
 882 /*
 883  * Check to see if it makes sense to do kluster/read ahead to
 884  * addr + delta relative to the mapping at addr.  We assume here
 885  * that delta is a signed PAGESIZE'd multiple (which can be negative).
 886  *
 887  * For segmap we always "approve" of this action from our standpoint.
 888  */
 889 /*ARGSUSED*/
 890 static int
 891 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 892 {
 893         return (0);
 894 }
 895 
 896 /*
 897  * Special private segmap operations
 898  */
 899 
 900 /*
 901  * Add smap to the appropriate free list.
 902  */
 903 static void
 904 segmap_smapadd(struct smap *smp)
 905 {
 906         struct smfree *sm;
 907         struct smap *smpfreelist;
 908         struct sm_freeq *releq;
 909 
 910         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 911 
 912         if (smp->sm_refcnt != 0) {
 913                 panic("segmap_smapadd");
 914                 /*NOTREACHED*/
 915         }
 916 
 917         sm = &smd_free[smp->sm_free_ndx];
 918         /*
 919          * Add to the tail of the release queue
 920          * Note that sm_releq and sm_allocq could toggle
 921          * before we get the lock. This does not affect
 922          * correctness as the 2 queues are only maintained
 923          * to reduce lock pressure.
 924          */
 925         releq = sm->sm_releq;
 926         if (releq == &sm->sm_freeq[0])
 927                 smp->sm_flags |= SM_QNDX_ZERO;
 928         else
 929                 smp->sm_flags &= ~SM_QNDX_ZERO;
 930         mutex_enter(&releq->smq_mtx);
 931         smpfreelist = releq->smq_free;
 932         if (smpfreelist == 0) {
 933                 int want;
 934 
 935                 releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 936                 /*
 937                  * Both queue mutexes held to set sm_want;
 938                  * snapshot the value before dropping releq mutex.
 939                  * If sm_want appears after the releq mutex is dropped,
 940                  * then the smap just freed is already gone.
 941                  */
 942                 want = sm->sm_want;
 943                 mutex_exit(&releq->smq_mtx);
 944                 /*
 945                  * See if there was a waiter before dropping the releq mutex
 946                  * then recheck after obtaining sm_freeq[0] mutex as
 947                  * the another thread may have already signaled.
 948                  */
 949                 if (want) {
 950                         mutex_enter(&sm->sm_freeq[0].smq_mtx);
 951                         if (sm->sm_want)
 952                                 cv_signal(&sm->sm_free_cv);
 953                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
 954                 }
 955         } else {
 956                 smp->sm_next = smpfreelist;
 957                 smp->sm_prev = smpfreelist->sm_prev;
 958                 smpfreelist->sm_prev = smp;
 959                 smp->sm_prev->sm_next = smp;
 960                 mutex_exit(&releq->smq_mtx);
 961         }
 962 }
 963 
 964 
 965 static struct smap *
 966 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
 967 {
 968         struct smap **hpp;
 969         struct smap *tmp;
 970         kmutex_t *hmtx;
 971 
 972         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 973         ASSERT(smp->sm_vp == NULL);
 974         ASSERT(smp->sm_hash == NULL);
 975         ASSERT(smp->sm_prev == NULL);
 976         ASSERT(smp->sm_next == NULL);
 977         ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
 978 
 979         hmtx = SHASHMTX(hashid);
 980 
 981         mutex_enter(hmtx);
 982         /*
 983          * First we need to verify that no one has created a smp
 984          * with (vp,off) as its tag before we us.
 985          */
 986         for (tmp = smd_hash[hashid].sh_hash_list;
 987             tmp != NULL; tmp = tmp->sm_hash)
 988                 if (tmp->sm_vp == vp && tmp->sm_off == off)
 989                         break;
 990 
 991         if (tmp == NULL) {
 992                 /*
 993                  * No one created one yet.
 994                  *
 995                  * Funniness here - we don't increment the ref count on the
 996                  * vnode * even though we have another pointer to it here.
 997                  * The reason for this is that we don't want the fact that
 998                  * a seg_map entry somewhere refers to a vnode to prevent the
 999                  * vnode * itself from going away.  This is because this
1000                  * reference to the vnode is a "soft one".  In the case where
1001                  * a mapping is being used by a rdwr [or directory routine?]
1002                  * there already has to be a non-zero ref count on the vnode.
1003                  * In the case where the vp has been freed and the the smap
1004                  * structure is on the free list, there are no pages in memory
1005                  * that can refer to the vnode.  Thus even if we reuse the same
1006                  * vnode/smap structure for a vnode which has the same
1007                  * address but represents a different object, we are ok.
1008                  */
1009                 smp->sm_vp = vp;
1010                 smp->sm_off = off;
1011 
1012                 hpp = &smd_hash[hashid].sh_hash_list;
1013                 smp->sm_hash = *hpp;
1014                 *hpp = smp;
1015 #ifdef SEGMAP_HASHSTATS
1016                 smd_hash_len[hashid]++;
1017 #endif
1018         }
1019         mutex_exit(hmtx);
1020 
1021         return (tmp);
1022 }
1023 
1024 static void
1025 segmap_hashout(struct smap *smp)
1026 {
1027         struct smap **hpp, *hp;
1028         struct vnode *vp;
1029         kmutex_t *mtx;
1030         int hashid;
1031         u_offset_t off;
1032 
1033         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1034 
1035         vp = smp->sm_vp;
1036         off = smp->sm_off;
1037 
1038         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1039         mtx = SHASHMTX(hashid);
1040         mutex_enter(mtx);
1041 
1042         hpp = &smd_hash[hashid].sh_hash_list;
1043         for (;;) {
1044                 hp = *hpp;
1045                 if (hp == NULL) {
1046                         panic("segmap_hashout");
1047                         /*NOTREACHED*/
1048                 }
1049                 if (hp == smp)
1050                         break;
1051                 hpp = &hp->sm_hash;
1052         }
1053 
1054         *hpp = smp->sm_hash;
1055         smp->sm_hash = NULL;
1056 #ifdef SEGMAP_HASHSTATS
1057         smd_hash_len[hashid]--;
1058 #endif
1059         mutex_exit(mtx);
1060 
1061         smp->sm_vp = NULL;
1062         smp->sm_off = (u_offset_t)0;
1063 
1064 }
1065 
1066 /*
1067  * Attempt to free unmodified, unmapped, and non locked segmap
1068  * pages.
1069  */
1070 void
1071 segmap_pagefree(struct vnode *vp, u_offset_t off)
1072 {
1073         u_offset_t pgoff;
1074         page_t  *pp;
1075 
1076         for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1077 
1078                 if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1079                         continue;
1080 
1081                 switch (page_release(pp, 1)) {
1082                 case PGREL_NOTREL:
1083                         segmapcnt.smp_free_notfree.value.ul++;
1084                         break;
1085                 case PGREL_MOD:
1086                         segmapcnt.smp_free_dirty.value.ul++;
1087                         break;
1088                 case PGREL_CLEAN:
1089                         segmapcnt.smp_free.value.ul++;
1090                         break;
1091                 }
1092         }
1093 }
1094 
1095 /*
1096  * Locks held on entry: smap lock
1097  * Locks held on exit : smap lock.
1098  */
1099 
1100 static void
1101 grab_smp(struct smap *smp, page_t *pp)
1102 {
1103         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1104         ASSERT(smp->sm_refcnt == 0);
1105 
1106         if (smp->sm_vp != (struct vnode *)NULL) {
1107                 struct vnode    *vp = smp->sm_vp;
1108                 u_offset_t      off = smp->sm_off;
1109                 /*
1110                  * Destroy old vnode association and
1111                  * unload any hardware translations to
1112                  * the old object.
1113                  */
1114                 smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1115                 segmap_hashout(smp);
1116 
1117                 /*
1118                  * This node is off freelist and hashlist,
1119                  * so there is no reason to drop/reacquire sm_mtx
1120                  * across calls to hat_unload.
1121                  */
1122                 if (segmap_kpm) {
1123                         caddr_t vaddr;
1124                         int hat_unload_needed = 0;
1125 
1126                         /*
1127                          * unload kpm mapping
1128                          */
1129                         if (pp != NULL) {
1130                                 vaddr = hat_kpm_page2va(pp, 1);
1131                                 hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1132                                 page_unlock(pp);
1133                         }
1134 
1135                         /*
1136                          * Check if we have (also) the rare case of a
1137                          * non kpm mapping.
1138                          */
1139                         if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1140                                 hat_unload_needed = 1;
1141                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1142                         }
1143 
1144                         if (hat_unload_needed) {
1145                                 hat_unload(kas.a_hat, segkmap->s_base +
1146                                     ((smp - smd_smap) * MAXBSIZE),
1147                                     MAXBSIZE, HAT_UNLOAD);
1148                         }
1149 
1150                 } else {
1151                         ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1152                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1153                         hat_unload(kas.a_hat, segkmap->s_base +
1154                             ((smp - smd_smap) * MAXBSIZE),
1155                             MAXBSIZE, HAT_UNLOAD);
1156                 }
1157                 segmap_pagefree(vp, off);
1158         }
1159 }
1160 
1161 static struct smap *
1162 get_free_smp(int free_ndx)
1163 {
1164         struct smfree *sm;
1165         kmutex_t *smtx;
1166         struct smap *smp, *first;
1167         struct sm_freeq *allocq, *releq;
1168         struct kpme *kpme;
1169         page_t *pp = NULL;
1170         int end_ndx, page_locked = 0;
1171 
1172         end_ndx = free_ndx;
1173         sm = &smd_free[free_ndx];
1174 
1175 retry_queue:
1176         allocq = sm->sm_allocq;
1177         mutex_enter(&allocq->smq_mtx);
1178 
1179         if ((smp = allocq->smq_free) == NULL) {
1180 
1181 skip_queue:
1182                 /*
1183                  * The alloc list is empty or this queue is being skipped;
1184                  * first see if the allocq toggled.
1185                  */
1186                 if (sm->sm_allocq != allocq) {
1187                         /* queue changed */
1188                         mutex_exit(&allocq->smq_mtx);
1189                         goto retry_queue;
1190                 }
1191                 releq = sm->sm_releq;
1192                 if (!mutex_tryenter(&releq->smq_mtx)) {
1193                         /* cannot get releq; a free smp may be there now */
1194                         mutex_exit(&allocq->smq_mtx);
1195 
1196                         /*
1197                          * This loop could spin forever if this thread has
1198                          * higher priority than the thread that is holding
1199                          * releq->smq_mtx. In order to force the other thread
1200                          * to run, we'll lock/unlock the mutex which is safe
1201                          * since we just unlocked the allocq mutex.
1202                          */
1203                         mutex_enter(&releq->smq_mtx);
1204                         mutex_exit(&releq->smq_mtx);
1205                         goto retry_queue;
1206                 }
1207                 if (releq->smq_free == NULL) {
1208                         /*
1209                          * This freelist is empty.
1210                          * This should not happen unless clients
1211                          * are failing to release the segmap
1212                          * window after accessing the data.
1213                          * Before resorting to sleeping, try
1214                          * the next list of the same color.
1215                          */
1216                         free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1217                         if (free_ndx != end_ndx) {
1218                                 mutex_exit(&releq->smq_mtx);
1219                                 mutex_exit(&allocq->smq_mtx);
1220                                 sm = &smd_free[free_ndx];
1221                                 goto retry_queue;
1222                         }
1223                         /*
1224                          * Tried all freelists of the same color once,
1225                          * wait on this list and hope something gets freed.
1226                          */
1227                         segmapcnt.smp_get_nofree.value.ul++;
1228                         sm->sm_want++;
1229                         mutex_exit(&sm->sm_freeq[1].smq_mtx);
1230                         cv_wait(&sm->sm_free_cv,
1231                             &sm->sm_freeq[0].smq_mtx);
1232                         sm->sm_want--;
1233                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
1234                         sm = &smd_free[free_ndx];
1235                         goto retry_queue;
1236                 } else {
1237                         /*
1238                          * Something on the rele queue; flip the alloc
1239                          * and rele queues and retry.
1240                          */
1241                         sm->sm_allocq = releq;
1242                         sm->sm_releq = allocq;
1243                         mutex_exit(&allocq->smq_mtx);
1244                         mutex_exit(&releq->smq_mtx);
1245                         if (page_locked) {
1246                                 delay(hz >> 2);
1247                                 page_locked = 0;
1248                         }
1249                         goto retry_queue;
1250                 }
1251         } else {
1252                 /*
1253                  * Fastpath the case we get the smap mutex
1254                  * on the first try.
1255                  */
1256                 first = smp;
1257 next_smap:
1258                 smtx = SMAPMTX(smp);
1259                 if (!mutex_tryenter(smtx)) {
1260                         /*
1261                          * Another thread is trying to reclaim this slot.
1262                          * Skip to the next queue or smap.
1263                          */
1264                         if ((smp = smp->sm_next) == first) {
1265                                 goto skip_queue;
1266                         } else {
1267                                 goto next_smap;
1268                         }
1269                 } else {
1270                         /*
1271                          * if kpme exists, get shared lock on the page
1272                          */
1273                         if (segmap_kpm && smp->sm_vp != NULL) {
1274 
1275                                 kpme = GET_KPME(smp);
1276                                 pp = kpme->kpe_page;
1277 
1278                                 if (pp != NULL) {
1279                                         if (!page_trylock(pp, SE_SHARED)) {
1280                                                 smp = smp->sm_next;
1281                                                 mutex_exit(smtx);
1282                                                 page_locked = 1;
1283 
1284                                                 pp = NULL;
1285 
1286                                                 if (smp == first) {
1287                                                         goto skip_queue;
1288                                                 } else {
1289                                                         goto next_smap;
1290                                                 }
1291                                         } else {
1292                                                 if (kpme->kpe_page == NULL) {
1293                                                         page_unlock(pp);
1294                                                         pp = NULL;
1295                                                 }
1296                                         }
1297                                 }
1298                         }
1299 
1300                         /*
1301                          * At this point, we've selected smp.  Remove smp
1302                          * from its freelist.  If smp is the first one in
1303                          * the freelist, update the head of the freelist.
1304                          */
1305                         if (first == smp) {
1306                                 ASSERT(first == allocq->smq_free);
1307                                 allocq->smq_free = smp->sm_next;
1308                         }
1309 
1310                         /*
1311                          * if the head of the freelist still points to smp,
1312                          * then there are no more free smaps in that list.
1313                          */
1314                         if (allocq->smq_free == smp)
1315                                 /*
1316                                  * Took the last one
1317                                  */
1318                                 allocq->smq_free = NULL;
1319                         else {
1320                                 smp->sm_prev->sm_next = smp->sm_next;
1321                                 smp->sm_next->sm_prev = smp->sm_prev;
1322                         }
1323                         mutex_exit(&allocq->smq_mtx);
1324                         smp->sm_prev = smp->sm_next = NULL;
1325 
1326                         /*
1327                          * if pp != NULL, pp must have been locked;
1328                          * grab_smp() unlocks pp.
1329                          */
1330                         ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1331                         grab_smp(smp, pp);
1332                         /* return smp locked. */
1333                         ASSERT(SMAPMTX(smp) == smtx);
1334                         ASSERT(MUTEX_HELD(smtx));
1335                         return (smp);
1336                 }
1337         }
1338 }
1339 
1340 /*
1341  * Special public segmap operations
1342  */
1343 
1344 /*
1345  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1346  * If softlock is TRUE, then set things up so that it looks like a call
1347  * to segmap_fault with F_SOFTLOCK.
1348  *
1349  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1350  *
1351  * All fields in the generic segment (struct seg) are considered to be
1352  * read-only for "segmap" even though the kernel address space (kas) may
1353  * not be locked, hence no lock is needed to access them.
1354  */
1355 int
1356 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1357 {
1358         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1359         page_t *pp;
1360         u_offset_t off;
1361         struct smap *smp;
1362         struct vnode *vp;
1363         caddr_t eaddr;
1364         int newpage = 0;
1365         uint_t prot;
1366         kmutex_t *smtx;
1367         int hat_flag;
1368 
1369         ASSERT(seg->s_as == &kas);
1370 
1371         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1372                 /*
1373                  * Pages are successfully prefaulted and locked in
1374                  * segmap_getmapflt and can't be unlocked until
1375                  * segmap_release. The SM_KPM_NEWPAGE flag is set
1376                  * in segmap_pagecreate_kpm when new pages are created.
1377                  * and it is returned as "newpage" indication here.
1378                  */
1379                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1380                         panic("segmap_pagecreate: smap not found "
1381                             "for addr %p", (void *)addr);
1382                         /*NOTREACHED*/
1383                 }
1384 
1385                 smtx = SMAPMTX(smp);
1386                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1387                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1388                 mutex_exit(smtx);
1389 
1390                 return (newpage);
1391         }
1392 
1393         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1394 
1395         eaddr = addr + len;
1396         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1397 
1398         smp = GET_SMAP(seg, addr);
1399 
1400         /*
1401          * We don't grab smp mutex here since we assume the smp
1402          * has a refcnt set already which prevents the slot from
1403          * changing its id.
1404          */
1405         ASSERT(smp->sm_refcnt > 0);
1406 
1407         vp = smp->sm_vp;
1408         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1409         prot = smd->smd_prot;
1410 
1411         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1412                 hat_flag = HAT_LOAD;
1413                 pp = page_lookup(vp, off, SE_SHARED);
1414                 if (pp == NULL) {
1415                         ushort_t bitindex;
1416 
1417                         if ((pp = page_create_va(vp, off,
1418                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1419                                 panic("segmap_pagecreate: page_create failed");
1420                                 /*NOTREACHED*/
1421                         }
1422                         newpage = 1;
1423                         page_io_unlock(pp);
1424 
1425                         /*
1426                          * Since pages created here do not contain valid
1427                          * data until the caller writes into them, the
1428                          * "exclusive" lock will not be dropped to prevent
1429                          * other users from accessing the page.  We also
1430                          * have to lock the translation to prevent a fault
1431                          * from occurring when the virtual address mapped by
1432                          * this page is written into.  This is necessary to
1433                          * avoid a deadlock since we haven't dropped the
1434                          * "exclusive" lock.
1435                          */
1436                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1437 
1438                         /*
1439                          * Large Files: The following assertion is to
1440                          * verify the cast above.
1441                          */
1442                         ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1443                         smtx = SMAPMTX(smp);
1444                         mutex_enter(smtx);
1445                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1446                         mutex_exit(smtx);
1447 
1448                         hat_flag = HAT_LOAD_LOCK;
1449                 } else if (softlock) {
1450                         hat_flag = HAT_LOAD_LOCK;
1451                 }
1452 
1453                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1454                         hat_setmod(pp);
1455 
1456                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1457 
1458                 if (hat_flag != HAT_LOAD_LOCK)
1459                         page_unlock(pp);
1460 
1461                 TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1462                     "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1463                     seg, addr, pp, vp, off);
1464         }
1465 
1466         return (newpage);
1467 }
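
     /*
      * Illustrative sketch (editor's note, not part of the driver): a file
      * system write path might drive segmap_pagecreate() roughly as shown
      * below.  The names "mapon", "n" and "uio" are hypothetical, and the
      * exact sequence varies between file systems.
      *
      *         mapon = off & (u_offset_t)MAXBOFFSET;
      *         base = segmap_getmapflt(segkmap, vp, off, n,
      *             SM_PAGECREATE, S_WRITE);
      *         newpage = segmap_pagecreate(segkmap, base + mapon, n, 0);
      *         error = uiomove(base + mapon, n, UIO_WRITE, uio);
      *         if (newpage)
      *                 segmap_pageunlock(segkmap, base + mapon, n, S_WRITE);
      *         error = segmap_release(segkmap, base,
      *             error ? SM_DESTROY : SM_WRITE);
      */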
1468 
1469 void
1470 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1471 {
1472         struct smap     *smp;
1473         ushort_t        bitmask;
1474         page_t          *pp;
1475         struct  vnode   *vp;
1476         u_offset_t      off;
1477         caddr_t         eaddr;
1478         kmutex_t        *smtx;
1479 
1480         ASSERT(seg->s_as == &kas);
1481 
1482         eaddr = addr + len;
1483         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1484 
1485         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1486                 /*
1487                  * Pages are successfully prefaulted and locked in
1488                  * segmap_getmapflt and can't be unlocked until
1489                  * segmap_release, so no pages or hat mappings have
1490                  * to be unlocked at this point.
1491                  */
1492 #ifdef DEBUG
1493                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1494                         panic("segmap_pageunlock: smap not found "
1495                             "for addr %p", (void *)addr);
1496                         /*NOTREACHED*/
1497                 }
1498 
1499                 ASSERT(smp->sm_refcnt > 0);
1500                 mutex_exit(SMAPMTX(smp));
1501 #endif
1502                 return;
1503         }
1504 
1505         smp = GET_SMAP(seg, addr);
1506         smtx = SMAPMTX(smp);
1507 
1508         ASSERT(smp->sm_refcnt > 0);
1509 
1510         vp = smp->sm_vp;
1511         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1512 
1513         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1514                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1515 
1516                 /*
1517                  * Large Files: The following assertion verifies
1518                  * the correctness of the cast to (int) above.
1519                  */
1520                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1521 
1522                 /*
1523                  * If the bit corresponding to "off" is set,
1524                  * clear this bit in the bitmap, unlock translations,
1525                  * and release the "exclusive" lock on the page.
1526                  */
1527                 if (smp->sm_bitmap & bitmask) {
1528                         mutex_enter(smtx);
1529                         smp->sm_bitmap &= ~bitmask;
1530                         mutex_exit(smtx);
1531 
1532                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1533 
1534                         /*
1535                          * Use page_find() instead of page_lookup() to
1536                          * find the page, since we know that it has
1537                          * the "exclusive" lock.
1538                          */
1539                         pp = page_find(vp, off);
1540                         if (pp == NULL) {
1541                                 panic("segmap_pageunlock: page not found");
1542                                 /*NOTREACHED*/
1543                         }
1544                         if (rw == S_WRITE) {
1545                                 hat_setrefmod(pp);
1546                         } else if (rw != S_OTHER) {
1547                                 hat_setref(pp);
1548                         }
1549 
1550                         page_unlock(pp);
1551                 }
1552         }
1553 }
1554 
1555 caddr_t
1556 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1557 {
1558         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1559 }
1560 
1561 /*
1562  * This is the magic virtual address that offset 0 of an ELF
1563  * file gets mapped to in user space. This is used to pick
1564  * the VAC color on the freelist.
1565  */
1566 #define ELF_OFFZERO_VA  (0x10000)
1567 /*
1568  * segmap_getmap allocates a MAXBSIZE-sized slot to map the vnode vp
1569  * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned.
1570  * The returned address is always MAXBSIZE aligned.
1571  *
1572  * If forcefault is nonzero and the MMU translations haven't yet been created,
1573  * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them.
1574  */
1575 caddr_t
1576 segmap_getmapflt(
1577         struct seg *seg,
1578         struct vnode *vp,
1579         u_offset_t off,
1580         size_t len,
1581         int forcefault,
1582         enum seg_rw rw)
1583 {
1584         struct smap *smp, *nsmp;
1585         extern struct vnode *common_specvp();
1586         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1587         u_offset_t baseoff;
1588         int newslot;
1589         caddr_t vaddr;
1590         int color, hashid;
1591         kmutex_t *hashmtx, *smapmtx;
1592         struct smfree *sm;
1593         page_t  *pp;
1594         struct kpme *kpme;
1595         uint_t  prot;
1596         caddr_t base;
1597         page_t  *pl[MAXPPB + 1];
1598         int     error;
1599         int     is_kpm = 1;
1600 
1601         ASSERT(seg->s_as == &kas);
1602         ASSERT(seg == segkmap);
1603 
1604         baseoff = off & (offset_t)MAXBMASK;
1605         if (off + len > baseoff + MAXBSIZE) {
1606                 panic("segmap_getmap bad len");
1607                 /*NOTREACHED*/
1608         }
1609 
1610         /*
1611          * If this is a block device we have to be sure to use the
1612          * "common" block device vnode for the mapping.
1613          */
1614         if (vp->v_type == VBLK)
1615                 vp = common_specvp(vp);
1616 
1617         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1618 
1619         if (segmap_kpm == 0 ||
1620             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1621                 is_kpm = 0;
1622         }
1623 
1624         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1625         hashmtx = SHASHMTX(hashid);
1626 
1627 retry_hash:
1628         mutex_enter(hashmtx);
1629         for (smp = smd_hash[hashid].sh_hash_list;
1630             smp != NULL; smp = smp->sm_hash)
1631                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1632                         break;
1633         mutex_exit(hashmtx);
1634 
1635 vrfy_smp:
1636         if (smp != NULL) {
1637 
1638                 ASSERT(vp->v_count != 0);
1639 
1640                 /*
1641                  * Get the smap lock and recheck its tag.  The hash lock
1642                  * can be dropped because the hash is based on (vp, off),
1643                  * and (vp, off) won't change while we hold the smap mutex.
1644                  */
1645                 smapmtx = SMAPMTX(smp);
1646                 mutex_enter(smapmtx);
1647                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1648                         mutex_exit(smapmtx);
1649                         goto retry_hash;
1650                 }
1651 
1652                 if (smp->sm_refcnt == 0) {
1653 
1654                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1655 
1656                         /*
1657                          * Could still be on the free list. However, this
1658                          * could also be an smp that is transitioning from
1659                          * the free list when we have too much contention
1660                          * for the smapmtx's. In this case, we have an
1661                          * unlocked smp that is not on the free list any
1662                          * longer, but still has a 0 refcnt.  The only way
1663                          * to be sure is to check the freelist pointers.
1664                          * Since we now have the smapmtx, we are guaranteed
1665                          * that the (vp, off) won't change, so we are safe
1666                          * to reclaim it.  get_free_smp() knows that this
1667                          * can happen, and it will check the refcnt.
1668                          */
1669 
1670                         if (smp->sm_next != NULL) {
1671                                 struct sm_freeq *freeq;
1672 
1673                                 ASSERT(smp->sm_prev != NULL);
1674                                 sm = &smd_free[smp->sm_free_ndx];
1675 
1676                                 if (smp->sm_flags & SM_QNDX_ZERO)
1677                                         freeq = &sm->sm_freeq[0];
1678                                 else
1679                                         freeq = &sm->sm_freeq[1];
1680 
1681                                 mutex_enter(&freeq->smq_mtx);
1682                                 if (freeq->smq_free != smp) {
1683                                         /*
1684                                          * fastpath normal case
1685                                          */
1686                                         smp->sm_prev->sm_next = smp->sm_next;
1687                                         smp->sm_next->sm_prev = smp->sm_prev;
1688                                 } else if (smp == smp->sm_next) {
1689                                         /*
1690                                          * Taking the last smap on freelist
1691                                          */
1692                                         freeq->smq_free = NULL;
1693                                 } else {
1694                                         /*
1695                                          * Reclaiming 1st smap on list
1696                                          */
1697                                         freeq->smq_free = smp->sm_next;
1698                                         smp->sm_prev->sm_next = smp->sm_next;
1699                                         smp->sm_next->sm_prev = smp->sm_prev;
1700                                 }
1701                                 mutex_exit(&freeq->smq_mtx);
1702                                 smp->sm_prev = smp->sm_next = NULL;
1703                         } else {
1704                                 ASSERT(smp->sm_prev == NULL);
1705                                 segmapcnt.smp_stolen.value.ul++;
1706                         }
1707 
1708                 } else {
1709                         segmapcnt.smp_get_use.value.ul++;
1710                 }
1711                 smp->sm_refcnt++;            /* another user */
1712 
1713                 /*
1714                  * We don't invoke segmap_fault via TLB miss, so we set ref
1715                  * and mod bits in advance. For S_OTHER we set them in
1716                  * segmap_fault F_SOFTUNLOCK.
1717                  */
1718                 if (is_kpm) {
1719                         if (rw == S_WRITE) {
1720                                 smp->sm_flags |= SM_WRITE_DATA;
1721                         } else if (rw == S_READ) {
1722                                 smp->sm_flags |= SM_READ_DATA;
1723                         }
1724                 }
1725                 mutex_exit(smapmtx);
1726 
1727                 newslot = 0;
1728         } else {
1729 
1730                 uint32_t free_ndx, *free_ndxp;
1731                 union segmap_cpu *scpu;
1732 
1733                 /*
1734                  * On a PAC machine or a machine with anti-alias
1735                  * hardware, smd_colormsk will be zero.
1736                  *
1737                  * On a VAC machine, pick the color by offset in the file
1738                  * so we won't get VAC conflicts on ELF files.
1739                  * On data files, color does not matter but we
1740                  * don't know what kind of file it is so we always
1741                  * pick color by offset. This causes color
1742                  * corresponding to file offset zero to be used more
1743                  * heavily.
1744                  */
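                     /*
                      * For example, assuming MAXBSHIFT is 13 (a MAXBSIZE
                      * of 8K) and smd_colormsk is 3, file offsets 0, 8K,
                      * 16K and 24K map to colors 0, 1, 2 and 3, and the
                      * cycle repeats every 32K.
                      */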
1745                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1746                 scpu = smd_cpu+CPU->cpu_seqid;
1747                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1748                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1749 #ifdef DEBUG
1750                 colors_used[free_ndx]++;
1751 #endif /* DEBUG */
1752 
1753                 /*
1754                  * Get a locked smp slot from the free list.
1755                  */
1756                 smp = get_free_smp(free_ndx);
1757                 smapmtx = SMAPMTX(smp);
1758 
1759                 ASSERT(smp->sm_vp == NULL);
1760 
1761                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1762                         /*
1763                          * Failed to hash in; one now exists for (vp, off).
1764                          * Return ours to the free list and use the existing one.
1765                          */
1766                         segmap_smapadd(smp);
1767                         mutex_exit(smapmtx);
1768 
1769                         smp = nsmp;
1770                         goto vrfy_smp;
1771                 }
1772                 smp->sm_refcnt++;            /* another user */
1773 
1774                 /*
1775                  * We don't invoke segmap_fault via TLB miss, so we set ref
1776                  * and mod bits in advance. For S_OTHER we set them in
1777                  * segmap_fault F_SOFTUNLOCK.
1778                  */
1779                 if (is_kpm) {
1780                         if (rw == S_WRITE) {
1781                                 smp->sm_flags |= SM_WRITE_DATA;
1782                         } else if (rw == S_READ) {
1783                                 smp->sm_flags |= SM_READ_DATA;
1784                         }
1785                 }
1786                 mutex_exit(smapmtx);
1787 
1788                 newslot = 1;
1789         }
1790 
1791         if (!is_kpm)
1792                 goto use_segmap_range;
1793 
1794         /*
1795          * Use segkpm
1796          */
1797         /* Lint directive required until 6746211 is fixed */
1798         /*CONSTCOND*/
1799         ASSERT(PAGESIZE == MAXBSIZE);
1800 
1801         /*
1802          * remember the last smp faulted on this cpu.
1803          */
1804         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1805 
1806         if (forcefault == SM_PAGECREATE) {
1807                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1808                 return (baseaddr);
1809         }
1810 
1811         if (newslot == 0 &&
1812             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1813 
1814                 /* fastpath */
1815                 switch (rw) {
1816                 case S_READ:
1817                 case S_WRITE:
1818                         if (page_trylock(pp, SE_SHARED)) {
1819                                 if (PP_ISFREE(pp) ||
1820                                     !(pp->p_vnode == vp &&
1821                                     pp->p_offset == baseoff)) {
1822                                         page_unlock(pp);
1823                                         pp = page_lookup(vp, baseoff,
1824                                             SE_SHARED);
1825                                 }
1826                         } else {
1827                                 pp = page_lookup(vp, baseoff, SE_SHARED);
1828                         }
1829 
1830                         if (pp == NULL) {
1831                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1832                                 break;
1833                         }
1834 
1835                         if (rw == S_WRITE &&
1836                             hat_page_getattr(pp, P_MOD | P_REF) !=
1837                             (P_MOD | P_REF)) {
1838                                 page_unlock(pp);
1839                                 break;
1840                         }
1841 
1842                         /*
1843                          * We hold the p_selock as reader, so grab_smp
1844                          * can't hit us; we have bumped the smap
1845                          * refcnt, and hat_pageunload needs the
1846                          * p_selock exclusive.
1847                          */
1848                         kpme = GET_KPME(smp);
1849                         if (kpme->kpe_page == pp) {
1850                                 baseaddr = hat_kpm_page2va(pp, 0);
1851                         } else if (kpme->kpe_page == NULL) {
1852                                 baseaddr = hat_kpm_mapin(pp, kpme);
1853                         } else {
1854                                 panic("segmap_getmapflt: stale "
1855                                     "kpme page, kpme %p", (void *)kpme);
1856                                 /*NOTREACHED*/
1857                         }
1858 
1859                         /*
1860                          * We don't invoke segmap_fault via TLB miss,
1861                          * so we set ref and mod bits in advance.
1862                          * For S_OTHER we set them in segmap_fault
1863                          * F_SOFTUNLOCK.
1864                          */
1865                         if (rw == S_READ && !hat_isref(pp))
1866                                 hat_setref(pp);
1867 
1868                         return (baseaddr);
1869                 default:
1870                         break;
1871                 }
1872         }
1873 
1874         base = segkpm_create_va(baseoff);
1875         error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1876             seg, base, rw, CRED(), NULL);
1877 
1878         pp = pl[0];
1879         if (error || pp == NULL) {
1880                 /*
1881                  * Use segmap address slot and let segmap_fault deal
1882                  * with the error cases. There is no error return
1883                  * possible here.
1884                  */
1885                 goto use_segmap_range;
1886         }
1887 
1888         ASSERT(pl[1] == NULL);
1889 
1890         /*
1891          * When prot is not returned with PROT_ALL, the returned pages
1892          * are not backed by fs blocks.  For most segmap users this is
1893          * no problem; they don't write to the pages in the same request
1894          * and therefore don't rely on a subsequent trap-driven
1895          * segmap_fault.  For SM_LOCKPROTO users it is safer to use
1896          * segkmap addresses so that protection faults can be handled
1897          * by segmap_fault.
1898          */
1899         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1900                 /*
1901                  * Use segmap address slot and let segmap_fault
1902                  * do the error return.
1903                  */
1904                 ASSERT(rw != S_WRITE);
1905                 ASSERT(PAGE_LOCKED(pp));
1906                 page_unlock(pp);
1907                 forcefault = 0;
1908                 goto use_segmap_range;
1909         }
1910 
1911         /*
1912          * We hold the p_selock as reader, so grab_smp can't hit us; we
1913          * have bumped the smap refcnt, and hat_pageunload needs the
1914          * p_selock exclusive.
1915          */
1916         kpme = GET_KPME(smp);
1917         if (kpme->kpe_page == pp) {
1918                 baseaddr = hat_kpm_page2va(pp, 0);
1919         } else if (kpme->kpe_page == NULL) {
1920                 baseaddr = hat_kpm_mapin(pp, kpme);
1921         } else {
1922                 panic("segmap_getmapflt: stale kpme page after "
1923                     "VOP_GETPAGE, kpme %p", (void *)kpme);
1924                 /*NOTREACHED*/
1925         }
1926 
1927         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1928 
1929         return (baseaddr);
1930 
1931 
1932 use_segmap_range:
1933         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1934         TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1935             "segmap_getmap:seg %p addr %p vp %p offset %llx",
1936             seg, baseaddr, vp, baseoff);
1937 
1938         /*
1939          * Prefault the translations
1940          */
1941         vaddr = baseaddr + (off - baseoff);
1942         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1943 
1944                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1945                     (uintptr_t)PAGEMASK);
1946 
1947                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1948                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1949                     F_INVAL, rw);
1950         }
1951 
1952         return (baseaddr);
1953 }
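
     /*
      * Illustrative sketch (editor's note, not part of the driver): a file
      * system read path might consume the slot returned here roughly as
      * shown below.  The names "mapon", "n", "resid" and "uio" are
      * hypothetical; SM_DONTNEED or other SM_* flags may be passed to
      * segmap_release() instead of 0 where appropriate.
      *
      *         mapon = off & (u_offset_t)MAXBOFFSET;
      *         n = MIN(MAXBSIZE - mapon, resid);
      *         base = segmap_getmapflt(segkmap, vp, off, n, 1, S_READ);
      *         error = uiomove(base + mapon, n, UIO_READ, uio);
      *         (void) segmap_release(segkmap, base, 0);
      */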
1954 
1955 int
1956 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1957 {
1958         struct smap     *smp;
1959         int             error;
1960         int             bflags = 0;
1961         struct vnode    *vp;
1962         u_offset_t      offset;
1963         kmutex_t        *smtx;
1964         int             is_kpm = 0;
1965         page_t          *pp;
1966 
1967         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1968 
1969                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1970                         panic("segmap_release: addr %p not "
1971                             "MAXBSIZE aligned", (void *)addr);
1972                         /*NOTREACHED*/
1973                 }
1974 
1975                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1976                         panic("segmap_release: smap not found "
1977                             "for addr %p", (void *)addr);
1978                         /*NOTREACHED*/
1979                 }
1980 
1981                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
1982                     "segmap_relmap:seg %p addr %p smp %p",
1983                     seg, addr, smp);
1984 
1985                 smtx = SMAPMTX(smp);
1986 
1987                 /*
1988                  * For compatibility reasons segmap_pagecreate_kpm sets this
1989                  * flag so that a following segmap_pagecreate can return it
1990                  * as the "newpage" flag.  If segmap_pagecreate is never
1991                  * called, we clear it now.
1992                  */
1993                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1994                 is_kpm = 1;
1995                 if (smp->sm_flags & SM_WRITE_DATA) {
1996                         hat_setrefmod(pp);
1997                 } else if (smp->sm_flags & SM_READ_DATA) {
1998                         hat_setref(pp);
1999                 }
2000         } else {
2001                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2002                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
2003                         panic("segmap_release: bad addr %p", (void *)addr);
2004                         /*NOTREACHED*/
2005                 }
2006                 smp = GET_SMAP(seg, addr);
2007 
2008                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2009                     "segmap_relmap:seg %p addr %p smp %p",
2010                     seg, addr, smp);
2011 
2012                 smtx = SMAPMTX(smp);
2013                 mutex_enter(smtx);
2014                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2015         }
2016 
2017         ASSERT(smp->sm_refcnt > 0);
2018 
2019         /*
2020          * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2021          * are set.
2022          */
2023         if ((flags & ~SM_DONTNEED) != 0) {
2024                 if (flags & SM_WRITE)
2025                         segmapcnt.smp_rel_write.value.ul++;
2026                 if (flags & SM_ASYNC) {
2027                         bflags |= B_ASYNC;
2028                         segmapcnt.smp_rel_async.value.ul++;
2029                 }
2030                 if (flags & SM_INVAL) {
2031                         bflags |= B_INVAL;
2032                         segmapcnt.smp_rel_abort.value.ul++;
2033                 }
2034                 if (flags & SM_DESTROY) {
2035                         bflags |= (B_INVAL|B_TRUNC);
2036                         segmapcnt.smp_rel_abort.value.ul++;
2037                 }
2038                 if (smp->sm_refcnt == 1) {
2039                         /*
2040                          * We only bother doing the FREE and DONTNEED flags
2041                          * if no one else is still referencing this mapping.
2042                          */
2043                         if (flags & SM_FREE) {
2044                                 bflags |= B_FREE;
2045                                 segmapcnt.smp_rel_free.value.ul++;
2046                         }
2047                         if (flags & SM_DONTNEED) {
2048                                 bflags |= B_DONTNEED;
2049                                 segmapcnt.smp_rel_dontneed.value.ul++;
2050                         }
2051                 }
2052         } else {
2053                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2054         }
2055 
2056         vp = smp->sm_vp;
2057         offset = smp->sm_off;
2058 
2059         if (--smp->sm_refcnt == 0) {
2060 
2061                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2062 
2063                 if (flags & (SM_INVAL|SM_DESTROY)) {
2064                         segmap_hashout(smp);    /* remove map info */
2065                         if (is_kpm) {
2066                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2067                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2068                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2069                                         hat_unload(kas.a_hat, segkmap->s_base +
2070                                             ((smp - smd_smap) * MAXBSIZE),
2071                                             MAXBSIZE, HAT_UNLOAD);
2072                                 }
2073 
2074                         } else {
2075                                 if (segmap_kpm)
2076                                         segkpm_mapout_validkpme(GET_KPME(smp));
2077 
2078                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2079                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2080                                     HAT_UNLOAD);
2081                         }
2082                 }
2083                 segmap_smapadd(smp);    /* add to free list */
2084         }
2085 
2086         mutex_exit(smtx);
2087 
2088         if (is_kpm)
2089                 page_unlock(pp);
2090         /*
2091          * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2092          * are set.
2093          */
2094         if ((flags & ~SM_DONTNEED) != 0) {
2095                 error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2096                     bflags, CRED(), NULL);
2097         } else {
2098                 error = 0;
2099         }
2100 
2101         return (error);
2102 }
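
     /*
      * For reference, segmap_release() maps its SM_* flags onto the B_*
      * flags passed to VOP_PUTPAGE() as follows (SM_FREE and SM_DONTNEED
      * are honored only when this was the last reference to the slot):
      *
      *         SM_WRITE        synchronous write, no additional B_* flag
      *         SM_ASYNC        B_ASYNC
      *         SM_INVAL        B_INVAL
      *         SM_DESTROY      B_INVAL | B_TRUNC
      *         SM_FREE         B_FREE
      *         SM_DONTNEED     B_DONTNEED
      *
      * With no flags set (or SM_DONTNEED alone), VOP_PUTPAGE() is not
      * called at all.
      */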
2103 
2104 /*
2105  * Dump the pages belonging to this segmap segment.
2106  */
2107 static void
2108 segmap_dump(struct seg *seg)
2109 {
2110         struct segmap_data *smd;
2111         struct smap *smp, *smp_end;
2112         page_t *pp;
2113         pfn_t pfn;
2114         u_offset_t off;
2115         caddr_t addr;
2116 
2117         smd = (struct segmap_data *)seg->s_data;
2118         addr = seg->s_base;
2119         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2120             smp < smp_end; smp++) {
2121 
2122                 if (smp->sm_refcnt) {
2123                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2124                                 int we_own_it = 0;
2125 
2126                                 /*
2127                                  * If page_lookup_nowait() fails, the page
2128                                  * either does not exist or is exclusively
2129                                  * locked, so fall back to page_exists()
2130                                  * to see whether it exists at all.
2131                                  */
2132                                 if ((pp = page_lookup_nowait(smp->sm_vp,
2133                                     smp->sm_off + off, SE_SHARED)))
2134                                         we_own_it = 1;
2135                                 else
2136                                         pp = page_exists(smp->sm_vp,
2137                                             smp->sm_off + off);
2138 
2139                                 if (pp) {
2140                                         pfn = page_pptonum(pp);
2141                                         dump_addpage(seg->s_as,
2142                                             addr + off, pfn);
2143                                         if (we_own_it)
2144                                                 page_unlock(pp);
2145                                 }
2146                                 dump_timeleft = dump_timeout;
2147                         }
2148                 }
2149                 addr += MAXBSIZE;
2150         }
2151 }
2152 
2153 /*ARGSUSED*/
2154 static int
2155 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2156     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2157 {
2158         return (ENOTSUP);
2159 }
2160 
2161 static int
2162 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2163 {
2164         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2165 
2166         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2167         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2168         return (0);
2169 }
2170 
2171 /*ARGSUSED*/
2172 static int
2173 segmap_capable(struct seg *seg, segcapability_t capability)
2174 {
2175         return (0);
2176 }
2177 
2178 
2179 #ifdef  SEGKPM_SUPPORT
2180 
2181 /*
2182  * segkpm support routines
2183  */
2184 
2185 static caddr_t
2186 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2187         struct smap *smp, enum seg_rw rw)
2188 {
2189         caddr_t base;
2190         page_t  *pp;
2191         int     newpage = 0;
2192         struct kpme     *kpme;
2193 
2194         ASSERT(smp->sm_refcnt > 0);
2195 
2196         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2197                 kmutex_t *smtx;
2198 
2199                 base = segkpm_create_va(off);
2200 
2201                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2202                     seg, base)) == NULL) {
2203                         panic("segmap_pagecreate_kpm: "
2204                             "page_create failed");
2205                         /*NOTREACHED*/
2206                 }
2207 
2208                 newpage = 1;
2209                 page_io_unlock(pp);
2210                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2211 
2212                 /*
2213                  * Mark the smap as holding a new page; the flag stays set
2214                  * until the following segmap_pagecreate or segmap_release.
2215                  */
2216                 smtx = SMAPMTX(smp);
2217                 mutex_enter(smtx);
2218                 smp->sm_flags |= SM_KPM_NEWPAGE;
2219                 mutex_exit(smtx);
2220         }
2221 
2222         kpme = GET_KPME(smp);
2223         if (!newpage && kpme->kpe_page == pp)
2224                 base = hat_kpm_page2va(pp, 0);
2225         else
2226                 base = hat_kpm_mapin(pp, kpme);
2227 
2228         /*
2229          * FS code may decide not to call segmap_pagecreate and we
2230          * don't invoke segmap_fault via TLB miss, so we have to set
2231          * ref and mod bits in advance.
2232          */
2233         if (rw == S_WRITE) {
2234                 hat_setrefmod(pp);
2235         } else {
2236                 ASSERT(rw == S_READ);
2237                 hat_setref(pp);
2238         }
2239 
2240         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2241 
2242         return (base);
2243 }
2244 
2245 /*
2246  * Find the smap structure corresponding to the
2247  * KPM addr and return it locked.
2248  */
2249 struct smap *
2250 get_smap_kpm(caddr_t addr, page_t **ppp)
2251 {
2252         struct smap     *smp;
2253         struct vnode    *vp;
2254         u_offset_t      offset;
2255         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2256         int             hashid;
2257         kmutex_t        *hashmtx;
2258         page_t          *pp;
2259         union segmap_cpu *scpu;
2260 
2261         pp = hat_kpm_vaddr2page(baseaddr);
2262 
2263         ASSERT(pp && !PP_ISFREE(pp));
2264         ASSERT(PAGE_LOCKED(pp));
2265         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2266 
2267         vp = pp->p_vnode;
2268         offset = pp->p_offset;
2269         ASSERT(vp != NULL);
2270 
2271         /*
2272          * Assume the last smap used on this cpu is the one needed.
2273          */
2274         scpu = smd_cpu+CPU->cpu_seqid;
2275         smp = scpu->scpu.scpu_last_smap;
2276         mutex_enter(&smp->sm_mtx);
2277         if (smp->sm_vp == vp && smp->sm_off == offset) {
2278                 ASSERT(smp->sm_refcnt > 0);
2279         } else {
2280                 /*
2281                  * The assumption was wrong; find the smap on the hash chain.
2282                  */
2283                 mutex_exit(&smp->sm_mtx);
2284                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2285                 hashmtx = SHASHMTX(hashid);
2286 
2287                 mutex_enter(hashmtx);
2288                 smp = smd_hash[hashid].sh_hash_list;
2289                 for (; smp != NULL; smp = smp->sm_hash) {
2290                         if (smp->sm_vp == vp && smp->sm_off == offset)
2291                                 break;
2292                 }
2293                 mutex_exit(hashmtx);
2294                 if (smp) {
2295                         mutex_enter(&smp->sm_mtx);
2296                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2297                 }
2298         }
2299 
2300         if (ppp)
2301                 *ppp = smp ? pp : NULL;
2302 
2303         return (smp);
2304 }
2305 
2306 #else   /* SEGKPM_SUPPORT */
2307 
2308 /* segkpm stubs */
2309 
2310 /*ARGSUSED*/
2311 static caddr_t
2312 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2313         struct smap *smp, enum seg_rw rw)
2314 {
2315         return (NULL);
2316 }
2317 
2318 /*ARGSUSED*/
2319 struct smap *
2320 get_smap_kpm(caddr_t addr, page_t **ppp)
2321 {
2322         return (NULL);
2323 }
2324 
2325 #endif  /* SEGKPM_SUPPORT */