1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Portions of this source code were derived from Berkeley 4.3 BSD
  31  * under license from the Regents of the University of California.
  32  */
  33 
  34 /*
  35  * VM - generic vnode mapping segment.
  36  *
  37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
  38  * mappings [lower routine overhead; more persistent cache] to random
  39  * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
  40  */
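     /*
      * Illustrative sketch only (this call sequence lives in a file
      * system read path, not in this file; the flag choices and error
      * handling below are assumptions, not a definitive interface):
      *
      *	caddr_t base;
      *	int error;
      *
      *	base = segmap_getmapflt(segkmap, vp, off, n, 1, S_READ);
      *	error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uio);
      *	(void) segmap_release(segkmap, base, error ? SM_INVAL : 0);
      */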
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/buf.h>
  47 #include <sys/systm.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mman.h>
  50 #include <sys/errno.h>
  51 #include <sys/cred.h>
  52 #include <sys/kmem.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/debug.h>
  56 #include <sys/thread.h>
  57 #include <sys/dumphdr.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/lgrp.h>
  60 
  61 #include <vm/seg_kmem.h>
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_kpm.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/page.h>
  68 #include <vm/pvn.h>
  69 #include <vm/rm.h>
  70 
  71 /*
  72  * Private seg op routines.
  73  */
  74 static void     segmap_free(struct seg *seg);
  75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  76                         size_t len, enum fault_type type, enum seg_rw rw);
  77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
  78 static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
  79                         uint_t prot);
  80 static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
  81 static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
  82                         uint_t *protv);
  83 static u_offset_t       segmap_getoffset(struct seg *seg, caddr_t addr);
  84 static int      segmap_gettype(struct seg *seg, caddr_t addr);
  85 static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
  86 static void     segmap_dump(struct seg *seg);
  87 static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
  88                         struct page ***ppp, enum lock_type type,
  89                         enum seg_rw rw);
  90 static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  91 
  92 /* segkpm support */
  93 static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
  94                         struct smap *, enum seg_rw);
  95 struct smap     *get_smap_kpm(caddr_t, page_t **);
  96 
  97 static struct seg_ops segmap_ops = {
  98         .free           = segmap_free,
  99         .fault          = segmap_fault,
 100         .faulta         = segmap_faulta,
 101         .checkprot      = segmap_checkprot,
 102         .kluster        = segmap_kluster,
 103         .getprot        = segmap_getprot,
 104         .getoffset      = segmap_getoffset,
 105         .gettype        = segmap_gettype,
 106         .getvp          = segmap_getvp,
 107         .dump           = segmap_dump,
 108         .pagelock       = segmap_pagelock,
 109         .getmemid       = segmap_getmemid,
 110 };
 111 
 112 /*
 113  * Private segmap routines.
 114  */
 115 static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
 116                         size_t len, enum seg_rw rw, struct smap *smp);
 117 static void     segmap_smapadd(struct smap *smp);
 118 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
 119                         u_offset_t off, int hashid);
 120 static void     segmap_hashout(struct smap *smp);
 121 
 122 
 123 /*
 124  * Statistics for segmap operations.
 125  *
 126  * No explicit locking to protect these stats.
 127  */
 128 struct segmapcnt segmapcnt = {
 129         { "fault",              KSTAT_DATA_ULONG },
 130         { "faulta",             KSTAT_DATA_ULONG },
 131         { "getmap",             KSTAT_DATA_ULONG },
 132         { "get_use",            KSTAT_DATA_ULONG },
 133         { "get_reclaim",        KSTAT_DATA_ULONG },
 134         { "get_reuse",          KSTAT_DATA_ULONG },
 135         { "get_unused",         KSTAT_DATA_ULONG },
 136         { "get_nofree",         KSTAT_DATA_ULONG },
 137         { "rel_async",          KSTAT_DATA_ULONG },
 138         { "rel_write",          KSTAT_DATA_ULONG },
 139         { "rel_free",           KSTAT_DATA_ULONG },
 140         { "rel_abort",          KSTAT_DATA_ULONG },
 141         { "rel_dontneed",       KSTAT_DATA_ULONG },
 142         { "release",            KSTAT_DATA_ULONG },
 143         { "pagecreate",         KSTAT_DATA_ULONG },
 144         { "free_notfree",       KSTAT_DATA_ULONG },
 145         { "free_dirty",         KSTAT_DATA_ULONG },
 146         { "free",               KSTAT_DATA_ULONG },
 147         { "stolen",             KSTAT_DATA_ULONG },
 148         { "get_nomtx",          KSTAT_DATA_ULONG }
 149 };
 150 
 151 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
 152 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
 153 
 154 /*
 155  * Return number of map pages in segment.
 156  */
 157 #define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)
 158 
 159 /*
 160  * Translate addr into smap number within segment.
 161  */
 162 #define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
 163 
 164 /*
 165  * Translate addr in seg into struct smap pointer.
 166  */
 167 #define GET_SMAP(seg, addr)     \
 168         &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
 169 
 170 /*
 171  * Bit in map (16 bit bitmap).
 172  */
 173 #define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
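     /*
      * Worked example (illustrative, assuming MAXBSIZE is 8192 and
      * PAGESIZE is 4096): for addr = seg->s_base + 3 * MAXBSIZE + PAGESIZE,
      * MAP_PAGE(seg, addr) is 3, GET_SMAP(seg, addr) yields the address
      * of the fourth slot (smd_sm[3]), and the page at that address is
      * tracked by bit SMAP_BIT_MASK(1) == 0x2 in smd_sm[3].sm_bitmap.
      */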
 174 
 175 static int smd_colormsk = 0;
 176 static int smd_ncolor = 0;
 177 static int smd_nfree = 0;
 178 static int smd_freemsk = 0;
 179 #ifdef DEBUG
 180 static int *colors_used;
 181 #endif
 182 static struct smap *smd_smap;
 183 static struct smaphash *smd_hash;
 184 #ifdef SEGMAP_HASHSTATS
 185 static unsigned int *smd_hash_len;
 186 #endif
 187 static struct smfree *smd_free;
 188 static ulong_t smd_hashmsk = 0;
 189 
 190 #define SEGMAP_MAXCOLOR         2
 191 #define SEGMAP_CACHE_PAD        64
 192 
 193 union segmap_cpu {
 194         struct {
 195                 uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
 196                 struct smap     *scpu_last_smap;
 197                 ulong_t         scpu_getmap;
 198                 ulong_t         scpu_release;
 199                 ulong_t         scpu_get_reclaim;
 200                 ulong_t         scpu_fault;
 201                 ulong_t         scpu_pagecreate;
 202                 ulong_t         scpu_get_reuse;
 203         } scpu;
 204         char    scpu_pad[SEGMAP_CACHE_PAD];
 205 };
 206 static union segmap_cpu *smd_cpu;
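     /*
      * smd_cpu[] holds the per-CPU freelist rotors and hot counters.
      * Each entry is padded to SEGMAP_CACHE_PAD bytes so that counters
      * belonging to different CPUs do not share a cache line (this
      * assumes SEGMAP_CACHE_PAD is at least the coherence granule).
      */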
 207 
 208 /*
 209  * There are three locks in seg_map:
 210  *      - per freelist mutexes
 211  *      - per hashchain mutexes
 212  *      - per smap mutexes
 213  *
 214  * The lock ordering is to get the smap mutex to lock down the slot
 215  * first then the hash lock (for hash in/out (vp, off) list) or the
 216  * freelist lock to put the slot back on the free list.
 217  *
 218  * The hash search is done by only holding the hashchain lock, when a wanted
 219  * slot is found, we drop the hashchain lock then lock the slot so there
 220  * is no overlapping of hashchain and smap locks. After the slot is
 221  * locked, we verify again if the slot is still what we are looking
 222  * for.
 223  *
 224  * Allocation of a free slot is done by holding the freelist lock,
 225  * then locking the smap slot at the head of the freelist. This is
 226  * in reversed lock order so mutex_tryenter() is used.
 227  *
 228  * The smap lock protects all fields in smap structure except for
 229  * the link fields for hash/free lists which are protected by
 230  * hashchain and freelist locks.
 231  */
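     /*
      * Illustrative sketch of the ordering described above (assumed,
      * simplified; the real paths are segmap_hashin()/segmap_smapadd()
      * and get_free_smp() below):
      *
      *	mutex_enter(SMAPMTX(smp));		slot lock first
      *	...
      *	mutex_enter(SHASHMTX(hashid));		then the hash chain lock
      *	...
      *	mutex_exit(SHASHMTX(hashid));
      *	mutex_exit(SMAPMTX(smp));
      *
      * Freelist allocation must take the queue lock before the smap
      * lock, i.e. the reverse order, which is why get_free_smp() uses
      * mutex_tryenter() on the smap mutex and skips busy slots.
      */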
 232 
 233 #define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)
 234 
 235 #define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
 236 #define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)
 237 
 238 #define SMAPMTX(smp) (&smp->sm_mtx)
 239 
 240 #define SMAP_HASHFUNC(vp, off, hashid) \
 241         { \
 242         hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
 243                 ((off) >> MAXBSHIFT)) & smd_hashmsk); \
 244         }
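     /*
      * The hash folds two right-shifted copies of the vnode pointer with
      * the MAXBSIZE block number of the offset and masks the sum with
      * smd_hashmsk.  Worked example (illustrative values only): with
      * vp == 0x1000, off == 0x20000, MAXBSHIFT == 13 and
      * smd_hashmsk == 0xff, hashid is ((0x40 + 0x200 + 0x10) & 0xff),
      * i.e. 0x50.
      */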
 245 
 246 /*
 247  * The most frequently updated kstat counters are kept in the
 248  * per cpu array to avoid hot cache blocks. The update function
 249  * sums the cpu local counters to update the global counters.
 250  */
 251 
 252 /* ARGSUSED */
 253 int
 254 segmap_kstat_update(kstat_t *ksp, int rw)
 255 {
 256         int i;
 257         ulong_t getmap, release, get_reclaim;
 258         ulong_t fault, pagecreate, get_reuse;
 259 
 260         if (rw == KSTAT_WRITE)
 261                 return (EACCES);
 262         getmap = release = get_reclaim = (ulong_t)0;
 263         fault = pagecreate = get_reuse = (ulong_t)0;
 264         for (i = 0; i < max_ncpus; i++) {
 265                 getmap += smd_cpu[i].scpu.scpu_getmap;
 266                 release  += smd_cpu[i].scpu.scpu_release;
 267                 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
 268                 fault  += smd_cpu[i].scpu.scpu_fault;
 269                 pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
 270                 get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
 271         }
 272         segmapcnt.smp_getmap.value.ul = getmap;
 273         segmapcnt.smp_release.value.ul = release;
 274         segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
 275         segmapcnt.smp_fault.value.ul = fault;
 276         segmapcnt.smp_pagecreate.value.ul = pagecreate;
 277         segmapcnt.smp_get_reuse.value.ul = get_reuse;
 278         return (0);
 279 }
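     /*
      * Illustrative sketch (the kstat registration is done elsewhere in
      * the kernel; the module/class names and flags below are
      * assumptions): the counters are exported as a named kstat whose
      * update routine is the function above, roughly:
      *
      *	kstat_t *ksp = kstat_create("unix", 0, "segmap", "vm",
      *	    KSTAT_TYPE_NAMED, segmapcnt_ndata, KSTAT_FLAG_VIRTUAL);
      *	if (ksp != NULL) {
      *		ksp->ks_data = (void *)segmapcnt_ptr;
      *		ksp->ks_update = segmap_kstat_update;
      *		kstat_install(ksp);
      *	}
      */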
 280 
 281 int
 282 segmap_create(struct seg *seg, void *argsp)
 283 {
 284         struct segmap_data *smd;
 285         struct smap *smp;
 286         struct smfree *sm;
 287         struct segmap_crargs *a = (struct segmap_crargs *)argsp;
 288         struct smaphash *shashp;
 289         union segmap_cpu *scpu;
 290         long i, npages;
 291         size_t hashsz;
 292         uint_t nfreelist;
 293         extern void prefetch_smap_w(void *);
 294         extern int max_ncpus;
 295 
 296         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 297 
 298         if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
 299                 panic("segkmap not MAXBSIZE aligned");
 300                 /*NOTREACHED*/
 301         }
 302 
 303         smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
 304 
 305         seg->s_data = (void *)smd;
 306         seg->s_ops = &segmap_ops;
 307         smd->smd_prot = a->prot;
 308 
 309         /*
 310          * Scale the number of smap freelists to be
 311          * proportional to max_ncpus * number of virtual colors.
 312          * The caller can over-ride this scaling by providing
 313          * a non-zero a->nfreelist argument.
 314          */
 315         nfreelist = a->nfreelist;
 316         if (nfreelist == 0)
 317                 nfreelist = max_ncpus;
 318         else if (nfreelist > 4 * max_ncpus) {
 319                 cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
 320                 "%d, using %d", nfreelist, max_ncpus);
 321                 nfreelist = max_ncpus;
 322         }
 323         if (!ISP2(nfreelist)) {
 324                 /* round up nfreelist to the next power of two. */
 325                 nfreelist = 1 << (highbit(nfreelist));
 326         }
 327 
 328         /*
 329          * Get the number of virtual colors - must be a power of 2.
 330          */
 331         if (a->shmsize)
 332                 smd_ncolor = a->shmsize >> MAXBSHIFT;
 333         else
 334                 smd_ncolor = 1;
 335         ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
 336         ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
 337         smd_colormsk = smd_ncolor - 1;
 338         smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
 339         smd_freemsk = smd_nfree - 1;
 340 
 341         /*
 342          * Allocate and initialize the freelist headers.
 343          * Note that sm_freeq[1] starts out as the release queue. This
 344          * is known when the smap structures are initialized below.
 345          */
 346         smd_free = smd->smd_free =
 347             kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
 348         for (i = 0; i < smd_nfree; i++) {
 349                 sm = &smd->smd_free[i];
 350                 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 351                 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 352                 sm->sm_allocq = &sm->sm_freeq[0];
 353                 sm->sm_releq = &sm->sm_freeq[1];
 354         }
 355 
 356         /*
 357          * Allocate and initialize the smap hash chain headers.
 358          * Compute hash size rounding down to the next power of two.
 359          */
 360         npages = MAP_PAGES(seg);
 361         smd->smd_npages = npages;
 362         hashsz = npages / SMAP_HASHAVELEN;
 363         hashsz = 1 << (highbit(hashsz)-1);
 364         smd_hashmsk = hashsz - 1;
 365         smd_hash = smd->smd_hash =
 366             kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
 367 #ifdef SEGMAP_HASHSTATS
 368         smd_hash_len =
 369             kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
 370 #endif
 371         for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
 372                 shashp->sh_hash_list = NULL;
 373                 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
 374         }
 375 
 376         /*
 377          * Allocate and initialize the smap structures.
 378          * Link all slots onto the appropriate freelist.
 379          * The smap array is large enough to affect boot time
 380          * on large systems, so use memory prefetching and only
 381          * go through the array 1 time. Inline an optimized version
 382          * of segmap_smapadd to add structures to freelists with
 383          * knowledge that no locks are needed here.
 384          */
 385         smd_smap = smd->smd_sm =
 386             kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
 387 
 388         for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
 389             smp >= smd->smd_sm; smp--) {
 390                 struct smap *smpfreelist;
 391                 struct sm_freeq *releq;
 392 
 393                 prefetch_smap_w((char *)smp);
 394 
 395                 smp->sm_vp = NULL;
 396                 smp->sm_hash = NULL;
 397                 smp->sm_off = 0;
 398                 smp->sm_bitmap = 0;
 399                 smp->sm_refcnt = 0;
 400                 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
 401                 smp->sm_free_ndx = SMP2SMF_NDX(smp);
 402 
 403                 sm = SMP2SMF(smp);
 404                 releq = sm->sm_releq;
 405 
 406                 smpfreelist = releq->smq_free;
 407                 if (smpfreelist == 0) {
 408                         releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 409                 } else {
 410                         smp->sm_next = smpfreelist;
 411                         smp->sm_prev = smpfreelist->sm_prev;
 412                         smpfreelist->sm_prev = smp;
 413                         smp->sm_prev->sm_next = smp;
 414                         releq->smq_free = smp->sm_next;
 415                 }
 416 
 417                 /*
 418                  * sm_flags = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
 419                  */
 420                 smp->sm_flags = 0;
 421 
 422 #ifdef  SEGKPM_SUPPORT
 423                 /*
 424                  * Due to the fragile prefetch loop no
 425                  * separate function is used here.
 426                  */
 427                 smp->sm_kpme_next = NULL;
 428                 smp->sm_kpme_prev = NULL;
 429                 smp->sm_kpme_page = NULL;
 430 #endif
 431         }
 432 
 433         /*
 434          * Allocate the per color indices that distribute allocation
 435          * requests over the free lists. Each cpu will have a private
 436          * rotor index to spread the allocations evenly across the available
 437          * smap freelists. Init the scpu_last_smap field to the first
 438          * smap element so there is no need to check for NULL.
 439          */
 440         smd_cpu =
 441             kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
 442         for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
 443                 int j;
 444                 for (j = 0; j < smd_ncolor; j++)
 445                         scpu->scpu.scpu_free_ndx[j] = j;
 446                 scpu->scpu.scpu_last_smap = smd_smap;
 447         }
 448 
 449         vpm_init();
 450 
 451 #ifdef DEBUG
 452         /*
 453          * Keep track of which colors are used more often.
 454          */
 455         colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
 456 #endif /* DEBUG */
 457 
 458         return (0);
 459 }
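     /*
      * Illustrative sketch only (segkmap is attached during startup,
      * outside this file; the base/size names and values below are
      * assumptions):
      *
      *	struct segmap_crargs a;
      *
      *	a.prot = PROT_READ | PROT_WRITE;
      *	a.shmsize = 0;		(virtual color size; 0 means one color)
      *	a.nfreelist = 0;	(0 means scale with max_ncpus)
      *	(void) seg_attach(&kas, segkmap_base, segkmap_size, segkmap);
      *	(void) segmap_create(segkmap, (void *)&a);
      */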
 460 
 461 static void
 462 segmap_free(struct seg *seg)
 463 {
 465         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 466 }
 467 
 468 /*
 469  * Do a F_SOFTUNLOCK call over the range requested.
 470  * The range must have already been F_SOFTLOCK'ed.
 471  */
 472 static void
 473 segmap_unlock(
 474         struct hat *hat,
 475         struct seg *seg,
 476         caddr_t addr,
 477         size_t len,
 478         enum seg_rw rw,
 479         struct smap *smp)
 480 {
 481         page_t *pp;
 482         caddr_t adr;
 483         u_offset_t off;
 484         struct vnode *vp;
 485         kmutex_t *smtx;
 486 
 487         ASSERT(smp->sm_refcnt > 0);
 488 
 489 #ifdef lint
 490         seg = seg;
 491 #endif
 492 
 493         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 494 
 495                 /*
 496                  * We're called only from segmap_fault and this was a
 497                  * NOP in case of a kpm based smap, so dangerous things
 498                  * must have happened in the meantime. Pages are prefaulted
 499                  * and locked in segmap_getmapflt and they will not be
 500                  * unlocked until segmap_release.
 501                  */
 502                 panic("segmap_unlock: called with kpm addr %p", (void *)addr);
 503                 /*NOTREACHED*/
 504         }
 505 
 506         vp = smp->sm_vp;
 507         off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 508 
 509         hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
 510         for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
 511                 ushort_t bitmask;
 512 
 513                 /*
 514                  * Use page_find() instead of page_lookup() to
 515                  * find the page since we know that it has a
 516                  * "shared" lock.
 517                  */
 518                 pp = page_find(vp, off);
 519                 if (pp == NULL) {
 520                         panic("segmap_unlock: page not found");
 521                         /*NOTREACHED*/
 522                 }
 523 
 524                 if (rw == S_WRITE) {
 525                         hat_setrefmod(pp);
 526                 } else if (rw != S_OTHER) {
 527                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 528                         "segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
 529                         hat_setref(pp);
 530                 }
 531 
 532                 /*
 533                  * Clear bitmap, if the bit corresponding to "off" is set,
 534                  * since the page and translation are being unlocked.
 535                  */
 536                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
 537 
 538                 /*
 539                  * Large Files: Following assertion is to verify
 540                  * the correctness of the cast to (int) above.
 541                  */
 542                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
 543                 smtx = SMAPMTX(smp);
 544                 mutex_enter(smtx);
 545                 if (smp->sm_bitmap & bitmask) {
 546                         smp->sm_bitmap &= ~bitmask;
 547                 }
 548                 mutex_exit(smtx);
 549 
 550                 page_unlock(pp);
 551         }
 552 }
 553 
 554 #define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */
 555 
 556 /*
 557  * This routine is called via a machine specific fault handling
 558  * routine.  It is also called by software routines wishing to
 559  * lock or unlock a range of addresses.
 560  *
 561  * Note that this routine expects a page-aligned "addr".
 562  */
 563 faultcode_t
 564 segmap_fault(
 565         struct hat *hat,
 566         struct seg *seg,
 567         caddr_t addr,
 568         size_t len,
 569         enum fault_type type,
 570         enum seg_rw rw)
 571 {
 572         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 573         struct smap *smp;
 574         page_t *pp, **ppp;
 575         struct vnode *vp;
 576         u_offset_t off;
 577         page_t *pl[MAXPPB + 1];
 578         uint_t prot;
 579         u_offset_t addroff;
 580         caddr_t adr;
 581         int err;
 582         u_offset_t sm_off;
 583         int hat_flag;
 584 
 585         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 586                 int newpage;
 587                 kmutex_t *smtx;
 588 
 589                 /*
 590                  * Pages are successfully prefaulted and locked in
 591                  * segmap_getmapflt and can't be unlocked until
 592                  * segmap_release. No hat mappings have to be locked
 593                  * and they also can't be unlocked as long as the
 594                  * caller owns an active kpm addr.
 595                  */
 596 #ifndef DEBUG
 597                 if (type != F_SOFTUNLOCK)
 598                         return (0);
 599 #endif
 600 
 601                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 602                         panic("segmap_fault: smap not found "
 603                             "for addr %p", (void *)addr);
 604                         /*NOTREACHED*/
 605                 }
 606 
 607                 smtx = SMAPMTX(smp);
 608 #ifdef  DEBUG
 609                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 610                 if (newpage) {
 611                         cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
 612                             (void *)smp);
 613                 }
 614 
 615                 if (type != F_SOFTUNLOCK) {
 616                         mutex_exit(smtx);
 617                         return (0);
 618                 }
 619 #endif
 620                 mutex_exit(smtx);
 621                 vp = smp->sm_vp;
 622                 sm_off = smp->sm_off;
 623 
 624                 if (vp == NULL)
 625                         return (FC_MAKE_ERR(EIO));
 626 
 627                 ASSERT(smp->sm_refcnt > 0);
 628 
 629                 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 630                 if (addroff + len > MAXBSIZE)
 631                         panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
 632                             (void *)(addr + len));
 633 
 634                 off = sm_off + addroff;
 635 
 636                 pp = page_find(vp, off);
 637 
 638                 if (pp == NULL)
 639                         panic("segmap_fault: softunlock page not found");
 640 
 641                 /*
 642                  * Set ref bit also here in case of S_OTHER to avoid the
 643                  * overhead of supporting other cases than F_SOFTUNLOCK
 644                  * with segkpm. We can do this because the underlying
 645                  * pages are locked anyway.
 646                  */
 647                 if (rw == S_WRITE) {
 648                         hat_setrefmod(pp);
 649                 } else {
 650                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 651                             "segmap_fault:pp %p vp %p offset %llx",
 652                             pp, vp, off);
 653                         hat_setref(pp);
 654                 }
 655 
 656                 return (0);
 657         }
 658 
 659         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 660         smp = GET_SMAP(seg, addr);
 661         vp = smp->sm_vp;
 662         sm_off = smp->sm_off;
 663 
 664         if (vp == NULL)
 665                 return (FC_MAKE_ERR(EIO));
 666 
 667         ASSERT(smp->sm_refcnt > 0);
 668 
 669         addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 670         if (addroff + len > MAXBSIZE) {
 671                 panic("segmap_fault: endaddr %p "
 672                     "exceeds MAXBSIZE chunk", (void *)(addr + len));
 673                 /*NOTREACHED*/
 674         }
 675         off = sm_off + addroff;
 676 
 677         /*
 678          * First handle the easy stuff
 679          */
 680         if (type == F_SOFTUNLOCK) {
 681                 segmap_unlock(hat, seg, addr, len, rw, smp);
 682                 return (0);
 683         }
 684 
 685         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 686             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 687         err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
 688             seg, addr, rw, CRED(), NULL);
 689 
 690         if (err)
 691                 return (FC_MAKE_ERR(err));
 692 
 693         prot &= smd->smd_prot;
 694 
 695         /*
 696          * Handle all pages returned in the pl[] array.
 697          * This loop is coded on the assumption that if
 698          * there was no error from the VOP_GETPAGE routine,
 699          * the page list returned will contain all the
 700          * needed pages for the vp from [off..off + len].
 701          */
 702         ppp = pl;
 703         while ((pp = *ppp++) != NULL) {
 704                 u_offset_t poff;
 705                 ASSERT(pp->p_vnode == vp);
 706                 hat_flag = HAT_LOAD;
 707 
 708                 /*
 709                  * Verify that the pages returned are within the range
 710                  * of this segmap region.  Note that it is theoretically
 711                  * possible for pages outside this range to be returned,
 712                  * but it is not very likely.  If we cannot use the
 713                  * page here, just release it and go on to the next one.
 714                  */
 715                 if (pp->p_offset < sm_off ||
 716                     pp->p_offset >= sm_off + MAXBSIZE) {
 717                         (void) page_release(pp, 1);
 718                         continue;
 719                 }
 720 
 721                 ASSERT(hat == kas.a_hat);
 722                 poff = pp->p_offset;
 723                 adr = addr + (poff - off);
 724                 if (adr >= addr && adr < addr + len) {
 725                         hat_setref(pp);
 726                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 727                             "segmap_fault:pp %p vp %p offset %llx",
 728                             pp, vp, poff);
 729                         if (type == F_SOFTLOCK)
 730                                 hat_flag = HAT_LOAD_LOCK;
 731                 }
 732 
 733                 /*
 734                  * Deal with VMODSORT pages here. If we know this is a write
 735                  * do the setmod now and allow write protection.
 736                  * As long as it's modified or not S_OTHER, remove write
 737                  * protection. With S_OTHER it's up to the FS to deal with this.
 738                  */
 739                 if (IS_VMODSORT(vp)) {
 740                         if (rw == S_WRITE)
 741                                 hat_setmod(pp);
 742                         else if (rw != S_OTHER && !hat_ismod(pp))
 743                                 prot &= ~PROT_WRITE;
 744                 }
 745 
 746                 hat_memload(hat, adr, pp, prot, hat_flag);
 747                 if (hat_flag != HAT_LOAD_LOCK)
 748                         page_unlock(pp);
 749         }
 750         return (0);
 751 }
 752 
 753 /*
 754  * This routine is used to start I/O on pages asynchronously.
 755  */
 756 static faultcode_t
 757 segmap_faulta(struct seg *seg, caddr_t addr)
 758 {
 759         struct smap *smp;
 760         struct vnode *vp;
 761         u_offset_t off;
 762         int err;
 763 
 764         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 765                 int     newpage;
 766                 kmutex_t *smtx;
 767 
 768                 /*
 769                  * Pages are successfully prefaulted and locked in
 770                  * segmap_getmapflt and can't be unlocked until
 771                  * segmap_release. No hat mappings have to be locked
 772                  * and they also can't be unlocked as long as the
 773                  * caller owns an active kpm addr.
 774                  */
 775 #ifdef  DEBUG
 776                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 777                         panic("segmap_faulta: smap not found "
 778                             "for addr %p", (void *)addr);
 779                         /*NOTREACHED*/
 780                 }
 781 
 782                 smtx = SMAPMTX(smp);
 783                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 784                 mutex_exit(smtx);
 785                 if (newpage)
 786                         cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
 787                             (void *)smp);
 788 #endif
 789                 return (0);
 790         }
 791 
 792         segmapcnt.smp_faulta.value.ul++;
 793         smp = GET_SMAP(seg, addr);
 794 
 795         ASSERT(smp->sm_refcnt > 0);
 796 
 797         vp = smp->sm_vp;
 798         off = smp->sm_off;
 799 
 800         if (vp == NULL) {
 801                 cmn_err(CE_WARN, "segmap_faulta - no vp");
 802                 return (FC_MAKE_ERR(EIO));
 803         }
 804 
 805         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 806             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 807 
 808         err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
 809             & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
 810             seg, addr, S_READ, CRED(), NULL);
 811 
 812         if (err)
 813                 return (FC_MAKE_ERR(err));
 814         return (0);
 815 }
 816 
 817 /*ARGSUSED*/
 818 static int
 819 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
 820 {
 821         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 822 
 823         ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
 824 
 825         /*
 826          * Need not acquire the segment lock since
 827          * "smd_prot" is a read-only field.
 828          */
 829         return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
 830 }
 831 
 832 static int
 833 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 834 {
 835         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 836         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
 837 
 838         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 839 
 840         if (pgno != 0) {
 841                 do {
 842                         protv[--pgno] = smd->smd_prot;
 843                 } while (pgno != 0);
 844         }
 845         return (0);
 846 }
 847 
 848 static u_offset_t
 849 segmap_getoffset(struct seg *seg, caddr_t addr)
 850 {
 851         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 852 
 853         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 854 
 855         return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
 856 }
 857 
 858 /*ARGSUSED*/
 859 static int
 860 segmap_gettype(struct seg *seg, caddr_t addr)
 861 {
 862         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 863 
 864         return (MAP_SHARED);
 865 }
 866 
 867 /*ARGSUSED*/
 868 static int
 869 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 870 {
 871         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 872 
 873         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 874 
 875         /* XXX - This doesn't make any sense */
 876         *vpp = smd->smd_sm->sm_vp;
 877         return (0);
 878 }
 879 
 880 /*
 881  * Check to see if it makes sense to do kluster/read ahead to
 882  * addr + delta relative to the mapping at addr.  We assume here
 883  * that delta is a signed PAGESIZE'd multiple (which can be negative).
 884  *
 885  * For segmap we always "approve" of this action from our standpoint.
 886  */
 887 /*ARGSUSED*/
 888 static int
 889 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 890 {
 891         return (0);
 892 }
 893 
 894 /*
 895  * Special private segmap operations
 896  */
 897 
 898 /*
 899  * Add smap to the appropriate free list.
 900  */
 901 static void
 902 segmap_smapadd(struct smap *smp)
 903 {
 904         struct smfree *sm;
 905         struct smap *smpfreelist;
 906         struct sm_freeq *releq;
 907 
 908         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 909 
 910         if (smp->sm_refcnt != 0) {
 911                 panic("segmap_smapadd");
 912                 /*NOTREACHED*/
 913         }
 914 
 915         sm = &smd_free[smp->sm_free_ndx];
 916         /*
 917          * Add to the tail of the release queue
 918          * Note that sm_releq and sm_allocq could toggle
 919          * before we get the lock. This does not affect
 920          * correctness as the 2 queues are only maintained
 921          * to reduce lock pressure.
 922          */
 923         releq = sm->sm_releq;
 924         if (releq == &sm->sm_freeq[0])
 925                 smp->sm_flags |= SM_QNDX_ZERO;
 926         else
 927                 smp->sm_flags &= ~SM_QNDX_ZERO;
 928         mutex_enter(&releq->smq_mtx);
 929         smpfreelist = releq->smq_free;
 930         if (smpfreelist == 0) {
 931                 int want;
 932 
 933                 releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 934                 /*
 935                  * Both queue mutexes held to set sm_want;
 936                  * snapshot the value before dropping releq mutex.
 937                  * If sm_want appears after the releq mutex is dropped,
 938                  * then the smap just freed is already gone.
 939                  */
 940                 want = sm->sm_want;
 941                 mutex_exit(&releq->smq_mtx);
 942                 /*
 943                  * See if there was a waiter before dropping the releq mutex
 944                  * then recheck after obtaining sm_freeq[0] mutex as
 945                  * another thread may have already signaled.
 946                  */
 947                 if (want) {
 948                         mutex_enter(&sm->sm_freeq[0].smq_mtx);
 949                         if (sm->sm_want)
 950                                 cv_signal(&sm->sm_free_cv);
 951                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
 952                 }
 953         } else {
 954                 smp->sm_next = smpfreelist;
 955                 smp->sm_prev = smpfreelist->sm_prev;
 956                 smpfreelist->sm_prev = smp;
 957                 smp->sm_prev->sm_next = smp;
 958                 mutex_exit(&releq->smq_mtx);
 959         }
 960 }
 961 
 962 
 963 static struct smap *
 964 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
 965 {
 966         struct smap **hpp;
 967         struct smap *tmp;
 968         kmutex_t *hmtx;
 969 
 970         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 971         ASSERT(smp->sm_vp == NULL);
 972         ASSERT(smp->sm_hash == NULL);
 973         ASSERT(smp->sm_prev == NULL);
 974         ASSERT(smp->sm_next == NULL);
 975         ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
 976 
 977         hmtx = SHASHMTX(hashid);
 978 
 979         mutex_enter(hmtx);
 980         /*
 981          * First we need to verify that no one has created a smp
 982          * with (vp,off) as its tag before us.
 983          */
 984         for (tmp = smd_hash[hashid].sh_hash_list;
 985             tmp != NULL; tmp = tmp->sm_hash)
 986                 if (tmp->sm_vp == vp && tmp->sm_off == off)
 987                         break;
 988 
 989         if (tmp == NULL) {
 990                 /*
 991                  * No one created one yet.
 992                  *
 993                  * Funniness here - we don't increment the ref count on the
 994                  * vnode * even though we have another pointer to it here.
 995                  * The reason for this is that we don't want the fact that
 996                  * a seg_map entry somewhere refers to a vnode to prevent the
 997                  * vnode * itself from going away.  This is because this
 998                  * reference to the vnode is a "soft one".  In the case where
 999                  * a mapping is being used by a rdwr [or directory routine?]
1000                  * there already has to be a non-zero ref count on the vnode.
1001                  * In the case where the vp has been freed and the smap
1002                  * structure is on the free list, there are no pages in memory
1003                  * that can refer to the vnode.  Thus even if we reuse the same
1004                  * vnode/smap structure for a vnode which has the same
1005                  * address but represents a different object, we are ok.
1006                  */
1007                 smp->sm_vp = vp;
1008                 smp->sm_off = off;
1009 
1010                 hpp = &smd_hash[hashid].sh_hash_list;
1011                 smp->sm_hash = *hpp;
1012                 *hpp = smp;
1013 #ifdef SEGMAP_HASHSTATS
1014                 smd_hash_len[hashid]++;
1015 #endif
1016         }
1017         mutex_exit(hmtx);
1018 
1019         return (tmp);
1020 }
1021 
1022 static void
1023 segmap_hashout(struct smap *smp)
1024 {
1025         struct smap **hpp, *hp;
1026         struct vnode *vp;
1027         kmutex_t *mtx;
1028         int hashid;
1029         u_offset_t off;
1030 
1031         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1032 
1033         vp = smp->sm_vp;
1034         off = smp->sm_off;
1035 
1036         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1037         mtx = SHASHMTX(hashid);
1038         mutex_enter(mtx);
1039 
1040         hpp = &smd_hash[hashid].sh_hash_list;
1041         for (;;) {
1042                 hp = *hpp;
1043                 if (hp == NULL) {
1044                         panic("segmap_hashout");
1045                         /*NOTREACHED*/
1046                 }
1047                 if (hp == smp)
1048                         break;
1049                 hpp = &hp->sm_hash;
1050         }
1051 
1052         *hpp = smp->sm_hash;
1053         smp->sm_hash = NULL;
1054 #ifdef SEGMAP_HASHSTATS
1055         smd_hash_len[hashid]--;
1056 #endif
1057         mutex_exit(mtx);
1058 
1059         smp->sm_vp = NULL;
1060         smp->sm_off = (u_offset_t)0;
1061 
1062 }
1063 
1064 /*
1065  * Attempt to free unmodified, unmapped, and non locked segmap
1066  * pages.
1067  */
1068 void
1069 segmap_pagefree(struct vnode *vp, u_offset_t off)
1070 {
1071         u_offset_t pgoff;
1072         page_t  *pp;
1073 
1074         for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1075 
1076                 if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1077                         continue;
1078 
1079                 switch (page_release(pp, 1)) {
1080                 case PGREL_NOTREL:
1081                         segmapcnt.smp_free_notfree.value.ul++;
1082                         break;
1083                 case PGREL_MOD:
1084                         segmapcnt.smp_free_dirty.value.ul++;
1085                         break;
1086                 case PGREL_CLEAN:
1087                         segmapcnt.smp_free.value.ul++;
1088                         break;
1089                 }
1090         }
1091 }
1092 
1093 /*
1094  * Locks held on entry: smap lock
1095  * Locks held on exit : smap lock.
1096  */
1097 
1098 static void
1099 grab_smp(struct smap *smp, page_t *pp)
1100 {
1101         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1102         ASSERT(smp->sm_refcnt == 0);
1103 
1104         if (smp->sm_vp != (struct vnode *)NULL) {
1105                 struct vnode    *vp = smp->sm_vp;
1106                 u_offset_t      off = smp->sm_off;
1107                 /*
1108                  * Destroy old vnode association and
1109                  * unload any hardware translations to
1110                  * the old object.
1111                  */
1112                 smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1113                 segmap_hashout(smp);
1114 
1115                 /*
1116                  * This node is off freelist and hashlist,
1117                  * so there is no reason to drop/reacquire sm_mtx
1118                  * across calls to hat_unload.
1119                  */
1120                 if (segmap_kpm) {
1121                         caddr_t vaddr;
1122                         int hat_unload_needed = 0;
1123 
1124                         /*
1125                          * unload kpm mapping
1126                          */
1127                         if (pp != NULL) {
1128                                 vaddr = hat_kpm_page2va(pp, 1);
1129                                 hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1130                                 page_unlock(pp);
1131                         }
1132 
1133                         /*
1134                          * Check if we have (also) the rare case of a
1135                          * non kpm mapping.
1136                          */
1137                         if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1138                                 hat_unload_needed = 1;
1139                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1140                         }
1141 
1142                         if (hat_unload_needed) {
1143                                 hat_unload(kas.a_hat, segkmap->s_base +
1144                                     ((smp - smd_smap) * MAXBSIZE),
1145                                     MAXBSIZE, HAT_UNLOAD);
1146                         }
1147 
1148                 } else {
1149                         ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1150                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1151                         hat_unload(kas.a_hat, segkmap->s_base +
1152                             ((smp - smd_smap) * MAXBSIZE),
1153                             MAXBSIZE, HAT_UNLOAD);
1154                 }
1155                 segmap_pagefree(vp, off);
1156         }
1157 }
1158 
1159 static struct smap *
1160 get_free_smp(int free_ndx)
1161 {
1162         struct smfree *sm;
1163         kmutex_t *smtx;
1164         struct smap *smp, *first;
1165         struct sm_freeq *allocq, *releq;
1166         struct kpme *kpme;
1167         page_t *pp = NULL;
1168         int end_ndx, page_locked = 0;
1169 
1170         end_ndx = free_ndx;
1171         sm = &smd_free[free_ndx];
1172 
1173 retry_queue:
1174         allocq = sm->sm_allocq;
1175         mutex_enter(&allocq->smq_mtx);
1176 
1177         if ((smp = allocq->smq_free) == NULL) {
1178 
1179 skip_queue:
1180                 /*
1181                  * The alloc list is empty or this queue is being skipped;
1182                  * first see if the allocq toggled.
1183                  */
1184                 if (sm->sm_allocq != allocq) {
1185                         /* queue changed */
1186                         mutex_exit(&allocq->smq_mtx);
1187                         goto retry_queue;
1188                 }
1189                 releq = sm->sm_releq;
1190                 if (!mutex_tryenter(&releq->smq_mtx)) {
1191                         /* cannot get releq; a free smp may be there now */
1192                         mutex_exit(&allocq->smq_mtx);
1193 
1194                         /*
1195                          * This loop could spin forever if this thread has
1196                          * higher priority than the thread that is holding
1197                          * releq->smq_mtx. In order to force the other thread
1198                          * to run, we'll lock/unlock the mutex which is safe
1199                          * since we just unlocked the allocq mutex.
1200                          */
1201                         mutex_enter(&releq->smq_mtx);
1202                         mutex_exit(&releq->smq_mtx);
1203                         goto retry_queue;
1204                 }
1205                 if (releq->smq_free == NULL) {
1206                         /*
1207                          * This freelist is empty.
1208                          * This should not happen unless clients
1209                          * are failing to release the segmap
1210                          * window after accessing the data.
1211                          * Before resorting to sleeping, try
1212                          * the next list of the same color.
1213                          */
1214                         free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1215                         if (free_ndx != end_ndx) {
1216                                 mutex_exit(&releq->smq_mtx);
1217                                 mutex_exit(&allocq->smq_mtx);
1218                                 sm = &smd_free[free_ndx];
1219                                 goto retry_queue;
1220                         }
1221                         /*
1222                          * Tried all freelists of the same color once,
1223                          * wait on this list and hope something gets freed.
1224                          */
1225                         segmapcnt.smp_get_nofree.value.ul++;
1226                         sm->sm_want++;
1227                         mutex_exit(&sm->sm_freeq[1].smq_mtx);
1228                         cv_wait(&sm->sm_free_cv,
1229                             &sm->sm_freeq[0].smq_mtx);
1230                         sm->sm_want--;
1231                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
1232                         sm = &smd_free[free_ndx];
1233                         goto retry_queue;
1234                 } else {
1235                         /*
1236                          * Something on the rele queue; flip the alloc
1237                          * and rele queues and retry.
1238                          */
1239                         sm->sm_allocq = releq;
1240                         sm->sm_releq = allocq;
1241                         mutex_exit(&allocq->smq_mtx);
1242                         mutex_exit(&releq->smq_mtx);
1243                         if (page_locked) {
1244                                 delay(hz >> 2);
1245                                 page_locked = 0;
1246                         }
1247                         goto retry_queue;
1248                 }
1249         } else {
1250                 /*
1251                  * Fastpath the case we get the smap mutex
1252                  * on the first try.
1253                  */
1254                 first = smp;
1255 next_smap:
1256                 smtx = SMAPMTX(smp);
1257                 if (!mutex_tryenter(smtx)) {
1258                         /*
1259                          * Another thread is trying to reclaim this slot.
1260                          * Skip to the next queue or smap.
1261                          */
1262                         if ((smp = smp->sm_next) == first) {
1263                                 goto skip_queue;
1264                         } else {
1265                                 goto next_smap;
1266                         }
1267                 } else {
1268                         /*
1269                          * if kpme exists, get shared lock on the page
1270                          */
1271                         if (segmap_kpm && smp->sm_vp != NULL) {
1272 
1273                                 kpme = GET_KPME(smp);
1274                                 pp = kpme->kpe_page;
1275 
1276                                 if (pp != NULL) {
1277                                         if (!page_trylock(pp, SE_SHARED)) {
1278                                                 smp = smp->sm_next;
1279                                                 mutex_exit(smtx);
1280                                                 page_locked = 1;
1281 
1282                                                 pp = NULL;
1283 
1284                                                 if (smp == first) {
1285                                                         goto skip_queue;
1286                                                 } else {
1287                                                         goto next_smap;
1288                                                 }
1289                                         } else {
1290                                                 if (kpme->kpe_page == NULL) {
1291                                                         page_unlock(pp);
1292                                                         pp = NULL;
1293                                                 }
1294                                         }
1295                                 }
1296                         }
1297 
1298                         /*
1299                          * At this point, we've selected smp.  Remove smp
1300                          * from its freelist.  If smp is the first one in
1301                          * the freelist, update the head of the freelist.
1302                          */
1303                         if (first == smp) {
1304                                 ASSERT(first == allocq->smq_free);
1305                                 allocq->smq_free = smp->sm_next;
1306                         }
1307 
1308                         /*
1309                          * if the head of the freelist still points to smp,
1310                          * then there are no more free smaps in that list.
1311                          */
1312                         if (allocq->smq_free == smp)
1313                                 /*
1314                                  * Took the last one
1315                                  */
1316                                 allocq->smq_free = NULL;
1317                         else {
1318                                 smp->sm_prev->sm_next = smp->sm_next;
1319                                 smp->sm_next->sm_prev = smp->sm_prev;
1320                         }
1321                         mutex_exit(&allocq->smq_mtx);
1322                         smp->sm_prev = smp->sm_next = NULL;
1323 
1324                         /*
1325                          * if pp != NULL, pp must have been locked;
1326                          * grab_smp() unlocks pp.
1327                          */
1328                         ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1329                         grab_smp(smp, pp);
1330                         /* return smp locked. */
1331                         ASSERT(SMAPMTX(smp) == smtx);
1332                         ASSERT(MUTEX_HELD(smtx));
1333                         return (smp);
1334                 }
1335         }
1336 }
1337 
1338 /*
1339  * Special public segmap operations
1340  */
1341 
1342 /*
1343  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1344  * If softlock is TRUE, then set things up so that it looks like a call
1345  * to segmap_fault with F_SOFTLOCK.
1346  *
1347  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1348  *
1349  * All fields in the generic segment (struct seg) are considered to be
1350  * read-only for "segmap" even though the kernel address space (kas) may
1351  * not be locked, hence no lock is needed to access them.
1352  */
1353 int
1354 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1355 {
1356         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1357         page_t *pp;
1358         u_offset_t off;
1359         struct smap *smp;
1360         struct vnode *vp;
1361         caddr_t eaddr;
1362         int newpage = 0;
1363         uint_t prot;
1364         kmutex_t *smtx;
1365         int hat_flag;
1366 
1367         ASSERT(seg->s_as == &kas);
1368 
1369         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1370                 /*
1371                  * Pages are successfully prefaulted and locked in
1372                  * segmap_getmapflt and can't be unlocked until
1373                  * segmap_release. The SM_KPM_NEWPAGE flag is set
1374                  * in segmap_pagecreate_kpm when new pages are created,
1375                  * and it is returned as the "newpage" indication here.
1376                  */
1377                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1378                         panic("segmap_pagecreate: smap not found "
1379                             "for addr %p", (void *)addr);
1380                         /*NOTREACHED*/
1381                 }
1382 
1383                 smtx = SMAPMTX(smp);
1384                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1385                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1386                 mutex_exit(smtx);
1387 
1388                 return (newpage);
1389         }
1390 
1391         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1392 
1393         eaddr = addr + len;
1394         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1395 
1396         smp = GET_SMAP(seg, addr);
1397 
1398         /*
1399          * We don't grab smp mutex here since we assume the smp
1400          * has a refcnt set already which prevents the slot from
1401          * changing its id.
1402          */
1403         ASSERT(smp->sm_refcnt > 0);
1404 
1405         vp = smp->sm_vp;
1406         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1407         prot = smd->smd_prot;
1408 
1409         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1410                 hat_flag = HAT_LOAD;
1411                 pp = page_lookup(vp, off, SE_SHARED);
1412                 if (pp == NULL) {
1413                         ushort_t bitindex;
1414 
1415                         if ((pp = page_create_va(vp, off,
1416                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1417                                 panic("segmap_pagecreate: page_create failed");
1418                                 /*NOTREACHED*/
1419                         }
1420                         newpage = 1;
1421                         page_io_unlock(pp);
1422 
1423                         /*
1424                          * Since pages created here do not contain valid
1425                          * data until the caller writes into them, the
1426                          * "exclusive" lock will not be dropped to prevent
1427                          * other users from accessing the page.  We also
1428                          * have to lock the translation to prevent a fault
1429                          * from occurring when the virtual address mapped by
1430                          * this page is written into.  This is necessary to
1431                          * avoid a deadlock since we haven't dropped the
1432                          * "exclusive" lock.
1433                          */
1434                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1435 
1436                         /*
1437                          * Large Files: The following assertion is to
1438                          * verify the cast above.
1439                          */
1440                         ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1441                         smtx = SMAPMTX(smp);
1442                         mutex_enter(smtx);
1443                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1444                         mutex_exit(smtx);
1445 
1446                         hat_flag = HAT_LOAD_LOCK;
1447                 } else if (softlock) {
1448                         hat_flag = HAT_LOAD_LOCK;
1449                 }
1450 
1451                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1452                         hat_setmod(pp);
1453 
1454                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1455 
1456                 if (hat_flag != HAT_LOAD_LOCK)
1457                         page_unlock(pp);
1458 
1459                 TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1460                     "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1461                     seg, addr, pp, vp, off);
1462         }
1463 
1464         return (newpage);
1465 }
1466 
1467 void
1468 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1469 {
1470         struct smap     *smp;
1471         ushort_t        bitmask;
1472         page_t          *pp;
1473         struct  vnode   *vp;
1474         u_offset_t      off;
1475         caddr_t         eaddr;
1476         kmutex_t        *smtx;
1477 
1478         ASSERT(seg->s_as == &kas);
1479 
1480         eaddr = addr + len;
1481         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1482 
1483         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1484                 /*
1485                  * Pages are successfully prefaulted and locked in
1486                  * segmap_getmapflt and can't be unlocked until
1487                  * segmap_release, so no pages or hat mappings have
1488                  * to be unlocked at this point.
1489                  */
1490 #ifdef DEBUG
1491                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1492                         panic("segmap_pageunlock: smap not found "
1493                             "for addr %p", (void *)addr);
1494                         /*NOTREACHED*/
1495                 }
1496 
1497                 ASSERT(smp->sm_refcnt > 0);
1498                 mutex_exit(SMAPMTX(smp));
1499 #endif
1500                 return;
1501         }
1502 
1503         smp = GET_SMAP(seg, addr);
1504         smtx = SMAPMTX(smp);
1505 
1506         ASSERT(smp->sm_refcnt > 0);
1507 
1508         vp = smp->sm_vp;
1509         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1510 
1511         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1512                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1513 
1514                 /*
1515                  * Large Files: The following assertion verifies
1516                  * the correctness of the cast to (int) above.
1517                  */
1518                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1519 
1520                 /*
1521                  * If the bit corresponding to "off" is set,
1522                  * clear this bit in the bitmap, unlock translations,
1523                  * and release the "exclusive" lock on the page.
1524                  */
1525                 if (smp->sm_bitmap & bitmask) {
1526                         mutex_enter(smtx);
1527                         smp->sm_bitmap &= ~bitmask;
1528                         mutex_exit(smtx);
1529 
1530                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1531 
1532                         /*
1533                          * Use page_find() instead of page_lookup() to
1534                          * find the page since we know that it has
1535                          * "exclusive" lock.
1536                          */
1537                         pp = page_find(vp, off);
1538                         if (pp == NULL) {
1539                                 panic("segmap_pageunlock: page not found");
1540                                 /*NOTREACHED*/
1541                         }
1542                         if (rw == S_WRITE) {
1543                                 hat_setrefmod(pp);
1544                         } else if (rw != S_OTHER) {
1545                                 hat_setref(pp);
1546                         }
1547 
1548                         page_unlock(pp);
1549                 }
1550         }
1551 }
1552 
1553 caddr_t
1554 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1555 {
1556         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1557 }
1558 
1559 /*
1560  * This is the magic virtual address that offset 0 of an ELF
1561  * file gets mapped to in user space.  It is used to pick
1562  * the VAC color on the freelist.
1563  */
1564 #define ELF_OFFZERO_VA  (0x10000)
1565 /*
1566  * segmap_getmapflt allocates a MAXBSIZE-sized slot to map the vnode vp
1567  * in the range [off, off + len).  off doesn't need to be MAXBSIZE aligned.
1568  * The returned address is always MAXBSIZE aligned.
1569  *
1570  * If forcefault is nonzero and the MMU translations haven't yet been created,
1571  * segmap_getmapflt will call segmap_fault(..., F_INVAL, rw) to create them.
1572  */
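     /*
      * An illustrative sketch of a hypothetical read-path caller (uio, off
      * and n are assumptions for the sketch, not code from this file):
      *
      *	base = segmap_getmapflt(segkmap, vp, off, n, 1, S_READ);
      *	error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uio);
      *	error = segmap_release(segkmap, base, 0);
      *
      * A caller may instead pass SM_* flags (e.g. SM_DONTNEED) to
      * segmap_release to influence how the pages are cached.
      */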
1573 caddr_t
1574 segmap_getmapflt(
1575         struct seg *seg,
1576         struct vnode *vp,
1577         u_offset_t off,
1578         size_t len,
1579         int forcefault,
1580         enum seg_rw rw)
1581 {
1582         struct smap *smp, *nsmp;
1583         extern struct vnode *common_specvp();
1584         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1585         u_offset_t baseoff;
1586         int newslot;
1587         caddr_t vaddr;
1588         int color, hashid;
1589         kmutex_t *hashmtx, *smapmtx;
1590         struct smfree *sm;
1591         page_t  *pp;
1592         struct kpme *kpme;
1593         uint_t  prot;
1594         caddr_t base;
1595         page_t  *pl[MAXPPB + 1];
1596         int     error;
1597         int     is_kpm = 1;
1598 
1599         ASSERT(seg->s_as == &kas);
1600         ASSERT(seg == segkmap);
1601 
1602         baseoff = off & (offset_t)MAXBMASK;
1603         if (off + len > baseoff + MAXBSIZE) {
1604                 panic("segmap_getmap bad len");
1605                 /*NOTREACHED*/
1606         }
1607 
1608         /*
1609          * If this is a block device we have to be sure to use the
1610          * "common" block device vnode for the mapping.
1611          */
1612         if (vp->v_type == VBLK)
1613                 vp = common_specvp(vp);
1614 
1615         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1616 
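             /*
              * Use the classic segmap virtual address range rather than
              * segkpm when segkpm is disabled, or when SM_PAGECREATE is
              * requested for anything other than a write.
              */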
1617         if (segmap_kpm == 0 ||
1618             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1619                 is_kpm = 0;
1620         }
1621 
1622         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1623         hashmtx = SHASHMTX(hashid);
1624 
1625 retry_hash:
1626         mutex_enter(hashmtx);
1627         for (smp = smd_hash[hashid].sh_hash_list;
1628             smp != NULL; smp = smp->sm_hash)
1629                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1630                         break;
1631         mutex_exit(hashmtx);
1632 
1633 vrfy_smp:
1634         if (smp != NULL) {
1635 
1636                 ASSERT(vp->v_count != 0);
1637 
1638                 /*
1639                  * Get smap lock and recheck its tag. The hash lock
1640                  * is dropped since the hash is based on (vp, off)
1641                  * and (vp, off) won't change when we have smap mtx.
1642                  */
1643                 smapmtx = SMAPMTX(smp);
1644                 mutex_enter(smapmtx);
1645                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1646                         mutex_exit(smapmtx);
1647                         goto retry_hash;
1648                 }
1649 
1650                 if (smp->sm_refcnt == 0) {
1651 
1652                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1653 
1654                         /*
1655                          * Could still be on the free list. However, this
1656                          * could also be an smp that is transitioning from
1657                          * the free list when we have too much contention
1658                          * for the smapmtx's. In this case, we have an
1659                          * unlocked smp that is not on the free list any
1660                          * longer, but still has a 0 refcnt.  The only way
1661                          * to be sure is to check the freelist pointers.
1662                          * Since we now have the smapmtx, we are guaranteed
1663                          * that the (vp, off) won't change, so we are safe
1664                          * to reclaim it.  get_free_smp() knows that this
1665                          * can happen, and it will check the refcnt.
1666                          */
1667 
1668                         if ((smp->sm_next != NULL)) {
1669                                 struct sm_freeq *freeq;
1670 
1671                                 ASSERT(smp->sm_prev != NULL);
1672                                 sm = &smd_free[smp->sm_free_ndx];
1673 
1674                                 if (smp->sm_flags & SM_QNDX_ZERO)
1675                                         freeq = &sm->sm_freeq[0];
1676                                 else
1677                                         freeq = &sm->sm_freeq[1];
1678 
1679                                 mutex_enter(&freeq->smq_mtx);
1680                                 if (freeq->smq_free != smp) {
1681                                         /*
1682                                          * fastpath normal case
1683                                          */
1684                                         smp->sm_prev->sm_next = smp->sm_next;
1685                                         smp->sm_next->sm_prev = smp->sm_prev;
1686                                 } else if (smp == smp->sm_next) {
1687                                         /*
1688                                          * Taking the last smap on freelist
1689                                          */
1690                                         freeq->smq_free = NULL;
1691                                 } else {
1692                                         /*
1693                                          * Reclaiming 1st smap on list
1694                                          */
1695                                         freeq->smq_free = smp->sm_next;
1696                                         smp->sm_prev->sm_next = smp->sm_next;
1697                                         smp->sm_next->sm_prev = smp->sm_prev;
1698                                 }
1699                                 mutex_exit(&freeq->smq_mtx);
1700                                 smp->sm_prev = smp->sm_next = NULL;
1701                         } else {
1702                                 ASSERT(smp->sm_prev == NULL);
1703                                 segmapcnt.smp_stolen.value.ul++;
1704                         }
1705 
1706                 } else {
1707                         segmapcnt.smp_get_use.value.ul++;
1708                 }
1709                 smp->sm_refcnt++;            /* another user */
1710 
1711                 /*
1712                  * We don't invoke segmap_fault via TLB miss, so we set ref
1713                  * and mod bits in advance. For S_OTHER we set them in
1714                  * segmap_fault F_SOFTUNLOCK.
1715                  */
1716                 if (is_kpm) {
1717                         if (rw == S_WRITE) {
1718                                 smp->sm_flags |= SM_WRITE_DATA;
1719                         } else if (rw == S_READ) {
1720                                 smp->sm_flags |= SM_READ_DATA;
1721                         }
1722                 }
1723                 mutex_exit(smapmtx);
1724 
1725                 newslot = 0;
1726         } else {
1727 
1728                 uint32_t free_ndx, *free_ndxp;
1729                 union segmap_cpu *scpu;
1730 
1731                 /*
1732                  * On a PAC machine or a machine with anti-alias
1733                  * hardware, smd_colormsk will be zero.
1734                  *
1735                  * On a VAC machine, pick the color by offset in the
1736                  * file so we won't get VAC conflicts on ELF files.
1737                  * On data files the color does not matter, but we
1738                  * don't know what kind of file this is, so we always
1739                  * pick the color by offset.  This causes the color
1740                  * corresponding to file offset zero to be used more
1741                  * heavily.
1742                  */
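                     /*
                      * Illustrative arithmetic with assumed values (not taken
                      * from this file): with an 8K MAXBSIZE (MAXBSHIFT == 13)
                      * and smd_colormsk == 3, a baseoff of 0x16000 yields
                      * color (0x16000 >> 13) & 3 == 11 & 3 == 3.
                      */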
1743                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1744                 scpu = smd_cpu+CPU->cpu_seqid;
1745                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1746                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1747 #ifdef DEBUG
1748                 colors_used[free_ndx]++;
1749 #endif /* DEBUG */
1750 
1751                 /*
1752                  * Get a locked smp slot from the free list.
1753                  */
1754                 smp = get_free_smp(free_ndx);
1755                 smapmtx = SMAPMTX(smp);
1756 
1757                 ASSERT(smp->sm_vp == NULL);
1758 
1759                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1760                         /*
1761                          * Failed to hash in; an smap for (vp, off) exists
1762                          * now.  Put ours back on the free list and use that.
1763                          */
1764                         segmap_smapadd(smp);
1765                         mutex_exit(smapmtx);
1766 
1767                         smp = nsmp;
1768                         goto vrfy_smp;
1769                 }
1770                 smp->sm_refcnt++;            /* another user */
1771 
1772                 /*
1773                  * We don't invoke segmap_fault via TLB miss, so we set ref
1774                  * and mod bits in advance. For S_OTHER we set them in
1775                  * segmap_fault F_SOFTUNLOCK.
1776                  */
1777                 if (is_kpm) {
1778                         if (rw == S_WRITE) {
1779                                 smp->sm_flags |= SM_WRITE_DATA;
1780                         } else if (rw == S_READ) {
1781                                 smp->sm_flags |= SM_READ_DATA;
1782                         }
1783                 }
1784                 mutex_exit(smapmtx);
1785 
1786                 newslot = 1;
1787         }
1788 
1789         if (!is_kpm)
1790                 goto use_segmap_range;
1791 
1792         /*
1793          * Use segkpm
1794          */
1795         /* Lint directive required until 6746211 is fixed */
1796         /*CONSTCOND*/
1797         ASSERT(PAGESIZE == MAXBSIZE);
1798 
1799         /*
1800          * Remember the last smp faulted on this cpu.
1801          */
1802         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1803 
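             /*
              * With SM_PAGECREATE the (writing) caller intends to supply
              * the data itself, so the page is created on the kpm path
              * without first reading it from the filesystem.
              */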
1804         if (forcefault == SM_PAGECREATE) {
1805                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1806                 return (baseaddr);
1807         }
1808 
1809         if (newslot == 0 &&
1810             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1811 
1812                 /* fastpath */
1813                 switch (rw) {
1814                 case S_READ:
1815                 case S_WRITE:
1816                         if (page_trylock(pp, SE_SHARED)) {
1817                                 if (PP_ISFREE(pp) ||
1818                                     !(pp->p_vnode == vp &&
1819                                     pp->p_offset == baseoff)) {
1820                                         page_unlock(pp);
1821                                         pp = page_lookup(vp, baseoff,
1822                                             SE_SHARED);
1823                                 }
1824                         } else {
1825                                 pp = page_lookup(vp, baseoff, SE_SHARED);
1826                         }
1827 
1828                         if (pp == NULL) {
1829                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1830                                 break;
1831                         }
1832 
1833                         if (rw == S_WRITE &&
1834                             hat_page_getattr(pp, P_MOD | P_REF) !=
1835                             (P_MOD | P_REF)) {
1836                                 page_unlock(pp);
1837                                 break;
1838                         }
1839 
1840                         /*
1841                          * We hold the p_selock as reader, so grab_smp
1842                          * can't hit us: we have bumped the smap refcnt,
1843                          * and hat_pageunload needs the p_selock
1844                          * exclusive.
1845                          */
1846                         kpme = GET_KPME(smp);
1847                         if (kpme->kpe_page == pp) {
1848                                 baseaddr = hat_kpm_page2va(pp, 0);
1849                         } else if (kpme->kpe_page == NULL) {
1850                                 baseaddr = hat_kpm_mapin(pp, kpme);
1851                         } else {
1852                                 panic("segmap_getmapflt: stale "
1853                                     "kpme page, kpme %p", (void *)kpme);
1854                                 /*NOTREACHED*/
1855                         }
1856 
1857                         /*
1858                          * We don't invoke segmap_fault via TLB miss,
1859                          * so we set ref and mod bits in advance.
1860                          * For S_OTHER we set them in segmap_fault
1861                          * F_SOFTUNLOCK.
1862                          */
1863                         if (rw == S_READ && !hat_isref(pp))
1864                                 hat_setref(pp);
1865 
1866                         return (baseaddr);
1867                 default:
1868                         break;
1869                 }
1870         }
1871 
1872         base = segkpm_create_va(baseoff);
1873         error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1874             seg, base, rw, CRED(), NULL);
1875 
1876         pp = pl[0];
1877         if (error || pp == NULL) {
1878                 /*
1879                  * Use segmap address slot and let segmap_fault deal
1880                  * with the error cases. There is no error return
1881                  * possible here.
1882                  */
1883                 goto use_segmap_range;
1884         }
1885 
1886         ASSERT(pl[1] == NULL);
1887 
1888         /*
1889          * When prot is not returned with PROT_ALL, the returned pages
1890          * are not backed by fs blocks.  For most segmap users this is
1891          * no problem: they don't write to the pages in the same
1892          * request and therefore don't rely on a following trap-driven
1893          * segmap_fault.  For SM_LOCKPROTO users it is safer to use
1894          * segkmap addresses so that protection faults are handled
1895          * by segmap_fault.
1896          */
1897         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1898                 /*
1899                  * Use segmap address slot and let segmap_fault
1900                  * do the error return.
1901                  */
1902                 ASSERT(rw != S_WRITE);
1903                 ASSERT(PAGE_LOCKED(pp));
1904                 page_unlock(pp);
1905                 forcefault = 0;
1906                 goto use_segmap_range;
1907         }
1908 
1909         /*
1910          * We hold the p_selock as reader, so grab_smp can't hit us:
1911          * we have bumped the smap refcnt, and hat_pageunload needs
1912          * the p_selock exclusive.
1913          */
1914         kpme = GET_KPME(smp);
1915         if (kpme->kpe_page == pp) {
1916                 baseaddr = hat_kpm_page2va(pp, 0);
1917         } else if (kpme->kpe_page == NULL) {
1918                 baseaddr = hat_kpm_mapin(pp, kpme);
1919         } else {
1920                 panic("segmap_getmapflt: stale kpme page after "
1921                     "VOP_GETPAGE, kpme %p", (void *)kpme);
1922                 /*NOTREACHED*/
1923         }
1924 
1925         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1926 
1927         return (baseaddr);
1928 
1929 
1930 use_segmap_range:
1931         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1932         TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1933             "segmap_getmap:seg %p addr %p vp %p offset %llx",
1934             seg, baseaddr, vp, baseoff);
1935 
1936         /*
1937          * Prefault the translations
1938          */
1939         vaddr = baseaddr + (off - baseoff);
1940         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1941 
1942                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1943                     (uintptr_t)PAGEMASK);
1944 
1945                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1946                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1947                     F_INVAL, rw);
1948         }
1949 
1950         return (baseaddr);
1951 }
1952 
1953 int
1954 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1955 {
1956         struct smap     *smp;
1957         int             error;
1958         int             bflags = 0;
1959         struct vnode    *vp;
1960         u_offset_t      offset;
1961         kmutex_t        *smtx;
1962         int             is_kpm = 0;
1963         page_t          *pp;
1964 
1965         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1966 
1967                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1968                         panic("segmap_release: addr %p not "
1969                             "MAXBSIZE aligned", (void *)addr);
1970                         /*NOTREACHED*/
1971                 }
1972 
1973                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1974                         panic("segmap_release: smap not found "
1975                             "for addr %p", (void *)addr);
1976                         /*NOTREACHED*/
1977                 }
1978 
1979                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
1980                     "segmap_relmap:seg %p addr %p smp %p",
1981                     seg, addr, smp);
1982 
1983                 smtx = SMAPMTX(smp);
1984 
1985                 /*
1986                  * For compatibility reasons segmap_pagecreate_kpm sets this
1987                  * flag to allow a following segmap_pagecreate to return it
1988                  * as the "newpage" indication.  When segmap_pagecreate is
1989                  * not called at all, we clear it now.
1990                  */
1991                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1992                 is_kpm = 1;
1993                 if (smp->sm_flags & SM_WRITE_DATA) {
1994                         hat_setrefmod(pp);
1995                 } else if (smp->sm_flags & SM_READ_DATA) {
1996                         hat_setref(pp);
1997                 }
1998         } else {
1999                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2000                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
2001                         panic("segmap_release: bad addr %p", (void *)addr);
2002                         /*NOTREACHED*/
2003                 }
2004                 smp = GET_SMAP(seg, addr);
2005 
2006                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2007                     "segmap_relmap:seg %p addr %p smp %p",
2008                     seg, addr, smp);
2009 
2010                 smtx = SMAPMTX(smp);
2011                 mutex_enter(smtx);
2012                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2013         }
2014 
2015         ASSERT(smp->sm_refcnt > 0);
2016 
2017         /*
2018          * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2019          * are set.
2020          */
2021         if ((flags & ~SM_DONTNEED) != 0) {
2022                 if (flags & SM_WRITE)
2023                         segmapcnt.smp_rel_write.value.ul++;
2024                 if (flags & SM_ASYNC) {
2025                         bflags |= B_ASYNC;
2026                         segmapcnt.smp_rel_async.value.ul++;
2027                 }
2028                 if (flags & SM_INVAL) {
2029                         bflags |= B_INVAL;
2030                         segmapcnt.smp_rel_abort.value.ul++;
2031                 }
2032                 if (flags & SM_DESTROY) {
2033                         bflags |= (B_INVAL|B_TRUNC);
2034                         segmapcnt.smp_rel_abort.value.ul++;
2035                 }
2036                 if (smp->sm_refcnt == 1) {
2037                         /*
2038                          * We only bother doing the FREE and DONTNEED flags
2039                          * if no one else is still referencing this mapping.
2040                          */
2041                         if (flags & SM_FREE) {
2042                                 bflags |= B_FREE;
2043                                 segmapcnt.smp_rel_free.value.ul++;
2044                         }
2045                         if (flags & SM_DONTNEED) {
2046                                 bflags |= B_DONTNEED;
2047                                 segmapcnt.smp_rel_dontneed.value.ul++;
2048                         }
2049                 }
2050         } else {
2051                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2052         }
2053 
2054         vp = smp->sm_vp;
2055         offset = smp->sm_off;
2056 
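             /*
              * Drop our reference.  The last user tears down the hash entry
              * and hat mappings when SM_INVAL or SM_DESTROY was requested,
              * and returns the slot to the free list.
              */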
2057         if (--smp->sm_refcnt == 0) {
2058 
2059                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2060 
2061                 if (flags & (SM_INVAL|SM_DESTROY)) {
2062                         segmap_hashout(smp);    /* remove map info */
2063                         if (is_kpm) {
2064                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2065                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2066                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2067                                         hat_unload(kas.a_hat, segkmap->s_base +
2068                                             ((smp - smd_smap) * MAXBSIZE),
2069                                             MAXBSIZE, HAT_UNLOAD);
2070                                 }
2071 
2072                         } else {
2073                                 if (segmap_kpm)
2074                                         segkpm_mapout_validkpme(GET_KPME(smp));
2075 
2076                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2077                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2078                                     HAT_UNLOAD);
2079                         }
2080                 }
2081                 segmap_smapadd(smp);    /* add to free list */
2082         }
2083 
2084         mutex_exit(smtx);
2085 
2086         if (is_kpm)
2087                 page_unlock(pp);
2088         /*
2089          * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2090          * are set.
2091          */
2092         if ((flags & ~SM_DONTNEED) != 0) {
2093                 error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2094                     bflags, CRED(), NULL);
2095         } else {
2096                 error = 0;
2097         }
2098 
2099         return (error);
2100 }
2101 
2102 /*
2103  * Dump the pages belonging to this segmap segment.
2104  */
2105 static void
2106 segmap_dump(struct seg *seg)
2107 {
2108         struct segmap_data *smd;
2109         struct smap *smp, *smp_end;
2110         page_t *pp;
2111         pfn_t pfn;
2112         u_offset_t off;
2113         caddr_t addr;
2114 
2115         smd = (struct segmap_data *)seg->s_data;
2116         addr = seg->s_base;
2117         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2118             smp < smp_end; smp++) {
2119 
2120                 if (smp->sm_refcnt) {
2121                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2122                                 int we_own_it = 0;
2123 
2124                                 /*
2125                                  * If page_lookup_nowait() fails, the page
2126                                  * either does not exist or is exclusively
2127                                  * locked, so fall back to page_exists()
2128                                  * to find it without taking the lock.
2129                                  */
2130                                 if ((pp = page_lookup_nowait(smp->sm_vp,
2131                                     smp->sm_off + off, SE_SHARED)))
2132                                         we_own_it = 1;
2133                                 else
2134                                         pp = page_exists(smp->sm_vp,
2135                                             smp->sm_off + off);
2136 
2137                                 if (pp) {
2138                                         pfn = page_pptonum(pp);
2139                                         dump_addpage(seg->s_as,
2140                                             addr + off, pfn);
2141                                         if (we_own_it)
2142                                                 page_unlock(pp);
2143                                 }
2144                                 dump_timeleft = dump_timeout;
2145                         }
2146                 }
2147                 addr += MAXBSIZE;
2148         }
2149 }
2150 
2151 /*ARGSUSED*/
2152 static int
2153 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2154     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2155 {
2156         return (ENOTSUP);
2157 }
2158 
2159 static int
2160 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2161 {
2162         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2163 
2164         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2165         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2166         return (0);
2167 }
2168 
2169 
2170 #ifdef  SEGKPM_SUPPORT
2171 
2172 /*
2173  * segkpm support routines
2174  */
2175 
2176 static caddr_t
2177 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2178         struct smap *smp, enum seg_rw rw)
2179 {
2180         caddr_t base;
2181         page_t  *pp;
2182         int     newpage = 0;
2183         struct kpme     *kpme;
2184 
2185         ASSERT(smp->sm_refcnt > 0);
2186 
2187         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2188                 kmutex_t *smtx;
2189 
2190                 base = segkpm_create_va(off);
2191 
2192                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2193                     seg, base)) == NULL) {
2194                         panic("segmap_pagecreate_kpm: "
2195                             "page_create failed");
2196                         /*NOTREACHED*/
2197                 }
2198 
2199                 newpage = 1;
2200                 page_io_unlock(pp);
2201                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2202 
2203                 /*
2204                  * Mark the slot as holding a new page until the following
2205                  * segmap_pagecreate or segmap_release clears the flag.
2206                  */
2207                 smtx = SMAPMTX(smp);
2208                 mutex_enter(smtx);
2209                 smp->sm_flags |= SM_KPM_NEWPAGE;
2210                 mutex_exit(smtx);
2211         }
2212 
2213         kpme = GET_KPME(smp);
2214         if (!newpage && kpme->kpe_page == pp)
2215                 base = hat_kpm_page2va(pp, 0);
2216         else
2217                 base = hat_kpm_mapin(pp, kpme);
2218 
2219         /*
2220          * FS code may decide not to call segmap_pagecreate and we
2221          * don't invoke segmap_fault via TLB miss, so we have to set
2222          * ref and mod bits in advance.
2223          */
2224         if (rw == S_WRITE) {
2225                 hat_setrefmod(pp);
2226         } else {
2227                 ASSERT(rw == S_READ);
2228                 hat_setref(pp);
2229         }
2230 
2231         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2232 
2233         return (base);
2234 }
2235 
2236 /*
2237  * Find the smap structure corresponding to the KPM addr and return
2238  * it with its mutex held, or NULL if none is found.
2239  */
2240 struct smap *
2241 get_smap_kpm(caddr_t addr, page_t **ppp)
2242 {
2243         struct smap     *smp;
2244         struct vnode    *vp;
2245         u_offset_t      offset;
2246         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2247         int             hashid;
2248         kmutex_t        *hashmtx;
2249         page_t          *pp;
2250         union segmap_cpu *scpu;
2251 
2252         pp = hat_kpm_vaddr2page(baseaddr);
2253 
2254         ASSERT(pp && !PP_ISFREE(pp));
2255         ASSERT(PAGE_LOCKED(pp));
2256         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2257 
2258         vp = pp->p_vnode;
2259         offset = pp->p_offset;
2260         ASSERT(vp != NULL);
2261 
2262         /*
2263          * Assume the last smap used on this cpu is the one needed.
2264          */
2265         scpu = smd_cpu+CPU->cpu_seqid;
2266         smp = scpu->scpu.scpu_last_smap;
2267         mutex_enter(&smp->sm_mtx);
2268         if (smp->sm_vp == vp && smp->sm_off == offset) {
2269                 ASSERT(smp->sm_refcnt > 0);
2270         } else {
2271                 /*
2272                  * The assumption was wrong; find the smap on the hash chain.
2273                  */
2274                 mutex_exit(&smp->sm_mtx);
2275                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2276                 hashmtx = SHASHMTX(hashid);
2277 
2278                 mutex_enter(hashmtx);
2279                 smp = smd_hash[hashid].sh_hash_list;
2280                 for (; smp != NULL; smp = smp->sm_hash) {
2281                         if (smp->sm_vp == vp && smp->sm_off == offset)
2282                                 break;
2283                 }
2284                 mutex_exit(hashmtx);
2285                 if (smp) {
2286                         mutex_enter(&smp->sm_mtx);
2287                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2288                 }
2289         }
2290 
2291         if (ppp)
2292                 *ppp = smp ? pp : NULL;
2293 
2294         return (smp);
2295 }
2296 
2297 #else   /* SEGKPM_SUPPORT */
2298 
2299 /* segkpm stubs */
2300 
2301 /*ARGSUSED*/
2302 static caddr_t
2303 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2304         struct smap *smp, enum seg_rw rw)
2305 {
2306         return (NULL);
2307 }
2308 
2309 /*ARGSUSED*/
2310 struct smap *
2311 get_smap_kpm(caddr_t addr, page_t **ppp)
2312 {
2313         return (NULL);
2314 }
2315 
2316 #endif  /* SEGKPM_SUPPORT */