1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Portions of this source code were derived from Berkeley 4.3 BSD
  31  * under license from the Regents of the University of California.
  32  */
  33 
  34 /*
  35  * VM - generic vnode mapping segment.
  36  *
  37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
  38  * mappings [lower routine overhead; more persistent cache] to random
   39  * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/buf.h>
  47 #include <sys/systm.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mman.h>
  50 #include <sys/errno.h>
  51 #include <sys/cred.h>
  52 #include <sys/kmem.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/debug.h>
  56 #include <sys/thread.h>
  57 #include <sys/dumphdr.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/lgrp.h>
  60 
  61 #include <vm/seg_kmem.h>
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_kpm.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/page.h>
  68 #include <vm/pvn.h>
  69 #include <vm/rm.h>
  70 
  71 /*
  72  * Private seg op routines.
  73  */
  74 static void     segmap_free(struct seg *seg);
  75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  76                         size_t len, enum fault_type type, enum seg_rw rw);
  77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
  78 static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
  79                         uint_t prot);
  80 static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
  81 static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
  82                         uint_t *protv);
  83 static u_offset_t       segmap_getoffset(struct seg *seg, caddr_t addr);
  84 static int      segmap_gettype(struct seg *seg, caddr_t addr);
  85 static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
  86 static void     segmap_dump(struct seg *seg);
  87 static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
  88                         struct page ***ppp, enum lock_type type,
  89                         enum seg_rw rw);
  90 static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  91 static lgrp_mem_policy_info_t   *segmap_getpolicy(struct seg *seg,
  92     caddr_t addr);
  93 static int      segmap_capable(struct seg *seg, segcapability_t capability);
  94 
  95 /* segkpm support */
  96 static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
  97                         struct smap *, enum seg_rw);
  98 struct smap     *get_smap_kpm(caddr_t, page_t **);
  99 
 100 static struct seg_ops segmap_ops = {
 101         .free           = segmap_free,
 102         .fault          = segmap_fault,
 103         .faulta         = segmap_faulta,
 104         .checkprot      = segmap_checkprot,
 105         .kluster        = segmap_kluster,
 106         .getprot        = segmap_getprot,
 107         .getoffset      = segmap_getoffset,
 108         .gettype        = segmap_gettype,
 109         .getvp          = segmap_getvp,
 110         .dump           = segmap_dump,
 111         .pagelock       = segmap_pagelock,
 112         .getmemid       = segmap_getmemid,
 113         .getpolicy      = segmap_getpolicy,
 114         .capable        = segmap_capable,
 115 };
 116 
 117 /*
 118  * Private segmap routines.
 119  */
 120 static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
 121                         size_t len, enum seg_rw rw, struct smap *smp);
 122 static void     segmap_smapadd(struct smap *smp);
 123 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
 124                         u_offset_t off, int hashid);
 125 static void     segmap_hashout(struct smap *smp);
 126 
 127 
 128 /*
 129  * Statistics for segmap operations.
 130  *
 131  * No explicit locking to protect these stats.
 132  */
 133 struct segmapcnt segmapcnt = {
 134         { "fault",              KSTAT_DATA_ULONG },
 135         { "faulta",             KSTAT_DATA_ULONG },
 136         { "getmap",             KSTAT_DATA_ULONG },
 137         { "get_use",            KSTAT_DATA_ULONG },
 138         { "get_reclaim",        KSTAT_DATA_ULONG },
 139         { "get_reuse",          KSTAT_DATA_ULONG },
 140         { "get_unused",         KSTAT_DATA_ULONG },
 141         { "get_nofree",         KSTAT_DATA_ULONG },
 142         { "rel_async",          KSTAT_DATA_ULONG },
 143         { "rel_write",          KSTAT_DATA_ULONG },
 144         { "rel_free",           KSTAT_DATA_ULONG },
 145         { "rel_abort",          KSTAT_DATA_ULONG },
 146         { "rel_dontneed",       KSTAT_DATA_ULONG },
 147         { "release",            KSTAT_DATA_ULONG },
 148         { "pagecreate",         KSTAT_DATA_ULONG },
 149         { "free_notfree",       KSTAT_DATA_ULONG },
 150         { "free_dirty",         KSTAT_DATA_ULONG },
 151         { "free",               KSTAT_DATA_ULONG },
 152         { "stolen",             KSTAT_DATA_ULONG },
 153         { "get_nomtx",          KSTAT_DATA_ULONG }
 154 };
 155 
 156 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
 157 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
 158 
 159 /*
 160  * Return number of map pages in segment.
 161  */
 162 #define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)
 163 
 164 /*
 165  * Translate addr into smap number within segment.
 166  */
 167 #define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
 168 
 169 /*
 170  * Translate addr in seg into struct smap pointer.
 171  */
 172 #define GET_SMAP(seg, addr)     \
 173         &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
 174 
 175 /*
 176  * Bit in map (16 bit bitmap).
 177  */
 178 #define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
 179 
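/*
 * Cached copies of the segmap_data fields (and the masks derived from
 * them), kept in statics for fast access; they are set up once in
 * segmap_create().
 */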
 180 static int smd_colormsk = 0;
 181 static int smd_ncolor = 0;
 182 static int smd_nfree = 0;
 183 static int smd_freemsk = 0;
 184 #ifdef DEBUG
 185 static int *colors_used;
 186 #endif
 187 static struct smap *smd_smap;
 188 static struct smaphash *smd_hash;
 189 #ifdef SEGMAP_HASHSTATS
 190 static unsigned int *smd_hash_len;
 191 #endif
 192 static struct smfree *smd_free;
 193 static ulong_t smd_hashmsk = 0;
 194 
 195 #define SEGMAP_MAXCOLOR         2
 196 #define SEGMAP_CACHE_PAD        64
 197 
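/*
 * Per-CPU segmap state: freelist rotor indices, the last smap used and
 * the hot kstat counters, padded to SEGMAP_CACHE_PAD bytes so that each
 * CPU's copy stays in its own cache block.
 */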
 198 union segmap_cpu {
 199         struct {
 200                 uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
 201                 struct smap     *scpu_last_smap;
 202                 ulong_t         scpu_getmap;
 203                 ulong_t         scpu_release;
 204                 ulong_t         scpu_get_reclaim;
 205                 ulong_t         scpu_fault;
 206                 ulong_t         scpu_pagecreate;
 207                 ulong_t         scpu_get_reuse;
 208         } scpu;
 209         char    scpu_pad[SEGMAP_CACHE_PAD];
 210 };
 211 static union segmap_cpu *smd_cpu;
 212 
 213 /*
 214  * There are three locks in seg_map:
 215  *      - per freelist mutexes
 216  *      - per hashchain mutexes
 217  *      - per smap mutexes
 218  *
 219  * The lock ordering is to get the smap mutex to lock down the slot
 220  * first then the hash lock (for hash in/out (vp, off) list) or the
 221  * freelist lock to put the slot back on the free list.
 222  *
  223  * The hash search is done while holding only the hashchain lock; when a wanted
  224  * slot is found, we drop the hashchain lock and then lock the slot, so there
  225  * is no overlapping of hashchain and smap locks.  After the slot is
  226  * locked, we verify again that the slot is still what we are looking
  227  * for.
 228  *
 229  * Allocation of a free slot is done by holding the freelist lock,
 230  * then locking the smap slot at the head of the freelist. This is
 231  * in reversed lock order so mutex_tryenter() is used.
 232  *
 233  * The smap lock protects all fields in smap structure except for
 234  * the link fields for hash/free lists which are protected by
 235  * hashchain and freelist locks.
 236  */
 237 
 238 #define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)
 239 
 240 #define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
 241 #define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)
 242 
 243 #define SMAPMTX(smp) (&smp->sm_mtx)
 244 
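/*
 * Hash a (vp, off) pair to a hash chain index: mix two shifted copies of
 * the vnode address with the MAXBSIZE block number of the offset, then
 * mask the result to the hash table size.  The macro assigns the result
 * to hashid.
 */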
 245 #define SMAP_HASHFUNC(vp, off, hashid) \
 246         { \
 247         hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
 248                 ((off) >> MAXBSHIFT)) & smd_hashmsk); \
 249         }
 250 
 251 /*
 252  * The most frequently updated kstat counters are kept in the
 253  * per cpu array to avoid hot cache blocks. The update function
 254  * sums the cpu local counters to update the global counters.
 255  */
 256 
 257 /* ARGSUSED */
 258 int
 259 segmap_kstat_update(kstat_t *ksp, int rw)
 260 {
 261         int i;
 262         ulong_t getmap, release, get_reclaim;
 263         ulong_t fault, pagecreate, get_reuse;
 264 
 265         if (rw == KSTAT_WRITE)
 266                 return (EACCES);
 267         getmap = release = get_reclaim = (ulong_t)0;
 268         fault = pagecreate = get_reuse = (ulong_t)0;
 269         for (i = 0; i < max_ncpus; i++) {
 270                 getmap += smd_cpu[i].scpu.scpu_getmap;
 271                 release  += smd_cpu[i].scpu.scpu_release;
 272                 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
 273                 fault  += smd_cpu[i].scpu.scpu_fault;
 274                 pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
 275                 get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
 276         }
 277         segmapcnt.smp_getmap.value.ul = getmap;
 278         segmapcnt.smp_release.value.ul = release;
 279         segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
 280         segmapcnt.smp_fault.value.ul = fault;
 281         segmapcnt.smp_pagecreate.value.ul = pagecreate;
 282         segmapcnt.smp_get_reuse.value.ul = get_reuse;
 283         return (0);
 284 }
 285 
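/*
 * Create the segmap segment: allocate and initialize the smap freelists,
 * the (vp, off) hash chains, the smap array itself and the per-CPU state.
 */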
 286 int
 287 segmap_create(struct seg *seg, void *argsp)
 288 {
 289         struct segmap_data *smd;
 290         struct smap *smp;
 291         struct smfree *sm;
 292         struct segmap_crargs *a = (struct segmap_crargs *)argsp;
 293         struct smaphash *shashp;
 294         union segmap_cpu *scpu;
 295         long i, npages;
 296         size_t hashsz;
 297         uint_t nfreelist;
 298         extern void prefetch_smap_w(void *);
 299         extern int max_ncpus;
 300 
 301         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 302 
 303         if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
 304                 panic("segkmap not MAXBSIZE aligned");
 305                 /*NOTREACHED*/
 306         }
 307 
 308         smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
 309 
 310         seg->s_data = (void *)smd;
 311         seg->s_ops = &segmap_ops;
 312         smd->smd_prot = a->prot;
 313 
 314         /*
 315          * Scale the number of smap freelists to be
 316          * proportional to max_ncpus * number of virtual colors.
 317          * The caller can over-ride this scaling by providing
 318          * a non-zero a->nfreelist argument.
 319          */
 320         nfreelist = a->nfreelist;
 321         if (nfreelist == 0)
 322                 nfreelist = max_ncpus;
 323         else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
 324                 cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
  325                     "%d, using %d", nfreelist, max_ncpus);
 326                 nfreelist = max_ncpus;
 327         }
 328         if (!ISP2(nfreelist)) {
 329                 /* round up nfreelist to the next power of two. */
 330                 nfreelist = 1 << (highbit(nfreelist));
 331         }
 332 
 333         /*
 334          * Get the number of virtual colors - must be a power of 2.
 335          */
 336         if (a->shmsize)
 337                 smd_ncolor = a->shmsize >> MAXBSHIFT;
 338         else
 339                 smd_ncolor = 1;
 340         ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
 341         ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
 342         smd_colormsk = smd_ncolor - 1;
 343         smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
 344         smd_freemsk = smd_nfree - 1;
 345 
 346         /*
 347          * Allocate and initialize the freelist headers.
 348          * Note that sm_freeq[1] starts out as the release queue. This
  349  * is relied upon when the smap structures are initialized below.
 350          */
 351         smd_free = smd->smd_free =
 352             kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
 353         for (i = 0; i < smd_nfree; i++) {
 354                 sm = &smd->smd_free[i];
 355                 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 356                 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 357                 sm->sm_allocq = &sm->sm_freeq[0];
 358                 sm->sm_releq = &sm->sm_freeq[1];
 359         }
 360 
 361         /*
 362          * Allocate and initialize the smap hash chain headers.
 363          * Compute hash size rounding down to the next power of two.
 364          */
 365         npages = MAP_PAGES(seg);
 366         smd->smd_npages = npages;
 367         hashsz = npages / SMAP_HASHAVELEN;
 368         hashsz = 1 << (highbit(hashsz)-1);
 369         smd_hashmsk = hashsz - 1;
 370         smd_hash = smd->smd_hash =
 371             kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
 372 #ifdef SEGMAP_HASHSTATS
 373         smd_hash_len =
 374             kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
 375 #endif
 376         for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
 377                 shashp->sh_hash_list = NULL;
 378                 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
 379         }
 380 
 381         /*
 382          * Allocate and initialize the smap structures.
 383          * Link all slots onto the appropriate freelist.
 384          * The smap array is large enough to affect boot time
 385          * on large systems, so use memory prefetching and only
  386  * go through the array once.  Inline an optimized version
 387          * of segmap_smapadd to add structures to freelists with
 388          * knowledge that no locks are needed here.
 389          */
 390         smd_smap = smd->smd_sm =
 391             kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
 392 
 393         for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
 394             smp >= smd->smd_sm; smp--) {
 395                 struct smap *smpfreelist;
 396                 struct sm_freeq *releq;
 397 
 398                 prefetch_smap_w((char *)smp);
 399 
 400                 smp->sm_vp = NULL;
 401                 smp->sm_hash = NULL;
 402                 smp->sm_off = 0;
 403                 smp->sm_bitmap = 0;
 404                 smp->sm_refcnt = 0;
 405                 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
 406                 smp->sm_free_ndx = SMP2SMF_NDX(smp);
 407 
 408                 sm = SMP2SMF(smp);
 409                 releq = sm->sm_releq;
 410 
 411                 smpfreelist = releq->smq_free;
 412                 if (smpfreelist == 0) {
 413                         releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 414                 } else {
 415                         smp->sm_next = smpfreelist;
 416                         smp->sm_prev = smpfreelist->sm_prev;
 417                         smpfreelist->sm_prev = smp;
 418                         smp->sm_prev->sm_next = smp;
 419                         releq->smq_free = smp->sm_next;
 420                 }
 421 
 422                 /*
  423                  * sm_flags = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
 424                  */
 425                 smp->sm_flags = 0;
 426 
 427 #ifdef  SEGKPM_SUPPORT
 428                 /*
 429                  * Due to the fragile prefetch loop no
 430                  * separate function is used here.
 431                  */
 432                 smp->sm_kpme_next = NULL;
 433                 smp->sm_kpme_prev = NULL;
 434                 smp->sm_kpme_page = NULL;
 435 #endif
 436         }
 437 
 438         /*
 439          * Allocate the per color indices that distribute allocation
 440          * requests over the free lists. Each cpu will have a private
 441          * rotor index to spread the allocations even across the available
  442  * rotor index to spread the allocations evenly across the available
 443          * smap element so there is no need to check for NULL.
 444          */
 445         smd_cpu =
 446             kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
 447         for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
 448                 int j;
 449                 for (j = 0; j < smd_ncolor; j++)
 450                         scpu->scpu.scpu_free_ndx[j] = j;
 451                 scpu->scpu.scpu_last_smap = smd_smap;
 452         }
 453 
 454         vpm_init();
 455 
 456 #ifdef DEBUG
 457         /*
 458          * Keep track of which colors are used more often.
 459          */
 460         colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
 461 #endif /* DEBUG */
 462 
 463         return (0);
 464 }
 465 
 466 static void
  467 segmap_free(struct seg *seg)
 469 {
 470         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 471 }
 472 
 473 /*
 474  * Do a F_SOFTUNLOCK call over the range requested.
 475  * The range must have already been F_SOFTLOCK'ed.
 476  */
 477 static void
 478 segmap_unlock(
 479         struct hat *hat,
 480         struct seg *seg,
 481         caddr_t addr,
 482         size_t len,
 483         enum seg_rw rw,
 484         struct smap *smp)
 485 {
 486         page_t *pp;
 487         caddr_t adr;
 488         u_offset_t off;
 489         struct vnode *vp;
 490         kmutex_t *smtx;
 491 
 492         ASSERT(smp->sm_refcnt > 0);
 493 
 494 #ifdef lint
 495         seg = seg;
 496 #endif
 497 
 498         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 499 
 500                 /*
 501                  * We're called only from segmap_fault and this was a
 502                  * NOP in case of a kpm based smap, so dangerous things
 503                  * must have happened in the meantime. Pages are prefaulted
 504                  * and locked in segmap_getmapflt and they will not be
 505                  * unlocked until segmap_release.
 506                  */
 507                 panic("segmap_unlock: called with kpm addr %p", (void *)addr);
 508                 /*NOTREACHED*/
 509         }
 510 
 511         vp = smp->sm_vp;
 512         off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 513 
 514         hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
 515         for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
 516                 ushort_t bitmask;
 517 
 518                 /*
 519                  * Use page_find() instead of page_lookup() to
  520                  * find the page since we know that it is held
  521                  * with the "shared" lock.
 522                  */
 523                 pp = page_find(vp, off);
 524                 if (pp == NULL) {
 525                         panic("segmap_unlock: page not found");
 526                         /*NOTREACHED*/
 527                 }
 528 
 529                 if (rw == S_WRITE) {
 530                         hat_setrefmod(pp);
 531                 } else if (rw != S_OTHER) {
 532                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
  533                             "segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
 534                         hat_setref(pp);
 535                 }
 536 
 537                 /*
  538                  * Clear the bitmap bit corresponding to "off", if it is set,
  539                  * since the page and translation are being unlocked.
 540                  */
 541                 bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
 542 
 543                 /*
  544                  * Large Files: The following assertion verifies that
  545                  * (off - smp->sm_off) fits in an int.
 546                  */
 547                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
 548                 smtx = SMAPMTX(smp);
 549                 mutex_enter(smtx);
 550                 if (smp->sm_bitmap & bitmask) {
 551                         smp->sm_bitmap &= ~bitmask;
 552                 }
 553                 mutex_exit(smtx);
 554 
 555                 page_unlock(pp);
 556         }
 557 }
 558 
 559 #define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */
 560 
 561 /*
 562  * This routine is called via a machine specific fault handling
 563  * routine.  It is also called by software routines wishing to
 564  * lock or unlock a range of addresses.
 565  *
 566  * Note that this routine expects a page-aligned "addr".
 567  */
 568 faultcode_t
 569 segmap_fault(
 570         struct hat *hat,
 571         struct seg *seg,
 572         caddr_t addr,
 573         size_t len,
 574         enum fault_type type,
 575         enum seg_rw rw)
 576 {
 577         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 578         struct smap *smp;
 579         page_t *pp, **ppp;
 580         struct vnode *vp;
 581         u_offset_t off;
 582         page_t *pl[MAXPPB + 1];
 583         uint_t prot;
 584         u_offset_t addroff;
 585         caddr_t adr;
 586         int err;
 587         u_offset_t sm_off;
 588         int hat_flag;
 589 
 590         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 591                 int newpage;
 592                 kmutex_t *smtx;
 593 
 594                 /*
 595                  * Pages are successfully prefaulted and locked in
 596                  * segmap_getmapflt and can't be unlocked until
 597                  * segmap_release. No hat mappings have to be locked
 598                  * and they also can't be unlocked as long as the
 599                  * caller owns an active kpm addr.
 600                  */
 601 #ifndef DEBUG
 602                 if (type != F_SOFTUNLOCK)
 603                         return (0);
 604 #endif
 605 
 606                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 607                         panic("segmap_fault: smap not found "
 608                             "for addr %p", (void *)addr);
 609                         /*NOTREACHED*/
 610                 }
 611 
 612                 smtx = SMAPMTX(smp);
 613 #ifdef  DEBUG
 614                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 615                 if (newpage) {
 616                         cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
 617                             (void *)smp);
 618                 }
 619 
 620                 if (type != F_SOFTUNLOCK) {
 621                         mutex_exit(smtx);
 622                         return (0);
 623                 }
 624 #endif
 625                 mutex_exit(smtx);
 626                 vp = smp->sm_vp;
 627                 sm_off = smp->sm_off;
 628 
 629                 if (vp == NULL)
 630                         return (FC_MAKE_ERR(EIO));
 631 
 632                 ASSERT(smp->sm_refcnt > 0);
 633 
 634                 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 635                 if (addroff + len > MAXBSIZE)
 636                         panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
 637                             (void *)(addr + len));
 638 
 639                 off = sm_off + addroff;
 640 
 641                 pp = page_find(vp, off);
 642 
 643                 if (pp == NULL)
 644                         panic("segmap_fault: softunlock page not found");
 645 
 646                 /*
 647                  * Set ref bit also here in case of S_OTHER to avoid the
 648                  * overhead of supporting other cases than F_SOFTUNLOCK
 649                  * with segkpm. We can do this because the underlying
 650                  * pages are locked anyway.
 651                  */
 652                 if (rw == S_WRITE) {
 653                         hat_setrefmod(pp);
 654                 } else {
 655                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 656                             "segmap_fault:pp %p vp %p offset %llx",
 657                             pp, vp, off);
 658                         hat_setref(pp);
 659                 }
 660 
 661                 return (0);
 662         }
 663 
 664         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 665         smp = GET_SMAP(seg, addr);
 666         vp = smp->sm_vp;
 667         sm_off = smp->sm_off;
 668 
 669         if (vp == NULL)
 670                 return (FC_MAKE_ERR(EIO));
 671 
 672         ASSERT(smp->sm_refcnt > 0);
 673 
 674         addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 675         if (addroff + len > MAXBSIZE) {
 676                 panic("segmap_fault: endaddr %p "
 677                     "exceeds MAXBSIZE chunk", (void *)(addr + len));
 678                 /*NOTREACHED*/
 679         }
 680         off = sm_off + addroff;
 681 
 682         /*
 683          * First handle the easy stuff
 684          */
 685         if (type == F_SOFTUNLOCK) {
 686                 segmap_unlock(hat, seg, addr, len, rw, smp);
 687                 return (0);
 688         }
 689 
 690         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 691             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 692         err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
 693             seg, addr, rw, CRED(), NULL);
 694 
 695         if (err)
 696                 return (FC_MAKE_ERR(err));
 697 
 698         prot &= smd->smd_prot;
 699 
 700         /*
 701          * Handle all pages returned in the pl[] array.
 702          * This loop is coded on the assumption that if
 703          * there was no error from the VOP_GETPAGE routine,
  704          * the page list returned will contain all the
 705          * needed pages for the vp from [off..off + len].
 706          */
 707         ppp = pl;
 708         while ((pp = *ppp++) != NULL) {
 709                 u_offset_t poff;
 710                 ASSERT(pp->p_vnode == vp);
 711                 hat_flag = HAT_LOAD;
 712 
 713                 /*
 714                  * Verify that the pages returned are within the range
 715                  * of this segmap region.  Note that it is theoretically
 716                  * possible for pages outside this range to be returned,
 717                  * but it is not very likely.  If we cannot use the
 718                  * page here, just release it and go on to the next one.
 719                  */
 720                 if (pp->p_offset < sm_off ||
 721                     pp->p_offset >= sm_off + MAXBSIZE) {
 722                         (void) page_release(pp, 1);
 723                         continue;
 724                 }
 725 
 726                 ASSERT(hat == kas.a_hat);
 727                 poff = pp->p_offset;
 728                 adr = addr + (poff - off);
 729                 if (adr >= addr && adr < addr + len) {
 730                         hat_setref(pp);
 731                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 732                             "segmap_fault:pp %p vp %p offset %llx",
 733                             pp, vp, poff);
 734                         if (type == F_SOFTLOCK)
 735                                 hat_flag = HAT_LOAD_LOCK;
 736                 }
 737 
 738                 /*
 739                  * Deal with VMODSORT pages here. If we know this is a write
 740                  * do the setmod now and allow write protection.
 741                  * As long as it's modified or not S_OTHER, remove write
 742                  * protection. With S_OTHER it's up to the FS to deal with this.
 743                  */
 744                 if (IS_VMODSORT(vp)) {
 745                         if (rw == S_WRITE)
 746                                 hat_setmod(pp);
 747                         else if (rw != S_OTHER && !hat_ismod(pp))
 748                                 prot &= ~PROT_WRITE;
 749                 }
 750 
 751                 hat_memload(hat, adr, pp, prot, hat_flag);
 752                 if (hat_flag != HAT_LOAD_LOCK)
 753                         page_unlock(pp);
 754         }
 755         return (0);
 756 }
 757 
 758 /*
 759  * This routine is used to start I/O on pages asynchronously.
 760  */
 761 static faultcode_t
 762 segmap_faulta(struct seg *seg, caddr_t addr)
 763 {
 764         struct smap *smp;
 765         struct vnode *vp;
 766         u_offset_t off;
 767         int err;
 768 
 769         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 770                 int     newpage;
 771                 kmutex_t *smtx;
 772 
 773                 /*
 774                  * Pages are successfully prefaulted and locked in
 775                  * segmap_getmapflt and can't be unlocked until
 776                  * segmap_release. No hat mappings have to be locked
 777                  * and they also can't be unlocked as long as the
 778                  * caller owns an active kpm addr.
 779                  */
 780 #ifdef  DEBUG
 781                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 782                         panic("segmap_faulta: smap not found "
 783                             "for addr %p", (void *)addr);
 784                         /*NOTREACHED*/
 785                 }
 786 
 787                 smtx = SMAPMTX(smp);
 788                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 789                 mutex_exit(smtx);
 790                 if (newpage)
 791                         cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
 792                             (void *)smp);
 793 #endif
 794                 return (0);
 795         }
 796 
 797         segmapcnt.smp_faulta.value.ul++;
 798         smp = GET_SMAP(seg, addr);
 799 
 800         ASSERT(smp->sm_refcnt > 0);
 801 
 802         vp = smp->sm_vp;
 803         off = smp->sm_off;
 804 
 805         if (vp == NULL) {
 806                 cmn_err(CE_WARN, "segmap_faulta - no vp");
 807                 return (FC_MAKE_ERR(EIO));
 808         }
 809 
 810         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 811             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 812 
 813         err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
 814             & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
 815             seg, addr, S_READ, CRED(), NULL);
 816 
 817         if (err)
 818                 return (FC_MAKE_ERR(err));
 819         return (0);
 820 }
 821 
 822 /*ARGSUSED*/
 823 static int
 824 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
 825 {
 826         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 827 
 828         ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
 829 
 830         /*
 831          * Need not acquire the segment lock since
 832          * "smd_prot" is a read-only field.
 833          */
 834         return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
 835 }
 836 
 837 static int
 838 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 839 {
 840         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 841         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
 842 
 843         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 844 
 845         if (pgno != 0) {
 846                 do {
 847                         protv[--pgno] = smd->smd_prot;
 848                 } while (pgno != 0);
 849         }
 850         return (0);
 851 }
 852 
 853 static u_offset_t
 854 segmap_getoffset(struct seg *seg, caddr_t addr)
 855 {
 856         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 857 
 858         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 859 
 860         return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
 861 }
 862 
 863 /*ARGSUSED*/
 864 static int
 865 segmap_gettype(struct seg *seg, caddr_t addr)
 866 {
 867         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 868 
 869         return (MAP_SHARED);
 870 }
 871 
 872 /*ARGSUSED*/
 873 static int
 874 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 875 {
 876         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 877 
 878         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 879 
 880         /* XXX - This doesn't make any sense */
 881         *vpp = smd->smd_sm->sm_vp;
 882         return (0);
 883 }
 884 
 885 /*
 886  * Check to see if it makes sense to do kluster/read ahead to
 887  * addr + delta relative to the mapping at addr.  We assume here
 888  * that delta is a signed PAGESIZE'd multiple (which can be negative).
 889  *
 890  * For segmap we always "approve" of this action from our standpoint.
 891  */
 892 /*ARGSUSED*/
 893 static int
 894 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 895 {
 896         return (0);
 897 }
 898 
 899 /*
 900  * Special private segmap operations
 901  */
 902 
 903 /*
 904  * Add smap to the appropriate free list.
 905  */
 906 static void
 907 segmap_smapadd(struct smap *smp)
 908 {
 909         struct smfree *sm;
 910         struct smap *smpfreelist;
 911         struct sm_freeq *releq;
 912 
 913         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 914 
 915         if (smp->sm_refcnt != 0) {
 916                 panic("segmap_smapadd");
 917                 /*NOTREACHED*/
 918         }
 919 
 920         sm = &smd_free[smp->sm_free_ndx];
 921         /*
 922          * Add to the tail of the release queue
 923          * Note that sm_releq and sm_allocq could toggle
 924          * before we get the lock. This does not affect
 925          * correctness as the 2 queues are only maintained
 926          * to reduce lock pressure.
 927          */
 928         releq = sm->sm_releq;
 929         if (releq == &sm->sm_freeq[0])
 930                 smp->sm_flags |= SM_QNDX_ZERO;
 931         else
 932                 smp->sm_flags &= ~SM_QNDX_ZERO;
 933         mutex_enter(&releq->smq_mtx);
 934         smpfreelist = releq->smq_free;
 935         if (smpfreelist == 0) {
 936                 int want;
 937 
 938                 releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 939                 /*
 940                  * Both queue mutexes held to set sm_want;
 941                  * snapshot the value before dropping releq mutex.
 942                  * If sm_want appears after the releq mutex is dropped,
 943                  * then the smap just freed is already gone.
 944                  */
 945                 want = sm->sm_want;
 946                 mutex_exit(&releq->smq_mtx);
 947                 /*
 948                  * See if there was a waiter before dropping the releq mutex
 949                  * then recheck after obtaining sm_freeq[0] mutex as
  950                  * another thread may have already signaled.
 951                  */
 952                 if (want) {
 953                         mutex_enter(&sm->sm_freeq[0].smq_mtx);
 954                         if (sm->sm_want)
 955                                 cv_signal(&sm->sm_free_cv);
 956                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
 957                 }
 958         } else {
 959                 smp->sm_next = smpfreelist;
 960                 smp->sm_prev = smpfreelist->sm_prev;
 961                 smpfreelist->sm_prev = smp;
 962                 smp->sm_prev->sm_next = smp;
 963                 mutex_exit(&releq->smq_mtx);
 964         }
 965 }
 966 
 967 
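/*
 * Insert smp into the hash chain for (vp, off).  Returns NULL if the
 * insert succeeded, or a pointer to an existing smap that already carries
 * the (vp, off) tag, in which case smp is left unchanged.
 */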
 968 static struct smap *
 969 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
 970 {
 971         struct smap **hpp;
 972         struct smap *tmp;
 973         kmutex_t *hmtx;
 974 
 975         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 976         ASSERT(smp->sm_vp == NULL);
 977         ASSERT(smp->sm_hash == NULL);
 978         ASSERT(smp->sm_prev == NULL);
 979         ASSERT(smp->sm_next == NULL);
 980         ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
 981 
 982         hmtx = SHASHMTX(hashid);
 983 
 984         mutex_enter(hmtx);
 985         /*
 986          * First we need to verify that no one has created a smp
  987          * with (vp,off) as its tag before us.
 988          */
 989         for (tmp = smd_hash[hashid].sh_hash_list;
 990             tmp != NULL; tmp = tmp->sm_hash)
 991                 if (tmp->sm_vp == vp && tmp->sm_off == off)
 992                         break;
 993 
 994         if (tmp == NULL) {
 995                 /*
 996                  * No one created one yet.
 997                  *
 998                  * Funniness here - we don't increment the ref count on the
 999                  * vnode * even though we have another pointer to it here.
1000                  * The reason for this is that we don't want the fact that
1001                  * a seg_map entry somewhere refers to a vnode to prevent the
1002                  * vnode * itself from going away.  This is because this
1003                  * reference to the vnode is a "soft one".  In the case where
1004                  * a mapping is being used by a rdwr [or directory routine?]
1005                  * there already has to be a non-zero ref count on the vnode.
 1006                  * In the case where the vp has been freed and the smap
1007                  * structure is on the free list, there are no pages in memory
1008                  * that can refer to the vnode.  Thus even if we reuse the same
1009                  * vnode/smap structure for a vnode which has the same
1010                  * address but represents a different object, we are ok.
1011                  */
1012                 smp->sm_vp = vp;
1013                 smp->sm_off = off;
1014 
1015                 hpp = &smd_hash[hashid].sh_hash_list;
1016                 smp->sm_hash = *hpp;
1017                 *hpp = smp;
1018 #ifdef SEGMAP_HASHSTATS
1019                 smd_hash_len[hashid]++;
1020 #endif
1021         }
1022         mutex_exit(hmtx);
1023 
1024         return (tmp);
1025 }
1026 
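/*
 * Remove smp from its hash chain and clear its (vp, off) identity.
 */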
1027 static void
1028 segmap_hashout(struct smap *smp)
1029 {
1030         struct smap **hpp, *hp;
1031         struct vnode *vp;
1032         kmutex_t *mtx;
1033         int hashid;
1034         u_offset_t off;
1035 
1036         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1037 
1038         vp = smp->sm_vp;
1039         off = smp->sm_off;
1040 
1041         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1042         mtx = SHASHMTX(hashid);
1043         mutex_enter(mtx);
1044 
1045         hpp = &smd_hash[hashid].sh_hash_list;
1046         for (;;) {
1047                 hp = *hpp;
1048                 if (hp == NULL) {
1049                         panic("segmap_hashout");
1050                         /*NOTREACHED*/
1051                 }
1052                 if (hp == smp)
1053                         break;
1054                 hpp = &hp->sm_hash;
1055         }
1056 
1057         *hpp = smp->sm_hash;
1058         smp->sm_hash = NULL;
1059 #ifdef SEGMAP_HASHSTATS
1060         smd_hash_len[hashid]--;
1061 #endif
1062         mutex_exit(mtx);
1063 
1064         smp->sm_vp = NULL;
1065         smp->sm_off = (u_offset_t)0;
1066 
1067 }
1068 
1069 /*
 1070  * Attempt to free unmodified, unmapped, and unlocked segmap
1071  * pages.
1072  */
1073 void
1074 segmap_pagefree(struct vnode *vp, u_offset_t off)
1075 {
1076         u_offset_t pgoff;
1077         page_t  *pp;
1078 
1079         for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1080 
1081                 if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1082                         continue;
1083 
1084                 switch (page_release(pp, 1)) {
1085                 case PGREL_NOTREL:
1086                         segmapcnt.smp_free_notfree.value.ul++;
1087                         break;
1088                 case PGREL_MOD:
1089                         segmapcnt.smp_free_dirty.value.ul++;
1090                         break;
1091                 case PGREL_CLEAN:
1092                         segmapcnt.smp_free.value.ul++;
1093                         break;
1094                 }
1095         }
1096 }
1097 
1098 /*
1099  * Locks held on entry: smap lock
 1100  * Locks held on exit:  smap lock.
1101  */
1102 
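/*
 * Reclaim the smap slot for reuse: drop its old (vp, off) association,
 * unload any kpm or hat translations still referring to the old object
 * and let segmap_pagefree() free the old object's unmodified pages.
 */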
1103 static void
1104 grab_smp(struct smap *smp, page_t *pp)
1105 {
1106         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1107         ASSERT(smp->sm_refcnt == 0);
1108 
1109         if (smp->sm_vp != (struct vnode *)NULL) {
1110                 struct vnode    *vp = smp->sm_vp;
1111                 u_offset_t      off = smp->sm_off;
1112                 /*
1113                  * Destroy old vnode association and
1114                  * unload any hardware translations to
1115                  * the old object.
1116                  */
1117                 smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1118                 segmap_hashout(smp);
1119 
1120                 /*
1121                  * This node is off freelist and hashlist,
1122                  * so there is no reason to drop/reacquire sm_mtx
1123                  * across calls to hat_unload.
1124                  */
1125                 if (segmap_kpm) {
1126                         caddr_t vaddr;
1127                         int hat_unload_needed = 0;
1128 
1129                         /*
1130                          * unload kpm mapping
1131                          */
1132                         if (pp != NULL) {
1133                                 vaddr = hat_kpm_page2va(pp, 1);
1134                                 hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1135                                 page_unlock(pp);
1136                         }
1137 
1138                         /*
1139                          * Check if we have (also) the rare case of a
1140                          * non kpm mapping.
1141                          */
1142                         if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1143                                 hat_unload_needed = 1;
1144                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1145                         }
1146 
1147                         if (hat_unload_needed) {
1148                                 hat_unload(kas.a_hat, segkmap->s_base +
1149                                     ((smp - smd_smap) * MAXBSIZE),
1150                                     MAXBSIZE, HAT_UNLOAD);
1151                         }
1152 
1153                 } else {
1154                         ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1155                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1156                         hat_unload(kas.a_hat, segkmap->s_base +
1157                             ((smp - smd_smap) * MAXBSIZE),
1158                             MAXBSIZE, HAT_UNLOAD);
1159                 }
1160                 segmap_pagefree(vp, off);
1161         }
1162 }
1163 
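/*
 * Allocate a smap slot from freelist free_ndx.  If both queues of that
 * freelist are empty, try the other freelists of the same color before
 * sleeping on sm_free_cv.  The selected smap is returned with its mutex
 * held.
 */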
1164 static struct smap *
1165 get_free_smp(int free_ndx)
1166 {
1167         struct smfree *sm;
1168         kmutex_t *smtx;
1169         struct smap *smp, *first;
1170         struct sm_freeq *allocq, *releq;
1171         struct kpme *kpme;
1172         page_t *pp = NULL;
1173         int end_ndx, page_locked = 0;
1174 
1175         end_ndx = free_ndx;
1176         sm = &smd_free[free_ndx];
1177 
1178 retry_queue:
1179         allocq = sm->sm_allocq;
1180         mutex_enter(&allocq->smq_mtx);
1181 
1182         if ((smp = allocq->smq_free) == NULL) {
1183 
1184 skip_queue:
1185                 /*
1186                  * The alloc list is empty or this queue is being skipped;
1187                  * first see if the allocq toggled.
1188                  */
1189                 if (sm->sm_allocq != allocq) {
1190                         /* queue changed */
1191                         mutex_exit(&allocq->smq_mtx);
1192                         goto retry_queue;
1193                 }
1194                 releq = sm->sm_releq;
1195                 if (!mutex_tryenter(&releq->smq_mtx)) {
1196                         /* cannot get releq; a free smp may be there now */
1197                         mutex_exit(&allocq->smq_mtx);
1198 
1199                         /*
1200                          * This loop could spin forever if this thread has
1201                          * higher priority than the thread that is holding
1202                          * releq->smq_mtx. In order to force the other thread
1203                          * to run, we'll lock/unlock the mutex which is safe
1204                          * since we just unlocked the allocq mutex.
1205                          */
1206                         mutex_enter(&releq->smq_mtx);
1207                         mutex_exit(&releq->smq_mtx);
1208                         goto retry_queue;
1209                 }
1210                 if (releq->smq_free == NULL) {
1211                         /*
1212                          * This freelist is empty.
1213                          * This should not happen unless clients
1214                          * are failing to release the segmap
1215                          * window after accessing the data.
1216                          * Before resorting to sleeping, try
1217                          * the next list of the same color.
1218                          */
1219                         free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1220                         if (free_ndx != end_ndx) {
1221                                 mutex_exit(&releq->smq_mtx);
1222                                 mutex_exit(&allocq->smq_mtx);
1223                                 sm = &smd_free[free_ndx];
1224                                 goto retry_queue;
1225                         }
1226                         /*
1227                          * Tried all freelists of the same color once,
1228                          * wait on this list and hope something gets freed.
1229                          */
1230                         segmapcnt.smp_get_nofree.value.ul++;
1231                         sm->sm_want++;
1232                         mutex_exit(&sm->sm_freeq[1].smq_mtx);
1233                         cv_wait(&sm->sm_free_cv,
1234                             &sm->sm_freeq[0].smq_mtx);
1235                         sm->sm_want--;
1236                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
1237                         sm = &smd_free[free_ndx];
1238                         goto retry_queue;
1239                 } else {
1240                         /*
1241                          * Something on the rele queue; flip the alloc
1242                          * and rele queues and retry.
1243                          */
1244                         sm->sm_allocq = releq;
1245                         sm->sm_releq = allocq;
1246                         mutex_exit(&allocq->smq_mtx);
1247                         mutex_exit(&releq->smq_mtx);
1248                         if (page_locked) {
1249                                 delay(hz >> 2);
1250                                 page_locked = 0;
1251                         }
1252                         goto retry_queue;
1253                 }
1254         } else {
1255                 /*
 1256                  * Fastpath the case where we get the smap mutex
1257                  * on the first try.
1258                  */
1259                 first = smp;
1260 next_smap:
1261                 smtx = SMAPMTX(smp);
1262                 if (!mutex_tryenter(smtx)) {
1263                         /*
1264                          * Another thread is trying to reclaim this slot.
1265                          * Skip to the next queue or smap.
1266                          */
1267                         if ((smp = smp->sm_next) == first) {
1268                                 goto skip_queue;
1269                         } else {
1270                                 goto next_smap;
1271                         }
1272                 } else {
1273                         /*
1274                          * if kpme exists, get shared lock on the page
1275                          */
1276                         if (segmap_kpm && smp->sm_vp != NULL) {
1277 
1278                                 kpme = GET_KPME(smp);
1279                                 pp = kpme->kpe_page;
1280 
1281                                 if (pp != NULL) {
1282                                         if (!page_trylock(pp, SE_SHARED)) {
1283                                                 smp = smp->sm_next;
1284                                                 mutex_exit(smtx);
1285                                                 page_locked = 1;
1286 
1287                                                 pp = NULL;
1288 
1289                                                 if (smp == first) {
1290                                                         goto skip_queue;
1291                                                 } else {
1292                                                         goto next_smap;
1293                                                 }
1294                                         } else {
1295                                                 if (kpme->kpe_page == NULL) {
1296                                                         page_unlock(pp);
1297                                                         pp = NULL;
1298                                                 }
1299                                         }
1300                                 }
1301                         }
1302 
1303                         /*
1304                          * At this point, we've selected smp.  Remove smp
1305                          * from its freelist.  If smp is the first one in
1306                          * the freelist, update the head of the freelist.
1307                          */
1308                         if (first == smp) {
1309                                 ASSERT(first == allocq->smq_free);
1310                                 allocq->smq_free = smp->sm_next;
1311                         }
1312 
1313                         /*
1314                          * if the head of the freelist still points to smp,
1315                          * then there are no more free smaps in that list.
1316                          */
1317                         if (allocq->smq_free == smp)
1318                                 /*
1319                                  * Took the last one
1320                                  */
1321                                 allocq->smq_free = NULL;
1322                         else {
1323                                 smp->sm_prev->sm_next = smp->sm_next;
1324                                 smp->sm_next->sm_prev = smp->sm_prev;
1325                         }
1326                         mutex_exit(&allocq->smq_mtx);
1327                         smp->sm_prev = smp->sm_next = NULL;
1328 
1329                         /*
1330                          * if pp != NULL, pp must have been locked;
1331                          * grab_smp() unlocks pp.
1332                          */
1333                         ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1334                         grab_smp(smp, pp);
1335                         /* return smp locked. */
1336                         ASSERT(SMAPMTX(smp) == smtx);
1337                         ASSERT(MUTEX_HELD(smtx));
1338                         return (smp);
1339                 }
1340         }
1341 }
1342 
1343 /*
1344  * Special public segmap operations
1345  */
1346 
1347 /*
1348  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1349  * If softlock is TRUE, then set things up so that it looks like a call
1350  * to segmap_fault with F_SOFTLOCK.
1351  *
 1352  * Returns 1 if a page is created by calling page_create_va(), or 0 otherwise.
1353  *
1354  * All fields in the generic segment (struct seg) are considered to be
1355  * read-only for "segmap" even though the kernel address space (kas) may
1356  * not be locked, hence no lock is needed to access them.
1357  */
1358 int
1359 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1360 {
1361         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1362         page_t *pp;
1363         u_offset_t off;
1364         struct smap *smp;
1365         struct vnode *vp;
1366         caddr_t eaddr;
1367         int newpage = 0;
1368         uint_t prot;
1369         kmutex_t *smtx;
1370         int hat_flag;
1371 
1372         ASSERT(seg->s_as == &kas);
1373 
1374         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1375                 /*
1376                  * Pages are successfully prefaulted and locked in
1377                  * segmap_getmapflt and can't be unlocked until
1378                  * segmap_release. The SM_KPM_NEWPAGE flag is set
 1379                  * in segmap_pagecreate_kpm when new pages are created,
 1380                  * and it is returned as the "newpage" indication here.
1381                  */
1382                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1383                         panic("segmap_pagecreate: smap not found "
1384                             "for addr %p", (void *)addr);
1385                         /*NOTREACHED*/
1386                 }
1387 
1388                 smtx = SMAPMTX(smp);
1389                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1390                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1391                 mutex_exit(smtx);
1392 
1393                 return (newpage);
1394         }
1395 
1396         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1397 
1398         eaddr = addr + len;
1399         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1400 
1401         smp = GET_SMAP(seg, addr);
1402 
1403         /*
1404          * We don't grab smp mutex here since we assume the smp
1405          * has a refcnt set already which prevents the slot from
1406          * changing its id.
1407          */
1408         ASSERT(smp->sm_refcnt > 0);
1409 
1410         vp = smp->sm_vp;
1411         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1412         prot = smd->smd_prot;
1413 
1414         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1415                 hat_flag = HAT_LOAD;
1416                 pp = page_lookup(vp, off, SE_SHARED);
1417                 if (pp == NULL) {
1418                         ushort_t bitindex;
1419 
1420                         if ((pp = page_create_va(vp, off,
1421                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1422                                 panic("segmap_pagecreate: page_create failed");
1423                                 /*NOTREACHED*/
1424                         }
1425                         newpage = 1;
1426                         page_io_unlock(pp);
1427 
1428                         /*
1429                          * Since pages created here do not contain valid
1430                          * data until the caller writes into them, the
1431                          * "exclusive" lock is not dropped, so that other
1432                          * users cannot access the page.  We also
1433                          * have to lock the translation to prevent a fault
1434                          * from occurring when the virtual address mapped by
1435                          * this page is written into.  This is necessary to
1436                          * avoid a deadlock since we haven't dropped the
1437                          * "exclusive" lock.
1438                          */
1439                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1440 
1441                         /*
1442                          * Large Files: The following assertion is to
1443                          * verify the cast above.
1444                          */
1445                         ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1446                         smtx = SMAPMTX(smp);
1447                         mutex_enter(smtx);
1448                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1449                         mutex_exit(smtx);
1450 
1451                         hat_flag = HAT_LOAD_LOCK;
1452                 } else if (softlock) {
1453                         hat_flag = HAT_LOAD_LOCK;
1454                 }
1455 
1456                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1457                         hat_setmod(pp);
1458 
1459                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1460 
1461                 if (hat_flag != HAT_LOAD_LOCK)
1462                         page_unlock(pp);
1463 
1464                 TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1465                     "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1466                     seg, addr, pp, vp, off);
1467         }
1468 
1469         return (newpage);
1470 }
1471 
1472 void
1473 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1474 {
1475         struct smap     *smp;
1476         ushort_t        bitmask;
1477         page_t          *pp;
1478         struct  vnode   *vp;
1479         u_offset_t      off;
1480         caddr_t         eaddr;
1481         kmutex_t        *smtx;
1482 
1483         ASSERT(seg->s_as == &kas);
1484 
1485         eaddr = addr + len;
1486         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1487 
1488         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1489                 /*
1490                  * Pages are successfully prefaulted and locked in
1491                  * segmap_getmapflt and can't be unlocked until
1492                  * segmap_release, so no pages or hat mappings have
1493                  * to be unlocked at this point.
1494                  */
1495 #ifdef DEBUG
1496                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1497                         panic("segmap_pageunlock: smap not found "
1498                             "for addr %p", (void *)addr);
1499                         /*NOTREACHED*/
1500                 }
1501 
1502                 ASSERT(smp->sm_refcnt > 0);
1503                 mutex_exit(SMAPMTX(smp));
1504 #endif
1505                 return;
1506         }
1507 
1508         smp = GET_SMAP(seg, addr);
1509         smtx = SMAPMTX(smp);
1510 
1511         ASSERT(smp->sm_refcnt > 0);
1512 
1513         vp = smp->sm_vp;
1514         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1515 
1516         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1517                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1518 
1519                 /*
1520                  * Large Files: The following assertion verifies
1521                  * the correctness of the cast to (int) above.
1522                  */
1523                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1524 
1525                 /*
1526                  * If the bit corresponding to "off" is set,
1527                  * clear this bit in the bitmap, unlock translations,
1528                  * and release the "exclusive" lock on the page.
1529                  */
1530                 if (smp->sm_bitmap & bitmask) {
1531                         mutex_enter(smtx);
1532                         smp->sm_bitmap &= ~bitmask;
1533                         mutex_exit(smtx);
1534 
1535                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1536 
1537                         /*
1538                          * Use page_find() instead of page_lookup() to
1539                          * find the page since we know that it is held
1540                          * with the "exclusive" lock.
1541                          */
1542                         pp = page_find(vp, off);
1543                         if (pp == NULL) {
1544                                 panic("segmap_pageunlock: page not found");
1545                                 /*NOTREACHED*/
1546                         }
1547                         if (rw == S_WRITE) {
1548                                 hat_setrefmod(pp);
1549                         } else if (rw != S_OTHER) {
1550                                 hat_setref(pp);
1551                         }
1552 
1553                         page_unlock(pp);
1554                 }
1555         }
1556 }
1557 
1558 caddr_t
1559 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1560 {
1561         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1562 }
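/*
 * A minimal sketch of the corresponding read path; as above, the locals
 * (vp, uiop, off, n) are illustrative assumptions rather than part of
 * this file.  Callers streaming sequentially through a file sometimes
 * pass SM_DONTNEED to segmap_release instead of 0:
 *
 *	caddr_t base;
 *	int error;
 *
 *	base = segmap_getmapflt(segkmap, vp, off & (offset_t)MAXBMASK,
 *	    MAXBSIZE, 1, S_READ);
 *	error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uiop);
 *	error = segmap_release(segkmap, base, 0);
 */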
1563 
1564 /*
1565  * This is the magic virtual address that offset 0 of an ELF
1566  * file gets mapped to in user space. It is used to pick
1567  * the VAC color on the freelist.
1568  */
1569 #define ELF_OFFZERO_VA  (0x10000)
1570 /*
1571  * segmap_getmapflt allocates a MAXBSIZE-sized slot to map the vnode vp
1572  * in the range [off, off + len). off doesn't need to be MAXBSIZE aligned;
1573  * the returned address is always MAXBSIZE aligned.
1574  *
1575  * If forcefault is nonzero and the MMU translations haven't yet been created,
1576  * segmap_getmapflt will call segmap_fault(..., F_INVAL, rw) to create them.
1577  */
1578 caddr_t
1579 segmap_getmapflt(
1580         struct seg *seg,
1581         struct vnode *vp,
1582         u_offset_t off,
1583         size_t len,
1584         int forcefault,
1585         enum seg_rw rw)
1586 {
1587         struct smap *smp, *nsmp;
1588         extern struct vnode *common_specvp();
1589         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1590         u_offset_t baseoff;
1591         int newslot;
1592         caddr_t vaddr;
1593         int color, hashid;
1594         kmutex_t *hashmtx, *smapmtx;
1595         struct smfree *sm;
1596         page_t  *pp;
1597         struct kpme *kpme;
1598         uint_t  prot;
1599         caddr_t base;
1600         page_t  *pl[MAXPPB + 1];
1601         int     error;
1602         int     is_kpm = 1;
1603 
1604         ASSERT(seg->s_as == &kas);
1605         ASSERT(seg == segkmap);
1606 
1607         baseoff = off & (offset_t)MAXBMASK;
1608         if (off + len > baseoff + MAXBSIZE) {
1609                 panic("segmap_getmap bad len");
1610                 /*NOTREACHED*/
1611         }
1612 
1613         /*
1614          * If this is a block device we have to be sure to use the
1615          * "common" block device vnode for the mapping.
1616          */
1617         if (vp->v_type == VBLK)
1618                 vp = common_specvp(vp);
1619 
1620         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1621 
1622         if (segmap_kpm == 0 ||
1623             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1624                 is_kpm = 0;
1625         }
1626 
1627         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1628         hashmtx = SHASHMTX(hashid);
1629 
1630 retry_hash:
1631         mutex_enter(hashmtx);
1632         for (smp = smd_hash[hashid].sh_hash_list;
1633             smp != NULL; smp = smp->sm_hash)
1634                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1635                         break;
1636         mutex_exit(hashmtx);
1637 
1638 vrfy_smp:
1639         if (smp != NULL) {
1640 
1641                 ASSERT(vp->v_count != 0);
1642 
1643                 /*
1644                  * Get the smap lock and recheck its tag. The hash lock
1645                  * can be dropped since the hash is based on (vp, off),
1646                  * and (vp, off) won't change while we hold the smap mutex.
1647                  */
1648                 smapmtx = SMAPMTX(smp);
1649                 mutex_enter(smapmtx);
1650                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1651                         mutex_exit(smapmtx);
1652                         goto retry_hash;
1653                 }
1654 
1655                 if (smp->sm_refcnt == 0) {
1656 
1657                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1658 
1659                         /*
1660                          * Could still be on the free list. However, this
1661                          * could also be an smp that is transitioning from
1662                          * the free list when we have too much contention
1663                          * for the smapmtx's. In this case, we have an
1664                          * unlocked smp that is not on the free list any
1665                          * longer, but still has a 0 refcnt.  The only way
1666                          * to be sure is to check the freelist pointers.
1667                          * Since we now have the smapmtx, we are guaranteed
1668                          * that the (vp, off) won't change, so we are safe
1669                          * to reclaim it.  get_free_smp() knows that this
1670                          * can happen, and it will check the refcnt.
1671                          */
1672 
1673                         if (smp->sm_next != NULL) {
1674                                 struct sm_freeq *freeq;
1675 
1676                                 ASSERT(smp->sm_prev != NULL);
1677                                 sm = &smd_free[smp->sm_free_ndx];
1678 
1679                                 if (smp->sm_flags & SM_QNDX_ZERO)
1680                                         freeq = &sm->sm_freeq[0];
1681                                 else
1682                                         freeq = &sm->sm_freeq[1];
1683 
1684                                 mutex_enter(&freeq->smq_mtx);
1685                                 if (freeq->smq_free != smp) {
1686                                         /*
1687                                          * fastpath normal case
1688                                          */
1689                                         smp->sm_prev->sm_next = smp->sm_next;
1690                                         smp->sm_next->sm_prev = smp->sm_prev;
1691                                 } else if (smp == smp->sm_next) {
1692                                         /*
1693                                          * Taking the last smap on freelist
1694                                          */
1695                                         freeq->smq_free = NULL;
1696                                 } else {
1697                                         /*
1698                                          * Reclaiming 1st smap on list
1699                                          */
1700                                         freeq->smq_free = smp->sm_next;
1701                                         smp->sm_prev->sm_next = smp->sm_next;
1702                                         smp->sm_next->sm_prev = smp->sm_prev;
1703                                 }
1704                                 mutex_exit(&freeq->smq_mtx);
1705                                 smp->sm_prev = smp->sm_next = NULL;
1706                         } else {
1707                                 ASSERT(smp->sm_prev == NULL);
1708                                 segmapcnt.smp_stolen.value.ul++;
1709                         }
1710 
1711                 } else {
1712                         segmapcnt.smp_get_use.value.ul++;
1713                 }
1714                 smp->sm_refcnt++;            /* another user */
1715 
1716                 /*
1717                  * We don't invoke segmap_fault via TLB miss, so we set ref
1718                  * and mod bits in advance. For S_OTHER we set them in
1719                  * segmap_fault's F_SOFTUNLOCK path.
1720                  */
1721                 if (is_kpm) {
1722                         if (rw == S_WRITE) {
1723                                 smp->sm_flags |= SM_WRITE_DATA;
1724                         } else if (rw == S_READ) {
1725                                 smp->sm_flags |= SM_READ_DATA;
1726                         }
1727                 }
1728                 mutex_exit(smapmtx);
1729 
1730                 newslot = 0;
1731         } else {
1732 
1733                 uint32_t free_ndx, *free_ndxp;
1734                 union segmap_cpu *scpu;
1735 
1736                 /*
1737                  * On a PAC machine or a machine with anti-alias
1738                  * hardware, smd_colormsk will be zero.
1739                  *
1740                  * On a VAC machine, pick the color by offset within the
1741                  * file so we won't get VAC conflicts on ELF files.
1742                  * On data files, the color does not matter, but we
1743                  * don't know what kind of file it is, so we always
1744                  * pick the color by offset. This causes the color
1745                  * corresponding to file offset zero to be used more
1746                  * heavily.
1747                  */
1748                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1749                 scpu = smd_cpu+CPU->cpu_seqid;
1750                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1751                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1752 #ifdef DEBUG
1753                 colors_used[free_ndx]++;
1754 #endif /* DEBUG */
1755 
1756                 /*
1757                  * Get a locked smp slot from the free list.
1758                  */
1759                 smp = get_free_smp(free_ndx);
1760                 smapmtx = SMAPMTX(smp);
1761 
1762                 ASSERT(smp->sm_vp == NULL);
1763 
1764                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1765                         /*
1766                          * Failed to hashin, there exists one now.
1767                          * Return the smp we just allocated.
1768                          */
1769                         segmap_smapadd(smp);
1770                         mutex_exit(smapmtx);
1771 
1772                         smp = nsmp;
1773                         goto vrfy_smp;
1774                 }
1775                 smp->sm_refcnt++;            /* another user */
1776 
1777                 /*
1778                  * We don't invoke segmap_fault via TLB miss, so we set ref
1779                  * and mod bits in advance. For S_OTHER we set them in
1780                  * segmap_fault's F_SOFTUNLOCK path.
1781                  */
1782                 if (is_kpm) {
1783                         if (rw == S_WRITE) {
1784                                 smp->sm_flags |= SM_WRITE_DATA;
1785                         } else if (rw == S_READ) {
1786                                 smp->sm_flags |= SM_READ_DATA;
1787                         }
1788                 }
1789                 mutex_exit(smapmtx);
1790 
1791                 newslot = 1;
1792         }
1793 
1794         if (!is_kpm)
1795                 goto use_segmap_range;
1796 
1797         /*
1798          * Use segkpm
1799          */
1800         /* Lint directive required until 6746211 is fixed */
1801         /*CONSTCOND*/
1802         ASSERT(PAGESIZE == MAXBSIZE);
1803 
1804         /*
1805          * Remember the last smp faulted on this CPU.
1806          */
1807         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1808 
1809         if (forcefault == SM_PAGECREATE) {
1810                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1811                 return (baseaddr);
1812         }
1813 
1814         if (newslot == 0 &&
1815             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1816 
1817                 /* fastpath */
1818                 switch (rw) {
1819                 case S_READ:
1820                 case S_WRITE:
1821                         if (page_trylock(pp, SE_SHARED)) {
1822                                 if (PP_ISFREE(pp) ||
1823                                     !(pp->p_vnode == vp &&
1824                                     pp->p_offset == baseoff)) {
1825                                         page_unlock(pp);
1826                                         pp = page_lookup(vp, baseoff,
1827                                             SE_SHARED);
1828                                 }
1829                         } else {
1830                                 pp = page_lookup(vp, baseoff, SE_SHARED);
1831                         }
1832 
1833                         if (pp == NULL) {
1834                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1835                                 break;
1836                         }
1837 
1838                         if (rw == S_WRITE &&
1839                             hat_page_getattr(pp, P_MOD | P_REF) !=
1840                             (P_MOD | P_REF)) {
1841                                 page_unlock(pp);
1842                                 break;
1843                         }
1844 
1845                         /*
1846                          * We hold the p_selock as reader, so grab_smp
1847                          * can't hit us: we have bumped the smap
1848                          * refcnt and hat_pageunload needs the
1849                          * p_selock held exclusively.
1850                          */
1851                         kpme = GET_KPME(smp);
1852                         if (kpme->kpe_page == pp) {
1853                                 baseaddr = hat_kpm_page2va(pp, 0);
1854                         } else if (kpme->kpe_page == NULL) {
1855                                 baseaddr = hat_kpm_mapin(pp, kpme);
1856                         } else {
1857                                 panic("segmap_getmapflt: stale "
1858                                     "kpme page, kpme %p", (void *)kpme);
1859                                 /*NOTREACHED*/
1860                         }
1861 
1862                         /*
1863                          * We don't invoke segmap_fault via TLB miss,
1864                          * so we set ref and mod bits in advance.
1865                          * For S_OTHER we set them in segmap_fault's
1866                          * F_SOFTUNLOCK path.
1867                          */
1868                         if (rw == S_READ && !hat_isref(pp))
1869                                 hat_setref(pp);
1870 
1871                         return (baseaddr);
1872                 default:
1873                         break;
1874                 }
1875         }
1876 
1877         base = segkpm_create_va(baseoff);
1878         error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1879             seg, base, rw, CRED(), NULL);
1880 
1881         pp = pl[0];
1882         if (error || pp == NULL) {
1883                 /*
1884                  * Use segmap address slot and let segmap_fault deal
1885                  * with the error cases. There is no error return
1886                  * possible here.
1887                  */
1888                 goto use_segmap_range;
1889         }
1890 
1891         ASSERT(pl[1] == NULL);
1892 
1893         /*
1894          * When prot is not returned with PROT_ALL, the returned pages
1895          * are not backed by fs blocks. For most segmap users this is
1896          * no problem; they don't write to the pages in the same
1897          * request and therefore don't rely on a subsequent
1898          * trap-driven segmap_fault. For SM_LOCKPROTO users it is
1899          * safer to use segkmap addresses so that protection faults
1900          * can be handled by segmap_fault.
1901          */
1902         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1903                 /*
1904                  * Use segmap address slot and let segmap_fault
1905                  * do the error return.
1906                  */
1907                 ASSERT(rw != S_WRITE);
1908                 ASSERT(PAGE_LOCKED(pp));
1909                 page_unlock(pp);
1910                 forcefault = 0;
1911                 goto use_segmap_range;
1912         }
1913 
1914         /*
1915          * We hold the p_selock as reader, so grab_smp can't hit us: we
1916          * have bumped the smap refcnt and hat_pageunload needs the
1917          * p_selock held exclusively.
1918          */
1919         kpme = GET_KPME(smp);
1920         if (kpme->kpe_page == pp) {
1921                 baseaddr = hat_kpm_page2va(pp, 0);
1922         } else if (kpme->kpe_page == NULL) {
1923                 baseaddr = hat_kpm_mapin(pp, kpme);
1924         } else {
1925                 panic("segmap_getmapflt: stale kpme page after "
1926                     "VOP_GETPAGE, kpme %p", (void *)kpme);
1927                 /*NOTREACHED*/
1928         }
1929 
1930         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1931 
1932         return (baseaddr);
1933 
1934 
1935 use_segmap_range:
1936         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1937         TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1938             "segmap_getmap:seg %p addr %p vp %p offset %llx",
1939             seg, baseaddr, vp, baseoff);
1940 
1941         /*
1942          * Prefault the translations
1943          */
1944         vaddr = baseaddr + (off - baseoff);
1945         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1946 
1947                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1948                     (uintptr_t)PAGEMASK);
1949 
1950                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1951                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1952                     F_INVAL, rw);
1953         }
1954 
1955         return (baseaddr);
1956 }
1957 
1958 int
1959 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1960 {
1961         struct smap     *smp;
1962         int             error;
1963         int             bflags = 0;
1964         struct vnode    *vp;
1965         u_offset_t      offset;
1966         kmutex_t        *smtx;
1967         int             is_kpm = 0;
1968         page_t          *pp;
1969 
1970         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1971 
1972                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1973                         panic("segmap_release: addr %p not "
1974                             "MAXBSIZE aligned", (void *)addr);
1975                         /*NOTREACHED*/
1976                 }
1977 
1978                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1979                         panic("segmap_release: smap not found "
1980                             "for addr %p", (void *)addr);
1981                         /*NOTREACHED*/
1982                 }
1983 
1984                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
1985                     "segmap_relmap:seg %p addr %p smp %p",
1986                     seg, addr, smp);
1987 
1988                 smtx = SMAPMTX(smp);
1989 
1990                 /*
1991                  * For compatibility reasons segmap_pagecreate_kpm sets this
1992                  * flag so that a following segmap_pagecreate can return
1993                  * it as the "newpage" indication. When segmap_pagecreate is
1994                  * not called at all, we clear the flag now.
1995                  */
1996                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1997                 is_kpm = 1;
1998                 if (smp->sm_flags & SM_WRITE_DATA) {
1999                         hat_setrefmod(pp);
2000                 } else if (smp->sm_flags & SM_READ_DATA) {
2001                         hat_setref(pp);
2002                 }
2003         } else {
2004                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2005                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
2006                         panic("segmap_release: bad addr %p", (void *)addr);
2007                         /*NOTREACHED*/
2008                 }
2009                 smp = GET_SMAP(seg, addr);
2010 
2011                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2012                     "segmap_relmap:seg %p addr %p smp %p",
2013                     seg, addr, smp);
2014 
2015                 smtx = SMAPMTX(smp);
2016                 mutex_enter(smtx);
2017                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2018         }
2019 
2020         ASSERT(smp->sm_refcnt > 0);
2021 
2022         /*
2023          * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2024          * are set.
2025          */
2026         if ((flags & ~SM_DONTNEED) != 0) {
2027                 if (flags & SM_WRITE)
2028                         segmapcnt.smp_rel_write.value.ul++;
2029                 if (flags & SM_ASYNC) {
2030                         bflags |= B_ASYNC;
2031                         segmapcnt.smp_rel_async.value.ul++;
2032                 }
2033                 if (flags & SM_INVAL) {
2034                         bflags |= B_INVAL;
2035                         segmapcnt.smp_rel_abort.value.ul++;
2036                 }
2037                 if (flags & SM_DESTROY) {
2038                         bflags |= (B_INVAL|B_TRUNC);
2039                         segmapcnt.smp_rel_abort.value.ul++;
2040                 }
2041                 if (smp->sm_refcnt == 1) {
2042                         /*
2043                          * We only bother doing the FREE and DONTNEED flags
2044                          * if no one else is still referencing this mapping.
2045                          */
2046                         if (flags & SM_FREE) {
2047                                 bflags |= B_FREE;
2048                                 segmapcnt.smp_rel_free.value.ul++;
2049                         }
2050                         if (flags & SM_DONTNEED) {
2051                                 bflags |= B_DONTNEED;
2052                                 segmapcnt.smp_rel_dontneed.value.ul++;
2053                         }
2054                 }
2055         } else {
2056                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2057         }
2058 
2059         vp = smp->sm_vp;
2060         offset = smp->sm_off;
2061 
2062         if (--smp->sm_refcnt == 0) {
2063 
2064                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2065 
2066                 if (flags & (SM_INVAL|SM_DESTROY)) {
2067                         segmap_hashout(smp);    /* remove map info */
2068                         if (is_kpm) {
2069                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2070                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2071                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2072                                         hat_unload(kas.a_hat, segkmap->s_base +
2073                                             ((smp - smd_smap) * MAXBSIZE),
2074                                             MAXBSIZE, HAT_UNLOAD);
2075                                 }
2076 
2077                         } else {
2078                                 if (segmap_kpm)
2079                                         segkpm_mapout_validkpme(GET_KPME(smp));
2080 
2081                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2082                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2083                                     HAT_UNLOAD);
2084                         }
2085                 }
2086                 segmap_smapadd(smp);    /* add to free list */
2087         }
2088 
2089         mutex_exit(smtx);
2090 
2091         if (is_kpm)
2092                 page_unlock(pp);
2093         /*
2094          * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2095          * are set.
2096          */
2097         if ((flags & ~SM_DONTNEED) != 0) {
2098                 error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2099                     bflags, CRED(), NULL);
2100         } else {
2101                 error = 0;
2102         }
2103 
2104         return (error);
2105 }
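/*
 * A minimal sketch of common segmap_release flag combinations; the
 * surrounding context (base and the error handling) is illustrative,
 * not part of this file:
 *
 *	(void) segmap_release(segkmap, base, SM_WRITE | SM_ASYNC);
 *		start asynchronous write-behind of the MAXBSIZE window
 *
 *	error = segmap_release(segkmap, base, SM_WRITE | SM_FREE);
 *		write synchronously; the pages are freed only if this was
 *		the last reference to the slot
 *
 *	error = segmap_release(segkmap, base, SM_INVAL);
 *		write back any dirty pages and invalidate them; the slot is
 *		also removed from the (vp, off) hash when the last reference
 *		goes away
 */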
2106 
2107 /*
2108  * Dump the pages belonging to this segmap segment.
2109  */
2110 static void
2111 segmap_dump(struct seg *seg)
2112 {
2113         struct segmap_data *smd;
2114         struct smap *smp, *smp_end;
2115         page_t *pp;
2116         pfn_t pfn;
2117         u_offset_t off;
2118         caddr_t addr;
2119 
2120         smd = (struct segmap_data *)seg->s_data;
2121         addr = seg->s_base;
2122         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2123             smp < smp_end; smp++) {
2124 
2125                 if (smp->sm_refcnt) {
2126                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2127                                 int we_own_it = 0;
2128 
2129                                 /*
2130                                  * If page_lookup_nowait() returns NULL,
2131                                  * the page either does not exist or is
2132                                  * exclusively locked, so fall back to
2133                                  * page_exists() to see whether it exists.
2134                                  */
2135                                 if ((pp = page_lookup_nowait(smp->sm_vp,
2136                                     smp->sm_off + off, SE_SHARED)))
2137                                         we_own_it = 1;
2138                                 else
2139                                         pp = page_exists(smp->sm_vp,
2140                                             smp->sm_off + off);
2141 
2142                                 if (pp) {
2143                                         pfn = page_pptonum(pp);
2144                                         dump_addpage(seg->s_as,
2145                                             addr + off, pfn);
2146                                         if (we_own_it)
2147                                                 page_unlock(pp);
2148                                 }
2149                                 dump_timeleft = dump_timeout;
2150                         }
2151                 }
2152                 addr += MAXBSIZE;
2153         }
2154 }
2155 
2156 /*ARGSUSED*/
2157 static int
2158 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2159     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2160 {
2161         return (ENOTSUP);
2162 }
2163 
2164 static int
2165 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2166 {
2167         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2168 
2169         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2170         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2171         return (0);
2172 }
2173 
2174 /*ARGSUSED*/
2175 static lgrp_mem_policy_info_t *
2176 segmap_getpolicy(struct seg *seg, caddr_t addr)
2177 {
2178         return (NULL);
2179 }
2180 
2181 /*ARGSUSED*/
2182 static int
2183 segmap_capable(struct seg *seg, segcapability_t capability)
2184 {
2185         return (0);
2186 }
2187 
2188 
2189 #ifdef  SEGKPM_SUPPORT
2190 
2191 /*
2192  * segkpm support routines
2193  */
2194 
2195 static caddr_t
2196 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2197         struct smap *smp, enum seg_rw rw)
2198 {
2199         caddr_t base;
2200         page_t  *pp;
2201         int     newpage = 0;
2202         struct kpme     *kpme;
2203 
2204         ASSERT(smp->sm_refcnt > 0);
2205 
2206         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2207                 kmutex_t *smtx;
2208 
2209                 base = segkpm_create_va(off);
2210 
2211                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2212                     seg, base)) == NULL) {
2213                         panic("segmap_pagecreate_kpm: "
2214                             "page_create failed");
2215                         /*NOTREACHED*/
2216                 }
2217 
2218                 newpage = 1;
2219                 page_io_unlock(pp);
2220                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2221 
2222                 /*
2223                  * Mark the slot here; the flag stays set until the next
2224                  * segmap_pagecreate or segmap_release clears it.
2225                  */
2226                 smtx = SMAPMTX(smp);
2227                 mutex_enter(smtx);
2228                 smp->sm_flags |= SM_KPM_NEWPAGE;
2229                 mutex_exit(smtx);
2230         }
2231 
2232         kpme = GET_KPME(smp);
2233         if (!newpage && kpme->kpe_page == pp)
2234                 base = hat_kpm_page2va(pp, 0);
2235         else
2236                 base = hat_kpm_mapin(pp, kpme);
2237 
2238         /*
2239          * FS code may decide not to call segmap_pagecreate and we
2240          * don't invoke segmap_fault via TLB miss, so we have to set
2241          * ref and mod bits in advance.
2242          */
2243         if (rw == S_WRITE) {
2244                 hat_setrefmod(pp);
2245         } else {
2246                 ASSERT(rw == S_READ);
2247                 hat_setref(pp);
2248         }
2249 
2250         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2251 
2252         return (base);
2253 }
2254 
2255 /*
2256  * Find the smap structure corresponding to the
2257  * KPM addr and return it locked.
2258  */
2259 struct smap *
2260 get_smap_kpm(caddr_t addr, page_t **ppp)
2261 {
2262         struct smap     *smp;
2263         struct vnode    *vp;
2264         u_offset_t      offset;
2265         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2266         int             hashid;
2267         kmutex_t        *hashmtx;
2268         page_t          *pp;
2269         union segmap_cpu *scpu;
2270 
2271         pp = hat_kpm_vaddr2page(baseaddr);
2272 
2273         ASSERT(pp && !PP_ISFREE(pp));
2274         ASSERT(PAGE_LOCKED(pp));
2275         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2276 
2277         vp = pp->p_vnode;
2278         offset = pp->p_offset;
2279         ASSERT(vp != NULL);
2280 
2281         /*
2282          * Assume the last smap used on this CPU is the one needed.
2283          */
2284         scpu = smd_cpu+CPU->cpu_seqid;
2285         smp = scpu->scpu.scpu_last_smap;
2286         mutex_enter(&smp->sm_mtx);
2287         if (smp->sm_vp == vp && smp->sm_off == offset) {
2288                 ASSERT(smp->sm_refcnt > 0);
2289         } else {
2290                 /*
2291                  * The assumption was wrong; find the smap on the hash chain.
2292                  */
2293                 mutex_exit(&smp->sm_mtx);
2294                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2295                 hashmtx = SHASHMTX(hashid);
2296 
2297                 mutex_enter(hashmtx);
2298                 smp = smd_hash[hashid].sh_hash_list;
2299                 for (; smp != NULL; smp = smp->sm_hash) {
2300                         if (smp->sm_vp == vp && smp->sm_off == offset)
2301                                 break;
2302                 }
2303                 mutex_exit(hashmtx);
2304                 if (smp) {
2305                         mutex_enter(&smp->sm_mtx);
2306                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2307                 }
2308         }
2309 
2310         if (ppp)
2311                 *ppp = smp ? pp : NULL;
2312 
2313         return (smp);
2314 }
2315 
2316 #else   /* SEGKPM_SUPPORT */
2317 
2318 /* segkpm stubs */
2319 
2320 /*ARGSUSED*/
2321 static caddr_t
2322 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2323         struct smap *smp, enum seg_rw rw)
2324 {
2325         return (NULL);
2326 }
2327 
2328 /*ARGSUSED*/
2329 struct smap *
2330 get_smap_kpm(caddr_t addr, page_t **ppp)
2331 {
2332         return (NULL);
2333 }
2334 
2335 #endif  /* SEGKPM_SUPPORT */