1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Portions of this source code were derived from Berkeley 4.3 BSD
  31  * under license from the Regents of the University of California.
  32  */
  33 
  34 /*
  35  * VM - generic vnode mapping segment.
  36  *
  37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
  38  * mappings [lower routine overhead; more persistent cache] to random
  39  * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/buf.h>
  47 #include <sys/systm.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mman.h>
  50 #include <sys/errno.h>
  51 #include <sys/cred.h>
  52 #include <sys/kmem.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/debug.h>
  56 #include <sys/thread.h>
  57 #include <sys/dumphdr.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/lgrp.h>
  60 
  61 #include <vm/seg_kmem.h>
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_kpm.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/page.h>
  68 #include <vm/pvn.h>
  69 #include <vm/rm.h>
  70 
  71 /*
  72  * Private seg op routines.
  73  */
  74 static void     segmap_free(struct seg *seg);
  75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  76                         size_t len, enum fault_type type, enum seg_rw rw);
  77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
  78 static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
  79                         uint_t prot);
  80 static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
  81 static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
  82                         uint_t *protv);
  83 static u_offset_t       segmap_getoffset(struct seg *seg, caddr_t addr);
  84 static int      segmap_gettype(struct seg *seg, caddr_t addr);
  85 static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
  86 static void     segmap_dump(struct seg *seg);
  87 static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
  88                         struct page ***ppp, enum lock_type type,
  89                         enum seg_rw rw);
  90 static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  91 static lgrp_mem_policy_info_t   *segmap_getpolicy(struct seg *seg,
  92     caddr_t addr);
  93 static int      segmap_capable(struct seg *seg, segcapability_t capability);
  94 
  95 /* segkpm support */
  96 static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
  97                         struct smap *, enum seg_rw);
  98 struct smap     *get_smap_kpm(caddr_t, page_t **);
  99 
 100 static struct seg_ops segmap_ops = {
 101         .free           = segmap_free,
 102         .fault          = segmap_fault,
 103         .faulta         = segmap_faulta,
 104         .checkprot      = segmap_checkprot,
 105         .kluster        = segmap_kluster,
 106         .getprot        = segmap_getprot,
 107         .getoffset      = segmap_getoffset,
 108         .gettype        = segmap_gettype,
 109         .getvp          = segmap_getvp,
 110         .dump           = segmap_dump,
 111         .pagelock       = segmap_pagelock,
 112         .getmemid       = segmap_getmemid,
 113         .getpolicy      = segmap_getpolicy,
 114         .capable        = segmap_capable,
 115         .inherit        = seg_inherit_notsup,
 116 };
 117 
 118 /*
 119  * Private segmap routines.
 120  */
 121 static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
 122                         size_t len, enum seg_rw rw, struct smap *smp);
 123 static void     segmap_smapadd(struct smap *smp);
 124 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
 125                         u_offset_t off, int hashid);
 126 static void     segmap_hashout(struct smap *smp);
 127 
 128 
 129 /*
 130  * Statistics for segmap operations.
 131  *
 132  * No explicit locking to protect these stats.
 133  */
 134 struct segmapcnt segmapcnt = {
 135         { "fault",              KSTAT_DATA_ULONG },
 136         { "faulta",             KSTAT_DATA_ULONG },
 137         { "getmap",             KSTAT_DATA_ULONG },
 138         { "get_use",            KSTAT_DATA_ULONG },
 139         { "get_reclaim",        KSTAT_DATA_ULONG },
 140         { "get_reuse",          KSTAT_DATA_ULONG },
 141         { "get_unused",         KSTAT_DATA_ULONG },
 142         { "get_nofree",         KSTAT_DATA_ULONG },
 143         { "rel_async",          KSTAT_DATA_ULONG },
 144         { "rel_write",          KSTAT_DATA_ULONG },
 145         { "rel_free",           KSTAT_DATA_ULONG },
 146         { "rel_abort",          KSTAT_DATA_ULONG },
 147         { "rel_dontneed",       KSTAT_DATA_ULONG },
 148         { "release",            KSTAT_DATA_ULONG },
 149         { "pagecreate",         KSTAT_DATA_ULONG },
 150         { "free_notfree",       KSTAT_DATA_ULONG },
 151         { "free_dirty",         KSTAT_DATA_ULONG },
 152         { "free",               KSTAT_DATA_ULONG },
 153         { "stolen",             KSTAT_DATA_ULONG },
 154         { "get_nomtx",          KSTAT_DATA_ULONG }
 155 };
 156 
 157 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
 158 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
 159 
 160 /*
 161  * Return number of map pages in segment.
 162  */
 163 #define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)
 164 
 165 /*
 166  * Translate addr into smap number within segment.
 167  */
 168 #define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
 169 
 170 /*
 171  * Translate addr in seg into struct smap pointer.
 172  */
 173 #define GET_SMAP(seg, addr)     \
 174         &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
 175 
 176 /*
 177  * Bit in map (16 bit bitmap).
 178  */
 179 #define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
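     /*
      * A worked example of the macros above (illustrative only, assuming
      * the usual MAXBSIZE of 8192, i.e. MAXBSHIFT == 13): a 256MB segkmap
      * segment gives MAP_PAGES(seg) == 32768 smap slots, an address that
      * is 3 * MAXBSIZE beyond s_base gives MAP_PAGE(seg, addr) == 3, and
      * SMAP_BIT_MASK(2) == 0x4, the bit for the third PAGESIZE page within
      * a MAXBSIZE window.
      */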
 180 
 181 static int smd_colormsk = 0;
 182 static int smd_ncolor = 0;
 183 static int smd_nfree = 0;
 184 static int smd_freemsk = 0;
 185 #ifdef DEBUG
 186 static int *colors_used;
 187 #endif
 188 static struct smap *smd_smap;
 189 static struct smaphash *smd_hash;
 190 #ifdef SEGMAP_HASHSTATS
 191 static unsigned int *smd_hash_len;
 192 #endif
 193 static struct smfree *smd_free;
 194 static ulong_t smd_hashmsk = 0;
 195 
 196 #define SEGMAP_MAXCOLOR         2
 197 #define SEGMAP_CACHE_PAD        64
 198 
 199 union segmap_cpu {
 200         struct {
 201                 uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
 202                 struct smap     *scpu_last_smap;
 203                 ulong_t         scpu_getmap;
 204                 ulong_t         scpu_release;
 205                 ulong_t         scpu_get_reclaim;
 206                 ulong_t         scpu_fault;
 207                 ulong_t         scpu_pagecreate;
 208                 ulong_t         scpu_get_reuse;
 209         } scpu;
 210         char    scpu_pad[SEGMAP_CACHE_PAD];
 211 };
 212 static union segmap_cpu *smd_cpu;
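     /*
      * Note: each union segmap_cpu entry is padded to SEGMAP_CACHE_PAD
      * bytes so that the hot per-cpu counters and the per-cpu freelist
      * rotor of different CPUs do not share a cache line (no false
      * sharing when every CPU updates its own statistics).
      */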
 213 
 214 /*
 215  * There are three locks in seg_map:
 216  *      - per freelist mutexes
 217  *      - per hashchain mutexes
 218  *      - per smap mutexes
 219  *
 220  * The lock ordering is to take the smap mutex to lock down the slot
 221  * first, then the hash lock (to hash the slot in/out of the (vp, off)
 222  * list) or the freelist lock to put the slot back on the free list.
 223  *
 224  * The hash search is done holding only the hashchain lock; when a wanted
 225  * slot is found, we drop the hashchain lock and then lock the slot, so
 226  * the hashchain and smap locks never overlap.  After the slot is
 227  * locked, we verify again that the slot is still the one we are looking
 228  * for.
 229  *
 230  * Allocation of a free slot is done by holding the freelist lock,
 231  * then locking the smap slot at the head of the freelist. This is
 232  * in reversed lock order so mutex_tryenter() is used.
 233  *
 234  * The smap lock protects all fields in smap structure except for
 235  * the link fields for hash/free lists which are protected by
 236  * hashchain and freelist locks.
 237  */
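     /*
      * Illustrative lock-order sketch (a summary of the rules above, not
      * additional locking code):
      *
      *	smtx = SMAPMTX(smp);
      *	mutex_enter(smtx);			slot (smap) lock first
      *	mutex_enter(SHASHMTX(hashid));		then the hash chain lock
      *	... hash the slot in or out ...
      *	mutex_exit(SHASHMTX(hashid));
      *	mutex_exit(smtx);
      *
      * Pulling a slot off a freelist goes the other way (freelist lock
      * first, then the slot), which is why mutex_tryenter() is used there.
      */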
 238 
 239 #define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)
 240 
 241 #define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
 242 #define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)
 243 
 244 #define SMAPMTX(smp) (&smp->sm_mtx)
 245 
 246 #define SMAP_HASHFUNC(vp, off, hashid) \
 247         { \
 248         hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
 249                 ((off) >> MAXBSHIFT)) & smd_hashmsk); \
 250         }
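     /*
      * Hash example (hypothetical values, for illustration only): with
      * smd_hashmsk == 0x3ff, a vnode at address vp and off == 3 * MAXBSIZE,
      * the macro computes
      *	hashid = (((uintptr_t)vp >> 6) + ((uintptr_t)vp >> 3) + 3) & 0x3ff;
      * Folding in two shifted copies of the vnode address spreads vnodes
      * allocated close together across different hash chains.
      */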
 251 
 252 /*
 253  * The most frequently updated kstat counters are kept in the
 254  * per cpu array to avoid hot cache blocks. The update function
 255  * sums the cpu local counters to update the global counters.
 256  */
 257 
 258 /* ARGSUSED */
 259 int
 260 segmap_kstat_update(kstat_t *ksp, int rw)
 261 {
 262         int i;
 263         ulong_t getmap, release, get_reclaim;
 264         ulong_t fault, pagecreate, get_reuse;
 265 
 266         if (rw == KSTAT_WRITE)
 267                 return (EACCES);
 268         getmap = release = get_reclaim = (ulong_t)0;
 269         fault = pagecreate = get_reuse = (ulong_t)0;
 270         for (i = 0; i < max_ncpus; i++) {
 271                 getmap += smd_cpu[i].scpu.scpu_getmap;
 272                 release  += smd_cpu[i].scpu.scpu_release;
 273                 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
 274                 fault  += smd_cpu[i].scpu.scpu_fault;
 275                 pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
 276                 get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
 277         }
 278         segmapcnt.smp_getmap.value.ul = getmap;
 279         segmapcnt.smp_release.value.ul = release;
 280         segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
 281         segmapcnt.smp_fault.value.ul = fault;
 282         segmapcnt.smp_pagecreate.value.ul = pagecreate;
 283         segmapcnt.smp_get_reuse.value.ul = get_reuse;
 284         return (0);
 285 }
 286 
 287 int
 288 segmap_create(struct seg *seg, void *argsp)
 289 {
 290         struct segmap_data *smd;
 291         struct smap *smp;
 292         struct smfree *sm;
 293         struct segmap_crargs *a = (struct segmap_crargs *)argsp;
 294         struct smaphash *shashp;
 295         union segmap_cpu *scpu;
 296         long i, npages;
 297         size_t hashsz;
 298         uint_t nfreelist;
 299         extern void prefetch_smap_w(void *);
 300         extern int max_ncpus;
 301 
 302         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 303 
 304         if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
 305                 panic("segkmap not MAXBSIZE aligned");
 306                 /*NOTREACHED*/
 307         }
 308 
 309         smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
 310 
 311         seg->s_data = (void *)smd;
 312         seg->s_ops = &segmap_ops;
 313         smd->smd_prot = a->prot;
 314 
 315         /*
 316          * Scale the number of smap freelists to be
 317          * proportional to max_ncpus * number of virtual colors.
 318          * The caller can over-ride this scaling by providing
 319          * a non-zero a->nfreelist argument.
 320          */
 321         nfreelist = a->nfreelist;
 322         if (nfreelist == 0)
 323                 nfreelist = max_ncpus;
 324         else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
 325                 cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
 326                 "%d, using %d", nfreelist, max_ncpus);
 327                 nfreelist = max_ncpus;
 328         }
 329         if (!ISP2(nfreelist)) {
 330                 /* round up nfreelist to the next power of two. */
 331                 nfreelist = 1 << (highbit(nfreelist));
 332         }
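             /*
              * Example (illustrative): nfreelist == 6 gives highbit(6) == 3
              * and 1 << 3 == 8, i.e. 6 is rounded up to 8.  The ISP2() check
              * above matters because an exact power of two such as 8 would
              * otherwise be doubled (highbit(8) == 4, 1 << 4 == 16).
              */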
 333 
 334         /*
 335          * Get the number of virtual colors - must be a power of 2.
 336          */
 337         if (a->shmsize)
 338                 smd_ncolor = a->shmsize >> MAXBSHIFT;
 339         else
 340                 smd_ncolor = 1;
 341         ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
 342         ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
 343         smd_colormsk = smd_ncolor - 1;
 344         smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
 345         smd_freemsk = smd_nfree - 1;
 346 
 347         /*
 348          * Allocate and initialize the freelist headers.
 349          * Note that sm_freeq[1] starts out as the release queue. This
 350          * is known when the smap structures are initialized below.
 351          */
 352         smd_free = smd->smd_free =
 353             kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
 354         for (i = 0; i < smd_nfree; i++) {
 355                 sm = &smd->smd_free[i];
 356                 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 357                 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 358                 sm->sm_allocq = &sm->sm_freeq[0];
 359                 sm->sm_releq = &sm->sm_freeq[1];
 360         }
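             /*
              * Descriptive note: sm_allocq is drained by get_free_smp()
              * while freed slots are appended to sm_releq by
              * segmap_smapadd(); when the alloc queue runs dry,
              * get_free_smp() swaps the two queues.  Separate queues (and
              * mutexes) for allocation and release reduce contention
              * between allocating and releasing threads.
              */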
 361 
 362         /*
 363          * Allocate and initialize the smap hash chain headers.
 364          * Compute hash size rounding down to the next power of two.
 365          */
 366         npages = MAP_PAGES(seg);
 367         smd->smd_npages = npages;
 368         hashsz = npages / SMAP_HASHAVELEN;
 369         hashsz = 1 << (highbit(hashsz)-1);
 370         smd_hashmsk = hashsz - 1;
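             /*
              * Example (illustrative): if npages / SMAP_HASHAVELEN came to
              * 100, highbit(100) - 1 == 6 and hashsz becomes 1 << 6 == 64;
              * a value that is already a power of two, e.g. 8192, is left
              * unchanged (highbit(8192) - 1 == 13, 1 << 13 == 8192).
              */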
 371         smd_hash = smd->smd_hash =
 372             kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
 373 #ifdef SEGMAP_HASHSTATS
 374         smd_hash_len =
 375             kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
 376 #endif
 377         for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
 378                 shashp->sh_hash_list = NULL;
 379                 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
 380         }
 381 
 382         /*
 383          * Allocate and initialize the smap structures.
 384          * Link all slots onto the appropriate freelist.
 385          * The smap array is large enough to affect boot time
 386          * on large systems, so use memory prefetching and only
 387          * go through the array one time.  Inline an optimized version
 388          * of segmap_smapadd to add structures to freelists with the
 389          * knowledge that no locks are needed here.
 390          */
 391         smd_smap = smd->smd_sm =
 392             kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
 393 
 394         for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
 395             smp >= smd->smd_sm; smp--) {
 396                 struct smap *smpfreelist;
 397                 struct sm_freeq *releq;
 398 
 399                 prefetch_smap_w((char *)smp);
 400 
 401                 smp->sm_vp = NULL;
 402                 smp->sm_hash = NULL;
 403                 smp->sm_off = 0;
 404                 smp->sm_bitmap = 0;
 405                 smp->sm_refcnt = 0;
 406                 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
 407                 smp->sm_free_ndx = SMP2SMF_NDX(smp);
 408 
 409                 sm = SMP2SMF(smp);
 410                 releq = sm->sm_releq;
 411 
 412                 smpfreelist = releq->smq_free;
 413                 if (smpfreelist == 0) {
 414                         releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 415                 } else {
 416                         smp->sm_next = smpfreelist;
 417                         smp->sm_prev = smpfreelist->sm_prev;
 418                         smpfreelist->sm_prev = smp;
 419                         smp->sm_prev->sm_next = smp;
 420                         releq->smq_free = smp->sm_next;
 421                 }
 422 
 423                 /*
 424                  * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
 425                  */
 426                 smp->sm_flags = 0;
 427 
 428 #ifdef  SEGKPM_SUPPORT
 429                 /*
 430                  * Due to the fragile prefetch loop no
 431                  * separate function is used here.
 432                  */
 433                 smp->sm_kpme_next = NULL;
 434                 smp->sm_kpme_prev = NULL;
 435                 smp->sm_kpme_page = NULL;
 436 #endif
 437         }
 438 
 439         /*
 440          * Allocate the per color indices that distribute allocation
 441          * requests over the free lists. Each cpu will have a private
 442          * rotor index to spread the allocations even across the available
 443          * smap freelists. Init the scpu_last_smap field to the first
 444          * smap element so there is no need to check for NULL.
 445          */
 446         smd_cpu =
 447             kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
 448         for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
 449                 int j;
 450                 for (j = 0; j < smd_ncolor; j++)
 451                         scpu->scpu.scpu_free_ndx[j] = j;
 452                 scpu->scpu.scpu_last_smap = smd_smap;
 453         }
 454 
 455         vpm_init();
 456 
 457 #ifdef DEBUG
 458         /*
 459          * Keep track of which colors are used more often.
 460          */
 461         colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
 462 #endif /* DEBUG */
 463 
 464         return (0);
 465 }
 466 
 467 static void
 468 segmap_free(struct seg *seg)
 469 {
 471         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 472 }
 473 
 474 /*
 475  * Do a F_SOFTUNLOCK call over the range requested.
 476  * The range must have already been F_SOFTLOCK'ed.
 477  */
 478 static void
 479 segmap_unlock(
 480         struct hat *hat,
 481         struct seg *seg,
 482         caddr_t addr,
 483         size_t len,
 484         enum seg_rw rw,
 485         struct smap *smp)
 486 {
 487         page_t *pp;
 488         caddr_t adr;
 489         u_offset_t off;
 490         struct vnode *vp;
 491         kmutex_t *smtx;
 492 
 493         ASSERT(smp->sm_refcnt > 0);
 494 
 495 #ifdef lint
 496         seg = seg;
 497 #endif
 498 
 499         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 500 
 501                 /*
 502                  * We're called only from segmap_fault and this was a
 503                  * NOP in case of a kpm based smap, so dangerous things
 504                  * must have happened in the meantime. Pages are prefaulted
 505                  * and locked in segmap_getmapflt and they will not be
 506                  * unlocked until segmap_release.
 507                  */
 508                 panic("segmap_unlock: called with kpm addr %p", (void *)addr);
 509                 /*NOTREACHED*/
 510         }
 511 
 512         vp = smp->sm_vp;
 513         off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 514 
 515         hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
 516         for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
 517                 ushort_t bitmask;
 518 
 519                 /*
 520                  * Use page_find() instead of page_lookup() to
 521                  * find the page since we know that it has
 522                  * "shared" lock.
 523                  */
 524                 pp = page_find(vp, off);
 525                 if (pp == NULL) {
 526                         panic("segmap_unlock: page not found");
 527                         /*NOTREACHED*/
 528                 }
 529 
 530                 if (rw == S_WRITE) {
 531                         hat_setrefmod(pp);
 532                 } else if (rw != S_OTHER) {
 533                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 534                         "segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
 535                         hat_setref(pp);
 536                 }
 537 
 538                 /*
 539                  * Clear bitmap, if the bit corresponding to "off" is set,
 540                  * since the page and translation are being unlocked.
 541                  */
 542                 bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
 543 
 544                 /*
 545                  * Large Files: The following assertion verifies that the
 546                  * (off - smp->sm_off) difference above fits in an int.
 547                  */
 548                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
 549                 smtx = SMAPMTX(smp);
 550                 mutex_enter(smtx);
 551                 if (smp->sm_bitmap & bitmask) {
 552                         smp->sm_bitmap &= ~bitmask;
 553                 }
 554                 mutex_exit(smtx);
 555 
 556                 page_unlock(pp);
 557         }
 558 }
 559 
 560 #define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */
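     /*
      * Note (assuming the usual MAXBSIZE of 8192): MAXPPB evaluates to 2,
      * so the pl[] array in segmap_fault() below holds at most two pages
      * plus the terminating NULL entry for one MAXBSIZE window of 4k
      * pages; on machines with 8k pages a window is a single page.
      */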
 561 
 562 /*
 563  * This routine is called via a machine specific fault handling
 564  * routine.  It is also called by software routines wishing to
 565  * lock or unlock a range of addresses.
 566  *
 567  * Note that this routine expects a page-aligned "addr".
 568  */
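     /*
      * Descriptive summary of the cases handled below: F_SOFTUNLOCK undoes
      * a prior F_SOFTLOCK via segmap_unlock(); the remaining fault types
      * call VOP_GETPAGE() and load translations for the returned pages,
      * with F_SOFTLOCK additionally keeping the pages and translations
      * locked (HAT_LOAD_LOCK) until the matching F_SOFTUNLOCK.
      */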
 569 faultcode_t
 570 segmap_fault(
 571         struct hat *hat,
 572         struct seg *seg,
 573         caddr_t addr,
 574         size_t len,
 575         enum fault_type type,
 576         enum seg_rw rw)
 577 {
 578         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 579         struct smap *smp;
 580         page_t *pp, **ppp;
 581         struct vnode *vp;
 582         u_offset_t off;
 583         page_t *pl[MAXPPB + 1];
 584         uint_t prot;
 585         u_offset_t addroff;
 586         caddr_t adr;
 587         int err;
 588         u_offset_t sm_off;
 589         int hat_flag;
 590 
 591         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 592                 int newpage;
 593                 kmutex_t *smtx;
 594 
 595                 /*
 596                  * Pages are successfully prefaulted and locked in
 597                  * segmap_getmapflt and can't be unlocked until
 598                  * segmap_release. No hat mappings have to be locked
 599                  * and they also can't be unlocked as long as the
 600                  * caller owns an active kpm addr.
 601                  */
 602 #ifndef DEBUG
 603                 if (type != F_SOFTUNLOCK)
 604                         return (0);
 605 #endif
 606 
 607                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 608                         panic("segmap_fault: smap not found "
 609                             "for addr %p", (void *)addr);
 610                         /*NOTREACHED*/
 611                 }
 612 
 613                 smtx = SMAPMTX(smp);
 614 #ifdef  DEBUG
 615                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 616                 if (newpage) {
 617                         cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
 618                             (void *)smp);
 619                 }
 620 
 621                 if (type != F_SOFTUNLOCK) {
 622                         mutex_exit(smtx);
 623                         return (0);
 624                 }
 625 #endif
 626                 mutex_exit(smtx);
 627                 vp = smp->sm_vp;
 628                 sm_off = smp->sm_off;
 629 
 630                 if (vp == NULL)
 631                         return (FC_MAKE_ERR(EIO));
 632 
 633                 ASSERT(smp->sm_refcnt > 0);
 634 
 635                 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 636                 if (addroff + len > MAXBSIZE)
 637                         panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
 638                             (void *)(addr + len));
 639 
 640                 off = sm_off + addroff;
 641 
 642                 pp = page_find(vp, off);
 643 
 644                 if (pp == NULL)
 645                         panic("segmap_fault: softunlock page not found");
 646 
 647                 /*
 648                  * Set ref bit also here in case of S_OTHER to avoid the
 649                  * overhead of supporting other cases than F_SOFTUNLOCK
 650                  * with segkpm. We can do this because the underlying
 651                  * pages are locked anyway.
 652                  */
 653                 if (rw == S_WRITE) {
 654                         hat_setrefmod(pp);
 655                 } else {
 656                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 657                             "segmap_fault:pp %p vp %p offset %llx",
 658                             pp, vp, off);
 659                         hat_setref(pp);
 660                 }
 661 
 662                 return (0);
 663         }
 664 
 665         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 666         smp = GET_SMAP(seg, addr);
 667         vp = smp->sm_vp;
 668         sm_off = smp->sm_off;
 669 
 670         if (vp == NULL)
 671                 return (FC_MAKE_ERR(EIO));
 672 
 673         ASSERT(smp->sm_refcnt > 0);
 674 
 675         addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 676         if (addroff + len > MAXBSIZE) {
 677                 panic("segmap_fault: endaddr %p "
 678                     "exceeds MAXBSIZE chunk", (void *)(addr + len));
 679                 /*NOTREACHED*/
 680         }
 681         off = sm_off + addroff;
 682 
 683         /*
 684          * First handle the easy stuff
 685          */
 686         if (type == F_SOFTUNLOCK) {
 687                 segmap_unlock(hat, seg, addr, len, rw, smp);
 688                 return (0);
 689         }
 690 
 691         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 692             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 693         err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
 694             seg, addr, rw, CRED(), NULL);
 695 
 696         if (err)
 697                 return (FC_MAKE_ERR(err));
 698 
 699         prot &= smd->smd_prot;
 700 
 701         /*
 702          * Handle all pages returned in the pl[] array.
 703          * This loop is coded on the assumption that if
 704          * there was no error from the VOP_GETPAGE routine,
 705          * that the page list returned will contain all the
 706          * needed pages for the vp from [off..off + len].
 707          */
 708         ppp = pl;
 709         while ((pp = *ppp++) != NULL) {
 710                 u_offset_t poff;
 711                 ASSERT(pp->p_vnode == vp);
 712                 hat_flag = HAT_LOAD;
 713 
 714                 /*
 715                  * Verify that the pages returned are within the range
 716                  * of this segmap region.  Note that it is theoretically
 717                  * possible for pages outside this range to be returned,
 718                  * but it is not very likely.  If we cannot use the
 719                  * page here, just release it and go on to the next one.
 720                  */
 721                 if (pp->p_offset < sm_off ||
 722                     pp->p_offset >= sm_off + MAXBSIZE) {
 723                         (void) page_release(pp, 1);
 724                         continue;
 725                 }
 726 
 727                 ASSERT(hat == kas.a_hat);
 728                 poff = pp->p_offset;
 729                 adr = addr + (poff - off);
 730                 if (adr >= addr && adr < addr + len) {
 731                         hat_setref(pp);
 732                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 733                             "segmap_fault:pp %p vp %p offset %llx",
 734                             pp, vp, poff);
 735                         if (type == F_SOFTLOCK)
 736                                 hat_flag = HAT_LOAD_LOCK;
 737                 }
 738 
 739                 /*
 740                  * Deal with VMODSORT pages here. If we know this is a write
 741                  * do the setmod now and allow write protection.
 742                  * As long as it's modified or not S_OTHER, remove write
 743                  * protection. With S_OTHER it's up to the FS to deal with this.
 744                  */
 745                 if (IS_VMODSORT(vp)) {
 746                         if (rw == S_WRITE)
 747                                 hat_setmod(pp);
 748                         else if (rw != S_OTHER && !hat_ismod(pp))
 749                                 prot &= ~PROT_WRITE;
 750                 }
 751 
 752                 hat_memload(hat, adr, pp, prot, hat_flag);
 753                 if (hat_flag != HAT_LOAD_LOCK)
 754                         page_unlock(pp);
 755         }
 756         return (0);
 757 }
 758 
 759 /*
 760  * This routine is used to start I/O on pages asynchronously.
 761  */
 762 static faultcode_t
 763 segmap_faulta(struct seg *seg, caddr_t addr)
 764 {
 765         struct smap *smp;
 766         struct vnode *vp;
 767         u_offset_t off;
 768         int err;
 769 
 770         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 771                 int     newpage;
 772                 kmutex_t *smtx;
 773 
 774                 /*
 775                  * Pages are successfully prefaulted and locked in
 776                  * segmap_getmapflt and can't be unlocked until
 777                  * segmap_release. No hat mappings have to be locked
 778                  * and they also can't be unlocked as long as the
 779                  * caller owns an active kpm addr.
 780                  */
 781 #ifdef  DEBUG
 782                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 783                         panic("segmap_faulta: smap not found "
 784                             "for addr %p", (void *)addr);
 785                         /*NOTREACHED*/
 786                 }
 787 
 788                 smtx = SMAPMTX(smp);
 789                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 790                 mutex_exit(smtx);
 791                 if (newpage)
 792                         cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
 793                             (void *)smp);
 794 #endif
 795                 return (0);
 796         }
 797 
 798         segmapcnt.smp_faulta.value.ul++;
 799         smp = GET_SMAP(seg, addr);
 800 
 801         ASSERT(smp->sm_refcnt > 0);
 802 
 803         vp = smp->sm_vp;
 804         off = smp->sm_off;
 805 
 806         if (vp == NULL) {
 807                 cmn_err(CE_WARN, "segmap_faulta - no vp");
 808                 return (FC_MAKE_ERR(EIO));
 809         }
 810 
 811         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 812             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 813 
 814         err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
 815             & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
 816             seg, addr, S_READ, CRED(), NULL);
 817 
 818         if (err)
 819                 return (FC_MAKE_ERR(err));
 820         return (0);
 821 }
 822 
 823 /*ARGSUSED*/
 824 static int
 825 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
 826 {
 827         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 828 
 829         ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
 830 
 831         /*
 832          * Need not acquire the segment lock since
 833          * "smd_prot" is a read-only field.
 834          */
 835         return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
 836 }
 837 
 838 static int
 839 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 840 {
 841         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 842         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
 843 
 844         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 845 
 846         if (pgno != 0) {
 847                 do {
 848                         protv[--pgno] = smd->smd_prot;
 849                 } while (pgno != 0);
 850         }
 851         return (0);
 852 }
 853 
 854 static u_offset_t
 855 segmap_getoffset(struct seg *seg, caddr_t addr)
 856 {
 857         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 858 
 859         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 860 
 861         return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
 862 }
 863 
 864 /*ARGSUSED*/
 865 static int
 866 segmap_gettype(struct seg *seg, caddr_t addr)
 867 {
 868         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 869 
 870         return (MAP_SHARED);
 871 }
 872 
 873 /*ARGSUSED*/
 874 static int
 875 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 876 {
 877         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 878 
 879         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 880 
 881         /* XXX - This doesn't make any sense */
 882         *vpp = smd->smd_sm->sm_vp;
 883         return (0);
 884 }
 885 
 886 /*
 887  * Check to see if it makes sense to do kluster/read ahead to
 888  * addr + delta relative to the mapping at addr.  We assume here
 889  * that delta is a signed PAGESIZE'd multiple (which can be negative).
 890  *
 891  * For segmap we always "approve" of this action from our standpoint.
 892  */
 893 /*ARGSUSED*/
 894 static int
 895 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 896 {
 897         return (0);
 898 }
 899 
 900 /*
 901  * Special private segmap operations
 902  */
 903 
 904 /*
 905  * Add smap to the appropriate free list.
 906  */
 907 static void
 908 segmap_smapadd(struct smap *smp)
 909 {
 910         struct smfree *sm;
 911         struct smap *smpfreelist;
 912         struct sm_freeq *releq;
 913 
 914         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 915 
 916         if (smp->sm_refcnt != 0) {
 917                 panic("segmap_smapadd");
 918                 /*NOTREACHED*/
 919         }
 920 
 921         sm = &smd_free[smp->sm_free_ndx];
 922         /*
 923          * Add to the tail of the release queue
 924          * Note that sm_releq and sm_allocq could toggle
 925          * before we get the lock. This does not affect
 926          * correctness as the 2 queues are only maintained
 927          * to reduce lock pressure.
 928          */
 929         releq = sm->sm_releq;
 930         if (releq == &sm->sm_freeq[0])
 931                 smp->sm_flags |= SM_QNDX_ZERO;
 932         else
 933                 smp->sm_flags &= ~SM_QNDX_ZERO;
 934         mutex_enter(&releq->smq_mtx);
 935         smpfreelist = releq->smq_free;
 936         if (smpfreelist == 0) {
 937                 int want;
 938 
 939                 releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 940                 /*
 941                  * Both queue mutexes held to set sm_want;
 942                  * snapshot the value before dropping releq mutex.
 943                  * If sm_want appears after the releq mutex is dropped,
 944                  * then the smap just freed is already gone.
 945                  */
 946                 want = sm->sm_want;
 947                 mutex_exit(&releq->smq_mtx);
 948                 /*
 949                  * See if there was a waiter before dropping the releq
 950                  * mutex, then recheck after obtaining the sm_freeq[0]
 951                  * mutex, as another thread may have already signaled.
 952                  */
 953                 if (want) {
 954                         mutex_enter(&sm->sm_freeq[0].smq_mtx);
 955                         if (sm->sm_want)
 956                                 cv_signal(&sm->sm_free_cv);
 957                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
 958                 }
 959         } else {
 960                 smp->sm_next = smpfreelist;
 961                 smp->sm_prev = smpfreelist->sm_prev;
 962                 smpfreelist->sm_prev = smp;
 963                 smp->sm_prev->sm_next = smp;
 964                 mutex_exit(&releq->smq_mtx);
 965         }
 966 }
 967 
 968 
 969 static struct smap *
 970 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
 971 {
 972         struct smap **hpp;
 973         struct smap *tmp;
 974         kmutex_t *hmtx;
 975 
 976         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 977         ASSERT(smp->sm_vp == NULL);
 978         ASSERT(smp->sm_hash == NULL);
 979         ASSERT(smp->sm_prev == NULL);
 980         ASSERT(smp->sm_next == NULL);
 981         ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
 982 
 983         hmtx = SHASHMTX(hashid);
 984 
 985         mutex_enter(hmtx);
 986         /*
 987          * First we need to verify that no one has created a smp
 988          * with (vp, off) as its tag before us.
 989          */
 990         for (tmp = smd_hash[hashid].sh_hash_list;
 991             tmp != NULL; tmp = tmp->sm_hash)
 992                 if (tmp->sm_vp == vp && tmp->sm_off == off)
 993                         break;
 994 
 995         if (tmp == NULL) {
 996                 /*
 997                  * No one created one yet.
 998                  *
 999                  * Funniness here - we don't increment the ref count on the
1000                  * vnode even though we have another pointer to it here.
1001                  * The reason for this is that we don't want the fact that
1002                  * a seg_map entry somewhere refers to a vnode to prevent the
1003                  * vnode itself from going away.  This is because this
1004                  * reference to the vnode is a "soft one".  In the case where
1005                  * a mapping is being used by a rdwr [or directory routine?]
1006                  * there already has to be a non-zero ref count on the vnode.
1007                  * In the case where the vp has been freed and the smap
1008                  * structure is on the free list, there are no pages in memory
1009                  * that can refer to the vnode.  Thus even if we reuse the same
1010                  * vnode/smap structure for a vnode which has the same
1011                  * address but represents a different object, we are ok.
1012                  */
1013                 smp->sm_vp = vp;
1014                 smp->sm_off = off;
1015 
1016                 hpp = &smd_hash[hashid].sh_hash_list;
1017                 smp->sm_hash = *hpp;
1018                 *hpp = smp;
1019 #ifdef SEGMAP_HASHSTATS
1020                 smd_hash_len[hashid]++;
1021 #endif
1022         }
1023         mutex_exit(hmtx);
1024 
1025         return (tmp);
1026 }
1027 
1028 static void
1029 segmap_hashout(struct smap *smp)
1030 {
1031         struct smap **hpp, *hp;
1032         struct vnode *vp;
1033         kmutex_t *mtx;
1034         int hashid;
1035         u_offset_t off;
1036 
1037         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1038 
1039         vp = smp->sm_vp;
1040         off = smp->sm_off;
1041 
1042         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1043         mtx = SHASHMTX(hashid);
1044         mutex_enter(mtx);
1045 
1046         hpp = &smd_hash[hashid].sh_hash_list;
1047         for (;;) {
1048                 hp = *hpp;
1049                 if (hp == NULL) {
1050                         panic("segmap_hashout");
1051                         /*NOTREACHED*/
1052                 }
1053                 if (hp == smp)
1054                         break;
1055                 hpp = &hp->sm_hash;
1056         }
1057 
1058         *hpp = smp->sm_hash;
1059         smp->sm_hash = NULL;
1060 #ifdef SEGMAP_HASHSTATS
1061         smd_hash_len[hashid]--;
1062 #endif
1063         mutex_exit(mtx);
1064 
1065         smp->sm_vp = NULL;
1066         smp->sm_off = (u_offset_t)0;
1067 
1068 }
1069 
1070 /*
1071  * Attempt to free unmodified, unmapped, and non-locked segmap
1072  * pages.
1073  */
1074 void
1075 segmap_pagefree(struct vnode *vp, u_offset_t off)
1076 {
1077         u_offset_t pgoff;
1078         page_t  *pp;
1079 
1080         for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1081 
1082                 if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1083                         continue;
1084 
1085                 switch (page_release(pp, 1)) {
1086                 case PGREL_NOTREL:
1087                         segmapcnt.smp_free_notfree.value.ul++;
1088                         break;
1089                 case PGREL_MOD:
1090                         segmapcnt.smp_free_dirty.value.ul++;
1091                         break;
1092                 case PGREL_CLEAN:
1093                         segmapcnt.smp_free.value.ul++;
1094                         break;
1095                 }
1096         }
1097 }
1098 
1099 /*
1100  * Locks held on entry: smap lock
1101  * Locks held on exit : smap lock.
1102  */
1103 
1104 static void
1105 grab_smp(struct smap *smp, page_t *pp)
1106 {
1107         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1108         ASSERT(smp->sm_refcnt == 0);
1109 
1110         if (smp->sm_vp != (struct vnode *)NULL) {
1111                 struct vnode    *vp = smp->sm_vp;
1112                 u_offset_t      off = smp->sm_off;
1113                 /*
1114                  * Destroy old vnode association and
1115                  * unload any hardware translations to
1116                  * the old object.
1117                  */
1118                 smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1119                 segmap_hashout(smp);
1120 
1121                 /*
1122                  * This node is off freelist and hashlist,
1123                  * so there is no reason to drop/reacquire sm_mtx
1124                  * across calls to hat_unload.
1125                  */
1126                 if (segmap_kpm) {
1127                         caddr_t vaddr;
1128                         int hat_unload_needed = 0;
1129 
1130                         /*
1131                          * unload kpm mapping
1132                          */
1133                         if (pp != NULL) {
1134                                 vaddr = hat_kpm_page2va(pp, 1);
1135                                 hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1136                                 page_unlock(pp);
1137                         }
1138 
1139                         /*
1140                          * Check if we have (also) the rare case of a
1141                          * non kpm mapping.
1142                          */
1143                         if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1144                                 hat_unload_needed = 1;
1145                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1146                         }
1147 
1148                         if (hat_unload_needed) {
1149                                 hat_unload(kas.a_hat, segkmap->s_base +
1150                                     ((smp - smd_smap) * MAXBSIZE),
1151                                     MAXBSIZE, HAT_UNLOAD);
1152                         }
1153 
1154                 } else {
1155                         ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1156                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1157                         hat_unload(kas.a_hat, segkmap->s_base +
1158                             ((smp - smd_smap) * MAXBSIZE),
1159                             MAXBSIZE, HAT_UNLOAD);
1160                 }
1161                 segmap_pagefree(vp, off);
1162         }
1163 }
1164 
1165 static struct smap *
1166 get_free_smp(int free_ndx)
1167 {
1168         struct smfree *sm;
1169         kmutex_t *smtx;
1170         struct smap *smp, *first;
1171         struct sm_freeq *allocq, *releq;
1172         struct kpme *kpme;
1173         page_t *pp = NULL;
1174         int end_ndx, page_locked = 0;
1175 
1176         end_ndx = free_ndx;
1177         sm = &smd_free[free_ndx];
1178 
1179 retry_queue:
1180         allocq = sm->sm_allocq;
1181         mutex_enter(&allocq->smq_mtx);
1182 
1183         if ((smp = allocq->smq_free) == NULL) {
1184 
1185 skip_queue:
1186                 /*
1187                  * The alloc list is empty or this queue is being skipped;
1188                  * first see if the allocq toggled.
1189                  */
1190                 if (sm->sm_allocq != allocq) {
1191                         /* queue changed */
1192                         mutex_exit(&allocq->smq_mtx);
1193                         goto retry_queue;
1194                 }
1195                 releq = sm->sm_releq;
1196                 if (!mutex_tryenter(&releq->smq_mtx)) {
1197                         /* cannot get releq; a free smp may be there now */
1198                         mutex_exit(&allocq->smq_mtx);
1199 
1200                         /*
1201                          * This loop could spin forever if this thread has
1202                          * higher priority than the thread that is holding
1203                          * releq->smq_mtx. In order to force the other thread
1204                          * to run, we'll lock/unlock the mutex which is safe
1205                          * since we just unlocked the allocq mutex.
1206                          */
1207                         mutex_enter(&releq->smq_mtx);
1208                         mutex_exit(&releq->smq_mtx);
1209                         goto retry_queue;
1210                 }
1211                 if (releq->smq_free == NULL) {
1212                         /*
1213                          * This freelist is empty.
1214                          * This should not happen unless clients
1215                          * are failing to release the segmap
1216                          * window after accessing the data.
1217                          * Before resorting to sleeping, try
1218                          * the next list of the same color.
1219                          */
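                             /*
                              * Note: smd_nfree is smd_ncolor * nfreelist and
                              * is a power of two, so stepping free_ndx by
                              * smd_ncolor (masked by smd_freemsk) visits the
                              * other freelists of the same virtual color
                              * before giving up and sleeping below.
                              */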
1220                         free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1221                         if (free_ndx != end_ndx) {
1222                                 mutex_exit(&releq->smq_mtx);
1223                                 mutex_exit(&allocq->smq_mtx);
1224                                 sm = &smd_free[free_ndx];
1225                                 goto retry_queue;
1226                         }
1227                         /*
1228                          * Tried all freelists of the same color once,
1229                          * wait on this list and hope something gets freed.
1230                          */
1231                         segmapcnt.smp_get_nofree.value.ul++;
1232                         sm->sm_want++;
1233                         mutex_exit(&sm->sm_freeq[1].smq_mtx);
1234                         cv_wait(&sm->sm_free_cv,
1235                             &sm->sm_freeq[0].smq_mtx);
1236                         sm->sm_want--;
1237                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
1238                         sm = &smd_free[free_ndx];
1239                         goto retry_queue;
1240                 } else {
1241                         /*
1242                          * Something on the rele queue; flip the alloc
1243                          * and rele queues and retry.
1244                          */
1245                         sm->sm_allocq = releq;
1246                         sm->sm_releq = allocq;
1247                         mutex_exit(&allocq->smq_mtx);
1248                         mutex_exit(&releq->smq_mtx);
1249                         if (page_locked) {
1250                                 delay(hz >> 2);
1251                                 page_locked = 0;
1252                         }
1253                         goto retry_queue;
1254                 }
1255         } else {
1256                 /*
1257                  * Fastpath the case we get the smap mutex
1258                  * on the first try.
1259                  */
1260                 first = smp;
1261 next_smap:
1262                 smtx = SMAPMTX(smp);
1263                 if (!mutex_tryenter(smtx)) {
1264                         /*
1265                          * Another thread is trying to reclaim this slot.
1266                          * Skip to the next queue or smap.
1267                          */
1268                         if ((smp = smp->sm_next) == first) {
1269                                 goto skip_queue;
1270                         } else {
1271                                 goto next_smap;
1272                         }
1273                 } else {
1274                         /*
1275                          * if kpme exists, get shared lock on the page
1276                          */
1277                         if (segmap_kpm && smp->sm_vp != NULL) {
1278 
1279                                 kpme = GET_KPME(smp);
1280                                 pp = kpme->kpe_page;
1281 
1282                                 if (pp != NULL) {
1283                                         if (!page_trylock(pp, SE_SHARED)) {
1284                                                 smp = smp->sm_next;
1285                                                 mutex_exit(smtx);
1286                                                 page_locked = 1;
1287 
1288                                                 pp = NULL;
1289 
1290                                                 if (smp == first) {
1291                                                         goto skip_queue;
1292                                                 } else {
1293                                                         goto next_smap;
1294                                                 }
1295                                         } else {
1296                                                 if (kpme->kpe_page == NULL) {
1297                                                         page_unlock(pp);
1298                                                         pp = NULL;
1299                                                 }
1300                                         }
1301                                 }
1302                         }
1303 
1304                         /*
1305                          * At this point, we've selected smp.  Remove smp
1306                          * from its freelist.  If smp is the first one in
1307                          * the freelist, update the head of the freelist.
1308                          */
1309                         if (first == smp) {
1310                                 ASSERT(first == allocq->smq_free);
1311                                 allocq->smq_free = smp->sm_next;
1312                         }
1313 
1314                         /*
1315                          * if the head of the freelist still points to smp,
1316                          * then there are no more free smaps in that list.
1317                          */
1318                         if (allocq->smq_free == smp)
1319                                 /*
1320                                  * Took the last one
1321                                  */
1322                                 allocq->smq_free = NULL;
1323                         else {
1324                                 smp->sm_prev->sm_next = smp->sm_next;
1325                                 smp->sm_next->sm_prev = smp->sm_prev;
1326                         }
1327                         mutex_exit(&allocq->smq_mtx);
1328                         smp->sm_prev = smp->sm_next = NULL;
1329 
1330                         /*
1331                          * if pp != NULL, pp must have been locked;
1332                          * grab_smp() unlocks pp.
1333                          */
1334                         ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1335                         grab_smp(smp, pp);
1336                         /* return smp locked. */
1337                         ASSERT(SMAPMTX(smp) == smtx);
1338                         ASSERT(MUTEX_HELD(smtx));
1339                         return (smp);
1340                 }
1341         }
1342 }
1343 
1344 /*
1345  * Special public segmap operations
1346  */
1347 
1348 /*
1349  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1350  * If softlock is TRUE, then set things up so that it looks like a call
1351  * to segmap_fault with F_SOFTLOCK.
1352  *
1353  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1354  *
1355  * All fields in the generic segment (struct seg) are considered to be
1356  * read-only for "segmap" even though the kernel address space (kas) may
1357  * not be locked, hence no lock is needed to access them.
1358  */
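     /*
      * Typical usage (a sketch, not taken from this file): a file system
      * write path maps a MAXBSIZE window with segmap_getmap() or
      * segmap_getmapflt(), calls segmap_pagecreate() when it is about to
      * overwrite whole pages (so no read from backing store is needed),
      * copies the data in, and then unwinds with segmap_pageunlock() and
      * segmap_release().
      */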
1359 int
1360 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1361 {
1362         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1363         page_t *pp;
1364         u_offset_t off;
1365         struct smap *smp;
1366         struct vnode *vp;
1367         caddr_t eaddr;
1368         int newpage = 0;
1369         uint_t prot;
1370         kmutex_t *smtx;
1371         int hat_flag;
1372 
1373         ASSERT(seg->s_as == &kas);
1374 
1375         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1376                 /*
1377                  * Pages are successfully prefaulted and locked in
1378                  * segmap_getmapflt and can't be unlocked until
1379                  * segmap_release. The SM_KPM_NEWPAGE flag is set
1380                  * in segmap_pagecreate_kpm when new pages are created,
1381                  * and it is returned as the "newpage" indication here.
1382                  */
1383                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1384                         panic("segmap_pagecreate: smap not found "
1385                             "for addr %p", (void *)addr);
1386                         /*NOTREACHED*/
1387                 }
1388 
1389                 smtx = SMAPMTX(smp);
1390                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1391                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1392                 mutex_exit(smtx);
1393 
1394                 return (newpage);
1395         }
1396 
1397         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1398 
1399         eaddr = addr + len;
1400         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1401 
1402         smp = GET_SMAP(seg, addr);
1403 
1404         /*
1405          * We don't grab smp mutex here since we assume the smp
1406          * has a refcnt set already which prevents the slot from
1407          * changing its id.
1408          */
1409         ASSERT(smp->sm_refcnt > 0);
1410 
1411         vp = smp->sm_vp;
1412         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1413         prot = smd->smd_prot;
1414 
1415         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1416                 hat_flag = HAT_LOAD;
1417                 pp = page_lookup(vp, off, SE_SHARED);
1418                 if (pp == NULL) {
1419                         ushort_t bitindex;
1420 
1421                         if ((pp = page_create_va(vp, off,
1422                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1423                                 panic("segmap_pagecreate: page_create failed");
1424                                 /*NOTREACHED*/
1425                         }
1426                         newpage = 1;
1427                         page_io_unlock(pp);
1428 
1429                         /*
1430                          * Since pages created here do not contain valid
1431                          * data until the caller writes into them, we keep
1432                          * the "exclusive" lock to prevent other users
1433                          * from accessing the page.  We also have to lock
1434                          * the translation to prevent a fault from
1435                          * occurring when the virtual address mapped by
1436                          * this page is written into; handling that fault
1437                          * would need the very "exclusive" lock we still
1438                          * hold, so it would deadlock.
1439                          */
1440                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1441 
1442                         /*
1443                          * Large Files: The following assertion is to
1444                          * verify the cast above.
1445                          */
1446                         ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1447                         smtx = SMAPMTX(smp);
1448                         mutex_enter(smtx);
1449                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1450                         mutex_exit(smtx);
1451 
1452                         hat_flag = HAT_LOAD_LOCK;
1453                 } else if (softlock) {
1454                         hat_flag = HAT_LOAD_LOCK;
1455                 }
1456 
1457                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1458                         hat_setmod(pp);
1459 
1460                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1461 
1462                 if (hat_flag != HAT_LOAD_LOCK)
1463                         page_unlock(pp);
1464 
1465                 TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1466                     "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1467                     seg, addr, pp, vp, off);
1468         }
1469 
1470         return (newpage);
1471 }
1472 
1473 void
1474 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1475 {
1476         struct smap     *smp;
1477         ushort_t        bitmask;
1478         page_t          *pp;
1479         struct  vnode   *vp;
1480         u_offset_t      off;
1481         caddr_t         eaddr;
1482         kmutex_t        *smtx;
1483 
1484         ASSERT(seg->s_as == &kas);
1485 
1486         eaddr = addr + len;
1487         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1488 
1489         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1490                 /*
1491                  * Pages are successfully prefaulted and locked in
1492                  * segmap_getmapflt and can't be unlocked until
1493                  * segmap_release, so no pages or hat mappings have
1494                  * to be unlocked at this point.
1495                  */
1496 #ifdef DEBUG
1497                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1498                         panic("segmap_pageunlock: smap not found "
1499                             "for addr %p", (void *)addr);
1500                         /*NOTREACHED*/
1501                 }
1502 
1503                 ASSERT(smp->sm_refcnt > 0);
1504                 mutex_exit(SMAPMTX(smp));
1505 #endif
1506                 return;
1507         }
1508 
1509         smp = GET_SMAP(seg, addr);
1510         smtx = SMAPMTX(smp);
1511 
1512         ASSERT(smp->sm_refcnt > 0);
1513 
1514         vp = smp->sm_vp;
1515         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1516 
1517         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1518                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1519 
1520                 /*
1521                  * Large Files: The following assertion verifies
1522                  * the correctness of the cast to (int) above.
1523                  */
1524                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1525 
1526                 /*
1527                  * If the bit corresponding to "off" is set,
1528                  * clear this bit in the bitmap, unlock translations,
1529                  * and release the "exclusive" lock on the page.
1530                  */
1531                 if (smp->sm_bitmap & bitmask) {
1532                         mutex_enter(smtx);
1533                         smp->sm_bitmap &= ~bitmask;
1534                         mutex_exit(smtx);
1535 
1536                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1537 
1538                         /*
1539                          * Use page_find() instead of page_lookup() to
1540                          * find the page since we know that it has
1541                          * "exclusive" lock.
1542                          */
1543                         pp = page_find(vp, off);
1544                         if (pp == NULL) {
1545                                 panic("segmap_pageunlock: page not found");
1546                                 /*NOTREACHED*/
1547                         }
1548                         if (rw == S_WRITE) {
1549                                 hat_setrefmod(pp);
1550                         } else if (rw != S_OTHER) {
1551                                 hat_setref(pp);
1552                         }
1553 
1554                         page_unlock(pp);
1555                 }
1556         }
1557 }
1558 
1559 caddr_t
1560 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1561 {
1562         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1563 }
1564 
1565 /*
1566  * This is the magic virtual address that offset 0 of an ELF
1567  * file gets mapped to in user space. This is used to pick
1568  * the VAC color on the freelist.
1569  */
1570 #define ELF_OFFZERO_VA  (0x10000)
1571 /*
1572  * segmap_getmapflt allocates a MAXBSIZE-sized slot to map the vnode vp
1573  * in the range [off, off + len). off doesn't need to be MAXBSIZE aligned.
1574  * The returned address is always MAXBSIZE aligned.
1575  *
1576  * If forcefault is nonzero and the MMU translations haven't yet been created,
1577  * segmap_getmapflt will call segmap_fault(..., F_INVAL, rw) to create them.
1578  */
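/*
 * A minimal read-side sketch (illustrative only; "off", "n" and "uio"
 * are hypothetical, with "off" assumed MAXBSIZE aligned and "n" at most
 * MAXBSIZE):
 *
 *	base = segmap_getmapflt(segkmap, vp, off, n, 1, S_READ);
 *	error = uiomove(base, n, UIO_READ, uio);
 *	(void) segmap_release(segkmap, base, 0);
 */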
1579 caddr_t
1580 segmap_getmapflt(
1581         struct seg *seg,
1582         struct vnode *vp,
1583         u_offset_t off,
1584         size_t len,
1585         int forcefault,
1586         enum seg_rw rw)
1587 {
1588         struct smap *smp, *nsmp;
1589         extern struct vnode *common_specvp();
1590         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1591         u_offset_t baseoff;
1592         int newslot;
1593         caddr_t vaddr;
1594         int color, hashid;
1595         kmutex_t *hashmtx, *smapmtx;
1596         struct smfree *sm;
1597         page_t  *pp;
1598         struct kpme *kpme;
1599         uint_t  prot;
1600         caddr_t base;
1601         page_t  *pl[MAXPPB + 1];
1602         int     error;
1603         int     is_kpm = 1;
1604 
1605         ASSERT(seg->s_as == &kas);
1606         ASSERT(seg == segkmap);
1607 
1608         baseoff = off & (offset_t)MAXBMASK;
1609         if (off + len > baseoff + MAXBSIZE) {
1610                 panic("segmap_getmap bad len");
1611                 /*NOTREACHED*/
1612         }
1613 
1614         /*
1615          * If this is a block device we have to be sure to use the
1616          * "common" block device vnode for the mapping.
1617          */
1618         if (vp->v_type == VBLK)
1619                 vp = common_specvp(vp);
1620 
1621         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1622 
1623         if (segmap_kpm == 0 ||
1624             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1625                 is_kpm = 0;
1626         }
1627 
1628         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1629         hashmtx = SHASHMTX(hashid);
1630 
1631 retry_hash:
1632         mutex_enter(hashmtx);
1633         for (smp = smd_hash[hashid].sh_hash_list;
1634             smp != NULL; smp = smp->sm_hash)
1635                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1636                         break;
1637         mutex_exit(hashmtx);
1638 
1639 vrfy_smp:
1640         if (smp != NULL) {
1641 
1642                 ASSERT(vp->v_count != 0);
1643 
1644                 /*
1645                  * Get smap lock and recheck its tag. The hash lock
1646                  * is dropped since the hash is based on (vp, off)
1647                  * and (vp, off) won't change when we have smap mtx.
1648                  */
1649                 smapmtx = SMAPMTX(smp);
1650                 mutex_enter(smapmtx);
1651                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1652                         mutex_exit(smapmtx);
1653                         goto retry_hash;
1654                 }
1655 
1656                 if (smp->sm_refcnt == 0) {
1657 
1658                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1659 
1660                         /*
1661                          * Could still be on the free list. However, this
1662                          * could also be an smp that is transitioning from
1663                          * the free list when we have too much contention
1664                          * for the smapmtx locks. In this case, we have an
1665                          * unlocked smp that is not on the free list any
1666                          * longer, but still has a 0 refcnt.  The only way
1667                          * to be sure is to check the freelist pointers.
1668                          * Since we now have the smapmtx, we are guaranteed
1669                          * that the (vp, off) won't change, so we are safe
1670                          * to reclaim it.  get_free_smp() knows that this
1671                          * can happen, and it will check the refcnt.
1672                          */
1673 
1674                         if (smp->sm_next != NULL) {
1675                                 struct sm_freeq *freeq;
1676 
1677                                 ASSERT(smp->sm_prev != NULL);
1678                                 sm = &smd_free[smp->sm_free_ndx];
1679 
1680                                 if (smp->sm_flags & SM_QNDX_ZERO)
1681                                         freeq = &sm->sm_freeq[0];
1682                                 else
1683                                         freeq = &sm->sm_freeq[1];
1684 
1685                                 mutex_enter(&freeq->smq_mtx);
1686                                 if (freeq->smq_free != smp) {
1687                                         /*
1688                                          * fastpath normal case
1689                                          */
1690                                         smp->sm_prev->sm_next = smp->sm_next;
1691                                         smp->sm_next->sm_prev = smp->sm_prev;
1692                                 } else if (smp == smp->sm_next) {
1693                                         /*
1694                                          * Taking the last smap on freelist
1695                                          */
1696                                         freeq->smq_free = NULL;
1697                                 } else {
1698                                         /*
1699                                          * Reclaiming 1st smap on list
1700                                          */
1701                                         freeq->smq_free = smp->sm_next;
1702                                         smp->sm_prev->sm_next = smp->sm_next;
1703                                         smp->sm_next->sm_prev = smp->sm_prev;
1704                                 }
1705                                 mutex_exit(&freeq->smq_mtx);
1706                                 smp->sm_prev = smp->sm_next = NULL;
1707                         } else {
1708                                 ASSERT(smp->sm_prev == NULL);
1709                                 segmapcnt.smp_stolen.value.ul++;
1710                         }
1711 
1712                 } else {
1713                         segmapcnt.smp_get_use.value.ul++;
1714                 }
1715                 smp->sm_refcnt++;            /* another user */
1716 
1717                 /*
1718                  * We don't invoke segmap_fault via TLB miss, so we set ref
1719                  * and mod bits in advance. For S_OTHER we set them in
1720                  * segmap_fault F_SOFTUNLOCK.
1721                  */
1722                 if (is_kpm) {
1723                         if (rw == S_WRITE) {
1724                                 smp->sm_flags |= SM_WRITE_DATA;
1725                         } else if (rw == S_READ) {
1726                                 smp->sm_flags |= SM_READ_DATA;
1727                         }
1728                 }
1729                 mutex_exit(smapmtx);
1730 
1731                 newslot = 0;
1732         } else {
1733 
1734                 uint32_t free_ndx, *free_ndxp;
1735                 union segmap_cpu *scpu;
1736 
1737                 /*
1738                  * On a PAC machine or a machine with anti-alias
1739                  * hardware, smd_colormsk will be zero.
1740                  *
1741                  * On a VAC machine, pick the color by offset in the file
1742                  * so we won't get VAC conflicts on ELF files.
1743                  * On data files, color does not matter, but we
1744                  * don't know what kind of file it is, so we always
1745                  * pick the color by offset. This causes the color
1746                  * corresponding to file offset zero to be used more
1747                  * heavily.
1748                  */
1749                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1750                 scpu = smd_cpu+CPU->cpu_seqid;
1751                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1752                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1753 #ifdef DEBUG
1754                 colors_used[free_ndx]++;
1755 #endif /* DEBUG */
1756 
1757                 /*
1758                  * Get a locked smp slot from the free list.
1759                  */
1760                 smp = get_free_smp(free_ndx);
1761                 smapmtx = SMAPMTX(smp);
1762 
1763                 ASSERT(smp->sm_vp == NULL);
1764 
1765                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1766                         /*
1767                          * Failed to hash in; a slot for (vp, off) now
1768                          * exists.  Put ours back and use the existing one.
1769                          */
1770                         segmap_smapadd(smp);
1771                         mutex_exit(smapmtx);
1772 
1773                         smp = nsmp;
1774                         goto vrfy_smp;
1775                 }
1776                 smp->sm_refcnt++;            /* another user */
1777 
1778                 /*
1779                  * We don't invoke segmap_fault via TLB miss, so we set ref
1780                  * and mod bits in advance. For S_OTHER we set them in
1781                  * segmap_fault F_SOFTUNLOCK.
1782                  */
1783                 if (is_kpm) {
1784                         if (rw == S_WRITE) {
1785                                 smp->sm_flags |= SM_WRITE_DATA;
1786                         } else if (rw == S_READ) {
1787                                 smp->sm_flags |= SM_READ_DATA;
1788                         }
1789                 }
1790                 mutex_exit(smapmtx);
1791 
1792                 newslot = 1;
1793         }
1794 
1795         if (!is_kpm)
1796                 goto use_segmap_range;
1797 
1798         /*
1799          * Use segkpm
1800          */
1801         /* Lint directive required until 6746211 is fixed */
1802         /*CONSTCOND*/
1803         ASSERT(PAGESIZE == MAXBSIZE);
1804 
1805         /*
1806          * Remember the last smp faulted on this CPU.
1807          */
1808         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1809 
1810         if (forcefault == SM_PAGECREATE) {
1811                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1812                 return (baseaddr);
1813         }
1814 
1815         if (newslot == 0 &&
1816             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1817 
1818                 /* fastpath */
1819                 switch (rw) {
1820                 case S_READ:
1821                 case S_WRITE:
1822                         if (page_trylock(pp, SE_SHARED)) {
1823                                 if (PP_ISFREE(pp) ||
1824                                     !(pp->p_vnode == vp &&
1825                                     pp->p_offset == baseoff)) {
1826                                         page_unlock(pp);
1827                                         pp = page_lookup(vp, baseoff,
1828                                             SE_SHARED);
1829                                 }
1830                         } else {
1831                                 pp = page_lookup(vp, baseoff, SE_SHARED);
1832                         }
1833 
1834                         if (pp == NULL) {
1835                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1836                                 break;
1837                         }
1838 
1839                         if (rw == S_WRITE &&
1840                             hat_page_getattr(pp, P_MOD | P_REF) !=
1841                             (P_MOD | P_REF)) {
1842                                 page_unlock(pp);
1843                                 break;
1844                         }
1845 
1846                         /*
1847                          * We have the p_selock as reader, grab_smp
1848                          * can't hit us, we have bumped the smap
1849                          * refcnt and hat_pageunload needs the
1850                          * p_selock exclusive.
1851                          */
1852                         kpme = GET_KPME(smp);
1853                         if (kpme->kpe_page == pp) {
1854                                 baseaddr = hat_kpm_page2va(pp, 0);
1855                         } else if (kpme->kpe_page == NULL) {
1856                                 baseaddr = hat_kpm_mapin(pp, kpme);
1857                         } else {
1858                                 panic("segmap_getmapflt: stale "
1859                                     "kpme page, kpme %p", (void *)kpme);
1860                                 /*NOTREACHED*/
1861                         }
1862 
1863                         /*
1864                          * We don't invoke segmap_fault via TLB miss,
1865                          * so we set ref and mod bits in advance.
1866                          * For S_OTHER we set them in segmap_fault
1867                          * F_SOFTUNLOCK.
1868                          */
1869                         if (rw == S_READ && !hat_isref(pp))
1870                                 hat_setref(pp);
1871 
1872                         return (baseaddr);
1873                 default:
1874                         break;
1875                 }
1876         }
1877 
1878         base = segkpm_create_va(baseoff);
1879         error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1880             seg, base, rw, CRED(), NULL);
1881 
1882         pp = pl[0];
1883         if (error || pp == NULL) {
1884                 /*
1885                  * Use segmap address slot and let segmap_fault deal
1886                  * with the error cases. There is no error return
1887                  * possible here.
1888                  */
1889                 goto use_segmap_range;
1890         }
1891 
1892         ASSERT(pl[1] == NULL);
1893 
1894         /*
1895          * When prot is not returned with PROT_ALL, the returned pages
1896          * are not backed by fs blocks. For most segmap users this is
1897          * no problem; they don't write to the pages in the same
1898          * request and therefore don't rely on a subsequent
1899          * trap-driven segmap_fault. For SM_LOCKPROTO users it is
1900          * safer to use segkmap addresses so that protection faults
1901          * go through segmap_fault.
1902          */
1903         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1904                 /*
1905                  * Use segmap address slot and let segmap_fault
1906                  * do the error return.
1907                  */
1908                 ASSERT(rw != S_WRITE);
1909                 ASSERT(PAGE_LOCKED(pp));
1910                 page_unlock(pp);
1911                 forcefault = 0;
1912                 goto use_segmap_range;
1913         }
1914 
1915         /*
1916          * We have the p_selock as reader, grab_smp can't hit us, we
1917          * have bumped the smap refcnt and hat_pageunload needs the
1918          * p_selock exclusive.
1919          */
1920         kpme = GET_KPME(smp);
1921         if (kpme->kpe_page == pp) {
1922                 baseaddr = hat_kpm_page2va(pp, 0);
1923         } else if (kpme->kpe_page == NULL) {
1924                 baseaddr = hat_kpm_mapin(pp, kpme);
1925         } else {
1926                 panic("segmap_getmapflt: stale kpme page after "
1927                     "VOP_GETPAGE, kpme %p", (void *)kpme);
1928                 /*NOTREACHED*/
1929         }
1930 
1931         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1932 
1933         return (baseaddr);
1934 
1936 use_segmap_range:
1937         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1938         TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1939             "segmap_getmap:seg %p addr %p vp %p offset %llx",
1940             seg, baseaddr, vp, baseoff);
1941 
1942         /*
1943          * Prefault the translations
1944          */
1945         vaddr = baseaddr + (off - baseoff);
1946         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1947 
1948                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1949                     (uintptr_t)PAGEMASK);
1950 
1951                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1952                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1953                     F_INVAL, rw);
1954         }
1955 
1956         return (baseaddr);
1957 }
1958 
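/*
 * Release the mapping at addr and drop the smap reference.  If any flag
 * other than SM_DONTNEED is set, the MAXBSIZE block is pushed back to
 * the vnode with VOP_PUTPAGE(); the B_* flags passed are derived from
 * SM_ASYNC, SM_INVAL and SM_DESTROY, plus SM_FREE and SM_DONTNEED when
 * this was the last reference.  Returns the VOP_PUTPAGE() error, or 0
 * when no putpage was needed.
 */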
1959 int
1960 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1961 {
1962         struct smap     *smp;
1963         int             error;
1964         int             bflags = 0;
1965         struct vnode    *vp;
1966         u_offset_t      offset;
1967         kmutex_t        *smtx;
1968         int             is_kpm = 0;
1969         page_t          *pp;
1970 
1971         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1972 
1973                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1974                         panic("segmap_release: addr %p not "
1975                             "MAXBSIZE aligned", (void *)addr);
1976                         /*NOTREACHED*/
1977                 }
1978 
1979                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1980                         panic("segmap_release: smap not found "
1981                             "for addr %p", (void *)addr);
1982                         /*NOTREACHED*/
1983                 }
1984 
1985                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
1986                     "segmap_relmap:seg %p addr %p smp %p",
1987                     seg, addr, smp);
1988 
1989                 smtx = SMAPMTX(smp);
1990 
1991                 /*
1992                  * For compatibility reasons segmap_pagecreate_kpm sets this
1993                  * flag to allow a subsequent segmap_pagecreate to return
1994                  * it as the "newpage" indication. If segmap_pagecreate is
1995                  * never called, we clear the flag now.
1996                  */
1997                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1998                 is_kpm = 1;
1999                 if (smp->sm_flags & SM_WRITE_DATA) {
2000                         hat_setrefmod(pp);
2001                 } else if (smp->sm_flags & SM_READ_DATA) {
2002                         hat_setref(pp);
2003                 }
2004         } else {
2005                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2006                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
2007                         panic("segmap_release: bad addr %p", (void *)addr);
2008                         /*NOTREACHED*/
2009                 }
2010                 smp = GET_SMAP(seg, addr);
2011 
2012                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2013                     "segmap_relmap:seg %p addr %p smp %p",
2014                     seg, addr, smp);
2015 
2016                 smtx = SMAPMTX(smp);
2017                 mutex_enter(smtx);
2018                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2019         }
2020 
2021         ASSERT(smp->sm_refcnt > 0);
2022 
2023         /*
2024          * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2025          * are set.
2026          */
2027         if ((flags & ~SM_DONTNEED) != 0) {
2028                 if (flags & SM_WRITE)
2029                         segmapcnt.smp_rel_write.value.ul++;
2030                 if (flags & SM_ASYNC) {
2031                         bflags |= B_ASYNC;
2032                         segmapcnt.smp_rel_async.value.ul++;
2033                 }
2034                 if (flags & SM_INVAL) {
2035                         bflags |= B_INVAL;
2036                         segmapcnt.smp_rel_abort.value.ul++;
2037                 }
2038                 if (flags & SM_DESTROY) {
2039                         bflags |= (B_INVAL|B_TRUNC);
2040                         segmapcnt.smp_rel_abort.value.ul++;
2041                 }
2042                 if (smp->sm_refcnt == 1) {
2043                         /*
2044                          * We only bother doing the FREE and DONTNEED flags
2045                          * if no one else is still referencing this mapping.
2046                          */
2047                         if (flags & SM_FREE) {
2048                                 bflags |= B_FREE;
2049                                 segmapcnt.smp_rel_free.value.ul++;
2050                         }
2051                         if (flags & SM_DONTNEED) {
2052                                 bflags |= B_DONTNEED;
2053                                 segmapcnt.smp_rel_dontneed.value.ul++;
2054                         }
2055                 }
2056         } else {
2057                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2058         }
2059 
2060         vp = smp->sm_vp;
2061         offset = smp->sm_off;
2062 
2063         if (--smp->sm_refcnt == 0) {
2064 
2065                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2066 
2067                 if (flags & (SM_INVAL|SM_DESTROY)) {
2068                         segmap_hashout(smp);    /* remove map info */
2069                         if (is_kpm) {
2070                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2071                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2072                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2073                                         hat_unload(kas.a_hat, segkmap->s_base +
2074                                             ((smp - smd_smap) * MAXBSIZE),
2075                                             MAXBSIZE, HAT_UNLOAD);
2076                                 }
2077 
2078                         } else {
2079                                 if (segmap_kpm)
2080                                         segkpm_mapout_validkpme(GET_KPME(smp));
2081 
2082                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2083                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2084                                     HAT_UNLOAD);
2085                         }
2086                 }
2087                 segmap_smapadd(smp);    /* add to free list */
2088         }
2089 
2090         mutex_exit(smtx);
2091 
2092         if (is_kpm)
2093                 page_unlock(pp);
2094         /*
2095          * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2096          * are set.
2097          */
2098         if ((flags & ~SM_DONTNEED) != 0) {
2099                 error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2100                     bflags, CRED(), NULL);
2101         } else {
2102                 error = 0;
2103         }
2104 
2105         return (error);
2106 }
2107 
2108 /*
2109  * Dump the pages belonging to this segmap segment.
2110  */
2111 static void
2112 segmap_dump(struct seg *seg)
2113 {
2114         struct segmap_data *smd;
2115         struct smap *smp, *smp_end;
2116         page_t *pp;
2117         pfn_t pfn;
2118         u_offset_t off;
2119         caddr_t addr;
2120 
2121         smd = (struct segmap_data *)seg->s_data;
2122         addr = seg->s_base;
2123         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2124             smp < smp_end; smp++) {
2125 
2126                 if (smp->sm_refcnt) {
2127                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2128                                 int we_own_it = 0;
2129 
2130                                 /*
2131                                  * If page_lookup_nowait() fails, the page
2132                                  * either does not exist or is exclusively
2133                                  * locked, so fall back to page_exists() to
2134                                  * see whether it exists at all.
2135                                  */
2136                                 if ((pp = page_lookup_nowait(smp->sm_vp,
2137                                     smp->sm_off + off, SE_SHARED)))
2138                                         we_own_it = 1;
2139                                 else
2140                                         pp = page_exists(smp->sm_vp,
2141                                             smp->sm_off + off);
2142 
2143                                 if (pp) {
2144                                         pfn = page_pptonum(pp);
2145                                         dump_addpage(seg->s_as,
2146                                             addr + off, pfn);
2147                                         if (we_own_it)
2148                                                 page_unlock(pp);
2149                                 }
2150                                 dump_timeleft = dump_timeout;
2151                         }
2152                 }
2153                 addr += MAXBSIZE;
2154         }
2155 }
2156 
2157 /*ARGSUSED*/
2158 static int
2159 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2160     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2161 {
2162         return (ENOTSUP);
2163 }
2164 
2165 static int
2166 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2167 {
2168         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2169 
2170         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2171         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2172         return (0);
2173 }
2174 
2175 /*ARGSUSED*/
2176 static lgrp_mem_policy_info_t *
2177 segmap_getpolicy(struct seg *seg, caddr_t addr)
2178 {
2179         return (NULL);
2180 }
2181 
2182 /*ARGSUSED*/
2183 static int
2184 segmap_capable(struct seg *seg, segcapability_t capability)
2185 {
2186         return (0);
2187 }
2188 
2189 
2190 #ifdef  SEGKPM_SUPPORT
2191 
2192 /*
2193  * segkpm support routines
2194  */
2195 
2196 static caddr_t
2197 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2198         struct smap *smp, enum seg_rw rw)
2199 {
2200         caddr_t base;
2201         page_t  *pp;
2202         int     newpage = 0;
2203         struct kpme     *kpme;
2204 
2205         ASSERT(smp->sm_refcnt > 0);
2206 
2207         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2208                 kmutex_t *smtx;
2209 
2210                 base = segkpm_create_va(off);
2211 
2212                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2213                     seg, base)) == NULL) {
2214                         panic("segmap_pagecreate_kpm: "
2215                             "page_create failed");
2216                         /*NOTREACHED*/
2217                 }
2218 
2219                 newpage = 1;
2220                 page_io_unlock(pp);
2221                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2222 
2223                 /*
2224                  * Mark the slot as holding a newly created page until the
2225                  * following segmap_pagecreate or segmap_release.
2226                  */
2227                 smtx = SMAPMTX(smp);
2228                 mutex_enter(smtx);
2229                 smp->sm_flags |= SM_KPM_NEWPAGE;
2230                 mutex_exit(smtx);
2231         }
2232 
2233         kpme = GET_KPME(smp);
2234         if (!newpage && kpme->kpe_page == pp)
2235                 base = hat_kpm_page2va(pp, 0);
2236         else
2237                 base = hat_kpm_mapin(pp, kpme);
2238 
2239         /*
2240          * FS code may decide not to call segmap_pagecreate and we
2241          * don't invoke segmap_fault via TLB miss, so we have to set
2242          * ref and mod bits in advance.
2243          */
2244         if (rw == S_WRITE) {
2245                 hat_setrefmod(pp);
2246         } else {
2247                 ASSERT(rw == S_READ);
2248                 hat_setref(pp);
2249         }
2250 
2251         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2252 
2253         return (base);
2254 }
2255 
2256 /*
2257  * Find the smap structure corresponding to the
2258  * KPM addr and return it locked.
2259  */
2260 struct smap *
2261 get_smap_kpm(caddr_t addr, page_t **ppp)
2262 {
2263         struct smap     *smp;
2264         struct vnode    *vp;
2265         u_offset_t      offset;
2266         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2267         int             hashid;
2268         kmutex_t        *hashmtx;
2269         page_t          *pp;
2270         union segmap_cpu *scpu;
2271 
2272         pp = hat_kpm_vaddr2page(baseaddr);
2273 
2274         ASSERT(pp && !PP_ISFREE(pp));
2275         ASSERT(PAGE_LOCKED(pp));
2276         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2277 
2278         vp = pp->p_vnode;
2279         offset = pp->p_offset;
2280         ASSERT(vp != NULL);
2281 
2282         /*
2283          * Assume the last smap used on this CPU is the one needed.
2284          */
2285         scpu = smd_cpu+CPU->cpu_seqid;
2286         smp = scpu->scpu.scpu_last_smap;
2287         mutex_enter(&smp->sm_mtx);
2288         if (smp->sm_vp == vp && smp->sm_off == offset) {
2289                 ASSERT(smp->sm_refcnt > 0);
2290         } else {
2291                 /*
2292                  * Assumption wrong, find the smap on the hash chain.
2293                  */
2294                 mutex_exit(&smp->sm_mtx);
2295                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2296                 hashmtx = SHASHMTX(hashid);
2297 
2298                 mutex_enter(hashmtx);
2299                 smp = smd_hash[hashid].sh_hash_list;
2300                 for (; smp != NULL; smp = smp->sm_hash) {
2301                         if (smp->sm_vp == vp && smp->sm_off == offset)
2302                                 break;
2303                 }
2304                 mutex_exit(hashmtx);
2305                 if (smp) {
2306                         mutex_enter(&smp->sm_mtx);
2307                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2308                 }
2309         }
2310 
2311         if (ppp)
2312                 *ppp = smp ? pp : NULL;
2313 
2314         return (smp);
2315 }
2316 
2317 #else   /* SEGKPM_SUPPORT */
2318 
2319 /* segkpm stubs */
2320 
2321 /*ARGSUSED*/
2322 static caddr_t
2323 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2324         struct smap *smp, enum seg_rw rw)
2325 {
2326         return (NULL);
2327 }
2328 
2329 /*ARGSUSED*/
2330 struct smap *
2331 get_smap_kpm(caddr_t addr, page_t **ppp)
2332 {
2333         return (NULL);
2334 }
2335 
2336 #endif  /* SEGKPM_SUPPORT */