1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Portions of this source code were derived from Berkeley 4.3 BSD
  31  * under license from the Regents of the University of California.
  32  */
  33 
  34 /*
  35  * VM - generic vnode mapping segment.
  36  *
  37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
  38  * mappings [lower routine overhead; more persistent cache] to random
  39  * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/buf.h>
  47 #include <sys/systm.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mman.h>
  50 #include <sys/errno.h>
  51 #include <sys/cred.h>
  52 #include <sys/kmem.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/debug.h>
  56 #include <sys/thread.h>
  57 #include <sys/dumphdr.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/lgrp.h>
  60 
  61 #include <vm/seg_kmem.h>
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_kpm.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/page.h>
  68 #include <vm/pvn.h>
  69 #include <vm/rm.h>
  70 
  71 /*
  72  * Private seg op routines.
  73  */
  74 static void     segmap_free(struct seg *seg);
  75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  76                         size_t len, enum fault_type type, enum seg_rw rw);
  77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
  78 static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
  79                         uint_t prot);
  80 static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
  81 static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
  82                         uint_t *protv);
  83 static u_offset_t       segmap_getoffset(struct seg *seg, caddr_t addr);
  84 static int      segmap_gettype(struct seg *seg, caddr_t addr);
  85 static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
  86 static void     segmap_dump(struct seg *seg);
  87 static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
  88                         struct page ***ppp, enum lock_type type,
  89                         enum seg_rw rw);
  90 static void     segmap_badop(void);
  91 static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  92 
  93 /* segkpm support */
  94 static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
  95                         struct smap *, enum seg_rw);
  96 struct smap     *get_smap_kpm(caddr_t, page_t **);
  97 
  98 #define SEGMAP_BADOP(t) (t(*)())segmap_badop
  99 
 100 static struct seg_ops segmap_ops = {
 101         .dup            = SEGMAP_BADOP(int),
 102         .unmap          = SEGMAP_BADOP(int),
 103         .free           = segmap_free,
 104         .fault          = segmap_fault,
 105         .faulta         = segmap_faulta,
 106         .setprot        = SEGMAP_BADOP(int),
 107         .checkprot      = segmap_checkprot,
 108         .kluster        = segmap_kluster,
 109         .swapout        = SEGMAP_BADOP(size_t),
 110         .sync           = SEGMAP_BADOP(int),
 111         .incore         = SEGMAP_BADOP(size_t),
 112         .lockop         = SEGMAP_BADOP(int),
 113         .getprot        = segmap_getprot,
 114         .getoffset      = segmap_getoffset,
 115         .gettype        = segmap_gettype,
 116         .getvp          = segmap_getvp,
 117         .advise         = SEGMAP_BADOP(int),
 118         .dump           = segmap_dump,
 119         .pagelock       = segmap_pagelock,
 120         .setpagesize    = SEGMAP_BADOP(int),
 121         .getmemid       = segmap_getmemid,
 122 };
 123 
 124 /*
 125  * Private segmap routines.
 126  */
 127 static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
 128                         size_t len, enum seg_rw rw, struct smap *smp);
 129 static void     segmap_smapadd(struct smap *smp);
 130 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
 131                         u_offset_t off, int hashid);
 132 static void     segmap_hashout(struct smap *smp);
 133 
 134 
 135 /*
 136  * Statistics for segmap operations.
 137  *
 138  * No explicit locking to protect these stats.
 139  */
 140 struct segmapcnt segmapcnt = {
 141         { "fault",              KSTAT_DATA_ULONG },
 142         { "faulta",             KSTAT_DATA_ULONG },
 143         { "getmap",             KSTAT_DATA_ULONG },
 144         { "get_use",            KSTAT_DATA_ULONG },
 145         { "get_reclaim",        KSTAT_DATA_ULONG },
 146         { "get_reuse",          KSTAT_DATA_ULONG },
 147         { "get_unused",         KSTAT_DATA_ULONG },
 148         { "get_nofree",         KSTAT_DATA_ULONG },
 149         { "rel_async",          KSTAT_DATA_ULONG },
 150         { "rel_write",          KSTAT_DATA_ULONG },
 151         { "rel_free",           KSTAT_DATA_ULONG },
 152         { "rel_abort",          KSTAT_DATA_ULONG },
 153         { "rel_dontneed",       KSTAT_DATA_ULONG },
 154         { "release",            KSTAT_DATA_ULONG },
 155         { "pagecreate",         KSTAT_DATA_ULONG },
 156         { "free_notfree",       KSTAT_DATA_ULONG },
 157         { "free_dirty",         KSTAT_DATA_ULONG },
 158         { "free",               KSTAT_DATA_ULONG },
 159         { "stolen",             KSTAT_DATA_ULONG },
 160         { "get_nomtx",          KSTAT_DATA_ULONG }
 161 };
 162 
 163 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
 164 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
 165 
 166 /*
 167  * Return number of map pages in segment.
 168  */
 169 #define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)
 170 
 171 /*
 172  * Translate addr into smap number within segment.
 173  */
 174 #define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
 175 
 176 /*
 177  * Translate addr in seg into struct smap pointer.
 178  */
 179 #define GET_SMAP(seg, addr)     \
 180         &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
 181 
 182 /*
 183  * Bit in map (16 bit bitmap).
 184  */
 185 #define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
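
/*
 * Example (assuming the usual 8K MAXBSIZE, i.e. MAXBSHIFT == 13): an
 * address MAXBSIZE * 3 bytes above seg->s_base is map page 3, so
 * GET_SMAP(seg, addr) yields &smd_sm[3]; within that 8K window each
 * PAGESIZE page is tracked by one sm_bitmap bit selected with
 * SMAP_BIT_MASK().
 */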
 186 
 187 static int smd_colormsk = 0;
 188 static int smd_ncolor = 0;
 189 static int smd_nfree = 0;
 190 static int smd_freemsk = 0;
 191 #ifdef DEBUG
 192 static int *colors_used;
 193 #endif
 194 static struct smap *smd_smap;
 195 static struct smaphash *smd_hash;
 196 #ifdef SEGMAP_HASHSTATS
 197 static unsigned int *smd_hash_len;
 198 #endif
 199 static struct smfree *smd_free;
 200 static ulong_t smd_hashmsk = 0;
 201 
 202 #define SEGMAP_MAXCOLOR         2
 203 #define SEGMAP_CACHE_PAD        64
 204 
 205 union segmap_cpu {
 206         struct {
 207                 uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
 208                 struct smap     *scpu_last_smap;
 209                 ulong_t         scpu_getmap;
 210                 ulong_t         scpu_release;
 211                 ulong_t         scpu_get_reclaim;
 212                 ulong_t         scpu_fault;
 213                 ulong_t         scpu_pagecreate;
 214                 ulong_t         scpu_get_reuse;
 215         } scpu;
 216         char    scpu_pad[SEGMAP_CACHE_PAD];
 217 };
 218 static union segmap_cpu *smd_cpu;
 219 
 220 /*
 221  * There are three locks in seg_map:
 222  *      - per freelist mutexes
 223  *      - per hashchain mutexes
 224  *      - per smap mutexes
 225  *
 226  * The lock ordering is to take the smap mutex to lock down the slot
 227  * first, then the hash lock (to hash the slot in or out of the
 228  * (vp, off) list) or the freelist lock (to put the slot back on the
 229  * free list).
 230  *
 231  * The hash search is done holding only the hashchain lock. When a wanted
 232  * slot is found, we drop the hashchain lock and then lock the slot, so the
 233  * hashchain and smap locks never overlap. After the slot is locked, we
 234  * verify that it is still the (vp, off) slot we were looking for.
 235  *
 236  * Allocation of a free slot is done by holding the freelist lock,
 237  * then locking the smap slot at the head of the freelist. This is
 238  * in reversed lock order so mutex_tryenter() is used.
 239  *
 240  * The smap lock protects all fields in smap structure except for
 241  * the link fields for hash/free lists which are protected by
 242  * hashchain and freelist locks.
 243  */
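
/*
 * A minimal sketch of the reverse-order allocation described above
 * (illustrative only; get_free_smp() below is the real implementation):
 *
 *	mutex_enter(&allocq->smq_mtx);
 *	smp = allocq->smq_free;
 *	if (smp != NULL && mutex_tryenter(SMAPMTX(smp))) {
 *		(take smp off the freelist, then grab_smp())
 *	} else {
 *		mutex_exit(&allocq->smq_mtx);
 *		(retry, toggle alloc/release queues, or wait)
 *	}
 */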
 244 
 245 #define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)
 246 
 247 #define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
 248 #define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)
 249 
 250 #define SMAPMTX(smp) (&smp->sm_mtx)
 251 
 252 #define SMAP_HASHFUNC(vp, off, hashid) \
 253         { \
 254         hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
 255                 ((off) >> MAXBSHIFT)) & smd_hashmsk); \
 256         }
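
/*
 * Typical use of the hash (a sketch; segmap_hashin()/segmap_hashout()
 * below are the real implementations): compute the bucket, take only
 * that bucket's lock, and walk the chain for a matching (vp, off) tag:
 *
 *	SMAP_HASHFUNC(vp, off, hashid);
 *	hmtx = SHASHMTX(hashid);
 *	mutex_enter(hmtx);
 *	for (smp = smd_hash[hashid].sh_hash_list; smp != NULL;
 *	    smp = smp->sm_hash)
 *		if (smp->sm_vp == vp && smp->sm_off == off)
 *			break;
 *	mutex_exit(hmtx);
 */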
 257 
 258 /*
 259  * The most frequently updated kstat counters are kept in the
 260  * per cpu array to avoid hot cache blocks. The update function
 261  * sums the cpu local counters to update the global counters.
 262  */
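
/*
 * For example, a hot path increments only its own CPU's cache line:
 *
 *	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 *
 * and segmap_kstat_update() below folds these into segmapcnt on demand.
 */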
 263 
 264 /* ARGSUSED */
 265 int
 266 segmap_kstat_update(kstat_t *ksp, int rw)
 267 {
 268         int i;
 269         ulong_t getmap, release, get_reclaim;
 270         ulong_t fault, pagecreate, get_reuse;
 271 
 272         if (rw == KSTAT_WRITE)
 273                 return (EACCES);
 274         getmap = release = get_reclaim = (ulong_t)0;
 275         fault = pagecreate = get_reuse = (ulong_t)0;
 276         for (i = 0; i < max_ncpus; i++) {
 277                 getmap += smd_cpu[i].scpu.scpu_getmap;
 278                 release  += smd_cpu[i].scpu.scpu_release;
 279                 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
 280                 fault  += smd_cpu[i].scpu.scpu_fault;
 281                 pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
 282                 get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
 283         }
 284         segmapcnt.smp_getmap.value.ul = getmap;
 285         segmapcnt.smp_release.value.ul = release;
 286         segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
 287         segmapcnt.smp_fault.value.ul = fault;
 288         segmapcnt.smp_pagecreate.value.ul = pagecreate;
 289         segmapcnt.smp_get_reuse.value.ul = get_reuse;
 290         return (0);
 291 }
 292 
 293 int
 294 segmap_create(struct seg *seg, void *argsp)
 295 {
 296         struct segmap_data *smd;
 297         struct smap *smp;
 298         struct smfree *sm;
 299         struct segmap_crargs *a = (struct segmap_crargs *)argsp;
 300         struct smaphash *shashp;
 301         union segmap_cpu *scpu;
 302         long i, npages;
 303         size_t hashsz;
 304         uint_t nfreelist;
 305         extern void prefetch_smap_w(void *);
 306         extern int max_ncpus;
 307 
 308         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 309 
 310         if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
 311                 panic("segkmap not MAXBSIZE aligned");
 312                 /*NOTREACHED*/
 313         }
 314 
 315         smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
 316 
 317         seg->s_data = (void *)smd;
 318         seg->s_ops = &segmap_ops;
 319         smd->smd_prot = a->prot;
 320 
 321         /*
 322          * Scale the number of smap freelists to be
 323          * proportional to max_ncpus * number of virtual colors.
 324          * The caller can over-ride this scaling by providing
 325          * a non-zero a->nfreelist argument.
 326          */
 327         nfreelist = a->nfreelist;
 328         if (nfreelist == 0)
 329                 nfreelist = max_ncpus;
 330         else if (nfreelist > 4 * max_ncpus) {
 331                 cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
 332                     "%d, using %d", nfreelist, max_ncpus);
 333                 nfreelist = max_ncpus;
 334         }
 335         if (!ISP2(nfreelist)) {
 336                 /* round up nfreelist to the next power of two. */
 337                 nfreelist = 1 << (highbit(nfreelist));
 338         }
 339 
 340         /*
 341          * Get the number of virtual colors - must be a power of 2.
 342          */
 343         if (a->shmsize)
 344                 smd_ncolor = a->shmsize >> MAXBSHIFT;
 345         else
 346                 smd_ncolor = 1;
 347         ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
 348         ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
 349         smd_colormsk = smd_ncolor - 1;
 350         smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
 351         smd_freemsk = smd_nfree - 1;
 352 
 353         /*
 354          * Allocate and initialize the freelist headers.
 355  * Note that sm_freeq[1] starts out as the release queue; this fact
 356  * is relied upon when the smap structures are initialized below.
 357          */
 358         smd_free = smd->smd_free =
 359             kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
 360         for (i = 0; i < smd_nfree; i++) {
 361                 sm = &smd->smd_free[i];
 362                 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 363                 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 364                 sm->sm_allocq = &sm->sm_freeq[0];
 365                 sm->sm_releq = &sm->sm_freeq[1];
 366         }
 367 
 368         /*
 369          * Allocate and initialize the smap hash chain headers.
 370          * Compute hash size rounding down to the next power of two.
 371          */
 372         npages = MAP_PAGES(seg);
 373         smd->smd_npages = npages;
 374         hashsz = npages / SMAP_HASHAVELEN;
 375         hashsz = 1 << (highbit(hashsz)-1);
 376         smd_hashmsk = hashsz - 1;
 377         smd_hash = smd->smd_hash =
 378             kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
 379 #ifdef SEGMAP_HASHSTATS
 380         smd_hash_len =
 381             kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
 382 #endif
 383         for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
 384                 shashp->sh_hash_list = NULL;
 385                 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
 386         }
 387 
 388         /*
 389          * Allocate and initialize the smap structures.
 390          * Link all slots onto the appropriate freelist.
 391          * The smap array is large enough to affect boot time
 392          * on large systems, so use memory prefetching and only
 393  * go through the array 1 time. Inline an optimized version
 394          * of segmap_smapadd to add structures to freelists with
 395          * knowledge that no locks are needed here.
 396          */
 397         smd_smap = smd->smd_sm =
 398             kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
 399 
 400         for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
 401             smp >= smd->smd_sm; smp--) {
 402                 struct smap *smpfreelist;
 403                 struct sm_freeq *releq;
 404 
 405                 prefetch_smap_w((char *)smp);
 406 
 407                 smp->sm_vp = NULL;
 408                 smp->sm_hash = NULL;
 409                 smp->sm_off = 0;
 410                 smp->sm_bitmap = 0;
 411                 smp->sm_refcnt = 0;
 412                 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
 413                 smp->sm_free_ndx = SMP2SMF_NDX(smp);
 414 
 415                 sm = SMP2SMF(smp);
 416                 releq = sm->sm_releq;
 417 
 418                 smpfreelist = releq->smq_free;
 419                 if (smpfreelist == 0) {
 420                         releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 421                 } else {
 422                         smp->sm_next = smpfreelist;
 423                         smp->sm_prev = smpfreelist->sm_prev;
 424                         smpfreelist->sm_prev = smp;
 425                         smp->sm_prev->sm_next = smp;
 426                         releq->smq_free = smp->sm_next;
 427                 }
 428 
 429                 /*
 430                  * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
 431                  */
 432                 smp->sm_flags = 0;
 433 
 434 #ifdef  SEGKPM_SUPPORT
 435                 /*
 436                  * Due to the fragile prefetch loop no
 437                  * separate function is used here.
 438                  */
 439                 smp->sm_kpme_next = NULL;
 440                 smp->sm_kpme_prev = NULL;
 441                 smp->sm_kpme_page = NULL;
 442 #endif
 443         }
 444 
 445         /*
 446          * Allocate the per color indices that distribute allocation
 447          * requests over the free lists. Each cpu will have a private
 448          * rotor index to spread the allocations even across the available
 449          * smap freelists. Init the scpu_last_smap field to the first
 450          * smap element so there is no need to check for NULL.
 451          */
 452         smd_cpu =
 453             kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
 454         for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
 455                 int j;
 456                 for (j = 0; j < smd_ncolor; j++)
 457                         scpu->scpu.scpu_free_ndx[j] = j;
 458                 scpu->scpu.scpu_last_smap = smd_smap;
 459         }
 460 
 461         vpm_init();
 462 
 463 #ifdef DEBUG
 464         /*
 465          * Keep track of which colors are used more often.
 466          */
 467         colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
 468 #endif /* DEBUG */
 469 
 470         return (0);
 471 }
 472 
 473 static void
 474 segmap_free(struct seg *seg)
 476 {
 477         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 478 }
 479 
 480 /*
 481  * Do a F_SOFTUNLOCK call over the range requested.
 482  * The range must have already been F_SOFTLOCK'ed.
 483  */
 484 static void
 485 segmap_unlock(
 486         struct hat *hat,
 487         struct seg *seg,
 488         caddr_t addr,
 489         size_t len,
 490         enum seg_rw rw,
 491         struct smap *smp)
 492 {
 493         page_t *pp;
 494         caddr_t adr;
 495         u_offset_t off;
 496         struct vnode *vp;
 497         kmutex_t *smtx;
 498 
 499         ASSERT(smp->sm_refcnt > 0);
 500 
 501 #ifdef lint
 502         seg = seg;
 503 #endif
 504 
 505         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 506 
 507                 /*
 508                  * We're called only from segmap_fault and this was a
 509                  * NOP in case of a kpm based smap, so dangerous things
 510                  * must have happened in the meantime. Pages are prefaulted
 511                  * and locked in segmap_getmapflt and they will not be
 512                  * unlocked until segmap_release.
 513                  */
 514                 panic("segmap_unlock: called with kpm addr %p", (void *)addr);
 515                 /*NOTREACHED*/
 516         }
 517 
 518         vp = smp->sm_vp;
 519         off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 520 
 521         hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
 522         for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
 523                 ushort_t bitmask;
 524 
 525                 /*
 526                  * Use page_find() instead of page_lookup() to
 527                  * find the page since we know that it has a
 528                  * "shared" lock.
 529                  */
 530                 pp = page_find(vp, off);
 531                 if (pp == NULL) {
 532                         panic("segmap_unlock: page not found");
 533                         /*NOTREACHED*/
 534                 }
 535 
 536                 if (rw == S_WRITE) {
 537                         hat_setrefmod(pp);
 538                 } else if (rw != S_OTHER) {
 539                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 540                         "segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
 541                         hat_setref(pp);
 542                 }
 543 
 544                 /*
 545                  * Clear bitmap, if the bit corresponding to "off" is set,
 546                  * since the page and translation are being unlocked.
 547                  */
 548                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
 549 
 550                 /*
 551                  * Large Files: Following assertion is to verify
 552                  * the correctness of the cast to (int) above.
 553                  */
 554                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
 555                 smtx = SMAPMTX(smp);
 556                 mutex_enter(smtx);
 557                 if (smp->sm_bitmap & bitmask) {
 558                         smp->sm_bitmap &= ~bitmask;
 559                 }
 560                 mutex_exit(smtx);
 561 
 562                 page_unlock(pp);
 563         }
 564 }
 565 
 566 #define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */
 567 
 568 /*
 569  * This routine is called via a machine specific fault handling
 570  * routine.  It is also called by software routines wishing to
 571  * lock or unlock a range of addresses.
 572  *
 573  * Note that this routine expects a page-aligned "addr".
 574  */
 575 faultcode_t
 576 segmap_fault(
 577         struct hat *hat,
 578         struct seg *seg,
 579         caddr_t addr,
 580         size_t len,
 581         enum fault_type type,
 582         enum seg_rw rw)
 583 {
 584         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 585         struct smap *smp;
 586         page_t *pp, **ppp;
 587         struct vnode *vp;
 588         u_offset_t off;
 589         page_t *pl[MAXPPB + 1];
 590         uint_t prot;
 591         u_offset_t addroff;
 592         caddr_t adr;
 593         int err;
 594         u_offset_t sm_off;
 595         int hat_flag;
 596 
 597         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 598                 int newpage;
 599                 kmutex_t *smtx;
 600 
 601                 /*
 602                  * Pages are successfully prefaulted and locked in
 603                  * segmap_getmapflt and can't be unlocked until
 604                  * segmap_release. No hat mappings have to be locked
 605                  * and they also can't be unlocked as long as the
 606                  * caller owns an active kpm addr.
 607                  */
 608 #ifndef DEBUG
 609                 if (type != F_SOFTUNLOCK)
 610                         return (0);
 611 #endif
 612 
 613                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 614                         panic("segmap_fault: smap not found "
 615                             "for addr %p", (void *)addr);
 616                         /*NOTREACHED*/
 617                 }
 618 
 619                 smtx = SMAPMTX(smp);
 620 #ifdef  DEBUG
 621                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 622                 if (newpage) {
 623                         cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
 624                             (void *)smp);
 625                 }
 626 
 627                 if (type != F_SOFTUNLOCK) {
 628                         mutex_exit(smtx);
 629                         return (0);
 630                 }
 631 #endif
 632                 mutex_exit(smtx);
 633                 vp = smp->sm_vp;
 634                 sm_off = smp->sm_off;
 635 
 636                 if (vp == NULL)
 637                         return (FC_MAKE_ERR(EIO));
 638 
 639                 ASSERT(smp->sm_refcnt > 0);
 640 
 641                 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 642                 if (addroff + len > MAXBSIZE)
 643                         panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
 644                             (void *)(addr + len));
 645 
 646                 off = sm_off + addroff;
 647 
 648                 pp = page_find(vp, off);
 649 
 650                 if (pp == NULL)
 651                         panic("segmap_fault: softunlock page not found");
 652 
 653                 /*
 654                  * Set ref bit also here in case of S_OTHER to avoid the
 655                  * overhead of supporting other cases than F_SOFTUNLOCK
 656                  * with segkpm. We can do this because the underlying
 657                  * pages are locked anyway.
 658                  */
 659                 if (rw == S_WRITE) {
 660                         hat_setrefmod(pp);
 661                 } else {
 662                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 663                             "segmap_fault:pp %p vp %p offset %llx",
 664                             pp, vp, off);
 665                         hat_setref(pp);
 666                 }
 667 
 668                 return (0);
 669         }
 670 
 671         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 672         smp = GET_SMAP(seg, addr);
 673         vp = smp->sm_vp;
 674         sm_off = smp->sm_off;
 675 
 676         if (vp == NULL)
 677                 return (FC_MAKE_ERR(EIO));
 678 
 679         ASSERT(smp->sm_refcnt > 0);
 680 
 681         addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 682         if (addroff + len > MAXBSIZE) {
 683                 panic("segmap_fault: endaddr %p "
 684                     "exceeds MAXBSIZE chunk", (void *)(addr + len));
 685                 /*NOTREACHED*/
 686         }
 687         off = sm_off + addroff;
 688 
 689         /*
 690          * First handle the easy stuff
 691          */
 692         if (type == F_SOFTUNLOCK) {
 693                 segmap_unlock(hat, seg, addr, len, rw, smp);
 694                 return (0);
 695         }
 696 
 697         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 698             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 699         err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
 700             seg, addr, rw, CRED(), NULL);
 701 
 702         if (err)
 703                 return (FC_MAKE_ERR(err));
 704 
 705         prot &= smd->smd_prot;
 706 
 707         /*
 708          * Handle all pages returned in the pl[] array.
 709          * This loop is coded on the assumption that if
 710          * there was no error from the VOP_GETPAGE routine,
 711          * that the page list returned will contain all the
 712          * needed pages for the vp from [off..off + len].
 713          */
 714         ppp = pl;
 715         while ((pp = *ppp++) != NULL) {
 716                 u_offset_t poff;
 717                 ASSERT(pp->p_vnode == vp);
 718                 hat_flag = HAT_LOAD;
 719 
 720                 /*
 721                  * Verify that the pages returned are within the range
 722                  * of this segmap region.  Note that it is theoretically
 723                  * possible for pages outside this range to be returned,
 724                  * but it is not very likely.  If we cannot use the
 725                  * page here, just release it and go on to the next one.
 726                  */
 727                 if (pp->p_offset < sm_off ||
 728                     pp->p_offset >= sm_off + MAXBSIZE) {
 729                         (void) page_release(pp, 1);
 730                         continue;
 731                 }
 732 
 733                 ASSERT(hat == kas.a_hat);
 734                 poff = pp->p_offset;
 735                 adr = addr + (poff - off);
 736                 if (adr >= addr && adr < addr + len) {
 737                         hat_setref(pp);
 738                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 739                             "segmap_fault:pp %p vp %p offset %llx",
 740                             pp, vp, poff);
 741                         if (type == F_SOFTLOCK)
 742                                 hat_flag = HAT_LOAD_LOCK;
 743                 }
 744 
 745                 /*
 746                  * Deal with VMODSORT pages here. If we know this is a write,
 747                  * do the setmod now and keep the mapping writable. Otherwise,
 748                  * unless the page is already modified or rw is S_OTHER, remove
 749                  * write permission. With S_OTHER it's up to the FS to handle it.
 750                  */
 751                 if (IS_VMODSORT(vp)) {
 752                         if (rw == S_WRITE)
 753                                 hat_setmod(pp);
 754                         else if (rw != S_OTHER && !hat_ismod(pp))
 755                                 prot &= ~PROT_WRITE;
 756                 }
 757 
 758                 hat_memload(hat, adr, pp, prot, hat_flag);
 759                 if (hat_flag != HAT_LOAD_LOCK)
 760                         page_unlock(pp);
 761         }
 762         return (0);
 763 }
 764 
 765 /*
 766  * This routine is used to start I/O on pages asynchronously.
 767  */
 768 static faultcode_t
 769 segmap_faulta(struct seg *seg, caddr_t addr)
 770 {
 771         struct smap *smp;
 772         struct vnode *vp;
 773         u_offset_t off;
 774         int err;
 775 
 776         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 777                 int     newpage;
 778                 kmutex_t *smtx;
 779 
 780                 /*
 781                  * Pages are successfully prefaulted and locked in
 782                  * segmap_getmapflt and can't be unlocked until
 783                  * segmap_release. No hat mappings have to be locked
 784                  * and they also can't be unlocked as long as the
 785                  * caller owns an active kpm addr.
 786                  */
 787 #ifdef  DEBUG
 788                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 789                         panic("segmap_faulta: smap not found "
 790                             "for addr %p", (void *)addr);
 791                         /*NOTREACHED*/
 792                 }
 793 
 794                 smtx = SMAPMTX(smp);
 795                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 796                 mutex_exit(smtx);
 797                 if (newpage)
 798                         cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
 799                             (void *)smp);
 800 #endif
 801                 return (0);
 802         }
 803 
 804         segmapcnt.smp_faulta.value.ul++;
 805         smp = GET_SMAP(seg, addr);
 806 
 807         ASSERT(smp->sm_refcnt > 0);
 808 
 809         vp = smp->sm_vp;
 810         off = smp->sm_off;
 811 
 812         if (vp == NULL) {
 813                 cmn_err(CE_WARN, "segmap_faulta - no vp");
 814                 return (FC_MAKE_ERR(EIO));
 815         }
 816 
 817         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 818             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 819 
 820         err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
 821             & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
 822             seg, addr, S_READ, CRED(), NULL);
 823 
 824         if (err)
 825                 return (FC_MAKE_ERR(err));
 826         return (0);
 827 }
 828 
 829 /*ARGSUSED*/
 830 static int
 831 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
 832 {
 833         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 834 
 835         ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
 836 
 837         /*
 838          * Need not acquire the segment lock since
 839          * "smd_prot" is a read-only field.
 840          */
 841         return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
 842 }
 843 
 844 static int
 845 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 846 {
 847         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 848         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
 849 
 850         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 851 
 852         if (pgno != 0) {
 853                 do {
 854                         protv[--pgno] = smd->smd_prot;
 855                 } while (pgno != 0);
 856         }
 857         return (0);
 858 }
 859 
 860 static u_offset_t
 861 segmap_getoffset(struct seg *seg, caddr_t addr)
 862 {
 863         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 864 
 865         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 866 
 867         return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
 868 }
 869 
 870 /*ARGSUSED*/
 871 static int
 872 segmap_gettype(struct seg *seg, caddr_t addr)
 873 {
 874         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 875 
 876         return (MAP_SHARED);
 877 }
 878 
 879 /*ARGSUSED*/
 880 static int
 881 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 882 {
 883         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 884 
 885         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 886 
 887         /* XXX - This doesn't make any sense */
 888         *vpp = smd->smd_sm->sm_vp;
 889         return (0);
 890 }
 891 
 892 /*
 893  * Check to see if it makes sense to do kluster/read ahead to
 894  * addr + delta relative to the mapping at addr.  We assume here
 895  * that delta is a signed PAGESIZE'd multiple (which can be negative).
 896  *
 897  * For segmap we always "approve" of this action from our standpoint.
 898  */
 899 /*ARGSUSED*/
 900 static int
 901 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 902 {
 903         return (0);
 904 }
 905 
 906 static void
 907 segmap_badop()
 908 {
 909         panic("segmap_badop");
 910         /*NOTREACHED*/
 911 }
 912 
 913 /*
 914  * Special private segmap operations
 915  */
 916 
 917 /*
 918  * Add smap to the appropriate free list.
 919  */
 920 static void
 921 segmap_smapadd(struct smap *smp)
 922 {
 923         struct smfree *sm;
 924         struct smap *smpfreelist;
 925         struct sm_freeq *releq;
 926 
 927         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 928 
 929         if (smp->sm_refcnt != 0) {
 930                 panic("segmap_smapadd");
 931                 /*NOTREACHED*/
 932         }
 933 
 934         sm = &smd_free[smp->sm_free_ndx];
 935         /*
 936          * Add to the tail of the release queue
 937          * Note that sm_releq and sm_allocq could toggle
 938          * before we get the lock. This does not affect
 939          * correctness as the 2 queues are only maintained
 940          * to reduce lock pressure.
 941          */
 942         releq = sm->sm_releq;
 943         if (releq == &sm->sm_freeq[0])
 944                 smp->sm_flags |= SM_QNDX_ZERO;
 945         else
 946                 smp->sm_flags &= ~SM_QNDX_ZERO;
 947         mutex_enter(&releq->smq_mtx);
 948         smpfreelist = releq->smq_free;
 949         if (smpfreelist == 0) {
 950                 int want;
 951 
 952                 releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 953                 /*
 954                  * Both queue mutexes held to set sm_want;
 955                  * snapshot the value before dropping releq mutex.
 956                  * If sm_want appears after the releq mutex is dropped,
 957                  * then the smap just freed is already gone.
 958                  */
 959                 want = sm->sm_want;
 960                 mutex_exit(&releq->smq_mtx);
 961                 /*
 962                  * See if there was a waiter before dropping the releq mutex,
 963                  * then recheck after obtaining the sm_freeq[0] mutex since
 964                  * another thread may have already signaled.
 965                  */
 966                 if (want) {
 967                         mutex_enter(&sm->sm_freeq[0].smq_mtx);
 968                         if (sm->sm_want)
 969                                 cv_signal(&sm->sm_free_cv);
 970                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
 971                 }
 972         } else {
 973                 smp->sm_next = smpfreelist;
 974                 smp->sm_prev = smpfreelist->sm_prev;
 975                 smpfreelist->sm_prev = smp;
 976                 smp->sm_prev->sm_next = smp;
 977                 mutex_exit(&releq->smq_mtx);
 978         }
 979 }
 980 
 981 
 982 static struct smap *
 983 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
 984 {
 985         struct smap **hpp;
 986         struct smap *tmp;
 987         kmutex_t *hmtx;
 988 
 989         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 990         ASSERT(smp->sm_vp == NULL);
 991         ASSERT(smp->sm_hash == NULL);
 992         ASSERT(smp->sm_prev == NULL);
 993         ASSERT(smp->sm_next == NULL);
 994         ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
 995 
 996         hmtx = SHASHMTX(hashid);
 997 
 998         mutex_enter(hmtx);
 999         /*
1000          * First we need to verify that no one has created a smp
1001          * with (vp,off) as its tag before us.
1002          */
1003         for (tmp = smd_hash[hashid].sh_hash_list;
1004             tmp != NULL; tmp = tmp->sm_hash)
1005                 if (tmp->sm_vp == vp && tmp->sm_off == off)
1006                         break;
1007 
1008         if (tmp == NULL) {
1009                 /*
1010                  * No one created one yet.
1011                  *
1012                  * Funniness here - we don't increment the ref count on the
1013                  * vnode * even though we have another pointer to it here.
1014                  * The reason for this is that we don't want the fact that
1015                  * a seg_map entry somewhere refers to a vnode to prevent the
1016                  * vnode * itself from going away.  This is because this
1017                  * reference to the vnode is a "soft one".  In the case where
1018                  * a mapping is being used by a rdwr [or directory routine?]
1019                  * there already has to be a non-zero ref count on the vnode.
1020          * In the case where the vp has been freed and the smap
1021                  * structure is on the free list, there are no pages in memory
1022                  * that can refer to the vnode.  Thus even if we reuse the same
1023                  * vnode/smap structure for a vnode which has the same
1024                  * address but represents a different object, we are ok.
1025                  */
1026                 smp->sm_vp = vp;
1027                 smp->sm_off = off;
1028 
1029                 hpp = &smd_hash[hashid].sh_hash_list;
1030                 smp->sm_hash = *hpp;
1031                 *hpp = smp;
1032 #ifdef SEGMAP_HASHSTATS
1033                 smd_hash_len[hashid]++;
1034 #endif
1035         }
1036         mutex_exit(hmtx);
1037 
1038         return (tmp);
1039 }
1040 
1041 static void
1042 segmap_hashout(struct smap *smp)
1043 {
1044         struct smap **hpp, *hp;
1045         struct vnode *vp;
1046         kmutex_t *mtx;
1047         int hashid;
1048         u_offset_t off;
1049 
1050         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1051 
1052         vp = smp->sm_vp;
1053         off = smp->sm_off;
1054 
1055         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1056         mtx = SHASHMTX(hashid);
1057         mutex_enter(mtx);
1058 
1059         hpp = &smd_hash[hashid].sh_hash_list;
1060         for (;;) {
1061                 hp = *hpp;
1062                 if (hp == NULL) {
1063                         panic("segmap_hashout");
1064                         /*NOTREACHED*/
1065                 }
1066                 if (hp == smp)
1067                         break;
1068                 hpp = &hp->sm_hash;
1069         }
1070 
1071         *hpp = smp->sm_hash;
1072         smp->sm_hash = NULL;
1073 #ifdef SEGMAP_HASHSTATS
1074         smd_hash_len[hashid]--;
1075 #endif
1076         mutex_exit(mtx);
1077 
1078         smp->sm_vp = NULL;
1079         smp->sm_off = (u_offset_t)0;
1080 
1081 }
1082 
1083 /*
1084  * Attempt to free unmodified, unmapped, and non locked segmap
1085  * pages.
1086  */
1087 void
1088 segmap_pagefree(struct vnode *vp, u_offset_t off)
1089 {
1090         u_offset_t pgoff;
1091         page_t  *pp;
1092 
1093         for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1094 
1095                 if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1096                         continue;
1097 
1098                 switch (page_release(pp, 1)) {
1099                 case PGREL_NOTREL:
1100                         segmapcnt.smp_free_notfree.value.ul++;
1101                         break;
1102                 case PGREL_MOD:
1103                         segmapcnt.smp_free_dirty.value.ul++;
1104                         break;
1105                 case PGREL_CLEAN:
1106                         segmapcnt.smp_free.value.ul++;
1107                         break;
1108                 }
1109         }
1110 }
1111 
1112 /*
1113  * Locks held on entry: smap lock
1114  * Locks held on exit : smap lock.
1115  */
1116 
1117 static void
1118 grab_smp(struct smap *smp, page_t *pp)
1119 {
1120         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1121         ASSERT(smp->sm_refcnt == 0);
1122 
1123         if (smp->sm_vp != (struct vnode *)NULL) {
1124                 struct vnode    *vp = smp->sm_vp;
1125                 u_offset_t      off = smp->sm_off;
1126                 /*
1127                  * Destroy old vnode association and
1128                  * unload any hardware translations to
1129                  * the old object.
1130                  */
1131                 smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1132                 segmap_hashout(smp);
1133 
1134                 /*
1135                  * This node is off freelist and hashlist,
1136                  * so there is no reason to drop/reacquire sm_mtx
1137                  * across calls to hat_unload.
1138                  */
1139                 if (segmap_kpm) {
1140                         caddr_t vaddr;
1141                         int hat_unload_needed = 0;
1142 
1143                         /*
1144                          * unload kpm mapping
1145                          */
1146                         if (pp != NULL) {
1147                                 vaddr = hat_kpm_page2va(pp, 1);
1148                                 hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1149                                 page_unlock(pp);
1150                         }
1151 
1152                         /*
1153                          * Check if we have (also) the rare case of a
1154                          * non kpm mapping.
1155                          */
1156                         if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1157                                 hat_unload_needed = 1;
1158                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1159                         }
1160 
1161                         if (hat_unload_needed) {
1162                                 hat_unload(kas.a_hat, segkmap->s_base +
1163                                     ((smp - smd_smap) * MAXBSIZE),
1164                                     MAXBSIZE, HAT_UNLOAD);
1165                         }
1166 
1167                 } else {
1168                         ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1169                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1170                         hat_unload(kas.a_hat, segkmap->s_base +
1171                             ((smp - smd_smap) * MAXBSIZE),
1172                             MAXBSIZE, HAT_UNLOAD);
1173                 }
1174                 segmap_pagefree(vp, off);
1175         }
1176 }
1177 
1178 static struct smap *
1179 get_free_smp(int free_ndx)
1180 {
1181         struct smfree *sm;
1182         kmutex_t *smtx;
1183         struct smap *smp, *first;
1184         struct sm_freeq *allocq, *releq;
1185         struct kpme *kpme;
1186         page_t *pp = NULL;
1187         int end_ndx, page_locked = 0;
1188 
1189         end_ndx = free_ndx;
1190         sm = &smd_free[free_ndx];
1191 
1192 retry_queue:
1193         allocq = sm->sm_allocq;
1194         mutex_enter(&allocq->smq_mtx);
1195 
1196         if ((smp = allocq->smq_free) == NULL) {
1197 
1198 skip_queue:
1199                 /*
1200                  * The alloc list is empty or this queue is being skipped;
1201                  * first see if the allocq toggled.
1202                  */
1203                 if (sm->sm_allocq != allocq) {
1204                         /* queue changed */
1205                         mutex_exit(&allocq->smq_mtx);
1206                         goto retry_queue;
1207                 }
1208                 releq = sm->sm_releq;
1209                 if (!mutex_tryenter(&releq->smq_mtx)) {
1210                         /* cannot get releq; a free smp may be there now */
1211                         mutex_exit(&allocq->smq_mtx);
1212 
1213                         /*
1214                          * This loop could spin forever if this thread has
1215                          * higher priority than the thread that is holding
1216                          * releq->smq_mtx. In order to force the other thread
1217                          * to run, we'll lock/unlock the mutex which is safe
1218                          * since we just unlocked the allocq mutex.
1219                          */
1220                         mutex_enter(&releq->smq_mtx);
1221                         mutex_exit(&releq->smq_mtx);
1222                         goto retry_queue;
1223                 }
1224                 if (releq->smq_free == NULL) {
1225                         /*
1226                          * This freelist is empty.
1227                          * This should not happen unless clients
1228                          * are failing to release the segmap
1229                          * window after accessing the data.
1230                          * Before resorting to sleeping, try
1231                          * the next list of the same color.
1232                          */
1233                         free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1234                         if (free_ndx != end_ndx) {
1235                                 mutex_exit(&releq->smq_mtx);
1236                                 mutex_exit(&allocq->smq_mtx);
1237                                 sm = &smd_free[free_ndx];
1238                                 goto retry_queue;
1239                         }
1240                         /*
1241                          * Tried all freelists of the same color once,
1242                          * wait on this list and hope something gets freed.
1243                          */
1244                         segmapcnt.smp_get_nofree.value.ul++;
1245                         sm->sm_want++;
1246                         mutex_exit(&sm->sm_freeq[1].smq_mtx);
1247                         cv_wait(&sm->sm_free_cv,
1248                             &sm->sm_freeq[0].smq_mtx);
1249                         sm->sm_want--;
1250                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
1251                         sm = &smd_free[free_ndx];
1252                         goto retry_queue;
1253                 } else {
1254                         /*
1255                          * Something on the rele queue; flip the alloc
1256                          * and rele queues and retry.
1257                          */
1258                         sm->sm_allocq = releq;
1259                         sm->sm_releq = allocq;
1260                         mutex_exit(&allocq->smq_mtx);
1261                         mutex_exit(&releq->smq_mtx);
1262                         if (page_locked) {
1263                                 delay(hz >> 2);
1264                                 page_locked = 0;
1265                         }
1266                         goto retry_queue;
1267                 }
1268         } else {
1269                 /*
1270                  * Fastpath the case we get the smap mutex
1271                  * on the first try.
1272                  */
1273                 first = smp;
1274 next_smap:
1275                 smtx = SMAPMTX(smp);
1276                 if (!mutex_tryenter(smtx)) {
1277                         /*
1278                          * Another thread is trying to reclaim this slot.
1279                          * Skip to the next queue or smap.
1280                          */
1281                         if ((smp = smp->sm_next) == first) {
1282                                 goto skip_queue;
1283                         } else {
1284                                 goto next_smap;
1285                         }
1286                 } else {
1287                         /*
1288                          * if kpme exists, get shared lock on the page
1289                          */
1290                         if (segmap_kpm && smp->sm_vp != NULL) {
1291 
1292                                 kpme = GET_KPME(smp);
1293                                 pp = kpme->kpe_page;
1294 
1295                                 if (pp != NULL) {
1296                                         if (!page_trylock(pp, SE_SHARED)) {
1297                                                 smp = smp->sm_next;
1298                                                 mutex_exit(smtx);
1299                                                 page_locked = 1;
1300 
1301                                                 pp = NULL;
1302 
1303                                                 if (smp == first) {
1304                                                         goto skip_queue;
1305                                                 } else {
1306                                                         goto next_smap;
1307                                                 }
1308                                         } else {
1309                                                 if (kpme->kpe_page == NULL) {
1310                                                         page_unlock(pp);
1311                                                         pp = NULL;
1312                                                 }
1313                                         }
1314                                 }
1315                         }
1316 
1317                         /*
1318                          * At this point, we've selected smp.  Remove smp
1319                          * from its freelist.  If smp is the first one in
1320                          * the freelist, update the head of the freelist.
1321                          */
1322                         if (first == smp) {
1323                                 ASSERT(first == allocq->smq_free);
1324                                 allocq->smq_free = smp->sm_next;
1325                         }
1326 
1327                         /*
1328                          * if the head of the freelist still points to smp,
1329                          * then there are no more free smaps in that list.
1330                          */
1331                         if (allocq->smq_free == smp)
1332                                 /*
1333                                  * Took the last one
1334                                  */
1335                                 allocq->smq_free = NULL;
1336                         else {
1337                                 smp->sm_prev->sm_next = smp->sm_next;
1338                                 smp->sm_next->sm_prev = smp->sm_prev;
1339                         }
1340                         mutex_exit(&allocq->smq_mtx);
1341                         smp->sm_prev = smp->sm_next = NULL;
1342 
1343                         /*
1344                          * if pp != NULL, pp must have been locked;
1345                          * grab_smp() unlocks pp.
1346                          */
1347                         ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1348                         grab_smp(smp, pp);
1349                         /* return smp locked. */
1350                         ASSERT(SMAPMTX(smp) == smtx);
1351                         ASSERT(MUTEX_HELD(smtx));
1352                         return (smp);
1353                 }
1354         }
1355 }
1356 
1357 /*
1358  * Special public segmap operations
1359  */
1360 
1361 /*
1362  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1363  * If softlock is TRUE, then set things up so that it looks like a call
1364  * to segmap_fault with F_SOFTLOCK.
1365  *
1366  * Returns 1 if a page is created by calling page_create_va(), or 0 otherwise.
1367  *
1368  * All fields in the generic segment (struct seg) are considered to be
1369  * read-only for "segmap" even though the kernel address space (kas) may
1370  * not be locked, hence no lock is needed to access them.
1371  */
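
/*
 * A rough sketch of how a write path pairs these calls (illustrative
 * only; real callers add zeroing, partial-block and error handling):
 *
 *	base = segmap_getmapflt(segkmap, vp, off, n, !pagecreate, S_WRITE);
 *	if (pagecreate)
 *		newpage = segmap_pagecreate(segkmap, base, n, 0);
 *	error = uiomove(base, n, UIO_WRITE, uio);
 *	if (newpage)
 *		segmap_pageunlock(segkmap, base, n, S_WRITE);
 *	error = segmap_release(segkmap, base, SM_WRITE);
 */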
1372 int
1373 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1374 {
1375         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1376         page_t *pp;
1377         u_offset_t off;
1378         struct smap *smp;
1379         struct vnode *vp;
1380         caddr_t eaddr;
1381         int newpage = 0;
1382         uint_t prot;
1383         kmutex_t *smtx;
1384         int hat_flag;
1385 
1386         ASSERT(seg->s_as == &kas);
1387 
1388         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1389                 /*
1390                  * Pages are successfully prefaulted and locked in
1391                  * segmap_getmapflt and can't be unlocked until
1392                  * segmap_release. The SM_KPM_NEWPAGE flag is set
1393                  * in segmap_pagecreate_kpm when new pages are created,
1394                  * and it is returned as the "newpage" indication here.
1395                  */
1396                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1397                         panic("segmap_pagecreate: smap not found "
1398                             "for addr %p", (void *)addr);
1399                         /*NOTREACHED*/
1400                 }
1401 
1402                 smtx = SMAPMTX(smp);
1403                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1404                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1405                 mutex_exit(smtx);
1406 
1407                 return (newpage);
1408         }
1409 
1410         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1411 
1412         eaddr = addr + len;
1413         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1414 
1415         smp = GET_SMAP(seg, addr);
1416 
1417         /*
1418          * We don't grab smp mutex here since we assume the smp
1419          * has a refcnt set already which prevents the slot from
1420          * changing its id.
1421          */
1422         ASSERT(smp->sm_refcnt > 0);
1423 
1424         vp = smp->sm_vp;
1425         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1426         prot = smd->smd_prot;
1427 
1428         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1429                 hat_flag = HAT_LOAD;
1430                 pp = page_lookup(vp, off, SE_SHARED);
1431                 if (pp == NULL) {
1432                         ushort_t bitindex;
1433 
1434                         if ((pp = page_create_va(vp, off,
1435                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1436                                 panic("segmap_pagecreate: page_create failed");
1437                                 /*NOTREACHED*/
1438                         }
1439                         newpage = 1;
1440                         page_io_unlock(pp);
1441 
1442                         /*
1443                          * Since pages created here do not contain valid
1444                          * data until the caller writes into them, we keep
1445                          * the "exclusive" lock so that other users cannot
1446                          * access the page.  We also have to lock the
1447                          * translation to prevent a fault from occurring
1448                          * when the virtual address mapped by this page
1449                          * is written into; otherwise that fault would
1450                          * deadlock on the "exclusive" lock that we have
1451                          * not yet dropped.
1452                          */
1453                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1454 
1455                         /*
1456                          * Large Files: The following assertion is to
1457                          * verify the cast above.
1458                          */
1459                         ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1460                         smtx = SMAPMTX(smp);
1461                         mutex_enter(smtx);
1462                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1463                         mutex_exit(smtx);
1464 
1465                         hat_flag = HAT_LOAD_LOCK;
1466                 } else if (softlock) {
1467                         hat_flag = HAT_LOAD_LOCK;
1468                 }
1469 
1470                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1471                         hat_setmod(pp);
1472 
1473                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1474 
1475                 if (hat_flag != HAT_LOAD_LOCK)
1476                         page_unlock(pp);
1477 
1478                 TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1479                     "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1480                     seg, addr, pp, vp, off);
1481         }
1482 
1483         return (newpage);
1484 }
1485 
1486 void
1487 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1488 {
1489         struct smap     *smp;
1490         ushort_t        bitmask;
1491         page_t          *pp;
1492         struct  vnode   *vp;
1493         u_offset_t      off;
1494         caddr_t         eaddr;
1495         kmutex_t        *smtx;
1496 
1497         ASSERT(seg->s_as == &kas);
1498 
1499         eaddr = addr + len;
1500         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1501 
1502         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1503                 /*
1504                  * Pages are successfully prefaulted and locked in
1505                  * segmap_getmapflt and can't be unlocked until
1506                  * segmap_release, so no pages or hat mappings have
1507                  * to be unlocked at this point.
1508                  */
1509 #ifdef DEBUG
1510                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1511                         panic("segmap_pageunlock: smap not found "
1512                             "for addr %p", (void *)addr);
1513                         /*NOTREACHED*/
1514                 }
1515 
1516                 ASSERT(smp->sm_refcnt > 0);
1517                 mutex_exit(SMAPMTX(smp));
1518 #endif
1519                 return;
1520         }
1521 
1522         smp = GET_SMAP(seg, addr);
1523         smtx = SMAPMTX(smp);
1524 
1525         ASSERT(smp->sm_refcnt > 0);
1526 
1527         vp = smp->sm_vp;
1528         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1529 
1530         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1531                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1532 
1533                 /*
1534                  * Large Files: The following assertion verifies
1535                  * the correctness of the cast to (int) above.
1536                  */
1537                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1538 
1539                 /*
1540                  * If the bit corresponding to "off" is set,
1541                  * clear this bit in the bitmap, unlock translations,
1542                  * and release the "exclusive" lock on the page.
1543                  */
1544                 if (smp->sm_bitmap & bitmask) {
1545                         mutex_enter(smtx);
1546                         smp->sm_bitmap &= ~bitmask;
1547                         mutex_exit(smtx);
1548 
1549                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1550 
1551                         /*
1552                          * Use page_find() instead of page_lookup() to
1553                          * find the page since we know that we still hold
1554                          * its "exclusive" lock.
1555                          */
1556                         pp = page_find(vp, off);
1557                         if (pp == NULL) {
1558                                 panic("segmap_pageunlock: page not found");
1559                                 /*NOTREACHED*/
1560                         }
1561                         if (rw == S_WRITE) {
1562                                 hat_setrefmod(pp);
1563                         } else if (rw != S_OTHER) {
1564                                 hat_setref(pp);
1565                         }
1566 
1567                         page_unlock(pp);
1568                 }
1569         }
1570 }
1571 
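     /*
      * Illustrative only (not part of the driver): a minimal sketch of how
      * a file system write path typically pairs segmap_pagecreate() and
      * segmap_pageunlock() with segmap_getmapflt()/segmap_release() (both
      * defined below).  The names "vp", "off", "n", "uio" and "error" are
      * assumed to come from the caller, <off, off + n) is assumed not to
      * cross a MAXBSIZE boundary, the caller is assumed to overwrite the
      * pages completely (so their old contents need not be read in), and
      * error handling is omitted:
      *
      *      base = segmap_getmapflt(segkmap, vp, off, n, 0, S_WRITE);
      *      newpage = segmap_pagecreate(segkmap,
      *          base + (off & MAXBOFFSET), n, 0);
      *      error = uiomove(base + (off & MAXBOFFSET), n, UIO_WRITE, uio);
      *      if (newpage)
      *              segmap_pageunlock(segkmap,
      *                  base + (off & MAXBOFFSET), n, S_WRITE);
      *      error = segmap_release(segkmap, base, SM_WRITE);
      */
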
1572 caddr_t
1573 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1574 {
1575         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1576 }
1577 
1578 /*
1579  * This is the magic virtual address that offset 0 of an ELF
1580  * file gets mapped to in user space. This is used to pick
1581  * the vac color on the freelist.
1582  */
1583 #define ELF_OFFZERO_VA  (0x10000)
1584 /*
1585  * segmap_getmapflt allocates a MAXBSIZE-sized slot to map the vnode vp
1586  * in the range [off, off + len). off doesn't need to be MAXBSIZE aligned.
1587  * The returned address is always MAXBSIZE aligned.
1588  *
1589  * If forcefault is nonzero and the MMU translations haven't yet been created,
1590  * segmap_getmapflt will call segmap_fault(..., F_INVAL, rw) to create them.
1591  */
1592 caddr_t
1593 segmap_getmapflt(
1594         struct seg *seg,
1595         struct vnode *vp,
1596         u_offset_t off,
1597         size_t len,
1598         int forcefault,
1599         enum seg_rw rw)
1600 {
1601         struct smap *smp, *nsmp;
1602         extern struct vnode *common_specvp();
1603         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1604         u_offset_t baseoff;
1605         int newslot;
1606         caddr_t vaddr;
1607         int color, hashid;
1608         kmutex_t *hashmtx, *smapmtx;
1609         struct smfree *sm;
1610         page_t  *pp;
1611         struct kpme *kpme;
1612         uint_t  prot;
1613         caddr_t base;
1614         page_t  *pl[MAXPPB + 1];
1615         int     error;
1616         int     is_kpm = 1;
1617 
1618         ASSERT(seg->s_as == &kas);
1619         ASSERT(seg == segkmap);
1620 
1621         baseoff = off & (offset_t)MAXBMASK;
1622         if (off + len > baseoff + MAXBSIZE) {
1623                 panic("segmap_getmap bad len");
1624                 /*NOTREACHED*/
1625         }
1626 
1627         /*
1628          * If this is a block device we have to be sure to use the
1629          * "common" block device vnode for the mapping.
1630          */
1631         if (vp->v_type == VBLK)
1632                 vp = common_specvp(vp);
1633 
1634         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1635 
1636         if (segmap_kpm == 0 ||
1637             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1638                 is_kpm = 0;
1639         }
1640 
1641         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1642         hashmtx = SHASHMTX(hashid);
1643 
1644 retry_hash:
1645         mutex_enter(hashmtx);
1646         for (smp = smd_hash[hashid].sh_hash_list;
1647             smp != NULL; smp = smp->sm_hash)
1648                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1649                         break;
1650         mutex_exit(hashmtx);
1651 
1652 vrfy_smp:
1653         if (smp != NULL) {
1654 
1655                 ASSERT(vp->v_count != 0);
1656 
1657                 /*
1658                  * Get smap lock and recheck its tag. The hash lock
1659                  * is dropped since the hash is based on (vp, off)
1660                  * and (vp, off) won't change when we have smap mtx.
1661                  */
1662                 smapmtx = SMAPMTX(smp);
1663                 mutex_enter(smapmtx);
1664                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1665                         mutex_exit(smapmtx);
1666                         goto retry_hash;
1667                 }
1668 
1669                 if (smp->sm_refcnt == 0) {
1670 
1671                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1672 
1673                         /*
1674                          * Could still be on the free list. However, this
1675                          * could also be an smp that is transitioning from
1676                          * the free list when we have too much contention
1677                          * for the smap mutexes. In this case, we have an
1678                          * unlocked smp that is not on the free list any
1679                          * longer, but still has a 0 refcnt.  The only way
1680                          * to be sure is to check the freelist pointers.
1681                          * Since we now have the smapmtx, we are guaranteed
1682                          * that the (vp, off) won't change, so we are safe
1683                          * to reclaim it.  get_free_smp() knows that this
1684                          * can happen, and it will check the refcnt.
1685                          */
1686 
1687                         if (smp->sm_next != NULL) {
1688                                 struct sm_freeq *freeq;
1689 
1690                                 ASSERT(smp->sm_prev != NULL);
1691                                 sm = &smd_free[smp->sm_free_ndx];
1692 
1693                                 if (smp->sm_flags & SM_QNDX_ZERO)
1694                                         freeq = &sm->sm_freeq[0];
1695                                 else
1696                                         freeq = &sm->sm_freeq[1];
1697 
1698                                 mutex_enter(&freeq->smq_mtx);
1699                                 if (freeq->smq_free != smp) {
1700                                         /*
1701                                          * fastpath normal case
1702                                          */
1703                                         smp->sm_prev->sm_next = smp->sm_next;
1704                                         smp->sm_next->sm_prev = smp->sm_prev;
1705                                 } else if (smp == smp->sm_next) {
1706                                         /*
1707                                          * Taking the last smap on freelist
1708                                          */
1709                                         freeq->smq_free = NULL;
1710                                 } else {
1711                                         /*
1712                                          * Reclaiming 1st smap on list
1713                                          */
1714                                         freeq->smq_free = smp->sm_next;
1715                                         smp->sm_prev->sm_next = smp->sm_next;
1716                                         smp->sm_next->sm_prev = smp->sm_prev;
1717                                 }
1718                                 mutex_exit(&freeq->smq_mtx);
1719                                 smp->sm_prev = smp->sm_next = NULL;
1720                         } else {
1721                                 ASSERT(smp->sm_prev == NULL);
1722                                 segmapcnt.smp_stolen.value.ul++;
1723                         }
1724 
1725                 } else {
1726                         segmapcnt.smp_get_use.value.ul++;
1727                 }
1728                 smp->sm_refcnt++;            /* another user */
1729 
1730                 /*
1731                  * We don't invoke segmap_fault via TLB miss, so we set ref
1732          * and mod bits in advance. For S_OTHER we set them in
1733                  * segmap_fault F_SOFTUNLOCK.
1734                  */
1735                 if (is_kpm) {
1736                         if (rw == S_WRITE) {
1737                                 smp->sm_flags |= SM_WRITE_DATA;
1738                         } else if (rw == S_READ) {
1739                                 smp->sm_flags |= SM_READ_DATA;
1740                         }
1741                 }
1742                 mutex_exit(smapmtx);
1743 
1744                 newslot = 0;
1745         } else {
1746 
1747                 uint32_t free_ndx, *free_ndxp;
1748                 union segmap_cpu *scpu;
1749 
1750                 /*
1751                  * On a PAC machine or a machine with anti-alias
1752                  * hardware, smd_colormsk will be zero.
1753                  *
1754                  * On a VAC machine, pick the color by offset in the file
1755                  * so we won't get VAC conflicts on ELF files.
1756                  * On data files the color does not matter, but we
1757                  * don't know what kind of file it is, so we always
1758                  * pick the color by offset. This causes the color
1759                  * corresponding to file offset zero to be used more
1760                  * heavily.
1761                  */
1762                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1763                 scpu = smd_cpu+CPU->cpu_seqid;
1764                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1765                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1766 #ifdef DEBUG
1767                 colors_used[free_ndx]++;
1768 #endif /* DEBUG */
1769 
1770                 /*
1771                  * Get a locked smp slot from the free list.
1772                  */
1773                 smp = get_free_smp(free_ndx);
1774                 smapmtx = SMAPMTX(smp);
1775 
1776                 ASSERT(smp->sm_vp == NULL);
1777 
1778                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1779                         /*
1780                          * Failed to hash in; a slot for (vp, off) exists now.
1781                          * Return the smp we just allocated to the free list.
1782                          */
1783                         segmap_smapadd(smp);
1784                         mutex_exit(smapmtx);
1785 
1786                         smp = nsmp;
1787                         goto vrfy_smp;
1788                 }
1789                 smp->sm_refcnt++;            /* another user */
1790 
1791                 /*
1792                  * We don't invoke segmap_fault via TLB miss, so we set ref
1793          * and mod bits in advance. For S_OTHER we set them in
1794                  * segmap_fault F_SOFTUNLOCK.
1795                  */
1796                 if (is_kpm) {
1797                         if (rw == S_WRITE) {
1798                                 smp->sm_flags |= SM_WRITE_DATA;
1799                         } else if (rw == S_READ) {
1800                                 smp->sm_flags |= SM_READ_DATA;
1801                         }
1802                 }
1803                 mutex_exit(smapmtx);
1804 
1805                 newslot = 1;
1806         }
1807 
1808         if (!is_kpm)
1809                 goto use_segmap_range;
1810 
1811         /*
1812          * Use segkpm
1813          */
1814         /* Lint directive required until 6746211 is fixed */
1815         /*CONSTCOND*/
1816         ASSERT(PAGESIZE == MAXBSIZE);
1817 
1818         /*
1819          * remember the last smp faulted on this cpu.
1820          */
1821         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1822 
1823         if (forcefault == SM_PAGECREATE) {
1824                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1825                 return (baseaddr);
1826         }
1827 
1828         if (newslot == 0 &&
1829             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1830 
1831                 /* fastpath */
1832                 switch (rw) {
1833                 case S_READ:
1834                 case S_WRITE:
1835                         if (page_trylock(pp, SE_SHARED)) {
1836                                 if (PP_ISFREE(pp) ||
1837                                     !(pp->p_vnode == vp &&
1838                                     pp->p_offset == baseoff)) {
1839                                         page_unlock(pp);
1840                                         pp = page_lookup(vp, baseoff,
1841                                             SE_SHARED);
1842                                 }
1843                         } else {
1844                                 pp = page_lookup(vp, baseoff, SE_SHARED);
1845                         }
1846 
1847                         if (pp == NULL) {
1848                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1849                                 break;
1850                         }
1851 
1852                         if (rw == S_WRITE &&
1853                             hat_page_getattr(pp, P_MOD | P_REF) !=
1854                             (P_MOD | P_REF)) {
1855                                 page_unlock(pp);
1856                                 break;
1857                         }
1858 
1859                         /*
1860                          * We hold the p_selock as reader, so grab_smp
1861                          * can't hit us; we have bumped the smap
1862                          * refcnt, and hat_pageunload needs the
1863                          * p_selock exclusive.
1864                          */
1865                         kpme = GET_KPME(smp);
1866                         if (kpme->kpe_page == pp) {
1867                                 baseaddr = hat_kpm_page2va(pp, 0);
1868                         } else if (kpme->kpe_page == NULL) {
1869                                 baseaddr = hat_kpm_mapin(pp, kpme);
1870                         } else {
1871                                 panic("segmap_getmapflt: stale "
1872                                     "kpme page, kpme %p", (void *)kpme);
1873                                 /*NOTREACHED*/
1874                         }
1875 
1876                         /*
1877                          * We don't invoke segmap_fault via TLB miss,
1878                          * so we set ref and mod bits in advance.
1879                          * For S_OTHER we set them in segmap_fault
1880                          * F_SOFTUNLOCK.
1881                          */
1882                         if (rw == S_READ && !hat_isref(pp))
1883                                 hat_setref(pp);
1884 
1885                         return (baseaddr);
1886                 default:
1887                         break;
1888                 }
1889         }
1890 
1891         base = segkpm_create_va(baseoff);
1892         error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1893             seg, base, rw, CRED(), NULL);
1894 
1895         pp = pl[0];
1896         if (error || pp == NULL) {
1897                 /*
1898                  * Use segmap address slot and let segmap_fault deal
1899                  * with the error cases. There is no error return
1900                  * possible here.
1901                  */
1902                 goto use_segmap_range;
1903         }
1904 
1905         ASSERT(pl[1] == NULL);
1906 
1907         /*
1908          * When prot is not returned with PROT_ALL, the returned pages
1909          * are not backed by fs blocks. For most segmap users
1910          * this is not a problem: they don't write to the pages in the
1911          * same request and therefore don't rely on a subsequent
1912          * trap-driven segmap_fault. For SM_LOCKPROTO users it
1913          * is safer to use segkmap addresses so that protection
1914          * faults can be handled by segmap_fault.
1915          */
1916         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1917                 /*
1918                  * Use segmap address slot and let segmap_fault
1919                  * do the error return.
1920                  */
1921                 ASSERT(rw != S_WRITE);
1922                 ASSERT(PAGE_LOCKED(pp));
1923                 page_unlock(pp);
1924                 forcefault = 0;
1925                 goto use_segmap_range;
1926         }
1927 
1928         /*
1929          * We hold the p_selock as reader, so grab_smp can't hit us; we
1930          * have bumped the smap refcnt, and hat_pageunload needs the
1931          * p_selock exclusive.
1932          */
1933         kpme = GET_KPME(smp);
1934         if (kpme->kpe_page == pp) {
1935                 baseaddr = hat_kpm_page2va(pp, 0);
1936         } else if (kpme->kpe_page == NULL) {
1937                 baseaddr = hat_kpm_mapin(pp, kpme);
1938         } else {
1939                 panic("segmap_getmapflt: stale kpme page after "
1940                     "VOP_GETPAGE, kpme %p", (void *)kpme);
1941                 /*NOTREACHED*/
1942         }
1943 
1944         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1945 
1946         return (baseaddr);
1947 
1948 
1949 use_segmap_range:
1950         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1951         TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1952             "segmap_getmap:seg %p addr %p vp %p offset %llx",
1953             seg, baseaddr, vp, baseoff);
1954 
1955         /*
1956          * Prefault the translations
1957          */
1958         vaddr = baseaddr + (off - baseoff);
1959         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1960 
1961                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1962                     (uintptr_t)PAGEMASK);
1963 
1964                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1965                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1966                     F_INVAL, rw);
1967         }
1968 
1969         return (baseaddr);
1970 }
1971 
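     /*
      * Illustrative only (not part of the driver): a minimal sketch of the
      * common read path that pairs segmap_getmapflt() above with
      * segmap_release() below.  The names "vp", "off", "n" and "uio" are
      * assumed to come from the caller, <off, off + n) is assumed not to
      * cross a MAXBSIZE boundary, and error handling is omitted:
      *
      *      base = segmap_getmapflt(segkmap, vp, off, n, 1, S_READ);
      *      error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uio);
      *      (void) segmap_release(segkmap, base, 0);
      */
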
1972 int
1973 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1974 {
1975         struct smap     *smp;
1976         int             error;
1977         int             bflags = 0;
1978         struct vnode    *vp;
1979         u_offset_t      offset;
1980         kmutex_t        *smtx;
1981         int             is_kpm = 0;
1982         page_t          *pp;
1983 
1984         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1985 
1986                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1987                         panic("segmap_release: addr %p not "
1988                             "MAXBSIZE aligned", (void *)addr);
1989                         /*NOTREACHED*/
1990                 }
1991 
1992                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1993                         panic("segmap_release: smap not found "
1994                             "for addr %p", (void *)addr);
1995                         /*NOTREACHED*/
1996                 }
1997 
1998                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
1999                     "segmap_relmap:seg %p addr %p smp %p",
2000                     seg, addr, smp);
2001 
2002                 smtx = SMAPMTX(smp);
2003 
2004                 /*
2005                  * For compatibility reasons segmap_pagecreate_kpm sets this
2006                  * flag to allow a following segmap_pagecreate to return
2007                  * it as the "newpage" flag. If segmap_pagecreate is not
2008                  * called at all, we clear it now.
2009                  */
2010                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
2011                 is_kpm = 1;
2012                 if (smp->sm_flags & SM_WRITE_DATA) {
2013                         hat_setrefmod(pp);
2014                 } else if (smp->sm_flags & SM_READ_DATA) {
2015                         hat_setref(pp);
2016                 }
2017         } else {
2018                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2019                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
2020                         panic("segmap_release: bad addr %p", (void *)addr);
2021                         /*NOTREACHED*/
2022                 }
2023                 smp = GET_SMAP(seg, addr);
2024 
2025                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2026                     "segmap_relmap:seg %p addr %p smp %p",
2027                     seg, addr, smp);
2028 
2029                 smtx = SMAPMTX(smp);
2030                 mutex_enter(smtx);
2031                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2032         }
2033 
2034         ASSERT(smp->sm_refcnt > 0);
2035 
2036         /*
2037          * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2038          * are set.
2039          */
2040         if ((flags & ~SM_DONTNEED) != 0) {
2041                 if (flags & SM_WRITE)
2042                         segmapcnt.smp_rel_write.value.ul++;
2043                 if (flags & SM_ASYNC) {
2044                         bflags |= B_ASYNC;
2045                         segmapcnt.smp_rel_async.value.ul++;
2046                 }
2047                 if (flags & SM_INVAL) {
2048                         bflags |= B_INVAL;
2049                         segmapcnt.smp_rel_abort.value.ul++;
2050                 }
2051                 if (flags & SM_DESTROY) {
2052                         bflags |= (B_INVAL|B_TRUNC);
2053                         segmapcnt.smp_rel_abort.value.ul++;
2054                 }
2055                 if (smp->sm_refcnt == 1) {
2056                         /*
2057                          * We only bother doing the FREE and DONTNEED flags
2058                          * if no one else is still referencing this mapping.
2059                          */
2060                         if (flags & SM_FREE) {
2061                                 bflags |= B_FREE;
2062                                 segmapcnt.smp_rel_free.value.ul++;
2063                         }
2064                         if (flags & SM_DONTNEED) {
2065                                 bflags |= B_DONTNEED;
2066                                 segmapcnt.smp_rel_dontneed.value.ul++;
2067                         }
2068                 }
2069         } else {
2070                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2071         }
2072 
2073         vp = smp->sm_vp;
2074         offset = smp->sm_off;
2075 
2076         if (--smp->sm_refcnt == 0) {
2077 
2078                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2079 
2080                 if (flags & (SM_INVAL|SM_DESTROY)) {
2081                         segmap_hashout(smp);    /* remove map info */
2082                         if (is_kpm) {
2083                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2084                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2085                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2086                                         hat_unload(kas.a_hat, segkmap->s_base +
2087                                             ((smp - smd_smap) * MAXBSIZE),
2088                                             MAXBSIZE, HAT_UNLOAD);
2089                                 }
2090 
2091                         } else {
2092                                 if (segmap_kpm)
2093                                         segkpm_mapout_validkpme(GET_KPME(smp));
2094 
2095                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2096                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2097                                     HAT_UNLOAD);
2098                         }
2099                 }
2100                 segmap_smapadd(smp);    /* add to free list */
2101         }
2102 
2103         mutex_exit(smtx);
2104 
2105         if (is_kpm)
2106                 page_unlock(pp);
2107         /*
2108          * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2109          * are set.
2110          */
2111         if ((flags & ~SM_DONTNEED) != 0) {
2112                 error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2113                     bflags, CRED(), NULL);
2114         } else {
2115                 error = 0;
2116         }
2117 
2118         return (error);
2119 }
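
     /*
      * Illustrative only (not part of the driver): typical segmap_release()
      * flag choices, assuming "base" was obtained from segmap_getmap() or
      * segmap_getmapflt():
      *
      *      error = segmap_release(segkmap, base, SM_WRITE | SM_ASYNC);
      *              asynchronous VOP_PUTPAGE of the MAXBSIZE window
      *      (void) segmap_release(segkmap, base, SM_FREE | SM_DONTNEED);
      *              hint that the data won't be needed again; honored only
      *              when this was the last reference to the slot
      *      error = segmap_release(segkmap, base, SM_INVAL);
      *              write back and invalidate the cached pages
      */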
2120 
2121 /*
2122  * Dump the pages belonging to this segmap segment.
2123  */
2124 static void
2125 segmap_dump(struct seg *seg)
2126 {
2127         struct segmap_data *smd;
2128         struct smap *smp, *smp_end;
2129         page_t *pp;
2130         pfn_t pfn;
2131         u_offset_t off;
2132         caddr_t addr;
2133 
2134         smd = (struct segmap_data *)seg->s_data;
2135         addr = seg->s_base;
2136         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2137             smp < smp_end; smp++) {
2138 
2139                 if (smp->sm_refcnt) {
2140                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2141                                 int we_own_it = 0;
2142 
2143                                 /*
2144                                  * page_lookup_nowait() returns NULL if
2145                                  * the page either does not exist or is
2146                                  * exclusively locked, so fall back to
2147                                  * page_exists() to tell those cases apart.
2148                                  */
2149                                 if ((pp = page_lookup_nowait(smp->sm_vp,
2150                                     smp->sm_off + off, SE_SHARED)))
2151                                         we_own_it = 1;
2152                                 else
2153                                         pp = page_exists(smp->sm_vp,
2154                                             smp->sm_off + off);
2155 
2156                                 if (pp) {
2157                                         pfn = page_pptonum(pp);
2158                                         dump_addpage(seg->s_as,
2159                                             addr + off, pfn);
2160                                         if (we_own_it)
2161                                                 page_unlock(pp);
2162                                 }
2163                                 dump_timeleft = dump_timeout;
2164                         }
2165                 }
2166                 addr += MAXBSIZE;
2167         }
2168 }
2169 
2170 /*ARGSUSED*/
2171 static int
2172 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2173     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2174 {
2175         return (ENOTSUP);
2176 }
2177 
2178 static int
2179 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2180 {
2181         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2182 
2183         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2184         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2185         return (0);
2186 }
2187 
2188 
2189 #ifdef  SEGKPM_SUPPORT
2190 
2191 /*
2192  * segkpm support routines
2193  */
2194 
2195 static caddr_t
2196 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2197         struct smap *smp, enum seg_rw rw)
2198 {
2199         caddr_t base;
2200         page_t  *pp;
2201         int     newpage = 0;
2202         struct kpme     *kpme;
2203 
2204         ASSERT(smp->sm_refcnt > 0);
2205 
2206         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2207                 kmutex_t *smtx;
2208 
2209                 base = segkpm_create_va(off);
2210 
2211                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2212                     seg, base)) == NULL) {
2213                         panic("segmap_pagecreate_kpm: "
2214                             "page_create failed");
2215                         /*NOTREACHED*/
2216                 }
2217 
2218                 newpage = 1;
2219                 page_io_unlock(pp);
2220                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2221 
2222                 /*
2223                  * Mark this here; the flag stays set until the following
2224                  * segmap_pagecreate or segmap_release.
2225                  */
2226                 smtx = SMAPMTX(smp);
2227                 mutex_enter(smtx);
2228                 smp->sm_flags |= SM_KPM_NEWPAGE;
2229                 mutex_exit(smtx);
2230         }
2231 
2232         kpme = GET_KPME(smp);
2233         if (!newpage && kpme->kpe_page == pp)
2234                 base = hat_kpm_page2va(pp, 0);
2235         else
2236                 base = hat_kpm_mapin(pp, kpme);
2237 
2238         /*
2239          * FS code may decide not to call segmap_pagecreate, and we
2240          * don't invoke segmap_fault via TLB miss, so we have to set
2241          * ref and mod bits in advance.
2242          */
2243         if (rw == S_WRITE) {
2244                 hat_setrefmod(pp);
2245         } else {
2246                 ASSERT(rw == S_READ);
2247                 hat_setref(pp);
2248         }
2249 
2250         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2251 
2252         return (base);
2253 }
2254 
2255 /*
2256  * Find the smap structure corresponding to the
2257  * KPM addr and return it locked.
2258  */
2259 struct smap *
2260 get_smap_kpm(caddr_t addr, page_t **ppp)
2261 {
2262         struct smap     *smp;
2263         struct vnode    *vp;
2264         u_offset_t      offset;
2265         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2266         int             hashid;
2267         kmutex_t        *hashmtx;
2268         page_t          *pp;
2269         union segmap_cpu *scpu;
2270 
2271         pp = hat_kpm_vaddr2page(baseaddr);
2272 
2273         ASSERT(pp && !PP_ISFREE(pp));
2274         ASSERT(PAGE_LOCKED(pp));
2275         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2276 
2277         vp = pp->p_vnode;
2278         offset = pp->p_offset;
2279         ASSERT(vp != NULL);
2280 
2281         /*
2282          * Assume the last smap used on this cpu is the one needed.
2283          */
2284         scpu = smd_cpu+CPU->cpu_seqid;
2285         smp = scpu->scpu.scpu_last_smap;
2286         mutex_enter(&smp->sm_mtx);
2287         if (smp->sm_vp == vp && smp->sm_off == offset) {
2288                 ASSERT(smp->sm_refcnt > 0);
2289         } else {
2290                 /*
2291                  * The assumption was wrong; find the smap on the hash chain.
2292                  */
2293                 mutex_exit(&smp->sm_mtx);
2294                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2295                 hashmtx = SHASHMTX(hashid);
2296 
2297                 mutex_enter(hashmtx);
2298                 smp = smd_hash[hashid].sh_hash_list;
2299                 for (; smp != NULL; smp = smp->sm_hash) {
2300                         if (smp->sm_vp == vp && smp->sm_off == offset)
2301                                 break;
2302                 }
2303                 mutex_exit(hashmtx);
2304                 if (smp) {
2305                         mutex_enter(&smp->sm_mtx);
2306                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2307                 }
2308         }
2309 
2310         if (ppp)
2311                 *ppp = smp ? pp : NULL;
2312 
2313         return (smp);
2314 }
2315 
2316 #else   /* SEGKPM_SUPPORT */
2317 
2318 /* segkpm stubs */
2319 
2320 /*ARGSUSED*/
2321 static caddr_t
2322 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2323         struct smap *smp, enum seg_rw rw)
2324 {
2325         return (NULL);
2326 }
2327 
2328 /*ARGSUSED*/
2329 struct smap *
2330 get_smap_kpm(caddr_t addr, page_t **ppp)
2331 {
2332         return (NULL);
2333 }
2334 
2335 #endif  /* SEGKPM_SUPPORT */